{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"%config InlineBackend.figure_formats = ['svg']\n",
"\n",
"import random\n",
"\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"# GroupBy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"size\n",
"M 97\n",
"XL 90\n",
"XL 32\n",
"M 77\n",
"XL 39\n",
"Name: number, dtype: int64"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"random.seed(233)\n",
"\n",
"number = [random.randint(10, 100) for _ in range(10)]\n",
"size = [random.choices([\"M\", \"L\", \"XL\"], k=10)]\n",
"\n",
"s = pd.Series(number, index=size, name=\"number\").rename_axis(\"size\")\n",
"s.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"size\n",
"L 23\n",
"M 230\n",
"XL 312\n",
"Name: number, dtype: int64"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s.groupby(level=0).sum()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" size | \n",
" number | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" M | \n",
" 97 | \n",
"
\n",
" \n",
" | 1 | \n",
" XL | \n",
" 90 | \n",
"
\n",
" \n",
" | 2 | \n",
" XL | \n",
" 32 | \n",
"
\n",
" \n",
" | 3 | \n",
" M | \n",
" 77 | \n",
"
\n",
" \n",
" | 4 | \n",
" XL | \n",
" 39 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" size number\n",
"0 M 97\n",
"1 XL 90\n",
"2 XL 32\n",
"3 M 77\n",
"4 XL 39"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = s.to_frame().reset_index()\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"size\n",
"L 23\n",
"M 230\n",
"XL 312\n",
"Name: number, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(\"size\")[\"number\"].sum()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"### GroupBy 的一些使用场景"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/0t/s0c95rbs6ds7w_b0d471p0kc0000gn/T/ipykernel_9426/460248279.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n",
" iris.columns = iris.columns.str.replace(\".\", \"_\")\n"
]
}
],
"source": [
"iris = pd.read_csv(\"../../data/iris.csv\")\n",
"iris.columns = iris.columns.str.replace(\".\", \"_\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 150 entries, 0 to 149\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sepal_length 150 non-null float64\n",
" 1 sepal_width 150 non-null float64\n",
" 2 petal_length 150 non-null float64\n",
" 3 petal_width 150 non-null float64\n",
" 4 variety 150 non-null object \n",
"dtypes: float64(4), object(1)\n",
"memory usage: 6.0+ KB\n"
]
}
],
"source": [
"iris.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
" variety | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" Setosa | \n",
"
\n",
" \n",
" | 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" Setosa | \n",
"
\n",
" \n",
" | 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" Setosa | \n",
"
\n",
" \n",
" | 3 | \n",
" 4.6 | \n",
" 3.1 | \n",
" 1.5 | \n",
" 0.2 | \n",
" Setosa | \n",
"
\n",
" \n",
" | 4 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" Setosa | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width variety\n",
"0 5.1 3.5 1.4 0.2 Setosa\n",
"1 4.9 3.0 1.4 0.2 Setosa\n",
"2 4.7 3.2 1.3 0.2 Setosa\n",
"3 4.6 3.1 1.5 0.2 Setosa\n",
"4 5.0 3.6 1.4 0.2 Setosa"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"#### 自定义函数"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" variety | \n",
" weight | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Setosa | \n",
" 32.5 | \n",
"
\n",
" \n",
" | 1 | \n",
" Setosa | \n",
" 7.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" Setosa | \n",
" 15.5 | \n",
"
\n",
" \n",
" | 3 | \n",
" Setosa | \n",
" 8.5 | \n",
"
\n",
" \n",
" | 4 | \n",
" Setosa | \n",
" 36.0 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 145 | \n",
" Virginica | \n",
" 26.0 | \n",
"
\n",
" \n",
" | 146 | \n",
" Virginica | \n",
" 4.0 | \n",
"
\n",
" \n",
" | 147 | \n",
" Virginica | \n",
" 23.0 | \n",
"
\n",
" \n",
" | 148 | \n",
" Virginica | \n",
" 43.0 | \n",
"
\n",
" \n",
" | 149 | \n",
" Virginica | \n",
" 18.0 | \n",
"
\n",
" \n",
"
\n",
"
150 rows × 2 columns
\n",
"
"
],
"text/plain": [
" variety weight\n",
"0 Setosa 32.5\n",
"1 Setosa 7.0\n",
"2 Setosa 15.5\n",
"3 Setosa 8.5\n",
"4 Setosa 36.0\n",
".. ... ...\n",
"145 Virginica 26.0\n",
"146 Virginica 4.0\n",
"147 Virginica 23.0\n",
"148 Virginica 43.0\n",
"149 Virginica 18.0\n",
"\n",
"[150 rows x 2 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def rank_by_sepal(group):\n",
" df = group.copy()\n",
" df[\"weight\"] = 0.8*df[\"sepal_width\"] + 0.2*df[\"sepal_length\"]\n",
" return df[\"weight\"].rank()\n",
"\n",
"\n",
"iris.groupby(\"variety\").apply(rank_by_sepal).reset_index(level=0)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" variety | \n",
" weight | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Setosa | \n",
" 32.5 | \n",
"
\n",
" \n",
" | 1 | \n",
" Setosa | \n",
" 7.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" Setosa | \n",
" 15.5 | \n",
"
\n",
" \n",
" | 3 | \n",
" Setosa | \n",
" 8.5 | \n",
"
\n",
" \n",
" | 4 | \n",
" Setosa | \n",
" 36.0 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 145 | \n",
" Virginica | \n",
" 26.0 | \n",
"
\n",
" \n",
" | 146 | \n",
" Virginica | \n",
" 4.0 | \n",
"
\n",
" \n",
" | 147 | \n",
" Virginica | \n",
" 23.0 | \n",
"
\n",
" \n",
" | 148 | \n",
" Virginica | \n",
" 43.0 | \n",
"
\n",
" \n",
" | 149 | \n",
" Virginica | \n",
" 18.0 | \n",
"
\n",
" \n",
"
\n",
"
150 rows × 2 columns
\n",
"
"
],
"text/plain": [
" variety weight\n",
"0 Setosa 32.5\n",
"1 Setosa 7.0\n",
"2 Setosa 15.5\n",
"3 Setosa 8.5\n",
"4 Setosa 36.0\n",
".. ... ...\n",
"145 Virginica 26.0\n",
"146 Virginica 4.0\n",
"147 Virginica 23.0\n",
"148 Virginica 43.0\n",
"149 Virginica 18.0\n",
"\n",
"[150 rows x 2 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.groupby(\"variety\").apply(lambda grp: rank_by_sepal(grp)).reset_index(level=0)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"#### 分组排序求前 TOP N"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
" variety | \n",
"
\n",
" \n",
" | variety | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | Setosa | \n",
" 14 | \n",
" 5.8 | \n",
" 4.0 | \n",
" 1.2 | \n",
" 0.2 | \n",
" Setosa | \n",
"
\n",
" \n",
" | 15 | \n",
" 5.7 | \n",
" 4.4 | \n",
" 1.5 | \n",
" 0.4 | \n",
" Setosa | \n",
"
\n",
" \n",
" | Versicolor | \n",
" 50 | \n",
" 7.0 | \n",
" 3.2 | \n",
" 4.7 | \n",
" 1.4 | \n",
" Versicolor | \n",
"
\n",
" \n",
" | 52 | \n",
" 6.9 | \n",
" 3.1 | \n",
" 4.9 | \n",
" 1.5 | \n",
" Versicolor | \n",
"
\n",
" \n",
" | Virginica | \n",
" 131 | \n",
" 7.9 | \n",
" 3.8 | \n",
" 6.4 | \n",
" 2.0 | \n",
" Virginica | \n",
"
\n",
" \n",
" | 117 | \n",
" 7.7 | \n",
" 3.8 | \n",
" 6.7 | \n",
" 2.2 | \n",
" Virginica | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width \\\n",
"variety \n",
"Setosa 14 5.8 4.0 1.2 0.2 \n",
" 15 5.7 4.4 1.5 0.4 \n",
"Versicolor 50 7.0 3.2 4.7 1.4 \n",
" 52 6.9 3.1 4.9 1.5 \n",
"Virginica 131 7.9 3.8 6.4 2.0 \n",
" 117 7.7 3.8 6.7 2.2 \n",
"\n",
" variety \n",
"variety \n",
"Setosa 14 Setosa \n",
" 15 Setosa \n",
"Versicolor 50 Versicolor \n",
" 52 Versicolor \n",
"Virginica 131 Virginica \n",
" 117 Virginica "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.groupby(\"variety\").apply(\n",
" lambda grp: grp.sort_values([\"sepal_length\", \"sepal_width\"], ascending=False)\n",
" .head(2),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
" variety | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" Setosa | \n",
"
\n",
" \n",
" | 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" Setosa | \n",
"
\n",
" \n",
" | 50 | \n",
" 7.0 | \n",
" 3.2 | \n",
" 4.7 | \n",
" 1.4 | \n",
" Versicolor | \n",
"
\n",
" \n",
" | 51 | \n",
" 6.4 | \n",
" 3.2 | \n",
" 4.5 | \n",
" 1.5 | \n",
" Versicolor | \n",
"
\n",
" \n",
" | 100 | \n",
" 6.3 | \n",
" 3.3 | \n",
" 6.0 | \n",
" 2.5 | \n",
" Virginica | \n",
"
\n",
" \n",
" | 101 | \n",
" 5.8 | \n",
" 2.7 | \n",
" 5.1 | \n",
" 1.9 | \n",
" Virginica | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width variety\n",
"0 5.1 3.5 1.4 0.2 Setosa\n",
"1 4.9 3.0 1.4 0.2 Setosa\n",
"50 7.0 3.2 4.7 1.4 Versicolor\n",
"51 6.4 3.2 4.5 1.5 Versicolor\n",
"100 6.3 3.3 6.0 2.5 Virginica\n",
"101 5.8 2.7 5.1 1.9 Virginica"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.groupby(\"variety\").head(2)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
" variety | \n",
"
\n",
" \n",
" \n",
" \n",
" | 49 | \n",
" 5.0 | \n",
" 3.3 | \n",
" 1.4 | \n",
" 0.2 | \n",
" Setosa | \n",
"
\n",
" \n",
" | 99 | \n",
" 5.7 | \n",
" 2.8 | \n",
" 4.1 | \n",
" 1.3 | \n",
" Versicolor | \n",
"
\n",
" \n",
" | 149 | \n",
" 5.9 | \n",
" 3.0 | \n",
" 5.1 | \n",
" 1.8 | \n",
" Virginica | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width variety\n",
"49 5.0 3.3 1.4 0.2 Setosa\n",
"99 5.7 2.8 4.1 1.3 Versicolor\n",
"149 5.9 3.0 5.1 1.8 Virginica"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.groupby(\"variety\").tail(1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"#### 分组聚合"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
"
\n",
" \n",
" | variety | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | Setosa | \n",
" 0.266674 | \n",
" 0.189941 | \n",
" -0.357011 | \n",
" -0.436492 | \n",
"
\n",
" \n",
" | Setosa | \n",
" -0.300718 | \n",
" -1.129096 | \n",
" -0.357011 | \n",
" -0.436492 | \n",
"
\n",
" \n",
" | Setosa | \n",
" -0.868111 | \n",
" -0.601481 | \n",
" -0.932836 | \n",
" -0.436492 | \n",
"
\n",
" \n",
" | Setosa | \n",
" -1.151807 | \n",
" -0.865288 | \n",
" 0.218813 | \n",
" -0.436492 | \n",
"
\n",
" \n",
" | Setosa | \n",
" -0.017022 | \n",
" 0.453749 | \n",
" -0.357011 | \n",
" -0.436492 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | Virginica | \n",
" 0.176134 | \n",
" 0.080621 | \n",
" -0.637803 | \n",
" 0.997633 | \n",
"
\n",
" \n",
" | Virginica | \n",
" -0.452916 | \n",
" -1.469783 | \n",
" -1.000191 | \n",
" -0.458766 | \n",
"
\n",
" \n",
" | Virginica | \n",
" -0.138391 | \n",
" 0.080621 | \n",
" -0.637803 | \n",
" -0.094666 | \n",
"
\n",
" \n",
" | Virginica | \n",
" -0.610178 | \n",
" 1.320944 | \n",
" -0.275415 | \n",
" 0.997633 | \n",
"
\n",
" \n",
" | Virginica | \n",
" -1.081966 | \n",
" 0.080621 | \n",
" -0.818997 | \n",
" -0.822865 | \n",
"
\n",
" \n",
"
\n",
"
150 rows × 4 columns
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width\n",
"variety \n",
"Setosa 0.266674 0.189941 -0.357011 -0.436492\n",
"Setosa -0.300718 -1.129096 -0.357011 -0.436492\n",
"Setosa -0.868111 -0.601481 -0.932836 -0.436492\n",
"Setosa -1.151807 -0.865288 0.218813 -0.436492\n",
"Setosa -0.017022 0.453749 -0.357011 -0.436492\n",
"... ... ... ... ...\n",
"Virginica 0.176134 0.080621 -0.637803 0.997633\n",
"Virginica -0.452916 -1.469783 -1.000191 -0.458766\n",
"Virginica -0.138391 0.080621 -0.637803 -0.094666\n",
"Virginica -0.610178 1.320944 -0.275415 0.997633\n",
"Virginica -1.081966 0.080621 -0.818997 -0.822865\n",
"\n",
"[150 rows x 4 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"normalize = lambda x: (x - x.mean()) / x.std()\n",
"\n",
"iris.set_index(\"variety\").groupby(level=0).transform(normalize)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" min | \n",
" std | \n",
" max | \n",
"
\n",
" \n",
" | variety | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | Setosa | \n",
" 50 | \n",
" 4.3 | \n",
" 0.352490 | \n",
" 5.8 | \n",
"
\n",
" \n",
" | Versicolor | \n",
" 50 | \n",
" 4.9 | \n",
" 0.516171 | \n",
" 7.0 | \n",
"
\n",
" \n",
" | Virginica | \n",
" 50 | \n",
" 4.9 | \n",
" 0.635880 | \n",
" 7.9 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count min std max\n",
"variety \n",
"Setosa 50 4.3 0.352490 5.8\n",
"Versicolor 50 4.9 0.516171 7.0\n",
"Virginica 50 4.9 0.635880 7.9"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.groupby(\"variety\").agg(\n",
" count = (\"variety\", \"count\"),\n",
" min = (\"sepal_length\", min),\n",
" std = (\"sepal_length\", \"std\"),\n",
" max = (\"sepal_length\", max),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" min | \n",
" std | \n",
" max | \n",
"
\n",
" \n",
" \n",
" \n",
" | Setosa | \n",
" 50 | \n",
" 4.3 | \n",
" 0.352490 | \n",
" 5.8 | \n",
"
\n",
" \n",
" | Versicolor | \n",
" 50 | \n",
" 4.9 | \n",
" 0.516171 | \n",
" 7.0 | \n",
"
\n",
" \n",
" | Virginica | \n",
" 50 | \n",
" 4.9 | \n",
" 0.635880 | \n",
" 7.9 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count min std max\n",
"Setosa 50 4.3 0.352490 5.8\n",
"Versicolor 50 4.9 0.516171 7.0\n",
"Virginica 50 4.9 0.635880 7.9"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# equal to:\n",
"\n",
"data = []\n",
"groups = iris[\"variety\"].unique()\n",
"for group in groups:\n",
" df = iris[iris[\"variety\"] == group]\n",
" stats = pd.DataFrame(\n",
" dict(\n",
" count = df.shape[0],\n",
" min = df[\"sepal_length\"].min(),\n",
" std = df[\"sepal_length\"].std(),\n",
" max = df[\"sepal_length\"].max(),\n",
" ),\n",
" index=[group],\n",
" )\n",
" data.append(stats)\n",
"\n",
"pd.concat(data)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"#### 分组绘制可视化图形"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"variety\n",
"Setosa AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Versicolor AxesSubplot(0.125,0.125;0.775x0.755)\n",
"Virginica AxesSubplot(0.125,0.125;0.775x0.755)\n",
"dtype: object"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/svg+xml": "\n\n\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": "\n\n\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": "\n\n\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"iris.groupby(\"variety\").plot.kde(legend=True, figsize=(16, 2))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/0t/s0c95rbs6ds7w_b0d471p0kc0000gn/T/ipykernel_9426/838838217.py:9: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.\n",
" fig.show()\n"
]
},
{
"data": {
"image/svg+xml": "\n\n\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"fig = plt.figure(figsize=(16, 10))\n",
"for n, (klass, grp) in enumerate(iris.groupby(\"variety\")):\n",
" location = 220+n+1\n",
" axes = fig.add_subplot(location)\n",
"\n",
" grp.drop(\"variety\", axis=1).plot.box(ax=axes)\n",
" axes.set_title(f\"variety={klass}\")\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"# Resampler"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"DatetimeIndex: 732 entries, 2020-01-01 to 2022-01-01\n",
"Freq: D\n",
"Data columns (total 1 columns):\n",
" # Column Non-Null Count Dtype\n",
"--- ------ -------------- -----\n",
" 0 sales 732 non-null int64\n",
"dtypes: int64(1)\n",
"memory usage: 11.4 KB\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
"
\n",
" \n",
" | datetime | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 2020-01-01 | \n",
" 2956 | \n",
"
\n",
" \n",
" | 2020-01-02 | \n",
" 8704 | \n",
"
\n",
" \n",
" | 2020-01-03 | \n",
" 3881 | \n",
"
\n",
" \n",
" | 2020-01-04 | \n",
" 8031 | \n",
"
\n",
" \n",
" | 2020-01-05 | \n",
" 9124 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales\n",
"datetime \n",
"2020-01-01 2956\n",
"2020-01-02 8704\n",
"2020-01-03 3881\n",
"2020-01-04 8031\n",
"2020-01-05 9124"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"import random\n",
"random.seed(233)\n",
"\n",
"dt = pd.date_range(start=\"20200101\", end=\"20220101\", freq=\"D\").set_names(\"datetime\")\n",
"data = pd.DataFrame(\n",
" [random.randrange(100, 10000) for _ in range(len(dt))],\n",
" index=dt,\n",
" columns=[\"sales\"],\n",
")\n",
"data.info()\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
"
\n",
" \n",
" | datetime | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 2020-12-31 | \n",
" 1850914 | \n",
"
\n",
" \n",
" | 2021-12-31 | \n",
" 1901049 | \n",
"
\n",
" \n",
" | 2022-12-31 | \n",
" 2581 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales\n",
"datetime \n",
"2020-12-31 1850914\n",
"2021-12-31 1901049\n",
"2022-12-31 2581"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.resample(\"Y\").sum()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.resample(\"2W\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dt.to_series().resample(\"3M\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dt.to_period(\"2W\").to_series().resample(\"3d\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# data.reset_index().resample(\"4d\")  # raises TypeError: resample needs a datetime-like index, not the default RangeIndex"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" datetime | \n",
" sales | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 2020-01-01 | \n",
" 2956 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2020-01-02 | \n",
" 8704 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2020-01-03 | \n",
" 3881 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2020-01-04 | \n",
" 8031 | \n",
"
\n",
" \n",
" | 4 | \n",
" 2020-01-05 | \n",
" 9124 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" datetime sales\n",
"0 2020-01-01 2956\n",
"1 2020-01-02 8704\n",
"2 2020-01-03 3881\n",
"3 2020-01-04 8031\n",
"4 2020-01-05 9124"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = data.reset_index()\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
"
\n",
" \n",
" | datetime | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 2020-12-31 | \n",
" 1850914 | \n",
"
\n",
" \n",
" | 2021-12-31 | \n",
" 1901049 | \n",
"
\n",
" \n",
" | 2022-12-31 | \n",
" 2581 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales\n",
"datetime \n",
"2020-12-31 1850914\n",
"2021-12-31 1901049\n",
"2022-12-31 2581"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.groupby(pd.Grouper(key=\"datetime\", freq=\"Y\")).sum()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"image/svg+xml": "\n\n\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"ax = plt.subplot(111)\n",
"\n",
"random.seed(233)\n",
"\n",
"(\n",
" data.set_index(\"datetime\")\n",
" .assign(\n",
" overseas_sales = [random.randrange(1000, 5000) for _ in range(len(dt))],\n",
" )\n",
" .resample(\"3W\")\n",
" .sum()\n",
" .plot.line(\n",
" figsize=(10, 5),\n",
" ylabel=\"sales\",\n",
" xlabel=\"\",\n",
" ax=ax,\n",
" )\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"image/svg+xml": "\n\n\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"ax = plt.subplot(111)\n",
"\n",
"random.seed(233)\n",
"\n",
"(\n",
" data.assign(\n",
" overseas_sales = [\n",
" random.randrange(1000, 5000) for _ in range(len(dt))\n",
" ],\n",
" )\n",
" .groupby(pd.Grouper(key=\"datetime\", freq=\"3W\"))\n",
" .sum()\n",
" .plot.line(\n",
" figsize=(10, 5),\n",
" ylabel=\"sales\",\n",
" xlabel=\"\",\n",
" ax=ax,\n",
" )\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.4 ('sspai-100-hours-series-python')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"vscode": {
"interpreter": {
"hash": "7a101baf08afe636412f97dd4a9fc2e65b6f84f0ec50413bf3e19b04a26b8ba6"
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}