{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline\n", "%config InlineBackend.figure_formats = ['svg']\n", "\n", "import random\n", "\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "# GroupBy" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "size\n", "M 97\n", "XL 90\n", "XL 32\n", "M 77\n", "XL 39\n", "Name: number, dtype: int64" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "random.seed(233)\n", "\n", "number = [random.randint(10, 100) for _ in range(10)]\n", "# random.choices already returns a flat list of k labels. Wrapping it in an\n", "# extra list would be read by pandas as a list of index arrays and produce a\n", "# one-level MultiIndex instead of a plain Index.\n", "size = random.choices([\"M\", \"L\", \"XL\"], k=10)\n", "\n", "s = pd.Series(number, index=size, name=\"number\").rename_axis(\"size\")\n", "s.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "size\n", "L 23\n", "M 230\n", "XL 312\n", "Name: number, dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.groupby(level=0).sum()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sizenumber
0M97
1XL90
2XL32
3M77
4XL39
\n", "
" ], "text/plain": [ " size number\n", "0 M 97\n", "1 XL 90\n", "2 XL 32\n", "3 M 77\n", "4 XL 39" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = s.to_frame().reset_index()\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "size\n", "L 23\n", "M 230\n", "XL 312\n", "Name: number, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(\"size\")[\"number\"].sum()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "### GroupBy 的一些使用场景" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/0t/s0c95rbs6ds7w_b0d471p0kc0000gn/T/ipykernel_9426/460248279.py:2: FutureWarning: The default value of regex will change from True to False in a future version. 
In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", " iris.columns = iris.columns.str.replace(\".\", \"_\")\n" ] } ], "source": [ "iris = pd.read_csv(\"../../data/iris.csv\")\n", "# Swap the literal dots in the column names for underscores. regex=False\n", "# pins literal matching, so '.' is never treated as the regex wildcard and\n", "# the result does not depend on the pandas default for `regex`.\n", "iris.columns = iris.columns.str.replace(\".\", \"_\", regex=False)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 150 entries, 0 to 149\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sepal_length 150 non-null float64\n", " 1 sepal_width 150 non-null float64\n", " 2 petal_length 150 non-null float64\n", " 3 petal_width 150 non-null float64\n", " 4 variety 150 non-null object \n", "dtypes: float64(4), object(1)\n", "memory usage: 6.0+ KB\n" ] } ], "source": [ "iris.info()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthvariety
05.13.51.40.2Setosa
14.93.01.40.2Setosa
24.73.21.30.2Setosa
34.63.11.50.2Setosa
45.03.61.40.2Setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width variety\n", "0 5.1 3.5 1.4 0.2 Setosa\n", "1 4.9 3.0 1.4 0.2 Setosa\n", "2 4.7 3.2 1.3 0.2 Setosa\n", "3 4.6 3.1 1.5 0.2 Setosa\n", "4 5.0 3.6 1.4 0.2 Setosa" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.head()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "#### 自定义函数" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
varietyweight
0Setosa32.5
1Setosa7.0
2Setosa15.5
3Setosa8.5
4Setosa36.0
.........
145Virginica26.0
146Virginica4.0
147Virginica23.0
148Virginica43.0
149Virginica18.0
\n", "

150 rows × 2 columns

\n", "
" ], "text/plain": [ " variety weight\n", "0 Setosa 32.5\n", "1 Setosa 7.0\n", "2 Setosa 15.5\n", "3 Setosa 8.5\n", "4 Setosa 36.0\n", ".. ... ...\n", "145 Virginica 26.0\n", "146 Virginica 4.0\n", "147 Virginica 23.0\n", "148 Virginica 43.0\n", "149 Virginica 18.0\n", "\n", "[150 rows x 2 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def rank_by_sepal(group):\n", "    \"\"\"Rank the rows of one group by a weighted sepal score.\n", "\n", "    The score is 0.8 * sepal_width + 0.2 * sepal_length. The returned\n", "    Series is named ``weight`` and keeps the index of ``group``.\n", "    \"\"\"\n", "    score = 0.8*group[\"sepal_width\"] + 0.2*group[\"sepal_length\"]\n", "    return score.rank().rename(\"weight\")\n", "\n", "\n", "(\n", "    iris.groupby(\"variety\")\n", "    .apply(rank_by_sepal)\n", "    .reset_index(level=0)\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
varietyweight
0Setosa32.5
1Setosa7.0
2Setosa15.5
3Setosa8.5
4Setosa36.0
.........
145Virginica26.0
146Virginica4.0
147Virginica23.0
148Virginica43.0
149Virginica18.0
\n", "

150 rows × 2 columns

\n", "
" ], "text/plain": [ " variety weight\n", "0 Setosa 32.5\n", "1 Setosa 7.0\n", "2 Setosa 15.5\n", "3 Setosa 8.5\n", "4 Setosa 36.0\n", ".. ... ...\n", "145 Virginica 26.0\n", "146 Virginica 4.0\n", "147 Virginica 23.0\n", "148 Virginica 43.0\n", "149 Virginica 18.0\n", "\n", "[150 rows x 2 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(\"variety\").apply(lambda grp: rank_by_sepal(grp)).reset_index(level=0)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "#### 分组排序求前 TOP N" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthvariety
variety
Setosa145.84.01.20.2Setosa
155.74.41.50.4Setosa
Versicolor507.03.24.71.4Versicolor
526.93.14.91.5Versicolor
Virginica1317.93.86.42.0Virginica
1177.73.86.72.2Virginica
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width \\\n", "variety \n", "Setosa 14 5.8 4.0 1.2 0.2 \n", " 15 5.7 4.4 1.5 0.4 \n", "Versicolor 50 7.0 3.2 4.7 1.4 \n", " 52 6.9 3.1 4.9 1.5 \n", "Virginica 131 7.9 3.8 6.4 2.0 \n", " 117 7.7 3.8 6.7 2.2 \n", "\n", " variety \n", "variety \n", "Setosa 14 Setosa \n", " 15 Setosa \n", "Versicolor 50 Versicolor \n", " 52 Versicolor \n", "Virginica 131 Virginica \n", " 117 Virginica " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(\"variety\").apply(\n", " lambda grp: grp.sort_values([\"sepal_length\", \"sepal_width\"], ascending=False)\n", " .head(2),\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthvariety
05.13.51.40.2Setosa
14.93.01.40.2Setosa
507.03.24.71.4Versicolor
516.43.24.51.5Versicolor
1006.33.36.02.5Virginica
1015.82.75.11.9Virginica
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width variety\n", "0 5.1 3.5 1.4 0.2 Setosa\n", "1 4.9 3.0 1.4 0.2 Setosa\n", "50 7.0 3.2 4.7 1.4 Versicolor\n", "51 6.4 3.2 4.5 1.5 Versicolor\n", "100 6.3 3.3 6.0 2.5 Virginica\n", "101 5.8 2.7 5.1 1.9 Virginica" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(\"variety\").head(2)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthvariety
495.03.31.40.2Setosa
995.72.84.11.3Versicolor
1495.93.05.11.8Virginica
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width variety\n", "49 5.0 3.3 1.4 0.2 Setosa\n", "99 5.7 2.8 4.1 1.3 Versicolor\n", "149 5.9 3.0 5.1 1.8 Virginica" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(\"variety\").tail(1)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "#### 分组聚合" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
variety
Setosa0.2666740.189941-0.357011-0.436492
Setosa-0.300718-1.129096-0.357011-0.436492
Setosa-0.868111-0.601481-0.932836-0.436492
Setosa-1.151807-0.8652880.218813-0.436492
Setosa-0.0170220.453749-0.357011-0.436492
...............
Virginica0.1761340.080621-0.6378030.997633
Virginica-0.452916-1.469783-1.000191-0.458766
Virginica-0.1383910.080621-0.637803-0.094666
Virginica-0.6101781.320944-0.2754150.997633
Virginica-1.0819660.080621-0.818997-0.822865
\n", "

150 rows × 4 columns

\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "variety \n", "Setosa 0.266674 0.189941 -0.357011 -0.436492\n", "Setosa -0.300718 -1.129096 -0.357011 -0.436492\n", "Setosa -0.868111 -0.601481 -0.932836 -0.436492\n", "Setosa -1.151807 -0.865288 0.218813 -0.436492\n", "Setosa -0.017022 0.453749 -0.357011 -0.436492\n", "... ... ... ... ...\n", "Virginica 0.176134 0.080621 -0.637803 0.997633\n", "Virginica -0.452916 -1.469783 -1.000191 -0.458766\n", "Virginica -0.138391 0.080621 -0.637803 -0.094666\n", "Virginica -0.610178 1.320944 -0.275415 0.997633\n", "Virginica -1.081966 0.080621 -0.818997 -0.822865\n", "\n", "[150 rows x 4 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def normalize(x):\n", "    \"\"\"Center x on its mean and scale it by its standard deviation.\"\"\"\n", "    return (x - x.mean()) / x.std()\n", "\n", "\n", "iris.set_index(\"variety\").groupby(level=0).transform(normalize)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countminstdmax
variety
Setosa504.30.3524905.8
Versicolor504.90.5161717.0
Virginica504.90.6358807.9
\n", "
" ], "text/plain": [ " count min std max\n", "variety \n", "Setosa 50 4.3 0.352490 5.8\n", "Versicolor 50 4.9 0.516171 7.0\n", "Virginica 50 4.9 0.635880 7.9" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Named aggregation: output column = (input column, reducer).\n", "# All reducers are string aliases; newer pandas versions emit a FutureWarning\n", "# when the Python builtins min/max are passed instead.\n", "iris.groupby(\"variety\").agg(\n", "    count=(\"variety\", \"count\"),\n", "    min=(\"sepal_length\", \"min\"),\n", "    std=(\"sepal_length\", \"std\"),\n", "    max=(\"sepal_length\", \"max\"),\n", ")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countminstdmax
Setosa504.30.3524905.8
Versicolor504.90.5161717.0
Virginica504.90.6358807.9
\n", "
" ], "text/plain": [ " count min std max\n", "Setosa 50 4.3 0.352490 5.8\n", "Versicolor 50 4.9 0.516171 7.0\n", "Virginica 50 4.9 0.635880 7.9" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# equal to:\n", "\n", "data = []\n", "groups = iris[\"variety\"].unique()\n", "for group in groups:\n", " df = iris[iris[\"variety\"] == group]\n", " stats = pd.DataFrame(\n", " dict(\n", " count = df.shape[0],\n", " min = df[\"sepal_length\"].min(),\n", " std = df[\"sepal_length\"].std(),\n", " max = df[\"sepal_length\"].max(),\n", " ),\n", " index=[group],\n", " )\n", " data.append(stats)\n", "\n", "pd.concat(data)\n" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "#### 分组绘制可视化图形" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "variety\n", "Setosa AxesSubplot(0.125,0.125;0.775x0.755)\n", "Versicolor AxesSubplot(0.125,0.125;0.775x0.755)\n", "Virginica AxesSubplot(0.125,0.125;0.775x0.755)\n", "dtype: object" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": "\n\n\n \n \n \n \n 2022-06-23T10:18:10.454851\n image/svg+xml\n \n \n Matplotlib v3.5.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "image/svg+xml": "\n\n\n \n \n \n \n 2022-06-23T10:18:10.536604\n image/svg+xml\n \n \n Matplotlib v3.5.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "image/svg+xml": "\n\n\n \n \n \n \n 2022-06-23T10:18:10.613853\n image/svg+xml\n \n \n Matplotlib v3.5.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "iris.groupby(\"variety\").plot.kde(legend=True, figsize=(16, 2))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/0t/s0c95rbs6ds7w_b0d471p0kc0000gn/T/ipykernel_9426/838838217.py:9: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.\n", " fig.show()\n" ] }, { "data": { "image/svg+xml": "\n\n\n \n \n \n \n 2022-06-23T10:18:10.971824\n image/svg+xml\n \n \n Matplotlib v3.5.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "fig = plt.figure(figsize=(16, 10))\n", "for n, (klass, grp) in enumerate(iris.groupby(\"variety\")):\n", " location = 220+n+1\n", " axes = fig.add_subplot(location)\n", "\n", " grp.drop(\"variety\", axis=1).plot.box(ax=axes)\n", " axes.set_title(f\"variety={klass}\")\n", "\n", "fig.show()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "# Resampler" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "DatetimeIndex: 732 entries, 2020-01-01 to 2022-01-01\n", "Freq: D\n", "Data columns (total 1 columns):\n", " # Column Non-Null Count Dtype\n", "--- ------ -------------- -----\n", " 0 sales 732 non-null int64\n", "dtypes: int64(1)\n", "memory usage: 11.4 KB\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sales
datetime
2020-01-012956
2020-01-028704
2020-01-033881
2020-01-048031
2020-01-059124
\n", "
" ], "text/plain": [ " sales\n", "datetime \n", "2020-01-01 2956\n", "2020-01-02 8704\n", "2020-01-03 3881\n", "2020-01-04 8031\n", "2020-01-05 9124" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pandas (pd) and random are already imported in the first cell of the notebook.\n", "random.seed(233)\n", "\n", "# Daily timestamps from 2020-01-01 through 2022-01-01, both ends included.\n", "dt = pd.date_range(start=\"20200101\", end=\"20220101\", freq=\"D\").set_names(\"datetime\")\n", "# One pseudo-random sales figure per day, indexed by the timestamps above.\n", "data = pd.DataFrame(\n", "    [random.randrange(100, 10000) for _ in range(len(dt))],\n", "    index=dt,\n", "    columns=[\"sales\"],\n", ")\n", "data.info()\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sales
datetime
2020-12-311850914
2021-12-311901049
2022-12-312581
\n", "
" ], "text/plain": [ " sales\n", "datetime \n", "2020-12-31 1850914\n", "2021-12-31 1901049\n", "2022-12-31 2581" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.resample(\"Y\").sum()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.resample(\"2W\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dt.to_series().resample(\"3M\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dt.to_period(\"2W\").to_series().resample(\"3d\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# data.reset_index().resample(\"4d\") # raise type error here." ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetimesales
02020-01-012956
12020-01-028704
22020-01-033881
32020-01-048031
42020-01-059124
\n", "
" ], "text/plain": [ " datetime sales\n", "0 2020-01-01 2956\n", "1 2020-01-02 8704\n", "2 2020-01-03 3881\n", "3 2020-01-04 8031\n", "4 2020-01-05 9124" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = data.reset_index()\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sales
datetime
2020-12-311850914
2021-12-311901049
2022-12-312581
\n", "
" ], "text/plain": [ " sales\n", "datetime \n", "2020-12-31 1850914\n", "2021-12-31 1901049\n", "2022-12-31 2581" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.groupby(pd.Grouper(key=\"datetime\", freq=\"Y\")).sum()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "image/svg+xml": "\n\n\n \n \n \n \n 2022-06-23T10:18:11.523631\n image/svg+xml\n \n \n Matplotlib v3.5.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "ax = plt.subplot(111)\n", "\n", "random.seed(233)\n", "\n", "(\n", " data.set_index(\"datetime\")\n", " .assign(\n", " overseas_sales = [random.randrange(1000, 5000) for _ in range(len(dt))],\n", " )\n", " .resample(\"3W\")\n", " .sum()\n", " .plot.line(\n", " figsize=(10, 5),\n", " ylabel=\"sales\",\n", " xlabel=\"\",\n", " ax=ax,\n", " )\n", ")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "image/svg+xml": "\n\n\n \n \n \n \n 2022-06-23T10:18:11.731873\n image/svg+xml\n \n \n Matplotlib v3.5.2, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "ax = plt.subplot(111)\n", "\n", "random.seed(233)\n", "\n", "(\n", " data.assign(\n", " overseas_sales = [\n", " random.randrange(1000, 5000) for _ in range(len(dt))\n", " ],\n", " )\n", " .groupby(pd.Grouper(key=\"datetime\", freq=\"3W\"))\n", " .sum()\n", " .plot.line(\n", " figsize=(10, 5),\n", " ylabel=\"sales\",\n", " xlabel=\"\",\n", " ax=ax,\n", " )\n", ")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.10.4 ('sspai-100-hours-series-python')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" }, "vscode": { "interpreter": { "hash": "7a101baf08afe636412f97dd4a9fc2e65b6f84f0ec50413bf3e19b04a26b8ba6" } } }, "nbformat": 4, "nbformat_minor": 0 }