diff --git a/code/14/concat_style_combination.ipynb b/code/14/concat_style_combination.ipynb new file mode 100644 index 0000000..ad467c8 --- /dev/null +++ b/code/14/concat_style_combination.ipynb @@ -0,0 +1,911 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# concat" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "s1 = pd.Series([\"red\", \"black\"], name=\"color\")\n", + "s2 = pd.Series([\"white\", \"gray\"], name=\"colorname\")\n", + "df1 = pd.DataFrame(\n", + " {\n", + " \"color\": [\"red\", \"orange\", \"blue\", \"green\"],\n", + " }\n", + ")\n", + "df2 = pd.DataFrame(\n", + " {\n", + " \"color\": [\"black\", \"white\", \"yellow\", \"purple\"],\n", + " \"shape\": [\"circle\", \"square\", \"triangle\", \"star\"],\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 red\n", + "1 black\n", + "0 white\n", + "1 gray\n", + "dtype: object" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "pd.concat([s1, s2])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
colorshape
0redNaN
1orangeNaN
2blueNaN
3greenNaN
0blackcircle
1whitesquare
2yellowtriangle
3purplestar
\n", + "
" + ], + "text/plain": [ + " color shape\n", + "0 red NaN\n", + "1 orange NaN\n", + "2 blue NaN\n", + "3 green NaN\n", + "0 black circle\n", + "1 white square\n", + "2 yellow triangle\n", + "3 purple star" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
colorshape
1orangeNaN
1whitesquare
3greenNaN
3purplestar
\n", + "
" + ], + "text/plain": [ + " color shape\n", + "1 orange NaN\n", + "1 white square\n", + "3 green NaN\n", + "3 purple star" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2]).loc[[1,3], :]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
colorshape
1orangeNaN
3greenNaN
\n", + "
" + ], + "text/plain": [ + " color shape\n", + "1 orange NaN\n", + "3 green NaN" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2]).iloc[[1,3], :]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
colorshape
1orangeNaN
3greenNaN
\n", + "
" + ], + "text/plain": [ + " color shape\n", + "1 orange NaN\n", + "3 green NaN" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2], ignore_index=True).loc[[1,3], :]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
colorshape
1orangeNaN
3greenNaN
\n", + "
" + ], + "text/plain": [ + " color shape\n", + "1 orange NaN\n", + "3 green NaN" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2], ignore_index=True).iloc[[1,3], :]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
colorshape
df10redNaN
1orangeNaN
2blueNaN
3greenNaN
df20blackcircle
1whitesquare
2yellowtriangle
3purplestar
\n", + "
" + ], + "text/plain": [ + " color shape\n", + "df1 0 red NaN\n", + " 1 orange NaN\n", + " 2 blue NaN\n", + " 3 green NaN\n", + "df2 0 black circle\n", + " 1 white square\n", + " 2 yellow triangle\n", + " 3 purple star" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2], keys=[\"df1\", \"df2\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "MultiIndex([('df1', 0),\n", + " ('df1', 1),\n", + " ('df1', 2),\n", + " ('df1', 3),\n", + " ('df2', 0),\n", + " ('df2', 1),\n", + " ('df2', 2),\n", + " ('df2', 3)],\n", + " )" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2], keys=[\"df1\", \"df2\"]).index" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
colorshape
df11orangeNaN
3greenNaN
\n", + "
" + ], + "text/plain": [ + " color shape\n", + "df1 1 orange NaN\n", + " 3 green NaN" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2], keys=[\"df1\", \"df2\"]).loc[('df1', [1,3]), :]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
color0
0redNaN
1orangeNaN
2blueNaN
3greenNaN
0NaNwhite
1NaNgray
\n", + "
" + ], + "text/plain": [ + " color 0\n", + "0 red NaN\n", + "1 orange NaN\n", + "2 blue NaN\n", + "3 green NaN\n", + "0 NaN white\n", + "1 NaN gray" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, s2]) # default axis=0" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
colorcolorname
0redwhite
1orangegray
2blueNaN
3greenNaN
\n", + "
" + ], + "text/plain": [ + " color colorname\n", + "0 red white\n", + "1 orange gray\n", + "2 blue NaN\n", + "3 green NaN" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, s2], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
colorcolorname
0redwhite
1orangegray
2blueNaN
3greenNaN
\n", + "
" + ], + "text/plain": [ + " color colorname\n", + "0 red white\n", + "1 orange gray\n", + "2 blue NaN\n", + "3 green NaN" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, s2], axis=\"columns\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.4 ('sspai-100-hours-series-python')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "vscode": { + "interpreter": { + "hash": "7a101baf08afe636412f97dd4a9fc2e65b6f84f0ec50413bf3e19b04a26b8ba6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/code/14/database_style_combination.ipynb b/code/14/database_style_combination.ipynb new file mode 100644 index 0000000..469bf77 --- /dev/null +++ b/code/14/database_style_combination.ipynb @@ -0,0 +1,2997 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import random\n", + "from string import ascii_letters\n", + "\n", + "import pandas as pd\n", + "\n", + "random.seed(3.14)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# 数据库风格的聚合方式" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idx
01x1
12x2
23x3
\n", + "
" + ], + "text/plain": [ + " id x\n", + "0 1 x1\n", + "1 2 x2\n", + "2 3 x3" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x = pd.DataFrame(\n", + " {\n", + " \"id\": [1, 2, 3],\n", + " \"x\": [\"x1\", \"x2\", \"x3\"],\n", + " }\n", + ")\n", + "x.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idy
01y1
12y3
23y4
33y5
44y6
\n", + "
" + ], + "text/plain": [ + " id y\n", + "0 1 y1\n", + "1 2 y3\n", + "2 3 y4\n", + "3 3 y5\n", + "4 4 y6" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "y = pd.DataFrame(\n", + " {\n", + " \"id\": [1, 2, 3, 3, 4],\n", + " \"y\": [\"y1\", \"y3\", \"y4\", \"y5\", \"y6\"],\n", + " }\n", + ")\n", + "y.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idxy
01x1y1
12x2y3
23x3y4
33x3y5
\n", + "
" + ], + "text/plain": [ + " id x y\n", + "0 1 x1 y1\n", + "1 2 x2 y3\n", + "2 3 x3 y4\n", + "3 3 x3 y5" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.merge(y, on=\"id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idxy
01x1y1
12x2y3
23x3y4
33x3y5
\n", + "
" + ], + "text/plain": [ + " id x y\n", + "0 1 x1 y1\n", + "1 2 x2 y3\n", + "2 3 x3 y4\n", + "3 3 x3 y5" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# default join way\n", + "x.merge(y, on=\"id\", how=\"inner\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idxy
01x1y1
12x2y3
23x3y4
33x3y5
44NaNy6
\n", + "
" + ], + "text/plain": [ + " id x y\n", + "0 1 x1 y1\n", + "1 2 x2 y3\n", + "2 3 x3 y4\n", + "3 3 x3 y5\n", + "4 4 NaN y6" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.merge(y, on=\"id\", how=\"outer\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idxy
01x1y1
12x2y3
23x3y4
33x3y5
\n", + "
" + ], + "text/plain": [ + " id x y\n", + "0 1 x1 y1\n", + "1 2 x2 y3\n", + "2 3 x3 y4\n", + "3 3 x3 y5" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.merge(y, on=\"id\", how=\"left\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idxy
01x1y1
12x2y3
23x3y4
33x3y5
44NaNy6
\n", + "
" + ], + "text/plain": [ + " id x y\n", + "0 1 x1 y1\n", + "1 2 x2 y3\n", + "2 3 x3 y4\n", + "3 3 x3 y5\n", + "4 4 NaN y6" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.merge(y, on=\"id\", how=\"right\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "#### 关联条件" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "x[\"uid\"] = [10, 23, 40]\n", + "x[\"gender\"] = [\"male\", \"female\", \"male\"]\n", + "\n", + "y[\"uid\"] = [10, 10, 41, 43, 23]\n", + "y[\"sex\"] = [\"male\", None, None, \"female\", \"female\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idxuidgender
01x110male
12x223female
23x340male
\n", + "
" + ], + "text/plain": [ + " id x uid gender\n", + "0 1 x1 10 male\n", + "1 2 x2 23 female\n", + "2 3 x3 40 male" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idyuidsex
01y110male
12y310None
23y441None
33y543female
44y623female
\n", + "
" + ], + "text/plain": [ + " id y uid sex\n", + "0 1 y1 10 male\n", + "1 2 y3 10 None\n", + "2 3 y4 41 None\n", + "3 3 y5 43 female\n", + "4 4 y6 23 female" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idxuid_xgenderyuid_ysex
01x110maley110male
12x223femaley310None
23x340maley441None
33x340maley543female
\n", + "
" + ], + "text/plain": [ + " id x uid_x gender y uid_y sex\n", + "0 1 x1 10 male y1 10 male\n", + "1 2 x2 23 female y3 10 None\n", + "2 3 x3 40 male y4 41 None\n", + "3 3 x3 40 male y5 43 female" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.merge(y, on=\"id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idxuidgenderysex
01x110maley1male
\n", + "
" + ], + "text/plain": [ + " id x uid gender y sex\n", + "0 1 x1 10 male y1 male" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.merge(y, on=[\"id\", \"uid\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_xxuidgender
01x110male
12x223female
23x340male
\n", + "
" + ], + "text/plain": [ + " id_x x uid gender\n", + "0 1 x1 10 male\n", + "1 2 x2 23 female\n", + "2 3 x3 40 male" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x = x.rename(columns={\"id\": \"id_x\"})\n", + "x.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_xxuid_xgenderidyuid_ysex
01x110male1y110male
12x223female2y310None
23x340male3y441None
33x340male3y543female
\n", + "
" + ], + "text/plain": [ + " id_x x uid_x gender id y uid_y sex\n", + "0 1 x1 10 male 1 y1 10 male\n", + "1 2 x2 23 female 2 y3 10 None\n", + "2 3 x3 40 male 3 y4 41 None\n", + "3 3 x3 40 male 3 y5 43 female" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.merge(y, left_on=\"id_x\", right_on=\"id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_xxuid_xgenderidyuid_ysex
01x110male1.0y110.0male
12x223femaleNaNNaNNaNNaN
23x340maleNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " id_x x uid_x gender id y uid_y sex\n", + "0 1 x1 10 male 1.0 y1 10.0 male\n", + "1 2 x2 23 female NaN NaN NaN NaN\n", + "2 3 x3 40 male NaN NaN NaN NaN" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x.merge(y, left_on=[\"id_x\", \"gender\"], right_on=[\"id\", \"sex\"], how=\"left\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### 类型不一致问题" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abdatetime
01O2021-01-01
13L2021-01-02
25k2021-01-03
36F2021-01-04
47Z2021-01-05
\n", + "
" + ], + "text/plain": [ + " a b datetime\n", + "0 1 O 2021-01-01\n", + "1 3 L 2021-01-02\n", + "2 5 k 2021-01-03\n", + "3 6 F 2021-01-04\n", + "4 7 Z 2021-01-05" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left = pd.DataFrame(\n", + " {\n", + " \"a\": [1, 3, 5, 6, 7],\n", + " \"b\": random.choices(ascii_letters, k=5),\n", + " \"datetime\": pd.date_range(\"20210101\", periods=5, freq=\"D\"),\n", + " }\n", + ")\n", + "left.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
acdatetime
01u2021-01-03
13t2021-01-04
25V2021-01-05
37x2021-01-06
49m2021-01-07
\n", + "
" + ], + "text/plain": [ + " a c datetime\n", + "0 1 u 2021-01-03\n", + "1 3 t 2021-01-04\n", + "2 5 V 2021-01-05\n", + "3 7 x 2021-01-06\n", + "4 9 m 2021-01-07" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "right = pd.DataFrame(\n", + " {\n", + " \"a\": [\"1\", \"3\", \"5\", \"7\", \"9\"],\n", + " \"c\": random.choices(ascii_letters, k=5),\n", + " \"datetime\": pd.date_range(\"20210103\", periods=5, freq=\"D\").map(lambda v: str(v.date())),\n", + " }\n", + ")\n", + "right.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# error\n", + "# `a` in left is integer type, but in right is string.\n", + "\n", + "# left.merge(right, on=[\"a\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# error\n", + "# `a` in left is datetime type, but in right is string.\n", + "\n", + "# left.merge(right, on=[\"datetime\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# error\n", + "# the type of keys in left isn't same as the right keys.\n", + "\n", + "# left.merge(right, on=[\"a\", \"datetime\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abdatetime_xcdatetime_y
01O2021-01-01u2021-01-03
13L2021-01-02t2021-01-04
25k2021-01-03V2021-01-05
37Z2021-01-05x2021-01-06
\n", + "
" + ], + "text/plain": [ + " a b datetime_x c datetime_y\n", + "0 1 O 2021-01-01 u 2021-01-03\n", + "1 3 L 2021-01-02 t 2021-01-04\n", + "2 5 k 2021-01-03 V 2021-01-05\n", + "3 7 Z 2021-01-05 x 2021-01-06" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "right[\"a\"] = right[\"a\"].astype(int)\n", + "left.merge(right, on=[\"a\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
a_xbdatetimea_yc
05k2021-01-031u
16F2021-01-043t
27Z2021-01-055V
\n", + "
" + ], + "text/plain": [ + " a_x b datetime a_y c\n", + "0 5 k 2021-01-03 1 u\n", + "1 6 F 2021-01-04 3 t\n", + "2 7 Z 2021-01-05 5 V" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "right[\"datetime\"] = pd.to_datetime(right[\"datetime\"])\n", + "left.merge(right, on=[\"datetime\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "整型与浮点型关联时的情况:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
acdatetime
01.0u2021-01-03
13.0t2021-01-04
25.0V2021-01-05
37.0x2021-01-06
49.0m2021-01-07
\n", + "
" + ], + "text/plain": [ + " a c datetime\n", + "0 1.0 u 2021-01-03\n", + "1 3.0 t 2021-01-04\n", + "2 5.0 V 2021-01-05\n", + "3 7.0 x 2021-01-06\n", + "4 9.0 m 2021-01-07" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "right[\"a\"] = right[\"a\"].astype(float)\n", + "right.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abdatetime_xcdatetime_y
01O2021-01-01u2021-01-03
13L2021-01-02t2021-01-04
25k2021-01-03V2021-01-05
37Z2021-01-05x2021-01-06
\n", + "
" + ], + "text/plain": [ + " a b datetime_x c datetime_y\n", + "0 1 O 2021-01-01 u 2021-01-03\n", + "1 3 L 2021-01-02 t 2021-01-04\n", + "2 5 k 2021-01-03 V 2021-01-05\n", + "3 7 Z 2021-01-05 x 2021-01-06" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left.merge(right, on=[\"a\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "a int64\n", + "b object\n", + "datetime_x datetime64[ns]\n", + "c object\n", + "datetime_y datetime64[ns]\n", + "dtype: object" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left.merge(right, on=[\"a\"]).dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
acdatetime
01.0u2021-01-03
13.1t2021-01-04
25.0V2021-01-05
37.4x2021-01-06
49.5m2021-01-07
\n", + "
" + ], + "text/plain": [ + " a c datetime\n", + "0 1.0 u 2021-01-03\n", + "1 3.1 t 2021-01-04\n", + "2 5.0 V 2021-01-05\n", + "3 7.4 x 2021-01-06\n", + "4 9.5 m 2021-01-07" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "right[\"a\"] = [1.0, 3.1, 5.0, 7.4, 9.5]\n", + "right.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/Bobot/.virtualenvs/sspai-100-hours-series-python/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1215: UserWarning: You are merging on int and float columns where the float values are not equal to their int representation.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abdatetime_xcdatetime_y
01O2021-01-01u2021-01-03
15k2021-01-03V2021-01-05
\n", + "
" + ], + "text/plain": [ + " a b datetime_x c datetime_y\n", + "0 1 O 2021-01-01 u 2021-01-03\n", + "1 5 k 2021-01-03 V 2021-01-05" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left.merge(right, on=[\"a\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
a_xbdatetimea_yc
05k2021-01-031.0u
16F2021-01-043.1t
27Z2021-01-055.0V
\n", + "
" + ], + "text/plain": [ + " a_x b datetime a_y c\n", + "0 5 k 2021-01-03 1.0 u\n", + "1 6 F 2021-01-04 3.1 t\n", + "2 7 Z 2021-01-05 5.0 V" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left.merge(right, on=[\"datetime\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/Bobot/.virtualenvs/sspai-100-hours-series-python/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1215: UserWarning: You are merging on int and float columns where the float values are not equal to their int representation.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abdatetimecdatetime_right
01O2021-01-01u2021-01-03
15k2021-01-03V2021-01-05
\n", + "
" + ], + "text/plain": [ + " a b datetime c datetime_right\n", + "0 1 O 2021-01-01 u 2021-01-03\n", + "1 5 k 2021-01-03 V 2021-01-05" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left.merge(right, on=[\"a\"], suffixes=(\"\", \"_right\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abdatetimec
05k2021-01-03u
16F2021-01-04t
27Z2021-01-05V
\n", + "
" + ], + "text/plain": [ + " a b datetime c\n", + "0 5 k 2021-01-03 u\n", + "1 6 F 2021-01-04 t\n", + "2 7 Z 2021-01-05 V" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left.merge(right.drop(\"a\", axis=1), on=[\"datetime\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## join" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
letter_left
id
1r
3a
3q
6e
7A
\n", + "
" + ], + "text/plain": [ + " letter_left\n", + "id \n", + "1 r\n", + "3 a\n", + "3 q\n", + "6 e\n", + "7 A" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left = pd.DataFrame(\n", + " {\n", + " \"id\": [1, 3, 3, 6, 7],\n", + " \"letter_left\": random.sample(ascii_letters, k=5),\n", + " }\n", + ").set_index(\"id\")\n", + "left.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idletter_right
01J
13v
25e
37r
49t
\n", + "
" + ], + "text/plain": [ + " id letter_right\n", + "0 1 J\n", + "1 3 v\n", + "2 5 e\n", + "3 7 r\n", + "4 9 t" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "right = pd.DataFrame(\n", + " {\n", + " \"id\": [\"1\", \"3\", \"5\", \"7\", \"9\"],\n", + " \"letter_right\": random.choices(ascii_letters, k=5),\n", + " }\n", + ")\n", + "\n", + "right.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
letter_leftidletter_right
1r3v
3a7r
3q7r
6eNaNNaN
7ANaNNaN
\n", + "
" + ], + "text/plain": [ + " letter_left id letter_right\n", + "1 r 3 v\n", + "3 a 7 r\n", + "3 q 7 r\n", + "6 e NaN NaN\n", + "7 A NaN NaN" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# default left join\n", + "\n", + "left.join(right)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
letter_leftidletter_right
1r3v
3a7r
3q7r
6eNaNNaN
7ANaNNaN
\n", + "
" + ], + "text/plain": [ + " letter_left id letter_right\n", + "1 r 3 v\n", + "3 a 7 r\n", + "3 q 7 r\n", + "6 e NaN NaN\n", + "7 A NaN NaN" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left.join(right, how=\"left\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
letter
id
1r
3a
3q
6e
7A
\n", + "
" + ], + "text/plain": [ + " letter\n", + "id \n", + "1 r\n", + "3 a\n", + "3 q\n", + "6 e\n", + "7 A" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left = left.rename(columns={\"letter_left\": \"letter\"})\n", + "left.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idletter
01J
13v
25e
37r
49t
\n", + "
" + ], + "text/plain": [ + " id letter\n", + "0 1 J\n", + "1 3 v\n", + "2 5 e\n", + "3 7 r\n", + "4 9 t" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "right = right.rename(columns={\"letter_right\": \"letter\"})\n", + "right.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
letter_leftidletter_right
1r3v
3a7r
3q7r
\n", + "
" + ], + "text/plain": [ + " letter_left id letter_right\n", + "1 r 3 v\n", + "3 a 7 r\n", + "3 q 7 r" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left.join(right, how=\"inner\", lsuffix=\"_left\", rsuffix=\"_right\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.4 ('sspai-100-hours-series-python')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "vscode": { + "interpreter": { + "hash": "7a101baf08afe636412f97dd4a9fc2e65b6f84f0ec50413bf3e19b04a26b8ba6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}