{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import random\n", "from string import ascii_letters\n", "\n", "import pandas as pd\n", "\n", "random.seed(3.14)\n" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "# 数据库风格的聚合方式" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idx
01x1
12x2
23x3
\n", "
" ], "text/plain": [ " id x\n", "0 1 x1\n", "1 2 x2\n", "2 3 x3" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x = pd.DataFrame(\n", " {\n", " \"id\": [1, 2, 3],\n", " \"x\": [\"x1\", \"x2\", \"x3\"],\n", " }\n", ")\n", "x.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idy
01y1
12y3
23y4
33y5
44y6
\n", "
" ], "text/plain": [ " id y\n", "0 1 y1\n", "1 2 y3\n", "2 3 y4\n", "3 3 y5\n", "4 4 y6" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "y = pd.DataFrame(\n", " {\n", " \"id\": [1, 2, 3, 3, 4],\n", " \"y\": [\"y1\", \"y3\", \"y4\", \"y5\", \"y6\"],\n", " }\n", ")\n", "y.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idxy
01x1y1
12x2y3
23x3y4
33x3y5
\n", "
" ], "text/plain": [ " id x y\n", "0 1 x1 y1\n", "1 2 x2 y3\n", "2 3 x3 y4\n", "3 3 x3 y5" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.merge(y, on=\"id\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idxy
01x1y1
12x2y3
23x3y4
33x3y5
\n", "
" ], "text/plain": [ " id x y\n", "0 1 x1 y1\n", "1 2 x2 y3\n", "2 3 x3 y4\n", "3 3 x3 y5" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# default join way\n", "x.merge(y, on=\"id\", how=\"inner\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idxy
01x1y1
12x2y3
23x3y4
33x3y5
44NaNy6
\n", "
" ], "text/plain": [ " id x y\n", "0 1 x1 y1\n", "1 2 x2 y3\n", "2 3 x3 y4\n", "3 3 x3 y5\n", "4 4 NaN y6" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.merge(y, on=\"id\", how=\"outer\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idxy
01x1y1
12x2y3
23x3y4
33x3y5
\n", "
" ], "text/plain": [ " id x y\n", "0 1 x1 y1\n", "1 2 x2 y3\n", "2 3 x3 y4\n", "3 3 x3 y5" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.merge(y, on=\"id\", how=\"left\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idxy
01x1y1
12x2y3
23x3y4
33x3y5
44NaNy6
\n", "
" ], "text/plain": [ " id x y\n", "0 1 x1 y1\n", "1 2 x2 y3\n", "2 3 x3 y4\n", "3 3 x3 y5\n", "4 4 NaN y6" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.merge(y, on=\"id\", how=\"right\")" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "#### 关联条件" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "x[\"uid\"] = [10, 23, 40]\n", "x[\"gender\"] = [\"male\", \"female\", \"male\"]\n", "\n", "y[\"uid\"] = [10, 10, 41, 43, 23]\n", "y[\"sex\"] = [\"male\", None, None, \"female\", \"female\"]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idxuidgender
01x110male
12x223female
23x340male
\n", "
" ], "text/plain": [ " id x uid gender\n", "0 1 x1 10 male\n", "1 2 x2 23 female\n", "2 3 x3 40 male" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idyuidsex
01y110male
12y310None
23y441None
33y543female
44y623female
\n", "
" ], "text/plain": [ " id y uid sex\n", "0 1 y1 10 male\n", "1 2 y3 10 None\n", "2 3 y4 41 None\n", "3 3 y5 43 female\n", "4 4 y6 23 female" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idxuid_xgenderyuid_ysex
01x110maley110male
12x223femaley310None
23x340maley441None
33x340maley543female
\n", "
" ], "text/plain": [ " id x uid_x gender y uid_y sex\n", "0 1 x1 10 male y1 10 male\n", "1 2 x2 23 female y3 10 None\n", "2 3 x3 40 male y4 41 None\n", "3 3 x3 40 male y5 43 female" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.merge(y, on=\"id\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idxuidgenderysex
01x110maley1male
\n", "
" ], "text/plain": [ " id x uid gender y sex\n", "0 1 x1 10 male y1 male" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.merge(y, on=[\"id\", \"uid\"])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_xxuidgender
01x110male
12x223female
23x340male
\n", "
" ], "text/plain": [ " id_x x uid gender\n", "0 1 x1 10 male\n", "1 2 x2 23 female\n", "2 3 x3 40 male" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x = x.rename(columns={\"id\": \"id_x\"})\n", "x.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_xxuid_xgenderidyuid_ysex
01x110male1y110male
12x223female2y310None
23x340male3y441None
33x340male3y543female
\n", "
" ], "text/plain": [ " id_x x uid_x gender id y uid_y sex\n", "0 1 x1 10 male 1 y1 10 male\n", "1 2 x2 23 female 2 y3 10 None\n", "2 3 x3 40 male 3 y4 41 None\n", "3 3 x3 40 male 3 y5 43 female" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.merge(y, left_on=\"id_x\", right_on=\"id\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_xxuid_xgenderidyuid_ysex
01x110male1.0y110.0male
12x223femaleNaNNaNNaNNaN
23x340maleNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " id_x x uid_x gender id y uid_y sex\n", "0 1 x1 10 male 1.0 y1 10.0 male\n", "1 2 x2 23 female NaN NaN NaN NaN\n", "2 3 x3 40 male NaN NaN NaN NaN" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x.merge(y, left_on=[\"id_x\", \"gender\"], right_on=[\"id\", \"sex\"], how=\"left\")" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "### 类型不一致问题" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abdatetime
01O2021-01-01
13L2021-01-02
25k2021-01-03
36F2021-01-04
47Z2021-01-05
\n", "
" ], "text/plain": [ " a b datetime\n", "0 1 O 2021-01-01\n", "1 3 L 2021-01-02\n", "2 5 k 2021-01-03\n", "3 6 F 2021-01-04\n", "4 7 Z 2021-01-05" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left = pd.DataFrame(\n", " {\n", " \"a\": [1, 3, 5, 6, 7],\n", " \"b\": random.choices(ascii_letters, k=5),\n", " \"datetime\": pd.date_range(\"20210101\", periods=5, freq=\"D\"),\n", " }\n", ")\n", "left.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
acdatetime
01u2021-01-03
13t2021-01-04
25V2021-01-05
37x2021-01-06
49m2021-01-07
\n", "
" ], "text/plain": [ " a c datetime\n", "0 1 u 2021-01-03\n", "1 3 t 2021-01-04\n", "2 5 V 2021-01-05\n", "3 7 x 2021-01-06\n", "4 9 m 2021-01-07" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "right = pd.DataFrame(\n", " {\n", " \"a\": [\"1\", \"3\", \"5\", \"7\", \"9\"],\n", " \"c\": random.choices(ascii_letters, k=5),\n", " \"datetime\": pd.date_range(\"20210103\", periods=5, freq=\"D\").map(lambda v: str(v.date())),\n", " }\n", ")\n", "right.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# error\n", "# `a` in left is integer type, but in right is string.\n", "\n", "# left.merge(right, on=[\"a\"])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# error\n", "# `a` in left is datetime type, but in right is string.\n", "\n", "# left.merge(right, on=[\"datetime\"])" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# error\n", "# the type of keys in left isn't same as the right keys.\n", "\n", "# left.merge(right, on=[\"a\", \"datetime\"])" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abdatetime_xcdatetime_y
01O2021-01-01u2021-01-03
13L2021-01-02t2021-01-04
25k2021-01-03V2021-01-05
37Z2021-01-05x2021-01-06
\n", "
" ], "text/plain": [ " a b datetime_x c datetime_y\n", "0 1 O 2021-01-01 u 2021-01-03\n", "1 3 L 2021-01-02 t 2021-01-04\n", "2 5 k 2021-01-03 V 2021-01-05\n", "3 7 Z 2021-01-05 x 2021-01-06" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "right[\"a\"] = right[\"a\"].astype(int)\n", "left.merge(right, on=[\"a\"])" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
a_xbdatetimea_yc
05k2021-01-031u
16F2021-01-043t
27Z2021-01-055V
\n", "
" ], "text/plain": [ " a_x b datetime a_y c\n", "0 5 k 2021-01-03 1 u\n", "1 6 F 2021-01-04 3 t\n", "2 7 Z 2021-01-05 5 V" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "right[\"datetime\"] = pd.to_datetime(right[\"datetime\"])\n", "left.merge(right, on=[\"datetime\"])" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "整型与浮点型关联时的情况:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
acdatetime
01.0u2021-01-03
13.0t2021-01-04
25.0V2021-01-05
37.0x2021-01-06
49.0m2021-01-07
\n", "
" ], "text/plain": [ " a c datetime\n", "0 1.0 u 2021-01-03\n", "1 3.0 t 2021-01-04\n", "2 5.0 V 2021-01-05\n", "3 7.0 x 2021-01-06\n", "4 9.0 m 2021-01-07" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "right[\"a\"] = right[\"a\"].astype(float)\n", "right.head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abdatetime_xcdatetime_y
01O2021-01-01u2021-01-03
13L2021-01-02t2021-01-04
25k2021-01-03V2021-01-05
37Z2021-01-05x2021-01-06
\n", "
" ], "text/plain": [ " a b datetime_x c datetime_y\n", "0 1 O 2021-01-01 u 2021-01-03\n", "1 3 L 2021-01-02 t 2021-01-04\n", "2 5 k 2021-01-03 V 2021-01-05\n", "3 7 Z 2021-01-05 x 2021-01-06" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left.merge(right, on=[\"a\"])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "a int64\n", "b object\n", "datetime_x datetime64[ns]\n", "c object\n", "datetime_y datetime64[ns]\n", "dtype: object" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left.merge(right, on=[\"a\"]).dtypes" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
acdatetime
01.0u2021-01-03
13.1t2021-01-04
25.0V2021-01-05
37.4x2021-01-06
49.5m2021-01-07
\n", "
" ], "text/plain": [ " a c datetime\n", "0 1.0 u 2021-01-03\n", "1 3.1 t 2021-01-04\n", "2 5.0 V 2021-01-05\n", "3 7.4 x 2021-01-06\n", "4 9.5 m 2021-01-07" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "right[\"a\"] = [1.0, 3.1, 5.0, 7.4, 9.5]\n", "right.head()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/Bobot/.virtualenvs/sspai-100-hours-series-python/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1215: UserWarning: You are merging on int and float columns where the float values are not equal to their int representation.\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abdatetime_xcdatetime_y
01O2021-01-01u2021-01-03
15k2021-01-03V2021-01-05
\n", "
" ], "text/plain": [ " a b datetime_x c datetime_y\n", "0 1 O 2021-01-01 u 2021-01-03\n", "1 5 k 2021-01-03 V 2021-01-05" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left.merge(right, on=[\"a\"])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
a_xbdatetimea_yc
05k2021-01-031.0u
16F2021-01-043.1t
27Z2021-01-055.0V
\n", "
" ], "text/plain": [ " a_x b datetime a_y c\n", "0 5 k 2021-01-03 1.0 u\n", "1 6 F 2021-01-04 3.1 t\n", "2 7 Z 2021-01-05 5.0 V" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left.merge(right, on=[\"datetime\"])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/Bobot/.virtualenvs/sspai-100-hours-series-python/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1215: UserWarning: You are merging on int and float columns where the float values are not equal to their int representation.\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abdatetimecdatetime_right
01O2021-01-01u2021-01-03
15k2021-01-03V2021-01-05
\n", "
" ], "text/plain": [ " a b datetime c datetime_right\n", "0 1 O 2021-01-01 u 2021-01-03\n", "1 5 k 2021-01-03 V 2021-01-05" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left.merge(right, on=[\"a\"], suffixes=(\"\", \"_right\"))" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abdatetimec
05k2021-01-03u
16F2021-01-04t
27Z2021-01-05V
\n", "
" ], "text/plain": [ " a b datetime c\n", "0 5 k 2021-01-03 u\n", "1 6 F 2021-01-04 t\n", "2 7 Z 2021-01-05 V" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left.merge(right.drop(\"a\", axis=1), on=[\"datetime\"])" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } }, "source": [ "## join" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
letter_left
id
1r
3a
3q
6e
7A
\n", "
" ], "text/plain": [ " letter_left\n", "id \n", "1 r\n", "3 a\n", "3 q\n", "6 e\n", "7 A" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left = pd.DataFrame(\n", " {\n", " \"id\": [1, 3, 3, 6, 7],\n", " \"letter_left\": random.sample(ascii_letters, k=5),\n", " }\n", ").set_index(\"id\")\n", "left.head()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idletter_right
01J
13v
25e
37r
49t
\n", "
" ], "text/plain": [ " id letter_right\n", "0 1 J\n", "1 3 v\n", "2 5 e\n", "3 7 r\n", "4 9 t" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "right = pd.DataFrame(\n", " {\n", " \"id\": [\"1\", \"3\", \"5\", \"7\", \"9\"],\n", " \"letter_right\": random.choices(ascii_letters, k=5),\n", " }\n", ")\n", "\n", "right.head()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
letter_leftidletter_right
1r3v
3a7r
3q7r
6eNaNNaN
7ANaNNaN
\n", "
" ], "text/plain": [ " letter_left id letter_right\n", "1 r 3 v\n", "3 a 7 r\n", "3 q 7 r\n", "6 e NaN NaN\n", "7 A NaN NaN" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# default left join\n", "\n", "left.join(right)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
letter_leftidletter_right
1r3v
3a7r
3q7r
6eNaNNaN
7ANaNNaN
\n", "
" ], "text/plain": [ " letter_left id letter_right\n", "1 r 3 v\n", "3 a 7 r\n", "3 q 7 r\n", "6 e NaN NaN\n", "7 A NaN NaN" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left.join(right, how=\"left\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
letter
id
1r
3a
3q
6e
7A
\n", "
" ], "text/plain": [ " letter\n", "id \n", "1 r\n", "3 a\n", "3 q\n", "6 e\n", "7 A" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left = left.rename(columns={\"letter_left\": \"letter\"})\n", "left.head()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idletter
01J
13v
25e
37r
49t
\n", "
" ], "text/plain": [ " id letter\n", "0 1 J\n", "1 3 v\n", "2 5 e\n", "3 7 r\n", "4 9 t" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "right = right.rename(columns={\"letter_right\": \"letter\"})\n", "right.head()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
letter_leftidletter_right
1r3v
3a7r
3q7r
\n", "
" ], "text/plain": [ " letter_left id letter_right\n", "1 r 3 v\n", "3 a 7 r\n", "3 q 7 r" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left.join(right, how=\"inner\", lsuffix=\"_left\", rsuffix=\"_right\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.10.4 ('sspai-100-hours-series-python')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" }, "vscode": { "interpreter": { "hash": "7a101baf08afe636412f97dd4a9fc2e65b6f84f0ec50413bf3e19b04a26b8ba6" } } }, "nbformat": 4, "nbformat_minor": 0 }