Files
sspai-100-hours-series-python/code/14/database_style_combination.ipynb
2022-06-15 15:03:01 +08:00

2998 lines
72 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import random\n",
"from string import ascii_letters\n",
"\n",
"import pandas as pd\n",
"\n",
"random.seed(3.14)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"# 数据库风格的聚合方式"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>x</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>x2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id x\n",
"0 1 x1\n",
"1 2 x2\n",
"2 3 x3"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = pd.DataFrame(\n",
" {\n",
" \"id\": [1, 2, 3],\n",
" \"x\": [\"x1\", \"x2\", \"x3\"],\n",
" }\n",
")\n",
"x.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>y1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>y3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>y4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>y5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>y6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id y\n",
"0 1 y1\n",
"1 2 y3\n",
"2 3 y4\n",
"3 3 y5\n",
"4 4 y6"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"y = pd.DataFrame(\n",
" {\n",
" \"id\": [1, 2, 3, 3, 4],\n",
" \"y\": [\"y1\", \"y3\", \"y4\", \"y5\", \"y6\"],\n",
" }\n",
")\n",
"y.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" <td>y1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>x2</td>\n",
" <td>y3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>y4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>y5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id x y\n",
"0 1 x1 y1\n",
"1 2 x2 y3\n",
"2 3 x3 y4\n",
"3 3 x3 y5"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x.merge(y, on=\"id\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" <td>y1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>x2</td>\n",
" <td>y3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>y4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>y5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id x y\n",
"0 1 x1 y1\n",
"1 2 x2 y3\n",
"2 3 x3 y4\n",
"3 3 x3 y5"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# default join way\n",
"x.merge(y, on=\"id\", how=\"inner\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" <td>y1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>x2</td>\n",
" <td>y3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>y4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>y5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>y6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id x y\n",
"0 1 x1 y1\n",
"1 2 x2 y3\n",
"2 3 x3 y4\n",
"3 3 x3 y5\n",
"4 4 NaN y6"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x.merge(y, on=\"id\", how=\"outer\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" <td>y1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>x2</td>\n",
" <td>y3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>y4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>y5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id x y\n",
"0 1 x1 y1\n",
"1 2 x2 y3\n",
"2 3 x3 y4\n",
"3 3 x3 y5"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x.merge(y, on=\"id\", how=\"left\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" <td>y1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>x2</td>\n",
" <td>y3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>y4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>y5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>y6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id x y\n",
"0 1 x1 y1\n",
"1 2 x2 y3\n",
"2 3 x3 y4\n",
"3 3 x3 y5\n",
"4 4 NaN y6"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x.merge(y, on=\"id\", how=\"right\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"#### 关联条件"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"x[\"uid\"] = [10, 23, 40]\n",
"x[\"gender\"] = [\"male\", \"female\", \"male\"]\n",
"\n",
"y[\"uid\"] = [10, 10, 41, 43, 23]\n",
"y[\"sex\"] = [\"male\", None, None, \"female\", \"female\"]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>x</th>\n",
" <th>uid</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" <td>10</td>\n",
" <td>male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>x2</td>\n",
" <td>23</td>\n",
" <td>female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>40</td>\n",
" <td>male</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id x uid gender\n",
"0 1 x1 10 male\n",
"1 2 x2 23 female\n",
"2 3 x3 40 male"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>y</th>\n",
" <th>uid</th>\n",
" <th>sex</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>y1</td>\n",
" <td>10</td>\n",
" <td>male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>y3</td>\n",
" <td>10</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>y4</td>\n",
" <td>41</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>y5</td>\n",
" <td>43</td>\n",
" <td>female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>y6</td>\n",
" <td>23</td>\n",
" <td>female</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id y uid sex\n",
"0 1 y1 10 male\n",
"1 2 y3 10 None\n",
"2 3 y4 41 None\n",
"3 3 y5 43 female\n",
"4 4 y6 23 female"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>x</th>\n",
" <th>uid_x</th>\n",
" <th>gender</th>\n",
" <th>y</th>\n",
" <th>uid_y</th>\n",
" <th>sex</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" <td>10</td>\n",
" <td>male</td>\n",
" <td>y1</td>\n",
" <td>10</td>\n",
" <td>male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>x2</td>\n",
" <td>23</td>\n",
" <td>female</td>\n",
" <td>y3</td>\n",
" <td>10</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>40</td>\n",
" <td>male</td>\n",
" <td>y4</td>\n",
" <td>41</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>40</td>\n",
" <td>male</td>\n",
" <td>y5</td>\n",
" <td>43</td>\n",
" <td>female</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id x uid_x gender y uid_y sex\n",
"0 1 x1 10 male y1 10 male\n",
"1 2 x2 23 female y3 10 None\n",
"2 3 x3 40 male y4 41 None\n",
"3 3 x3 40 male y5 43 female"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x.merge(y, on=\"id\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>x</th>\n",
" <th>uid</th>\n",
" <th>gender</th>\n",
" <th>y</th>\n",
" <th>sex</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" <td>10</td>\n",
" <td>male</td>\n",
" <td>y1</td>\n",
" <td>male</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id x uid gender y sex\n",
"0 1 x1 10 male y1 male"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x.merge(y, on=[\"id\", \"uid\"])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>x</th>\n",
" <th>uid</th>\n",
" <th>gender</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" <td>10</td>\n",
" <td>male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>x2</td>\n",
" <td>23</td>\n",
" <td>female</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>40</td>\n",
" <td>male</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id_x x uid gender\n",
"0 1 x1 10 male\n",
"1 2 x2 23 female\n",
"2 3 x3 40 male"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = x.rename(columns={\"id\": \"id_x\"})\n",
"x.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>x</th>\n",
" <th>uid_x</th>\n",
" <th>gender</th>\n",
" <th>id</th>\n",
" <th>y</th>\n",
" <th>uid_y</th>\n",
" <th>sex</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" <td>10</td>\n",
" <td>male</td>\n",
" <td>1</td>\n",
" <td>y1</td>\n",
" <td>10</td>\n",
" <td>male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>x2</td>\n",
" <td>23</td>\n",
" <td>female</td>\n",
" <td>2</td>\n",
" <td>y3</td>\n",
" <td>10</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>40</td>\n",
" <td>male</td>\n",
" <td>3</td>\n",
" <td>y4</td>\n",
" <td>41</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>40</td>\n",
" <td>male</td>\n",
" <td>3</td>\n",
" <td>y5</td>\n",
" <td>43</td>\n",
" <td>female</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id_x x uid_x gender id y uid_y sex\n",
"0 1 x1 10 male 1 y1 10 male\n",
"1 2 x2 23 female 2 y3 10 None\n",
"2 3 x3 40 male 3 y4 41 None\n",
"3 3 x3 40 male 3 y5 43 female"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x.merge(y, left_on=\"id_x\", right_on=\"id\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>x</th>\n",
" <th>uid_x</th>\n",
" <th>gender</th>\n",
" <th>id</th>\n",
" <th>y</th>\n",
" <th>uid_y</th>\n",
" <th>sex</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>x1</td>\n",
" <td>10</td>\n",
" <td>male</td>\n",
" <td>1.0</td>\n",
" <td>y1</td>\n",
" <td>10.0</td>\n",
" <td>male</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>x2</td>\n",
" <td>23</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>x3</td>\n",
" <td>40</td>\n",
" <td>male</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id_x x uid_x gender id y uid_y sex\n",
"0 1 x1 10 male 1.0 y1 10.0 male\n",
"1 2 x2 23 female NaN NaN NaN NaN\n",
"2 3 x3 40 male NaN NaN NaN NaN"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x.merge(y, left_on=[\"id_x\", \"gender\"], right_on=[\"id\", \"sex\"], how=\"left\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"### 类型不一致问题"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>datetime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>O</td>\n",
" <td>2021-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>L</td>\n",
" <td>2021-01-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>k</td>\n",
" <td>2021-01-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>6</td>\n",
" <td>F</td>\n",
" <td>2021-01-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>7</td>\n",
" <td>Z</td>\n",
" <td>2021-01-05</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b datetime\n",
"0 1 O 2021-01-01\n",
"1 3 L 2021-01-02\n",
"2 5 k 2021-01-03\n",
"3 6 F 2021-01-04\n",
"4 7 Z 2021-01-05"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left = pd.DataFrame(\n",
" {\n",
" \"a\": [1, 3, 5, 6, 7],\n",
" \"b\": random.choices(ascii_letters, k=5),\n",
" \"datetime\": pd.date_range(\"20210101\", periods=5, freq=\"D\"),\n",
" }\n",
")\n",
"left.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>c</th>\n",
" <th>datetime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>u</td>\n",
" <td>2021-01-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>t</td>\n",
" <td>2021-01-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>V</td>\n",
" <td>2021-01-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7</td>\n",
" <td>x</td>\n",
" <td>2021-01-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>9</td>\n",
" <td>m</td>\n",
" <td>2021-01-07</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a c datetime\n",
"0 1 u 2021-01-03\n",
"1 3 t 2021-01-04\n",
"2 5 V 2021-01-05\n",
"3 7 x 2021-01-06\n",
"4 9 m 2021-01-07"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"right = pd.DataFrame(\n",
" {\n",
" \"a\": [\"1\", \"3\", \"5\", \"7\", \"9\"],\n",
" \"c\": random.choices(ascii_letters, k=5),\n",
" \"datetime\": pd.date_range(\"20210103\", periods=5, freq=\"D\").map(lambda v: str(v.date())),\n",
" }\n",
")\n",
"right.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# error\n",
"# `a` in left is integer type, but in right is string.\n",
"\n",
"# left.merge(right, on=[\"a\"])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# error\n",
"# `a` in left is datetime type, but in right is string.\n",
"\n",
"# left.merge(right, on=[\"datetime\"])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# error\n",
"# the type of keys in left isn't same as the right keys.\n",
"\n",
"# left.merge(right, on=[\"a\", \"datetime\"])"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>datetime_x</th>\n",
" <th>c</th>\n",
" <th>datetime_y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>O</td>\n",
" <td>2021-01-01</td>\n",
" <td>u</td>\n",
" <td>2021-01-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>L</td>\n",
" <td>2021-01-02</td>\n",
" <td>t</td>\n",
" <td>2021-01-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>k</td>\n",
" <td>2021-01-03</td>\n",
" <td>V</td>\n",
" <td>2021-01-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7</td>\n",
" <td>Z</td>\n",
" <td>2021-01-05</td>\n",
" <td>x</td>\n",
" <td>2021-01-06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b datetime_x c datetime_y\n",
"0 1 O 2021-01-01 u 2021-01-03\n",
"1 3 L 2021-01-02 t 2021-01-04\n",
"2 5 k 2021-01-03 V 2021-01-05\n",
"3 7 Z 2021-01-05 x 2021-01-06"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"right[\"a\"] = right[\"a\"].astype(int)\n",
"left.merge(right, on=[\"a\"])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a_x</th>\n",
" <th>b</th>\n",
" <th>datetime</th>\n",
" <th>a_y</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>k</td>\n",
" <td>2021-01-03</td>\n",
" <td>1</td>\n",
" <td>u</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6</td>\n",
" <td>F</td>\n",
" <td>2021-01-04</td>\n",
" <td>3</td>\n",
" <td>t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7</td>\n",
" <td>Z</td>\n",
" <td>2021-01-05</td>\n",
" <td>5</td>\n",
" <td>V</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a_x b datetime a_y c\n",
"0 5 k 2021-01-03 1 u\n",
"1 6 F 2021-01-04 3 t\n",
"2 7 Z 2021-01-05 5 V"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"right[\"datetime\"] = pd.to_datetime(right[\"datetime\"])\n",
"left.merge(right, on=[\"datetime\"])"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"整型与浮点型关联时的情况:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>c</th>\n",
" <th>datetime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>u</td>\n",
" <td>2021-01-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" <td>t</td>\n",
" <td>2021-01-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5.0</td>\n",
" <td>V</td>\n",
" <td>2021-01-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7.0</td>\n",
" <td>x</td>\n",
" <td>2021-01-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>9.0</td>\n",
" <td>m</td>\n",
" <td>2021-01-07</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a c datetime\n",
"0 1.0 u 2021-01-03\n",
"1 3.0 t 2021-01-04\n",
"2 5.0 V 2021-01-05\n",
"3 7.0 x 2021-01-06\n",
"4 9.0 m 2021-01-07"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"right[\"a\"] = right[\"a\"].astype(float)\n",
"right.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>datetime_x</th>\n",
" <th>c</th>\n",
" <th>datetime_y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>O</td>\n",
" <td>2021-01-01</td>\n",
" <td>u</td>\n",
" <td>2021-01-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>L</td>\n",
" <td>2021-01-02</td>\n",
" <td>t</td>\n",
" <td>2021-01-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>k</td>\n",
" <td>2021-01-03</td>\n",
" <td>V</td>\n",
" <td>2021-01-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7</td>\n",
" <td>Z</td>\n",
" <td>2021-01-05</td>\n",
" <td>x</td>\n",
" <td>2021-01-06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b datetime_x c datetime_y\n",
"0 1 O 2021-01-01 u 2021-01-03\n",
"1 3 L 2021-01-02 t 2021-01-04\n",
"2 5 k 2021-01-03 V 2021-01-05\n",
"3 7 Z 2021-01-05 x 2021-01-06"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left.merge(right, on=[\"a\"])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a int64\n",
"b object\n",
"datetime_x datetime64[ns]\n",
"c object\n",
"datetime_y datetime64[ns]\n",
"dtype: object"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left.merge(right, on=[\"a\"]).dtypes"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>c</th>\n",
" <th>datetime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>u</td>\n",
" <td>2021-01-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.1</td>\n",
" <td>t</td>\n",
" <td>2021-01-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5.0</td>\n",
" <td>V</td>\n",
" <td>2021-01-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7.4</td>\n",
" <td>x</td>\n",
" <td>2021-01-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>9.5</td>\n",
" <td>m</td>\n",
" <td>2021-01-07</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a c datetime\n",
"0 1.0 u 2021-01-03\n",
"1 3.1 t 2021-01-04\n",
"2 5.0 V 2021-01-05\n",
"3 7.4 x 2021-01-06\n",
"4 9.5 m 2021-01-07"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"right[\"a\"] = [1.0, 3.1, 5.0, 7.4, 9.5]\n",
"right.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/Bobot/.virtualenvs/sspai-100-hours-series-python/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1215: UserWarning: You are merging on int and float columns where the float values are not equal to their int representation.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>datetime_x</th>\n",
" <th>c</th>\n",
" <th>datetime_y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>O</td>\n",
" <td>2021-01-01</td>\n",
" <td>u</td>\n",
" <td>2021-01-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>k</td>\n",
" <td>2021-01-03</td>\n",
" <td>V</td>\n",
" <td>2021-01-05</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b datetime_x c datetime_y\n",
"0 1 O 2021-01-01 u 2021-01-03\n",
"1 5 k 2021-01-03 V 2021-01-05"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left.merge(right, on=[\"a\"])"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a_x</th>\n",
" <th>b</th>\n",
" <th>datetime</th>\n",
" <th>a_y</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>k</td>\n",
" <td>2021-01-03</td>\n",
" <td>1.0</td>\n",
" <td>u</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6</td>\n",
" <td>F</td>\n",
" <td>2021-01-04</td>\n",
" <td>3.1</td>\n",
" <td>t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7</td>\n",
" <td>Z</td>\n",
" <td>2021-01-05</td>\n",
" <td>5.0</td>\n",
" <td>V</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a_x b datetime a_y c\n",
"0 5 k 2021-01-03 1.0 u\n",
"1 6 F 2021-01-04 3.1 t\n",
"2 7 Z 2021-01-05 5.0 V"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left.merge(right, on=[\"datetime\"])"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/Bobot/.virtualenvs/sspai-100-hours-series-python/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1215: UserWarning: You are merging on int and float columns where the float values are not equal to their int representation.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>datetime</th>\n",
" <th>c</th>\n",
" <th>datetime_right</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>O</td>\n",
" <td>2021-01-01</td>\n",
" <td>u</td>\n",
" <td>2021-01-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5</td>\n",
" <td>k</td>\n",
" <td>2021-01-03</td>\n",
" <td>V</td>\n",
" <td>2021-01-05</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b datetime c datetime_right\n",
"0 1 O 2021-01-01 u 2021-01-03\n",
"1 5 k 2021-01-03 V 2021-01-05"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left.merge(right, on=[\"a\"], suffixes=(\"\", \"_right\"))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>datetime</th>\n",
" <th>c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>k</td>\n",
" <td>2021-01-03</td>\n",
" <td>u</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6</td>\n",
" <td>F</td>\n",
" <td>2021-01-04</td>\n",
" <td>t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7</td>\n",
" <td>Z</td>\n",
" <td>2021-01-05</td>\n",
" <td>V</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b datetime c\n",
"0 5 k 2021-01-03 u\n",
"1 6 F 2021-01-04 t\n",
"2 7 Z 2021-01-05 V"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left.merge(right.drop(\"a\", axis=1), on=[\"datetime\"])"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## join"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>letter_left</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>r</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>q</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>A</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" letter_left\n",
"id \n",
"1 r\n",
"3 a\n",
"3 q\n",
"6 e\n",
"7 A"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left = pd.DataFrame(\n",
" {\n",
" \"id\": [1, 3, 3, 6, 7],\n",
" \"letter_left\": random.sample(ascii_letters, k=5),\n",
" }\n",
").set_index(\"id\")\n",
"left.head()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>letter_right</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>J</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>v</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7</td>\n",
" <td>r</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>9</td>\n",
" <td>t</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id letter_right\n",
"0 1 J\n",
"1 3 v\n",
"2 5 e\n",
"3 7 r\n",
"4 9 t"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"right = pd.DataFrame(\n",
" {\n",
" \"id\": [\"1\", \"3\", \"5\", \"7\", \"9\"],\n",
" \"letter_right\": random.choices(ascii_letters, k=5),\n",
" }\n",
")\n",
"\n",
"right.head()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>letter_left</th>\n",
" <th>id</th>\n",
" <th>letter_right</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>r</td>\n",
" <td>3</td>\n",
" <td>v</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>a</td>\n",
" <td>7</td>\n",
" <td>r</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>q</td>\n",
" <td>7</td>\n",
" <td>r</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>e</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>A</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" letter_left id letter_right\n",
"1 r 3 v\n",
"3 a 7 r\n",
"3 q 7 r\n",
"6 e NaN NaN\n",
"7 A NaN NaN"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# default left join\n",
"\n",
"left.join(right)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>letter_left</th>\n",
" <th>id</th>\n",
" <th>letter_right</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>r</td>\n",
" <td>3</td>\n",
" <td>v</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>a</td>\n",
" <td>7</td>\n",
" <td>r</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>q</td>\n",
" <td>7</td>\n",
" <td>r</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>e</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>A</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" letter_left id letter_right\n",
"1 r 3 v\n",
"3 a 7 r\n",
"3 q 7 r\n",
"6 e NaN NaN\n",
"7 A NaN NaN"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left.join(right, how=\"left\")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>letter</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>r</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>q</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>A</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" letter\n",
"id \n",
"1 r\n",
"3 a\n",
"3 q\n",
"6 e\n",
"7 A"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left = left.rename(columns={\"letter_left\": \"letter\"})\n",
"left.head()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>letter</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>J</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>v</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7</td>\n",
" <td>r</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>9</td>\n",
" <td>t</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id letter\n",
"0 1 J\n",
"1 3 v\n",
"2 5 e\n",
"3 7 r\n",
"4 9 t"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"right = right.rename(columns={\"letter_right\": \"letter\"})\n",
"right.head()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>letter_left</th>\n",
" <th>id</th>\n",
" <th>letter_right</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>r</td>\n",
" <td>3</td>\n",
" <td>v</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>a</td>\n",
" <td>7</td>\n",
" <td>r</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>q</td>\n",
" <td>7</td>\n",
" <td>r</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" letter_left id letter_right\n",
"1 r 3 v\n",
"3 a 7 r\n",
"3 q 7 r"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left.join(right, how=\"inner\", lsuffix=\"_left\", rsuffix=\"_right\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.4 ('sspai-100-hours-series-python')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"vscode": {
"interpreter": {
"hash": "7a101baf08afe636412f97dd4a9fc2e65b6f84f0ec50413bf3e19b04a26b8ba6"
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}