1049 lines
24 KiB
Plaintext
1049 lines
24 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"collapsed": true,
|
|
"pycharm": {
|
|
"name": "#%% md\n"
|
|
}
|
|
},
|
|
"source": [
|
|
"# Prerequisite"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pathlib\n",
|
|
"\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"# Relative path to the passenger CSV (resolved against the kernel's working directory).\n",
"DATA_PATH = pathlib.Path(\"../../data/titanic.csv\")\n",
"\n",
"# Render at most 20 columns when a DataFrame is displayed.\n",
"pd.set_option(\"display.max_columns\", 20)\n",
"\n",
"# Raw passenger table; later cells add derived columns to it and rebind the name.\n",
"data = pd.read_csv(DATA_PATH)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|
"RangeIndex: 891 entries, 0 to 890\n",
|
|
"Data columns (total 12 columns):\n",
|
|
" # Column Non-Null Count Dtype \n",
|
|
"--- ------ -------------- ----- \n",
|
|
" 0 PassengerId 891 non-null int64 \n",
|
|
" 1 Survived 891 non-null int64 \n",
|
|
" 2 Pclass 891 non-null int64 \n",
|
|
" 3 Name 891 non-null object \n",
|
|
" 4 Sex 891 non-null object \n",
|
|
" 5 Age 714 non-null float64\n",
|
|
" 6 SibSp 891 non-null int64 \n",
|
|
" 7 Parch 891 non-null int64 \n",
|
|
" 8 Ticket 891 non-null object \n",
|
|
" 9 Fare 891 non-null float64\n",
|
|
" 10 Cabin 204 non-null object \n",
|
|
" 11 Embarked 889 non-null object \n",
|
|
"dtypes: float64(2), int64(5), object(5)\n",
|
|
"memory usage: 83.7+ KB\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"data.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"PassengerId 0\n",
|
|
"Survived 0\n",
|
|
"Pclass 0\n",
|
|
"Name 0\n",
|
|
"Sex 0\n",
|
|
"Age 177\n",
|
|
"SibSp 0\n",
|
|
"Parch 0\n",
|
|
"Ticket 0\n",
|
|
"Fare 0\n",
|
|
"Cabin 687\n",
|
|
"Embarked 2\n",
|
|
"dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data.isna().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>PassengerId</th>\n",
|
|
" <th>Survived</th>\n",
|
|
" <th>Pclass</th>\n",
|
|
" <th>Name</th>\n",
|
|
" <th>Sex</th>\n",
|
|
" <th>Age</th>\n",
|
|
" <th>SibSp</th>\n",
|
|
" <th>Parch</th>\n",
|
|
" <th>Ticket</th>\n",
|
|
" <th>Fare</th>\n",
|
|
" <th>Cabin</th>\n",
|
|
" <th>Embarked</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>Braund, Mr. Owen Harris</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>22.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>A/5 21171</td>\n",
|
|
" <td>7.2500</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>S</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>38.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>PC 17599</td>\n",
|
|
" <td>71.2833</td>\n",
|
|
" <td>C85</td>\n",
|
|
" <td>C</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>3</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>Heikkinen, Miss. Laina</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>26.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>STON/O2. 3101282</td>\n",
|
|
" <td>7.9250</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>S</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>35.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>113803</td>\n",
|
|
" <td>53.1000</td>\n",
|
|
" <td>C123</td>\n",
|
|
" <td>S</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>Allen, Mr. William Henry</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>35.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>373450</td>\n",
|
|
" <td>8.0500</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>S</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" PassengerId Survived Pclass \\\n",
|
|
"0 1 0 3 \n",
|
|
"1 2 1 1 \n",
|
|
"2 3 1 3 \n",
|
|
"3 4 1 1 \n",
|
|
"4 5 0 3 \n",
|
|
"\n",
|
|
" Name Sex Age SibSp \\\n",
|
|
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
|
|
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
|
|
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
|
|
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
|
|
"4 Allen, Mr. William Henry male 35.0 0 \n",
|
|
"\n",
|
|
" Parch Ticket Fare Cabin Embarked \n",
|
|
"0 0 A/5 21171 7.2500 NaN S \n",
|
|
"1 0 PC 17599 71.2833 C85 C \n",
|
|
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
|
|
"3 0 113803 53.1000 C123 S \n",
|
|
"4 0 373450 8.0500 NaN S "
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%% md\n"
|
|
}
|
|
},
|
|
"source": [
|
|
"# 在原数据的基础上进一步挖掘信息"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Youth 451\n",
|
|
"Unknown 177\n",
|
|
"Middle-aged 128\n",
|
|
"Teenagers 66\n",
|
|
"Chilren 47\n",
|
|
"Elderly 22\n",
|
|
"Name: AgeCategory, dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Age Group\n",
|
|
"def classify_age(age):\n",
"    \"\"\"Map a numeric age to a coarse age-group label.\n",
"\n",
"    Bands are right-inclusive:\n",
"    (0, 6] Children, (6, 17] Teenagers, (17, 40] Youth,\n",
"    (40, 60] Middle-aged, (60, 100] Elderly.\n",
"\n",
"    Any other value -- NaN, a non-positive age, or an age above 100 --\n",
"    is labelled \"Unknown\".\n",
"    \"\"\"\n",
"    # Every comparison against NaN is False, so a missing age falls through\n",
"    # all of the bands below and reaches the final \"Unknown\" branch.\n",
"    if 0 < age <= 6:\n",
"        return \"Children\"\n",
"    elif 6 < age <= 17:\n",
"        return \"Teenagers\"\n",
"    elif 17 < age <= 40:\n",
"        return \"Youth\"\n",
"    elif 40 < age <= 60:\n",
"        return \"Middle-aged\"\n",
"    elif 60 < age <= 100:\n",
"        return \"Elderly\"\n",
"    else:\n",
"        return \"Unknown\"\n",
|
|
"\n",
|
|
"\n",
|
|
"data[\"AgeCategory\"] = data[\"Age\"].map(classify_age)\n",
|
|
"data[\"AgeCategory\"].value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"3 491\n",
|
|
"1 216\n",
|
|
"2 184\n",
|
|
"Name: Pclass, dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Pclass stats\n",
|
|
"data[\"Pclass\"].value_counts(sort=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"count 891.000000\n",
|
|
"mean 32.204208\n",
|
|
"std 49.693429\n",
|
|
"min 0.000000\n",
|
|
"25% 7.910400\n",
|
|
"50% 14.454200\n",
|
|
"75% 31.000000\n",
|
|
"max 512.329200\n",
|
|
"Name: Fare, dtype: float64"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Fare stats\n",
|
|
"data[\"Fare\"].describe()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>count</th>\n",
|
|
" <th>std</th>\n",
|
|
" <th>min</th>\n",
|
|
" <th>25%</th>\n",
|
|
" <th>50%</th>\n",
|
|
" <th>75%</th>\n",
|
|
" <th>max</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>Pclass</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>211.0</td>\n",
|
|
" <td>78.212155</td>\n",
|
|
" <td>5.0000</td>\n",
|
|
" <td>32.9104</td>\n",
|
|
" <td>61.9792</td>\n",
|
|
" <td>99.9625</td>\n",
|
|
" <td>512.3292</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>178.0</td>\n",
|
|
" <td>13.083169</td>\n",
|
|
" <td>10.5000</td>\n",
|
|
" <td>13.0000</td>\n",
|
|
" <td>15.0229</td>\n",
|
|
" <td>26.0000</td>\n",
|
|
" <td>73.5000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>487.0</td>\n",
|
|
" <td>11.760718</td>\n",
|
|
" <td>4.0125</td>\n",
|
|
" <td>7.7500</td>\n",
|
|
" <td>8.0500</td>\n",
|
|
" <td>15.5000</td>\n",
|
|
" <td>69.5500</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" count std min 25% 50% 75% max\n",
|
|
"Pclass \n",
|
|
"1 211.0 78.212155 5.0000 32.9104 61.9792 99.9625 512.3292\n",
|
|
"2 178.0 13.083169 10.5000 13.0000 15.0229 26.0000 73.5000\n",
|
|
"3 487.0 11.760718 4.0125 7.7500 8.0500 15.5000 69.5500"
|
|
]
|
|
},
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Per-class summary of Fare over the rows with Fare >= 1; the `mean` column\n",
"# of the describe() table is dropped before display.\n",
"grp = (\n",
"    data[data[\"Fare\"] >= 1]\n",
"    .groupby(\"Pclass\")[\"Fare\"]\n",
"    .describe()\n",
"    .drop(columns=\"mean\")\n",
")\n",
"grp"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"normal 443\n",
|
|
"middle 311\n",
|
|
"upper 122\n",
|
|
"Name: FareLevel, dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Bucket fares into three price bands: [0, 15] normal, (15, 60] middle,\n",
"# (60, 600] upper.\n",
"# `include_lowest=True` closes the first interval on the left so that a fare\n",
"# of exactly 0 lands in \"normal\"; with the default half-open (0, 15] bin\n",
"# those rows would silently get NaN as their FareLevel.\n",
"data[\"FareLevel\"] = pd.cut(\n",
"    data[\"Fare\"],\n",
"    bins=[0, 15, 60, 600],\n",
"    labels=[\"normal\", \"middle\", \"upper\"],\n",
"    include_lowest=True,\n",
")\n",
"data[\"FareLevel\"].value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%% md\n"
|
|
}
|
|
},
|
|
"source": [
|
|
"# 无用和缺失数据的处理"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',\n",
|
|
" 'Parch', 'Ticket', 'Fare', 'Embarked', 'AgeCategory', 'FareLevel'],\n",
|
|
" dtype='object')\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Cabin: drop the column (the NA counts earlier in this notebook show it\n",
"# missing for 687 of the 891 rows).\n",
"# `errors=\"ignore\"` keeps the cell re-runnable: `data` is rebound here, so a\n",
"# second execution would otherwise raise KeyError on the already-removed column.\n",
"data = data.drop(columns=\"Cabin\", errors=\"ignore\")\n",
"print(data.columns)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',\n",
|
|
" 'Parch', 'Fare', 'Embarked', 'AgeCategory', 'FareLevel'],\n",
|
|
" dtype='object')\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Ticket: drop the column.\n",
"# `errors=\"ignore\"` keeps the cell re-runnable: `data` is rebound here, so a\n",
"# second execution would otherwise raise KeyError on the already-removed column.\n",
"data = data.drop(columns=\"Ticket\", errors=\"ignore\")\n",
"print(data.columns)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" a b\n",
|
|
"0 1.0 NaN\n",
|
|
"1 NaN a\n",
|
|
"2 2.0 NaN\n",
|
|
"3 4.0 c\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"# Toy frame with gaps in both a numeric column (`a`) and a string column (`b`),\n",
"# used by the following cells to demonstrate `fillna`.\n",
"data_with_na = pd.DataFrame(\n",
"    {\n",
"        \"a\": [1, np.nan, 2, 4],\n",
"        \"b\": [np.nan, \"a\", np.nan, \"c\"],\n",
"    }\n",
")\n",
"print(data_with_na)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>a</th>\n",
|
|
" <th>b</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>a</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>2.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>4.0</td>\n",
|
|
" <td>c</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" a b\n",
|
|
"0 1.0 0\n",
|
|
"1 0.0 a\n",
|
|
"2 2.0 0\n",
|
|
"3 4.0 c"
|
|
]
|
|
},
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data_with_na.fillna(0)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>a</th>\n",
|
|
" <th>b</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>missing</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>a</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>2.0</td>\n",
|
|
" <td>missing</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>4.0</td>\n",
|
|
" <td>c</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" a b\n",
|
|
"0 1.0 missing\n",
|
|
"1 0.0 a\n",
|
|
"2 2.0 missing\n",
|
|
"3 4.0 c"
|
|
]
|
|
},
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data_with_na.fillna({\"a\": 0, \"b\": \"missing\"})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0 1.0\n",
|
|
"1 0.0\n",
|
|
"2 2.0\n",
|
|
"3 4.0\n",
|
|
"Name: a, dtype: float64"
|
|
]
|
|
},
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data_with_na[\"a\"].fillna(0)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0 missing\n",
|
|
"1 a\n",
|
|
"2 missing\n",
|
|
"3 c\n",
|
|
"Name: b, dtype: object"
|
|
]
|
|
},
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data_with_na[\"b\"].fillna(\"missing\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"the rows before `dropna()`: 891\n",
|
|
"the rows after `dropna()`: 889\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Drop the rows whose `Embarked` value is missing, reporting the row count\n",
"# before and after.\n",
"\n",
"print(f\"the rows before `dropna()`: {data.shape[0]}\")\n",
"data = data.dropna(axis=\"index\", how=\"any\", subset=[\"Embarked\"])\n",
"print(f\"the rows after `dropna()`: {data.shape[0]}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%% md\n"
|
|
}
|
|
},
|
|
"source": [
|
|
"# 脏数据和重复数据的处理"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Mr 517\n",
|
|
"Miss 181\n",
|
|
"Mrs 124\n",
|
|
"Master 40\n",
|
|
"Dr 7\n",
|
|
"Rev 6\n",
|
|
"Mlle 2\n",
|
|
"Major 2\n",
|
|
"Col 2\n",
|
|
"the Countess 1\n",
|
|
"Capt 1\n",
|
|
"Ms 1\n",
|
|
"Sir 1\n",
|
|
"Lady 1\n",
|
|
"Mme 1\n",
|
|
"Don 1\n",
|
|
"Jonkheer 1\n",
|
|
"Name: Title, dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 20,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Names have the shape \"<text>, <Title>. <rest>\" (e.g. \"Braund, Mr. Owen Harris\");\n",
"# only the `Title` capture group is kept as a new column.\n",
"# NOTE(review): the group named `FirstName` captures the text before the comma,\n",
"# which looks like the surname in the sample rows -- confirm before reusing it.\n",
"pat = r\"(?P<FirstName>.*), (?P<Title>.*?)\\. (?P<LastName>.*)\"\n",
"name_parts = data[\"Name\"].str.extract(pat, expand=True)\n",
"data[\"Title\"] = name_parts[\"Title\"]\n",
"data[\"Title\"].value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Fold `Mlle` and `Ms` into `Miss`, and `Mme` into `Mrs`; every other title\n",
"# is left as it is.\n",
"title_mapping = dict(Mlle=\"Miss\", Mme=\"Mrs\", Ms=\"Miss\")\n",
"data[\"Title\"] = data[\"Title\"].replace(to_replace=title_mapping)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Index(['Dr', 'Rev', 'Major', 'Col', 'Don', 'Lady', 'Sir', 'Capt',\n",
|
|
" 'the Countess', 'Jonkheer'],\n",
|
|
" dtype='object')\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Collect the titles that occur fewer than 10 times.\n",
"title_stats = data[\"Title\"].value_counts()\n",
"rare_mask = title_stats.lt(10)\n",
"others = title_stats.loc[rare_mask].index\n",
"print(others)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"data[\"Title\"] = data[\"Title\"].replace(others, \"Other\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Mr 517\n",
|
|
"Miss 184\n",
|
|
"Mrs 125\n",
|
|
"Master 40\n",
|
|
"Other 23\n",
|
|
"Name: Title, dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 24,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data[\"Title\"].value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"interpreter": {
|
|
"hash": "13977d4cc82dee5f9d9535ceb495bd0ab12a43c33c664e5f0d53c24cf634b67f"
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3.9.0 ('pandas-startup')",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0
|
|
}
|