feat(projects): 新增Scikit-learn综合案例示例代码及样例数据
This commit is contained in:
20
data/spaceship-titanic/README.md
Normal file
20
data/spaceship-titanic/README.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# 数据说明
|
||||
|
||||
项目介绍及数据来源见 Kaggle:[Spaceship Titanic](https://www.kaggle.com/competitions/spaceship-titanic/overview)。
|
||||
|
||||
数据详情:
|
||||
|
||||
- `data.csv` 训练集样本量约为 8700 人;
|
||||
- 数据字段详情:
|
||||
- 数据字段共有 14 个;
|
||||
- 字段详情:
|
||||
- `PassengerId`:每个乘客的唯一标识。其形式为 `gggg_pp`,其中 `gggg` 表示乘客所属的同行团体,`pp` 是乘客在该团体中的编号。同一团体中的成员通常是家庭成员,**但这也不是绝对的**;
|
||||
- `HomePlanet`:乘客所离开的星球,通常是他们永久居住的星球;
|
||||
- `CryoSleep`:表示乘客是否选择在航行期间进入深度睡眠,处于深度睡眠状态的乘客被限制在他们所在的船舱内;
|
||||
- `Cabin`:乘客所住的船舱及座位号,采取 `甲板/编号/侧面` 的形式编号,其中侧面的 P 代表左舷(船身左半边),S 代表右舷(船身右半边);
|
||||
- `Destination`:乘客此次航行的目的地;
|
||||
- `Age`:乘客的年龄;
|
||||
- `VIP`:乘客是否为 VIP;
|
||||
- `RoomService`,`FoodCourt`,`ShoppingMall`,`Spa`,`VRDeck`:即乘客在太空船众多豪华设施中每一项上的**花销金额**;
|
||||
- `Name`:乘客的名字和姓氏;
|
||||
- `Transported`:乘客是否被运送到另一个空间,**这也是训练集的参考答案以及需要在测试集中预测的结果**。
|
||||
8694
data/spaceship-titanic/data.csv
Executable file
8694
data/spaceship-titanic/data.csv
Executable file
File diff suppressed because it is too large
Load Diff
41
projects/machine-learning/mock_choosing_watermelon.py
Normal file
41
projects/machine-learning/mock_choosing_watermelon.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import unittest
|
||||
|
||||
|
||||
def get_answer_by_feature(feature, answer):
    """Return whether ``feature`` is one of the accepted values in ``answer``.

    Args:
        feature: The observed value of a single trait, e.g. ``"round"``.
        answer: A container of accepted values; anything that supports the
            ``in`` operator works (the callers in this file pass sets).

    Returns:
        ``True`` if ``feature`` is contained in ``answer``, otherwise ``False``.
    """
    # A membership test already evaluates to a bool, so no branching is needed.
    return feature in answer
|
||||
|
||||
|
||||
def is_better_watermelon(appearance, pattern, spot_range, echo):
    """Judge a watermelon from four traits with a fixed rule of thumb.

    Every trait is checked against its own set of favourable values; the
    melon counts as good when at least three of the four traits match.

    Args:
        appearance: Appearance trait; "round" and "oval" are favourable.
        pattern: Pattern trait; "clear" and "bright" are favourable.
        spot_range: Spot-range trait; "middle" and "big" are favourable.
        echo: Echo trait; "clear" and "pleasant" are favourable.

    Returns:
        ``True`` when three or more traits are favourable, else ``False``.
    """
    # Pair each observed value with the values that count in its favour.
    checks = (
        (appearance, {"round", "oval"}),
        (pattern, {"clear", "bright"}),
        (spot_range, {"middle", "big"}),
        (echo, {"clear", "pleasant"}),
    )
    verdicts = [
        get_answer_by_feature(observed, accepted) for observed, accepted in checks
    ]
    return verdicts.count(True) >= 3
|
||||
|
||||
|
||||
class TestChooseBetterWatermelon(unittest.TestCase):
    """Check ``is_better_watermelon`` against a handful of labelled samples."""

    # Trait combinations passed to is_better_watermelon as keyword arguments.
    samples = [
        dict(appearance="broken", pattern="clear", spot_range="big", echo="depressing"),
        dict(appearance="round", pattern="clear", spot_range="big", echo="pleasant"),
        dict(appearance="oval", pattern="bright", spot_range="middle", echo="clear"),
        dict(
            appearance="round", pattern="blured", spot_range="small", echo="depressing"
        ),
    ]

    # Expected verdict for each entry of ``samples``, in the same order.
    # Samples 0 and 3 have only two and one favourable traits respectively,
    # so the "at least three favourable traits" rule must reject them.
    expected = [False, True, True, False]

    def test_is_better_watermelon(self):
        """Each sample must produce its expected verdict (one sub-test each)."""
        for sample, verdict in zip(self.samples, self.expected):
            with self.subTest(**sample):
                self.assertEqual(is_better_watermelon(**sample), verdict)
|
||||
|
||||
|
||||
# Run the test cases with verbose output when this file is executed directly.
if __name__ == "__main__":
    unittest.main(verbosity=2)
|
||||
@@ -0,0 +1,507 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"from sklearn.preprocessing import LabelEncoder"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>appearance</th>\n",
|
||||
" <th>pattern</th>\n",
|
||||
" <th>spot_range</th>\n",
|
||||
" <th>echo</th>\n",
|
||||
" <th>is_good</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>broken</td>\n",
|
||||
" <td>clear</td>\n",
|
||||
" <td>big</td>\n",
|
||||
" <td>depressing</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>round</td>\n",
|
||||
" <td>clear</td>\n",
|
||||
" <td>big</td>\n",
|
||||
" <td>pleasant</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>oval</td>\n",
|
||||
" <td>bright</td>\n",
|
||||
" <td>middle</td>\n",
|
||||
" <td>clear</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>round</td>\n",
|
||||
" <td>blured</td>\n",
|
||||
" <td>small</td>\n",
|
||||
" <td>depressing</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" appearance pattern spot_range echo is_good\n",
|
||||
"0 broken clear big depressing 0\n",
|
||||
"1 round clear big pleasant 1\n",
|
||||
"2 oval bright middle clear 1\n",
|
||||
"3 round blured small depressing 0"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data = [\n",
|
||||
" dict(\n",
|
||||
" appearance=\"broken\",\n",
|
||||
" pattern=\"clear\",\n",
|
||||
" spot_range=\"big\",\n",
|
||||
" echo=\"depressing\",\n",
|
||||
" is_good=0,\n",
|
||||
" ),\n",
|
||||
" dict(\n",
|
||||
" appearance=\"round\",\n",
|
||||
" pattern=\"clear\",\n",
|
||||
" spot_range=\"big\",\n",
|
||||
" echo=\"pleasant\",\n",
|
||||
" is_good=1,\n",
|
||||
" ),\n",
|
||||
" dict(\n",
|
||||
" appearance=\"oval\",\n",
|
||||
" pattern=\"bright\",\n",
|
||||
" spot_range=\"middle\",\n",
|
||||
" echo=\"clear\",\n",
|
||||
" is_good=1,\n",
|
||||
" ),\n",
|
||||
" dict(\n",
|
||||
" appearance=\"round\",\n",
|
||||
" pattern=\"blured\",\n",
|
||||
" spot_range=\"small\",\n",
|
||||
" echo=\"depressing\",\n",
|
||||
" is_good=0,\n",
|
||||
" ),\n",
|
||||
"]\n",
|
||||
"df = pd.DataFrame(data)\n",
|
||||
"df.head()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'C': 1.0,\n",
|
||||
" 'class_weight': None,\n",
|
||||
" 'dual': False,\n",
|
||||
" 'fit_intercept': True,\n",
|
||||
" 'intercept_scaling': 1,\n",
|
||||
" 'l1_ratio': None,\n",
|
||||
" 'max_iter': 100,\n",
|
||||
" 'multi_class': 'auto',\n",
|
||||
" 'n_jobs': None,\n",
|
||||
" 'penalty': 'l2',\n",
|
||||
" 'random_state': 0,\n",
|
||||
" 'solver': 'lbfgs',\n",
|
||||
" 'tol': 0.0001,\n",
|
||||
" 'verbose': 0,\n",
|
||||
" 'warm_start': False}"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"encoder = LabelEncoder()\n",
|
||||
"model = LogisticRegression(random_state=0)\n",
|
||||
"\n",
|
||||
"X = df.drop([\"is_good\"], axis=1).apply(encoder.fit_transform)\n",
|
||||
"y = df[\"is_good\"].values\n",
|
||||
"\n",
|
||||
"estimator = model.fit(X, y)\n",
|
||||
"estimator.get_params()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>appearance</th>\n",
|
||||
" <th>pattern</th>\n",
|
||||
" <th>spot_range</th>\n",
|
||||
" <th>echo</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" appearance pattern spot_range echo\n",
|
||||
"0 0 0 2 0\n",
|
||||
"1 1 1 1 1\n",
|
||||
"2 2 1 0 1"
|
||||
]
|
||||
},
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"unknown_data = [\n",
|
||||
" dict(\n",
|
||||
" appearance=\"broken\",\n",
|
||||
" pattern=\"blured\",\n",
|
||||
" spot_range=\"small\",\n",
|
||||
" echo=\"clear\",\n",
|
||||
" ),\n",
|
||||
" dict(\n",
|
||||
" appearance=\"oval\",\n",
|
||||
" pattern=\"clear\",\n",
|
||||
" spot_range=\"middle\",\n",
|
||||
" echo=\"depressing\",\n",
|
||||
" ),\n",
|
||||
" dict(\n",
|
||||
" appearance=\"round\",\n",
|
||||
" pattern=\"clear\",\n",
|
||||
" spot_range=\"big\",\n",
|
||||
" echo=\"depressing\",\n",
|
||||
" ),\n",
|
||||
"]\n",
|
||||
"records = pd.DataFrame(unknown_data)\n",
|
||||
"target = records.apply(encoder.fit_transform)\n",
|
||||
"target.head()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/Bobot/pyenvs/pandas-startup/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>appearance</th>\n",
|
||||
" <th>pattern</th>\n",
|
||||
" <th>spot_range</th>\n",
|
||||
" <th>echo</th>\n",
|
||||
" <th>is_good</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>broken</td>\n",
|
||||
" <td>blured</td>\n",
|
||||
" <td>small</td>\n",
|
||||
" <td>clear</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>oval</td>\n",
|
||||
" <td>clear</td>\n",
|
||||
" <td>middle</td>\n",
|
||||
" <td>depressing</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>round</td>\n",
|
||||
" <td>clear</td>\n",
|
||||
" <td>big</td>\n",
|
||||
" <td>depressing</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" appearance pattern spot_range echo is_good\n",
|
||||
"0 broken blured small clear 0\n",
|
||||
"1 oval clear middle depressing 0\n",
|
||||
"2 round clear big depressing 1"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"records[\"is_good\"] = estimator.predict(target.values)\n",
|
||||
"records.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/Bobot/pyenvs/pandas-startup/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>appearance</th>\n",
|
||||
" <th>pattern</th>\n",
|
||||
" <th>spot_range</th>\n",
|
||||
" <th>echo</th>\n",
|
||||
" <th>is_good</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>broken</td>\n",
|
||||
" <td>blured</td>\n",
|
||||
" <td>small</td>\n",
|
||||
" <td>clear</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>oval</td>\n",
|
||||
" <td>clear</td>\n",
|
||||
" <td>middle</td>\n",
|
||||
" <td>depressing</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>round</td>\n",
|
||||
" <td>clear</td>\n",
|
||||
" <td>big</td>\n",
|
||||
" <td>depressing</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" appearance pattern spot_range echo is_good\n",
|
||||
"0 broken blured small clear 0\n",
|
||||
"1 oval clear middle depressing 0\n",
|
||||
"2 round clear big depressing 1"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"records[\"is_good\"] = estimator.predict(target.values)\n",
|
||||
"records.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/Bobot/pyenvs/pandas-startup/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([[0.74707445, 0.25292555],\n",
|
||||
" [0.56146879, 0.43853121],\n",
|
||||
" [0.38539327, 0.61460673]])"
|
||||
]
|
||||
},
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"estimator.predict_proba(target.values)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "13977d4cc82dee5f9d9535ceb495bd0ab12a43c33c664e5f0d53c24cf634b67f"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.0 ('pandas-startup')",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.0"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
1537
projects/machine-learning/sklearn_base.ipynb
Normal file
1537
projects/machine-learning/sklearn_base.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
147017
projects/machine-learning/spaceship_titanic_prediction.ipynb
Normal file
147017
projects/machine-learning/spaceship_titanic_prediction.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user