Files
sspai-100-hours-series-python/projects/machine-learning/mock_ml_model_for_choosing_watermelon.ipynb

508 lines
14 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.preprocessing import LabelEncoder"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appearance</th>\n",
" <th>pattern</th>\n",
" <th>spot_range</th>\n",
" <th>echo</th>\n",
" <th>is_good</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>broken</td>\n",
" <td>clear</td>\n",
" <td>big</td>\n",
" <td>depressing</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>round</td>\n",
" <td>clear</td>\n",
" <td>big</td>\n",
" <td>pleasant</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>oval</td>\n",
" <td>bright</td>\n",
" <td>middle</td>\n",
" <td>clear</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>round</td>\n",
" <td>blured</td>\n",
" <td>small</td>\n",
" <td>depressing</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" appearance pattern spot_range echo is_good\n",
"0 broken clear big depressing 0\n",
"1 round clear big pleasant 1\n",
"2 oval bright middle clear 1\n",
"3 round blured small depressing 0"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Hand-labelled toy samples: four categorical traits per melon plus the\n",
"# 0/1 `is_good` target the model will learn to predict.\n",
"data = [\n",
"    {\n",
"        \"appearance\": \"broken\",\n",
"        \"pattern\": \"clear\",\n",
"        \"spot_range\": \"big\",\n",
"        \"echo\": \"depressing\",\n",
"        \"is_good\": 0,\n",
"    },\n",
"    {\n",
"        \"appearance\": \"round\",\n",
"        \"pattern\": \"clear\",\n",
"        \"spot_range\": \"big\",\n",
"        \"echo\": \"pleasant\",\n",
"        \"is_good\": 1,\n",
"    },\n",
"    {\n",
"        \"appearance\": \"oval\",\n",
"        \"pattern\": \"bright\",\n",
"        \"spot_range\": \"middle\",\n",
"        \"echo\": \"clear\",\n",
"        \"is_good\": 1,\n",
"    },\n",
"    {\n",
"        \"appearance\": \"round\",\n",
"        \"pattern\": \"blured\",\n",
"        \"spot_range\": \"small\",\n",
"        \"echo\": \"depressing\",\n",
"        \"is_good\": 0,\n",
"    },\n",
"]\n",
"\n",
"# One row per sample; the dict keys become the column names.\n",
"df = pd.DataFrame(data)\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'C': 1.0,\n",
" 'class_weight': None,\n",
" 'dual': False,\n",
" 'fit_intercept': True,\n",
" 'intercept_scaling': 1,\n",
" 'l1_ratio': None,\n",
" 'max_iter': 100,\n",
" 'multi_class': 'auto',\n",
" 'n_jobs': None,\n",
" 'penalty': 'l2',\n",
" 'random_state': 0,\n",
" 'solver': 'lbfgs',\n",
" 'tol': 0.0001,\n",
" 'verbose': 0,\n",
" 'warm_start': False}"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A single LabelEncoder is re-fitted on each feature column in turn, so every\n",
"# column gets its own integer codes; after this cell the encoder only\n",
"# remembers the categories of the last column it processed.\n",
"encoder = LabelEncoder()\n",
"model = LogisticRegression(random_state=0)\n",
"\n",
"# Features are every column except the target; the target stays a plain array.\n",
"feature_frame = df.drop(columns=[\"is_good\"])\n",
"X = feature_frame.apply(encoder.fit_transform)\n",
"y = df[\"is_good\"].values\n",
"\n",
"# `fit` returns the fitted model itself; show its hyper-parameters as the cell output.\n",
"estimator = model.fit(X, y)\n",
"estimator.get_params()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appearance</th>\n",
" <th>pattern</th>\n",
" <th>spot_range</th>\n",
" <th>echo</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" appearance pattern spot_range echo\n",
"0 0 0 2 0\n",
"1 1 1 1 1\n",
"2 2 1 0 1"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Unlabelled melons we want the trained model to judge.\n",
"unknown_data = [\n",
"    dict(\n",
"        appearance=\"broken\",\n",
"        pattern=\"blured\",\n",
"        spot_range=\"small\",\n",
"        echo=\"clear\",\n",
"    ),\n",
"    dict(\n",
"        appearance=\"oval\",\n",
"        pattern=\"clear\",\n",
"        spot_range=\"middle\",\n",
"        echo=\"depressing\",\n",
"    ),\n",
"    dict(\n",
"        appearance=\"round\",\n",
"        pattern=\"clear\",\n",
"        spot_range=\"big\",\n",
"        echo=\"depressing\",\n",
"    ),\n",
"]\n",
"records = pd.DataFrame(unknown_data)\n",
"\n",
"# Encode each column with an encoder fitted on the matching *training* column\n",
"# of `df`, so a category maps to the same integer the model was trained on.\n",
"# Re-fitting on `records` alone renumbers the categories whenever a value is\n",
"# absent here (e.g. no \"bright\" pattern would turn \"clear\" from 2 into 1).\n",
"# `transform` raises ValueError for a value that never appeared in training.\n",
"target = records.apply(\n",
"    lambda column: LabelEncoder().fit(df[column.name]).transform(column)\n",
")\n",
"target.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/Bobot/pyenvs/pandas-startup/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appearance</th>\n",
" <th>pattern</th>\n",
" <th>spot_range</th>\n",
" <th>echo</th>\n",
" <th>is_good</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>broken</td>\n",
" <td>blured</td>\n",
" <td>small</td>\n",
" <td>clear</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>oval</td>\n",
" <td>clear</td>\n",
" <td>middle</td>\n",
" <td>depressing</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>round</td>\n",
" <td>clear</td>\n",
" <td>big</td>\n",
" <td>depressing</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" appearance pattern spot_range echo is_good\n",
"0 broken blured small clear 0\n",
"1 oval clear middle depressing 0\n",
"2 round clear big depressing 1"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pass the DataFrame itself rather than `.values`: the model was fitted on\n",
"# named columns, and a bare array makes scikit-learn warn that the input\n",
"# \"does not have valid feature names\".\n",
"records[\"is_good\"] = estimator.predict(target)\n",
"records.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/Bobot/pyenvs/pandas-startup/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>appearance</th>\n",
" <th>pattern</th>\n",
" <th>spot_range</th>\n",
" <th>echo</th>\n",
" <th>is_good</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>broken</td>\n",
" <td>blured</td>\n",
" <td>small</td>\n",
" <td>clear</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>oval</td>\n",
" <td>clear</td>\n",
" <td>middle</td>\n",
" <td>depressing</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>round</td>\n",
" <td>clear</td>\n",
" <td>big</td>\n",
" <td>depressing</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" appearance pattern spot_range echo is_good\n",
"0 broken blured small clear 0\n",
"1 oval clear middle depressing 0\n",
"2 round clear big depressing 1"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Predict from the DataFrame (not `.values`) so the column names the model\n",
"# was fitted with are preserved and no feature-name warning is emitted.\n",
"records[\"is_good\"] = estimator.predict(target)\n",
"records.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/Bobot/pyenvs/pandas-startup/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"array([[0.74707445, 0.25292555],\n",
" [0.56146879, 0.43853121],\n",
" [0.38539327, 0.61460673]])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-row class probabilities; columns are ordered as in `estimator.classes_`.\n",
"# The DataFrame is passed directly so the fitted feature names are kept and\n",
"# scikit-learn does not warn about missing feature names.\n",
"estimator.predict_proba(target)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "13977d4cc82dee5f9d9535ceb495bd0ab12a43c33c664e5f0d53c24cf634b67f"
},
"kernelspec": {
"display_name": "Python 3.9.0 ('pandas-startup')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}