diff --git a/code/19/pyjanitor.ipynb b/code/19/pyjanitor.ipynb
new file mode 100644
index 0000000..5246159
--- /dev/null
+++ b/code/19/pyjanitor.ipynb
@@ -0,0 +1,1826 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true,
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "# Pyjanitor"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "可以将如下代码取消注释后执行，以安装 Pyjanitor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "#!pip install pyjanitor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import janitor\n",
+ "\n",
+ "# or use the `as` keyword to set an alias.\n",
+ "import janitor as jn"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b231_ | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " a | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 3 | \n",
+ " b | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b231_\n",
+ "0 1 a\n",
+ "1 3 b\n",
+ "2 3 c"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import janitor as jn\n",
+ "import pandas as pd\n",
+ "\n",
+ "data = pd.DataFrame(\n",
+ " {\n",
+ " \"a\": [1, 3, 3],\n",
+ " \"b$231@!#_\": list(\"abc\"),\n",
+ " }\n",
+ ")\n",
+ "data.head()\n",
+ "data.pipe(jn.clean_names, remove_special=True)\n",
+ "data.clean_names(remove_special=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## coalesce"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " <NA> | \n",
+ " 1.3 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b\n",
+ "0 1 \n",
+ "1 1.3\n",
+ "2 3 \n",
+ "3 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# native pandas code\n",
+ "\n",
+ "import pandas as pd\n",
+ "\n",
+ "data = pd.DataFrame(\n",
+ " {\n",
+ " \"a\": [1, pd.NA, 3, pd.NA],\n",
+ " \"b\": [pd.NA, 1.3, pd.NA, pd.NA],\n",
+ " }\n",
+ ")\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def use_or_not(row):\n",
+ " a, b = pd.isna(row[\"a\"]), pd.isna(row[\"b\"])\n",
+ " if a and not b:\n",
+ " return row[\"b\"]\n",
+ " elif not a and b:\n",
+ " return row[\"a\"]\n",
+ " else:\n",
+ " return pd.NA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " <NA> | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " <NA> | \n",
+ " 1.3 | \n",
+ " 1.3 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " <NA> | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c\n",
+ "0 1 1\n",
+ "1 1.3 1.3\n",
+ "2 3 3\n",
+ "3 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data[\"c\"] = data.apply(use_or_not, axis=1)\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " <NA> | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " <NA> | \n",
+ " 1.3 | \n",
+ " 1.3 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " <NA> | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c\n",
+ "0 1 1\n",
+ "1 1.3 1.3\n",
+ "2 3 3\n",
+ "3 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# janitor code\n",
+ "import janitor\n",
+ "import pandas as pd\n",
+ "\n",
+ "data = pd.DataFrame(\n",
+ " {\n",
+ " \"a\": [1, pd.NA, 3, pd.NA],\n",
+ " \"b\": [pd.NA, 1.3, pd.NA, pd.NA],\n",
+ " \"c\": [3, pd.NA, 2, 3.1],\n",
+ " }\n",
+ ")\n",
+ "data.coalesce(\"a\", \"b\", target_column_name=\"c\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " <NA> | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " <NA> | \n",
+ " 1.3 | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " <NA> | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 3.1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c\n",
+ "0 1 3\n",
+ "1 1.3 \n",
+ "2 3 2\n",
+ "3 3.1"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# DataFrame.bfill\n",
+ "import pandas as pd\n",
+ "\n",
+ "data = pd.DataFrame(\n",
+ " {\n",
+ " \"a\": [1, pd.NA, 3, pd.NA],\n",
+ " \"b\": [pd.NA, 1.3, pd.NA, pd.NA],\n",
+ " \"c\": [3, pd.NA, 2, 3.1],\n",
+ " }\n",
+ ")\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1.3 | \n",
+ " 1.3 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b\n",
+ "0 1 \n",
+ "1 1.3 1.3\n",
+ "2 3 \n",
+ "3 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.filter([\"a\", \"b\"]).bfill(axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1.3 | \n",
+ " 1.3 | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3.1 | \n",
+ " 3.1 | \n",
+ " 3.1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c\n",
+ "0 1.0 3.0 3\n",
+ "1 1.3 1.3 \n",
+ "2 3.0 2.0 2\n",
+ "3 3.1 3.1 3.1"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.filter([\"a\", \"b\", \"c\"]).bfill(axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 1.0\n",
+ "1 1.3\n",
+ "2 3.0\n",
+ "3 3.1\n",
+ "Name: a, dtype: float64"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.filter([\"a\", \"b\", \"c\"]).bfill(axis=1).iloc[:, 0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## case_when"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# native pandas code\n",
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.DataFrame(\n",
+ " {\n",
+ " \"a\": [0, 0, 1, 2, \"hi\"],\n",
+ " \"b\": [0, 3, 4, 5, \"bye\"],\n",
+ " \"c\": [6, 7, 8, 9, \"wait\"],\n",
+ " }\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def case_when(row):\n",
+ " if (row[\"a\"] == 0 and row[\"b\"] != 0) or row[\"c\"] == \"wait\":\n",
+ " return row[\"a\"]\n",
+ " elif row[\"a\"] == 0 and row[\"b\"] == 0:\n",
+ " return \"x\"\n",
+ " else:\n",
+ " return row[\"c\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ " new_col | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " x | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " 9 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " hi | \n",
+ " bye | \n",
+ " wait | \n",
+ " hi | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c new_col\n",
+ "0 0 0 6 x\n",
+ "1 0 3 7 0\n",
+ "2 1 4 8 8\n",
+ "3 2 5 9 9\n",
+ "4 hi bye wait hi"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.assign(new_col=df.apply(case_when, axis=1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " hi | \n",
+ " bye | \n",
+ " wait | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c\n",
+ "0 0 0 6\n",
+ "1 0 3 7\n",
+ "2 1 4 8\n",
+ "3 2 5 9\n",
+ "4 hi bye wait"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# janitor code\n",
+ "import janitor\n",
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.DataFrame(\n",
+ " {\n",
+ " \"a\": [0, 0, 1, 2, \"hi\"],\n",
+ " \"b\": [0, 3, 4, 5, \"bye\"],\n",
+ " \"c\": [6, 7, 8, 9, \"wait\"],\n",
+ " }\n",
+ ")\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ " new_col | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " x | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " 9 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " hi | \n",
+ " bye | \n",
+ " wait | \n",
+ " hi | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c new_col\n",
+ "0 0 0 6 x\n",
+ "1 0 3 7 0\n",
+ "2 1 4 8 8\n",
+ "3 2 5 9 9\n",
+ "4 hi bye wait hi"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# fmt:off\n",
+ "df.case_when(\n",
+ " ((df.a == 0) & (df.b != 0)) | (df.c == \"wait\"), df.a,\n",
+ " (df.b == 0) & (df.a == 0), \"x\",\n",
+ " df.c,\n",
+ " column_name=\"new_col\",\n",
+ ")\n",
+ "# fmt:on"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## concatenate_columns & deconcatenate_column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " no. | \n",
+ " prefix | \n",
+ " base | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " auto | \n",
+ " matic | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " de | \n",
+ " code | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " em | \n",
+ " body | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " no. prefix base\n",
+ "0 1 auto matic\n",
+ "1 2 de code\n",
+ "2 3 em body"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# native pandas code\n",
+ "import pandas as pd\n",
+ "\n",
+ "data = pd.DataFrame(\n",
+ " {\n",
+ " \"no.\": [1, 2, 3],\n",
+ " \"prefix\": [\"auto\", \"de\", \"em\"],\n",
+ " \"base\": [\"matic\", \"code\", \"body\"],\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 auto-matic\n",
+ "1 de-code\n",
+ "2 em-body\n",
+ "Name: prefix, dtype: object"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data[\"prefix\"].str.cat(data[\"base\"], sep=\"-\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 auto-matic\n",
+ "1 de-code\n",
+ "2 em-body\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data[\"prefix\"] + \"-\" + data[\"base\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 1-auto-matic\n",
+ "1 2-de-code\n",
+ "2 3-em-body\n",
+ "Name: no., dtype: object"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " data[\"no.\"]\n",
+ " .astype(str)\n",
+ " .str.cat(data[\"prefix\"], sep=\"-\")\n",
+ " .str.cat(data[\"base\"], sep=\"-\")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " no. | \n",
+ " prefix | \n",
+ " base | \n",
+ " word | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " auto | \n",
+ " matic | \n",
+ " 1-auto-matic | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " de | \n",
+ " code | \n",
+ " 2-de-code | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " em | \n",
+ " body | \n",
+ " 3-em-body | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " no. prefix base word\n",
+ "0 1 auto matic 1-auto-matic\n",
+ "1 2 de code 2-de-code\n",
+ "2 3 em body 3-em-body"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# janitor code\n",
+ "import janitor\n",
+ "import pandas as pd\n",
+ "\n",
+ "data = pd.DataFrame(\n",
+ " {\n",
+ " \"no.\": [1, 2, 3],\n",
+ " \"prefix\": [\"auto\", \"de\", \"em\"],\n",
+ " \"base\": [\"matic\", \"code\", \"body\"],\n",
+ " }\n",
+ ")\n",
+ "data.concatenate_columns(\n",
+ " column_names=[\"no.\", \"prefix\", \"base\"],\n",
+ " new_column_name=\"word\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " date | \n",
+ " year | \n",
+ " month | \n",
+ " day | \n",
+ " year_month | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2022-01-02 | \n",
+ " 2022 | \n",
+ " 01 | \n",
+ " 02 | \n",
+ " 202201 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2022-01-09 | \n",
+ " 2022 | \n",
+ " 01 | \n",
+ " 09 | \n",
+ " 202201 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2022-01-16 | \n",
+ " 2022 | \n",
+ " 01 | \n",
+ " 16 | \n",
+ " 202201 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2022-01-23 | \n",
+ " 2022 | \n",
+ " 01 | \n",
+ " 23 | \n",
+ " 202201 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2022-01-30 | \n",
+ " 2022 | \n",
+ " 01 | \n",
+ " 30 | \n",
+ " 202201 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " date year month day year_month\n",
+ "0 2022-01-02 2022 01 02 202201\n",
+ "1 2022-01-09 2022 01 09 202201\n",
+ "2 2022-01-16 2022 01 16 202201\n",
+ "3 2022-01-23 2022 01 23 202201\n",
+ "4 2022-01-30 2022 01 30 202201"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import janitor\n",
+ "import pandas as pd\n",
+ "(\n",
+ " pd.DataFrame({\"date\": pd.date_range(\"20220101\", \"20220201\", freq=\"1W\")})\n",
+ " .astype(str)\n",
+ " .deconcatenate_column(\"date\", new_column_names=[\"year\", \"month\", \"day\"], sep=\"-\")\n",
+ " .assign(year_month=lambda df: df[\"year\"].str.cat(df[\"month\"], sep=\"\"))\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## join_apply"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b\n",
+ "0 1 2\n",
+ "1 3 4\n",
+ "2 5 6"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# native pandas code\n",
+ "import pandas as pd\n",
+ "\n",
+ "data = pd.DataFrame(\n",
+ " {\n",
+ " \"a\": [1, 3, 5],\n",
+ " \"b\": [2, 4, 6],\n",
+ " }\n",
+ ")\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1.333333 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 3.333333 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 5.333333 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c\n",
+ "0 1 2 1.333333\n",
+ "1 3 4 3.333333\n",
+ "2 5 6 5.333333"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.assign(c=lambda df: df.apply(lambda row: (row[\"a\"] * 2 + row[\"b\"]) / 3, axis=1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1.333333 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 3.333333 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 5.333333 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c\n",
+ "0 1 2 1.333333\n",
+ "1 3 4 3.333333\n",
+ "2 5 6 5.333333"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# equivalent vectorized form (column arithmetic instead of a row-wise apply):\n",
+ "data.assign(\n",
+ " c=(data[\"a\"]*2+data[\"b\"])/3\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1.333333 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 3.333333 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 5.333333 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c\n",
+ "0 1 2 1.333333\n",
+ "1 3 4 3.333333\n",
+ "2 5 6 5.333333"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# janitor code\n",
+ "import janitor\n",
+ "import pandas as pd\n",
+ "\n",
+ "data = pd.DataFrame(\n",
+ " {\n",
+ " \"a\": [1, 3, 5],\n",
+ " \"b\": [2, 4, 6],\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "data.join_apply(lambda row: (row[\"a\"] * 2 + row[\"b\"]) / 3, new_column_name=\"c\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.10.4 ('sspai-100-hours-series-python')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.4"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "7a101baf08afe636412f97dd4a9fc2e65b6f84f0ec50413bf3e19b04a26b8ba6"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/code/19/statsmodels.ipynb b/code/19/statsmodels.ipynb
new file mode 100644
index 0000000..649b1ab
--- /dev/null
+++ b/code/19/statsmodels.ipynb
@@ -0,0 +1,692 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%matplotlib inline\n",
+ "%config InlineBackend.figure_format = 'svg'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import statsmodels.api as sm\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "plt.rc(\"figure\", figsize=(16, 8))\n",
+ "plt.rc(\"font\", size=14)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "data = sm.datasets.statecrime.load_pandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " violent | \n",
+ " murder | \n",
+ " hs_grad | \n",
+ " poverty | \n",
+ " single | \n",
+ " white | \n",
+ " urban | \n",
+ "
\n",
+ " \n",
+ " | state | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Alabama | \n",
+ " 459.9 | \n",
+ " 7.1 | \n",
+ " 82.1 | \n",
+ " 17.5 | \n",
+ " 29.0 | \n",
+ " 70.0 | \n",
+ " 48.65 | \n",
+ "
\n",
+ " \n",
+ " | Alaska | \n",
+ " 632.6 | \n",
+ " 3.2 | \n",
+ " 91.4 | \n",
+ " 9.0 | \n",
+ " 25.5 | \n",
+ " 68.3 | \n",
+ " 44.46 | \n",
+ "
\n",
+ " \n",
+ " | Arizona | \n",
+ " 423.2 | \n",
+ " 5.5 | \n",
+ " 84.2 | \n",
+ " 16.5 | \n",
+ " 25.7 | \n",
+ " 80.0 | \n",
+ " 80.07 | \n",
+ "
\n",
+ " \n",
+ " | Arkansas | \n",
+ " 530.3 | \n",
+ " 6.3 | \n",
+ " 82.4 | \n",
+ " 18.8 | \n",
+ " 26.3 | \n",
+ " 78.4 | \n",
+ " 39.54 | \n",
+ "
\n",
+ " \n",
+ " | California | \n",
+ " 473.4 | \n",
+ " 5.4 | \n",
+ " 80.6 | \n",
+ " 14.2 | \n",
+ " 27.8 | \n",
+ " 62.7 | \n",
+ " 89.73 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " violent murder hs_grad poverty single white urban\n",
+ "state \n",
+ "Alabama 459.9 7.1 82.1 17.5 29.0 70.0 48.65\n",
+ "Alaska 632.6 3.2 91.4 9.0 25.5 68.3 44.46\n",
+ "Arizona 423.2 5.5 84.2 16.5 25.7 80.0 80.07\n",
+ "Arkansas 530.3 6.3 82.4 18.8 26.3 78.4 39.54\n",
+ "California 473.4 5.4 80.6 14.2 27.8 62.7 89.73"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Index: 51 entries, Alabama to Wyoming\n",
+ "Data columns (total 7 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 violent 51 non-null float64\n",
+ " 1 murder 51 non-null float64\n",
+ " 2 hs_grad 51 non-null float64\n",
+ " 3 poverty 51 non-null float64\n",
+ " 4 single 51 non-null float64\n",
+ " 5 white 51 non-null float64\n",
+ " 6 urban 51 non-null float64\n",
+ "dtypes: float64(7)\n",
+ "memory usage: 3.2+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "data.data.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " urban | \n",
+ " poverty | \n",
+ " hs_grad | \n",
+ " single | \n",
+ "
\n",
+ " \n",
+ " | state | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Alabama | \n",
+ " 48.65 | \n",
+ " 17.5 | \n",
+ " 82.1 | \n",
+ " 29.0 | \n",
+ "
\n",
+ " \n",
+ " | Alaska | \n",
+ " 44.46 | \n",
+ " 9.0 | \n",
+ " 91.4 | \n",
+ " 25.5 | \n",
+ "
\n",
+ " \n",
+ " | Arizona | \n",
+ " 80.07 | \n",
+ " 16.5 | \n",
+ " 84.2 | \n",
+ " 25.7 | \n",
+ "
\n",
+ " \n",
+ " | Arkansas | \n",
+ " 39.54 | \n",
+ " 18.8 | \n",
+ " 82.4 | \n",
+ " 26.3 | \n",
+ "
\n",
+ " \n",
+ " | California | \n",
+ " 89.73 | \n",
+ " 14.2 | \n",
+ " 80.6 | \n",
+ " 27.8 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " urban poverty hs_grad single\n",
+ "state \n",
+ "Alabama 48.65 17.5 82.1 29.0\n",
+ "Alaska 44.46 9.0 91.4 25.5\n",
+ "Arizona 80.07 16.5 84.2 25.7\n",
+ "Arkansas 39.54 18.8 82.4 26.3\n",
+ "California 89.73 14.2 80.6 27.8"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.exog.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "state\n",
+ "Alabama 7.1\n",
+ "Alaska 3.2\n",
+ "Arizona 5.5\n",
+ "Arkansas 6.3\n",
+ "California 5.4\n",
+ "Name: murder, dtype: float64"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.endog.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " const | \n",
+ " urban | \n",
+ " poverty | \n",
+ " hs_grad | \n",
+ " single | \n",
+ "
\n",
+ " \n",
+ " | state | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Alabama | \n",
+ " 1.0 | \n",
+ " 48.65 | \n",
+ " 17.5 | \n",
+ " 82.1 | \n",
+ " 29.0 | \n",
+ "
\n",
+ " \n",
+ " | Alaska | \n",
+ " 1.0 | \n",
+ " 44.46 | \n",
+ " 9.0 | \n",
+ " 91.4 | \n",
+ " 25.5 | \n",
+ "
\n",
+ " \n",
+ " | Arizona | \n",
+ " 1.0 | \n",
+ " 80.07 | \n",
+ " 16.5 | \n",
+ " 84.2 | \n",
+ " 25.7 | \n",
+ "
\n",
+ " \n",
+ " | Arkansas | \n",
+ " 1.0 | \n",
+ " 39.54 | \n",
+ " 18.8 | \n",
+ " 82.4 | \n",
+ " 26.3 | \n",
+ "
\n",
+ " \n",
+ " | California | \n",
+ " 1.0 | \n",
+ " 89.73 | \n",
+ " 14.2 | \n",
+ " 80.6 | \n",
+ " 27.8 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " const urban poverty hs_grad single\n",
+ "state \n",
+ "Alabama 1.0 48.65 17.5 82.1 29.0\n",
+ "Alaska 1.0 44.46 9.0 91.4 25.5\n",
+ "Arizona 1.0 80.07 16.5 84.2 25.7\n",
+ "Arkansas 1.0 39.54 18.8 82.4 26.3\n",
+ "California 1.0 89.73 14.2 80.6 27.8"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X = data.exog\n",
+ "X = sm.add_constant(X)\n",
+ "X.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "Y = data.endog"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "crime_model = sm.OLS(Y, X).fit()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# equivalent to this, using the formula API:\n",
+ "data_copy = data.data.copy()\n",
+ "data_copy = sm.add_constant(data_copy)\n",
+ "crime_model = sm.formula.ols(\n",
+ " \"murder ~ urban + poverty + hs_grad + single + C(const)\",\n",
+ " data=data_copy\n",
+ ").fit()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " OLS Regression Results \n",
+ "==============================================================================\n",
+ "Dep. Variable: murder R-squared: 0.813\n",
+ "Model: OLS Adj. R-squared: 0.797\n",
+ "Method: Least Squares F-statistic: 50.08\n",
+ "Date: Thu, 30 Jun 2022 Prob (F-statistic): 3.42e-16\n",
+ "Time: 09:07:46 Log-Likelihood: -95.050\n",
+ "No. Observations: 51 AIC: 200.1\n",
+ "Df Residuals: 46 BIC: 209.8\n",
+ "Df Model: 4 \n",
+ "Covariance Type: nonrobust \n",
+ "==============================================================================\n",
+ " coef std err t P>|t| [0.025 0.975]\n",
+ "------------------------------------------------------------------------------\n",
+ "Intercept -44.1024 12.086 -3.649 0.001 -68.430 -19.774\n",
+ "urban 0.0109 0.015 0.707 0.483 -0.020 0.042\n",
+ "poverty 0.4121 0.140 2.939 0.005 0.130 0.694\n",
+ "hs_grad 0.3059 0.117 2.611 0.012 0.070 0.542\n",
+ "single 0.6374 0.070 9.065 0.000 0.496 0.779\n",
+ "==============================================================================\n",
+ "Omnibus: 1.618 Durbin-Watson: 2.507\n",
+ "Prob(Omnibus): 0.445 Jarque-Bera (JB): 0.831\n",
+ "Skew: -0.220 Prob(JB): 0.660\n",
+ "Kurtosis: 3.445 Cond. No. 5.80e+03\n",
+ "==============================================================================\n",
+ "\n",
+ "Notes:\n",
+ "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
+ "[2] The condition number is large, 5.8e+03. This might indicate that there are\n",
+ "strong multicollinearity or other numerical problems.\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(crime_model.summary())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.8132403052312949\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(crime_model.rsquared)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Intercept -44.102416\n",
+ "urban 0.010888\n",
+ "poverty 0.412150\n",
+ "hs_grad 0.305927\n",
+ "single 0.637375\n",
+ "dtype: float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(crime_model.params)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "eval_env: 1\n",
+ "eval_env: 1\n",
+ "eval_env: 1\n",
+ "eval_env: 1\n",
+ "eval_env: 1\n"
+ ]
+ },
+ {
+ "data": {
+ "image/svg+xml": "\n\n\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "fig = sm.graphics.plot_partregress_grid(crime_model)\n",
+ "fig.tight_layout(pad=1.0)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": "\n\n\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "fig = sm.graphics.plot_fit(crime_model, \"single\")\n",
+ "fig.tight_layout(pad=1.0)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.10.4 ('sspai-100-hours-series-python')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.4"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "7a101baf08afe636412f97dd4a9fc2e65b6f84f0ec50413bf3e19b04a26b8ba6"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}