### Processing and transform features

#### Encode categorical features

In [1]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

data = pd.DataFrame(dict(
    id=[1, 2],
    gender=["male", "female"],
))
data.head()

Unnamed: 0,id,gender
0,1,male
1,2,female


In [2]:

target = data["gender"]

encoder = LabelEncoder()
encoder.fit(target)
encoder.classes_

array(['female', 'male'], dtype=object)

In [3]:
encoder.transform(target)

array([1, 0])

In [4]:
pd.factorize(data["gender"])

(array([0, 1]), Index(['male', 'female'], dtype='object'))

In [5]:
pd.get_dummies(data, prefix="is")

Unnamed: 0,id,is_female,is_male
0,1,0,1
1,2,1,0


In [6]:
from sklearn.preprocessing import OneHotEncoder

array = target.values.reshape(-1, 1)
print(array)

encoder = OneHotEncoder(sparse=False)
encoder.fit(array)
print(encoder.categories_)

[['male']
 ['female']]
[array(['female', 'male'], dtype=object)]


In [7]:
encoder.transform(array)

array([[0., 1.],
       [1., 0.]])

#### Scaling numerical features

In [8]:
import numpy as np
from sklearn.preprocessing import StandardScaler

data = np.array([
    [0, 0, 1],
    [-1, 1, 0],
    [-2, 0, 2],
    [2, 1, -1],
])
print(f"raw data mean: {data.mean(axis=0)}")
print(f"raw data std: {data.std(axis=0)}")

raw data mean: [-0.25  0.5   0.5 ]
raw data std: [1.47901995 0.5        1.11803399]


In [9]:
scaler = StandardScaler()
scaler.fit(data)
scaler.transform(data)

array([[ 0.16903085, -1.        ,  0.4472136 ],
       [-0.50709255,  1.        , -0.4472136 ],
       [-1.18321596, -1.        ,  1.34164079],
       [ 1.52127766,  1.        , -1.34164079]])

In [10]:
scaled = scaler.fit_transform(data)
print(f"Mean = {scaled.mean(axis=0)}")
print(f"Std = {scaled.std(axis=0)}")

Mean = [0. 0. 0.]
Std = [1. 1. 1.]


In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit_transform(data)

array([[0.5       , 0.        , 0.66666667],
       [0.25      , 1.        , 0.33333333],
       [0.        , 0.        , 1.        ],
       [1.        , 1.        , 0.        ]])

In [12]:
from sklearn.preprocessing import Normalizer

scaler = Normalizer(norm="l1")
scaler.fit_transform(data)

array([[ 0.  ,  0.  ,  1.  ],
       [-0.5 ,  0.5 ,  0.  ],
       [-0.5 ,  0.  ,  0.5 ],
       [ 0.5 ,  0.25, -0.25]])

### Split data

In [13]:

import pathlib

import pandas as pd
from sklearn.model_selection import train_test_split

csvfile = pathlib.Path("../../data/iris.csv")
iris = pd.read_csv(csvfile)
# iris["variety"] = pd.factorize(iris["variety"])[0]
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [14]:
X = iris.drop("variety", axis=1)
X.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [15]:
y = iris["variety"]

In [16]:
# train:test = 7:3

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (105, 4)
X_test shape: (45, 4)
y_train shape: (105,)
y_test shape: (45,)


In [17]:
# train:test = 0.8:0.2

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (120, 4)
X_test shape: (30, 4)
y_train shape: (120,)
y_test shape: (30,)


In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=233,
)

In [19]:
# train, validation, test

X_train, X_, y_train, y_ = train_test_split(
    X, y, test_size=0.3, random_state=233
)

X_val, X_test, y_val, y_test = train_test_split(
    X_, y_, test_size=0.5, random_state=233
)

In [20]:
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (105, 4)
X_val shape: (22, 4)
X_test shape: (23, 4)
y_train shape: (105,)
y_val shape: (22,)
y_test shape: (23,)


## Grid Search


In [21]:
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = make_classification(
    n_samples=500,
    n_classes=3,
    n_features=10,
    n_informative=6,
    scale=None,
    random_state=233,
)

print(X[:2])

[[ -63.76056336  182.20484712   32.45346758   70.87751298  351.51266014
    -7.01893893   59.89221748   33.89847802   75.65454954   47.92124999]
 [   0.69419195    3.35562957    5.69461738 -152.93603272  187.21731051
    -3.10092616   -2.3676087  -236.78609838  -22.06231977 -159.77767906]]


In [22]:
print(y[:5])


[0 0 1 0 2]


In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=233
)

In [24]:

pipe = Pipeline(steps=[("scaler", StandardScaler()), ("model", SVC())])

param_grid = dict(
    model__C=[1, 10, 100, 1000],
    model__kernel=["linear", "sigmoid", "rbf"],
    model__gamma=[0.001, 0.0001],
    model__degree=[3, 4, 5],
)

gscv = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=5,
    verbose=2,
)
gscv.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END model__C=1, model__degree=3, model__gamma=0.001, model__kernel=linear; total time=   0.0s
[CV] END model__C=1, model__degree=3, model__gamma=0.001, model__kernel=linear; total time=   0.0s
[CV] END model__C=1, model__degree=3, model__gamma=0.001, model__kernel=linear; total time=   0.0s
[CV] END model__C=1, model__degree=3, model__gamma=0.001, model__kernel=linear; total time=   0.0s
[CV] END model__C=1, model__degree=3, model__gamma=0.001, model__kernel=linear; total time=   0.0s
[CV] END model__C=1, model__degree=3, model__gamma=0.001, model__kernel=sigmoid; total time=   0.0s
[CV] END model__C=1, model__degree=3, model__gamma=0.001, model__kernel=sigmoid; total time=   0.0s
[CV] END model__C=1, model__degree=3, model__gamma=0.001, model__kernel=sigmoid; total time=   0.0s
[CV] END model__C=1, model__degree=3, model__gamma=0.001, model__kernel=sigmoid; total time=   0.0s
[CV] END model__C=1, model__degree=3, model

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', SVC())]),
             param_grid={'model__C': [1, 10, 100, 1000],
                         'model__degree': [3, 4, 5],
                         'model__gamma': [0.001, 0.0001],
                         'model__kernel': ['linear', 'sigmoid', 'rbf']},
             verbose=2)

In [25]:
gscv.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('model', SVC(C=1000, gamma=0.001))])

In [26]:
gscv.best_score_

0.768

In [27]:
gscv.best_params_

{'model__C': 1000,
 'model__degree': 3,
 'model__gamma': 0.001,
 'model__kernel': 'rbf'}

In [28]:
pipe.set_params(**gscv.best_params_)

Pipeline(steps=[('scaler', StandardScaler()),
                ('model', SVC(C=1000, gamma=0.001))])

In [29]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('model', SVC(C=1000, gamma=0.001))])

In [30]:
pipe.score(X_test, y_test)

0.792

## Pipeline

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

pipe = Pipeline(steps=[("scaler", StandardScaler()), ("model", SVC())])
pipe

Pipeline(steps=[('scaler', StandardScaler()), ('model', SVC())])

In [22]:
pipe = make_pipeline(StandardScaler(), SVC())
pipe

Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())])

In [23]:
import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=500,
    n_classes=3,
    n_features=10,
    n_informative=6,
    scale=None,
    random_state=233,
)

data = pd.DataFrame(
    X,
    columns=[f"feature{n + 1}" for n in range(0, 10)],
)
data["y"] = y
data.head()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,y
0,-63.760563,182.204847,32.453468,70.877513,351.51266,-7.018939,59.892217,33.898478,75.65455,47.92125,0
1,0.694192,3.35563,5.694617,-152.936033,187.217311,-3.100926,-2.367609,-236.786098,-22.06232,-159.777679,0
2,-5.393549,184.011971,-38.685624,-119.91135,75.420933,-3.545307,78.284389,71.452933,-46.133634,-23.066554,1
3,-6.519019,39.696925,33.62207,-103.923978,97.519965,-0.220483,28.255805,-90.692126,-51.072717,6.297472,0
4,-29.057685,65.60443,72.474747,34.571492,155.318598,-2.501579,-98.117314,209.305727,-113.667228,24.212523,2


In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

preprocessor = ColumnTransformer(
    transformers=[
        ("odd_features_with_standardscaler", StandardScaler(), [f"feature{n}" for n in [3, 5, 7, 9]]),
        ("even_features_with_mimaxscaler", MinMaxScaler(), [f"feature{n}" for n in [2, 4, 6, 8]]),
        ("passthrough_feature", "passthrough", ["feature1", "feature10"]),
    ],
    remainder="drop",
)


pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", SVC()),
    ]
)
pipe.fit(data, data["y"])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('odd_features_with_standardscaler',
                                                  StandardScaler(),
                                                  ['feature3', 'feature5',
                                                   'feature7', 'feature9']),
                                                 ('even_features_with_mimaxscaler',
                                                  MinMaxScaler(),
                                                  ['feature2', 'feature4',
                                                   'feature6', 'feature8']),
                                                 ('passthrough_feature',
                                                  'passthrough',
                                                  ['feature1', 'feature10'])])),
                ('model', SVC())])

## Persistence

In [31]:

import pickle
import pathlib
import tempfile

from sklearn.linear_model import LinearRegression

root = pathlib.Path(tempfile.mkdtemp())
fpath = root.joinpath("LinearRegression.pkl")

regressor = LinearRegression()


with open(fpath, "wb") as file:
    pickle.dump(regressor, file)

files = list(root.iterdir())
files

[PosixPath('/var/folders/0t/s0c95rbs6ds7w_b0d471p0kc0000gn/T/tmpeuwitk1w/LinearRegression.pkl')]

In [34]:
file = open(fpath, "rb")
model = pickle.load(file)
file.close()

model.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': 'deprecated',
 'positive': False}

In [37]:
import joblib
from sklearn.svm import SVC

classifier = SVC()
fpath = root.joinpath("SVC.joblib")

joblib.dump(classifier, fpath)
files = list(root.iterdir())
print(files)

[PosixPath('/var/folders/0t/s0c95rbs6ds7w_b0d471p0kc0000gn/T/tmpeuwitk1w/SVC.joblib'), PosixPath('/var/folders/0t/s0c95rbs6ds7w_b0d471p0kc0000gn/T/tmpeuwitk1w/LinearRegression.pkl')]


In [38]:
model = joblib.load(fpath)
model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}