Automobiles#

%load_ext autoreload
%autoreload 2
%matplotlib inline
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Imports#

from fastai.vision.all import *
from aiking.data.external import * #We need to import this after fastai modules
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, make_scorer
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
plt.style.use('ggplot')

Getting Dataset#

# kaggle datasets download -d toramky/automobile-dataset
path = untar_data("kaggle_datasets::toramky/automobile-dataset"); path
Path('/content/drive/MyDrive/PPV/S_Personal_Study/aiking/data/automobile-dataset')
path.ls()
(#1) [Path('/content/drive/MyDrive/PPV/S_Personal_Study/aiking/data/automobile-dataset/Automobile_data.csv')]
df = pd.read_csv(path/'Automobile_data.csv', na_values=["?"]); df.head()
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base length width height curb-weight engine-type num-of-cylinders engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price
0 3 NaN alfa-romero gas std two convertible rwd front 88.6 168.8 64.1 48.8 2548 dohc four 130 mpfi 3.47 2.68 9.0 111.0 5000.0 21 27 13495.0
1 3 NaN alfa-romero gas std two convertible rwd front 88.6 168.8 64.1 48.8 2548 dohc four 130 mpfi 3.47 2.68 9.0 111.0 5000.0 21 27 16500.0
2 1 NaN alfa-romero gas std two hatchback rwd front 94.5 171.2 65.5 52.4 2823 ohcv six 152 mpfi 2.68 3.47 9.0 154.0 5000.0 19 26 16500.0
3 2 164.0 audi gas std four sedan fwd front 99.8 176.6 66.2 54.3 2337 ohc four 109 mpfi 3.19 3.40 10.0 102.0 5500.0 24 30 13950.0
4 2 164.0 audi gas std four sedan 4wd front 99.4 176.6 66.4 54.3 2824 ohc five 136 mpfi 3.19 3.40 8.0 115.0 5500.0 18 22 17450.0
X = df.fillna(0).copy()
y = X.pop("price")
X
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base length width height curb-weight engine-type num-of-cylinders engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg
0 3 0.0 alfa-romero gas std two convertible rwd front 88.6 168.8 64.1 48.8 2548 dohc four 130 mpfi 3.47 2.68 9.0 111.0 5000.0 21 27
1 3 0.0 alfa-romero gas std two convertible rwd front 88.6 168.8 64.1 48.8 2548 dohc four 130 mpfi 3.47 2.68 9.0 111.0 5000.0 21 27
2 1 0.0 alfa-romero gas std two hatchback rwd front 94.5 171.2 65.5 52.4 2823 ohcv six 152 mpfi 2.68 3.47 9.0 154.0 5000.0 19 26
3 2 164.0 audi gas std four sedan fwd front 99.8 176.6 66.2 54.3 2337 ohc four 109 mpfi 3.19 3.40 10.0 102.0 5500.0 24 30
4 2 164.0 audi gas std four sedan 4wd front 99.4 176.6 66.4 54.3 2824 ohc five 136 mpfi 3.19 3.40 8.0 115.0 5500.0 18 22
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 -1 95.0 volvo gas std four sedan rwd front 109.1 188.8 68.9 55.5 2952 ohc four 141 mpfi 3.78 3.15 9.5 114.0 5400.0 23 28
201 -1 95.0 volvo gas turbo four sedan rwd front 109.1 188.8 68.8 55.5 3049 ohc four 141 mpfi 3.78 3.15 8.7 160.0 5300.0 19 25
202 -1 95.0 volvo gas std four sedan rwd front 109.1 188.8 68.9 55.5 3012 ohcv six 173 mpfi 3.58 2.87 8.8 134.0 5500.0 18 23
203 -1 95.0 volvo diesel turbo four sedan rwd front 109.1 188.8 68.9 55.5 3217 ohc six 145 idi 3.01 3.40 23.0 106.0 4800.0 26 27
204 -1 95.0 volvo gas turbo four sedan rwd front 109.1 188.8 68.9 55.5 3062 ohc four 141 mpfi 3.78 3.15 9.5 114.0 5400.0 19 25

205 rows × 25 columns

Mutual Information (MI)#

for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()
X
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base length width height curb-weight engine-type num-of-cylinders engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg
0 3 0.0 0 0 0 0 0 0 0 88.6 168.8 64.1 48.8 2548 0 0 130 0 3.47 2.68 9.0 111.0 5000.0 21 27
1 3 0.0 0 0 0 0 0 0 0 88.6 168.8 64.1 48.8 2548 0 0 130 0 3.47 2.68 9.0 111.0 5000.0 21 27
2 1 0.0 0 0 0 0 1 0 0 94.5 171.2 65.5 52.4 2823 1 1 152 0 2.68 3.47 9.0 154.0 5000.0 19 26
3 2 164.0 1 0 0 1 2 1 0 99.8 176.6 66.2 54.3 2337 2 0 109 0 3.19 3.40 10.0 102.0 5500.0 24 30
4 2 164.0 1 0 0 1 2 2 0 99.4 176.6 66.4 54.3 2824 2 2 136 0 3.19 3.40 8.0 115.0 5500.0 18 22
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 -1 95.0 21 0 0 1 2 0 0 109.1 188.8 68.9 55.5 2952 2 0 141 0 3.78 3.15 9.5 114.0 5400.0 23 28
201 -1 95.0 21 0 1 1 2 0 0 109.1 188.8 68.8 55.5 3049 2 0 141 0 3.78 3.15 8.7 160.0 5300.0 19 25
202 -1 95.0 21 0 0 1 2 0 0 109.1 188.8 68.9 55.5 3012 1 1 173 0 3.58 2.87 8.8 134.0 5500.0 18 23
203 -1 95.0 21 1 1 1 2 0 0 109.1 188.8 68.9 55.5 3217 2 1 145 6 3.01 3.40 23.0 106.0 4800.0 26 27
204 -1 95.0 21 0 1 1 2 0 0 109.1 188.8 68.9 55.5 3062 2 0 141 0 3.78 3.15 9.5 114.0 5400.0 19 25

205 rows × 25 columns

X.dtypes
symboling              int64
normalized-losses    float64
make                   int64
fuel-type              int64
aspiration             int64
num-of-doors           int64
body-style             int64
drive-wheels           int64
engine-location        int64
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type            int64
num-of-cylinders       int64
engine-size            int64
fuel-system            int64
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
dtype: object
X.dtypes == int
symboling             True
normalized-losses    False
make                  True
fuel-type             True
aspiration            True
num-of-doors          True
body-style            True
drive-wheels          True
engine-location       True
wheel-base           False
length               False
width                False
height               False
curb-weight           True
engine-type           True
num-of-cylinders      True
engine-size           True
fuel-system           True
bore                 False
stroke               False
compression-ratio    False
horsepower           False
peak-rpm             False
city-mpg              True
highway-mpg           True
dtype: bool
discrete_features = X.dtypes == int; discrete_features
symboling             True
normalized-losses    False
make                  True
fuel-type             True
aspiration            True
num-of-doors          True
body-style            True
drive-wheels          True
engine-location       True
wheel-base           False
length               False
width                False
height               False
curb-weight           True
engine-type           True
num-of-cylinders      True
engine-size           True
fuel-system           True
bore                 False
stroke               False
compression-ratio    False
horsepower           False
peak-rpm             False
city-mpg              True
highway-mpg           True
dtype: bool
y[y=="?"]
Series([], Name: price, dtype: float64)
X
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base length width height curb-weight engine-type num-of-cylinders engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg
0 3 0.0 0 0 0 0 0 0 0 88.6 168.8 64.1 48.8 2548 0 0 130 0 3.47 2.68 9.0 111.0 5000.0 21 27
1 3 0.0 0 0 0 0 0 0 0 88.6 168.8 64.1 48.8 2548 0 0 130 0 3.47 2.68 9.0 111.0 5000.0 21 27
2 1 0.0 0 0 0 0 1 0 0 94.5 171.2 65.5 52.4 2823 1 1 152 0 2.68 3.47 9.0 154.0 5000.0 19 26
3 2 164.0 1 0 0 1 2 1 0 99.8 176.6 66.2 54.3 2337 2 0 109 0 3.19 3.40 10.0 102.0 5500.0 24 30
4 2 164.0 1 0 0 1 2 2 0 99.4 176.6 66.4 54.3 2824 2 2 136 0 3.19 3.40 8.0 115.0 5500.0 18 22
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 -1 95.0 21 0 0 1 2 0 0 109.1 188.8 68.9 55.5 2952 2 0 141 0 3.78 3.15 9.5 114.0 5400.0 23 28
201 -1 95.0 21 0 1 1 2 0 0 109.1 188.8 68.8 55.5 3049 2 0 141 0 3.78 3.15 8.7 160.0 5300.0 19 25
202 -1 95.0 21 0 0 1 2 0 0 109.1 188.8 68.9 55.5 3012 1 1 173 0 3.58 2.87 8.8 134.0 5500.0 18 23
203 -1 95.0 21 1 1 1 2 0 0 109.1 188.8 68.9 55.5 3217 2 1 145 6 3.01 3.40 23.0 106.0 4800.0 26 27
204 -1 95.0 21 0 1 1 2 0 0 109.1 188.8 68.9 55.5 3062 2 0 141 0 3.78 3.15 9.5 114.0 5400.0 19 25

205 rows × 25 columns

mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns).sort_values(ascending=False)
mi_scores
curb-weight          1.389106
engine-size          1.008542
highway-mpg          0.933126
horsepower           0.838589
city-mpg             0.824853
width                0.654700
length               0.560122
wheel-base           0.557972
make                 0.556152
bore                 0.471904
fuel-system          0.447296
height               0.352667
stroke               0.323039
drive-wheels         0.314699
num-of-cylinders     0.313724
normalized-losses    0.288269
peak-rpm             0.226938
symboling            0.213840
compression-ratio    0.186054
engine-type          0.156490
aspiration           0.080632
body-style           0.057173
fuel-type            0.039664
engine-location      0.034692
num-of-doors         0.006551
Name: MI Scores, dtype: float64
mi_scores.sort_values(ascending=True).plot(kind='barh', figsize=(8,8))
<matplotlib.axes._subplots.AxesSubplot at 0x7f37ed82f9d0>
../../_images/automobile_example_19_1.png
sns.relplot(x='curb-weight', y='price', data=df)
<seaborn.axisgrid.FacetGrid at 0x7f37ed700050>
../../_images/automobile_example_20_1.png
  • MI doesnot take into account interaction effect

  • ‘fuel-type’ is low on MI but adding it improves the price explaination (Don’t reject just based on MI)

sns.lmplot(x="horsepower", y="price", hue="fuel-type", data=df);
../../_images/automobile_example_22_0.png
df.select_dtypes(['float','int'])
symboling normalized-losses wheel-base length width height curb-weight engine-size bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price
0 3 NaN 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111.0 5000.0 21 27 13495.0
1 3 NaN 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111.0 5000.0 21 27 16500.0
2 1 NaN 94.5 171.2 65.5 52.4 2823 152 2.68 3.47 9.0 154.0 5000.0 19 26 16500.0
3 2 164.0 99.8 176.6 66.2 54.3 2337 109 3.19 3.40 10.0 102.0 5500.0 24 30 13950.0
4 2 164.0 99.4 176.6 66.4 54.3 2824 136 3.19 3.40 8.0 115.0 5500.0 18 22 17450.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 -1 95.0 109.1 188.8 68.9 55.5 2952 141 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0
201 -1 95.0 109.1 188.8 68.8 55.5 3049 141 3.78 3.15 8.7 160.0 5300.0 19 25 19045.0
202 -1 95.0 109.1 188.8 68.9 55.5 3012 173 3.58 2.87 8.8 134.0 5500.0 18 23 21485.0
203 -1 95.0 109.1 188.8 68.9 55.5 3217 145 3.01 3.40 23.0 106.0 4800.0 26 27 22470.0
204 -1 95.0 109.1 188.8 68.9 55.5 3062 141 3.78 3.15 9.5 114.0 5400.0 19 25 22625.0

205 rows × 16 columns

df.shape
(205, 26)
num_cols = df.select_dtypes(['float', 'int']).columns
cat_cols = df.select_dtypes('object').columns
@interact(num_col=num_cols, cat_col=cat_cols)
def plot(num_col, cat_col):
    sns.relplot(x=num_col, y='price', data=df)
    sns.lmplot(x=num_col, y="price", hue=cat_col, data=df)
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)

Baseline Model#

model = RandomForestRegressor(); model
RandomForestRegressor()
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
scores.mean()
0.7978904308207849
scores
array([0.85095168, 0.95340487, 0.7862429 , 0.91746912, 0.48138359])

This model is not yet fitted. It basically tracks the capacity / ( over-fitting of raw model on Data).cross_val_score copies the estimator before fitting the same on parts. The model then needs to be trained on complete dataset before predicting on the holdout data

model.fit(X_train, y_train)
RandomForestRegressor()
r2_score(y_holdout, model.predict(X_holdout))
0.8505374645526409
r2_score(y_train, model.predict(X_train))
0.96246216169697

Reduced Feature Model#

mi_scores[:15].index
Index(['curb-weight', 'engine-size', 'highway-mpg', 'horsepower', 'city-mpg',
       'width', 'length', 'wheel-base', 'make', 'bore', 'fuel-system',
       'height', 'stroke', 'drive-wheels', 'num-of-cylinders'],
      dtype='object')
model_reduced = RandomForestRegressor()
scores = cross_val_score(model_reduced, X_train[mi_scores[:15].index], y_train, cv=5, scoring='r2'); scores.mean()
0.7850512535366945
scores
array([0.85217899, 0.93233023, 0.76150246, 0.9104315 , 0.46881309])
model_reduced.fit(X_train[mi_scores[:15].index], y_train)
RandomForestRegressor()
r2_score(y_holdout, model_reduced.predict(X_holdout[mi_scores[:15].index])), r2_score(y_train, model_reduced.predict(X_train[mi_scores[:15].index]))
(0.8265900980656524, 0.9694392691698007)
def feature_model(X_train, y_train, X_holdout, y_holdout, mi_scores, estimator=RandomForestRegressor(), n_cols=10):
    sel_cols = mi_scores[:n_cols].index
    scores = cross_val_score(estimator, X_train[sel_cols], y_train, cv=5, scoring='r2')
    model.fit(X_train[sel_cols], y_train)
    return {
        'k': n_cols,
        'cv_mean': scores.mean(), 
        'holdout':r2_score(y_holdout, model.predict(X_holdout[sel_cols])), 
        'train': r2_score(y_train, model.predict(X_train[sel_cols]))
    }
    
feature_model(X_train, y_train, X_holdout, y_holdout, mi_scores, n_cols=10)
{'cv_mean': 0.787900056041571,
 'holdout': 0.8268471781204396,
 'k': 10,
 'train': 0.9732744993383893}
df_features = pd.DataFrame([feature_model(X_train, y_train, X_holdout, y_holdout, mi_scores, n_cols=k) for k in range(1,25)])
df_features.plot(x='k')
<matplotlib.axes._subplots.AxesSubplot at 0x7f37ed3a21d0>
../../_images/automobile_example_49_1.png
df_features
k cv_mean holdout train
0 1 0.681796 0.332996 0.953905
1 2 0.698563 0.564750 0.959669
2 3 0.763651 0.802725 0.962008
3 4 0.786535 0.828275 0.971400
4 5 0.786793 0.836906 0.965836
5 6 0.785895 0.832797 0.958112
6 7 0.781646 0.840111 0.968623
7 8 0.796786 0.834703 0.960458
8 9 0.797306 0.821109 0.971560
9 10 0.784905 0.836016 0.967543
10 11 0.791368 0.835919 0.968914
11 12 0.794696 0.836563 0.960086
12 13 0.783753 0.842935 0.970898
13 14 0.787471 0.833692 0.958966
14 15 0.795940 0.840154 0.967076
15 16 0.800306 0.834524 0.954770
16 17 0.797541 0.835549 0.967275
17 18 0.787937 0.842294 0.974459
18 19 0.793785 0.840984 0.966164
19 20 0.792703 0.845809 0.967088
20 21 0.800757 0.857779 0.962757
21 22 0.788593 0.841297 0.969550
22 23 0.799097 0.849720 0.968724
23 24 0.780353 0.844495 0.962705

Creating Custom Features#

def feature_extractors(X, callbacks=None):
    features = X
    if callbacks:
        features = pd.concat([callback(X) for callback in callbacks] + [X], axis=1)
    return features
            

def create_custom_features(X):
    features = pd.DataFrame()
    features['stroke_ratio'] = X['stroke']/X['bore']
    features['displacement'] = np.pi*((X['bore']*0.5)**2)*X['stroke']*X['num-of-cylinders']
    return features
feature_extractors(X, callbacks=[create_custom_features]).fillna(0)
stroke_ratio displacement symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base length width height curb-weight engine-type num-of-cylinders engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg
0 0.772334 0.000000 3 0.0 0 0 0 0 0 0 0 88.6 168.8 64.1 48.8 2548 0 0 130 0 3.47 2.68 9.0 111.0 5000.0 21 27
1 0.772334 0.000000 3 0.0 0 0 0 0 0 0 0 88.6 168.8 64.1 48.8 2548 0 0 130 0 3.47 2.68 9.0 111.0 5000.0 21 27
2 1.294776 19.574422 1 0.0 0 0 0 0 1 0 0 94.5 171.2 65.5 52.4 2823 1 1 152 0 2.68 3.47 9.0 154.0 5000.0 19 26
3 1.065831 0.000000 2 164.0 1 0 0 1 2 1 0 99.8 176.6 66.2 54.3 2337 2 0 109 0 3.19 3.40 10.0 102.0 5500.0 24 30
4 1.065831 54.347574 2 164.0 1 0 0 1 2 2 0 99.4 176.6 66.4 54.3 2824 2 2 136 0 3.19 3.40 8.0 115.0 5500.0 18 22
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 0.833333 0.000000 -1 95.0 21 0 0 1 2 0 0 109.1 188.8 68.9 55.5 2952 2 0 141 0 3.78 3.15 9.5 114.0 5400.0 23 28
201 0.833333 0.000000 -1 95.0 21 0 1 1 2 0 0 109.1 188.8 68.8 55.5 3049 2 0 141 0 3.78 3.15 8.7 160.0 5300.0 19 25
202 0.801676 28.889354 -1 95.0 21 0 0 1 2 0 0 109.1 188.8 68.9 55.5 3012 1 1 173 0 3.58 2.87 8.8 134.0 5500.0 18 23
203 1.129568 24.193672 -1 95.0 21 1 1 1 2 0 0 109.1 188.8 68.9 55.5 3217 2 1 145 6 3.01 3.40 23.0 106.0 4800.0 26 27
204 0.833333 0.000000 -1 95.0 21 0 1 1 2 0 0 109.1 188.8 68.9 55.5 3062 2 0 141 0 3.78 3.15 9.5 114.0 5400.0 19 25

205 rows × 27 columns

new_X = feature_extractors(X, callbacks=[create_custom_features]).fillna(0)
discrete_features = new_X.dtypes == int; discrete_features
mi_scores2 = mutual_info_regression(new_X, y, discrete_features=discrete_features)
mi_scores2 = pd.Series(mi_scores2, name="MI Scores", index=new_X.columns).sort_values(ascending=False)
mi_scores2.sort_values(ascending=True).plot(kind='barh', figsize=(8,8))
<matplotlib.axes._subplots.AxesSubplot at 0x7f37ed0d3c50>
../../_images/automobile_example_55_1.png
new_X_train = feature_extractors(X_train, callbacks=[create_custom_features]).fillna(0)
new_X_holdout = feature_extractors(X_holdout, callbacks=[create_custom_features]).fillna(0)
feature_model(new_X_train, y_train, new_X_holdout, y_holdout, mi_scores, n_cols=17)
{'cv_mean': 0.7993008628952347,
 'holdout': 0.8285570951484964,
 'k': 17,
 'train': 0.9727945533883016}
feature_model(X_train, y_train, X_holdout, y_holdout, mi_scores, n_cols=15)
{'cv_mean': 0.7971583669411606,
 'holdout': 0.8374002645219979,
 'k': 15,
 'train': 0.9663210512717149}
pp = make_pipeline(StandardScaler(), PCA(), ExtraTreesRegressor())
pp
Pipeline(steps=[('standardscaler', StandardScaler()), ('pca', PCA()),
                ('extratreesregressor', ExtraTreesRegressor())])
feature_model(X_train, y_train, X_holdout, y_holdout, mi_scores, n_cols=15, estimator=pp)
{'cv_mean': 0.7887728552753883,
 'holdout': 0.8424713912911774,
 'k': 15,
 'train': 0.9547614236269416}