Automobiles#
%load_ext autoreload
%autoreload 2
%matplotlib inline
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
Imports#
from fastai.vision.all import *
from aiking.data.external import * # We need to import this after the fastai modules
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, make_scorer
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
plt.style.use('ggplot')
Getting Dataset#
# kaggle datasets download -d toramky/automobile-dataset
path = untar_data("kaggle_datasets::toramky/automobile-dataset"); path
Path('/content/drive/MyDrive/PPV/S_Personal_Study/aiking/data/automobile-dataset')
path.ls()
(#1) [Path('/content/drive/MyDrive/PPV/S_Personal_Study/aiking/data/automobile-dataset/Automobile_data.csv')]
df = pd.read_csv(path/'Automobile_data.csv', na_values=["?"]); df.head()
symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | length | width | height | curb-weight | engine-type | num-of-cylinders | engine-size | fuel-system | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | NaN | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | dohc | four | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 | 13495.0 |
1 | 3 | NaN | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | dohc | four | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 | 16500.0 |
2 | 1 | NaN | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | 171.2 | 65.5 | 52.4 | 2823 | ohcv | six | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154.0 | 5000.0 | 19 | 26 | 16500.0 |
3 | 2 | 164.0 | audi | gas | std | four | sedan | fwd | front | 99.8 | 176.6 | 66.2 | 54.3 | 2337 | ohc | four | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102.0 | 5500.0 | 24 | 30 | 13950.0 |
4 | 2 | 164.0 | audi | gas | std | four | sedan | 4wd | front | 99.4 | 176.6 | 66.4 | 54.3 | 2824 | ohc | five | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115.0 | 5500.0 | 18 | 22 | 17450.0 |
X = df.fillna(0).copy()
y = X.pop("price")
X
symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | length | width | height | curb-weight | engine-type | num-of-cylinders | engine-size | fuel-system | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 0.0 | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | dohc | four | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 |
1 | 3 | 0.0 | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | dohc | four | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 |
2 | 1 | 0.0 | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | 171.2 | 65.5 | 52.4 | 2823 | ohcv | six | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154.0 | 5000.0 | 19 | 26 |
3 | 2 | 164.0 | audi | gas | std | four | sedan | fwd | front | 99.8 | 176.6 | 66.2 | 54.3 | 2337 | ohc | four | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102.0 | 5500.0 | 24 | 30 |
4 | 2 | 164.0 | audi | gas | std | four | sedan | 4wd | front | 99.4 | 176.6 | 66.4 | 54.3 | 2824 | ohc | five | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115.0 | 5500.0 | 18 | 22 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
200 | -1 | 95.0 | volvo | gas | std | four | sedan | rwd | front | 109.1 | 188.8 | 68.9 | 55.5 | 2952 | ohc | four | 141 | mpfi | 3.78 | 3.15 | 9.5 | 114.0 | 5400.0 | 23 | 28 |
201 | -1 | 95.0 | volvo | gas | turbo | four | sedan | rwd | front | 109.1 | 188.8 | 68.8 | 55.5 | 3049 | ohc | four | 141 | mpfi | 3.78 | 3.15 | 8.7 | 160.0 | 5300.0 | 19 | 25 |
202 | -1 | 95.0 | volvo | gas | std | four | sedan | rwd | front | 109.1 | 188.8 | 68.9 | 55.5 | 3012 | ohcv | six | 173 | mpfi | 3.58 | 2.87 | 8.8 | 134.0 | 5500.0 | 18 | 23 |
203 | -1 | 95.0 | volvo | diesel | turbo | four | sedan | rwd | front | 109.1 | 188.8 | 68.9 | 55.5 | 3217 | ohc | six | 145 | idi | 3.01 | 3.40 | 23.0 | 106.0 | 4800.0 | 26 | 27 |
204 | -1 | 95.0 | volvo | gas | turbo | four | sedan | rwd | front | 109.1 | 188.8 | 68.9 | 55.5 | 3062 | ohc | four | 141 | mpfi | 3.78 | 3.15 | 9.5 | 114.0 | 5400.0 | 19 | 25 |
205 rows × 25 columns
Mutual Information (MI)#
for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()
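factorize() replaces each categorical column with integer codes, so the encoded columns now have an int64 dtype. Below, X.dtypes == int is used as a proxy for flagging discrete features when calling mutual_info_regression.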
X
symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | length | width | height | curb-weight | engine-type | num-of-cylinders | engine-size | fuel-system | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 0 | 0 | 130 | 0 | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 |
1 | 3 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 0 | 0 | 130 | 0 | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 |
2 | 1 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 94.5 | 171.2 | 65.5 | 52.4 | 2823 | 1 | 1 | 152 | 0 | 2.68 | 3.47 | 9.0 | 154.0 | 5000.0 | 19 | 26 |
3 | 2 | 164.0 | 1 | 0 | 0 | 1 | 2 | 1 | 0 | 99.8 | 176.6 | 66.2 | 54.3 | 2337 | 2 | 0 | 109 | 0 | 3.19 | 3.40 | 10.0 | 102.0 | 5500.0 | 24 | 30 |
4 | 2 | 164.0 | 1 | 0 | 0 | 1 | 2 | 2 | 0 | 99.4 | 176.6 | 66.4 | 54.3 | 2824 | 2 | 2 | 136 | 0 | 3.19 | 3.40 | 8.0 | 115.0 | 5500.0 | 18 | 22 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
200 | -1 | 95.0 | 21 | 0 | 0 | 1 | 2 | 0 | 0 | 109.1 | 188.8 | 68.9 | 55.5 | 2952 | 2 | 0 | 141 | 0 | 3.78 | 3.15 | 9.5 | 114.0 | 5400.0 | 23 | 28 |
201 | -1 | 95.0 | 21 | 0 | 1 | 1 | 2 | 0 | 0 | 109.1 | 188.8 | 68.8 | 55.5 | 3049 | 2 | 0 | 141 | 0 | 3.78 | 3.15 | 8.7 | 160.0 | 5300.0 | 19 | 25 |
202 | -1 | 95.0 | 21 | 0 | 0 | 1 | 2 | 0 | 0 | 109.1 | 188.8 | 68.9 | 55.5 | 3012 | 1 | 1 | 173 | 0 | 3.58 | 2.87 | 8.8 | 134.0 | 5500.0 | 18 | 23 |
203 | -1 | 95.0 | 21 | 1 | 1 | 1 | 2 | 0 | 0 | 109.1 | 188.8 | 68.9 | 55.5 | 3217 | 2 | 1 | 145 | 6 | 3.01 | 3.40 | 23.0 | 106.0 | 4800.0 | 26 | 27 |
204 | -1 | 95.0 | 21 | 0 | 1 | 1 | 2 | 0 | 0 | 109.1 | 188.8 | 68.9 | 55.5 | 3062 | 2 | 0 | 141 | 0 | 3.78 | 3.15 | 9.5 | 114.0 | 5400.0 | 19 | 25 |
205 rows × 25 columns
X.dtypes
symboling int64
normalized-losses float64
make int64
fuel-type int64
aspiration int64
num-of-doors int64
body-style int64
drive-wheels int64
engine-location int64
wheel-base float64
length float64
width float64
height float64
curb-weight int64
engine-type int64
num-of-cylinders int64
engine-size int64
fuel-system int64
bore float64
stroke float64
compression-ratio float64
horsepower float64
peak-rpm float64
city-mpg int64
highway-mpg int64
dtype: object
discrete_features = X.dtypes == int; discrete_features
symboling True
normalized-losses False
make True
fuel-type True
aspiration True
num-of-doors True
body-style True
drive-wheels True
engine-location True
wheel-base False
length False
width False
height False
curb-weight True
engine-type True
num-of-cylinders True
engine-size True
fuel-system True
bore False
stroke False
compression-ratio False
horsepower False
peak-rpm False
city-mpg True
highway-mpg True
dtype: bool
y[y=="?"]
Series([], Name: price, dtype: float64)
mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns).sort_values(ascending=False)
mi_scores
curb-weight 1.389106
engine-size 1.008542
highway-mpg 0.933126
horsepower 0.838589
city-mpg 0.824853
width 0.654700
length 0.560122
wheel-base 0.557972
make 0.556152
bore 0.471904
fuel-system 0.447296
height 0.352667
stroke 0.323039
drive-wheels 0.314699
num-of-cylinders 0.313724
normalized-losses 0.288269
peak-rpm 0.226938
symboling 0.213840
compression-ratio 0.186054
engine-type 0.156490
aspiration 0.080632
body-style 0.057173
fuel-type 0.039664
engine-location 0.034692
num-of-doors 0.006551
Name: MI Scores, dtype: float64
mi_scores.sort_values(ascending=True).plot(kind='barh', figsize=(8,8))
<matplotlib.axes._subplots.AxesSubplot at 0x7f37ed82f9d0>
sns.relplot(x='curb-weight', y='price', data=df)
<seaborn.axisgrid.FacetGrid at 0x7f37ed700050>
MI does not take interaction effects into account.
‘fuel-type’ scores low on MI, but adding it improves the explanation of price (don’t reject a feature based on MI alone).
sns.lmplot(x="horsepower", y="price", hue="fuel-type", data=df);
df.select_dtypes(['float','int'])
symboling | normalized-losses | wheel-base | length | width | height | curb-weight | engine-size | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | NaN | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 130 | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 | 13495.0 |
1 | 3 | NaN | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 130 | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 | 16500.0 |
2 | 1 | NaN | 94.5 | 171.2 | 65.5 | 52.4 | 2823 | 152 | 2.68 | 3.47 | 9.0 | 154.0 | 5000.0 | 19 | 26 | 16500.0 |
3 | 2 | 164.0 | 99.8 | 176.6 | 66.2 | 54.3 | 2337 | 109 | 3.19 | 3.40 | 10.0 | 102.0 | 5500.0 | 24 | 30 | 13950.0 |
4 | 2 | 164.0 | 99.4 | 176.6 | 66.4 | 54.3 | 2824 | 136 | 3.19 | 3.40 | 8.0 | 115.0 | 5500.0 | 18 | 22 | 17450.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
200 | -1 | 95.0 | 109.1 | 188.8 | 68.9 | 55.5 | 2952 | 141 | 3.78 | 3.15 | 9.5 | 114.0 | 5400.0 | 23 | 28 | 16845.0 |
201 | -1 | 95.0 | 109.1 | 188.8 | 68.8 | 55.5 | 3049 | 141 | 3.78 | 3.15 | 8.7 | 160.0 | 5300.0 | 19 | 25 | 19045.0 |
202 | -1 | 95.0 | 109.1 | 188.8 | 68.9 | 55.5 | 3012 | 173 | 3.58 | 2.87 | 8.8 | 134.0 | 5500.0 | 18 | 23 | 21485.0 |
203 | -1 | 95.0 | 109.1 | 188.8 | 68.9 | 55.5 | 3217 | 145 | 3.01 | 3.40 | 23.0 | 106.0 | 4800.0 | 26 | 27 | 22470.0 |
204 | -1 | 95.0 | 109.1 | 188.8 | 68.9 | 55.5 | 3062 | 141 | 3.78 | 3.15 | 9.5 | 114.0 | 5400.0 | 19 | 25 | 22625.0 |
205 rows × 16 columns
df.shape
(205, 26)
num_cols = df.select_dtypes(['float', 'int']).columns
cat_cols = df.select_dtypes('object').columns
@interact(num_col=num_cols, cat_col=cat_cols)
def plot(num_col, cat_col):
    sns.relplot(x=num_col, y='price', data=df)
    sns.lmplot(x=num_col, y="price", hue=cat_col, data=df)
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)
Baseline Model#
model = RandomForestRegressor(); model
RandomForestRegressor()
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
scores.mean()
0.7978904308207849
scores
array([0.85095168, 0.95340487, 0.7862429 , 0.91746912, 0.48138359])
This model is not yet fitted; the cross-validation scores above only gauge the capacity of the raw model (how much it over-fits the data). cross_val_score clones the estimator and fits each clone on a training fold, so the model still needs to be trained on the complete training set before predicting on the holdout data.
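As a quick illustrative check (not part of the original notebook), scikit-learn’s check_is_fitted confirms that the estimator handed to cross_val_score is left unfitted, since only its internal clones were fitted:
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError
try:
    check_is_fitted(model)          # raises NotFittedError: only clones were fitted
except NotFittedError:
    print("model is still unfitted after cross_val_score")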
model.fit(X_train, y_train)
RandomForestRegressor()
r2_score(y_holdout, model.predict(X_holdout))
0.8505374645526409
r2_score(y_train, model.predict(X_train))
0.96246216169697
Reduced Feature Model#
mi_scores[:15].index
Index(['curb-weight', 'engine-size', 'highway-mpg', 'horsepower', 'city-mpg',
'width', 'length', 'wheel-base', 'make', 'bore', 'fuel-system',
'height', 'stroke', 'drive-wheels', 'num-of-cylinders'],
dtype='object')
model_reduced = RandomForestRegressor()
scores = cross_val_score(model_reduced, X_train[mi_scores[:15].index], y_train, cv=5, scoring='r2'); scores.mean()
0.7850512535366945
scores
array([0.85217899, 0.93233023, 0.76150246, 0.9104315 , 0.46881309])
model_reduced.fit(X_train[mi_scores[:15].index], y_train)
RandomForestRegressor()
r2_score(y_holdout, model_reduced.predict(X_holdout[mi_scores[:15].index])), r2_score(y_train, model_reduced.predict(X_train[mi_scores[:15].index]))
(0.8265900980656524, 0.9694392691698007)
def feature_model(X_train, y_train, X_holdout, y_holdout, mi_scores, estimator=RandomForestRegressor(), n_cols=10):
    sel_cols = mi_scores[:n_cols].index  # top-n features by MI score
    scores = cross_val_score(estimator, X_train[sel_cols], y_train, cv=5, scoring='r2')
    estimator.fit(X_train[sel_cols], y_train)  # refit on the full training set for the holdout/train scores
    return {
        'k': n_cols,
        'cv_mean': scores.mean(),
        'holdout': r2_score(y_holdout, estimator.predict(X_holdout[sel_cols])),
        'train': r2_score(y_train, estimator.predict(X_train[sel_cols]))
    }
feature_model(X_train, y_train, X_holdout, y_holdout, mi_scores, n_cols=10)
{'cv_mean': 0.787900056041571,
'holdout': 0.8268471781204396,
'k': 10,
'train': 0.9732744993383893}
df_features = pd.DataFrame([feature_model(X_train, y_train, X_holdout, y_holdout, mi_scores, n_cols=k) for k in range(1,25)])
df_features.plot(x='k')
<matplotlib.axes._subplots.AxesSubplot at 0x7f37ed3a21d0>
df_features
k | cv_mean | holdout | train | |
---|---|---|---|---|
0 | 1 | 0.681796 | 0.332996 | 0.953905 |
1 | 2 | 0.698563 | 0.564750 | 0.959669 |
2 | 3 | 0.763651 | 0.802725 | 0.962008 |
3 | 4 | 0.786535 | 0.828275 | 0.971400 |
4 | 5 | 0.786793 | 0.836906 | 0.965836 |
5 | 6 | 0.785895 | 0.832797 | 0.958112 |
6 | 7 | 0.781646 | 0.840111 | 0.968623 |
7 | 8 | 0.796786 | 0.834703 | 0.960458 |
8 | 9 | 0.797306 | 0.821109 | 0.971560 |
9 | 10 | 0.784905 | 0.836016 | 0.967543 |
10 | 11 | 0.791368 | 0.835919 | 0.968914 |
11 | 12 | 0.794696 | 0.836563 | 0.960086 |
12 | 13 | 0.783753 | 0.842935 | 0.970898 |
13 | 14 | 0.787471 | 0.833692 | 0.958966 |
14 | 15 | 0.795940 | 0.840154 | 0.967076 |
15 | 16 | 0.800306 | 0.834524 | 0.954770 |
16 | 17 | 0.797541 | 0.835549 | 0.967275 |
17 | 18 | 0.787937 | 0.842294 | 0.974459 |
18 | 19 | 0.793785 | 0.840984 | 0.966164 |
19 | 20 | 0.792703 | 0.845809 | 0.967088 |
20 | 21 | 0.800757 | 0.857779 | 0.962757 |
21 | 22 | 0.788593 | 0.841297 | 0.969550 |
22 | 23 | 0.799097 | 0.849720 | 0.968724 |
23 | 24 | 0.780353 | 0.844495 | 0.962705 |
Creating Custom Features#
def feature_extractors(X, callbacks=None):
    # Concatenate the original columns with any derived-feature frames produced by the callbacks
    features = X
    if callbacks:
        features = pd.concat([callback(X) for callback in callbacks] + [X], axis=1)
    return features

def create_custom_features(X):
    features = pd.DataFrame()
    features['stroke_ratio'] = X['stroke']/X['bore']
    # Displacement ~ pi*(bore/2)^2 * stroke * cylinders. Note 'num-of-cylinders' was
    # label-encoded above, so this is only a rough proxy (rows encoded as 0 get displacement 0).
    features['displacement'] = np.pi*((X['bore']*0.5)**2)*X['stroke']*X['num-of-cylinders']
    return features
feature_extractors(X, callbacks=[create_custom_features]).fillna(0)
stroke_ratio | displacement | symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | length | width | height | curb-weight | engine-type | num-of-cylinders | engine-size | fuel-system | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.772334 | 0.000000 | 3 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 0 | 0 | 130 | 0 | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 |
1 | 0.772334 | 0.000000 | 3 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 0 | 0 | 130 | 0 | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 |
2 | 1.294776 | 19.574422 | 1 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 94.5 | 171.2 | 65.5 | 52.4 | 2823 | 1 | 1 | 152 | 0 | 2.68 | 3.47 | 9.0 | 154.0 | 5000.0 | 19 | 26 |
3 | 1.065831 | 0.000000 | 2 | 164.0 | 1 | 0 | 0 | 1 | 2 | 1 | 0 | 99.8 | 176.6 | 66.2 | 54.3 | 2337 | 2 | 0 | 109 | 0 | 3.19 | 3.40 | 10.0 | 102.0 | 5500.0 | 24 | 30 |
4 | 1.065831 | 54.347574 | 2 | 164.0 | 1 | 0 | 0 | 1 | 2 | 2 | 0 | 99.4 | 176.6 | 66.4 | 54.3 | 2824 | 2 | 2 | 136 | 0 | 3.19 | 3.40 | 8.0 | 115.0 | 5500.0 | 18 | 22 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
200 | 0.833333 | 0.000000 | -1 | 95.0 | 21 | 0 | 0 | 1 | 2 | 0 | 0 | 109.1 | 188.8 | 68.9 | 55.5 | 2952 | 2 | 0 | 141 | 0 | 3.78 | 3.15 | 9.5 | 114.0 | 5400.0 | 23 | 28 |
201 | 0.833333 | 0.000000 | -1 | 95.0 | 21 | 0 | 1 | 1 | 2 | 0 | 0 | 109.1 | 188.8 | 68.8 | 55.5 | 3049 | 2 | 0 | 141 | 0 | 3.78 | 3.15 | 8.7 | 160.0 | 5300.0 | 19 | 25 |
202 | 0.801676 | 28.889354 | -1 | 95.0 | 21 | 0 | 0 | 1 | 2 | 0 | 0 | 109.1 | 188.8 | 68.9 | 55.5 | 3012 | 1 | 1 | 173 | 0 | 3.58 | 2.87 | 8.8 | 134.0 | 5500.0 | 18 | 23 |
203 | 1.129568 | 24.193672 | -1 | 95.0 | 21 | 1 | 1 | 1 | 2 | 0 | 0 | 109.1 | 188.8 | 68.9 | 55.5 | 3217 | 2 | 1 | 145 | 6 | 3.01 | 3.40 | 23.0 | 106.0 | 4800.0 | 26 | 27 |
204 | 0.833333 | 0.000000 | -1 | 95.0 | 21 | 0 | 1 | 1 | 2 | 0 | 0 | 109.1 | 188.8 | 68.9 | 55.5 | 3062 | 2 | 0 | 141 | 0 | 3.78 | 3.15 | 9.5 | 114.0 | 5400.0 | 19 | 25 |
205 rows × 27 columns
new_X = feature_extractors(X, callbacks=[create_custom_features]).fillna(0)
discrete_features = new_X.dtypes == int; discrete_features
mi_scores2 = mutual_info_regression(new_X, y, discrete_features=discrete_features)
mi_scores2 = pd.Series(mi_scores2, name="MI Scores", index=new_X.columns).sort_values(ascending=False)
mi_scores2.sort_values(ascending=True).plot(kind='barh', figsize=(8,8))
<matplotlib.axes._subplots.AxesSubplot at 0x7f37ed0d3c50>
new_X_train = feature_extractors(X_train, callbacks=[create_custom_features]).fillna(0)
new_X_holdout = feature_extractors(X_holdout, callbacks=[create_custom_features]).fillna(0)
feature_model(new_X_train, y_train, new_X_holdout, y_holdout, mi_scores, n_cols=17)
{'cv_mean': 0.7993008628952347,
'holdout': 0.8285570951484964,
'k': 17,
'train': 0.9727945533883016}
feature_model(X_train, y_train, X_holdout, y_holdout, mi_scores, n_cols=15)
{'cv_mean': 0.7971583669411606,
'holdout': 0.8374002645219979,
'k': 15,
'train': 0.9663210512717149}
pp = make_pipeline(StandardScaler(), PCA(), ExtraTreesRegressor())
pp
Pipeline(steps=[('standardscaler', StandardScaler()), ('pca', PCA()),
('extratreesregressor', ExtraTreesRegressor())])
feature_model(X_train, y_train, X_holdout, y_holdout, mi_scores, n_cols=15, estimator=pp)
{'cv_mean': 0.7887728552753883,
'holdout': 0.8424713912911774,
'k': 15,
'train': 0.9547614236269416}