# Feature Engineering

# Enable the autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from aiking.data.external import * #We need to import this after fastai modules
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from IPython.display import display, Image

# Plot styling: apply seaborn's defaults with matplotlib's single-letter colour
# codes ("b", "g", "r", ...) remapped, then switch the colour cycle to "muted".
sns.set(color_codes=True)
sns.set_palette("muted")

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Tabulate the dataset names yielded by list_ds() as a one-column frame.
pd.DataFrame([*list_ds()])
0
0 oxford-iiit-pet
1 DoppelGanger
2 mnist_sample
3 Bears
4 camvid_tiny
5 imdb
6 california-housing-prices
7 adult_sample
8 imdb_tok
9 movie_lens_sample
10 titanic
11 Artsie
12 dummytbl
# Look up the California housing dataset by name; per the recorded output,
# get_ds() hands back the dataset's local directory as a Path.
path = get_ds("california-housing-prices")
path
Path('/Landmark2/pdo/aiking/data/california-housing-prices')
# List the files in the dataset directory.
# NOTE(review): .ls() is not a stdlib pathlib method — presumably it is patched
# onto Path by something pulled in through the star import; confirm.
path.ls()
(#3) [Path('/Landmark2/pdo/aiking/data/california-housing-prices/housing.csv'),Path('/Landmark2/pdo/aiking/data/california-housing-prices/housing_address.csv'),Path('/Landmark2/pdo/aiking/data/california-housing-prices/merged.csv')]
# Load merged.csv from the dataset directory and preview the first rows.
df = pd.read_csv(
    path / "merged.csv",
    index_col=0,  # the first CSV column holds the row index
)
df.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity street_number route locality-political postal_code
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY 3130 Grizzly Peak Boulevard Berkeley 94705.0
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY 2005 Tunnel Road Oakland 94611.0
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY 6886 Chabot Road Oakland 94618.0
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY 6365 Florio Street Oakland 94618.0
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY 6365 Florio Street Oakland 94618.0
# Summary statistics for the numeric columns, transposed so that each column
# becomes a row (easier to scan with this many features).
# NOTE(review): postal_code is parsed as float64, so it is summarised here even
# though its mean/std carry no meaning; the recorded minimum (85344) also looks
# like it may fall outside California — verify the geocoded addresses.
df.describe().transpose()
count mean std min 25% 50% 75% max
longitude 20640.0 -119.569704 2.003532 -124.3500 -121.8000 -118.4900 -118.01000 -114.3100
latitude 20640.0 35.631861 2.135952 32.5400 33.9300 34.2600 37.71000 41.9500
housing_median_age 20640.0 28.639486 12.585558 1.0000 18.0000 29.0000 37.00000 52.0000
total_rooms 20640.0 2635.763081 2181.615252 2.0000 1447.7500 2127.0000 3148.00000 39320.0000
total_bedrooms 20433.0 537.870553 421.385070 1.0000 296.0000 435.0000 647.00000 6445.0000
population 20640.0 1425.476744 1132.462122 3.0000 787.0000 1166.0000 1725.00000 35682.0000
households 20640.0 499.539680 382.329753 1.0000 280.0000 409.0000 605.00000 6082.0000
median_income 20640.0 3.870671 1.899822 0.4999 2.5634 3.5348 4.74325 15.0001
median_house_value 20640.0 206855.816909 115395.615874 14999.0000 119600.0000 179700.0000 264725.00000 500001.0000
postal_code 20454.0 92996.901926 1858.067396 85344.0000 91505.2500 92840.0000 94601.00000 96161.0000
# Print column dtypes and non-null counts. In the recorded output,
# total_bedrooms and the four address columns (street_number, route,
# locality-political, postal_code) contain missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20640 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
 10  street_number       19008 non-null  object 
 11  route               20091 non-null  object 
 12  locality-political  20452 non-null  object 
 13  postal_code         20454 non-null  float64
dtypes: float64(10), object(4)
memory usage: 2.4+ MB