# Feature Engineering
%load_ext autoreload
%autoreload 2
%matplotlib inline
from aiking.data.external import * #We need to import this after fastai modules
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from IPython.display import display, Image
# Plotting defaults: seaborn theme with color codes enabled, "muted" palette.
sns.set(color_codes=True)
sns.set_palette(sns.color_palette(palette="muted"))
# Silence every warning for the remainder of the session.
warnings.filterwarnings(action="ignore")
# Display the datasets reported by list_ds() as a one-column table.
pd.DataFrame(data=list(list_ds()))
 | 0 |
---|---|
0 | oxford-iiit-pet |
1 | DoppelGanger |
2 | mnist_sample |
3 | Bears |
4 | camvid_tiny |
5 | imdb |
6 | california-housing-prices |
7 | adult_sample |
8 | imdb_tok |
9 | movie_lens_sample |
10 | titanic |
11 | Artsie |
12 | dummytbl |
# Locate the California housing dataset; the result is a Path to its directory.
path = get_ds("california-housing-prices")
path
Path('/Landmark2/pdo/aiking/data/california-housing-prices')
# List the files available inside the dataset directory.
path.ls()
(#3) [Path('/Landmark2/pdo/aiking/data/california-housing-prices/housing.csv'),Path('/Landmark2/pdo/aiking/data/california-housing-prices/housing_address.csv'),Path('/Landmark2/pdo/aiking/data/california-housing-prices/merged.csv')]
# Load the merged table; the first CSV column becomes the row index.
df = pd.read_csv(path / "merged.csv", index_col=0)
# Preview the first five rows.
df.head()
 | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | street_number | route | locality-political | postal_code |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY | 3130 | Grizzly Peak Boulevard | Berkeley | 94705.0 |
1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY | 2005 | Tunnel Road | Oakland | 94611.0 |
2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY | 6886 | Chabot Road | Oakland | 94618.0 |
3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY | 6365 | Florio Street | Oakland | 94618.0 |
4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY | 6365 | Florio Street | Oakland | 94618.0 |
# Summary statistics for the numeric columns, transposed so that each
# column of df becomes one row of the report.
df.describe().T
 | count | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|
longitude | 20640.0 | -119.569704 | 2.003532 | -124.3500 | -121.8000 | -118.4900 | -118.01000 | -114.3100 |
latitude | 20640.0 | 35.631861 | 2.135952 | 32.5400 | 33.9300 | 34.2600 | 37.71000 | 41.9500 |
housing_median_age | 20640.0 | 28.639486 | 12.585558 | 1.0000 | 18.0000 | 29.0000 | 37.00000 | 52.0000 |
total_rooms | 20640.0 | 2635.763081 | 2181.615252 | 2.0000 | 1447.7500 | 2127.0000 | 3148.00000 | 39320.0000 |
total_bedrooms | 20433.0 | 537.870553 | 421.385070 | 1.0000 | 296.0000 | 435.0000 | 647.00000 | 6445.0000 |
population | 20640.0 | 1425.476744 | 1132.462122 | 3.0000 | 787.0000 | 1166.0000 | 1725.00000 | 35682.0000 |
households | 20640.0 | 499.539680 | 382.329753 | 1.0000 | 280.0000 | 409.0000 | 605.00000 | 6082.0000 |
median_income | 20640.0 | 3.870671 | 1.899822 | 0.4999 | 2.5634 | 3.5348 | 4.74325 | 15.0001 |
median_house_value | 20640.0 | 206855.816909 | 115395.615874 | 14999.0000 | 119600.0000 | 179700.0000 | 264725.00000 | 500001.0000 |
postal_code | 20454.0 | 92996.901926 | 1858.067396 | 85344.0000 | 91505.2500 | 92840.0000 | 94601.00000 | 96161.0000 |
# Print per-column dtypes and non-null counts; columns whose count is
# below the total number of entries contain missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20640 entries, 0 to 20639
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 20640 non-null float64
1 latitude 20640 non-null float64
2 housing_median_age 20640 non-null float64
3 total_rooms 20640 non-null float64
4 total_bedrooms 20433 non-null float64
5 population 20640 non-null float64
6 households 20640 non-null float64
7 median_income 20640 non-null float64
8 median_house_value 20640 non-null float64
9 ocean_proximity 20640 non-null object
10 street_number 19008 non-null object
11 route 20091 non-null object
12 locality-political 20452 non-null object
13 postal_code 20454 non-null float64
dtypes: float64(10), object(4)
memory usage: 2.4+ MB