EDA#

# IPython setup: auto-reload edited modules on each cell execution and
# render matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

Imports#

from aiking.data.external import * #We need to import this after fastai modules
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from fastdownload import download_url
from IPython.display import display, Image

# Global plotting/runtime configuration for the whole notebook.
sns.set(color_codes=True)  # seaborn styling; map matplotlib color shorthands to the palette
sns.set_palette(sns.color_palette("muted"))  # muted colors for all subsequent plots
warnings.filterwarnings("ignore")  # NOTE(review): silences ALL warnings, including deprecations

Getting Dataset#

# Download the California housing dataset from Kaggle and an auxiliary
# address file from Google Drive, then take a first look at the data.
path = untar_data("kaggle_datasets::camnugent/california-housing-prices"); path
# download_data??
#download_data("https://docs.google.com/uc?export=download&id=1mOK0uyRz5Zs-Qo7mVMlxwtb2xn1E6N9Q", fname=path/"housing_address.csv")
fname = download_url("https://docs.google.com/uc?export=download&id=1mOK0uyRz5Zs-Qo7mVMlxwtb2xn1E6N9Q", dest=path)
# NOTE(review): rename may fail if housing_address.csv already exists from a prior run.
fname.rename(fname.parent/"housing_address.csv")
path.ls()
df = pd.read_csv(path/"housing.csv"); df.head()
df.info()
# Fixed seed keeps the sampled rows reproducible across notebook runs.
df.sample(n=5, random_state=42)
df['ocean_proximity'].value_counts()
# Transposed summary statistics are easier to scan when there are many columns.
df.describe().T

Data Visualization#

Histogram#

# Distribution of district median house ages.
# seaborn.distplot was deprecated in 0.11 and removed in 0.14; histplot is the
# drop-in replacement (counts histogram, no KDE by default — same as kde=False).
sns.histplot(df['housing_median_age'])

Note

  • Highest ~50

  • Mean ~30

  • Second peak ~15. Why?

# Distribution of district median incomes.
# seaborn.distplot was deprecated in 0.11 and removed in 0.14; histplot is the
# drop-in replacement (counts histogram, no KDE by default — same as kde=False).
sns.histplot(df['median_income'])

Correlation Matrix#

# Pairwise Pearson correlations between the numeric columns.
# select_dtypes keeps this working on pandas >= 2.0, where DataFrame.corr()
# raises on non-numeric columns (here: 'ocean_proximity') instead of
# silently dropping them as older pandas did.
corr_matrix = df.select_dtypes(include='number').corr(); corr_matrix
# Features most (positively) correlated with the prediction target.
corr_matrix['median_house_value'].sort_values(ascending=False)

Tip

We plan to predict median_house_value. Sorting the correlations shows which features are most strongly correlated with this target.

# Annotated heatmap of the correlation matrix (two decimals per cell).
sns.heatmap(corr_matrix, annot=True, fmt='.2f',linewidth=1)

Pearson Coefficient#

# Reference figure illustrating Pearson correlation coefficients.
# NOTE(review): relies on IPython.display.Image accepting an http(s) string
# as its first argument and fetching it as a URL — confirm on the IPython
# version in use (Image(url=...) is the explicit form).
Image("https://lewtun.github.io/dslectures/images/correlation.png")

Pairplot#

# Pairwise scatter plots of the features most correlated with the target,
# with points colored by ocean proximity.
attributes = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'housing_median_age',
    'ocean_proximity',
]
sns.pairplot(df[attributes], hue='ocean_proximity')
# corr_matrix['median_house_value'].sort_values(ascending=False).index

Note

Choosing the attributes that are most highly correlated with the chosen output.

JointPlot#

# Hexbin joint distribution of income vs. house value.
# seaborn made x/y keyword-only (positional use deprecated in 0.11, removed
# in 0.12+), so the original positional call fails on current seaborn.
sns.jointplot(x='median_income', y='median_house_value', data=df, kind='hex');

Note

  • Useful when scatter plot is too congested

  • What is this line at 500,000???

Auxiliary Data#

# Auxiliary geocoded-address data downloaded earlier to housing_address.csv.
df2 = pd.read_csv(path/"housing_address.csv"); df2.head()
df2.info()
# Compare row/column counts of the two frames before merging.
df.shape, df2.shape
df2.columns
df2.head()
# Number of distinct localities (cities) in the address data.
df2['locality-political'].nunique()

Note

Number of unique cities

Visualizing Geographical Data#

# Raw geographic scatter of every district (longitude vs. latitude).
sns.scatterplot(x="longitude", y='latitude', data=df)

Note

  • The busy scatter plot hides potential substructure. We can fix this by lowering the point transparency (alpha)

# Lower alpha reveals dense clusters that the opaque scatter plot hid.
sns.scatterplot(x="longitude", y='latitude', data=df, alpha=0.1)
df.columns
# seaborn.scatterplot returns a matplotlib Axes (the name 'fig' is historical).
fig = sns.scatterplot(x="longitude", y='latitude', data=df, alpha=0.1, 
                hue='median_house_value', size='population', palette='viridis')
# Move the legend outside the axes so it does not cover the map.
fig.legend(loc='center left', ncol=1, bbox_to_anchor=(1.01, 0.6))
# Coordinate bounds; used below as the extent for the background image.
(df['latitude'].min(),df['latitude'].max()), (df['longitude'].min(),df['longitude'].max())
fig = sns.scatterplot(x="longitude", y='latitude', data=df, alpha=0.1, 
                hue='median_house_value', size='population', palette='viridis')
fig.legend(loc='center left', ncol=1, bbox_to_anchor=(1.01, 0.6))
# Overlay a California map stretched to the data's lat/lon bounding box.
img = mpimg.imread("https://raw.githubusercontent.com/lewtun/dslectures/master/notebooks/images/california.png")
plt.imshow(img, extent=[df['longitude'].min(),df['longitude'].max(),df['latitude'].min(),df['latitude'].max()], alpha=0.5)
# Same overlay, but colored by ocean proximity instead of house value.
fig = sns.scatterplot(x="longitude", y='latitude', data=df, alpha=0.1, 
                hue='ocean_proximity', size='population', palette='viridis')
fig.legend(loc='center left', ncol=1, bbox_to_anchor=(1.01, 0.6))
img = mpimg.imread("https://raw.githubusercontent.com/lewtun/dslectures/master/notebooks/images/california.png")
plt.imshow(img, extent=[df['longitude'].min(),df['longitude'].max(),df['latitude'].min(),df['latitude'].max()], alpha=0.5)
# Single transposed rows make it easy to compare the two schemas side by side.
df.head(1).T
df2.head(1).T

Merging Dataframes#

# Build a join key from the coordinate pair so df (housing) can be matched
# to df2 (geocoded addresses).
# NOTE(review): string-formatted floats are a fragile join key — this assumes
# both frames render coordinates identically; verify the post-merge match rate.
df['latitude_longitude'] = df['latitude'].astype('str')+","+df['longitude'].astype('str')
df.head(1).T
df.info()
df2.info()
# Left join keeps every housing row even where no address was geocoded.
# The original ended this chain with a stray '\' continuation into a
# whitespace-only line, which breaks as soon as that line is edited;
# a parenthesized chain is the robust form.
df_merged = (
    pd.merge(df, df2, how='left', on='latitude_longitude')
    .drop('latitude_longitude', axis=1)
)
# NOTE(review): writes the RangeIndex as an extra unnamed column; consider index=False.
df_merged.to_csv(path/'merged.csv')
path.ls()
# Ten most populous localities, sorted ascending so the largest bar plots on top.
(
    df_merged[['locality-political', 'population']]
    .groupby('locality-political')
    .sum()
    .nlargest(10, columns='population')
    .sort_values(by='population', ascending=True)
    .plot.barh()
)