EDA
Contents
EDA#
# IPython magics: auto-reload edited modules on each cell run, and render
# matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline
Imports#
from aiking.data.external import * #We need to import this after fastai modules
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from fastdownload import download_url
from IPython.display import display, Image
# Notebook-wide plotting defaults: seaborn theme with color codes, the
# "muted" palette, and warnings silenced to keep cell output clean.
muted_palette = sns.color_palette("muted")
sns.set(color_codes=True)
sns.set_palette(muted_palette)
warnings.filterwarnings("ignore")
Getting Dataset#
# Fetch the California housing dataset from Kaggle (cached locally by
# untar_data) and download an auxiliary address file next to it.
path = untar_data("kaggle_datasets::camnugent/california-housing-prices"); path
# download_data??
#download_data("https://docs.google.com/uc?export=download&id=1mOK0uyRz5Zs-Qo7mVMlxwtb2xn1E6N9Q", fname=path/"housing_address.csv")
fname = download_url("https://docs.google.com/uc?export=download&id=1mOK0uyRz5Zs-Qo7mVMlxwtb2xn1E6N9Q", dest=path)
# NOTE(review): Path.rename returns the new path but the result is discarded,
# so `fname` still points at the old name — fine as long as it is not reused.
fname.rename(fname.parent/"housing_address.csv")
path.ls()
df = pd.read_csv(path/"housing.csv"); df.head()
df.info()
df.sample(n=5, random_state=42)  # fixed seed for a reproducible sample
df['ocean_proximity'].value_counts()
df.describe().T
Data Visualization#
Histogram#
# Distribution of district median age. seaborn's distplot was deprecated in
# 0.11 and removed in 0.14; histplot (counts, no KDE by default) is the
# direct replacement for distplot(..., kde=False).
sns.histplot(df['housing_median_age'])
Note
Highest ~50
Mean ~30
Second peak ~15. Why?
# Distribution of district median income. histplot replaces the removed
# distplot(..., kde=False) (deprecated seaborn 0.11, removed 0.14).
sns.histplot(df['median_income'])
Correlation Matrix#
# Pairwise correlation over the numeric columns only — `ocean_proximity` is a
# string column, and pandas >= 2.0 raises a TypeError from df.corr() unless
# non-numeric columns are excluded explicitly.
corr_matrix = df.corr(numeric_only=True); corr_matrix
# Factors most (anti-)correlated with the prediction target.
corr_matrix['median_house_value'].sort_values(ascending=False)
Tip
We plan to predict median_house_value. This will give us factors which are highly correlated to this factor
# Annotated heatmap of the correlation matrix, labels at two decimals.
heatmap_opts = dict(annot=True, fmt='.2f', linewidth=1)
sns.heatmap(corr_matrix, **heatmap_opts)
Pearson Coefficient#
# Reference chart illustrating Pearson correlation coefficient values.
pearson_chart_url = "https://lewtun.github.io/dslectures/images/correlation.png"
Image(pearson_chart_url)
Pairplot#
# Pairwise scatter/histogram grid over the features most correlated with the
# target, colored by ocean proximity.
attributes = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'housing_median_age',
    'ocean_proximity',
]
sns.pairplot(df[attributes], hue='ocean_proximity')
# corr_matrix['median_house_value'].sort_values(ascending=False).index
Note
Choosing attributes which are very highly correlated with the chosen output
JointPlot#
# Hexbin joint plot of income vs house value — useful when a plain scatter is
# too congested. seaborn >= 0.12 requires x/y to be passed as keyword
# arguments; the old positional form raises a TypeError.
sns.jointplot(x='median_income', y='median_house_value', data=df, kind='hex');
Note
Useful when scatter plot is too congested
What is this line at 500,000???
Auxiliary Data#
# Load and inspect the auxiliary address data (one row per coordinate pair,
# presumably reverse-geocoded — verify against the file's provenance).
df2 = pd.read_csv(path/"housing_address.csv"); df2.head()
df2.info()
df.shape, df2.shape  # compare row counts before merging
df2.columns
df2.head()
df2['locality-political'].nunique()  # number of unique cities
Note
Number of unique cities
Visualizing Geographical Data#
# Raw geographic scatter of the districts (longitude vs latitude).
sns.scatterplot(x="longitude", y='latitude', data=df)
Note
Busy scatterplot hiding potential substructure. We can fix by changing transparency
# Low alpha reveals dense regions hidden by the fully opaque plot above.
sns.scatterplot(x="longitude", y='latitude', data=df, alpha=0.1)
df.columns
# Encode house value as color and district population as marker size.
fig = sns.scatterplot(x="longitude", y='latitude', data=df, alpha=0.1,
hue='median_house_value', size='population', palette='viridis')
# Place the legend outside the axes so it does not cover the points.
fig.legend(loc='center left', ncol=1, bbox_to_anchor=(1.01, 0.6))
Note
One way to draw map using png is described here https://towardsdatascience.com/easy-steps-to-plot-geographic-data-on-a-map-python-11217859a2db
# Bounding box of the data: (lat min, lat max), (lon min, lon max).
(df['latitude'].min(),df['latitude'].max()), (df['longitude'].min(),df['longitude'].max())
# House-value/population scatter overlaid on a map image of California.
fig = sns.scatterplot(x="longitude", y='latitude', data=df, alpha=0.1,
hue='median_house_value', size='population', palette='viridis')
fig.legend(loc='center left', ncol=1, bbox_to_anchor=(1.01, 0.6))
img = mpimg.imread("https://raw.githubusercontent.com/lewtun/dslectures/master/notebooks/images/california.png")
# Stretch the image to the data's bounding box so points line up with geography.
plt.imshow(img, extent=[df['longitude'].min(),df['longitude'].max(),df['latitude'].min(),df['latitude'].max()], alpha=0.5)
# Same map overlay, but colored by the categorical ocean_proximity instead
# of house value.
fig = sns.scatterplot(x="longitude", y='latitude', data=df, alpha=0.1,
hue='ocean_proximity', size='population', palette='viridis')
fig.legend(loc='center left', ncol=1, bbox_to_anchor=(1.01, 0.6))
img = mpimg.imread("https://raw.githubusercontent.com/lewtun/dslectures/master/notebooks/images/california.png")
plt.imshow(img, extent=[df['longitude'].min(),df['longitude'].max(),df['latitude'].min(),df['latitude'].max()], alpha=0.5)
df.head(1).T  # one transposed row from each frame to eyeball the columns side by side
df2.head(1).T
Merging Dataframes#
# Build a join key by concatenating the coordinates as strings.
# NOTE(review): joining on stringified floats assumes both files format the
# coordinates identically — confirm, otherwise rows silently fail to match.
df['latitude_longitude'] = df['latitude'].astype('str')+","+df['longitude'].astype('str')
df.head(1).T
df.info()
df2.info()
# Left-join the address data onto the housing frame on the synthetic
# coordinate key, then drop the key column; rows without an address match
# keep NaN in the address fields.
# Fix: the original had a stray trailing "\" after .drop(...), a dangling
# line continuation that is a syntax error in a plain script.
df_merged = pd.merge(df, df2, how='left',
                     on='latitude_longitude')\
                     .drop('latitude_longitude', axis=1)
# index=False avoids writing the row index as a spurious "Unnamed: 0" column.
df_merged.to_csv(path/'merged.csv', index=False)
path.ls()
# Horizontal bar chart of the ten most populous cities, largest on top.
top_cities = (df_merged[['locality-political', 'population']]
              .groupby('locality-political')
              .sum()
              .nlargest(10, columns='population'))
top_cities.sort_values(by='population', ascending=True).plot.barh()