# EDA

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

## Imports 

In [None]:
from aiking.data.external import * #We need to import this after fastai modules
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from fastdownload import download_url
from IPython.display import display, Image

# Apply seaborn's default theme; color_codes=True remaps matplotlib's
# shorthand colour codes ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)
# Use seaborn's "muted" palette for the plots that follow.
sns.set_palette(sns.color_palette("muted"))
# NOTE(review): this silences *every* warning, including deprecation warnings
# from pandas / seaborn / matplotlib — consider narrowing the filter.
warnings.filterwarnings("ignore")

## Getting Dataset

In [None]:
# Resolve the Kaggle California-housing dataset to a local folder
# (presumably downloading/unpacking on first use — see untar_data).
path = untar_data("kaggle_datasets::camnugent/california-housing-prices")
path

In [None]:
# Auxiliary address table, stored next to housing.csv in the dataset folder.
# Only hit the network when the file is not already cached, so that
# "Restart & Run All" stays cheap and repeatable.
address_csv = path/"housing_address.csv"
if not address_csv.exists():
    # The download does not arrive under the desired file name, so it is
    # renamed to housing_address.csv afterwards.
    fname = download_url("https://docs.google.com/uc?export=download&id=1mOK0uyRz5Zs-Qo7mVMlxwtb2xn1E6N9Q", dest=path)
    fname.rename(address_csv)
address_csv

In [None]:
# List the contents of the dataset folder.
path.ls()

In [None]:
# Load the main housing table and preview its first rows.
df = pd.read_csv(path/"housing.csv")
df.head()

In [None]:
# Column dtypes and non-null counts for the housing table.
df.info()

In [None]:
# Five random rows; the fixed random_state keeps the sample reproducible.
df.sample(n=5, random_state=42)

In [None]:
# Number of rows per distinct ocean_proximity value.
df['ocean_proximity'].value_counts()

In [None]:
# Summary statistics, transposed so each column of df becomes one row.
df.describe().transpose()

## Data Visualization

### Histogram

In [None]:
# Histogram of housing_median_age.
# sns.distplot has been deprecated since seaborn 0.11; sns.histplot is the
# histogram-only replacement (no KDE by default, matching kde=False).
sns.histplot(df['housing_median_age'])

```{note}
- The tallest bar is at a housing median age of ~50
- The mean is ~30
- There is a second peak at ~15. Why?
```

In [None]:
# Histogram of median_income.
# sns.distplot has been deprecated since seaborn 0.11; sns.histplot is the
# histogram-only replacement (no KDE by default, matching kde=False).
sns.histplot(df['median_income'])

### Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns only.
# df contains the string column ocean_proximity; pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead, so the
# numeric columns are selected explicitly (this works on older pandas too).
corr_matrix = df.select_dtypes(include='number').corr()
corr_matrix

In [None]:
# Correlation of every numeric column with median_house_value, strongest
# positive correlation first.
corr_matrix['median_house_value'].sort_values(ascending=False)

```{tip}
We plan to predict `median_house_value`. Sorting its column of the correlation matrix shows which features are most strongly correlated with this target.
```

In [None]:
# Annotated heatmap of the correlation matrix (values shown to 2 decimals).
# `linewidths` is the documented seaborn.heatmap parameter for the width of
# the lines dividing cells; `linewidth` only reached matplotlib through kwargs.
sns.heatmap(corr_matrix, annot=True, fmt='.2f', linewidths=1)

### Pearson Coefficient

In [None]:
# Reference figure for the Pearson coefficient, displayed from its remote URL.
Image(url="https://lewtun.github.io/dslectures/images/correlation.png")

### Pairplot

In [None]:
# Columns shown in the pair plot; ocean_proximity is used only for colouring.
attributes = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'housing_median_age',
    'ocean_proximity',
]
sns.pairplot(data=df[attributes], hue='ocean_proximity')

```{note}
We choose the attributes that are most highly correlated with the chosen output (`median_house_value`).
```

### JointPlot

In [None]:
# Hex-binned joint distribution of median_income vs. median_house_value.
# x and y are passed by keyword: seaborn deprecated positional data arguments
# in 0.11 and rejects them in later releases.
sns.jointplot(x='median_income', y='median_house_value', data=df, kind='hex');

```{note}
- A hex-bin joint plot is useful when a scatter plot is too congested
- What is this line at 500,000?

```

## Auxiliary Data

In [None]:
# Load the auxiliary address table and preview its first rows.
df2 = pd.read_csv(path/"housing_address.csv")
df2.head()

In [None]:
# Column dtypes and non-null counts for the address table.
df2.info()

In [None]:
# (rows, columns) of the housing table and of the address table, side by side.
df.shape, df2.shape

In [None]:
# Column names available in the address table.
df2.columns

In [None]:
# NOTE(review): repeats the preview already shown when df2 was loaded.
df2.head()

In [None]:
# Count of distinct values in the locality-political column.
df2['locality-political'].nunique()

```{note}
Number of unique cities
```

### Visualizing Geographical Data

In [None]:
# Plot every row at its longitude/latitude position.
sns.scatterplot(data=df, x="longitude", y="latitude")

```{note}

- The scatter plot is so busy that it hides potential substructure. We can fix this by making the points more transparent.
```

In [None]:
# Same longitude/latitude scatter, with mostly transparent points so that
# overlapping points show up as darker regions.
sns.scatterplot(data=df, x="longitude", y="latitude", alpha=0.1)

In [None]:
# Column names available in the housing table.
df.columns

In [None]:
# Longitude/latitude scatter: colour encodes median_house_value and marker
# size encodes population.
fig = sns.scatterplot(
    data=df,
    x="longitude",
    y='latitude',
    hue='median_house_value',
    size='population',
    palette='viridis',
    alpha=0.1,
)
# Anchor the legend just outside the right edge of the axes.
fig.legend(bbox_to_anchor=(1.01, 0.6), loc='center left', ncol=1)

```{note}
- One way to draw a map from a PNG image is described here: https://towardsdatascience.com/easy-steps-to-plot-geographic-data-on-a-map-python-11217859a2db

```

In [None]:
# (min, max) of latitude followed by (min, max) of longitude.
tuple((df[col].min(), df[col].max()) for col in ('latitude', 'longitude'))

In [None]:
# Longitude/latitude scatter (colour = median_house_value, size = population)
# with a California map image drawn over the data's lon/lat bounding box.
fig = sns.scatterplot(x="longitude", y='latitude', data=df, alpha=0.1, 
 hue='median_house_value', size='population', palette='viridis')
fig.legend(loc='center left', ncol=1, bbox_to_anchor=(1.01, 0.6))
# matplotlib deprecated passing a URL to imread (3.4) and rejects it in later
# releases, so the PNG is downloaded once into the dataset folder and read
# from disk; the cached copy is reused on later runs.
map_file = path/"california.png"
if not map_file.exists():
    map_file = download_url("https://raw.githubusercontent.com/lewtun/dslectures/master/notebooks/images/california.png", dest=path)
img = mpimg.imread(str(map_file))
plt.imshow(img, extent=[df['longitude'].min(),df['longitude'].max(),df['latitude'].min(),df['latitude'].max()], alpha=0.5)

In [None]:
# Longitude/latitude scatter (colour = ocean_proximity, size = population)
# with a California map image drawn over the data's lon/lat bounding box.
fig = sns.scatterplot(x="longitude", y='latitude', data=df, alpha=0.1, 
 hue='ocean_proximity', size='population', palette='viridis')
fig.legend(loc='center left', ncol=1, bbox_to_anchor=(1.01, 0.6))
# matplotlib deprecated passing a URL to imread (3.4) and rejects it in later
# releases, so the PNG is downloaded once into the dataset folder and read
# from disk; the cached copy is reused on later runs.
map_file = path/"california.png"
if not map_file.exists():
    map_file = download_url("https://raw.githubusercontent.com/lewtun/dslectures/master/notebooks/images/california.png", dest=path)
img = mpimg.imread(str(map_file))
plt.imshow(img, extent=[df['longitude'].min(),df['longitude'].max(),df['latitude'].min(),df['latitude'].max()], alpha=0.5)

In [None]:
# First housing row, transposed so each column is shown on its own line.
df.head(1).T

In [None]:
# First address row, transposed so each column is shown on its own line.
df2.head(1).T

## Merging Dataframes

In [None]:
# Build a "<latitude>,<longitude>" string key on the housing table
# (presumably the same format as df2's latitude_longitude column — verify).
df['latitude_longitude'] = (
    df['latitude'].astype('str')
    .str.cat(df['longitude'].astype('str'), sep=",")
)
df.head(1).T

In [None]:
# Re-inspect the housing table now that the latitude_longitude key was added.
df.info()

In [None]:
# Re-inspect the address table's dtypes and non-null counts before merging.
df2.info()

In [None]:
# Left-join the address table onto the housing rows via the shared
# latitude_longitude key, then drop that helper key column.
df_merged = (
    df.merge(df2, how='left', on='latitude_longitude')
      .drop(columns='latitude_longitude')
)
# Persist the merged table in the dataset folder.
df_merged.to_csv(path/'merged.csv')

In [None]:
# List the dataset folder again after writing merged.csv.
path.ls()

In [None]:
# Sum population per locality, keep the ten largest totals, and draw them as
# horizontal bars sorted in ascending order of population.
(
    df_merged[['locality-political', 'population']]
    .groupby('locality-political')
    .sum()
    .nlargest(10, columns='population')
    .sort_values(by='population', ascending=True)
    .plot.barh()
)