{ "cells": [ { "cell_type": "markdown", "id": "c8563200-7f9b-4a2b-b14f-a225e2e23a78", "metadata": {}, "source": [ "# EDA" ] }, { "cell_type": "code", "execution_count": null, "id": "f9062f3a-6d53-43f3-be14-783619ae3301", "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "id": "204731c8-6ddc-47a0-ae10-f5859120bf52", "metadata": {}, "source": [ "## Imports " ] }, { "cell_type": "code", "execution_count": null, "id": "046ffbf1-5d5d-457a-85f7-92a7ece2c5be", "metadata": {}, "outputs": [], "source": [ "from aiking.data.external import * #We need to import this after fastai modules\n", "import warnings\n", "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib.image as mpimg\n", "from fastdownload import download_url\n", "from IPython.display import display, Image\n", "\n", "sns.set(color_codes=True)\n", "sns.set_palette(sns.color_palette(\"muted\"))\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", "id": "b1cc8035-b7a8-4606-9e80-f764aada5420", "metadata": {}, "source": [ "## Getting Dataset" ] }, { "cell_type": "code", "execution_count": null, "id": "68aeda78", "metadata": {}, "outputs": [], "source": [ "path = untar_data(\"kaggle_datasets::camnugent/california-housing-prices\"); path" ] }, { "cell_type": "code", "execution_count": null, "id": "dad5c4b4-d56e-4b08-8983-a91699344069", "metadata": {}, "outputs": [], "source": [ "# download_data??\n", "#download_data(\"https://docs.google.com/uc?export=download&id=1mOK0uyRz5Zs-Qo7mVMlxwtb2xn1E6N9Q\", fname=path/\"housing_address.csv\")\n", "fname = download_url(\"https://docs.google.com/uc?export=download&id=1mOK0uyRz5Zs-Qo7mVMlxwtb2xn1E6N9Q\", dest=path)\n", "fname.rename(fname.parent/\"housing_address.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "42465cce-b996-4e99-9da0-88fd616c3b4a", "metadata": {}, "outputs": [], "source": [ 
"path.ls()" ] }, { "cell_type": "code", "execution_count": null, "id": "2a8cacdb-612c-4242-be5b-180f15c2c14c", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(path/\"housing.csv\"); df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "1d4a9737-d1ac-4c0d-b87e-71869f2cc93d", "metadata": {}, "outputs": [], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "234b320e-4666-48d8-810f-35d044725ee5", "metadata": {}, "outputs": [], "source": [ "df.sample(n=5, random_state=42)" ] }, { "cell_type": "code", "execution_count": null, "id": "3ce38658-1a3f-4830-96e7-cbc133d03fb4", "metadata": {}, "outputs": [], "source": [ "df['ocean_proximity'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "76b7156c-b4f5-460a-ba04-8c802cd7e370", "metadata": {}, "outputs": [], "source": [ "df.describe().T" ] }, { "cell_type": "markdown", "id": "4c7ff70b-d8e1-4af4-a0de-d65fb5ad5002", "metadata": {}, "source": [ "## Data Visualization" ] }, { "cell_type": "markdown", "id": "80345ff3-df61-46f8-9147-f881424a9cf2", "metadata": {}, "source": [ "### Histogram" ] }, { "cell_type": "code", "execution_count": null, "id": "ac8e13cf-f875-477d-b683-05ad409e1965", "metadata": {}, "outputs": [], "source": [ "sns.distplot(df['housing_median_age'], kde=False)" ] }, { "cell_type": "markdown", "id": "5b0037e7-53b2-4a37-abe4-73bc91203522", "metadata": {}, "source": [ "```{note}\n", "- Highest ~50\n", "- Mean ~30 \n", "- Second peak ~15. 
Why?\n", "```" ] }, { "cell_type": "code", "execution_count": null, "id": "16e1e55b-782d-4b00-a191-7fa1f5f1681a", "metadata": {}, "outputs": [], "source": [ "sns.distplot(df['median_income'], kde=False)" ] }, { "cell_type": "markdown", "id": "c21120df-9aea-4caf-858f-c499ba5817f6", "metadata": {}, "source": [ "### Correlation Matrix" ] }, { "cell_type": "code", "execution_count": null, "id": "ace6807d-5ca1-4afc-9d7a-c6d460a65a7b", "metadata": {}, "outputs": [], "source": [ "corr_matrix = df.corr();corr_matrix" ] }, { "cell_type": "code", "execution_count": null, "id": "786b5712-5d46-4c7c-bcfd-f33aa850f431", "metadata": {}, "outputs": [], "source": [ "corr_matrix['median_house_value'].sort_values(ascending=False)" ] }, { "cell_type": "markdown", "id": "10f965ae-b7a3-48b6-9f7f-89c12bb72716", "metadata": {}, "source": [ "```{tip}\n", "We plan to predict median_house_value. This will give us factors which are highly correlated to this factor\n", "```" ] }, { "cell_type": "code", "execution_count": null, "id": "4e1994b8-da85-4aaa-9f1d-8b2266a7d75d", "metadata": {}, "outputs": [], "source": [ "sns.heatmap(corr_matrix, annot=True, fmt='.2f',linewidth=1)" ] }, { "cell_type": "markdown", "id": "34fdf71a-2762-4247-936c-6fe2042045e8", "metadata": {}, "source": [ "### Pearson Coefficient" ] }, { "cell_type": "code", "execution_count": null, "id": "21559c05-79ae-49df-9be8-2db693ffda4a", "metadata": {}, "outputs": [], "source": [ "Image(\"https://lewtun.github.io/dslectures/images/correlation.png\")" ] }, { "cell_type": "markdown", "id": "8e272fa3-4f7c-4a8e-828c-73e907d3c8ee", "metadata": {}, "source": [ "### Pairplot" ] }, { "cell_type": "code", "execution_count": null, "id": "f00a01be-eee5-452e-8f24-e7fbd9f05d4a", "metadata": {}, "outputs": [], "source": [ "attributes = ['median_house_value', 'median_income', 'total_rooms',\n", " 'housing_median_age', 'ocean_proximity']\n", "sns.pairplot(df[attributes], hue='ocean_proximity')\n", "# 
corr_matrix['median_house_value'].sort_values(ascending=False).index" ] },
{ "cell_type": "markdown", "id": "94c6f3ff-bdb1-4a96-a033-cf207433e054", "metadata": {}, "source": [
 "```{note}\n",
 "Choosing attributes which are very highly correlated with the chosen output\n",
 "```"
] },
{ "cell_type": "markdown", "id": "1a9c860b-8af9-412a-b877-02f270c14039", "metadata": {}, "source": [
 "### JointPlot"
] },
{ "cell_type": "code", "execution_count": null, "id": "c7c77d2a-412b-44f7-8da0-5120bb594083", "metadata": {}, "outputs": [], "source": [
 "# x/y must be passed by keyword: seaborn >= 0.12 no longer accepts them positionally.\n",
 "sns.jointplot(x='median_income', y='median_house_value', data=df, kind='hex');"
] },
{ "cell_type": "markdown", "id": "f0bc98b3-039e-441d-892a-766aa787646b", "metadata": {}, "source": [
 "```{note}\n",
 "- Useful when scatter plot is too congested\n",
 "- What is this line at 500,000???\n",
 "\n",
 "```"
] },
{ "cell_type": "markdown", "id": "19b3a401-e1a9-4c88-9a9f-d62479f97940", "metadata": {}, "source": [
 "## Auxiliary Data"
] },
{ "cell_type": "code", "execution_count": null, "id": "493d4693-76c5-4629-a68e-c8aa15c52124", "metadata": {}, "outputs": [], "source": [
 "df2 = pd.read_csv(path/\"housing_address.csv\"); df2.head()"
] },
{ "cell_type": "code", "execution_count": null, "id": "c165ca22-b491-4ad5-8b11-cea2f363525f", "metadata": {}, "outputs": [], "source": [
 "df2.info()"
] },
{ "cell_type": "code", "execution_count": null, "id": "9d3c4161-61cb-4b43-9f8b-f4c7595a6d91", "metadata": {}, "outputs": [], "source": [
 "df.shape, df2.shape"
] },
{ "cell_type": "code", "execution_count": null, "id": "76118835-fd90-4f28-b643-63ebd493a0a3", "metadata": {}, "outputs": [], "source": [
 "df2.columns"
] },
{ "cell_type": "code", "execution_count": null, "id": "a00b84b0-8e97-468b-a4a5-b92b439be6f4", "metadata": {}, "outputs": [], "source": [
 "df2.head()"
] },
{ "cell_type": "code", "execution_count": null, "id": "e49b31af-0312-46a5-988c-a63c5e0f74a4", "metadata": {}, "outputs": [], "source": [
 "df2['locality-political'].nunique()"
] },
{ "cell_type": 
"markdown", "id": "fbb4d94b-554a-42e4-b1c3-22f35621695b", "metadata": {}, "source": [ "```{note}\n", "Number of unique cities\n", "```" ] }, { "cell_type": "markdown", "id": "e26789d3-1c78-44d1-b9b5-aecebc3a6245", "metadata": {}, "source": [ "### Visualizing Geographical Data" ] }, { "cell_type": "code", "execution_count": null, "id": "e0d2833c-dff7-448a-885c-6036222d6677", "metadata": {}, "outputs": [], "source": [ "sns.scatterplot(x=\"longitude\", y='latitude', data=df)" ] }, { "cell_type": "markdown", "id": "f3929718-e467-4bb4-9407-a8ac71703b05", "metadata": {}, "source": [ "```{note}\n", "\n", "- Busy scatterplot hiding potential substructure. We can fix by changing transparency\n", "```" ] }, { "cell_type": "code", "execution_count": null, "id": "6045d1ef-7c4e-4f69-be67-610b6ba5c85b", "metadata": {}, "outputs": [], "source": [ "sns.scatterplot(x=\"longitude\", y='latitude', data=df, alpha=0.1)" ] }, { "cell_type": "code", "execution_count": null, "id": "20cc63db-43dd-46cd-aa7d-fefd1426783b", "metadata": {}, "outputs": [], "source": [ "df.columns" ] }, { "cell_type": "code", "execution_count": null, "id": "0d4b8891-2c9e-4d6d-9123-6a15947e920f", "metadata": {}, "outputs": [], "source": [ "fig = sns.scatterplot(x=\"longitude\", y='latitude', data=df, alpha=0.1, \n", " hue='median_house_value', size='population', palette='viridis')\n", "fig.legend(loc='center left', ncol=1, bbox_to_anchor=(1.01, 0.6))" ] }, { "cell_type": "markdown", "id": "2aeb0f56-094d-4f10-a86a-dcd6a23b3bff", "metadata": {}, "source": [ "```{note}\n", "- One way to draw map using png is described here https://towardsdatascience.com/easy-steps-to-plot-geographic-data-on-a-map-python-11217859a2db\n", "\n", "```" ] }, { "cell_type": "code", "execution_count": null, "id": "c940f04a-94d9-4678-9a2c-61e15d9bee8d", "metadata": {}, "outputs": [], "source": [ "(df['latitude'].min(),df['latitude'].max()), (df['longitude'].min(),df['longitude'].max())" ] }, { "cell_type": "code", "execution_count": null, 
"id": "dc4e4af9-6025-4ac7-9e91-ec1f9a4af9c5", "metadata": {}, "outputs": [], "source": [ "fig = sns.scatterplot(x=\"longitude\", y='latitude', data=df, alpha=0.1, \n", " hue='median_house_value', size='population', palette='viridis')\n", "fig.legend(loc='center left', ncol=1, bbox_to_anchor=(1.01, 0.6))\n", "img = mpimg.imread(\"https://raw.githubusercontent.com/lewtun/dslectures/master/notebooks/images/california.png\")\n", "plt.imshow(img, extent=[df['longitude'].min(),df['longitude'].max(),df['latitude'].min(),df['latitude'].max()], alpha=0.5)" ] }, { "cell_type": "code", "execution_count": null, "id": "ab3731ae-395b-4ed6-ad09-b469461172f0", "metadata": {}, "outputs": [], "source": [ "fig = sns.scatterplot(x=\"longitude\", y='latitude', data=df, alpha=0.1, \n", " hue='ocean_proximity', size='population', palette='viridis')\n", "fig.legend(loc='center left', ncol=1, bbox_to_anchor=(1.01, 0.6))\n", "img = mpimg.imread(\"https://raw.githubusercontent.com/lewtun/dslectures/master/notebooks/images/california.png\")\n", "plt.imshow(img, extent=[df['longitude'].min(),df['longitude'].max(),df['latitude'].min(),df['latitude'].max()], alpha=0.5)" ] }, { "cell_type": "code", "execution_count": null, "id": "c6a0f2aa-8ef5-4462-bce9-13b92e0421ac", "metadata": {}, "outputs": [], "source": [ "df.head(1).T" ] }, { "cell_type": "code", "execution_count": null, "id": "7a0164c9-0faf-4004-85f7-518b45f765ab", "metadata": {}, "outputs": [], "source": [ "df2.head(1).T" ] }, { "cell_type": "markdown", "id": "236dec80-0d30-4077-b6d4-c5301040e42f", "metadata": {}, "source": [ "## Merging Dataframes" ] }, { "cell_type": "code", "execution_count": null, "id": "99c97f35-6f98-49de-9d7e-a59659a7daff", "metadata": {}, "outputs": [], "source": [ "df['latitude_longitude'] = df['latitude'].astype('str')+\",\"+df['longitude'].astype('str')\n", "df.head(1).T" ] }, { "cell_type": "code", "execution_count": null, "id": "ab5b3f6f-dc12-44da-ad3f-a97e202e8fa0", "metadata": {}, "outputs": [], "source": [ 
"df.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "8a3a8162-0ad3-4d4d-8550-3936a9c3adcd", "metadata": {}, "outputs": [], "source": [ "df2.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "7cde511a-c3b1-4240-a66e-512921b9de18", "metadata": {}, "outputs": [], "source": [ "df_merged = pd.merge(df, df2, how='left', \n", " on='latitude_longitude')\\\n", " .drop('latitude_longitude', axis=1)\\\n", " \n", "df_merged.to_csv(path/'merged.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "8dba251c-657b-4b92-b8d7-9c7675e050d2", "metadata": {}, "outputs": [], "source": [ "path.ls()" ] }, { "cell_type": "code", "execution_count": null, "id": "aeb18a59-a392-4d88-8a83-f6b0059b3d4a", "metadata": {}, "outputs": [], "source": [ "df_merged[['locality-political', 'population']].groupby('locality-political').sum().nlargest(10, columns='population', ).sort_values(by='population',ascending=True).plot.barh()" ] }, { "cell_type": "code", "execution_count": null, "id": "d406ec46-3c1e-4b69-bb7f-459c929f992d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 5 }