{ "cells": [ { "cell_type": "markdown", "id": "503e1f1a-363e-49b3-bee9-efc04490b935", "metadata": {}, "source": [ "# Grouping and Clustering" ] }, { "cell_type": "markdown", "id": "8b2b0539-7b95-47ff-bfc5-ca8564c6aec5", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": null, "id": "98871f2f-54d0-4fd2-8498-18d4f67c3813", "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "import numpy as np \n", "import scipy as sp\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.cluster import AgglomerativeClustering, KMeans\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import StandardScaler" ] }, { "cell_type": "markdown", "id": "2e5127c2-895c-4658-9b0d-0ab43022be9a", "metadata": {}, "source": [ "## SKU Example" ] }, { "cell_type": "code", "execution_count": null, "id": "40342323-72f1-472f-95de-1433389f17d9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>ADS</th>\n", " <th>CV</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>0.68</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>3</td>\n", " <td>0.40</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1</td>\n", " <td>0.59</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2</td>\n", " <td>0.39</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>9</td>\n", " <td>0.11</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " ADS CV\n", "0 1 0.68\n", "1 3 0.40\n", "2 1 0.59\n", "3 2 0.39\n", "4 9 0.11" ] }, "execution_count": null, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"DATA_2.01_SKU.csv\"); df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "f47a692d-1f33-493b-8051-c1b6eff06f12", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>ADS</th>\n", " <th>CV</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>100.000000</td>\n", " <td>100.000000</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>5.610000</td>\n", " <td>0.396000</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>4.211324</td>\n", " <td>0.237317</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>1.000000</td>\n", " <td>0.050000</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>2.000000</td>\n", " <td>0.130000</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>3.000000</td>\n", " <td>0.400000</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>10.000000</td>\n", " <td>0.590000</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>14.000000</td>\n", " <td>0.960000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " ADS CV\n", "count 100.000000 100.000000\n", "mean 5.610000 0.396000\n", "std 4.211324 0.237317\n", "min 1.000000 0.050000\n", "25% 2.000000 0.130000\n", "50% 3.000000 0.400000\n", "75% 10.000000 0.590000\n", "max 14.000000 0.960000" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": null, "id": "a451bcfa-d3a6-4f32-ab46-857624bab1ce", "metadata": {}, "outputs": [ { 
"name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_369678/530051474.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.\n", " df.median()\n" ] }, { "data": { "text/plain": [ "ADS 3.0\n", "CV 0.4\n", "dtype: float64" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.median()" ] }, { "cell_type": "markdown", "id": "706989dc-9c20-449d-bfc0-b6821aed5c37", "metadata": {}, "source": [ "### Manual Segregation" ] }, { "cell_type": "code", "execution_count": null, "id": "8e54065a-e53d-4a87-b8cb-c2306073e1a2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.8, 2, 'Crickets')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "<Figure size 864x432 with 1 Axes>" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "fig, ax = plt.subplots(1,figsize=(12,6))\n", "df.plot.scatter(x=\"CV\",y=\"ADS\", xlabel=\"Coefficient of Variance\", ylabel=\"Average Daily Sales\", ax=ax)\n", "ax.axvline(x=0.2, color=\"red\")\n", "ax.axhline(y=4, color=\"red\")\n", "ax.text(0.13, 9.5, \"Horses\", color=\"red\")\n", "ax.text(0.65, 9.5, \"Wild Bulls\", color=\"red\")\n", "ax.text(0.8, 2, \"Crickets\", color=\"red\")" ] }, { "cell_type": "markdown", "id": "c2d501b1-9518-418c-954a-e255a3c2055b", "metadata": {}, "source": [ "### Hierarchical Clustering" ] }, { "cell_type": "code", "execution_count": null, "id": "bfeb0425-d5da-4c25-be90-5c40775cad62", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline(steps=[('standardscaler', StandardScaler()),\n", " ('agglomerativeclustering', AgglomerativeClustering())])\n" ] }, { "data": { "text/plain": [ "Pipeline(steps=[('standardscaler', 
StandardScaler()),\n", " ('kmeans', KMeans(n_clusters=3))])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline = make_pipeline(StandardScaler(), AgglomerativeClustering(n_clusters=2, linkage='ward')); print(pipeline)\n", "pipeline2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3)); pipeline2" ] }, { "cell_type": "code", "execution_count": null, "id": "e3cb97eb-1d43-4e43-bc15-2bb8bca54ea0", "metadata": {}, "outputs": [], "source": [ "df[\"group\"]=pipeline.fit_predict(df[[\"ADS\", \"CV\"]])\n", "df[\"group\"] =df[\"group\"].astype(\"category\")" ] }, { "cell_type": "code", "execution_count": null, "id": "24900e1e-70b9-4bdb-8992-0ab631572c91", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.8, 2, 'Crickets')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "<Figure size 864x432 with 2 Axes>" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "fig, ax = plt.subplots(1,figsize=(12,6))\n", "df.plot.scatter(x=\"CV\",y=\"ADS\", xlabel=\"Coefficient of Variance\", ylabel=\"Average Daily Sales\", c=\"group\", ax=ax, cmap=\"coolwarm\")\n", "ax.axvline(x=0.2, color=\"red\")\n", "ax.axhline(y=4, color=\"red\")\n", "ax.text(0.13, 9.5, \"Horses\", color=\"red\")\n", "ax.text(0.65, 9.5, \"Wild Bulls\", color=\"red\")\n", "ax.text(0.8, 2, \"Crickets\", color=\"red\")" ] }, { "cell_type": "markdown", "id": "d5f74add-ea77-4feb-ae75-9ce68b03b246", "metadata": {}, "source": [ "### Dendrogram" ] }, { "cell_type": "code", "execution_count": null, "id": "5ede1fc4-40f7-4d4f-a488-8e494428f471", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AgglomerativeClustering(n_clusters=3)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = pipeline['agglomerativeclustering']\n", "model" ] }, { "cell_type": "markdown", "id": 
"3fae2d63-0387-4978-8a0e-afd2bad2ba63", "metadata": {}, "source": [ "## HR Dataset" ] }, { "cell_type": "code", "execution_count": null, "id": "f8cc0e0b-9b16-45f4-af76-db573f3c32c3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>S</th>\n", " <th>LPE</th>\n", " <th>NP</th>\n", " <th>ANH</th>\n", " <th>TIC</th>\n", " <th>Newborn</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.38</td>\n", " <td>0.53</td>\n", " <td>2</td>\n", " <td>157</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0.80</td>\n", " <td>0.86</td>\n", " <td>5</td>\n", " <td>262</td>\n", " <td>6</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0.11</td>\n", " <td>0.88</td>\n", " <td>7</td>\n", " <td>272</td>\n", " <td>4</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0.72</td>\n", " <td>0.87</td>\n", " <td>5</td>\n", " <td>223</td>\n", " <td>5</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0.37</td>\n", " <td>0.52</td>\n", " <td>2</td>\n", " <td>159</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " S LPE NP ANH TIC Newborn\n", "0 0.38 0.53 2 157 3 0\n", "1 0.80 0.86 5 262 6 0\n", "2 0.11 0.88 7 272 4 0\n", "3 0.72 0.87 5 223 5 0\n", "4 0.37 0.52 2 159 3 0" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = pd.read_csv(\"DATA_2.02_HR.csv\"); df2.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "b403173a-c3ef-4f96-9fa4-1ce39574ed78", 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "<AxesSubplot:xlabel='Number of Projects Done', ylabel='Last Project Evaluation'>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "<Figure size 864x432 with 1 Axes>" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "fig, ax = plt.subplots(1,figsize=(12,6))\n", "df2.plot.scatter(x=\"NP\",y=\"LPE\", xlabel=\"Number of Projects Done\", ylabel=\"Last Project Evaluation\", ax=ax)" ] }, { "cell_type": "code", "execution_count": null, "id": "500bc785-4e00-479a-96e1-46d1c6a5f571", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline(steps=[('standardscaler', StandardScaler()),\n", " ('agglomerativeclustering', AgglomerativeClustering())])\n" ] } ], "source": [ "pipeline = make_pipeline(StandardScaler(), AgglomerativeClustering(n_clusters=2, linkage='ward')); print(pipeline)" ] }, { "cell_type": "code", "execution_count": null, "id": "54cb7bbb-f066-4471-aab5-4f2c665e7a24", "metadata": {}, "outputs": [], "source": [ "df2[\"group\"]=pipeline.fit_predict(df2[[\"S\", \"LPE\", \"NP\"]])\n", "df2[\"group\"] =df2[\"group\"].astype(\"category\")" ] }, { "cell_type": "code", "execution_count": null, "id": "a326f767-5979-4425-89d9-68d8717ad432", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>S</th>\n", " <th>LPE</th>\n", " <th>NP</th>\n", " <th>ANH</th>\n", " <th>TIC</th>\n", " <th>Newborn</th>\n", " </tr>\n", " <tr>\n", " <th>group</th>\n", " <th></th>\n", " 
<th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.64</td>\n", " <td>0.90</td>\n", " <td>5.0</td>\n", " <td>259.0</td>\n", " <td>5.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0.41</td>\n", " <td>0.52</td>\n", " <td>2.0</td>\n", " <td>146.0</td>\n", " <td>3.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " S LPE NP ANH TIC Newborn\n", "group \n", "0 0.64 0.90 5.0 259.0 5.0 0.0\n", "1 0.41 0.52 2.0 146.0 3.0 0.0" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.groupby(\"group\").median()" ] }, { "cell_type": "markdown", "id": "a031f636-4e0c-46fe-8c63-84d7d8edeeb3", "metadata": {}, "source": [ "## Telecom" ] }, { "cell_type": "code", "execution_count": null, "id": "e2dcafc5-b8e6-4d37-9f47-9334e56ee4b8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Calls</th>\n", " <th>Intern</th>\n", " <th>Text</th>\n", " <th>Data</th>\n", " <th>Age</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1.12</td>\n", " <td>0.19</td>\n", " <td>23.92</td>\n", " <td>0.18</td>\n", " <td>60</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1.08</td>\n", " <td>0.22</td>\n", " <td>17.76</td>\n", " <td>0.23</td>\n", " <td>54</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3.54</td>\n", " <td>0.26</td>\n", " <td>289.79</td>\n", " <td>1.99</td>\n", " <td>34</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1.09</td>\n", " 
<td>0.21</td>\n", " <td>19.15</td>\n", " <td>0.21</td>\n", " <td>61</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1.04</td>\n", " <td>0.24</td>\n", " <td>20.33</td>\n", " <td>0.20</td>\n", " <td>56</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Calls Intern Text Data Age\n", "0 1.12 0.19 23.92 0.18 60\n", "1 1.08 0.22 17.76 0.23 54\n", "2 3.54 0.26 289.79 1.99 34\n", "3 1.09 0.21 19.15 0.21 61\n", "4 1.04 0.24 20.33 0.20 56" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3 = pd.read_csv(\"DATA_2.03_Telco.csv\"); df3.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "68aeec7a-aaa2-4127-8a63-8e10e93b73d6", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 5 }