Grouping and Clustering#

Imports#

import pandas as pd 
import numpy as np 
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

SKU Example#

df = pd.read_csv("DATA_2.01_SKU.csv"); df.head()
ADS CV
0 1 0.68
1 3 0.40
2 1 0.59
3 2 0.39
4 9 0.11
df.describe()
ADS CV
count 100.000000 100.000000
mean 5.610000 0.396000
std 4.211324 0.237317
min 1.000000 0.050000
25% 2.000000 0.130000
50% 3.000000 0.400000
75% 10.000000 0.590000
max 14.000000 0.960000
df.median()
/tmp/ipykernel_369678/530051474.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df.median()
ADS    3.0
CV     0.4
dtype: float64

Manual Segregation#

fig, ax = plt.subplots(1,figsize=(12,6))
df.plot.scatter(x="CV",y="ADS", xlabel="Coefficient of Variance", ylabel="Average Daily Sales", ax=ax)
ax.axvline(x=0.2, color="red")
ax.axhline(y=4, color="red")
ax.text(0.13, 9.5, "Horses", color="red")
ax.text(0.65, 9.5, "Wild Bulls", color="red")
ax.text(0.8, 2, "Crickets", color="red")
Text(0.8, 2, 'Crickets')
../../_images/W102_clustering_8_1.png

Hierarchial Clustering#

pipeline = make_pipeline(StandardScaler(), AgglomerativeClustering(n_clusters=2, linkage='ward')); print(pipeline)
pipeline2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3)); pipeline2
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('agglomerativeclustering', AgglomerativeClustering())])
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=3))])
df["group"]=pipeline.fit_predict(df)
df["group"] =df["group"].astype("category")
fig, ax = plt.subplots(1,figsize=(12,6))
df.plot.scatter(x="CV",y="ADS", xlabel="Coefficient of Variance", ylabel="Average Daily Sales", c="group", ax=ax, cmap="coolwarm")
ax.axvline(x=0.2, color="red")
ax.axhline(y=4, color="red")
ax.text(0.13, 9.5, "Horses", color="red")
ax.text(0.65, 9.5, "Wild Bulls", color="red")
ax.text(0.8, 2, "Crickets", color="red")
Text(0.8, 2, 'Crickets')
../../_images/W102_clustering_12_1.png

Dendogram#

model = pipeline['agglomerativeclustering']
model
AgglomerativeClustering(n_clusters=3)

HR Dataset#

df2 = pd.read_csv("DATA_2.02_HR.csv"); df2.head()
S LPE NP ANH TIC Newborn
0 0.38 0.53 2 157 3 0
1 0.80 0.86 5 262 6 0
2 0.11 0.88 7 272 4 0
3 0.72 0.87 5 223 5 0
4 0.37 0.52 2 159 3 0
fig, ax = plt.subplots(1,figsize=(12,6))
df2.plot.scatter(y="NP",y="LPE", xlabel="Number of Projects Done", ylabel="Last Project Evaluation", ax=ax)
<AxesSubplot:xlabel='Number of Projects Done', ylabel='Last Project Evaluation'>
../../_images/W102_clustering_17_1.png
pipeline = make_pipeline(StandardScaler(), AgglomerativeClustering(n_clusters=2, linkage='ward')); print(pipeline)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('agglomerativeclustering', AgglomerativeClustering())])
df2["group"]=pipeline.fit_predict(df2[["S", "LPE", "NP"]])
df2["group"] =df2["group"].astype("category")
df2.groupby("group").median()
S LPE NP ANH TIC Newborn
group
0 0.64 0.90 5.0 259.0 5.0 0.0
1 0.41 0.52 2.0 146.0 3.0 0.0

Telecom#

df3 = pd.read_csv("DATA_2.03_Telco.csv"); df3.head()
Calls Intern Text Data Age
0 1.12 0.19 23.92 0.18 60
1 1.08 0.22 17.76 0.23 54
2 3.54 0.26 289.79 1.99 34
3 1.09 0.21 19.15 0.21 61
4 1.04 0.24 20.33 0.20 56