Statistical Segmentation#

Topics Covered

  • SQL

  • RFM Transformation

  • Sampling

  • Hierarchical Clustering and Dendrogram

Imports#

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer
sns.set()

Read Dataset#

df = pd.read_csv("W106_purchases.txt", sep='\t', 
                 names=['consumer_id', 'purchase_amount', 'date_of_purchase'], 
                 parse_dates=['date_of_purchase']); df.head()
consumer_id purchase_amount date_of_purchase
0 760 25.0 2009-11-06
1 860 50.0 2012-09-28
2 1200 100.0 2005-10-25
3 1420 50.0 2009-07-09
4 1940 70.0 2013-01-25
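Each row is a single transaction: an anonymized consumer id, the amount paid, and the purchase date. The summary below shows 51,243 transactions spanning 2005-01-02 through 2015-12-31.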
df['year_of_purchase'] = df['date_of_purchase'].dt.year
df.describe(include='all')
/tmp/ipykernel_22169/2884002236.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.
  df.describe(include='all')
consumer_id purchase_amount date_of_purchase year_of_purchase
count 51243.000000 51243.000000 51243 51243.000000
unique NaN NaN 1879 NaN
top NaN NaN 2013-12-31 00:00:00 NaN
freq NaN NaN 864 NaN
first NaN NaN 2005-01-02 00:00:00 NaN
last NaN NaN 2015-12-31 00:00:00 NaN
mean 108934.547938 62.337195 NaN 2010.869699
std 67650.610139 156.606801 NaN 2.883072
min 10.000000 5.000000 NaN 2005.000000
25% 57720.000000 25.000000 NaN 2009.000000
50% 102440.000000 30.000000 NaN 2011.000000
75% 160525.000000 60.000000 NaN 2013.000000
max 264200.000000 4500.000000 NaN 2015.000000
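The deprecation warning above can be silenced by opting into the future behavior (pandas ≥ 1.1): with datetime_is_numeric=True, the datetime column is summarized numerically (count, mean, min, quartiles, max) instead of categorically (unique, top, freq).

df.describe(include='all', datetime_is_numeric=True)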
# anchor recency one day after the most recent purchase in the data
# (2016-01-01), so the newest transaction has days_since_purchase = 1
max_date = df['date_of_purchase'].max() + dt.timedelta(days=1)
df['days_since_purchase'] = (max_date - df['date_of_purchase']).dt.days
sns.scatterplot(data=df, x='date_of_purchase', y='purchase_amount')
<AxesSubplot:xlabel='date_of_purchase', ylabel='purchase_amount'>
../../_images/01_statistical_segmentation_11_1.png
df.groupby('days_since_purchase')['purchase_amount'].sum().plot.line(figsize=(24,6)).invert_xaxis()
../../_images/01_statistical_segmentation_12_0.png
# 'last' takes each consumer's final row, which assumes rows are ordered by
# purchase date within a consumer; see the order-independent variant below
consumer_df = df.groupby('consumer_id').agg({'purchase_amount':'mean', 'year_of_purchase':'last', 'days_since_purchase':'last', 'date_of_purchase':'count'})\
.rename(columns={'purchase_amount':'monetary', 
                 'year_of_purchase':'last_purchase_year',
                 'days_since_purchase':'recency',
                 'date_of_purchase': 'frequency'})
consumer_df.head()
monetary last_purchase_year recency frequency
consumer_id
10 30.000000 2005 3829 1
80 71.428571 2009 2457 7
90 115.800000 2012 1096 10
120 20.000000 2012 1401 1
130 50.000000 2005 3710 2
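For reference, an order-independent way to build the same RFM table uses named aggregation, taking the minimum days_since_purchase for recency and the maximum year for last_purchase_year instead of relying on row order. A sketch (written to consumer_df_alt here only to leave the original untouched):

consumer_df_alt = df.groupby('consumer_id').agg(
    monetary=('purchase_amount', 'mean'),           # average spend per transaction
    last_purchase_year=('year_of_purchase', 'max'), # calendar year of latest purchase
    recency=('days_since_purchase', 'min'),         # days since the most recent purchase
    frequency=('purchase_amount', 'count'),         # number of transactions
)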
sns.scatterplot(data=consumer_df,y='monetary', x='frequency')
<AxesSubplot:xlabel='frequency', ylabel='monetary'>
../../_images/01_statistical_segmentation_14_1.png
sns.scatterplot(data=consumer_df,y='monetary', x='recency').invert_xaxis()
../../_images/01_statistical_segmentation_15_0.png
sns.scatterplot(data=consumer_df,y='frequency', x='recency').invert_xaxis()
../../_images/01_statistical_segmentation_16_0.png
consumer_df.hist()
plt.tight_layout()
../../_images/01_statistical_segmentation_17_0.png
np.log(consumer_df['monetary']).hist(bins=100)
<AxesSubplot:>
../../_images/01_statistical_segmentation_18_1.png
np.log(consumer_df['frequency']).hist()
<AxesSubplot:>
../../_images/01_statistical_segmentation_19_1.png
# np.log(consumer_df['recency']).hist()
consumer_df.sample(frac=0.1).hist()
plt.tight_layout()
../../_images/01_statistical_segmentation_21_0.png
sample = consumer_df.sample(frac=0.1)
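Agglomerative clustering works from pairwise distances, so time and memory grow roughly quadratically with the number of consumers; a 10% sample (1,842 of roughly 18,400 consumers) keeps the linkage computation cheap while preserving the shape of the distributions. For a reproducible draw, the seed can be fixed (random_state=42 is an arbitrary illustrative choice; the draw above was unseeded):

sample = consumer_df.sample(frac=0.1, random_state=42)  # fixed seed for reproducibility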
sns.boxplot(data=consumer_df, x='last_purchase_year', y='frequency').set(ylim=(0,10))
[(0.0, 10.0)]
../../_images/01_statistical_segmentation_23_1.png
sns.boxplot(data=consumer_df, x='last_purchase_year', y='monetary').set(ylim=(0,100))
[(0.0, 100.0)]
../../_images/01_statistical_segmentation_24_1.png

Segmentation with Hierarchical Clustering#

def process_df(df):
    # log1p compresses the heavy right tails of frequency and monetary seen
    # in the histograms above; recency is left on its original scale
    df_cp = df.copy()
    df_cp['frequency'] = np.log1p(df_cp['frequency'])
    df_cp['monetary'] = np.log1p(df_cp['monetary'])
    return df_cp
process_df(sample).hist()
plt.tight_layout()
../../_images/01_statistical_segmentation_27_0.png
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
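This helper mirrors the scikit-learn documentation example for plotting a dendrogram from a fitted AgglomerativeClustering model: it rebuilds a SciPy-style linkage matrix from children_, distances_, and the per-node leaf counts. Note that distances_ is only populated when the model is fitted with a distance_threshold (or compute_distances=True), which is why the pipeline below passes distance_threshold=0 with n_clusters=None to grow the full tree.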
pipeline = make_pipeline(FunctionTransformer(process_df), StandardScaler(),  AgglomerativeClustering(distance_threshold=0, n_clusters=None))
pipeline
Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function process_df at 0x116c37259e50>)),
                ('standardscaler', StandardScaler()),
                ('agglomerativeclustering',
                 AgglomerativeClustering(distance_threshold=0,
                                         n_clusters=None))])
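Wrapping process_df in a FunctionTransformer turns the log transform into a pipeline step, so the exact same preprocessing (log1p followed by standardization) is applied both here and in the five-cluster fit further down.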
pipeline.fit(sample[['recency', 'frequency', 'monetary']])
model = pipeline['agglomerativeclustering']
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode="level", p=9)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
../../_images/01_statistical_segmentation_30_0.png
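For comparison, the same tree can be grown directly with SciPy, skipping the helper above. A sketch assuming the preprocessing from the pipeline (X, Z, and labels are illustrative names; method='ward' matches AgglomerativeClustering's default linkage):

from scipy.cluster.hierarchy import fcluster

X = StandardScaler().fit_transform(process_df(sample[['recency', 'frequency', 'monetary']]))
Z = linkage(X, method='ward')                    # SciPy linkage matrix, Ward criterion
dendrogram(Z, truncate_mode='level', p=9)        # same truncated dendrogram as above
labels = fcluster(Z, t=5, criterion='maxclust')  # cut the tree into 5 flat clusters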
pipeline2 = make_pipeline(FunctionTransformer(process_df), StandardScaler(),  AgglomerativeClustering(n_clusters=5))
pipeline2.fit(sample[['recency', 'frequency', 'monetary']])
pipeline2
Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function process_df at 0x116c37259e50>)),
                ('standardscaler', StandardScaler()),
                ('agglomerativeclustering',
                 AgglomerativeClustering(n_clusters=5))])
sample['cluster'] = pipeline2.fit_predict(sample[['recency', 'frequency', 'monetary']])+1
sample
monetary last_purchase_year recency frequency cluster
consumer_id
134880 10.0 2009 2192 1 1
214990 30.0 2013 731 1 4
77150 30.0 2007 3102 1 1
88000 40.0 2007 2961 1 1
163970 29.0 2014 437 5 4
... ... ... ... ... ...
181050 10.0 2012 1188 1 4
144260 20.0 2010 1990 1 1
155540 20.0 2011 1688 1 4
61350 103.0 2011 1476 10 2
191070 12.5 2012 1097 2 4

1842 rows × 5 columns

sns.scatterplot(data=sample,y='frequency', x='recency', hue='cluster').invert_xaxis()
../../_images/01_statistical_segmentation_34_0.png
sns.scatterplot(data=sample,y='monetary', x='recency', hue='cluster').invert_xaxis()
../../_images/01_statistical_segmentation_35_0.png
ax = sns.scatterplot(data=sample,x='monetary', y='recency', hue='cluster')
ax.set_xlim(0,200)
ax.invert_yaxis()
../../_images/01_statistical_segmentation_36_0.png
ax = sns.scatterplot(data=sample,x='monetary', y='frequency', hue='cluster')
ax.set_xlim(0,200)
# ax.invert_yaxis()
(0.0, 200.0)
../../_images/01_statistical_segmentation_37_1.png
sample.reset_index().groupby('cluster').agg({'consumer_id':'count', 
                                             'last_purchase_year':'max', 
                                             'recency':['min','mean', 'max'], 
                                             'frequency':['min','mean', 'max'], 
                                             'monetary':['mean','sum']})
consumer_id last_purchase_year recency frequency monetary
count max min mean max min mean max mean sum
cluster
1 722 2013 1055 2593.411357 4012 1 1.674515 7 29.465882 21274.366667
2 296 2015 1 1873.834459 4002 3 7.841216 45 56.487871 16720.409733
3 316 2015 1 843.329114 3908 1 1.746835 10 85.591320 27046.857143
4 486 2015 1 689.253086 1840 1 1.668724 5 24.777778 12042.000000
5 22 2015 1 1616.772727 3730 1 2.454545 11 934.168545 20551.707987
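For this draw of the sample, the profile table already suggests names for the segments: cluster 5 is a tiny group (22 consumers) of big spenders (mean monetary ≈ 934), cluster 2 holds the frequent buyers (mean frequency ≈ 7.8), cluster 1 is the inactive segment (no purchase since 2013, mean recency ≈ 2,593 days), and clusters 3 and 4 are recent buyers split mainly by spend (≈ 86 vs ≈ 25 per purchase).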
sample.reset_index().groupby(['cluster', 'last_purchase_year']).agg({'consumer_id':'count',  
                                             'recency':['min','mean', 'max'], 
                                             'frequency':['min','mean', 'max'], 
                                             'monetary':['mean','sum']})
consumer_id recency frequency monetary
count min mean max min mean max mean sum
cluster last_purchase_year
1 2005 27 3691 3848.851852 4012 1 1.444444 3 26.666667 720.000000
2006 57 3288 3325.403509 3577 1 1.596491 4 30.584795 1743.333333
2007 182 2923 3095.049451 3272 1 1.538462 6 27.870421 5072.416667
2008 128 2557 2687.835938 2907 1 1.585938 5 31.358073 4013.833333
2009 126 2192 2340.039683 2550 1 1.603175 5 27.110053 3415.866667
2010 142 1827 1999.823944 2179 1 1.704225 7 31.716549 4503.750000
2011 42 1462 1655.428571 1815 1 2.476190 5 30.182540 1267.666667
2012 17 1136 1307.705882 1407 2 2.647059 3 30.049020 510.833333
2013 1 1055 1055.000000 1055 3 3.000000 3 26.666667 26.666667
2 2005 13 3655 3824.461538 4002 4 6.846154 10 51.167430 665.176587
2006 13 3288 3378.384615 3585 4 6.615385 13 73.217878 951.832418
2007 41 2923 3068.658537 3268 3 7.634146 18 63.403116 2599.527736
2008 28 2557 2710.214286 2913 3 7.571429 28 72.479592 2029.428571
2009 32 2192 2317.156250 2550 4 8.031250 45 72.810491 2329.935714
2010 29 1827 1924.034483 2121 4 7.379310 14 63.486777 1841.116522
2011 36 1462 1541.694444 1815 4 8.416667 19 48.145732 1733.246362
2012 25 1096 1289.440000 1437 4 7.480000 14 43.983333 1099.583333
2013 34 731 836.676471 1077 5 7.823529 16 41.220025 1401.480862
2014 21 366 509.095238 702 5 9.761905 16 47.271538 992.702298
2015 24 1 104.000000 339 5 7.875000 11 44.849139 1076.379329
3 2005 1 3908 3908.000000 3908 2 2.000000 2 100.000000 100.000000
2006 6 3288 3355.833333 3478 1 1.333333 3 116.666667 700.000000
2007 7 2933 3067.142857 3206 1 1.428571 2 121.428571 850.000000
2008 3 2631 2632.000000 2633 1 1.000000 1 116.666667 350.000000
2009 6 2213 2359.500000 2452 1 1.166667 2 130.833333 785.000000
2010 7 1827 1931.714286 2086 1 2.285714 4 109.761905 768.333333
2011 23 1462 1597.130435 1820 1 1.782609 5 99.362319 2285.333333
2012 45 1096 1211.911111 1435 1 1.555556 6 79.472222 3576.250000
2013 66 731 848.803030 1071 1 1.863636 7 80.297619 5299.642857
2014 58 366 497.224138 689 1 2.362069 10 76.004310 4408.250000
2015 94 1 98.659574 349 1 1.436170 7 84.298379 7924.047619
4 2010 1 1840 1840.000000 1840 1 1.000000 1 20.000000 20.000000
2011 41 1462 1550.609756 1794 1 1.048780 3 22.804878 935.000000
2012 96 1096 1193.739583 1401 1 1.510417 5 22.796007 2188.416667
2013 105 731 869.619048 1080 1 1.723810 5 26.074603 2737.833333
2014 100 366 486.470000 686 1 1.920000 5 25.011667 2501.166667
2015 143 1 104.937063 359 1 1.741259 5 25.591492 3659.583333
5 2005 3 3664 3690.666667 3730 1 2.000000 4 952.500000 2857.500000
2006 1 3363 3363.000000 3363 2 2.000000 2 340.000000 340.000000
2008 4 2616 2712.250000 2834 1 1.250000 2 679.431250 2717.725000
2010 1 1884 1884.000000 1884 1 1.000000 1 554.320000 554.320000
2011 2 1462 1549.500000 1637 7 9.000000 11 1606.506494 3213.012987
2012 1 1096 1096.000000 1096 1 1.000000 1 1080.000000 1080.000000
2013 3 752 840.666667 1011 1 1.333333 2 1434.980000 4304.940000
2014 3 367 513.333333 618 1 4.333333 10 800.000000 2400.000000
2015 4 1 36.000000 97 1 1.000000 1 771.052500 3084.210000
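Crossing cluster with last_purchase_year confirms how strongly recency drives the split: in this sample, cluster 1 contains no one who bought after 2013, cluster 4 no one who bought before 2010, while clusters 2, 3, and 5 span the whole decade and differ instead in frequency and spend.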
# min-max normalize each column before plotting (transform's default axis=0
# applies the lambda per column)
sample.reset_index().groupby(['last_purchase_year', 'cluster']).agg({'consumer_id':'count', 
                                             'recency':['mean'], 
                                             'frequency':['mean',], 
                                             'monetary':['mean','sum']}).transform(lambda col : (col-col.min())/(col.max()-col.min())).plot(figsize=(24,6))
<AxesSubplot:xlabel='last_purchase_year,cluster'>
../../_images/01_statistical_segmentation_40_1.png
norm_sample = sample.reset_index().groupby(['last_purchase_year', 'cluster']).agg({'consumer_id':'count', 
                                             # 'recency':['mean'], 
                                             'frequency':['mean',], 
                                             'monetary':['mean', 'sum']})
norm_sample = (norm_sample - norm_sample.min())/(norm_sample.max() - norm_sample.min())
norm_sample.columns = ["_".join(a) for a in norm_sample.columns.to_flat_index()]
norm_sample.reset_index()
last_purchase_year cluster consumer_id_count frequency_mean monetary_mean monetary_sum
0 2005 1 0.143646 0.050725 0.004202 0.088562
1 2005 2 0.066298 0.667224 0.019645 0.081626
2 2005 3 0.000000 0.114130 0.050425 0.010121
3 2005 5 0.011050 0.114130 0.587769 0.358993
4 2006 1 0.309392 0.068078 0.006672 0.218032
5 2006 2 0.066298 0.640886 0.033544 0.117893
6 2006 3 0.027624 0.038043 0.060931 0.086032
7 2006 5 0.000000 0.114130 0.201701 0.040486
8 2007 1 1.000000 0.061455 0.004961 0.639219
9 2007 2 0.220994 0.757158 0.027358 0.326355
10 2007 3 0.033149 0.048913 0.063932 0.105009
11 2008 1 0.701657 0.066873 0.007159 0.505290
12 2008 2 0.149171 0.750000 0.033079 0.254228
13 2008 3 0.011050 0.000000 0.060931 0.041751
14 2008 5 0.016575 0.028533 0.415650 0.341309
15 2009 1 0.690608 0.068841 0.004482 0.429636
16 2009 2 0.171271 0.802480 0.033287 0.292247
17 2009 3 0.027624 0.019022 0.069860 0.096786
18 2010 1 0.779006 0.080374 0.007385 0.567273
19 2010 2 0.154696 0.728073 0.027410 0.230403
20 2010 3 0.033149 0.146739 0.056578 0.094677
21 2010 4 0.000000 0.000000 0.000000 0.000000
22 2010 5 0.000000 0.000000 0.336790 0.067601
23 2011 1 0.226519 0.168478 0.006418 0.157852
24 2011 2 0.193370 0.846467 0.017741 0.216756
25 2011 3 0.121547 0.089319 0.050023 0.286604
26 2011 4 0.220994 0.005567 0.001768 0.115763
27 2011 5 0.005525 0.913043 1.000000 0.403972
28 2012 1 0.088398 0.187980 0.006334 0.062099
29 2012 2 0.132597 0.739565 0.015117 0.136586
30 2012 3 0.243094 0.063406 0.037486 0.449928
31 2012 4 0.524862 0.058254 0.001762 0.274343
32 2012 5 0.000000 0.000000 0.668135 0.134109
33 2013 1 0.000000 0.228261 0.004202 0.000843
34 2013 2 0.182320 0.778772 0.013375 0.174781
35 2013 3 0.359116 0.098567 0.038007 0.667967
36 2013 4 0.574586 0.082609 0.003829 0.343853
37 2013 5 0.011050 0.038043 0.891884 0.542120
38 2014 2 0.110497 1.000000 0.017190 0.123064
39 2014 3 0.314917 0.155454 0.035300 0.555190
40 2014 4 0.546961 0.105000 0.003159 0.313911
41 2014 5 0.011050 0.380435 0.491646 0.301112
42 2015 2 0.127072 0.784647 0.015663 0.133650
43 2015 3 0.513812 0.049780 0.040528 1.000000
44 2015 4 0.784530 0.084600 0.003524 0.460471
45 2015 5 0.016575 0.000000 0.473400 0.387676
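The explicit per-column min-max scaling above could also be delegated to scikit-learn. A sketch (wrapping the result back into a DataFrame keeps the index and column labels):

from sklearn.preprocessing import MinMaxScaler

norm_sample = pd.DataFrame(MinMaxScaler().fit_transform(norm_sample),
                           index=norm_sample.index, columns=norm_sample.columns)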
sns.catplot(x='last_purchase_year',  col='cluster', col_wrap=5,  kind='count',ci='sd', data=sample.reset_index())
<seaborn.axisgrid.FacetGrid at 0x116c28814580>
../../_images/01_statistical_segmentation_42_1.png
sns.catplot(x='last_purchase_year', y='monetary', col='cluster',col_wrap=5, kind='box',ci='sd', data=sample).set( ylim=(0,2000))
<seaborn.axisgrid.FacetGrid at 0x116c26b42310>
../../_images/01_statistical_segmentation_43_1.png
sns.catplot(x='last_purchase_year', y='monetary', col='cluster',col_wrap=5, kind='box',ci='sd', data=sample).set( ylim=(0,200))
<seaborn.axisgrid.FacetGrid at 0x116c26bdc5e0>
../../_images/01_statistical_segmentation_44_1.png
sns.catplot(x='last_purchase_year', y='frequency', col='cluster',col_wrap=5, kind='box',ci='sd', data=sample).set( ylim=(0,20))
<seaborn.axisgrid.FacetGrid at 0x116c28062a60>
../../_images/01_statistical_segmentation_45_1.png
sns.catplot(x='last_purchase_year', y='recency', col='cluster',col_wrap=5, kind='box',ci='sd', data=sample)
<seaborn.axisgrid.FacetGrid at 0x116c289c0280>
../../_images/01_statistical_segmentation_46_1.png
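In the plot above, each year's recency distribution sits on its own floor: a consumer whose last purchase fell in year Y cannot have a recency smaller than the days elapsed since the end of Y. Subtracting each year's minimum recency puts the cohorts on a common origin, so their spreads can be compared directly: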
sample_cp = sample.copy()
sample_cp['recency'] = sample['recency']-sample.groupby('last_purchase_year')['recency'].transform('min')
sns.catplot(x='last_purchase_year', y='recency', col='cluster',col_wrap=5, kind='box',ci='sd', data=sample_cp)
<seaborn.axisgrid.FacetGrid at 0x116c283741c0>
../../_images/01_statistical_segmentation_47_1.png