Contents

Statistical Segmentation

Contents

Statistical Segmentation#

Topics Covered

SQL
RFM Transformation
Sampling
Hierarichal Clustering and Dendogram

Imports#

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer

sns.set()

Read Dataset#

df = pd.read_csv("W106_purchases.txt", sep='\t', 
                 names=['consumer_id', 'purchase_amount', 'date_of_purchase'], 
                 parse_dates=['date_of_purchase']); df.head()

	consumer_id	purchase_amount	date_of_purchase
0	760	25.0	2009-11-06
1	860	50.0	2012-09-28
2	1200	100.0	2005-10-25
3	1420	50.0	2009-07-09
4	1940	70.0	2013-01-25

df['year_of_purchase'] = df['date_of_purchase'].dt.year

df.describe(include='all')

/tmp/ipykernel_22169/2884002236.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.
  df.describe(include='all')

	consumer_id	purchase_amount	date_of_purchase	year_of_purchase
count	51243.000000	51243.000000	51243	51243.000000
unique	NaN	NaN	1879	NaN
top	NaN	NaN	2013-12-31 00:00:00	NaN
freq	NaN	NaN	864	NaN
first	NaN	NaN	2005-01-02 00:00:00	NaN
last	NaN	NaN	2015-12-31 00:00:00	NaN
mean	108934.547938	62.337195	NaN	2010.869699
std	67650.610139	156.606801	NaN	2.883072
min	10.000000	5.000000	NaN	2005.000000
25%	57720.000000	25.000000	NaN	2009.000000
50%	102440.000000	30.000000	NaN	2011.000000
75%	160525.000000	60.000000	NaN	2013.000000
max	264200.000000	4500.000000	NaN	2015.000000

max_date=df['date_of_purchase'].max()+dt.timedelta(days=1)

df['days_since_purchase']=(max_date - df['date_of_purchase']).dt.days

sns.scatterplot(data=df, x='date_of_purchase', y='purchase_amount')

<AxesSubplot:xlabel='date_of_purchase', ylabel='purchase_amount'>

../../_images/01_statistical_segmentation_11_1.png

df.groupby('days_since_purchase')['purchase_amount'].sum().plot.line(figsize=(24,6)).invert_xaxis()

../../_images/01_statistical_segmentation_12_0.png

consumer_df = df.groupby('consumer_id').agg({'purchase_amount':'mean', 'year_of_purchase':'last', 'days_since_purchase':'last', 'date_of_purchase':'count'})\
.rename(columns={'purchase_amount':'monetary', 
                 'year_of_purchase':'last_purchase_year',
                 'days_since_purchase':'recency',
                 'date_of_purchase': 'frequency'})
consumer_df.head()

	monetary	last_purchase_year	recency	frequency
consumer_id
10	30.000000	2005	3829	1
80	71.428571	2009	2457	7
90	115.800000	2012	1096	10
120	20.000000	2012	1401	1
130	50.000000	2005	3710	2

sns.scatterplot(data=consumer_df,y='monetary', x='frequency')

<AxesSubplot:xlabel='frequency', ylabel='monetary'>

../../_images/01_statistical_segmentation_14_1.png

sns.scatterplot(data=consumer_df,y='monetary', x='recency').invert_xaxis()

../../_images/01_statistical_segmentation_15_0.png

sns.scatterplot(data=consumer_df,y='frequency', x='recency').invert_xaxis()

../../_images/01_statistical_segmentation_16_0.png

consumer_df.hist()
plt.tight_layout()

../../_images/01_statistical_segmentation_17_0.png

np.log(consumer_df['monetary']).hist(bins=100)

<AxesSubplot:>

../../_images/01_statistical_segmentation_18_1.png

np.log(consumer_df['frequency']).hist()

<AxesSubplot:>

../../_images/01_statistical_segmentation_19_1.png

# np.log(consumer_df['recency']).hist()

consumer_df.sample(frac=0.1).hist()
plt.tight_layout()

../../_images/01_statistical_segmentation_21_0.png

sample = consumer_df.sample(frac=0.1)

sns.boxplot(data=consumer_df, x='last_purchase_year', y='frequency').set(ylim=(0,10))

[(0.0, 10.0)]

../../_images/01_statistical_segmentation_23_1.png

sns.boxplot(data=consumer_df, x='last_purchase_year', y='monetary').set(ylim=(0,100))

[(0.0, 100.0)]

../../_images/01_statistical_segmentation_24_1.png

Segmentation with Hierarchal Clustering#

def process_df(df):
    df_cp = df.copy()
    df_cp['frequency'] = np.log1p(df_cp['frequency'])
    df_cp['monetary'] = np.log1p(df_cp['monetary'])
    return df_cp

process_df(sample).hist()
plt.tight_layout()

../../_images/01_statistical_segmentation_27_0.png

def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

pipeline = make_pipeline(FunctionTransformer(process_df), StandardScaler(),  AgglomerativeClustering(distance_threshold=0, n_clusters=None))
pipeline

Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function process_df at 0x116c37259e50>)),
                ('standardscaler', StandardScaler()),
                ('agglomerativeclustering',
                 AgglomerativeClustering(distance_threshold=0,
                                         n_clusters=None))])

pipeline.fit(sample[['recency', 'frequency', 'monetary']])
model = pipeline['agglomerativeclustering']
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode="level", p=9)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

../../_images/01_statistical_segmentation_30_0.png

pipeline2 = make_pipeline(FunctionTransformer(process_df), StandardScaler(),  AgglomerativeClustering(n_clusters=5))
pipeline2.fit(sample[['recency', 'frequency', 'monetary']])
pipeline2

Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function process_df at 0x116c37259e50>)),
                ('standardscaler', StandardScaler()),
                ('agglomerativeclustering',
                 AgglomerativeClustering(n_clusters=5))])

sample['cluster'] = pipeline2.fit_predict(sample[['recency', 'frequency', 'monetary']])+1

sample

	monetary	last_purchase_year	recency	frequency	cluster
consumer_id
134880	10.0	2009	2192	1	1
214990	30.0	2013	731	1	4
77150	30.0	2007	3102	1	1
88000	40.0	2007	2961	1	1
163970	29.0	2014	437	5	4
...	...	...	...	...	...
181050	10.0	2012	1188	1	4
144260	20.0	2010	1990	1	1
155540	20.0	2011	1688	1	4
61350	103.0	2011	1476	10	2
191070	12.5	2012	1097	2	4

1842 rows × 5 columns

sns.scatterplot(data=sample,y='frequency', x='recency', hue='cluster').invert_xaxis()

../../_images/01_statistical_segmentation_34_0.png

sns.scatterplot(data=sample,y='monetary', x='recency', hue='cluster').invert_xaxis()

../../_images/01_statistical_segmentation_35_0.png

ax = sns.scatterplot(data=sample,x='monetary', y='recency', hue='cluster')
ax.set_xlim(0,200)
ax.invert_yaxis()

../../_images/01_statistical_segmentation_36_0.png

ax = sns.scatterplot(data=sample,x='monetary', y='frequency', hue='cluster')
ax.set_xlim(0,200)
# ax.invert_yaxis()

(0.0, 200.0)

../../_images/01_statistical_segmentation_37_1.png

sample.reset_index().groupby('cluster').agg({'consumer_id':'count', 
                                             'last_purchase_year':'max', 
                                             'recency':['min','mean', 'max'], 
                                             'frequency':['min','mean', 'max'], 
                                             'monetary':['mean','sum']})

	consumer_id	last_purchase_year	recency			frequency			monetary
	count	max	min	mean	max	min	mean	max	mean	sum
cluster
1	722	2013	1055	2593.411357	4012	1	1.674515	7	29.465882	21274.366667
2	296	2015	1	1873.834459	4002	3	7.841216	45	56.487871	16720.409733
3	316	2015	1	843.329114	3908	1	1.746835	10	85.591320	27046.857143
4	486	2015	1	689.253086	1840	1	1.668724	5	24.777778	12042.000000
5	22	2015	1	1616.772727	3730	1	2.454545	11	934.168545	20551.707987

sample.reset_index().groupby(['cluster', 'last_purchase_year']).agg({'consumer_id':'count',  
                                             'recency':['min','mean', 'max'], 
                                             'frequency':['min','mean', 'max'], 
                                             'monetary':['mean','sum']})

		consumer_id	recency			frequency			monetary
		count	min	mean	max	min	mean	max	mean	sum
cluster	last_purchase_year
1	2005	27	3691	3848.851852	4012	1	1.444444	3	26.666667	720.000000
	2006	57	3288	3325.403509	3577	1	1.596491	4	30.584795	1743.333333
	2007	182	2923	3095.049451	3272	1	1.538462	6	27.870421	5072.416667
	2008	128	2557	2687.835938	2907	1	1.585938	5	31.358073	4013.833333
	2009	126	2192	2340.039683	2550	1	1.603175	5	27.110053	3415.866667
	2010	142	1827	1999.823944	2179	1	1.704225	7	31.716549	4503.750000
	2011	42	1462	1655.428571	1815	1	2.476190	5	30.182540	1267.666667
	2012	17	1136	1307.705882	1407	2	2.647059	3	30.049020	510.833333
	2013	1	1055	1055.000000	1055	3	3.000000	3	26.666667	26.666667
2	2005	13	3655	3824.461538	4002	4	6.846154	10	51.167430	665.176587
	2006	13	3288	3378.384615	3585	4	6.615385	13	73.217878	951.832418
	2007	41	2923	3068.658537	3268	3	7.634146	18	63.403116	2599.527736
	2008	28	2557	2710.214286	2913	3	7.571429	28	72.479592	2029.428571
	2009	32	2192	2317.156250	2550	4	8.031250	45	72.810491	2329.935714
	2010	29	1827	1924.034483	2121	4	7.379310	14	63.486777	1841.116522
	2011	36	1462	1541.694444	1815	4	8.416667	19	48.145732	1733.246362
	2012	25	1096	1289.440000	1437	4	7.480000	14	43.983333	1099.583333
	2013	34	731	836.676471	1077	5	7.823529	16	41.220025	1401.480862
	2014	21	366	509.095238	702	5	9.761905	16	47.271538	992.702298
	2015	24	1	104.000000	339	5	7.875000	11	44.849139	1076.379329
3	2005	1	3908	3908.000000	3908	2	2.000000	2	100.000000	100.000000
	2006	6	3288	3355.833333	3478	1	1.333333	3	116.666667	700.000000
	2007	7	2933	3067.142857	3206	1	1.428571	2	121.428571	850.000000
	2008	3	2631	2632.000000	2633	1	1.000000	1	116.666667	350.000000
	2009	6	2213	2359.500000	2452	1	1.166667	2	130.833333	785.000000
	2010	7	1827	1931.714286	2086	1	2.285714	4	109.761905	768.333333
	2011	23	1462	1597.130435	1820	1	1.782609	5	99.362319	2285.333333
	2012	45	1096	1211.911111	1435	1	1.555556	6	79.472222	3576.250000
	2013	66	731	848.803030	1071	1	1.863636	7	80.297619	5299.642857
	2014	58	366	497.224138	689	1	2.362069	10	76.004310	4408.250000
	2015	94	1	98.659574	349	1	1.436170	7	84.298379	7924.047619
4	2010	1	1840	1840.000000	1840	1	1.000000	1	20.000000	20.000000
	2011	41	1462	1550.609756	1794	1	1.048780	3	22.804878	935.000000
	2012	96	1096	1193.739583	1401	1	1.510417	5	22.796007	2188.416667
	2013	105	731	869.619048	1080	1	1.723810	5	26.074603	2737.833333
	2014	100	366	486.470000	686	1	1.920000	5	25.011667	2501.166667
	2015	143	1	104.937063	359	1	1.741259	5	25.591492	3659.583333
5	2005	3	3664	3690.666667	3730	1	2.000000	4	952.500000	2857.500000
	2006	1	3363	3363.000000	3363	2	2.000000	2	340.000000	340.000000
	2008	4	2616	2712.250000	2834	1	1.250000	2	679.431250	2717.725000
	2010	1	1884	1884.000000	1884	1	1.000000	1	554.320000	554.320000
	2011	2	1462	1549.500000	1637	7	9.000000	11	1606.506494	3213.012987
	2012	1	1096	1096.000000	1096	1	1.000000	1	1080.000000	1080.000000
	2013	3	752	840.666667	1011	1	1.333333	2	1434.980000	4304.940000
	2014	3	367	513.333333	618	1	4.333333	10	800.000000	2400.000000
	2015	4	1	36.000000	97	1	1.000000	1	771.052500	3084.210000

sample.reset_index().groupby(['last_purchase_year', 'cluster']).agg({'consumer_id':'count', 
                                             'recency':['mean'], 
                                             'frequency':['mean',], 
                                             'monetary':['mean','sum']}).transform(lambda col : (col-col.min())/(col.max()-col.min()), axis=1).plot(figsize=(24,6))

<AxesSubplot:xlabel='last_purchase_year,cluster'>

../../_images/01_statistical_segmentation_40_1.png

norm_sample = sample.reset_index().groupby(['last_purchase_year', 'cluster']).agg({'consumer_id':'count', 
                                             # 'recency':['mean'], 
                                             'frequency':['mean',], 
                                             'monetary':['mean', 'sum']})
norm_sample = (norm_sample - norm_sample.min())/(norm_sample.max() - norm_sample.min())
norm_sample.columns = ["_".join(a) for a in norm_sample.columns.to_flat_index()]
norm_sample.reset_index()

	last_purchase_year	cluster	consumer_id_count	frequency_mean	monetary_mean	monetary_sum
0	2005	1	0.143646	0.050725	0.004202	0.088562
1	2005	2	0.066298	0.667224	0.019645	0.081626
2	2005	3	0.000000	0.114130	0.050425	0.010121
3	2005	5	0.011050	0.114130	0.587769	0.358993
4	2006	1	0.309392	0.068078	0.006672	0.218032
5	2006	2	0.066298	0.640886	0.033544	0.117893
6	2006	3	0.027624	0.038043	0.060931	0.086032
7	2006	5	0.000000	0.114130	0.201701	0.040486
8	2007	1	1.000000	0.061455	0.004961	0.639219
9	2007	2	0.220994	0.757158	0.027358	0.326355
10	2007	3	0.033149	0.048913	0.063932	0.105009
11	2008	1	0.701657	0.066873	0.007159	0.505290
12	2008	2	0.149171	0.750000	0.033079	0.254228
13	2008	3	0.011050	0.000000	0.060931	0.041751
14	2008	5	0.016575	0.028533	0.415650	0.341309
15	2009	1	0.690608	0.068841	0.004482	0.429636
16	2009	2	0.171271	0.802480	0.033287	0.292247
17	2009	3	0.027624	0.019022	0.069860	0.096786
18	2010	1	0.779006	0.080374	0.007385	0.567273
19	2010	2	0.154696	0.728073	0.027410	0.230403
20	2010	3	0.033149	0.146739	0.056578	0.094677
21	2010	4	0.000000	0.000000	0.000000	0.000000
22	2010	5	0.000000	0.000000	0.336790	0.067601
23	2011	1	0.226519	0.168478	0.006418	0.157852
24	2011	2	0.193370	0.846467	0.017741	0.216756
25	2011	3	0.121547	0.089319	0.050023	0.286604
26	2011	4	0.220994	0.005567	0.001768	0.115763
27	2011	5	0.005525	0.913043	1.000000	0.403972
28	2012	1	0.088398	0.187980	0.006334	0.062099
29	2012	2	0.132597	0.739565	0.015117	0.136586
30	2012	3	0.243094	0.063406	0.037486	0.449928
31	2012	4	0.524862	0.058254	0.001762	0.274343
32	2012	5	0.000000	0.000000	0.668135	0.134109
33	2013	1	0.000000	0.228261	0.004202	0.000843
34	2013	2	0.182320	0.778772	0.013375	0.174781
35	2013	3	0.359116	0.098567	0.038007	0.667967
36	2013	4	0.574586	0.082609	0.003829	0.343853
37	2013	5	0.011050	0.038043	0.891884	0.542120
38	2014	2	0.110497	1.000000	0.017190	0.123064
39	2014	3	0.314917	0.155454	0.035300	0.555190
40	2014	4	0.546961	0.105000	0.003159	0.313911
41	2014	5	0.011050	0.380435	0.491646	0.301112
42	2015	2	0.127072	0.784647	0.015663	0.133650
43	2015	3	0.513812	0.049780	0.040528	1.000000
44	2015	4	0.784530	0.084600	0.003524	0.460471
45	2015	5	0.016575	0.000000	0.473400	0.387676

sns.catplot(x='last_purchase_year',  col='cluster', col_wrap=5,  kind='count',ci='sd', data=sample.reset_index())

<seaborn.axisgrid.FacetGrid at 0x116c28814580>

../../_images/01_statistical_segmentation_42_1.png

sns.catplot(x='last_purchase_year', y='monetary', col='cluster',col_wrap=5, kind='box',ci='sd', data=sample).set( ylim=(0,2000))

<seaborn.axisgrid.FacetGrid at 0x116c26b42310>

../../_images/01_statistical_segmentation_43_1.png

sns.catplot(x='last_purchase_year', y='monetary', col='cluster',col_wrap=5, kind='box',ci='sd', data=sample).set( ylim=(0,200))

<seaborn.axisgrid.FacetGrid at 0x116c26bdc5e0>

../../_images/01_statistical_segmentation_44_1.png

sns.catplot(x='last_purchase_year', y='frequency', col='cluster',col_wrap=5, kind='box',ci='sd', data=sample).set( ylim=(0,20))

<seaborn.axisgrid.FacetGrid at 0x116c28062a60>

../../_images/01_statistical_segmentation_45_1.png

sns.catplot(x='last_purchase_year', y='recency', col='cluster',col_wrap=5, kind='box',ci='sd', data=sample)

<seaborn.axisgrid.FacetGrid at 0x116c289c0280>

../../_images/01_statistical_segmentation_46_1.png

sample_cp = sample.copy()
sample_cp['recency'] = sample['recency']-sample.groupby('last_purchase_year')['recency'].transform('min')
sns.catplot(x='last_purchase_year', y='recency', col='cluster',col_wrap=5, kind='box',ci='sd', data=sample_cp)

<seaborn.axisgrid.FacetGrid at 0x116c283741c0>

../../_images/01_statistical_segmentation_47_1.png

sample

	monetary	last_purchase_year	recency	frequency	cluster
consumer_id
134880	10.0	2009	2192	1	1
214990	30.0	2013	731	1	4
77150	30.0	2007	3102	1	1
88000	40.0	2007	2961	1	1
163970	29.0	2014	437	5	4
...	...	...	...	...	...
181050	10.0	2012	1188	1	4
144260	20.0	2010	1990	1	1
155540	20.0	2011	1688	1	4
61350	103.0	2011	1476	10	2
191070	12.5	2012	1097	2	4

1842 rows × 5 columns