Managerial Segmentation#

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer

Read Dataset#

df = pd.read_csv("W106_purchases.txt", sep='\t', names=['consumer_id','purchase_amount', 'date_of_purchase'], parse_dates=['date_of_purchase'] ); df.head()
consumer_id purchase_amount date_of_purchase
0 760 25.0 2009-11-06
1 860 50.0 2012-09-28
2 1200 100.0 2005-10-25
3 1420 50.0 2009-07-09
4 1940 70.0 2013-01-25
df.describe(include='all')
consumer_id purchase_amount date_of_purchase
count 51243.000000 51243.000000 51243
unique NaN NaN 1879
top NaN NaN 2013-12-31 00:00:00
freq NaN NaN 864
first NaN NaN 2005-01-02 00:00:00
last NaN NaN 2015-12-31 00:00:00
mean 108934.547938 62.337195 NaN
std 67650.610139 156.606801 NaN
min 10.000000 5.000000 NaN
25% 57720.000000 25.000000 NaN
50% 102440.000000 30.000000 NaN
75% 160525.000000 60.000000 NaN
max 264200.000000 4500.000000 NaN
df['year_of_purchase'] = df['date_of_purchase'].dt.year
max_date = df['date_of_purchase'].max() + dt.timedelta(days=1); max_date
Timestamp('2016-01-01 00:00:00')
df['days_since'] = (max_date - df['date_of_purchase']).dt.days
df.head()
consumer_id purchase_amount date_of_purchase year_of_purchase days_since
0 760 25.0 2009-11-06 2009 2247
1 860 50.0 2012-09-28 2012 1190
2 1200 100.0 2005-10-25 2005 3720
3 1420 50.0 2009-07-09 2009 2367
4 1940 70.0 2013-01-25 2013 1071
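Since the snapshot date is 2016-01-01 and 2015 is not a leap year, "purchased within the last 365 days" coincides with calendar year 2015. A quick consistency check (a sketch, not part of the original notebook):

# Both filters should select exactly the same purchases.
((df['days_since'] <= 365) == (df['year_of_purchase'] == 2015)).all()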

Segmentation#

consumer_df = df.groupby('consumer_id').agg(recency=('days_since', 'min'),
                                            frequency=('date_of_purchase', 'count'),
                                            monetary=('purchase_amount', 'mean'),
                                            first_purchase=('days_since', 'max'),
                                            first_purchase_date=('date_of_purchase', 'min'),
                                            first_purchase_year=('year_of_purchase', 'min'),
                                            last_purchase_date=('date_of_purchase', 'max'),
                                            last_purchase_year=('year_of_purchase', 'max'),
                                           ); consumer_df.head()
recency frequency monetary first_purchase first_purchase_date first_purchase_year last_purchase_date last_purchase_year
consumer_id
10 3829 1 30.000000 3829 2005-07-08 2005 2005-07-08 2005
80 343 7 71.428571 3751 2005-09-24 2005 2015-01-23 2015
90 758 10 115.800000 3783 2005-08-23 2005 2013-12-04 2013
120 1401 1 20.000000 1401 2012-03-01 2012 2012-03-01 2012
130 2970 2 50.000000 3710 2005-11-04 2005 2007-11-14 2007
consumer_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18417 entries, 10 to 264200
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   recency              18417 non-null  int64         
 1   frequency            18417 non-null  int64         
 2   monetary             18417 non-null  float64       
 3   first_purchase       18417 non-null  int64         
 4   first_purchase_date  18417 non-null  datetime64[ns]
 5   first_purchase_year  18417 non-null  int64         
 6   last_purchase_date   18417 non-null  datetime64[ns]
 7   last_purchase_year   18417 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(5)
memory usage: 1.3 MB
consumer_df.describe()
recency frequency monetary first_purchase first_purchase_year last_purchase_year
count 18417.000000 18417.000000 18417.000000 18417.000000 18417.000000 18417.000000
mean 1253.037900 2.782375 57.792985 1984.009882 2009.925178 2011.906825
std 1081.437868 2.936888 154.360109 1133.405441 3.068690 2.914005
min 1.000000 1.000000 5.000000 1.000000 2005.000000 2005.000000
25% 244.000000 1.000000 21.666667 988.000000 2007.000000 2010.000000
50% 1070.000000 2.000000 30.000000 2087.000000 2010.000000 2013.000000
75% 2130.000000 3.000000 50.000000 2992.000000 2013.000000 2015.000000
max 4014.000000 45.000000 4500.000000 4016.000000 2015.000000 2015.000000
consumer_df['segment'] = np.nan
# Rules run from generic to specific: within each recency band the "new ..." rules
# come after the plain rules, so they override them for customers whose first
# purchase also falls inside that band.
# consumer_df.loc[consumer_df['recency'] <=365  , 'segment'] = 'active'
consumer_df.loc[(consumer_df['recency'] <=365)  & (consumer_df['monetary'] >= 100), 'segment'] = 'active high'
consumer_df.loc[(consumer_df['recency'] <=365)  & (consumer_df['monetary'] < 100), 'segment'] = 'active low'
consumer_df.loc[(consumer_df['recency'] <=365) & (consumer_df['first_purchase'] <=365) & (consumer_df['monetary'] >= 100) , 'segment'] = 'new active high'
consumer_df.loc[(consumer_df['recency'] <=365) & (consumer_df['first_purchase'] <=365) & (consumer_df['monetary'] < 100) , 'segment'] = 'new active low'
consumer_df.loc[(consumer_df['recency']>365) & (consumer_df['recency']<=2*365) & (consumer_df['monetary'] >= 100), 'segment'] = 'warm high'
consumer_df.loc[(consumer_df['recency']>365) & (consumer_df['recency']<=2*365) & (consumer_df['monetary'] < 100), 'segment'] = 'warm low'
consumer_df.loc[(consumer_df['recency']>365) & (consumer_df['recency']<=2*365) & (consumer_df['first_purchase'] <=2*365) & (consumer_df['monetary'] >= 100) , 'segment'] = 'new warm high'
consumer_df.loc[(consumer_df['recency']>365) & (consumer_df['recency']<=2*365) & (consumer_df['first_purchase'] <=2*365) & (consumer_df['monetary'] < 100) , 'segment'] = 'new warm low'
consumer_df.loc[(consumer_df['recency']>2*365) & (consumer_df['recency']<=3*365), 'segment'] = 'cold'
consumer_df.loc[(consumer_df['recency']>3*365), 'segment'] = 'inactive'
# consumer_df.groupby('segment').count()
consumer_df['segment'].value_counts(dropna=False)
inactive           9158
active low         3313
cold               1903
new active low     1249
warm low            901
new warm low        822
active high         573
new active high     263
warm high           119
new warm high       116
Name: segment, dtype: int64
consumer_df['segment'].value_counts(dropna=False).sum()
18417
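The chain of .loc assignments above relies on later rules overriding earlier ones (the "new ..." labels overwrite "active"/"warm" for first-year customers). The same rules can be written order-independently with np.select, where the first matching condition wins. This is a cross-checking sketch, not part of the original notebook:

r, fp, m = consumer_df['recency'], consumer_df['first_purchase'], consumer_df['monetary']
conditions = [
    (r <= 365) & (fp <= 365) & (m >= 100),                   # new active high
    (r <= 365) & (fp <= 365) & (m < 100),                    # new active low
    (r <= 365) & (m >= 100),                                  # active high
    (r <= 365) & (m < 100),                                   # active low
    (r > 365) & (r <= 2*365) & (fp <= 2*365) & (m >= 100),    # new warm high
    (r > 365) & (r <= 2*365) & (fp <= 2*365) & (m < 100),     # new warm low
    (r > 365) & (r <= 2*365) & (m >= 100),                    # warm high
    (r > 365) & (r <= 2*365) & (m < 100),                     # warm low
    (r > 2*365) & (r <= 3*365),                               # cold
]
choices = ['new active high', 'new active low', 'active high', 'active low',
           'new warm high', 'new warm low', 'warm high', 'warm low', 'cold']
seg_check = pd.Series(np.select(conditions, choices, default='inactive'), index=consumer_df.index)
(seg_check == consumer_df['segment']).all()   # should be True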
segment_df = consumer_df.groupby('segment').agg(size=('recency', 'count'),
                                   recency_min=('recency', 'min'),
                                   recency=('recency', 'mean'),
                                   recency_max=('recency', 'max'),
                                   frequency_min=('frequency', 'min'),
                                   frequency=('frequency', 'mean'),
                                   frequency_max=('frequency', 'max'),
                                   monetary_min=('monetary', 'min'),
                                   monetary=('monetary', 'mean'),
                                   monetary_max=('monetary', 'max'),
                                   first_purchase=('first_purchase', 'max'),
                                   first_purchase_date=('first_purchase_date', 'min'),
                                   first_purchase_year=('first_purchase_year', 'min'),
                                   last_purchase_date=('last_purchase_date', 'max'),
                                   last_purchase_year=('last_purchase_year', 'max')
                                  ).sort_values(by=['last_purchase_year', 'first_purchase_year','monetary'], ascending=[False, False, False])
segment_df['revenue'] = np.round(segment_df['size']*segment_df['monetary'], 0)  # revenue proxy: segment size x average purchase amount
segment_df
size recency_min recency recency_max frequency_min frequency frequency_max monetary_min monetary monetary_max first_purchase first_purchase_date first_purchase_year last_purchase_date last_purchase_year revenue
segment
new active high 263 1 82.372624 360 1 1.015209 2 100.0 283.381027 4500.000000 360 2015-01-06 2015 2015-12-31 2015 74529.0
new active low 1249 1 85.542034 359 1 1.052042 11 5.0 33.704697 90.000000 359 2015-01-07 2015 2015-12-31 2015 42097.0
active high 573 1 88.820244 344 2 5.888307 19 100.0 240.045740 4500.000000 4010 2005-01-08 2005 2015-12-31 2015 137546.0
active low 3313 1 108.361002 359 2 5.935406 45 5.0 40.724525 99.000000 4012 2005-01-06 2005 2015-12-31 2015 134920.0
new warm high 116 366 476.060345 701 1 1.060345 4 100.0 302.036379 4000.000000 707 2014-01-24 2014 2014-12-31 2014 35036.0
new warm low 822 366 513.996350 720 1 1.042579 3 5.0 33.374290 90.000000 720 2014-01-11 2014 2014-12-31 2014 27434.0
warm high 119 366 455.126050 665 2 4.714286 14 100.0 327.407457 4000.000000 4004 2005-01-14 2005 2014-12-31 2014 38961.0
warm low 901 366 474.377358 707 2 4.531632 20 5.0 38.591926 96.428571 4011 2005-01-07 2005 2014-12-31 2014 34771.0
cold 1903 731 857.781398 1087 1 2.303205 20 5.0 51.739893 2000.000000 4016 2005-01-02 2005 2013-12-31 2013 98461.0
inactive 9158 1096 2178.110832 4014 1 1.814479 23 5.0 48.112771 3043.750000 4016 2005-01-02 2005 2012-12-31 2012 440617.0
consumer_df['last_purchase_date'].max()
Timestamp('2015-12-31 00:00:00')
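A small follow-up sketch (not in the original notebook): each segment's share of the estimated revenue column computed above.

(segment_df['revenue'] / segment_df['revenue'].sum()).sort_values(ascending=False).round(3)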
def get_segment_summary(consumer_df):
    # Shared aggregations; the column order below matches the tables shown in this section.
    aggs = dict(size=('recency', 'count'),
                revenue_till_date=('revenue_till_date', 'sum'),
                avg_revenue_till_date=('revenue_till_date', 'mean'),
                recency_min=('recency', 'min'),
                recency=('recency', 'mean'),
                recency_max=('recency', 'max'),
                frequency_min=('frequency', 'min'),
                frequency=('frequency', 'mean'),
                frequency_max=('frequency', 'max'),
                monetary_min=('monetary', 'min'),
                monetary=('monetary', 'mean'),
                monetary_max=('monetary', 'max'),
                first_purchase=('first_purchase', 'max'),
                first_purchase_date=('first_purchase_date', 'min'),
                first_purchase_year=('first_purchase_year', 'min'),
                last_purchase_date=('last_purchase_date', 'max'),
                last_purchase_year=('last_purchase_year', 'max'))
    if 'revenue' in consumer_df.columns:
        # Revenue realised in the target year, available only after merging revenue_df in.
        aggs = dict(size=aggs.pop('size'),
                    revenue_today=('revenue', 'sum'),
                    avg_revenue_today=('revenue', 'mean'),
                    **aggs)
    segment_df = (consumer_df.groupby('segment')
                             .agg(**aggs)
                             .sort_values(by=['last_purchase_year', 'first_purchase_year', 'monetary'],
                                          ascending=[False, False, False]))
    return segment_df
def get_consumer_df(df, offset=0):
    # Rebuild the per-consumer RFM table as it would have looked `offset` days
    # before the snapshot date; offset=0 gives today's view.
    df = df.copy()
    df = df[df['days_since'] > offset]
    consumer_df = df.groupby('consumer_id').agg(recency=('days_since', 'min'),
                                            frequency=('date_of_purchase', 'count'),
                                            monetary=('purchase_amount', 'mean'),
                                            revenue_till_date=('purchase_amount', 'sum'),
                                            first_purchase=('days_since', 'max'),
                                            first_purchase_date=('date_of_purchase', 'min'),
                                            first_purchase_year=('year_of_purchase', 'min'),
                                            last_purchase_date=('date_of_purchase', 'max'),
                                            last_purchase_year=('year_of_purchase', 'max'),
                                           )
    consumer_df['recency'] = consumer_df['recency']  - offset
    consumer_df['first_purchase'] = consumer_df['first_purchase'] - offset
    consumer_df['segment'] = np.nan
    # Same segmentation rules as above, applied to the shifted recency / first purchase.
    rec_filter = consumer_df['recency']
    first_purchase_filter = consumer_df['first_purchase']
    consumer_df.loc[(rec_filter <=365)  & (consumer_df['monetary'] >= 100), 'segment'] = 'active high'
    consumer_df.loc[(rec_filter <=365)  & (consumer_df['monetary'] < 100), 'segment'] = 'active low'
    consumer_df.loc[(rec_filter <=365) & (first_purchase_filter <=365) & (consumer_df['monetary'] >= 100) , 'segment'] = 'new active high'
    consumer_df.loc[(rec_filter <=365) & (first_purchase_filter <=365) & (consumer_df['monetary'] < 100) , 'segment'] = 'new active low'
    consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (consumer_df['monetary'] >= 100), 'segment'] = 'warm high'
    consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (consumer_df['monetary'] < 100), 'segment'] = 'warm low'
    consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (first_purchase_filter <=2*365) & (consumer_df['monetary'] >= 100) , 'segment'] = 'new warm high'
    consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (first_purchase_filter <=2*365) & (consumer_df['monetary'] < 100) , 'segment'] = 'new warm low'
    consumer_df.loc[(rec_filter>2*365) & (rec_filter<=3*365), 'segment'] = 'cold'
    consumer_df.loc[(rec_filter>3*365), 'segment'] = 'inactive'
    return consumer_df
cons_df = get_consumer_df(df, offset=1*365);cons_df
get_segment_summary(cons_df)
size revenue_till_date avg_revenue_till_date recency_min recency recency_max frequency_min frequency frequency_max monetary_min monetary monetary_max first_purchase first_purchase_date first_purchase_year last_purchase_date last_purchase_year
segment
new active high 203 59506.22 293.134089 1 94.615764 336 1 1.049261 4 100.0 284.673498 4500.00 342 2014-01-24 2014 2014-12-31 2014
new active low 1234 45775.50 37.095219 1 138.251216 355 1 1.074554 10 5.0 34.364938 90.00 362 2014-01-04 2014 2014-12-31 2014
active high 475 665593.93 1401.250379 1 85.338947 342 2 5.696842 17 100.0 261.902155 4000.00 3645 2005-01-08 2005 2014-12-31 2014
active low 3011 699743.77 232.395805 1 98.091000 342 2 5.633677 40 5.0 40.459174 98.75 3647 2005-01-06 2005 2014-12-31 2014
new warm high 127 29418.34 231.640472 366 477.622047 721 1 1.047244 2 100.0 212.845669 2000.00 721 2013-01-10 2013 2013-12-31 2013
new warm low 1123 39498.00 35.171861 366 499.544969 722 1 1.058771 5 5.0 33.106115 99.00 722 2013-01-09 2013 2013-12-31 2013
warm high 111 92948.45 837.373423 366 461.198198 712 2 4.414414 14 100.0 187.849110 1250.00 3635 2005-01-18 2005 2013-12-31 2013
warm low 956 157955.50 165.225418 366 470.661088 720 2 4.361925 20 5.0 37.382060 98.00 3651 2005-01-02 2005 2013-12-31 2013
cold 2153 273645.09 127.099438 731 866.616814 1086 1 2.254064 23 5.0 51.114605 3043.75 3651 2005-01-02 2005 2012-12-31 2012
inactive 7512 651865.97 86.776620 1097 2058.441294 3649 1 1.730964 19 5.0 48.111199 3000.00 3651 2005-01-02 2005 2011-12-31 2011
cons_df['segment'].value_counts(dropna=False)
inactive           7512
active low         3011
cold               2153
new active low     1234
new warm low       1123
warm low            956
active high         475
new active high     203
new warm high       127
warm high           111
Name: segment, dtype: int64
cons_df['segment'].value_counts(dropna=False).sum()  # 18417 - 16905 = 1512 customers are missing: they made their first purchase in 2015
16905
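The retrospective view becomes most useful when compared with today's segments. The crosstab below is a hedged sketch (not part of the original notebook) of how customers migrated from last year's segment to this year's; the 1512 customers whose first purchase was in 2015 have no segment in the 2014 view.

migration = pd.merge(cons_df[['segment']].rename(columns={'segment': 'segment_last_year'}),
                     consumer_df[['segment']].rename(columns={'segment': 'segment_today'}),
                     left_index=True, right_index=True, how='right')
pd.crosstab(migration['segment_last_year'].fillna('new in 2015'), migration['segment_today'])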

Revenue Generation#

How much revenue was generated in 2015?

df.head()
consumer_id purchase_amount date_of_purchase year_of_purchase days_since
0 760 25.0 2009-11-06 2009 2247
1 860 50.0 2012-09-28 2012 1190
2 1200 100.0 2005-10-25 2005 3720
3 1420 50.0 2009-07-09 2009 2367
4 1940 70.0 2013-01-25 2013 1071
revenue_df = df[df['days_since'] <= 365].groupby('consumer_id').sum()  # purchases in the last 365 days, i.e. calendar year 2015
revenue_df.shape
(5398, 3)
revenue_df = df[df['year_of_purchase']==2015].groupby('consumer_id').agg(revenue=('purchase_amount', 'sum'))
revenue_df.shape
(5398, 1)
revenue_df.head()
revenue
consumer_id
80 80.0
480 45.0
830 50.0
850 60.0
860 60.0
cons_df= get_consumer_df(df, offset=0); cons_df.shape
(18417, 10)
get_segment_summary(pd.merge(cons_df, revenue_df, left_index=True,right_index=True, how='left').fillna(0))
size revenue_today avg_revenue_today revenue_till_date avg_revenue_till_date recency_min recency recency_max frequency_min frequency frequency_max monetary_min monetary monetary_max first_purchase first_purchase_date first_purchase_year last_purchase_date last_purchase_year
segment
new active high 263 75629.21 287.563536 75629.21 287.563536 1 82.372624 360 1 1.015209 2 100.0 283.381027 4500.000000 360 2015-01-06 2015 2015-12-31 2015
new active low 1249 44070.00 35.284227 44070.00 35.284227 1 85.542034 359 1 1.052042 11 5.0 33.704697 90.000000 359 2015-01-07 2015 2015-12-31 2015
active high 573 185405.00 323.568935 788194.95 1375.558377 1 88.820244 344 2 5.888307 19 100.0 240.045740 4500.000000 4010 2005-01-08 2005 2015-12-31 2015
active low 3313 173289.92 52.306043 810481.97 244.636876 1 108.361002 359 2 5.935406 45 5.0 40.724525 99.000000 4012 2005-01-06 2005 2015-12-31 2015
new warm high 116 0.00 0.000000 36166.22 311.777759 366 476.060345 701 1 1.060345 4 100.0 302.036379 4000.000000 707 2014-01-24 2014 2014-12-31 2014
new warm low 822 0.00 0.000000 28854.50 35.102798 366 513.996350 720 1 1.042579 3 5.0 33.374290 90.000000 720 2014-01-11 2014 2014-12-31 2014
warm high 119 0.00 0.000000 168498.98 1415.957815 366 455.126050 665 2 4.714286 14 100.0 327.407457 4000.000000 4004 2005-01-14 2005 2014-12-31 2014
warm low 901 0.00 0.000000 159740.22 177.292142 366 474.377358 707 2 4.531632 20 5.0 38.591926 96.428571 4011 2005-01-07 2005 2014-12-31 2014
cold 1903 0.00 0.000000 232402.29 122.124167 731 857.781398 1087 1 2.303205 20 5.0 51.739893 2000.000000 4016 2005-01-02 2005 2013-12-31 2013
inactive 9158 0.00 0.000000 850306.56 92.848500 1096 2178.110832 4014 1 1.814479 23 5.0 48.112771 3043.750000 4016 2005-01-02 2005 2012-12-31 2012
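A quick consistency check (a sketch, assuming the same merge as above): the revenue_today totals per segment should add up to the raw 2015 revenue in the transaction table, since every customer who bought in 2015 appears in cons_df.

merged = pd.merge(cons_df, revenue_df, left_index=True, right_index=True, how='left').fillna(0)
merged['revenue'].sum(), df.loc[df['year_of_purchase'] == 2015, 'purchase_amount'].sum()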
cons_df_2014 = get_consumer_df(df, offset=365); cons_df_2014.shape
(16905, 10)
get_segment_summary(pd.merge(cons_df_2014, revenue_df, left_index=True,right_index=True, how='left').fillna(0))
size revenue_today avg_revenue_today revenue_till_date avg_revenue_till_date recency_min recency recency_max frequency_min frequency frequency_max monetary_min monetary monetary_max first_purchase first_purchase_date first_purchase_year last_purchase_date last_purchase_year
segment
new active high 203 22275.00 109.729064 59506.22 293.134089 1 94.615764 336 1 1.049261 4 100.0 284.673498 4500.00 342 2014-01-24 2014 2014-12-31 2014
new active low 1234 22339.00 18.102917 45775.50 37.095219 1 138.251216 355 1 1.074554 10 5.0 34.364938 90.00 362 2014-01-04 2014 2014-12-31 2014
active high 475 120687.00 254.077895 665593.93 1401.250379 1 85.338947 342 2 5.696842 17 100.0 261.902155 4000.00 3645 2005-01-08 2005 2014-12-31 2014
active low 3011 126150.53 41.896556 699743.77 232.395805 1 98.091000 342 2 5.633677 40 5.0 40.459174 98.75 3647 2005-01-06 2005 2014-12-31 2014
new warm high 127 1925.00 15.157480 29418.34 231.640472 366 477.622047 721 1 1.047244 2 100.0 212.845669 2000.00 721 2013-01-10 2013 2013-12-31 2013
new warm low 1123 4405.00 3.922529 39498.00 35.171861 366 499.544969 722 1 1.058771 5 5.0 33.106115 99.00 722 2013-01-09 2013 2013-12-31 2013
warm high 111 12705.00 114.459459 92948.45 837.373423 366 461.198198 712 2 4.414414 14 100.0 187.849110 1250.00 3635 2005-01-18 2005 2013-12-31 2013
warm low 956 12901.00 13.494770 157955.50 165.225418 366 470.661088 720 2 4.361925 20 5.0 37.382060 98.00 3651 2005-01-02 2005 2013-12-31 2013
cold 2153 13151.00 6.108221 273645.09 127.099438 731 866.616814 1086 1 2.254064 23 5.0 51.114605 3043.75 3651 2005-01-02 2005 2012-12-31 2012
inactive 7512 22156.39 2.949466 651865.97 86.776620 1097 2058.441294 3649 1 1.730964 19 5.0 48.111199 3000.00 3651 2005-01-02 2005 2011-12-31 2011
get_segment_summary(pd.merge(cons_df_2014, revenue_df, left_index=True,right_index=True, how='left').fillna(0))['avg_revenue_today'].sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='segment'>
[Bar chart: average revenue generated in 2015 per customer (avg_revenue_today) for each 2014 segment, in descending order]
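The averages in the table above can be read as a naive per-customer revenue expectation for the year ahead. As a rough extension (an assumption, not part of the original notebook), applying the revenue each 2014 segment generated in 2015 to today's segment sizes gives a back-of-the-envelope forecast of next year's revenue by segment:

summary_2014 = get_segment_summary(pd.merge(cons_df_2014, revenue_df,
                                            left_index=True, right_index=True, how='left').fillna(0))
sizes_today = cons_df['segment'].value_counts()
# expected revenue per segment = today's segment size x average revenue a 2014 customer of that segment generated in 2015
(summary_2014['avg_revenue_today'] * sizes_today).sort_values(ascending=False).round(0)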
df[df['purchase_amount']>0]  # sanity check: every purchase amount is positive, so all 51243 rows come back
consumer_id purchase_amount date_of_purchase year_of_purchase days_since
0 760 25.0 2009-11-06 2009 2247
1 860 50.0 2012-09-28 2012 1190
2 1200 100.0 2005-10-25 2005 3720
3 1420 50.0 2009-07-09 2009 2367
4 1940 70.0 2013-01-25 2013 1071
... ... ... ... ... ...
51238 163230 30.0 2011-11-12 2011 1511
51239 154410 100.0 2013-11-15 2013 777
51240 189270 30.0 2014-12-23 2014 374
51241 173810 30.0 2015-05-28 2015 218
51242 9830 50.0 2011-11-30 2011 1493

51243 rows × 5 columns