Customer Lifetime Value#

Methodology

  • Do managerial segmentation

  • Identify behaviour for the customer for 1 year. [ Assume current year is N, Look at period from N-1 to N to evaluate transition matrix™]

    • Calculate Year N-1 & Year N customer matrix

    • Join them on customer_id

    • Tabulate to calculate Matrix

    • Divide by row_sum to calculate transition probablities

  • Assume the behaviour displayed by user remains the same per segment over the years.

  • Assume Year N as 1st column in Segment_customer matrix

  • segment_customer(j+1) = segment_customer(j)*TM + New Acquisition(j) where j goes from 0-> M where years are N-> N+M

  • Assume avg/revenue for year N as revenue for first column

  • Evaluated yearly_revenue vector by multiplying with segment_customer matrix [ Assumption: Avg revenue per customer is same]

  • Multiply by discount_factor matrix with yearly_revenue vector to calculate net present which is your CLV

  • What is the value of your database?=> CLV - year1 revenue.

Imports#

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf
sns.set()

Data Collection & Feature Engineering#

df = pd.read_csv("W106_purchases.txt", sep='\t', names=['consumer_id','purchase_amount', 'date_of_purchase'], parse_dates=['date_of_purchase'] )
df['year_of_purchase'] = df['date_of_purchase'].dt.year
max_date = df['date_of_purchase'].max() + dt.timedelta(days=1)
df['days_since'] = (max_date - df['date_of_purchase']).dt.days
df.head()
consumer_id purchase_amount date_of_purchase year_of_purchase days_since
0 760 25.0 2009-11-06 2009 2247
1 860 50.0 2012-09-28 2012 1190
2 1200 100.0 2005-10-25 2005 3720
3 1420 50.0 2009-07-09 2009 2367
4 1940 70.0 2013-01-25 2013 1071

Managerial Segmentation#

Calculate Customer and segment summary#

def get_consumer_df(df, offset=0):
    df = df.copy()
    df = df[df['days_since']>offset]
    consumer_df = df.groupby('consumer_id').agg(recency=('days_since', 'min'),
                                            frequency=('date_of_purchase', 'count'),
                                            monetary=('purchase_amount', 'mean'),
                                            # avg_purchase_amount=('purchase_amount', 'mean'),
                                            max_purchase_amount=('purchase_amount', 'max'),
                                            first_purchase=('days_since', 'max'), 
                                            revenue_till_date=('purchase_amount', 'sum'),
                                            first_purchase_date=('date_of_purchase', 'min'),
                                            first_purchase_year=('year_of_purchase', 'min'),
                                            last_purchase_date=('date_of_purchase', 'max'),
                                            last_purchase_year=('year_of_purchase', 'max'),
                                           ); consumer_df.head()
    consumer_df['recency'] = consumer_df['recency']  - offset
    consumer_df['first_purchase'] = consumer_df['first_purchase'] - offset
    consumer_df['segment'] = np.nan
    rec_filter  = consumer_df['recency'] 
    first_purchase_filter = consumer_df['first_purchase']
    consumer_df.loc[(rec_filter <=365)  & (consumer_df['monetary'] >= 100), 'segment'] = 'active high'
    consumer_df.loc[(rec_filter <=365)  & (consumer_df['monetary'] < 100), 'segment'] = 'active low'""
    consumer_df.loc[(rec_filter <=365) & (first_purchase_filter <=365) , 'segment'] = 'new active'
    # consumer_df.loc[(rec_filter <=365) & (first_purchase_filter <=365) & (consumer_df['monetary'] < 100) , 'segment'] = 'new active low'
    consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (consumer_df['monetary'] >= 100), 'segment'] = 'warm high'
    consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (consumer_df['monetary'] < 100), 'segment'] = 'warm low'
    consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (first_purchase_filter <=2*365) & (consumer_df['monetary'] >= 100) , 'segment'] = 'new warm high'
    consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (first_purchase_filter <=2*365) & (consumer_df['monetary'] < 100) , 'segment'] = 'new warm low'
    consumer_df.loc[(rec_filter>2*365) & (rec_filter<=3*365), 'segment'] = 'cold'
    consumer_df.loc[(rec_filter>3*365), 'segment'] = 'inactive'
    return consumer_df
def get_segment_summary(consumer_df):
    if 'revenue' in consumer_df.columns:
        segment_df = consumer_df.groupby('segment').agg(size=('recency', 'count'),
                                       revenue_today=('revenue', 'sum'),
                                       avg_revenue_today=('revenue', 'mean'),
                                       revenue_till_date=('revenue_till_date', 'sum'),
                                       avg_revenue_till_date=('revenue_till_date', 'mean'),
                                       recency_min=('recency', 'min'),
                                       recency=('recency', 'mean'),
                                       recency_max=('recency', 'max'),
                                       frequency_min=('frequency', 'min'),
                                       frequency=('frequency', 'mean'),
                                       frequency_max=('frequency', 'max'),
                                       monetary_min=('monetary', 'min'),
                                       monetary=('monetary', 'mean'),
                                       monetary_max=('monetary', 'max'),
                                       first_purchase=('first_purchase', 'max'),
                                       first_purchase_date=('first_purchase_date', 'min'),
                                       first_purchase_year=('first_purchase_year', 'min'),
                                       last_purchase_date=('last_purchase_date', 'max'),
                                       last_purchase_year=('last_purchase_year', 'max')
                                      ).sort_values(by=['last_purchase_year', 'first_purchase_year','monetary'], ascending=[False, False, False])
    else:
        segment_df = consumer_df.groupby('segment').agg(size=('recency', 'count'),
                                       revenue_till_date=('revenue_till_date', 'sum'),
                                       avg_revenue_till_date=('revenue_till_date', 'mean'),
                                       recency_min=('recency', 'min'),
                                       recency=('recency', 'mean'),
                                       recency_max=('recency', 'max'),
                                       frequency_min=('frequency', 'min'),
                                       frequency=('frequency', 'mean'),
                                       frequency_max=('frequency', 'max'),
                                       monetary_min=('monetary', 'min'),
                                       monetary=('monetary', 'mean'),
                                       monetary_max=('monetary', 'max'),
                                       first_purchase=('first_purchase', 'max'),
                                       first_purchase_date=('first_purchase_date', 'min'),
                                       first_purchase_year=('first_purchase_year', 'min'),
                                       last_purchase_date=('last_purchase_date', 'max'),
                                       last_purchase_year=('last_purchase_year', 'max')
                                      ).sort_values(by=['last_purchase_year', 'first_purchase_year','monetary'], ascending=[False, False, False])   
    return segment_df
cons_df_2014 = get_consumer_df(df, offset=365); cons_df_2014.head()
cons_df_2015 = get_consumer_df(df, offset=0); cons_df_2015.head()
recency frequency monetary max_purchase_amount first_purchase revenue_till_date first_purchase_date first_purchase_year last_purchase_date last_purchase_year segment
consumer_id
10 3829 1 30.000000 30.0 3829 30.0 2005-07-08 2005 2005-07-08 2005 inactive
80 343 7 71.428571 80.0 3751 500.0 2005-09-24 2005 2015-01-23 2015 active low
90 758 10 115.800000 153.0 3783 1158.0 2005-08-23 2005 2013-12-04 2013 cold
120 1401 1 20.000000 20.0 1401 20.0 2012-03-01 2012 2012-03-01 2012 inactive
130 2970 2 50.000000 60.0 3710 100.0 2005-11-04 2005 2007-11-14 2007 inactive
get_segment_summary(cons_df_2015)
size revenue_till_date avg_revenue_till_date recency_min recency recency_max frequency_min frequency frequency_max monetary_min monetary monetary_max first_purchase first_purchase_date first_purchase_year last_purchase_date last_purchase_year
segment
new active 1512 119699.21 79.166144 1 84.990741 360 1 1.045635 11 5.0 77.133847 4500.000000 360 2015-01-06 2015 2015-12-31 2015
active high 573 788194.95 1375.558377 1 88.820244 344 2 5.888307 19 100.0 240.045740 4500.000000 4010 2005-01-08 2005 2015-12-31 2015
active low 3313 810481.97 244.636876 1 108.361002 359 2 5.935406 45 5.0 40.724525 99.000000 4012 2005-01-06 2005 2015-12-31 2015
new warm high 116 36166.22 311.777759 366 476.060345 701 1 1.060345 4 100.0 302.036379 4000.000000 707 2014-01-24 2014 2014-12-31 2014
new warm low 822 28854.50 35.102798 366 513.996350 720 1 1.042579 3 5.0 33.374290 90.000000 720 2014-01-11 2014 2014-12-31 2014
warm high 119 168498.98 1415.957815 366 455.126050 665 2 4.714286 14 100.0 327.407457 4000.000000 4004 2005-01-14 2005 2014-12-31 2014
warm low 901 159740.22 177.292142 366 474.377358 707 2 4.531632 20 5.0 38.591926 96.428571 4011 2005-01-07 2005 2014-12-31 2014
cold 1903 232402.29 122.124167 731 857.781398 1087 1 2.303205 20 5.0 51.739893 2000.000000 4016 2005-01-02 2005 2013-12-31 2013
inactive 9158 850306.56 92.848500 1096 2178.110832 4014 1 1.814479 23 5.0 48.112771 3043.750000 4016 2005-01-02 2005 2012-12-31 2012
segment_order = get_segment_summary(cons_df_2015).index.tolist(); segment_order
['new active',
 'active high',
 'active low',
 'new warm high',
 'new warm low',
 'warm high',
 'warm low',
 'cold',
 'inactive']

Calculate Transition Matrix#

df_transition = pd.merge(cons_df_2014['segment'] , cons_df_2015['segment'], left_index=True, right_index=True, how='left')
df_transition.head(10)
segment_x segment_y
consumer_id
10 inactive inactive
80 active low active low
90 warm high cold
120 cold inactive
130 inactive inactive
160 inactive inactive
190 inactive inactive
220 inactive inactive
230 inactive inactive
240 active low warm low
transition_count = pd.crosstab(df_transition.segment_x, df_transition.segment_y); transition_count
segment_y active high active low cold inactive new warm high new warm low warm high warm low
segment_x
active high 354 2 0 0 0 0 119 0
active low 22 2088 0 0 0 0 0 901
cold 22 200 0 1931 0 0 0 0
inactive 35 250 0 7227 0 0 0 0
new active 89 410 0 0 116 822 0 0
new warm high 15 0 112 0 0 0 0 0
new warm low 0 96 1027 0 0 0 0 0
warm high 35 1 75 0 0 0 0 0
warm low 1 266 689 0 0 0 0 0
transition_count.sum(axis=1)
segment_x
active high       475
active low       3011
cold             2153
inactive         7512
new active       1437
new warm high     127
new warm low     1123
warm high         111
warm low          956
dtype: int64
transition_matrix = (transition_count.T/ transition_count.sum(axis=1)).T
transition_matrix.index.difference(transition_matrix.columns)
Index(['new active'], dtype='object')
transition_matrix[transition_matrix.index.difference(transition_matrix.columns)] = np.nan
def get_transition_matrix(cons_df_pre, cons_df_curr, order=None):
    df_transition = pd.merge(cons_df_pre['segment'] , cons_df_curr['segment'], left_index=True, right_index=True, how='left')
    transition_count = pd.crosstab(df_transition.segment_x, df_transition.segment_y)
    transition_matrix = (transition_count.T/ transition_count.sum(axis=1)).T
    transition_matrix[transition_matrix.index.difference(transition_matrix.columns)] = 0.0
    if order: return transition_matrix.loc[order, order]
    return transition_matrix
    
    
transition_matrix = get_transition_matrix(cons_df_2014, cons_df_2015, order=segment_order); transition_matrix
segment_y new active active high active low new warm high new warm low warm high warm low cold inactive
segment_x
new active 0.0 0.061935 0.285317 0.080724 0.572025 0.000000 0.000000 0.000000 0.000000
active high 0.0 0.745263 0.004211 0.000000 0.000000 0.250526 0.000000 0.000000 0.000000
active low 0.0 0.007307 0.693457 0.000000 0.000000 0.000000 0.299236 0.000000 0.000000
new warm high 0.0 0.118110 0.000000 0.000000 0.000000 0.000000 0.000000 0.881890 0.000000
new warm low 0.0 0.000000 0.085485 0.000000 0.000000 0.000000 0.000000 0.914515 0.000000
warm high 0.0 0.315315 0.009009 0.000000 0.000000 0.000000 0.000000 0.675676 0.000000
warm low 0.0 0.001046 0.278243 0.000000 0.000000 0.000000 0.000000 0.720711 0.000000
cold 0.0 0.010218 0.092894 0.000000 0.000000 0.000000 0.000000 0.000000 0.896888
inactive 0.0 0.004659 0.033280 0.000000 0.000000 0.000000 0.000000 0.000000 0.962061

Calculate Segment Customer Matrix#

transition_matrix.columns
Index(['new active', 'active high', 'active low', 'new warm high',
       'new warm low', 'warm high', 'warm low', 'cold', 'inactive'],
      dtype='object', name='segment_y')
segment_vector = get_segment_summary(cons_df_2015)['size']; segment_vector
segment
new active       1512
active high       573
active low       3313
new warm high     116
new warm low      822
warm high         119
warm low          901
cold             1903
inactive         9158
Name: size, dtype: int64
new_customer_vector = segment_vector.copy()*0.0
new_customer_vector.loc['new active'] = 1000
new_customer_vector
segment
new active       1000.0
active high         0.0
active low          0.0
new warm high       0.0
new warm low        0.0
warm high           0.0
warm low            0.0
cold                0.0
inactive            0.0
Name: size, dtype: float64
duration = 10
list(range(10))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
transition_matrix
segment_y new active active high active low new warm high new warm low warm high warm low cold inactive
segment_x
new active 0.0 0.061935 0.285317 0.080724 0.572025 0.000000 0.000000 0.000000 0.000000
active high 0.0 0.745263 0.004211 0.000000 0.000000 0.250526 0.000000 0.000000 0.000000
active low 0.0 0.007307 0.693457 0.000000 0.000000 0.000000 0.299236 0.000000 0.000000
new warm high 0.0 0.118110 0.000000 0.000000 0.000000 0.000000 0.000000 0.881890 0.000000
new warm low 0.0 0.000000 0.085485 0.000000 0.000000 0.000000 0.000000 0.914515 0.000000
warm high 0.0 0.315315 0.009009 0.000000 0.000000 0.000000 0.000000 0.675676 0.000000
warm low 0.0 0.001046 0.278243 0.000000 0.000000 0.000000 0.000000 0.720711 0.000000
cold 0.0 0.010218 0.092894 0.000000 0.000000 0.000000 0.000000 0.000000 0.896888
inactive 0.0 0.004659 0.033280 0.000000 0.000000 0.000000 0.000000 0.000000 0.962061
segment_vector
segment
new active       1512
active high       573
active low       3313
new warm high     116
new warm low      822
warm high         119
warm low          901
cold             1903
inactive         9158
Name: size, dtype: int64
transition_matrix.multiply(segment_vector, axis='index').sum()
segment_y
new active           0.000000
active high        659.167726
active low        3534.828749
new warm high      122.054280
new warm low       864.901879
warm high          143.551579
warm low           991.369313
cold              1583.796574
inactive         10517.329901
dtype: float64
base=2015
segment_matrix = pd.DataFrame(np.zeros([segment_vector.shape[0], duration]), index= segment_vector.index, columns=range(base, duration+base))
segment_matrix.iloc[:, 0] = segment_vector
segment_matrix.iloc[:, 1] = transition_matrix.multiply(segment_vector, axis='index').sum()
segment_matrix
2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
segment
new active 1512 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
active high 573 659.167726 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
active low 3313 3534.828749 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
new warm high 116 122.054280 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
new warm low 822 864.901879 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
warm high 119 143.551579 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
warm low 901 991.369313 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
cold 1903 1583.796574 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
inactive 9158 10517.329901 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
def get_segment_customer_matrix(duration, transition_matrix, segment_vector, new_customer_vector=None,  base=2015):
    if new_customer_vector is None: new_customer_vector = segment_vector.copy()*0
    
    segment_matrix = pd.DataFrame(np.zeros([segment_vector.shape[0], duration]), index= segment_vector.index, columns=range(base, duration+base))
    segment_matrix.iloc[:, 0] = segment_vector
    for i in range(1, duration):
        segment_matrix.iloc[:, i] = transition_matrix.multiply(segment_matrix.iloc[:, i-1], axis='index').sum()+new_customer_vector
    return round(segment_matrix)


segment_customer_matrix = get_segment_customer_matrix(11, transition_matrix, segment_vector, new_customer_vector=None) 
segment_customer_matrix
2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
segment
new active 1512 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
active high 573 659.0 643.0 628.0 610.0 596.0 583.0 572.0 563.0 555.0 548.0
active low 3313 3535.0 3302.0 3131.0 2952.0 2819.0 2716.0 2636.0 2574.0 2527.0 2490.0
new warm high 116 122.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
new warm low 822 865.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
warm high 119 144.0 165.0 161.0 157.0 153.0 149.0 146.0 143.0 141.0 139.0
warm low 901 991.0 1058.0 988.0 937.0 883.0 843.0 813.0 789.0 770.0 756.0
cold 1903 1584.0 1710.0 874.0 821.0 782.0 740.0 709.0 684.0 665.0 650.0
inactive 9158 10517.0 11539.0 12635.0 12939.0 13185.0 13385.0 13541.0 13663.0 13759.0 13833.0
segment_customer_matrix.sum()
2015    18417.0
2016    18417.0
2017    18417.0
2018    18417.0
2019    18416.0
2020    18418.0
2021    18416.0
2022    18417.0
2023    18416.0
2024    18417.0
2025    18416.0
dtype: float64
revenue_df = df[df['year_of_purchase']==2015].groupby('consumer_id').agg(revenue_2015=('purchase_amount', 'sum')); revenue_df.head()
revenue_2015
consumer_id
80 80.0
480 45.0
830 50.0
850 60.0
860 60.0
segment_revenue_vector=pd.merge(cons_df_2015, revenue_df, left_index=True,right_index=True, how='left')[['segment','revenue_2015']].fillna(0)\
                         .groupby('segment').mean().loc[segment_order]
segment_revenue_vector
revenue_2015
segment
new active 79.166144
active high 323.568935
active low 52.306043
new warm high 0.000000
new warm low 0.000000
warm high 0.000000
warm low 0.000000
cold 0.000000
inactive 0.000000
segment_customer_revenue = segment_customer_matrix.multiply(segment_revenue_vector.values, axis=1)
yearly_revenue = segment_customer_revenue.sum(); yearly_revenue
2015    478394.130000
2016    398133.789962
2017    380769.379008
2018    366971.511648
2019    351784.489138
2020    340297.820341
2021    330703.901766
2022    322960.160047
2023    316805.064971
2024    311758.129473
2025    307557.823339
dtype: float64
discount_rate = 0.1
discount_factor = [1/(1+discount_rate)**i for i in range(0,duration+1)]
discount_factor
[1.0,
 0.9090909090909091,
 0.8264462809917354,
 0.7513148009015775,
 0.6830134553650705,
 0.6209213230591549,
 0.5644739300537772,
 0.5131581182307065,
 0.4665073802097331,
 0.42409761837248455,
 0.3855432894295314]
(yearly_revenue*discount_factor)
2015    478394.130000
2016    361939.809056
2017    314685.437197
2018    275711.128210
2019    240273.539470
2020    211298.172840
2021    186673.731114
2022    165729.627993
2023    147791.900897
2024    132215.880218
2025    118576.854900
dtype: float64
(yearly_revenue*discount_factor).sum() # Value of Database
2633290.2118960177