Predictive Analytics#

Goals:

  • What is the likelihood that a customer will purchase?

  • If they do, how much will they spend?
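
Together, the two models yield an expected-revenue score per customer: Score = P(purchase) × E[spend | purchase].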

Imports#

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf

Data Collection and Feature Engineering#

Read Dataset#

df = pd.read_csv("W106_purchases.txt", sep='\t',
                 names=['consumer_id', 'purchase_amount', 'date_of_purchase'],
                 parse_dates=['date_of_purchase'])
df['year_of_purchase'] = df['date_of_purchase'].dt.year
# Reference date: one day after the last recorded purchase, so days_since >= 1
max_date = df['date_of_purchase'].max() + dt.timedelta(days=1)
df['days_since'] = (max_date - df['date_of_purchase']).dt.days
df.head()
consumer_id purchase_amount date_of_purchase year_of_purchase days_since
0 760 25.0 2009-11-06 2009 2247
1 860 50.0 2012-09-28 2012 1190
2 1200 100.0 2005-10-25 2005 3720
3 1420 50.0 2009-07-09 2009 2367
4 1940 70.0 2013-01-25 2013 1071

Generate consumer_df Inputs and the Target (revenue_2015)#

def get_consumer_df(df, offset=0):
    """Build RFM-style features per consumer, using only purchases made
    more than `offset` days before the reference date."""
    df = df.copy()
    df = df[df['days_since'] > offset]
    consumer_df = df.groupby('consumer_id').agg(
        recency=('days_since', 'min'),
        frequency=('date_of_purchase', 'count'),
        avg_purchase_amount=('purchase_amount', 'mean'),
        max_purchase_amount=('purchase_amount', 'max'),
        first_purchase=('days_since', 'max'),
    )
    # Express recency and first_purchase relative to the shifted reference date
    consumer_df['recency'] = consumer_df['recency'] - offset
    consumer_df['first_purchase'] = consumer_df['first_purchase'] - offset
    return consumer_df
# Target: total 2015 revenue per consumer
revenue_df = (df[df['year_of_purchase'] == 2015]
              .groupby('consumer_id')
              .agg(revenue_2015=('purchase_amount', 'sum')))
revenue_df.head()
             revenue_2015
consumer_id
80                   80.0
480                  45.0
830                  50.0
850                  60.0
860                  60.0
cons_df_2014 = get_consumer_df(df, offset=365); cons_df_2014.head()
recency frequency avg_purchase_amount max_purchase_amount first_purchase
consumer_id
10 3464 1 30.0 30.0 3464
80 302 6 70.0 80.0 3386
90 393 10 115.8 153.0 3418
120 1036 1 20.0 20.0 1036
130 2605 2 50.0 60.0 3345
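
As a sanity check (an illustrative addition, assuming the file's last purchase falls on 2015-12-31 so that the reference date is 2016-01-01), the offset=365 frame should contain no 2015 purchases:

# Illustrative: all purchases kept under offset=365 predate 2015
assert df.loc[df['days_since'] > 365, 'year_of_purchase'].max() < 2015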

Prepare Training Dataset#

df_insample = pd.merge(cons_df_2014, revenue_df, left_index=True, right_index=True, how='left')
df_insample['active_2015'] = df_insample['revenue_2015'].notnull().astype(int)
df_insample = df_insample.fillna(0)
df_insample.head()
recency frequency avg_purchase_amount max_purchase_amount first_purchase revenue_2015 active_2015
consumer_id
10 3464 1 30.0 30.0 3464 0.0 0
80 302 6 70.0 80.0 3386 80.0 1
90 393 10 115.8 153.0 3418 0.0 0
120 1036 1 20.0 20.0 1036 0.0 0
130 2605 2 50.0 60.0 3345 0.0 0
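
As an illustrative check of the class balance (not in the original notebook), only a minority of the 16,905 customers in the training frame, 3,886, were active in 2015:

df_insample['active_2015'].value_counts()  # expect roughly 0: 13019, 1: 3886, per the model outputs below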

Prepare Out-of-Sample Dataset#

df_outsample = get_consumer_df(df, offset=0); df_outsample.shape
(18417, 5)
df_outsample.head()
recency frequency avg_purchase_amount max_purchase_amount first_purchase
consumer_id
10 3829 1 30.000000 30.0 3829
80 343 7 71.428571 80.0 3751
90 758 10 115.800000 153.0 3783
120 1401 1 20.000000 20.0 1401
130 2970 2 50.000000 60.0 3710
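
With offset=0, the same features are recomputed on the full history through the end of 2015: consumer 10, with no 2015 purchase, sees recency grow from 3464 to 3829 (+365 days), while consumer 80's frequency rises from 6 to 7.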

Predictive Models#

def process_df(df):
    """Optional log transforms for the skewed count/amount features.
    Defined for experimentation; not applied in the models below."""
    df_cp = df.copy()
    df_cp['frequency'] = np.log1p(df_cp['frequency'])
    df_cp['avg_purchase_amount'] = np.log1p(df_cp['avg_purchase_amount'])
    return df_cp

Likelihood of Purchase#

df_insample.columns
Index(['recency', 'frequency', 'avg_purchase_amount', 'max_purchase_amount',
       'first_purchase', 'revenue_2015', 'active_2015'],
      dtype='object')
input_ds = df_insample[['recency', 'frequency', 'avg_purchase_amount', 'max_purchase_amount',
       'first_purchase']]
y = df_insample['active_2015']
X = sm.add_constant(input_ds); X.head()
const recency frequency avg_purchase_amount max_purchase_amount first_purchase
consumer_id
10 1.0 3464 1 30.0 30.0 3464
80 1.0 302 6 70.0 80.0 3386
90 1.0 393 10 115.8 153.0 3418
120 1.0 1036 1 20.0 20.0 1036
130 1.0 2605 2 50.0 60.0 3345
model_purchase_incidence = sm.Logit(y, X).fit()
Optimization terminated successfully.
         Current function value: 0.365836
         Iterations 8
model_purchase_incidence.summary()
Logit Regression Results
Dep. Variable: active_2015 No. Observations: 16905
Model: Logit Df Residuals: 16899
Method: MLE Df Model: 5
Date: Sat, 28 May 2022 Pseudo R-squ.: 0.3214
Time: 19:48:41 Log-Likelihood: -6184.5
converged: True LL-Null: -9113.9
Covariance Type: nonrobust LLR p-value: 0.000
coef std err z P>|z| [0.025 0.975]
const -0.5331 0.044 -12.087 0.000 -0.620 -0.447
recency -0.0020 6e-05 -32.748 0.000 -0.002 -0.002
frequency 0.2195 0.015 14.840 0.000 0.191 0.249
avg_purchase_amount 0.0004 0.000 1.144 0.253 -0.000 0.001
max_purchase_amount -0.0002 0.000 -0.574 0.566 -0.001 0.000
first_purchase -1.167e-05 3.93e-05 -0.297 0.766 -8.86e-05 6.53e-05
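
Recency has a strong negative effect on the log-odds of purchasing and frequency a strong positive one; avg_purchase_amount, max_purchase_amount, and first_purchase are not significant here. As a quick in-sample check of discrimination, a minimal sketch (roc_auc_score is an addition, not used in the original notebook):

from sklearn.metrics import roc_auc_score

p_hat = model_purchase_incidence.predict(X)  # in-sample P(active_2015)
print(f"In-sample AUC: {roc_auc_score(y, p_hat):.3f}")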

Amount of Purchase#

df_insample[df_insample['active_2015']==1]
recency frequency avg_purchase_amount max_purchase_amount first_purchase revenue_2015 active_2015
consumer_id
80 302 6 70.000000 80.0 3386 80.0 1
480 16 11 62.272727 235.0 3313 45.0 1
830 267 6 48.333333 60.0 3374 50.0 1
850 62 8 28.125000 30.0 3051 60.0 1
860 267 9 53.333333 60.0 3643 60.0 1
... ... ... ... ... ... ... ...
234590 1 1 20.000000 20.0 1 25.0 1
234640 1 1 30.000000 30.0 1 45.0 1
234660 1 1 35.000000 35.0 1 40.0 1
234760 1 1 20.000000 20.0 1 15.0 1
236660 390 2 75.000000 100.0 684 100.0 1

3886 rows × 7 columns

model_monetary_prediction = smf.ols(formula='revenue_2015 ~ avg_purchase_amount + max_purchase_amount', data=df_insample[df_insample['active_2015']==1])
res = model_monetary_prediction.fit()
res.summary()
OLS Regression Results
Dep. Variable: revenue_2015 R-squared: 0.605
Model: OLS Adj. R-squared: 0.605
Method: Least Squares F-statistic: 2979.
Date: Sat, 28 May 2022 Prob (F-statistic): 0.00
Time: 20:01:40 Log-Likelihood: -24621.
No. Observations: 3886 AIC: 4.925e+04
Df Residuals: 3883 BIC: 4.927e+04
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 20.7471 2.381 8.713 0.000 16.079 25.415
avg_purchase_amount 0.6749 0.033 20.575 0.000 0.611 0.739
max_purchase_amount 0.2923 0.024 12.367 0.000 0.246 0.339
Omnibus: 5580.836 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 8162692.709
Skew: 7.843 Prob(JB): 0.00
Kurtosis: 226.980 Cond. No. 315.


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
y = res.predict(df_insample[df_insample['active_2015']==1]).values  # predicted revenue
x = df_insample[df_insample['active_2015']==1]['revenue_2015'].values  # actual revenue
sns.scatterplot(x=x, y=y)
[Figure: predicted vs. actual revenue_2015, linear model (03_predictive_analytics_28_2.png)]
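
revenue_2015 is heavily right-skewed (skew ≈ 7.8 in the summary above), so refitting on log scales should give a better-behaved model: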
model_monetary_prediction2 = smf.ols(formula='np.log(revenue_2015) ~ np.log(avg_purchase_amount) + np.log(max_purchase_amount)', data=df_insample[df_insample['active_2015']==1])
res2 = model_monetary_prediction2.fit()
res2.summary()
OLS Regression Results
Dep. Variable: np.log(revenue_2015) R-squared: 0.693
Model: OLS Adj. R-squared: 0.693
Method: Least Squares F-statistic: 4377.
Date: Sat, 28 May 2022 Prob (F-statistic): 0.00
Time: 20:12:35 Log-Likelihood: -2644.6
No. Observations: 3886 AIC: 5295.
Df Residuals: 3883 BIC: 5314.
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 0.3700 0.040 9.242 0.000 0.292 0.448
np.log(avg_purchase_amount) 0.5488 0.042 13.171 0.000 0.467 0.631
np.log(max_purchase_amount) 0.3881 0.038 10.224 0.000 0.314 0.463
Omnibus: 501.505 Durbin-Watson: 1.961
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3328.833
Skew: 0.421 Prob(JB): 0.00
Kurtosis: 7.455 Cond. No. 42.2


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
y = res2.predict(df_insample[df_insample['active_2015']==1]).values  # predicted log-revenue
x = np.log(df_insample[df_insample['active_2015']==1]['revenue_2015'].values)  # actual log-revenue
sns.scatterplot(x=x, y=y)
[Figure: predicted vs. actual log revenue_2015, log-log model (03_predictive_analytics_30_2.png)]
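
One caveat for scoring: exponentiating a log-scale prediction recovers (under normal errors) the conditional median rather than the mean, so it tends to understate expected revenue. A possible refinement, not applied in this notebook, is Duan's smearing estimator; a minimal sketch:

# Illustrative only: rescale exp(prediction) by the mean of exp(residuals)
smear = np.mean(np.exp(res2.resid))
monetary_adj = np.exp(res2.predict(df_outsample)) * smear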

Scoring and Out-of-Sample Prediction#

out = df_outsample.copy()
# P(active in 2015) from the logit model; the predictors need the same constant term
out['Prob'] = model_purchase_incidence.predict(sm.add_constant(df_outsample.copy()))
# Expected spend if active: back-transform the log-log OLS prediction
out['Monetary'] = np.exp(res2.predict(df_outsample.copy()))
# Expected revenue = probability of purchase × expected spend
out['Score'] = out['Prob'] * out['Monetary']
out.head()
recency frequency avg_purchase_amount max_purchase_amount first_purchase Prob Monetary Score
consumer_id
10 3829 1 30.000000 30.0 3829 0.000380 35.048014 0.013326
80 343 7 71.428571 80.0 3751 0.575159 82.558085 47.484052
90 758 10 115.800000 153.0 3783 0.538104 138.426136 74.487670
120 1401 1 20.000000 20.0 1401 0.044037 23.970452 1.055597
130 2970 2 50.000000 60.0 3710 0.002568 60.709281 0.155924
out.describe().T 
count mean std min 25% 50% 75% max
recency 18417.0 1253.037900 1081.437868 1.000000 244.000000 1070.000000 2130.000000 4014.000000
frequency 18417.0 2.782375 2.936888 1.000000 1.000000 2.000000 3.000000 45.000000
avg_purchase_amount 18417.0 57.792985 154.360109 5.000000 21.666667 30.000000 50.000000 4500.000000
max_purchase_amount 18417.0 68.756314 194.317960 5.000000 25.000000 30.000000 60.000000 4500.000000
first_purchase 18417.0 1984.009882 1133.405441 1.000000 988.000000 2087.000000 2992.000000 4016.000000
Prob 18417.0 0.224987 0.251136 0.000263 0.012637 0.106187 0.397817 0.999904
Monetary 18417.0 65.632173 147.887338 6.540053 28.999793 35.048014 57.298430 3832.952670
Score 18417.0 18.833654 70.211180 0.003298 0.455815 4.556268 17.957241 2854.156430

Note

Averaged over the whole customer base, the expected 2015 revenue per customer (the mean of Score) is about 18.83.
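
That figure is the mean of the Score column; summing it instead gives the expected total revenue across the base. An illustrative computation:

print(out['Score'].mean())  # ≈ 18.83 per customer
print(out['Score'].sum())   # expected total across all 18,417 customers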