# Predictive Analytics#

Goal :-

• What is likelihood that a customer will purchase?

• If they do, How much they will spend?

## Imports#

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.cluster import AgglomerativeClustering
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf


## Data Collection and Feature Engineering#

df = pd.read_csv("W106_purchases.txt", sep='\t', names=['consumer_id','purchase_amount', 'date_of_purchase'], parse_dates=['date_of_purchase'] )
df['year_of_purchase'] = df['date_of_purchase'].dt.year
max_date = df['date_of_purchase'].max() + dt.timedelta(days=1)
df['days_since'] = (max_date - df['date_of_purchase']).dt.days

consumer_id purchase_amount date_of_purchase year_of_purchase days_since
0 760 25.0 2009-11-06 2009 2247
1 860 50.0 2012-09-28 2012 1190
2 1200 100.0 2005-10-25 2005 3720
3 1420 50.0 2009-07-09 2009 2367
4 1940 70.0 2013-01-25 2013 1071

### Generate Consumer_df Input and Target (Revenue_2015)#

def get_consumer_df(df, offset=0):
df = df.copy()
df = df[df['days_since']>offset]
consumer_df = df.groupby('consumer_id').agg(recency=('days_since', 'min'),
frequency=('date_of_purchase', 'count'),
# monetary=('purchase_amount', 'mean'),
avg_purchase_amount=('purchase_amount', 'mean'),
max_purchase_amount=('purchase_amount', 'max'),
first_purchase=('days_since', 'max'),
# revenue_till_date=('purchase_amount', 'sum'),

# first_purchase_date=('date_of_purchase', 'min'),
# first_purchase_year=('year_of_purchase', 'min'),
# last_purchase_date=('date_of_purchase', 'max'),
# last_purchase_year=('year_of_purchase', 'max'),
consumer_df['recency'] = consumer_df['recency']  - offset
consumer_df['first_purchase'] = consumer_df['first_purchase'] - offset
# consumer_df['segment'] = np.nan
# rec_filter  = consumer_df['recency']
# first_purchase_filter = consumer_df['first_purchase']
# consumer_df.loc[(rec_filter <=365)  & (consumer_df['monetary'] >= 100), 'segment'] = 'active high'
# consumer_df.loc[(rec_filter <=365)  & (consumer_df['monetary'] < 100), 'segment'] = 'active low'""
# consumer_df.loc[(rec_filter <=365) & (first_purchase_filter <=365) & (consumer_df['monetary'] >= 100) , 'segment'] = 'new active high'
# consumer_df.loc[(rec_filter <=365) & (first_purchase_filter <=365) & (consumer_df['monetary'] < 100) , 'segment'] = 'new active low'
# consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (consumer_df['monetary'] >= 100), 'segment'] = 'warm high'
# consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (consumer_df['monetary'] < 100), 'segment'] = 'warm low'
# consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (first_purchase_filter <=2*365) & (consumer_df['monetary'] >= 100) , 'segment'] = 'new warm high'
# consumer_df.loc[(rec_filter>365) & (rec_filter<=2*365) & (first_purchase_filter <=2*365) & (consumer_df['monetary'] < 100) , 'segment'] = 'new warm low'
# consumer_df.loc[(rec_filter>2*365) & (rec_filter<=3*365), 'segment'] = 'cold'
# consumer_df.loc[(rec_filter>3*365), 'segment'] = 'inactive'
return consumer_df

revenue_df = df[df['year_of_purchase']==2015].groupby('consumer_id').agg(revenue_2015=('purchase_amount', 'sum')); revenue_df.head()

revenue_2015
consumer_id
80 80.0
480 45.0
830 50.0
850 60.0
860 60.0
cons_df_2014 = get_consumer_df(df, offset=365); cons_df_2014.head()

recency frequency avg_purchase_amount max_purchase_amount first_purchase
consumer_id
10 3464 1 30.0 30.0 3464
80 302 6 70.0 80.0 3386
90 393 10 115.8 153.0 3418
120 1036 1 20.0 20.0 1036
130 2605 2 50.0 60.0 3345

### Prepare Training dataset#

df_insample = pd.merge(cons_df_2014, revenue_df, left_index=True,right_index=True, how='left')
df_insample['active_2015'] = 1
df_insample.loc[df_insample.revenue_2015.isnull(), 'active_2015']=0
df_insample = df_insample.fillna(0)

recency frequency avg_purchase_amount max_purchase_amount first_purchase revenue_2015 active_2015
consumer_id
10 3464 1 30.0 30.0 3464 0.0 0
80 302 6 70.0 80.0 3386 80.0 1
90 393 10 115.8 153.0 3418 0.0 0
120 1036 1 20.0 20.0 1036 0.0 0
130 2605 2 50.0 60.0 3345 0.0 0

### Prepare Out of Sample Dataset#

df_outsample = get_consumer_df(df, offset=0);df_outsample.shape

(18417, 5)

df_outsample.head()

recency frequency avg_purchase_amount max_purchase_amount first_purchase
consumer_id
10 3829 1 30.000000 30.0 3829
80 343 7 71.428571 80.0 3751
90 758 10 115.800000 153.0 3783
120 1401 1 20.000000 20.0 1401
130 2970 2 50.000000 60.0 3710

## Predictive Models#

def process_df(df):
df_cp = df.copy()
df_cp['frequency'] = np.log1p(df_cp['frequency'])
df_cp['avg_purchase_amount'] = np.log1p(df_cp['avg_purchase_amount'])
return df_cp


### Likelihood of Purchase#

df_insample.columns

Index(['recency', 'frequency', 'avg_purchase_amount', 'max_purchase_amount',
'first_purchase', 'revenue_2015', 'active_2015'],
dtype='object')

input_ds = df_insample[['recency', 'frequency', 'avg_purchase_amount', 'max_purchase_amount',
'first_purchase']]
y = df_insample['active_2015']

const recency frequency avg_purchase_amount max_purchase_amount first_purchase
consumer_id
10 1.0 3464 1 30.0 30.0 3464
80 1.0 302 6 70.0 80.0 3386
90 1.0 393 10 115.8 153.0 3418
120 1.0 1036 1 20.0 20.0 1036
130 1.0 2605 2 50.0 60.0 3345
model_purchase_incidence = sm.Logit(y, X).fit()

Optimization terminated successfully.
Current function value: 0.365836
Iterations 8

model_purchase_incidence.summary()

Dep. Variable: No. Observations: active_2015 16905 Logit 16899 MLE 5 Sat, 28 May 2022 0.3214 19:48:41 -6184.5 True -9113.9 nonrobust 0
coef std err z P>|z| [0.025 0.975] -0.5331 0.044 -12.087 0.000 -0.620 -0.447 -0.0020 6e-05 -32.748 0.000 -0.002 -0.002 0.2195 0.015 14.840 0.000 0.191 0.249 0.0004 0.000 1.144 0.253 -0.000 0.001 -0.0002 0.000 -0.574 0.566 -0.001 0.000 -1.167e-05 3.93e-05 -0.297 0.766 -8.86e-05 6.53e-05

### Amount of Purchase#

df_insample[df_insample['active_2015']==1]

recency frequency avg_purchase_amount max_purchase_amount first_purchase revenue_2015 active_2015
consumer_id
80 302 6 70.000000 80.0 3386 80.0 1
480 16 11 62.272727 235.0 3313 45.0 1
830 267 6 48.333333 60.0 3374 50.0 1
850 62 8 28.125000 30.0 3051 60.0 1
860 267 9 53.333333 60.0 3643 60.0 1
... ... ... ... ... ... ... ...
234590 1 1 20.000000 20.0 1 25.0 1
234640 1 1 30.000000 30.0 1 45.0 1
234660 1 1 35.000000 35.0 1 40.0 1
234760 1 1 20.000000 20.0 1 15.0 1
236660 390 2 75.000000 100.0 684 100.0 1

3886 rows × 7 columns

model_monetary_prediction = smf.ols(formula='revenue_2015 ~ avg_purchase_amount + max_purchase_amount', data=df_insample[df_insample['active_2015']==1])

res = model_monetary_prediction.fit()

res.summary()

Dep. Variable: R-squared: revenue_2015 0.605 OLS 0.605 Least Squares 2979. Sat, 28 May 2022 0.00 20:01:40 -24621. 3886 4.925e+04 3883 4.927e+04 2 nonrobust
coef std err t P>|t| [0.025 0.975] 20.7471 2.381 8.713 0.000 16.079 25.415 0.6749 0.033 20.575 0.000 0.611 0.739 0.2923 0.024 12.367 0.000 0.246 0.339
 Omnibus: Durbin-Watson: 5580.84 2.007 0 8.16269e+06 7.843 0 226.98 315

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
y = res.predict(df_insample[df_insample['active_2015']==1]).values
x = df_insample[df_insample['active_2015']==1]['revenue_2015'].values
sns.scatterplot(x,y)

/opt/anaconda/envs/aiking/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be data, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(

<AxesSubplot:>

model_monetary_prediction2 = smf.ols(formula='np.log(revenue_2015) ~ np.log(avg_purchase_amount) + np.log(max_purchase_amount)', data=df_insample[df_insample['active_2015']==1])
res2 = model_monetary_prediction2.fit()
res2.summary()

Dep. Variable: R-squared: np.log(revenue_2015) 0.693 OLS 0.693 Least Squares 4377. Sat, 28 May 2022 0.00 20:12:35 -2644.6 3886 5295. 3883 5314. 2 nonrobust
coef std err t P>|t| [0.025 0.975] 0.3700 0.040 9.242 0.000 0.292 0.448 0.5488 0.042 13.171 0.000 0.467 0.631 0.3881 0.038 10.224 0.000 0.314 0.463
 Omnibus: Durbin-Watson: 501.505 1.961 0 3328.83 0.421 0 7.455 42.2

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
y = res2.predict(df_insample[df_insample['active_2015']==1]).values
x = np.log(df_insample[df_insample['active_2015']==1]['revenue_2015'].values)
sns.scatterplot(x,y)

/opt/anaconda/envs/aiking/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be data, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(

<AxesSubplot:>


### Scoring and Out of Sample Prediction#

out = df_outsample.copy()
out['Monetary'] = np.exp( res2.predict(df_outsample.copy()))
out['Score'] = out['Prob']*out['Monetary']

recency frequency avg_purchase_amount max_purchase_amount first_purchase Prob Monetary Score
consumer_id
10 3829 1 30.000000 30.0 3829 0.000380 35.048014 0.013326
80 343 7 71.428571 80.0 3751 0.575159 82.558085 47.484052
90 758 10 115.800000 153.0 3783 0.538104 138.426136 74.487670
120 1401 1 20.000000 20.0 1401 0.044037 23.970452 1.055597
130 2970 2 50.000000 60.0 3710 0.002568 60.709281 0.155924
out.describe().T

count mean std min 25% 50% 75% max
recency 18417.0 1253.037900 1081.437868 1.000000 244.000000 1070.000000 2130.000000 4014.000000
frequency 18417.0 2.782375 2.936888 1.000000 1.000000 2.000000 3.000000 45.000000
avg_purchase_amount 18417.0 57.792985 154.360109 5.000000 21.666667 30.000000 50.000000 4500.000000
max_purchase_amount 18417.0 68.756314 194.317960 5.000000 25.000000 30.000000 60.000000 4500.000000
first_purchase 18417.0 1984.009882 1133.405441 1.000000 988.000000 2087.000000 2992.000000 4016.000000
Prob 18417.0 0.224987 0.251136 0.000263 0.012637 0.106187 0.397817 0.999904
Monetary 18417.0 65.632173 147.887338 6.540053 28.999793 35.048014 57.298430 3832.952670
Score 18417.0 18.833654 70.211180 0.003298 0.455815 4.556268 17.957241 2854.156430

Note

Expected revenue for each customer based on score = 18.83