Brand Choice Modelling#

Imports#

import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt 
import sklearn
import numpy as np
import pickle
import joblib
import itertools
from sklearn.linear_model import LogisticRegression
sns.set()

Read and Prepare Dataset#

df = pd.read_csv("purchase data.csv"); df.head()
ID Day Incidence Brand Quantity Last_Inc_Brand Last_Inc_Quantity Price_1 Price_2 Price_3 ... Promotion_3 Promotion_4 Promotion_5 Sex Marital status Age Education Income Occupation Settlement size
0 200000001 1 0 0 0 0 0 1.59 1.87 2.01 ... 0 0 0 0 0 47 1 110866 1 0
1 200000001 11 0 0 0 0 0 1.51 1.89 1.99 ... 0 0 0 0 0 47 1 110866 1 0
2 200000001 12 0 0 0 0 0 1.51 1.89 1.99 ... 0 0 0 0 0 47 1 110866 1 0
3 200000001 16 0 0 0 0 0 1.52 1.89 1.98 ... 0 0 0 0 0 47 1 110866 1 0
4 200000001 18 0 0 0 0 0 1.52 1.89 1.99 ... 0 0 0 0 0 47 1 110866 1 0

5 rows × 24 columns

def do_clustering(df, pipeline, drop_cols=None, sel_cols=None, do_fit=False):
    y = None
    df_new = df.copy()
    if drop_cols: df_new = df_new.drop(columns=drop_cols, axis=1)
    df_filter = df_new.copy()    
    if sel_cols: df_filter = df_new[sel_cols]
    if do_fit:y = pipeline.fit_predict(df_filter)
    else: y = pipeline.predict(df_filter)
    if 'pca' in pipeline.named_steps:
        m = pipeline.named_steps['pca']
        comp_names = [f"PCA{i+1}" for i in range(m.n_components)]
        transform_df = df_filter.copy()
        for step in pipeline.named_steps:
            transform_df = pipeline.named_steps[step].transform(transform_df)
            if step == "pca": break
        pca_df = pd.DataFrame(transform_df, 
                              columns=comp_names, 
                              index=df_filter.index)
        df_new = pd.concat([df_new, pca_df], axis=1)
    df_new['y'] = y+1
    return df_new, pipeline
pipeline = joblib.load("cluster_pipeline.pkl"); pipeline
sel_cols = ['Sex','Marital status','Age','Education','Income','Occupation','Settlement size']
df_segments, _ = do_clustering(df, pipeline, sel_cols=sel_cols)
names = {1:"Standard",
         2:"Career-Focussed",
         3:"Fewer-Opportunities",
         4:"Well-off"}

df_segments['labels'] = df_segments['y'].map(names)
df_segments.head().T
0 1 2 3 4
ID 200000001 200000001 200000001 200000001 200000001
Day 1 11 12 16 18
Incidence 0 0 0 0 0
Brand 0 0 0 0 0
Quantity 0 0 0 0 0
Last_Inc_Brand 0 0 0 0 0
Last_Inc_Quantity 0 0 0 0 0
Price_1 1.59 1.51 1.51 1.52 1.52
Price_2 1.87 1.89 1.89 1.89 1.89
Price_3 2.01 1.99 1.99 1.98 1.99
Price_4 2.09 2.09 2.09 2.09 2.09
Price_5 2.66 2.66 2.66 2.66 2.66
Promotion_1 0 0 0 0 0
Promotion_2 1 0 0 0 0
Promotion_3 0 0 0 0 0
Promotion_4 0 0 0 0 0
Promotion_5 0 0 0 0 0
Sex 0 0 0 0 0
Marital status 0 0 0 0 0
Age 47 47 47 47 47
Education 1 1 1 1 1
Income 110866 110866 110866 110866 110866
Occupation 1 1 1 1 1
Settlement size 0 0 0 0 0
PCA1 0.362152 0.362152 0.362152 0.362152 0.362152
PCA2 -0.639557 -0.639557 -0.639557 -0.639557 -0.639557
PCA3 1.462706 1.462706 1.462706 1.462706 1.462706
PCA4 -0.593242 -0.593242 -0.593242 -0.593242 -0.593242
y 3 3 3 3 3
labels Fewer-Opportunities Fewer-Opportunities Fewer-Opportunities Fewer-Opportunities Fewer-Opportunities
features = "Brand|Price*|Promotion*|labels"
df_brand_choice = df_segments[df_segments.Incidence == 1].filter(regex=features).drop(columns=["Last_Inc_Brand"]).reset_index(drop=True); df_brand_choice.head()
Brand Price_1 Price_2 Price_3 Price_4 Price_5 Promotion_1 Promotion_2 Promotion_3 Promotion_4 Promotion_5 labels
0 2 1.50 1.90 1.99 2.09 2.67 0 0 0 0 0 Fewer-Opportunities
1 5 1.39 1.90 1.91 2.12 2.62 1 0 0 0 1 Fewer-Opportunities
2 1 1.47 1.90 1.99 1.97 2.67 0 0 0 1 0 Fewer-Opportunities
3 4 1.21 1.35 1.99 2.16 2.68 1 1 0 0 0 Fewer-Opportunities
4 2 1.46 1.88 1.97 1.89 2.37 1 0 0 1 1 Fewer-Opportunities
df_brand_choice.filter(regex="Pr")
Price_1 Price_2 Price_3 Price_4 Price_5 Promotion_1 Promotion_2 Promotion_3 Promotion_4 Promotion_5
0 1.50 1.90 1.99 2.09 2.67 0 0 0 0 0
1 1.39 1.90 1.91 2.12 2.62 1 0 0 0 1
2 1.47 1.90 1.99 1.97 2.67 0 0 0 1 0
3 1.21 1.35 1.99 2.16 2.68 1 1 0 0 0
4 1.46 1.88 1.97 1.89 2.37 1 0 0 1 1
... ... ... ... ... ... ... ... ... ... ...
14633 1.48 1.89 2.01 2.18 2.69 0 0 0 0 0
14634 1.35 1.57 2.02 2.21 2.70 1 1 0 0 0
14635 1.50 1.85 2.06 2.24 2.79 1 1 0 0 0
14636 1.42 1.51 1.97 2.24 2.78 0 0 0 0 0
14637 1.51 1.82 2.09 2.24 2.80 0 0 0 0 0

14638 rows × 10 columns

Brand choice probability#

model_brand = LogisticRegression(multi_class='multinomial', solver='sag')
model_brand.fit(df_brand_choice.filter(regex="Pr").values, df_brand_choice['Brand'].values)
LogisticRegression(multi_class='multinomial', solver='sag')
model_brand.coef_.shape
(5, 10)
df_brand_choice.filter(regex="Pr").columns.tolist(), df_brand_choice.Brand.unique().tolist()
(['Price_1',
  'Price_2',
  'Price_3',
  'Price_4',
  'Price_5',
  'Promotion_1',
  'Promotion_2',
  'Promotion_3',
  'Promotion_4',
  'Promotion_5'],
 [2, 5, 1, 4, 3])
df_brand_coeff = pd.DataFrame(model_brand.coef_, columns=df_brand_choice.filter(regex="Pr").columns.tolist(), index=model_brand.classes_)
fig, ax = plt.subplots(figsize=(12,6))
sns.heatmap(data=df_brand_coeff, cmap="RdYlBu", annot=True, ax=ax)
<AxesSubplot:>
../../_images/06_brand_probability_15_1.png
df_brand_coeff.loc[1, 'Price_1']
-4.469319087099478
prices = np.arange(0.5, 3.5,0.01); prices.shape, prices
((300,),
 array([0.5 , 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 ,
        0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71,
        0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82,
        0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93,
        0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.  , 1.01, 1.02, 1.03, 1.04,
        1.05, 1.06, 1.07, 1.08, 1.09, 1.1 , 1.11, 1.12, 1.13, 1.14, 1.15,
        1.16, 1.17, 1.18, 1.19, 1.2 , 1.21, 1.22, 1.23, 1.24, 1.25, 1.26,
        1.27, 1.28, 1.29, 1.3 , 1.31, 1.32, 1.33, 1.34, 1.35, 1.36, 1.37,
        1.38, 1.39, 1.4 , 1.41, 1.42, 1.43, 1.44, 1.45, 1.46, 1.47, 1.48,
        1.49, 1.5 , 1.51, 1.52, 1.53, 1.54, 1.55, 1.56, 1.57, 1.58, 1.59,
        1.6 , 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67, 1.68, 1.69, 1.7 ,
        1.71, 1.72, 1.73, 1.74, 1.75, 1.76, 1.77, 1.78, 1.79, 1.8 , 1.81,
        1.82, 1.83, 1.84, 1.85, 1.86, 1.87, 1.88, 1.89, 1.9 , 1.91, 1.92,
        1.93, 1.94, 1.95, 1.96, 1.97, 1.98, 1.99, 2.  , 2.01, 2.02, 2.03,
        2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1 , 2.11, 2.12, 2.13, 2.14,
        2.15, 2.16, 2.17, 2.18, 2.19, 2.2 , 2.21, 2.22, 2.23, 2.24, 2.25,
        2.26, 2.27, 2.28, 2.29, 2.3 , 2.31, 2.32, 2.33, 2.34, 2.35, 2.36,
        2.37, 2.38, 2.39, 2.4 , 2.41, 2.42, 2.43, 2.44, 2.45, 2.46, 2.47,
        2.48, 2.49, 2.5 , 2.51, 2.52, 2.53, 2.54, 2.55, 2.56, 2.57, 2.58,
        2.59, 2.6 , 2.61, 2.62, 2.63, 2.64, 2.65, 2.66, 2.67, 2.68, 2.69,
        2.7 , 2.71, 2.72, 2.73, 2.74, 2.75, 2.76, 2.77, 2.78, 2.79, 2.8 ,
        2.81, 2.82, 2.83, 2.84, 2.85, 2.86, 2.87, 2.88, 2.89, 2.9 , 2.91,
        2.92, 2.93, 2.94, 2.95, 2.96, 2.97, 2.98, 2.99, 3.  , 3.01, 3.02,
        3.03, 3.04, 3.05, 3.06, 3.07, 3.08, 3.09, 3.1 , 3.11, 3.12, 3.13,
        3.14, 3.15, 3.16, 3.17, 3.18, 3.19, 3.2 , 3.21, 3.22, 3.23, 3.24,
        3.25, 3.26, 3.27, 3.28, 3.29, 3.3 , 3.31, 3.32, 3.33, 3.34, 3.35,
        3.36, 3.37, 3.38, 3.39, 3.4 , 3.41, 3.42, 3.43, 3.44, 3.45, 3.46,
        3.47, 3.48, 3.49]))
brand = 5
df_prices = pd.DataFrame({"mean":df_brand_choice.filter(regex="Price").mean(),
           "min":df_brand_choice.filter(regex="Price").min(),
           "max":df_brand_choice.filter(regex="Price").max()}).reset_index().melt(id_vars='index')
df_prices
index variable value
0 Price_1 mean 1.384559
1 Price_2 mean 1.764717
2 Price_3 mean 2.006694
3 Price_4 mean 2.159658
4 Price_5 mean 2.654296
5 Price_1 min 1.100000
6 Price_2 min 1.260000
7 Price_3 min 1.870000
8 Price_4 min 1.760000
9 Price_5 min 2.110000
10 Price_1 max 1.590000
11 Price_2 max 1.900000
12 Price_3 max 2.140000
13 Price_4 max 2.260000
14 Price_5 max 2.800000
regex="Price"

def get_df_summary(df_brand_choice, regex="Price"):
    df = pd.DataFrame({"mean":df_brand_choice.filter(regex=regex).mean(),
               "min":df_brand_choice.filter(regex=regex).min(),
               "max":df_brand_choice.filter(regex=regex).max()}).reset_index().melt(id_vars='index')
    df['id'] = df['index'].str.cat(df['variable'], sep="-")
    df = df.drop(columns=['index', 'variable'], axis=1)
    df = df.set_index('id').T.reset_index(drop=True)
    return df

get_df_summary(df_brand_choice,regex="Promotion")
id Promotion_1-mean Promotion_2-mean Promotion_3-mean Promotion_4-mean Promotion_5-mean Promotion_1-min Promotion_2-min Promotion_3-min Promotion_4-min Promotion_5-min Promotion_1-max Promotion_2-max Promotion_3-max Promotion_4-max Promotion_5-max
0 0.372455 0.349638 0.043858 0.128091 0.04543 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 1.0
include_promo = "min"
brand = 5
df_brand_elasticity=pd.DataFrame()
df_brand_elasticity['prices'] = prices
df_brand_elasticity['promo']= 1
df_brand_elasticity['brand']= brand
df_brand_elasticity = df_brand_elasticity.join(get_df_summary(df_brand_choice,regex="Price")).ffill()
df_brand_elasticity = df_brand_elasticity.join(get_df_summary(df_brand_choice,regex="Promotion")).ffill()

for br in model_brand.classes_:
    df_input = df_brand_elasticity.filter(regex=f"(Price.*mean|Promo.*{include_promo})").copy()
    df_input[f"Price_{br}-mean"] = df_brand_elasticity.prices
    probabilities = model_brand.predict_proba(df_input.values)
    df_brand_elasticity.loc[:, f"Probabilities_{br}" ] = probabilities[:,br-1]
    if br == brand: df_brand_elasticity.loc[:, f"Elasticity_{br}" ] = df_brand_coeff.loc[brand, f'Price_{brand}']*df_brand_elasticity.prices*(1- df_brand_elasticity[f"Probabilities_{br}"])
    else: df_brand_elasticity.loc[:, f"Elasticity_{br}" ] = -1*df_brand_coeff.loc[brand, f'Price_{brand}']*df_brand_elasticity.prices*df_brand_elasticity[f"Probabilities_{br}"]
df_brand_elasticity
prices promo brand Price_1-mean Price_2-mean Price_3-mean Price_4-mean Price_5-mean Price_1-min Price_2-min ... Probabilities_1 Elasticity_1 Probabilities_2 Elasticity_2 Probabilities_3 Elasticity_3 Probabilities_4 Elasticity_4 Probabilities_5 Elasticity_5
0 0.50 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 9.183243e-01 0.265457 0.853568 0.246738 0.002126 0.000615 0.652661 0.188662 0.709821 -0.083881
1 0.51 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 9.143896e-01 0.269606 0.850910 0.250889 0.002180 0.000643 0.650092 0.191678 0.708717 -0.085884
2 0.52 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 9.102819e-01 0.273657 0.848212 0.254997 0.002234 0.000672 0.647513 0.194661 0.707608 -0.087901
3 0.53 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 9.059955e-01 0.277606 0.845475 0.259062 0.002290 0.000702 0.644924 0.197611 0.706493 -0.089933
4 0.54 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 9.015243e-01 0.281448 0.842696 0.263083 0.002348 0.000733 0.642325 0.200529 0.705373 -0.091980
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
295 3.45 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 7.788910e-07 0.000002 0.010487 0.020917 0.057650 0.114986 0.036948 0.073695 0.222011 -1.551743
296 3.46 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 7.335952e-07 0.000001 0.010264 0.020532 0.056995 0.114010 0.036359 0.072730 0.220519 -1.559226
297 3.47 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 6.909231e-07 0.000001 0.010046 0.020153 0.056344 0.113033 0.035777 0.071774 0.219033 -1.566714
298 3.48 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 6.507233e-07 0.000001 0.009832 0.019781 0.055696 0.112056 0.035204 0.070826 0.217553 -1.574206
299 3.49 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 6.128534e-07 0.000001 0.009623 0.019416 0.055053 0.111079 0.034638 0.069888 0.216080 -1.581702

300 rows × 43 columns

df_brand_elasticity['Price_5-max'][0]
2.8
o = 5 #Own Brand
c = 4 #Cross Brand
fig, axes = plt.subplots(1,2, figsize=(12,6))
sns.lineplot(data=df_brand_elasticity, x="prices", y=f"Elasticity_{o}", ax=axes[0]).set(title=f"Own_brand_{o}")
axes[0].axvspan(df_brand_elasticity[f'Price_{o}-min'][0], df_brand_elasticity[f'Price_{o}-max'][0], alpha=.2, color='red')
axes[0].axvline(df_brand_elasticity[f'Price_{o}-mean'][0], color='blue')
sns.lineplot(data=df_brand_elasticity, x="prices", y=f"Elasticity_{c}", ax=axes[1]).set(title=f"Cross_brand_{c}")
axes[1].axvspan(df_brand_elasticity[f'Price_{c}-min'][0], df_brand_elasticity[f'Price_{c}-max'][0], alpha=.2, color='red')
axes[1].axvline(df_brand_elasticity[f'Price_{c}-mean'][0], color='blue')
<matplotlib.lines.Line2D at 0xf1f6c080eb0>
../../_images/06_brand_probability_24_1.png

Brand choice probability by segments#

segments = df_segments['labels'].unique().tolist(); segments
segments.insert(0, None)
segments
[None, 'Fewer-Opportunities', 'Well-off', 'Career-Focussed', 'Standard']
def get_brand_elasticity_df(df_brand_choice, segment=None, brand=5, include_promo='min', max_iter=1000):
    label = "Aggregate"
    df = df_brand_choice.copy()
    if segment: 
        label = segment
        df = df_brand_choice[df_brand_choice['labels'] == segment].copy()
        
    # Model Training
    model_brand = LogisticRegression(multi_class='multinomial', solver='sag', max_iter=max_iter)
    model_brand.fit(df.filter(regex="Pr").values, df['Brand'].values)
    df_brand_coeff = pd.DataFrame(model_brand.coef_, columns=df.filter(regex="Pr").columns.tolist(), index=model_brand.classes_)
    prices = np.arange(0.5, 3.5,0.01)
    df_brand_elasticity = pd.DataFrame()
    df_brand_elasticity['prices'] = prices
    df_brand_elasticity['promo']= 1
    df_brand_elasticity['brand']= brand
    df_brand_elasticity = df_brand_elasticity.join(get_df_summary(df,regex="Price")).ffill()
    df_brand_elasticity = df_brand_elasticity.join(get_df_summary(df,regex="Promotion")).ffill()
    for br in model_brand.classes_:
        df_input = df_brand_elasticity.filter(regex=f"(Price.*mean|Promo.*{include_promo})").copy()
        df_input[f"Price_{br}-mean"] = df_brand_elasticity.prices
        probabilities = model_brand.predict_proba(df_input.values)
        df_brand_elasticity.loc[:, f"Probabilities_{br}" ] = probabilities[:,br-1]
        if br == brand: df_brand_elasticity.loc[:, f"Elasticity_{br}" ] = df_brand_coeff.loc[brand, f'Price_{brand}']*df_brand_elasticity.prices*(1- df_brand_elasticity[f"Probabilities_{br}"])
        else: df_brand_elasticity.loc[:, f"Elasticity_{br}" ] = -1*df_brand_coeff.loc[brand, f'Price_{brand}']*df_brand_elasticity.prices*df_brand_elasticity[f"Probabilities_{br}"]
    df_brand_elasticity['label'] = label
    return df_brand_elasticity
    
a = get_brand_elasticity_df(df_brand_choice,segment=segments[0])
a.head()
prices promo brand Price_1-mean Price_2-mean Price_3-mean Price_4-mean Price_5-mean Price_1-min Price_2-min ... Elasticity_1 Probabilities_2 Elasticity_2 Probabilities_3 Elasticity_3 Probabilities_4 Elasticity_4 Probabilities_5 Elasticity_5 label
0 0.50 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 0.265461 0.853569 0.246742 0.002126 0.000614 0.652661 0.188665 0.709819 -0.083883 Aggregate
1 0.51 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 0.269610 0.850911 0.250893 0.002179 0.000643 0.650092 0.191681 0.708715 -0.085886 Aggregate
2 0.52 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 0.273662 0.848213 0.255001 0.002234 0.000672 0.647513 0.194664 0.707606 -0.087903 Aggregate
3 0.53 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 0.277611 0.845475 0.259066 0.002290 0.000702 0.644924 0.197614 0.706491 -0.089935 Aggregate
4 0.54 1 5 1.384559 1.764717 2.006694 2.159658 2.654296 1.1 1.26 ... 0.281453 0.842697 0.263087 0.002347 0.000733 0.642325 0.200531 0.705370 -0.091982 Aggregate

5 rows × 44 columns

brands = [1,2,3,4,5]
df_brand_elasticity_all = pd.concat([get_brand_elasticity_df(df_brand_choice, segment=s, brand=b) for s,b in itertools.product(segments, brands)]).reset_index(drop=True)
df_brand_elasticity_all.shape
(7500, 44)