Channel Attribution Analysis

Imports

import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt 
import sklearn
import numpy as np
import pickle
import joblib
import itertools
from sklearn.linear_model import LogisticRegression
sns.set()

Read the Dataset

  • Cookie: Randomly generated customer id enabling us to tie subsequent visits back to the same customer

  • Timestamp: Date and time when the visit took place

  • Interaction: Categorical variable indicating the type of interaction that took place

  • Conversion: Boolean variable indicating whether a conversion took place

  • Conversion Value: Value of the potential conversion event

  • Channel: The marketing channel that brought the customer to our site

df = pd.read_csv("attribution data.csv"); df.head().T
0 1 2 3 4
cookie 00000FkCnDfDDf0iC97iC703B 00000FkCnDfDDf0iC97iC703B 00000FkCnDfDDf0iC97iC703B 00000FkCnDfDDf0iC97iC703B 0000nACkD9nFkBBDECD3ki00E
time 2018-07-03T13:02:11Z 2018-07-17T19:15:07Z 2018-07-24T15:51:46Z 2018-07-29T07:44:51Z 2018-07-03T09:44:57Z
interaction impression impression impression impression impression
conversion 0 0 0 0 0
conversion_value 0.0 0.0 0.0 0.0 0.0
channel Instagram Online Display Online Display Online Display Paid Search
df.shape
(586737, 6)
df.describe(include='all')
cookie time interaction conversion conversion_value channel
count 586737 586737 586737 586737.000000 586737.000000 586737
unique 240108 485110 2 NaN NaN 5
top FDkCnkBiB7FiE9oBDC7ifn7kF 2018-07-14T20:50:09Z impression NaN NaN Facebook
freq 134 12 569098 NaN NaN 175741
mean NaN NaN NaN 0.030063 0.187871 NaN
std NaN NaN NaN 0.170760 1.084498 NaN
min NaN NaN NaN 0.000000 0.000000 NaN
25% NaN NaN NaN 0.000000 0.000000 NaN
50% NaN NaN NaN 0.000000 0.000000 NaN
75% NaN NaN NaN 0.000000 0.000000 NaN
max NaN NaN NaN 1.000000 8.500000 NaN
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586737 entries, 0 to 586736
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   cookie            586737 non-null  object 
 1   time              586737 non-null  object 
 2   interaction       586737 non-null  object 
 3   conversion        586737 non-null  int64  
 4   conversion_value  586737 non-null  float64
 5   channel           586737 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 26.9+ MB
df = df.sort_values(['cookie', 'time'], ascending=[False, True])
df.head()
cookie time interaction conversion conversion_value channel
586736 ooooohAFofEnonEikhAi3fF9o 2018-07-14T17:17:12Z impression 0 0.0 Paid Search
586734 ooooiBh70D3k3BfAhDFfii9h7 2018-07-03T12:57:25Z impression 0 0.0 Paid Search
586735 ooooiBh70D3k3BfAhDFfii9h7 2018-07-19T08:17:59Z impression 0 0.0 Online Video
586731 ooooEiB0CCoEf9fiiC90Dfhfk 2018-07-06T23:30:38Z impression 0 0.0 Online Display
586732 ooooEiB0CCoEf9fiiC90Dfhfk 2018-07-12T23:50:45Z impression 0 0.0 Online Display
df['visit_store'] = df.groupby('cookie').cumcount() +1 # 1-based running number of each row within its cookie group (first row of a cookie is 1)
df.head()
cookie time interaction conversion conversion_value channel visit_store
586736 ooooohAFofEnonEikhAi3fF9o 2018-07-14T17:17:12Z impression 0 0.0 Paid Search 1
586734 ooooiBh70D3k3BfAhDFfii9h7 2018-07-03T12:57:25Z impression 0 0.0 Paid Search 1
586735 ooooiBh70D3k3BfAhDFfii9h7 2018-07-19T08:17:59Z impression 0 0.0 Online Video 2
586731 ooooEiB0CCoEf9fiiC90Dfhfk 2018-07-06T23:30:38Z impression 0 0.0 Online Display 1
586732 ooooEiB0CCoEf9fiiC90Dfhfk 2018-07-12T23:50:45Z impression 0 0.0 Online Display 2

Convert each user's sequence of channel interactions into a path (an ordered list of channels)

df_paths = df.groupby('cookie')['channel'].agg(lambda x: x.unique().tolist()).reset_index(); df_paths.head()
cookie channel
0 00000FkCnDfDDf0iC97iC703B [Instagram, Online Display]
1 0000nACkD9nFkBBDECD3ki00E [Paid Search]
2 0003EfE37E93D0BC03iBhBBhF [Paid Search]
3 00073CFE3FoFCn70fBhB3kfon [Instagram]
4 00079hhBkDF3k3kDkiFi9EFAD [Paid Search]
df_last_interaction = df.drop_duplicates('cookie', keep='last')[['cookie', 'conversion']]
df_last_interaction.head()
cookie conversion
586736 ooooohAFofEnonEikhAi3fF9o 0
586735 ooooiBh70D3k3BfAhDFfii9h7 0
586733 ooooEiB0CCoEf9fiiC90Dfhfk 0
586730 ooooEBE0o0D97ACAAAnDoi3F0 0
586729 ooooE0hkAFBkED90ChDDiBFAf 0
df_combined = pd.merge(df_paths, df_last_interaction, how='left', on='cookie'); df_combined.head()
cookie channel conversion
0 00000FkCnDfDDf0iC97iC703B [Instagram, Online Display] 0
1 0000nACkD9nFkBBDECD3ki00E [Paid Search] 0
2 0003EfE37E93D0BC03iBhBBhF [Paid Search] 0
3 00073CFE3FoFCn70fBhB3kfon [Instagram] 0
4 00079hhBkDF3k3kDkiFi9EFAD [Paid Search] 0
# df_combined['channel'] = 
def mod_channel_list(row):
    """Return the row's channel path wrapped in start and outcome markers.

    Parameters
    ----------
    row : mapping
        Must provide ``row['channel']`` (a list of channel names) and
        ``row['conversion']``.

    Returns
    -------
    list
        ``['Start', *channels, 'Conversion']`` when ``row['conversion'] == 1``,
        otherwise ``['Start', *channels, 'Null']``. A new list is built; the
        list stored in the row is not modified.
    """
    outcome = 'Conversion' if row['conversion'] == 1 else 'Null'
    return ['Start'] + row['channel'] + [outcome]
df_combined['channel'] = df_combined.apply(mod_channel_list,axis=1); df_combined.head()
cookie channel conversion
0 00000FkCnDfDDf0iC97iC703B [Start, Instagram, Online Display, Null] 0
1 0000nACkD9nFkBBDECD3ki00E [Start, Paid Search, Null] 0
2 0003EfE37E93D0BC03iBhBBhF [Start, Paid Search, Null] 0
3 00073CFE3FoFCn70fBhB3kfon [Start, Instagram, Null] 0
4 00079hhBkDF3k3kDkiFi9EFAD [Start, Paid Search, Null] 0

Markov Chain Modelling

list_of_channels = df_combined['channel']
list_of_channels
0         [Start, Instagram, Online Display, Null]
1                       [Start, Paid Search, Null]
2                       [Start, Paid Search, Null]
3                         [Start, Instagram, Null]
4                       [Start, Paid Search, Null]
                            ...                   
240103               [Start, Online Display, Null]
240104               [Start, Online Display, Null]
240105               [Start, Online Display, Null]
240106    [Start, Paid Search, Online Video, Null]
240107                  [Start, Paid Search, Null]
Name: channel, Length: 240108, dtype: object
for channels in df_combined['channel'].tolist():
    print(channels.count('Conversion'))
    if channels.count('Conversion'):
        print(channels)
        break
0
0
0
0
0
0
0
1
['Start', 'Paid Search', 'Conversion']
total_conversions = df_combined['conversion'].sum()
base_conversion_rate = df_combined['conversion'].mean()
total_conversions, base_conversion_rate
(17639, 0.07346277508454528)
list_of_paths = df_combined['channel']

list_of_unique_channels = set(x for element in list_of_paths for x in element); list_of_unique_channels
{'Conversion',
 'Facebook',
 'Instagram',
 'Null',
 'Online Display',
 'Online Video',
 'Paid Search',
 'Start'}
transition_states = {x+'>'+y: 0 for x in list_of_unique_channels for y in list_of_unique_channels}
len(transition_states.keys())
64
# Same zero-initialised "origin>destination" table as above, built with
# itertools.product. Assign to `transition_states` (plural) so the length check
# that follows inspects the dict created here.
transition_states = {a+'>'+b: 0 for a,b in itertools.product(list_of_unique_channels, list_of_unique_channels)}
len(transition_states.keys())
64
i = 0
for user_path in list_of_paths:
    print([i for i,s in enumerate(user_path)])
    i = i+1
    if i == 10: break
    
list(transition_states.keys())[10]
'Null>Online Video'
# Number of paths containing the selected transition. Use .sum() on the boolean
# mask: .count() only counts non-null entries and therefore reports every path.
# regex=False makes the transition string match literally.
list_of_paths.str.join('>').str.contains(list(transition_states.keys())[10], regex=False).sum()
0
transitions = list_of_paths.str.join('>')
transitions
0         Start>Instagram>Online Display>Null
1                      Start>Paid Search>Null
2                      Start>Paid Search>Null
3                        Start>Instagram>Null
4                      Start>Paid Search>Null
                         ...                 
240103              Start>Online Display>Null
240104              Start>Online Display>Null
240105              Start>Online Display>Null
240106    Start>Paid Search>Online Video>Null
240107                 Start>Paid Search>Null
Name: channel, Length: 240108, dtype: object
# Count, for every ordered pair of states, how many joined paths contain the
# transition "a>b"; pairs that never occur are left out. Each pair is scanned
# once (the scan over all paths is the expensive part) and matched literally.
transition_states = {}
for a, b in itertools.product(list_of_unique_channels, list_of_unique_channels):
    pair = a+'>'+b
    pair_count = transitions.str.contains(pair, regex=False).sum()
    if pair_count > 0:
        transition_states[pair] = pair_count
start_keys = list(set(k.split('>')[0] for k in transition_states.keys())); start_keys
['Online Video',
 'Paid Search',
 'Instagram',
 'Online Display',
 'Start',
 'Facebook']
sel_key = start_keys[0]


# Total number of outgoing transitions per origin state. The origin is compared
# exactly (the text before '>') instead of with a string-prefix test, so a state
# whose name is a prefix of another state's name cannot absorb its counts.
dict_start_keys = {sel_key: sum(count
                                for key, count in transition_states.items()
                                if key.split('>', 1)[0] == sel_key)
                   for sel_key in start_keys}
dict_start_keys
{'Online Video': 40962,
 'Paid Search': 88840,
 'Instagram': 49466,
 'Online Display': 42604,
 'Start': 240108,
 'Facebook': 88125}
transition_prob = {k: v/dict_start_keys[k.split('>')[0]] for k, v in transition_states.items()}
transition_prob
{'Online Video>Conversion': 0.0781455983594551,
 'Online Video>Null': 0.7637566525072018,
 'Online Video>Instagram': 0.03190762169815927,
 'Online Video>Paid Search': 0.04794687759386749,
 'Online Video>Online Display': 0.018919974610614718,
 'Online Video>Facebook': 0.059323275230701626,
 'Instagram>Conversion': 0.057979218048760765,
 'Instagram>Null': 0.6297861157158452,
 'Instagram>Online Video': 0.024117575708567502,
 'Instagram>Paid Search': 0.04580924271216593,
 'Instagram>Online Display': 0.023531314438199977,
 'Instagram>Facebook': 0.2187765333764606,
 'Paid Search>Conversion': 0.053309320126069336,
 'Paid Search>Null': 0.7707789284106259,
 'Paid Search>Online Video': 0.029221071589374155,
 'Paid Search>Instagram': 0.03419630796938316,
 'Paid Search>Online Display': 0.04805267897343539,
 'Paid Search>Facebook': 0.06444169293111211,
 'Online Display>Conversion': 0.05032391324758239,
 'Online Display>Null': 0.7566425687728852,
 'Online Display>Online Video': 0.017298845178856444,
 'Online Display>Instagram': 0.02936344005257722,
 'Online Display>Paid Search': 0.09238569148436766,
 'Online Display>Facebook': 0.053985541263731104,
 'Start>Online Video': 0.14236093757808985,
 'Start>Instagram': 0.11918803205224315,
 'Start>Paid Search': 0.31739883718993117,
 'Start>Online Display': 0.14264414346877238,
 'Start>Facebook': 0.2784080497109634,
 'Facebook>Conversion': 0.053219858156028366,
 'Facebook>Null': 0.6731234042553191,
 'Facebook>Online Video': 0.025577304964539006,
 'Facebook>Instagram': 0.1730723404255319,
 'Facebook>Paid Search': 0.05065531914893617,
 'Facebook>Online Display': 0.02435177304964539}
transition_matrix = pd.DataFrame(columns=list_of_unique_channels, index=list_of_unique_channels)
transition_matrix
Conversion Null Online Video Instagram Paid Search Online Display Start Facebook
Conversion NaN NaN NaN NaN NaN NaN NaN NaN
Null NaN NaN NaN NaN NaN NaN NaN NaN
Online Video NaN NaN NaN NaN NaN NaN NaN NaN
Instagram NaN NaN NaN NaN NaN NaN NaN NaN
Paid Search NaN NaN NaN NaN NaN NaN NaN NaN
Online Display NaN NaN NaN NaN NaN NaN NaN NaN
Start NaN NaN NaN NaN NaN NaN NaN NaN
Facebook NaN NaN NaN NaN NaN NaN NaN NaN
for k,v in transition_prob.items():
    origin, dest = k.split('>')
    transition_matrix.at[origin, dest] = v
transition_matrix.at['Conversion','Conversion'] = 1.0
transition_matrix.at['Null','Null'] = 1.0
transition_matrix=transition_matrix.fillna(0)
transition_matrix
Conversion Null Online Video Instagram Paid Search Online Display Start Facebook
Conversion 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0 0.000000
Null 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0 0.000000
Online Video 0.078146 0.763757 0.000000 0.031908 0.047947 0.018920 0 0.059323
Instagram 0.057979 0.629786 0.024118 0.000000 0.045809 0.023531 0 0.218777
Paid Search 0.053309 0.770779 0.029221 0.034196 0.000000 0.048053 0 0.064442
Online Display 0.050324 0.756643 0.017299 0.029363 0.092386 0.000000 0 0.053986
Start 0.000000 0.000000 0.142361 0.119188 0.317399 0.142644 0 0.278408
Facebook 0.053220 0.673123 0.025577 0.173072 0.050655 0.024352 0 0.000000
sns.heatmap(data=transition_matrix, annot=True, cmap='Greens')
<AxesSubplot:>
../../_images/09_channel_attribution_35_1.png

Assessing Removal Effect

A Markov graph can be used to measure the importance of each campaign by calculating what is known as the Removal Effect. A campaign’s effectiveness is determined by removing it from the graph and simulating buyer journeys to measure the change in success rate without it in place. Removal Effect is a proxy for weight, and it’s calculated for each campaign in the Markov graph.

Using Removal Effect for marketing attribution is the final piece of the puzzle. To calculate each campaign’s attribution value we can use the following formula: A = V * (Rt / Rv)

  • A = Campaign’s attribution value

  • V = Total value to divide. For example, the total USD value of all successful buyer journeys used as input to the Markov model

  • Rt = Campaign’s Removal Effect

  • Rv = Sum of all Removal Effect values

# channels = [col for col in transition_matrix.columns.tolist() 
#             if col not in ['Start','Null', 'Conversion']]
# channels

# removal_effect_dict = {}
# chnl = channels[0]
# removal_df = transition_matrix.drop(chnl, axis=0).drop(chnl,  axis=1)
# sel_col = removal_df.columns[2]; sel_col
# row_sum=np.sum(removal_df.loc[sel_col])
# removal_df.loc[sel_col,'Null']= 1.0 - row_sum
# removal_to_conv = removal_df[['Null', 'Conversion']].drop(['Null', 'Conversion'], axis=0)
# removal_to_non_conv = removal_df.drop(['Null', 'Conversion'], axis=1).drop(['Null', 'Conversion'], axis=0)
# removal_inv_diff = np.linalg.inv(np.identity(len(removal_to_non_conv.columns)) - np.asarray(removal_to_non_conv))
# removal_inv_diff
# removal_dot_prod = removal_inv_diff@np.asarray(removal_to_conv)
# removal_cvr_df = pd.DataFrame(removal_dot_prod, index=removal_to_conv.index)
# removal_cvr = removal_cvr_df[[1]].loc['Start'].values[0]
# removal_effect = 1 -removal_cvr/base_conversion_rate

# removal_effect_dict[chnl] = removal_effect


def get_removal_effect_dict(transition_matrix, base_conversion_rate):
    """Compute the removal effect of every channel in a Markov transition matrix.

    For each channel (every column label other than 'Start', 'Null' and
    'Conversion') the channel is removed from the chain, the probability that
    used to flow into it is redirected to 'Null', and the probability of
    reaching 'Conversion' from 'Start' is recomputed. The removal effect is
    ``1 - removal_conversion_rate / base_conversion_rate``.

    Parameters
    ----------
    transition_matrix : pandas.DataFrame
        Square matrix of transition probabilities whose row and column labels
        are the states; must contain 'Start', 'Null' and 'Conversion'. Any
        dtype convertible to float is accepted. The frame is not modified.
    base_conversion_rate : float
        Conversion rate of the full chain, used to normalise the result.

    Returns
    -------
    dict
        Maps each channel name to its removal effect.

    Raises
    ------
    ValueError
        If ``base_conversion_rate`` is zero (the ratio would be undefined).
    """
    if base_conversion_rate == 0:
        raise ValueError("base_conversion_rate must be non-zero")

    absorbing = ['Null', 'Conversion']
    # A matrix filled cell by cell can still be object dtype, which the
    # numpy linear-algebra routines reject; work on a float copy.
    matrix = transition_matrix.astype(float)
    channels = [col for col in matrix.columns.tolist()
                if col not in ['Start', 'Null', 'Conversion']]

    removal_effect_dict = {}
    for chnl in channels:
        removal_df = matrix.drop(index=chnl, columns=chnl)
        # Each row lost the mass that went to the removed channel; add that
        # missing mass to 'Null' so every row sums to 1 again.
        removal_df['Null'] = removal_df['Null'] + (1.0 - removal_df.sum(axis=1))

        # Select by label so rows and columns of Q are guaranteed to line up.
        transient = [state for state in removal_df.index if state not in absorbing]
        q_matrix = removal_df.loc[transient, transient].to_numpy()
        to_conversion = removal_df.loc[transient, 'Conversion'].to_numpy()

        # Absorption probabilities into 'Conversion': solve (I - Q) x = r
        # rather than forming the explicit inverse.
        absorb_prob = np.linalg.solve(np.identity(len(transient)) - q_matrix,
                                      to_conversion)
        removal_cvr = pd.Series(absorb_prob, index=transient)['Start']

        removal_effect_dict[chnl] = 1 - removal_cvr / base_conversion_rate

    return removal_effect_dict
    
removal_effect_dict= get_removal_effect_dict(transition_matrix, base_conversion_rate)
removal_effect_dict
{'Online Video': 0.20691411655642178,
 'Instagram': 0.21731366149038456,
 'Paid Search': 0.3311037560086154,
 'Online Display': 0.15435482356041275,
 'Facebook': 0.3547597674182721}
# removal_df.loc[sel_col],  sel_col # first row,
# removal_df
total_conversions
def markov_chain_allocations(removal_effect_dict, total_conversions):
    """Split the total conversions across channels by relative removal effect.

    Parameters
    ----------
    removal_effect_dict : dict
        Maps channel name to its removal effect.
    total_conversions : number
        Total number of conversions to distribute.

    Returns
    -------
    dict
        Maps each channel to ``(effect / sum of all effects) * total_conversions``.
    """
    effect_total = np.sum(list(removal_effect_dict.values()))
    shares = {}
    for channel, effect in removal_effect_dict.items():
        weight = effect / effect_total
        shares[channel] = weight * total_conversions
    return shares

allocations = markov_chain_allocations(removal_effect_dict, total_conversions)
allocations
{'Online Video': 2886.4480895461456,
 'Instagram': 3031.521548555893,
 'Paid Search': 4618.891257291356,
 'Online Display': 2153.2469267590823,
 'Facebook': 4948.892177847523}

# Bar chart of the attributed conversions per channel; sort_index() orders the
# bars by channel name, and reset_index() yields the 'index' / 0 columns used below.
ax = sns.catplot(data=pd.Series(allocations).sort_index().to_frame().reset_index(),
            x='index', y=0, kind='bar')
# NOTE(review): sns.catplot returns a FacetGrid rather than a matplotlib Axes, so
# the call below would presumably fail if uncommented; FacetGrid.set_xticklabels(rotation=45)
# looks like the intended replacement - confirm before enabling.
# ax.xticks(rotation = 45)
../../_images/09_channel_attribution_41_0.png