# Multi Arm Bandit Problem#

## Introduction#

Row of slot machines with different probabilities of paying off? Which ones should you play often and how often?

• Exploit vs Explore

• Application Areas

• Model for A/B Testing: Ad someone clicks or doesnot

• Medical Diagnosis: Well known treatment or new treatment

• Diseases Epidemic :

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import empiricaldist
from empiricaldist import Pmf, Distribution
from ipywidgets import interact, interactive, fixed


## Prior#

def decorate_bandit(title):
"""
Labels the axes
title: string
"""
plt.xlabel('Probability of winning')
plt.ylabel('PMF')
plt.title(title)

bandit = Pmf.from_seq(range(101))
bandit.plot()
decorate_bandit(title="Prior Distribution")

• We are assuming uniform prior distribution here for probability

def likelihood_bandit(data, hypo):
x = hypo/100
if data == "W":
return x
else:
return 1-x

actual_probs = [0.1, 0.2, 0.3, 0.4]

def posterior(n_w=1, n_l=9):
bandit = Pmf.from_seq(range(101))
outcomes = 'W'*n_w+"L"*n_l
bandit.plot(color='steelblue', label='Prior', linestyle="--")
for data in outcomes:
bandit.update(likelihood_bandit, data)
bandit.plot(color='steelblue', label='Posterior')
plt.legend()
decorate_bandit(title="Prior vs Posterior")

interactive(posterior, n_w=(0,10), n_l=(0,10))


## Simulate Machines Based on Given Probabilities#

from random import random
from collections import Counter

counter = Counter()

def flip(p):
return random()<p

def play(i):
counter[i] += 1
p = actual_probs[i]
if flip(p):
return 'W'
else:
return 'L'

play(1), play(2), play(3), play(0)

('W', 'L', 'W', 'W')


### Playing machines 20 times#

results = []
for i in range(20):
single  = []
for j in range(4):
#         print(i,j)
single.append(play(j))
results.append(single)

counter[4]

0

list(range(4))

[0, 1, 2, 3]

results

[['L', 'L', 'L', 'L'],
['L', 'W', 'L', 'W'],
['L', 'W', 'L', 'L'],
['L', 'W', 'W', 'L'],
['L', 'W', 'L', 'W'],
['L', 'L', 'L', 'L'],
['L', 'L', 'W', 'L'],
['L', 'W', 'W', 'L'],
['L', 'L', 'L', 'W'],
['W', 'W', 'L', 'L'],
['L', 'L', 'L', 'L'],
['L', 'L', 'W', 'L'],
['W', 'W', 'W', 'L'],
['L', 'W', 'L', 'L'],
['L', 'W', 'W', 'W'],
['L', 'W', 'L', 'W'],
['L', 'W', 'W', 'L'],
['W', 'L', 'L', 'L'],
['L', 'L', 'L', 'L'],
['L', 'L', 'L', 'L']]

prior = range(101)

beliefs = [Pmf.from_seq(prior) for i in range(4)]
beliefs

[0      0.009901
1      0.009901
2      0.009901
3      0.009901
4      0.009901
...
96     0.009901
97     0.009901
98     0.009901
99     0.009901
100    0.009901
Length: 101, dtype: float64,
0      0.009901
1      0.009901
2      0.009901
3      0.009901
4      0.009901
...
96     0.009901
97     0.009901
98     0.009901
99     0.009901
100    0.009901
Length: 101, dtype: float64,
0      0.009901
1      0.009901
2      0.009901
3      0.009901
4      0.009901
...
96     0.009901
97     0.009901
98     0.009901
99     0.009901
100    0.009901
Length: 101, dtype: float64,
0      0.009901
1      0.009901
2      0.009901
3      0.009901
4      0.009901
...
96     0.009901
97     0.009901
98     0.009901
99     0.009901
100    0.009901
Length: 101, dtype: float64]

options = dict(xticklabels='invisible', yticklabels='invisible')

def plot(beliefs,label_pre='Prior',**options):
sns.set_context('paper')
for i, b in enumerate(beliefs):
plt.subplot(2,2, i+1, label=f"{label_pre}{i}")
b.plot(label=f"Machine {i}")
plt.gca().set_yticklabels([])
plt.legend()
plt.tight_layout()
sns.set_context('talk')

plot(beliefs)

def update(beliefs, i, outcome):
beliefs[i].update(likelihood_bandit, outcome)

prior = range(101)

counter = Counter()

def flip(p):
return random()<p

def play(i):
counter[i] += 1
p = actual_probs[i]
if flip(p):
return 'W'
else:
return 'L'

def update(beliefs, i, outcome):
beliefs[i].update(likelihood_bandit, outcome)

beliefs = [Pmf.from_seq(prior) for i in range(4)]
# beliefs
plot(beliefs, label_pre='Prior')
for i in range(20):
for j in range(4):
update(beliefs, j, play(j))

plot(beliefs, label_pre='Posterior')

for i, b in enumerate(beliefs):
print(f"{b.mean():0.02f}", b.credible_interval(0.9))

9.12 [ 2. 21.]
22.73 [10. 38.]
50.00 [33. 67.]
27.27 [13. 44.]


## Bayesian Bandit#

• Idea is to choose best course of action while running the experiment/ simulation

• Choice internally call np.random.choice on quantities

# def choose(beliefs):
#     ps = []

beliefs[3].choice()

40

# Pmf.choice?

beliefs[3].qs

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
91,  92,  93,  94,  95,  96,  97,  98,  99, 100])

ps = [b.choice() for b in beliefs]
ps, np.argmax(ps)

([0, 11, 21, 40], 3)

def choose(beliefs):
ps = [b.choice() for b in beliefs]
return np.argmax(ps)

choose(beliefs)

3

def choose_play_update(beliefs, verbose=False):
machine = choose(beliefs)

outcome = play(machine)

update(beliefs,machine,outcome)

if verbose:
print(machine, outcome, beliefs[machine].mean())

choose_play_update(beliefs, verbose=True)

3 W 42.85714285714286

prior = range(101)
beliefs = [Pmf.from_seq(prior) for i in range(4)]
counter = Counter()

num_plays = 200
for i in range(num_plays):
choose_play_update(beliefs)

plot(beliefs)

for i,b in enumerate(beliefs):
print(b.mean(), b.credible_interval(0.9))

15.000082130288831 [ 4. 30.]
18.181782304462693 [ 7. 33.]
21.2121212181695 [11. 34.]
33.834586466165426 [27. 41.]

for machine, count in sorted(counter.items()):
print(machine , count)

0 18
1 20
2 31
3 131