Contents

Naive Bayes

Contents

Naive Bayes#

Imports#

from fastcore.all import *
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import rich
from rich.console import Console
import nltk
from nltk.corpus import twitter_samples
import re                                  # library for regular expression operations
import string                              # for string operations
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
import string
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms

sns.set()
console = Console()

console.print("Hello Naive Bayes", style='red')

Hello Naive Bayes

Download Dataset and Read Dataset#

nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/rahul.saraf/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rahul.saraf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

True

ptweets = twitter_samples.strings('positive_tweets.json')
ntweets = twitter_samples.strings('negative_tweets.json')

df = pd.DataFrame({'positive':ptweets, 'negative':ntweets}).unstack().reset_index().drop(columns=['level_1']).rename(columns={'level_0':'class', 0:'tweets'})
df.head()

	class	tweets
0	positive	#FollowFriday @France_Inte @PKuchly57 @Milipol...
1	positive	@Lamb2ja Hey James! How odd :/ Please call our...
2	positive	@DespiteOfficial we had a listen last night :)...
3	positive	@97sides CONGRATS :)
4	positive	yeaaaah yippppy!!! my accnt verified rqst has...

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   10000 non-null  object
 1   tweets  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB

Preprocessing#

tweet = df.loc[2277, 'tweets']; tweet

'My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i'

Clean & Stem Tweet#

def remove_old_style(tweet):
    """Strip a leading old-style retweet marker ("RT" followed by whitespace)."""
    cleaned = re.sub(pattern=r'^RT[\s]+', repl='', string=tweet)
    return cleaned
def remove_url(tweet):
    """Delete every http:// or https:// link (up to the next whitespace) from the tweet."""
    without_links = re.sub(pattern=r'https?://[^\s\n\r]+', repl='', string=tweet)
    return without_links
def remove_hash(tweet):
    """Drop every '#' character, keeping the hashtag text itself."""
    without_hashes = re.sub(pattern=r'#', repl="", string=tweet)
    return without_hashes
# Tweet-aware tokenizer; per its arguments it does not preserve case, strips
# @handles and shortens exaggerated character runs (the processed sample tweet
# shown in this notebook comes out lower-cased).
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
# Tokens to discard: NLTK's English stop words plus each ASCII punctuation character.
skip_words = stopwords.words('english')+list(string.punctuation)
# Porter stemmer applied to the tokens that survive the skip-word filter.
stemmer = PorterStemmer() 
def filter_stem_tokens(tweet_tokens, skip_words=skip_words, stemmer=stemmer):
    """Drop tokens found in ``skip_words`` and stem the ones that remain.

    Args:
        tweet_tokens: Iterable of tokens for one tweet.
        skip_words: Collection of tokens to discard (defaults to the
            notebook-level ``skip_words``).
        stemmer: Object exposing ``stem(token)`` (defaults to the
            notebook-level ``stemmer``).

    Returns:
        List of stemmed tokens, in their original order.
    """
    kept = []
    for tok in tweet_tokens:
        if tok in skip_words:
            continue
        kept.append(stemmer.stem(tok))
    return kept

process_tweet = compose(remove_old_style, remove_url, remove_hash, tokenizer.tokenize, filter_stem_tokens)
process_tweet(tweet)
# skip_words

['beauti',
 'sunflow',
 'sunni',
 'friday',
 'morn',
 ':)',
 'sunflow',
 'favourit',
 'happi',
 'friday',
 '…']

df['Ptweets'] = df['tweets'].apply(process_tweet)
# df['Ptweets_join'] = df['Ptweets'].apply(lambda row: u" ".join(row))

u':)'

':)'

tweet = df.loc[2277, "Ptweets"]
def check_token(tweet, token):
    """Return True when ``token`` occurs in ``tweet``, otherwise False.

    Args:
        tweet: Container of tokens (the notebook passes the token lists stored
            in the ``Ptweets`` column).
        token: Token to look for.

    Returns:
        bool: result of the membership test ``token in tweet``.
    """
    # ``in`` already yields a bool, so no if/else wrapper is needed.
    return token in tweet

token = ":)"
df[df['Ptweets'].apply(lambda row: check_token(row, token))]

	class	tweets	Ptweets
0	positive	#FollowFriday @France_Inte @PKuchly57 @Milipol...	[followfriday, top, engag, member, commun, wee...
1	positive	@Lamb2ja Hey James! How odd :/ Please call our...	[hey, jame, odd, :/, pleas, call, contact, cen...
2	positive	@DespiteOfficial we had a listen last night :)...	[listen, last, night, :), bleed, amaz, track, ...
3	positive	@97sides CONGRATS :)	[congrat, :)]
4	positive	yeaaaah yippppy!!! my accnt verified rqst has...	[yeaaah, yipppi, accnt, verifi, rqst, succeed,...
...	...	...	...
4996	positive	@RachelLiskeard Thanks for the shout-out :) It...	[thank, shout-out, :), great, aboard]
4997	positive	@side556 Hey! :) Long time no talk...	[hey, :), long, time, talk, ...]
4998	positive	@staybubbly69 as Matt would say. WELCOME TO AD...	[matt, would, say, welcom, adulthood, ..., :)]
6736	negative	@Israelgirly They sure do, esp now when ppl ar...	[sure, esp, ppl, talk, crap, milli, >:(, i'll,...
7244	negative	@wtfxmbs AMBS please it's harry's jeans :)):):):(	[amb, pleas, harry', jean, :), ):, ):, ):]

3543 rows × 3 columns

# df[df['Ptweets_join'].str.contains

Creating Frequency Dataframe#

len(df['Ptweets'].sum())

len(set(df['Ptweets'].sum()))

def get_word_count(token, data=None):
    """Count how many tweets of each class contain ``token``.

    Args:
        token: Processed token to look up.
        data: Frame with a ``Ptweets`` column (token lists) and a ``class``
            column. When omitted, the notebook-level ``df`` is used so that
            existing one-argument calls keep working.

    Returns:
        dict: ``{'word': token, 'positive': n_pos, 'negative': n_neg}``. The
        counts are numbers of tweets, so a tweet that repeats the token is
        counted once.
    """
    if data is None:
        data = df
    contains = data['Ptweets'].apply(lambda tokens: token in tokens)
    d = data[contains]['class'].value_counts().to_dict()
    return {'word': token, 'positive':d.get('positive',0), 'negative':d.get('negative', 0)}

def build_freqs(df):
    """Build the per-word frequency / log-likelihood table for ``df``.

    Counts are taken only from the frame passed in. Previously they were read
    from the notebook-global ``df``, so a table built for the training split
    also saw the test tweets.

    Args:
        df: Frame with a ``class`` column ('positive' / 'negative') and a
            ``Ptweets`` column holding each tweet's list of processed tokens.

    Returns:
        DataFrame indexed by ``word`` (sorted) with columns:
        ``positive`` / ``negative`` - number of tweets of that class
        containing the word; ``log_pos_prob`` / ``log_neg_prob`` - Laplace
        smoothed log probabilities; ``lambda`` - their difference.
    """
    from collections import Counter

    pos_counts, neg_counts = Counter(), Counter()
    vocab = set()
    # Single pass over the tweets instead of one full scan per vocabulary word.
    for label, tokens in zip(df['class'], df['Ptweets']):
        unique_tokens = set(tokens)  # a tweet contributes at most 1 per word
        vocab.update(unique_tokens)
        if label == 'positive':
            pos_counts.update(unique_tokens)
        elif label == 'negative':
            neg_counts.update(unique_tokens)

    words = sorted(vocab)  # deterministic row order
    df_freqs = pd.DataFrame(
        {
            'positive': [pos_counts[w] for w in words],
            'negative': [neg_counts[w] for w in words],
        },
        index=pd.Index(words, name='word'),
        dtype='int64',
    )
    # Laplace smoothing: (count + 1) / (class total + vocabulary size)
    V = df_freqs.shape[0]
    df_freqs['log_pos_prob'] = np.log((df_freqs['positive']+1)/(df_freqs['positive'].sum()+V))
    df_freqs['log_neg_prob'] = np.log((df_freqs['negative']+1)/(df_freqs['negative'].sum()+V))
    df_freqs['lambda'] = df_freqs['log_pos_prob'] - df_freqs['log_neg_prob']
    return df_freqs

df_freqs = build_freqs(df)
# np.log(df_freqs['pos_prob'])

# good_keys = df.index.intersection()
# df_freqs.loc[good_keys]
l = df_freqs.head().index.tolist()
l.append("lalala")
df_freqs.loc[df_freqs.index.intersection(l)]

	positive	negative	log_pos_prob	log_neg_prob	lambda
word
sweden	0	1	-10.688279	-9.972150	-0.716129
jackson	0	3	-10.688279	-9.279003	-1.409276
gl	1	0	-9.995132	-10.665298	0.670166
shake	2	1	-9.589667	-9.972150	0.382484
hee	1	0	-9.995132	-10.665298	0.670166

Extract Features from tweet#

tweet = df.loc[2277, "Ptweets"]
l = df_freqs.loc[tweet].sum().tolist()
l.append(1)
l

[4174.0, 119.0, -76.26418672683839, -96.03713892386281, 19.77295219702441, 1]

def score_tweet(tweet, df_freqs):
    """Sum the frequency-table rows of a tweet's known tokens and append a bias of 1.

    Args:
        tweet: List of processed tokens.
        df_freqs: Frequency table indexed by word.

    Returns:
        list: one column-wise sum per ``df_freqs`` column, followed by ``1``.
    """
    # Only tokens present in the table are looked up; unknown tokens are
    # skipped, and the index intersection keeps each distinct token once.
    known_tokens = df_freqs.index.intersection(tweet)
    column_totals = df_freqs.loc[known_tokens].sum()
    return [*column_totals.tolist(), 1]
tweet, score_tweet(tweet, df_freqs)
# df_freqs.loc[]

(['beauti',
  'sunflow',
  'sunni',
  'friday',
  'morn',
  ':)',
  'sunflow',
  'favourit',
  'happi',
  'friday',
  '…'],
 [4063.0,
  107.0,
  -60.290305886425145,
  -77.27149318108225,
  16.981187294657104,
  1])

# NOTE: this is a data leak - the frequency table and the scores should be built from train_df only.
df['positive'], df['negative'],df['log_pos_prob'], df['log_neg_prob'], df['lambda'] , df['bias']=zip(*df['Ptweets'].map(lambda row : score_tweet(row, df_freqs)))

df['sentiment'] = 0
df.loc[df['class']=='positive', 'sentiment'] = 1

df.head()

	class	tweets	Ptweets	positive	negative	log_pos_prob	log_neg_prob	lambda	bias	sentiment
0	positive	#FollowFriday @France_Inte @PKuchly57 @Milipol...	[followfriday, top, engag, member, commun, wee...	3737.0	69.0	-47.021071	-64.579054	17.557983	1	1
1	positive	@Lamb2ja Hey James! How odd :/ Please call our...	[hey, jame, odd, :/, pleas, call, contact, cen...	4448.0	473.0	-107.276901	-116.195717	8.918815	1	1
2	positive	@DespiteOfficial we had a listen last night :)...	[listen, last, night, :), bleed, amaz, track, ...	3728.0	159.0	-58.478652	-67.157334	8.678683	1	1
3	positive	@97sides CONGRATS :)	[congrat, :)]	3562.0	4.0	-10.113069	-19.133371	9.020302	1	1
4	positive	yeaaaah yippppy!!! my accnt verified rqst has...	[yeaaah, yipppi, accnt, verifi, rqst, succeed,...	3878.0	273.0	-129.201531	-141.211416	12.009885	1	1

df_freqs.sum()

positive         33332.000000
negative         32336.000000
log_pos_prob   -104991.038240
log_neg_prob   -104871.983649
lambda            -119.054592
dtype: float64

Modeling#

# Rebuild the frame from the raw tweets, replacing the earlier `df` and the
# extra columns that were assigned to it above.
df = pd.DataFrame({'positive':ptweets, 'negative':ntweets}).unstack().reset_index().drop(columns=['level_1']).rename(columns={'level_0':'class', 0:'tweets'})
df['Ptweets'] = df['tweets'].apply(process_tweet)
# Positional split: the frame has 10000 rows with the positive tweets first
# (presumably rows 0-4999 positive, 5000-9999 negative - confirm), so this
# takes the first 4000 rows of each half for training and the last 1000 of
# each half for testing. No shuffling is done.
train_df = pd.concat([df[:4000],df[5000:9000]])
test_df =  pd.concat([df[4000:5000],df[9000:10000]])

df_freqs = build_freqs(train_df)
df_freqs

	positive	negative	log_pos_prob	log_neg_prob	lambda
word
sweden	0	1	-10.638928	-9.923462	-0.715466
jackson	0	3	-10.638928	-9.230315	-1.408613
gl	1	0	-9.945780	-10.616609	0.670828
shake	2	1	-9.540315	-9.923462	0.383146
hee	1	0	-9.945780	-10.616609	0.670828
...	...	...	...	...	...
control	1	2	-9.945780	-9.517997	-0.427784
590	0	1	-10.638928	-9.923462	-0.715466
who'	9	7	-8.336343	-8.537167	0.200825
school'	1	0	-9.945780	-10.616609	0.670828
ladygaga	0	1	-10.638928	-9.923462	-0.715466

9162 rows × 5 columns

bd = train_df['class'].value_counts().to_dict()
bias = np.log(bd['positive']/bd['negative'])
bias

0.0

train_df['positive'], train_df['negative'], train_df['log_pos_prob'], train_df['log_neg_prob'], train_df['lambda'] , train_df['bias']=zip(*train_df['Ptweets'].map(lambda row : score_tweet(row, df_freqs)))

train_df['prediction'] = train_df['lambda']+bias > 0
train_df['actual'] = train_df['class'] == 'positive'
(train_df['actual'] == train_df['prediction']).mean() # accuracy

0.999

test_df['positive'], test_df['negative'], test_df['log_pos_prob'], test_df['log_neg_prob'], test_df['lambda'] , test_df['bias']=zip(*test_df['Ptweets'].map(lambda row : score_tweet(row, df_freqs)))
test_df['prediction'] = test_df['lambda']+bias > 0
test_df['actual'] = test_df['class'] == 'positive'
(test_df['actual'] == test_df['prediction']).mean() # accuracy

0.9985

Visualization#

sns.scatterplot(data=train_df, x='log_pos_prob', y='log_neg_prob', hue='class')

<AxesSubplot:xlabel='log_pos_prob', ylabel='log_neg_prob'>

../../_images/02_naive_bayes_40_1.png

Confidence Ellipse#

data_pos = train_df[train_df['class']=='positive']
data_neg = train_df[train_df['class']=='negative']

x = data_pos['log_pos_prob']
y = data_pos['log_neg_prob']
n_std=3.0
cov_mat= np.cov(x,y)
cov_mat

array([[832.64706392, 866.45359717],
       [866.45359717, 911.72992453]])

pearson = cov_mat[0,1]/np.sqrt(cov_mat[0,0]*cov_mat[1,1])
pearson

0.994447194605905

ell_radius_x = np.sqrt(1+pearson)
ell_radius_y = np.sqrt(1-pearson)
ell_radius_x, ell_radius_y

(1.4122489846361743, 0.07451714832234908)

scale_x = np.sqrt(cov_mat[0,0])*n_std;  mean_x = np.mean(x)
scale_x, mean_x

(86.56687342915261, -45.98846279441421)

scale_y = np.sqrt(cov_mat[1,1])*n_std;  mean_y = np.mean(y)
scale_y, mean_y

(90.58459759147846, -55.61467522963381)

def calc_ellipses_data(x,y, n_std=3.0):
    """Compute the parameters consumed by ``draw_ellipse`` for one point cloud.

    Args:
        x, y: Equal-length numeric sequences (the notebook passes the
            ``log_pos_prob`` / ``log_neg_prob`` columns).
        n_std: Number of standard deviations the ellipse should span.

    Returns:
        tuple: ``(ell_radius_x, scale_x, mean_x, ell_radius_y, scale_y, mean_y)``
        where the radii are ``sqrt(1 +/- pearson)``, the scales are the
        per-axis standard deviations times ``n_std`` and the means are the
        cloud's centre.
    """
    covariance = np.cov(x, y)
    var_x, var_y = covariance[0, 0], covariance[1, 1]
    # Pearson correlation coefficient from the covariance matrix.
    pearson = covariance[0, 1] / np.sqrt(var_x * var_y)
    radius_x, radius_y = np.sqrt(1 + pearson), np.sqrt(1 - pearson)
    spread_x = np.sqrt(var_x) * n_std
    spread_y = np.sqrt(var_y) * n_std
    centre_x, centre_y = np.mean(x), np.mean(y)
    return radius_x, spread_x, centre_x, radius_y, spread_y, centre_y
calc_ellipses_data(x,y)

(1.4122489846361743,
 86.56687342915261,
 -45.98846279441421,
 0.07451714832234908,
 90.58459759147846,
 -55.61467522963381)

def draw_ellipse(data, ax, facecolor='None', **kwargs):
    """Add an ellipse described by ``calc_ellipses_data`` output to ``ax``.

    Args:
        data: 6-tuple ``(ell_radius_x, scale_x, mean_x, ell_radius_y,
            scale_y, mean_y)``.
        ax: Matplotlib axes to draw on.
        facecolor: Fill colour of the patch (``'None'`` leaves it unfilled).
        **kwargs: Forwarded to ``matplotlib.patches.Ellipse``.

    Returns:
        The value returned by ``ax.add_patch`` for the new ellipse.
    """
    radius_x, scale_x, mean_x, radius_y, scale_y, mean_y = data
    patch = Ellipse(
        (0, 0),
        width=radius_x * 2,
        height=radius_y * 2,
        facecolor=facecolor,
        **kwargs,
    )
    # The ellipse is defined at the origin, then rotated by 45 degrees,
    # stretched per axis and finally moved to the cloud's mean.
    placement = transforms.Affine2D()
    placement.rotate_deg(45)
    placement.scale(scale_x, scale_y)
    placement.translate(mean_x, mean_y)
    patch.set_transform(placement + ax.transData)
    return ax.add_patch(patch)

def plot_naive_bayes(df, ax=None, title='data'):
    """Scatter ``log_pos_prob`` vs ``log_neg_prob`` with per-class confidence ellipses.

    Args:
        df: Scored frame with ``class``, ``log_pos_prob`` and ``log_neg_prob``
            columns.
        ax: Axes to draw on; a new 11.7 x 8.27 inch figure is created when None.
        title: Title set on the axes.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(11.7, 8.27))
    # Plot the frame that was passed in. The scatter used to read the global
    # ``train_df``, so the "Test" figure showed training points.
    sns.scatterplot(data=df, x='log_pos_prob', y='log_neg_prob', hue='class', ax=ax)

    # (class, edge colour, legend label keyed by n_std). Each sigma level is
    # labelled on one ellipse only, so the legend has no duplicate entries.
    ellipse_specs = (
        ('positive', 'black', {2: r'$2\sigma$'}),
        ('negative', 'red', {3: r'$3\sigma$'}),
    )
    for class_name, color, legend_labels in ellipse_specs:
        subset = df[df['class'] == class_name]
        x = subset['log_pos_prob']
        y = subset['log_neg_prob']
        # 2-sigma ellipse is dotted, 3-sigma ellipse uses the default solid line.
        for n_std, line_style in ((2, {'linestyle': ':'}), (3, {})):
            style = dict(line_style, edgecolor=color)
            if n_std in legend_labels:
                style['label'] = legend_labels[n_std]
            draw_ellipse(calc_ellipses_data(x, y, n_std=n_std), ax, **style)

    ax.legend()
    ax.set_title(title)
    return ax

plot_naive_bayes(train_df, title='Train')
plot_naive_bayes(test_df, title='Test')

<AxesSubplot:title={'center':'Test'}, xlabel='log_pos_prob', ylabel='log_neg_prob'>

../../_images/02_naive_bayes_50_1.png

../../_images/02_naive_bayes_50_2.png