Logistic Regression#


from fastcore.all import *
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import nltk
import re
import string
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk.corpus import twitter_samples
from rich.console import Console
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
console = Console()

Download Dataset#

# Load the raw tweet texts of each polarity from NLTK's twitter_samples corpus.
# NOTE(review): this presumably needs the corpus to be available locally
# (e.g. via nltk.download('twitter_samples')) — confirm in the setup cell.
ptweets = twitter_samples.strings('positive_tweets.json')
ntweets = twitter_samples.strings('negative_tweets.json')
# Cell output: number of positive and negative tweets loaded.
len(ptweets), len(ntweets)
(5000, 5000)
# Show one random tweet of each polarity.
# random.choice draws a valid element directly; the previous
# random.randint(0, 5000) is inclusive at both ends and could produce index
# 5000, which is out of range for a 5000-element list.
console.print(random.choice(ptweets), style='green')
console.print(random.choice(ntweets), style='red')
Hi BAM ! @BarsAndMelody 
Can you follow my bestfriend @969Horan696 ? 
She loves you a lot :) 
See you in Warsaw &lt;3 
Love you &lt;3 x23
Mtaani tunaita pussy viazi choma and we still get laid :-(


What are we going to do?

  1. Remove hyperlinks, twitter marks and styles

  2. Tokenize

  3. Remove Stopwords

  4. Stemming

Feature Engineering#

Building Frequency Dictionary#


def contains_tok(tweet_tokens, tok):
    """Tell whether ``tok`` is a member of ``tweet_tokens``.

    Args:
        tweet_tokens: Container that supports the ``in`` operator.
        tok: Item to look for.

    Returns:
        ``True`` if ``tok in tweet_tokens``, ``False`` otherwise.
    """
    return tok in tweet_tokens

# Keep only the tweets whose token list contains toks[1], then count those
# tweets per class and return the counts as a plain dict.
# NOTE(review): `toks` is only assigned in a later cell of this file, so this
# cell relies on out-of-order notebook execution.
df[df.apply(lambda row: contains_tok(row['PTweet'], toks[1]), axis=1)]['class'].value_counts().to_dict()
# contains_tok(df.loc[0,'PTweet'], toks[0])
{'positive': 32, 'negative': 6}
# Vocabulary: the distinct tokens over all processed tweets.
toks = list(set(df['PTweet'].sum()))
# toks[:10]
# Preview of the frequency table for the first ten vocabulary tokens: one row
# per token, one column per class, each cell the number of tweets of that
# class containing the token. The opening line of this expression was missing;
# it is restored to match the same expression inside build_freqs.
pd.DataFrame([{'word': tok,
               **df[df.apply(lambda row: contains_tok(row['PTweet'], tok), axis=1)]['class'].value_counts().to_dict()}
              for tok in toks[:10]]).fillna(0).set_index('word')
positive negative
children 3.0 2.0
latin 3.0 0.0
bilal 0.0 1.0
leno 0.0 1.0
savag 1.0 0.0
hyung 0.0 1.0
braxton 0.0 1.0
statement 1.0 1.0
convinc 0.0 3.0
therefor 0.0 1.0
def build_freqs(df):
    """Count, for every token, how many tweets of each class contain it.

    The previous implementation filtered the whole frame with a row-wise
    ``apply`` once per vocabulary token (vocabulary size x number of tweets
    Python calls). This version computes the same counts in a single pass
    with ``explode`` and ``crosstab``.

    Args:
        df: DataFrame with a ``'PTweet'`` column holding a list of tokens per
            tweet and a ``'class'`` column holding the tweet's label.

    Returns:
        DataFrame indexed by ``'word'`` with one float column per class
        label. Each cell is the number of tweets of that class containing the
        word; a token repeated inside one tweet counts that tweet once.
        Columns follow the order in which the labels first appear among the
        tweets that have tokens; rows are sorted by word.
    """
    # One row per (tweet, token) occurrence, tagged with a positional tweet
    # id so repeated tokens inside one tweet can be collapsed.
    pairs = (
        df[['PTweet', 'class']]
        .reset_index(drop=True)
        .explode('PTweet')
        .dropna(subset=['PTweet'])  # tweets with no tokens explode to NaN
        .rename(columns={'PTweet': 'word'})
        .rename_axis('tweet_id')
        .reset_index()
        .drop_duplicates(subset=['tweet_id', 'word'])
    )

    # Deterministic column order instead of one that depends on set iteration.
    class_order = list(pairs['class'].dropna().unique())

    freqs = pd.crosstab(pairs['word'], pairs['class']).astype(float)
    freqs = freqs[class_order]
    freqs.columns.name = None  # match the plain, unnamed column axis
    return freqs
# Build the word-frequency table for the whole frame and preview its first rows.
df_freq=build_freqs(df); df_freq.head()
positive negative
children 3.0 2.0
latin 3.0 0.0
bilal 0.0 1.0
leno 0.0 1.0
savag 1.0 0.0
positive negative
count 10507.000000 10507.000000
mean 3.172361 3.077567
std 37.991689 44.787129
min 0.000000 0.000000
25% 0.000000 0.000000
50% 1.000000 1.000000
75% 1.000000 1.000000
max 3541.000000 4422.000000
# Rank words by their 'positive' count, highest first.
df_freq.sort_values(by='positive', ascending=False)
positive negative
:) 3541.0 2.0
:-) 669.0 0.0
thank 636.0 105.0
:d 628.0 0.0
follow 365.0 169.0
... ... ...
💎 0.0 1.0
gate 0.0 1.0
goodmus 0.0 4.0
322 0.0 1.0
3a2ad 0.0 1.0

10507 rows × 2 columns

# Rank words by their 'negative' count, highest first.
df_freq.sort_values(by='negative', ascending=False)
positive negative
:( 1.0 4422.0
:-( 0.0 481.0
i'm 173.0 318.0
miss 27.0 296.0
... 253.0 284.0
... ... ...
swasa 1.0 0.0
soph 1.0 0.0
ef 1.0 0.0
cocoar 1.0 0.0
kw 2.0 0.0

10507 rows × 2 columns

Scoring Tweets#

{'positive': 3737.0, 'negative': 69.0}
# pd.DataFrame(df.apply(lambda row: ,axis=1))

def score_tweet(tweet_tokens):
    """Build the ``[positive, negative, bias]`` feature list for one tweet.

    The positive and negative scores are the sums of the corresponding
    ``df_freq`` columns over the tweet's tokens; a token that occurs several
    times in the tweet is added once per occurrence. The bias is the
    constant ``1.0``.

    The previous body returned only the two sums, while the caller unpacks
    three values per tweet and the recorded output shows a trailing ``1.0``.
    It also raised ``KeyError`` for any token absent from ``df_freq``.

    Args:
        tweet_tokens: List of processed tokens for a single tweet.

    Returns:
        ``[positive_score, negative_score, 1.0]`` as a list of floats.
    """
    # reindex (unlike .loc) yields NaN rows for unknown tokens, which sum()
    # skips, so unseen words contribute nothing instead of raising.
    totals = df_freq.reindex(tweet_tokens)[['positive', 'negative']].sum()
    return totals.tolist() + [1.0]
[3737.0, 69.0, 1.0]
# Score every tweet, then transpose the per-tweet result lists into three
# new columns.
# NOTE(review): the three-way unpacking requires score_tweet to return
# exactly three values per tweet.
df['positive'], df['negative'], df['bias']=zip(*df['PTweet'].map(score_tweet))
class Tweet PTweet positive negative bias
0 positive #FollowFriday @France_Inte @PKuchly57 @Milipol... [followfriday, top, engag, member, commun, wee... 3737.0 69.0 1.0
1 positive @Lamb2ja Hey James! How odd :/ Please call our... [hey, jame, odd, :/, pleas, call, contact, cen... 4448.0 473.0 1.0
2 positive @DespiteOfficial we had a listen last night :)... [listen, last, night, :), bleed, amaz, track, ... 3728.0 159.0 1.0
3 positive @97sides CONGRATS :) [congrat, :)] 3562.0 4.0 1.0
4 positive yeaaaah yippppy!!! my accnt verified rqst has... [yeaaah, yipppi, accnt, verifi, rqst, succeed,... 3878.0 273.0 1.0
... ... ... ... ... ... ...
9995 negative I wanna change my avi but uSanele :( [wanna, chang, avi, usanel, :(] 55.0 4546.0 1.0
9996 negative MY PUPPY BROKE HER FOOT :( [puppi, broke, foot, :(] 3.0 4439.0 1.0
9997 negative where's all the jaebum baby pictures :(( [where', jaebum, babi, pictur, :(] 34.0 4490.0 1.0
9998 negative But but Mr Ahmad Maslan cooks too :( https://t... [mr, ahmad, maslan, cook, :(] 9.0 4434.0 1.0
9999 negative @eawoman As a Hull supporter I am expecting a ... [hull, support, expect, misser, week, :-(] 116.0 565.0 1.0

10000 rows × 6 columns

positive negative
children 3.0 2.0
latin 3.0 0.0
bilal 0.0 1.0
leno 0.0 1.0
savag 1.0 0.0
... ... ...
smoak 1.0 0.0
siguro 1.0 0.0
kapan 0.0 1.0
fever 2.0 7.0
3a2ad 0.0 1.0

10507 rows × 2 columns

Visualizing Words#

# Hand-picked stems, emoticons and emoji to inspect on the scatter plot.
keys = [
    'happi', 'merri', 'nice', 'good', 'bad', 'sad', 'mad', 'best', 'pretti',
    '❤', ':)', ':(', '😒', '😬', '😄', '😍', '♛',
    'song', 'idea', 'power', 'play', 'magnific',
]

# Keep only the entries present in the frequency table so the label-based
# lookup below cannot fail on a missing word.
sel_keys = [word for word in keys if word in df_freq.index]
sel_df = df_freq.loc[sel_keys]
%matplotlib inline
fig, ax = plt.subplots()

# Log-log scatter of the selected words: positive count vs. negative count.
sel_df.plot.scatter(x='positive', y='negative', loglog=True, ax=ax)

# Label each point with its word (the row's index label).
for word, counts in sel_df.iterrows():
    ax.annotate(word, (counts['positive'], counts['negative']))

# Red reference diagonal where both counts are equal.
ax.plot([0, 9000], [0, 9000], color='red')
# fig.canvas.draw()
Modeling - Logistic Regression#

class Tweet PTweet positive negative bias
0 positive #FollowFriday @France_Inte @PKuchly57 @Milipol... [followfriday, top, engag, member, commun, wee... 3737.0 69.0 1.0
1 positive @Lamb2ja Hey James! How odd :/ Please call our... [hey, jame, odd, :/, pleas, call, contact, cen... 4448.0 473.0 1.0
2 positive @DespiteOfficial we had a listen last night :)... [listen, last, night, :), bleed, amaz, track, ... 3728.0 159.0 1.0
3 positive @97sides CONGRATS :) [congrat, :)] 3562.0 4.0 1.0
4 positive yeaaaah yippppy!!! my accnt verified rqst has... [yeaaah, yipppi, accnt, verifi, rqst, succeed,... 3878.0 273.0 1.0
class Tweet PTweet positive negative bias
5000 negative hopeless for tmr :( [hopeless, tmr, :(] 2.0 4427.0 1.0
5001 negative Everything in the kids section of IKEA is so c... [everyth, kid, section, ikea, cute, shame, i'm... 316.0 4917.0 1.0
5002 negative @Hegelbon That heart sliding into the waste ba... [heart, slide, wast, basket, :(] 20.0 4456.0 1.0
5003 negative “@ketchBurning: I hate Japanese call him "bani... [“, hate, japanes, call, bani, :(, :(, ”] 67.0 8962.0 1.0
5004 negative Dang starting next week I have "work" :( [dang, start, next, week, work, :(] 303.0 4690.0 1.0
... ... ... ... ... ... ...
8995 negative Amelia didnt stalk my twitter :( [amelia, didnt, stalk, twitter, :(] 34.0 4479.0 1.0
8996 negative oh, i missed the broadcast. : ( [oh, miss, broadcast] 79.0 393.0 1.0
8997 negative i really can't stream on melon i feel useless :-( [realli, can't, stream, melon, feel, useless, ... 174.0 958.0 1.0
8998 negative I need to stop looking at old soccer pictures :( [need, stop, look, old, soccer, pictur, :(] 251.0 4703.0 1.0
8999 negative Got an interview for the job that I want but t... [got, interview, job, want, rang, tuesday, int... 236.0 4800.0 1.0

4000 rows × 6 columns

# Numeric target: 1 where the class label is 'positive', 0 for everything else.
df['sentiment'] = (df['class'] == 'positive').astype('int64')
class Tweet PTweet positive negative bias sentiment
0 positive #FollowFriday @France_Inte @PKuchly57 @Milipol... [followfriday, top, engag, member, commun, wee... 3737.0 69.0 1.0 1
1 positive @Lamb2ja Hey James! How odd :/ Please call our... [hey, jame, odd, :/, pleas, call, contact, cen... 4448.0 473.0 1.0 1
2 positive @DespiteOfficial we had a listen last night :)... [listen, last, night, :), bleed, amaz, track, ... 3728.0 159.0 1.0 1
3 positive @97sides CONGRATS :) [congrat, :)] 3562.0 4.0 1.0 1
4 positive yeaaaah yippppy!!! my accnt verified rqst has... [yeaaah, yipppi, accnt, verifi, rqst, succeed,... 3878.0 273.0 1.0 1
... ... ... ... ... ... ... ...
9995 negative I wanna change my avi but uSanele :( [wanna, chang, avi, usanel, :(] 55.0 4546.0 1.0 0
9996 negative MY PUPPY BROKE HER FOOT :( [puppi, broke, foot, :(] 3.0 4439.0 1.0 0
9997 negative where's all the jaebum baby pictures :(( [where', jaebum, babi, pictur, :(] 34.0 4490.0 1.0 0
9998 negative But but Mr Ahmad Maslan cooks too :( https://t... [mr, ahmad, maslan, cook, :(] 9.0 4434.0 1.0 0
9999 negative @eawoman As a Hull supporter I am expecting a ... [hull, support, expect, misser, week, :-(] 116.0 565.0 1.0 0

10000 rows × 7 columns

# Training split: positional rows 0-3999 and 5000-8999 of df.
# NOTE(review): per the displayed output these appear to be the first 4000
# positive and first 4000 negative tweets — this relies on df keeping that order.
train_df = pd.concat([df.iloc[:4000], df.iloc[5000:9000]]); train_df
class Tweet PTweet positive negative bias sentiment
0 positive #FollowFriday @France_Inte @PKuchly57 @Milipol... [followfriday, top, engag, member, commun, wee... 3737.0 69.0 1.0 1
1 positive @Lamb2ja Hey James! How odd :/ Please call our... [hey, jame, odd, :/, pleas, call, contact, cen... 4448.0 473.0 1.0 1
2 positive @DespiteOfficial we had a listen last night :)... [listen, last, night, :), bleed, amaz, track, ... 3728.0 159.0 1.0 1
3 positive @97sides CONGRATS :) [congrat, :)] 3562.0 4.0 1.0 1
4 positive yeaaaah yippppy!!! my accnt verified rqst has... [yeaaah, yipppi, accnt, verifi, rqst, succeed,... 3878.0 273.0 1.0 1
... ... ... ... ... ... ... ...
8995 negative Amelia didnt stalk my twitter :( [amelia, didnt, stalk, twitter, :(] 34.0 4479.0 1.0 0
8996 negative oh, i missed the broadcast. : ( [oh, miss, broadcast] 79.0 393.0 1.0 0
8997 negative i really can't stream on melon i feel useless :-( [realli, can't, stream, melon, feel, useless, ... 174.0 958.0 1.0 0
8998 negative I need to stop looking at old soccer pictures :( [need, stop, look, old, soccer, pictur, :(] 251.0 4703.0 1.0 0
8999 negative Got an interview for the job that I want but t... [got, interview, job, want, rang, tuesday, int... 236.0 4800.0 1.0 0

8000 rows × 7 columns

# Held-out split: every row of df whose index label is not in the training
# set, kept in df's original order.
# The previous version fed index *labels* to the positional indexer .iloc
# (only right for a default RangeIndex) and took its row order from set
# iteration; a label-based boolean mask avoids both.
test_df = df.loc[~df.index.isin(train_df.index)]
class Tweet PTweet positive negative bias sentiment
4995 positive @chriswiggin3 Chris, that's great to hear :) D... [chri, that', great, hear, :), due, time, remi... 4005.0 337.0 1.0 1
4996 positive @RachelLiskeard Thanks for the shout-out :) It... [thank, shout-out, :), great, aboard] 4349.0 129.0 1.0 1
4997 positive @side556 Hey! :) Long time no talk... [hey, :), long, time, talk, ...] 4075.0 556.0 1.0 1
4998 positive @staybubbly69 as Matt would say. WELCOME TO AD... [matt, would, say, welcom, adulthood, ..., :)] 4017.0 420.0 1.0 1
4999 positive @DanielOConnel18 you could say he will have eg... [could, say, egg, face, :-)] 776.0 154.0 1.0 1

Train/Test Split#

# Feature matrix (bias, positive score, negative score) and target for the
# training rows.
X = train_df[['bias', 'positive', 'negative']]
y = train_df['sentiment']
# Same columns, in the same order, for the held-out rows.
X_test = test_df[['bias', 'positive', 'negative']]
y_test = test_df['sentiment']

Model training#

# Fit a logistic-regression classifier with scikit-learn's default settings.
# NOTE(review): X already carries a constant 'bias' column, and
# LogisticRegression fits its own intercept by default, so the constant term
# is presumably represented twice (coef_[0][0] and intercept_) — confirm this
# is intended or pass fit_intercept=False.
model = LogisticRegression()
model.fit(X , y)

Model Scoring#

array([[ 0.24050227,  0.00685412, -0.0077651 ]])
metrics.accuracy_score(y, model.predict(X)) # accuracy on the training split
metrics.accuracy_score(y_test, model.predict(X_test)) # accuracy on the held-out split

Visualizing Model#

# Column vector of weights in feature order (bias, positive, negative).
# The model was fitted with its own intercept in addition to the constant
# 'bias' feature, so the full constant term of the decision function is
# intercept_ + coef_[bias]. Fold the intercept into thetas[0]; otherwise the
# boundary drawn by negative() is shifted. The copy keeps the fitted model's
# coef_ untouched.
thetas = model.coef_.reshape(3, 1).copy()
thetas[0] += model.intercept_
array([0.00685412, 0.01370825, 0.02056237])
def negative(thetas, pos):
    """Solve ``thetas[0] + thetas[1]*pos + thetas[2]*neg = 0`` for ``neg``.

    Args:
        thetas: Indexable holding three weights (constant, positive, negative).
        pos: Value(s) on the positive axis.

    Returns:
        The matching value(s) on the negative axis.
    """
    constant, w_pos, w_neg = thetas[0], thetas[1], thetas[2]
    numerator = -constant - w_pos * pos
    return numerator / w_neg

def direction(thetas, pos):
    """Scale ``pos`` by the ratio ``thetas[2] / thetas[1]``.

    Args:
        thetas: Indexable holding three weights (constant, positive, negative).
        pos: Value(s) on the positive axis.

    Returns:
        ``pos * thetas[2] / thetas[1]``.
    """
    scaled = pos * thetas[2]
    return scaled / thetas[1]

# negative(thetas, np.array([1,2,3]))
# direction(thetas, np.array([1,2,3]))
# Scatter every tweet by its two scores, coloured by sentiment label
# (0 -> red, 1 -> green).
palette = {0: 'red', 1: 'green'}
ax = sns.scatterplot(data=df, x='positive', y='negative', hue='sentiment', palette=palette, marker='.')
# Positive-axis grid from 0 up to the largest absolute feature value in X.
pos = np.arange(0, int(X.abs().max().max()), 1); pos

# Overlay the model's decision boundary. x and y are passed by keyword:
# recent seaborn releases accept only `data` positionally, so the former
# positional call raises a TypeError there.
sns.lineplot(x=pos, y=negative(thetas, pos), ax=ax)
# ax.arrow(offset, negative(thetas, offset), offset, direction(thetas, offset), head_width=500, head_length=500, fc='g', ec='g')
# # Plot a red line pointing to the negative direction
# ax.arrow(offset, negative(thetas, offset), -offset, -direction(thetas, offset), head_width=500, head_length=500, fc='r', ec='r')
