Sentiment Analysis with Deep Learning#


!export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/anaconda/envs/aiking/lib/
# import numpy as np  # regular ol' numpy
import trax
from trax import layers as tl  # core building block
from trax import shapes  # data signatures: dimensionality and type
from trax import fastmath  # uses jax, offers numpy on steroids
from trax.fastmath import numpy as np
from trax.supervised import training
import pandas as pd
from sklearn.model_selection import train_test_split
from import *
from fastcore.all import *
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
import string
import itertools
import random
import shutil
!pip list|grep jax
jax                                        0.3.15
jaxlib                                     0.3.15
jupyter-server-mathjax                     0.2.3
# Build a ReLU layer and inspect its basic signature.
relu = tl.Relu()
# NOTE(review): the original expression was garbled in extraction; the recorded
# output ('Serial', 1, 1) indicates a (name, n_in, n_out) tuple.
relu.name, relu.n_in, relu.n_out
('Serial', 1, 1)
x = np.array([-2,-1,0,1,2]); x
DeviceArray([-2, -1,  0,  1,  2], dtype=int32)
DeviceArray([0, 0, 0, 1, 2], dtype=int32)
norm = tl.LayerNorm()
x = np.array([0,1,2,3], dtype='float');x
/tmp/ipykernel_389065/ UserWarning: Explicitly requested dtype float requested in array is not available, and will be truncated to dtype float32. To enable more dtypes, set the jax_enable_x64 configuration option or the JAX_ENABLE_X64 shell environment variable. See for more.
  x = np.array([0,1,2,3], dtype='float');x
DeviceArray([0., 1., 2., 3.], dtype=float32)
((DeviceArray([1., 1., 1., 1.], dtype=float32),
  DeviceArray([0., 0., 0., 0.], dtype=float32)),
DeviceArray([-1.3416404 , -0.44721344,  0.44721344,  1.3416404 ], dtype=float32)
# Define a custom Layer

def Power():
    """Return a trax layer named "Power" that squares its input."""
    def square(x):
        # Plain Python power operator, so it works on whatever array type is fed in.
        return x**2

    # tl.Fn wraps the plain function as a layer carrying the given name.
    return tl.Fn("Power", square)
# Bind the layer first, then inspect it: the previous single expression read
# `power` on the right-hand side before the name existed (NameError).
power = Power()
power.name, power.n_in, power.n_out
('Power', 1, 1)
DeviceArray([0., 1., 4., 9.], dtype=float32)
# Chain three layers: normalise, clip negatives to zero, then square.
# NOTE(review): the sublayer list was lost in extraction; this composition
# reproduces the recorded weights structure and the output [0, 0, 0, 0.5, 2.0].
serial = tl.Serial(
    tl.LayerNorm(),
    tl.Relu(),
    Power(),
)

x = np.array([-2,-1,0,1,2])
(((DeviceArray([1, 1, 1, 1, 1], dtype=int32),
   DeviceArray([0, 0, 0, 0, 0], dtype=int32)),
  ((), (), ()),
 ((), ((), (), ()), ()))
DeviceArray([0.        , 0.        , 0.        , 0.49999973, 1.9999989 ],            dtype=float32)
class My_Class:
    """Callable accumulator: every call adds its argument to the attribute ``x``."""

    def __init__(self, y):
        # The starting value of the running total.
        self.x = y

    def __call__(self, z):
        # In-place add on purpose; the call itself returns nothing.
        self.x += z
instance_c = My_Class(10); instance_c(3); instance_c.x
def f(x):
    """Evaluate the quadratic 3*x**2 + x at ``x``."""
    quadratic_term = 3.0 * x**2
    return quadratic_term + x
grad_f  = trax.fastmath.grad(f)
f(2.0), grad_f(2.0)
(14.0, DeviceArray(13., dtype=float32, weak_type=True))
a = [1,2,3,4]
b = [0]*10
a, b
([1, 2, 3, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# lines_index = [*range(len(a))]; lines_index
# def f(ind, a_size): return ind%a_size
# Refill b (keeping its length) by repeating the elements of a cyclically.
b = [a[pos % len(a)] for pos, _ in enumerate(b)]
[1, 2, 3, 4, 1, 2, 3, 4, 1, 2]

Financial Sentiment Analysis#

Read Dataset#

df = pd.read_csv("financial_sentiment.csv"); df.head()
Sentence Sentiment
0 The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model . positive
1 $ESI on lows, down $1.50 to $2.50 BK a real possibility negative
2 For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m . positive
3 According to the Finnish-Russian Chamber of Commerce , all the major construction companies of Finland are operating in Russia . neutral
4 The Swedish buyout firm has sold its remaining 22.4 percent stake , almost eighteen months after taking the company public in Finland . neutral
Sentence Sentiment
count 5842 5842
unique 5322 3
top Managing Director 's comments : `` Net sales for the first quarter were notably lower than a year before , especially in Finland , Russia and the Baltic countries . neutral
freq 2 3130

Split Train Test and Validation Dataset#

df_train, df_test = train_test_split(df, stratify=df['Sentiment'])
df_train, df_valid = train_test_split(df_train, stratify=df_train['Sentiment'])
df_train.shape, df_valid.shape, df_test.shape
((3285, 2), (1096, 2), (1461, 2))

Data Processing and Cleaning#

def remove_old_style(tweet):
    """Strip a leading "RT" marker (plus the whitespace after it) from ``tweet``."""
    retweet_prefix = re.compile(r'^RT[\s]+')
    return retweet_prefix.sub('', tweet)
def remove_url(tweet):
    """Delete every http:// or https:// URL (up to the next whitespace) from ``tweet``."""
    url_pattern = re.compile(r'https?://[^\s\n\r]+')
    return url_pattern.sub('', tweet)
def remove_hash(tweet):
    """Drop every '#' character from ``tweet`` while keeping the tag text itself."""
    hash_sign = re.compile(r'#')
    return hash_sign.sub("", tweet)
def remove_numbers(tweet):
    """Remove integer and decimal number literals from ``tweet``.

    A trailing dot that is not followed by a digit (as in "3.") is left in place.
    """
    number_pattern = re.compile(r'\d*\.?\d+')
    return number_pattern.sub("", tweet)
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
skip_words = stopwords.words('english')+list(string.punctuation)
stemmer = PorterStemmer() 
def filter_stem_tokens(tweet_tokens, skip_words=skip_words, stemmer=stemmer):
    """Drop unwanted tokens and stem the remaining ones.

    Args:
        tweet_tokens: iterable of token strings.
        skip_words: collection of tokens to discard; defaults to the
            module-level stop-word and punctuation list.
        stemmer: object exposing ``stem(token)``; defaults to the module-level
            stemmer.

    Returns:
        A list with ``stemmer.stem(token)`` for every token not in ``skip_words``,
        in the original order.
    """
    # One set build per call replaces a linear scan of the skip list per token.
    excluded = frozenset(skip_words)
    return [stemmer.stem(token) for token in tweet_tokens if token not in excluded]

process_sentence = compose(remove_old_style, remove_url, remove_hash, remove_numbers, tokenizer.tokenize, filter_stem_tokens)
# df_train
# inverse_vocab = dict(enumerate(['__PAD__', '__</e>__', '__UNK__'] + list(set(df_train.Sentence.apply(process_sentence).sum()))))
# vocab  = pd.Dataframe({v:k for k,v in inverse_vocab.items()})
# vocab

# Vocabulary table: row label = token, single column 'index' = integer id.
# The three special tokens come first, so '__PAD__' receives id 0.
# Corpus tokens are sorted so ids are reproducible between sessions (the
# iteration order of a set of strings is not), and the per-sentence token lists
# are flattened lazily rather than concatenated with Series.sum(), which copies
# the growing list on every addition.
corpus_tokens = sorted(set(itertools.chain.from_iterable(df_train.Sentence.apply(process_sentence))))
df_vocab = pd.DataFrame(['__PAD__', '__</e>__', '__UNK__'] + corpus_tokens).reset_index()
df_vocab = df_vocab.set_index(0)
sentence = process_sentence(df_train.loc[df_train.index[0],'Sentence']); sentence
msg =  "My blog name is Soliloquium"

def process_with_vocab(msg,df_vocab=df_vocab, unknown_token='__UNK__'):
    """Turn raw text into a list of vocabulary ids.

    The text is tokenised with ``process_sentence``; any token that is not a
    row label of ``df_vocab`` is looked up under ``unknown_token`` instead.
    Ids are read from the first column of ``df_vocab``.
    """
    known_tokens = df_vocab.index
    lookup_keys = []
    for token in process_sentence(msg):
        lookup_keys.append(token if token in known_tokens else unknown_token)
    id_column = df_vocab.columns[0]
    matched_rows = df_vocab.loc[lookup_keys]
    return matched_rows[id_column].tolist()

[2508, 4496, 2]
neutral     1760
positive    1041
negative     484
Name: Sentiment, dtype: int64

Data Batching#

def data_generator(df, batch_sz, df_vocab,  stop=False, shuffle=True, loop=True,
             unknown_token='__UNK__', pad_token='__PAD__',x_col='Sentence', y_col='Sentiment',
             process=process_with_vocab, class_dict={'neutral':0, 'positive':1, 'negative':-1}):
    """Yield fixed-size training batches built from a DataFrame of labelled text.

    Args:
        df: DataFrame holding the text column ``x_col`` and label column ``y_col``.
        batch_sz: number of rows per batch (must be positive).
        df_vocab: vocabulary table; row labels are tokens, first column holds ids.
        stop: when true the generator yields nothing.
        shuffle: reshuffle the rows at the start of every pass.
        loop: start another pass when one is exhausted; otherwise stop after one.
        unknown_token: vocabulary entry used for out-of-vocabulary tokens.
        pad_token: vocabulary entry whose id pads short rows.
        x_col, y_col: names of the text and label columns.
        process: callable ``process(text, df_vocab=..., unknown_token=...)``
            returning a list of token ids.
        class_dict: maps each label value to the integer target.
            NOTE(review): the default maps 'negative' to -1. Category losses
            generally expect ids in [0, n_classes); confirm -1 is intended
            before relying on the default.

    Yields:
        ``(inputs, targets, example_weights)``: an int32 array of token ids
        padded to the longest row in the batch, an array of class ids, and an
        array of ones with one weight per example.

    Raises:
        ValueError: if ``df`` is empty or ``batch_sz`` is not positive.
    """
    if batch_sz <= 0:
        raise ValueError(f"batch_sz must be positive, got {batch_sz}")
    if len(df) == 0:
        raise ValueError("data_generator received an empty DataFrame")

    # The pad id never changes between passes, so look it up once.
    pad_id = df_vocab.loc[pad_token, df_vocab.columns[0]]
    while not stop:
        print("Restarting Loop")
        if shuffle: df = df.sample(frac=1)
        # cycle() lets the final batch of a pass wrap around to the first rows
        # so every batch has exactly batch_sz examples.
        rows = itertools.cycle(df.iterrows())
        index = 0
        # Strict '<': with '<=' a pass whose length is a multiple of batch_sz
        # emitted one extra, fully wrapped-around batch.
        while index < len(df):
            batch = [next(rows)[1] for _ in range(batch_sz)]
            # Honour the caller-supplied `process` rather than a hard-wired helper.
            X = [process(row[x_col], df_vocab=df_vocab, unknown_token=unknown_token) for row in batch]
            y = [row[y_col] for row in batch]
            # DataFrame construction right-pads ragged rows with NaN, which is
            # then replaced by the pad id.
            inputs = np.array(pd.DataFrame(X).fillna(pad_id), dtype='int32')
            targets = np.array([class_dict[label] for label in y])
            example_weights = np.array([1.0]*len(targets))
            index += batch_sz
            yield inputs, targets, example_weights
        if not loop: break
count = 0
g = data_generator(df_train[:10].copy(), 3, df_vocab)
# while count < 20:
#     batch = next(g)
#     count +=1
Restarting Loop
(DeviceArray([[4869, 2664, 2478, 4233, 3262, 3262, 1336, 2877,  414, 4386,
               2744, 3359, 5280,  750,    0,    0,    0,    0,    0,    0,
                  0,    0,    0],
              [ 668, 4016, 4646, 3481, 1716, 2861, 3277, 2761, 4386, 1228,
                423, 5176, 3239, 3189,  423,  575, 2018, 3359,    0,    0,
                  0,    0,    0],
              [ 665, 1388, 5432,  601, 1870,  914,  116, 4680,  601, 5924,
               3968,  638, 5938,  990, 2468,  568, 2054, 5560, 5660, 4278,
               5924, 5938, 3283]], dtype=int32),
 DeviceArray([0, 0, 0], dtype=int32),
 DeviceArray([1., 1., 1.], dtype=float32))
'CDP was established on the initiative of institutional investors ; however , the annually published results also interest an increasing number of customers and other interest groups of the reporting companies .'
(5992, 1)

Model Definition#

def classifier(vocab_sz=5920, emb_dims=256, output_dims=3, mode='train'):
    """Build a bag-of-embeddings sentence classifier.

    Args:
        vocab_sz: number of rows in the embedding table.
        emb_dims: size of each embedding vector.
        output_dims: number of output classes.
        mode: accepted for interface compatibility; not used by any layer here.

    Returns:
        A ``tl.Serial`` model mapping a (batch, sequence) array of token ids to
        a (batch, output_dims) array.

    NOTE(review): every layer after the embedding was lost from the source.
    Embedding + Dense(output_dims) reproduces exactly the logged
    "trainable weights: 1534723" for a 5992-token vocabulary
    (5992*256 + 256*3 + 3); Mean and LogSoftmax carry no weights and are
    assumed from the surrounding cells -- confirm.
    """
    model = tl.Serial(
        tl.Embedding(vocab_size=vocab_sz, d_feature=emb_dims),
        # Average over the sequence axis to get one vector per sentence.
        tl.Mean(axis=1),
        tl.Dense(n_units=output_dims),
        tl.LogSoftmax(),
    )
    return model

inputs, targets, weights = next(data_generator(df_train[:10].copy(), 4, df_vocab))
# model = classifier()
# model(inputs)
inputs.shape, inputs
Restarting Loop
((4, 18),
 DeviceArray([[ 668, 4016, 4646, 3481, 1716, 2861, 3277, 2761, 4386, 1228,
                423, 5176, 3239, 3189,  423,  575, 2018, 3359],
              [5083, 1819, 1815,  256, 3504,  234,  895, 2100, 5673,    0,
                  0,    0,    0,    0,    0,    0,    0,    0],
              [1934, 4707,   40, 3698, 3921,  566,    0,    0,    0,    0,
                  0,    0,    0,    0,    0,    0,    0,    0],
              [4869, 2664, 2478, 4233, 3262, 3262, 1336, 2877,  414, 4386,
               2744, 3359, 5280,  750,    0,    0,    0,    0]],            dtype=int32))
vocab_sz = 5920
emb_dims = 256
embed_layer = tl.Embedding(vocab_size=vocab_sz, d_feature=emb_dims)
# embed_layer.init(trax.shapes.signature(inputs))
# embed_layer(inputs)
# embed_layer(inputs)
ShapeDtype{shape:(4, 18), dtype:int32}
el = embed_layer.init(trax.shapes.signature(inputs)); 

(5920, 256)
# An example of an embedding layer
# rnd.seed(31)
tmp_embed = tl.Embedding(d_feature=256, vocab_size=5920)

# tmp_in_arr = np.array([[0.0, 1,2],
#                     [3,2,0]
#                    ])

# random_key = trax.fastmath.random.get_prng(seed=0)
# tmp_in_arr = trax.fastmath.random.normal(key = random_key, shape = (4, 18))
tmp_in_arr = inputs

# Embedding layer will return an array of shape (batch size, sequence length, d_feature)
tmp_embedded_arr = tmp_embed(tmp_in_arr)
print(f"Shape of returned array is {tmp_embedded_arr.shape}")
# display(tmp_embedded_arr)
# display(tmp_embed)
# display(inputs)
Shape of returned array is (4, 18, 256)
tmp_mean = tl.Mean(axis=1)
(4, 256)

Train Eval Task Definition#

def get_train_eval_task(df_train, df_valid, 
                        df_vocab, loop, batch_sz=16):
    """Create the trax training and evaluation tasks for the sentiment model.

    Args:
        df_train: DataFrame feeding the training batches.
        df_valid: DataFrame feeding the evaluation batches.
        df_vocab: vocabulary table passed through to ``data_generator``.
        loop: forwarded to ``data_generator``; whether to restart after a pass.
        batch_sz: rows per batch for both generators.

    Returns:
        ``(train_task, eval_task)``.

    NOTE(review): the closing arguments of both task constructors were lost
    from the source. The loss layer and the 10-step checkpoint interval match
    the training log; the optimizer and its learning rate are an assumption --
    confirm.
    """
    # Class ids must lie in [0, n_classes) for the category loss and accuracy
    # layers: a target of -1 one-hot encodes to all zeros, so those examples
    # would add nothing to the loss and could never be counted as correct.
    # 'negative' is therefore mapped to 2 rather than -1.
    class_ids = {'neutral':0, 'positive':1, 'negative':2}
    train_task = training.TrainTask(
            labeled_data=data_generator(df_train, batch_sz, df_vocab,  stop=False, shuffle=True, loop=loop,
                     unknown_token='__UNK__', pad_token='__PAD__',x_col='Sentence', y_col='Sentiment',
                     process=process_with_vocab, class_dict=class_ids),
            loss_layer=tl.WeightedCategoryCrossEntropy(),
            optimizer=trax.optimizers.Adam(0.01),
            n_steps_per_checkpoint=10,
    )
    eval_task = training.EvalTask(
            labeled_data=data_generator(df_valid, batch_sz, df_vocab,  stop=False, shuffle=True, loop=loop,
                 unknown_token='__UNK__', pad_token='__PAD__',x_col='Sentence', y_col='Sentiment',
                 process=process_with_vocab, class_dict=class_ids),
            metrics=[tl.WeightedCategoryCrossEntropy(), tl.WeightedCategoryAccuracy()],
    )
    return train_task, eval_task

get_train_eval_task(df_train, df_valid, df_vocab, loop=True)
Restarting Loop
Restarting Loop
(< at 0x101ae78eabb0>,
 < at 0x101ae791a3d0>)
dir_path = './model/'

# Remove output left over from a previous run.
# NOTE(review): the `try` body and the handler were lost from the source;
# rmtree is assumed from the otherwise unused `shutil` import -- confirm.
try:
    shutil.rmtree(dir_path)
except OSError as e:
    # A missing directory is fine on a first run; report and carry on.
    print(f"Error: {dir_path} : {e.strerror}")

output_dir = './model/'
output_dir_expand = os.path.expanduser(output_dir)

Model Training#

def train_model(classifier, train_task, eval_task, n_steps, output_dir):
    """Train a model for a fixed number of steps and return the training loop.

    Args:
        classifier: the trax model to train.
        train_task: ``training.TrainTask`` supplying data, loss and optimizer.
        eval_task: ``training.EvalTask`` supplying evaluation data and metrics.
        n_steps: number of training steps to run.
        output_dir: directory handed to ``training.Loop`` for its output.

    Returns:
        The ``training.Loop`` after ``n_steps`` steps have been run.

    NOTE(review): the Loop arguments before ``random_seed`` and the ``run``
    call were fused together in the source; they are restored here from the
    function's parameters -- confirm.
    """
    training_loop = training.Loop(
                        classifier,
                        train_task,
                        eval_tasks=[eval_task],
                        output_dir=output_dir,
                        random_seed=31)
    training_loop.run(n_steps = n_steps)
    return training_loop
train_task, eval_task = get_train_eval_task(df_train, df_valid, df_vocab, loop=True)
model = classifier(vocab_sz=len(df_vocab), emb_dims=256, output_dims=3, mode='train')
training_loop = train_model(model, train_task, eval_task, n_steps=100, output_dir=output_dir)
Restarting Loop
Restarting Loop
/opt/anaconda/envs/aiking/lib/python3.9/site-packages/jax/_src/lib/ UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.
/opt/anaconda/envs/aiking/lib/python3.9/site-packages/trax/layers/ FutureWarning: GzipFile was opened for writing, but this will change in future Python releases.  Specify the mode argument for opening it for writing.
  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:
Step      1: Total number of trainable weights: 1534723
Step      1: Ran 1 train steps in 1.16 secs
Step      1: train WeightedCategoryCrossEntropy |  1.02272236
/opt/anaconda/envs/aiking/lib/python3.9/site-packages/trax/supervised/ FutureWarning: GzipFile was opened for writing, but this will change in future Python releases.  Specify the mode argument for opening it for writing.
  with gzip_lib.GzipFile(fileobj=f, compresslevel=2) as gzipf:
Step      1: eval  WeightedCategoryCrossEntropy |  0.70422697
Step      1: eval      WeightedCategoryAccuracy |  0.37500000

Step     10: Ran 9 train steps in 5.69 secs
Step     10: train WeightedCategoryCrossEntropy |  0.73040473
Step     10: eval  WeightedCategoryCrossEntropy |  0.69985682
Step     10: eval      WeightedCategoryAccuracy |  0.62500000

Step     20: Ran 10 train steps in 4.34 secs
Step     20: train WeightedCategoryCrossEntropy |  0.56657821
Step     20: eval  WeightedCategoryCrossEntropy |  0.60906959
Step     20: eval      WeightedCategoryAccuracy |  0.68750000

Step     30: Ran 10 train steps in 1.63 secs
Step     30: train WeightedCategoryCrossEntropy |  0.54628116
Step     30: eval  WeightedCategoryCrossEntropy |  0.41477004
Step     30: eval      WeightedCategoryAccuracy |  0.62500000

Step     40: Ran 10 train steps in 2.34 secs
Step     40: train WeightedCategoryCrossEntropy |  0.50867021
Step     40: eval  WeightedCategoryCrossEntropy |  0.58183014
Step     40: eval      WeightedCategoryAccuracy |  0.43750000

Step     50: Ran 10 train steps in 1.68 secs
Step     50: train WeightedCategoryCrossEntropy |  0.50223523
Step     50: eval  WeightedCategoryCrossEntropy |  0.40479019
Step     50: eval      WeightedCategoryAccuracy |  0.68750000

Step     60: Ran 10 train steps in 0.97 secs
Step     60: train WeightedCategoryCrossEntropy |  0.54113311
Step     60: eval  WeightedCategoryCrossEntropy |  0.38476127
Step     60: eval      WeightedCategoryAccuracy |  0.43750000

Step     70: Ran 10 train steps in 2.34 secs
Step     70: train WeightedCategoryCrossEntropy |  0.50714481
Step     70: eval  WeightedCategoryCrossEntropy |  0.31111774
Step     70: eval      WeightedCategoryAccuracy |  0.56250000

Step     80: Ran 10 train steps in 1.04 secs
Step     80: train WeightedCategoryCrossEntropy |  0.48107988
Step     80: eval  WeightedCategoryCrossEntropy |  0.53283501
Step     80: eval      WeightedCategoryAccuracy |  0.68750000

Step     90: Ran 10 train steps in 1.04 secs
Step     90: train WeightedCategoryCrossEntropy |  0.48959431
Step     90: eval  WeightedCategoryCrossEntropy |  0.52207285
Step     90: eval      WeightedCategoryAccuracy |  0.62500000

Step    100: Ran 10 train steps in 1.08 secs
Step    100: train WeightedCategoryCrossEntropy |  0.49267441
Step    100: eval  WeightedCategoryCrossEntropy |  0.34768826
Step    100: eval      WeightedCategoryAccuracy |  0.81250000