# Sentiment Analysis with Deep Learning

## Imports

In [499]:
!export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/anaconda/envs/aiking/lib/

In [500]:
# import numpy as np  # regular ol' numpy
import trax
from trax import layers as tl  # core building block
from trax import shapes  # data signatures: dimensionality and type
from trax import fastmath  # uses jax, offers numpy on steroids
from trax.fastmath import numpy as np
from trax.supervised import training
import pandas as pd
from sklearn.model_selection import train_test_split
from aiking.data.external import *
from fastcore.all import *
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
import string
import itertools
import random
import shutil

In [501]:
!pip list|grep jax

jax                                        0.3.15
jaxlib                                     0.3.15
jupyter-server-mathjax                     0.2.3


In [502]:
relu = tl.Relu()
relu

Serial[
  Relu
]

In [503]:
relu.name, relu.n_in, relu.n_out

('Serial', 1, 1)

In [504]:
x = np.array([-2,-1,0,1,2]); x

DeviceArray([-2, -1,  0,  1,  2], dtype=int32)

In [505]:
!nvidia-smi

Wed Jul 27 09:03:07 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   49C    P8    29W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [506]:
relu(x)

DeviceArray([0, 0, 0, 1, 2], dtype=int32)

In [507]:
help(tl.Concatenate)

Help on class Concatenate in module trax.layers.combinators:

class Concatenate(trax.layers.base.Layer)
 |  Concatenate(n_items=2, axis=-1)
 |  
 |  Concatenates a number of tensors into a single tensor.
 |  
 |  For example::
 |  
 |      x = np.array([1, 2])
 |      y = np.array([3, 4])
 |      z = np.array([5, 6])
 |      concat3 = tl.Concatenate(n_items=3)
 |      z = concat3((x, y, z))  # z = [1, 2, 3, 4, 5, 6]
 |  
 |  Use the `axis` argument to specify on which axis to concatenate the tensors.
 |  By default it's the last axis, `axis=-1`, and `n_items=2`.
 |  
 |  Method resolution order:
 |      Concatenate
 |      trax.layers.base.Layer
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, n_items=2, axis=-1)
 |      Creates a partially initialized, unconnected layer instance.
 |      
 |      Args:
 |        n_in: Number of inputs expected by this layer.
 |        n_out: Number of outputs promised by this layer.
 |        name: Class-like name for thi

In [508]:
# help(tl.LayerNorm)
help(shapes.signature)

Help on function signature in module trax.shapes:

signature(obj)
    Returns a `ShapeDtype` signature for the given `obj`.
    
    A signature is either a `ShapeDtype` instance or a tuple of `ShapeDtype`
    instances. Note that this function is permissive with respect to its inputs
    (accepts lists or tuples or dicts, and underlying objects can be any type
    as long as they have shape and dtype attributes) and returns the corresponding
    nested structure of `ShapeDtype`.
    
    Args:
      obj: An object that has `shape` and `dtype` attributes, or a list/tuple/dict
          of such objects.
    
    Returns:
      A corresponding nested structure of `ShapeDtype` instances.



In [509]:
norm = tl.LayerNorm()
x = np.array([0,1,2,3], dtype='float');x

  x = np.array([0,1,2,3], dtype='float');x


DeviceArray([0., 1., 2., 3.], dtype=float32)

In [510]:
norm.init(shapes.signature(x))

((DeviceArray([1., 1., 1., 1.], dtype=float32),
  DeviceArray([0., 0., 0., 0.], dtype=float32)),
 ())

In [511]:
norm(x)

DeviceArray([-1.3416404 , -0.44721344,  0.44721344,  1.3416404 ], dtype=float32)

In [512]:
# Define a custom Layer

def Power():
    layer_name = "Power"
    
    def func(x):
        return x**2
    
    return tl.Fn(layer_name, func)
power = Power()
power.name, power.n_in, power.n_out

('Power', 1, 1)

In [513]:
power(x)

DeviceArray([0., 1., 4., 9.], dtype=float32)

In [514]:
serial = tl.Serial(
    tl.LayerNorm(), 
    tl.Relu(),
    Power()
)

x = np.array([-2,-1,0,1,2])
serial.init(shapes.signature(x))

(((DeviceArray([1, 1, 1, 1, 1], dtype=int32),
   DeviceArray([0, 0, 0, 0, 0], dtype=int32)),
  ((), (), ()),
  ()),
 ((), ((), (), ()), ()))

In [515]:
serial(x)

DeviceArray([0.        , 0.        , 0.        , 0.49999973, 1.9999989 ],            dtype=float32)

In [516]:
class My_Class:
    def __init__(self, y):
        self.x = y
    def __call__(self, z):
        self.x += z
        print(self.x)
       
instance_c = My_Class(10); instance_c(3); instance_c.x

13


13

In [517]:
def f(x): return 3.0*x**2+x

In [518]:
grad_f  = trax.fastmath.grad(f)

In [519]:
f(2.0), grad_f(2.0)

(14.0, DeviceArray(13., dtype=float32, weak_type=True))

In [520]:
a = [1,2,3,4]
b = [0]*10
a, b

([1, 2, 3, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [521]:
# lines_index = [*range(len(a))]; lines_index
# def f(ind, a_size): return ind%a_size
b = [a[ind%len(a)] for ind in range(len(b))]
b

[1, 2, 3, 4, 1, 2, 3, 4, 1, 2]

## Financial Sentiment Analysis

### Read Dataset

In [522]:
df = pd.read_csv("financial_sentiment.csv"); df.head()

Unnamed: 0,Sentence,Sentiment
0,"The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model .",positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real possibility",negative
2,"For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",positive
3,"According to the Finnish-Russian Chamber of Commerce , all the major construction companies of Finland are operating in Russia .",neutral
4,"The Swedish buyout firm has sold its remaining 22.4 percent stake , almost eighteen months after taking the company public in Finland .",neutral


In [523]:
df.describe()

Unnamed: 0,Sentence,Sentiment
count,5842,5842
unique,5322,3
top,"Managing Director 's comments : `` Net sales for the first quarter were notably lower than a year before , especially in Finland , Russia and the Baltic countries .",neutral
freq,2,3130


### Split Train Test and Validation Dataset

In [524]:
df_train, df_test = train_test_split(df, stratify=df['Sentiment'])
df_train, df_valid = train_test_split(df_train, stratify=df_train['Sentiment'])

In [525]:
df_train.shape, df_valid.shape, df_test.shape

((3285, 2), (1096, 2), (1461, 2))

### Data Processing and Cleaning

In [526]:
def remove_old_style(tweet): return re.sub(r'^RT[\s]+', '', tweet)
def remove_url(tweet): return re.sub(r'https?://[^\s\n\r]+', '', tweet)
def remove_hash(tweet): return re.sub(r'#', "", tweet)
def remove_numbers(tweet): return re.sub(r'\d*\.?\d+', "", tweet)
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
skip_words = stopwords.words('english')+list(string.punctuation)
stemmer = PorterStemmer() 
def filter_stem_tokens(tweet_tokens, skip_words=skip_words, stemmer=stemmer): 
    return [ stemmer.stem(token) for token in tweet_tokens if token not in skip_words]

process_sentence = compose(remove_old_style, remove_url, remove_hash, remove_numbers, tokenizer.tokenize, filter_stem_tokens)
process_sentence(df_train.loc[df_train.index[0],'Sentence'])
# df_train

['cdp',
 'establish',
 'initi',
 'institut',
 'investor',
 'howev',
 'annual',
 'publish',
 'result',
 'also',
 'interest',
 'increas',
 'number',
 'custom',
 'interest',
 'group',
 'report',
 'compani']

In [527]:
# inverse_vocab = dict(enumerate(['__PAD__', '__</e>__', '__UNK__'] + list(set(df_train.Sentence.apply(process_sentence).sum()))))
# vocab  = pd.Dataframe({v:k for k,v in inverse_vocab.items()})
# vocab

df_vocab = pd.DataFrame(['__PAD__', '__</e>__', '__UNK__'] + list(set(df_train.Sentence.apply(process_sentence).sum()))).reset_index()
df_vocab = df_vocab.set_index(0)

In [528]:
sentence = process_sentence(df_train.loc[df_train.index[0],'Sentence']); sentence

['cdp',
 'establish',
 'initi',
 'institut',
 'investor',
 'howev',
 'annual',
 'publish',
 'result',
 'also',
 'interest',
 'increas',
 'number',
 'custom',
 'interest',
 'group',
 'report',
 'compani']

In [529]:
df_vocab.loc[sentence]['index'].tolist()

[668,
 4016,
 4646,
 3481,
 1716,
 2861,
 3277,
 2761,
 4386,
 1228,
 423,
 5176,
 3239,
 3189,
 423,
 575,
 2018,
 3359]

In [530]:
msg =  "My blog name is Soliloquium"

def process_with_vocab(msg,df_vocab=df_vocab, unknown_token='__UNK__'):
    tokens = process_sentence(msg)
    return df_vocab.loc[[token if token in df_vocab.index else unknown_token 
                         for token in tokens]][df_vocab.columns[0]].tolist()

process_with_vocab(msg)

[2508, 4496, 2]

In [531]:
df_train.Sentiment.value_counts()

neutral     1760
positive    1041
negative     484
Name: Sentiment, dtype: int64

### Data Batching

In [532]:
def data_generator(df, batch_sz, df_vocab,  stop=False, shuffle=True, loop=True,
             unknown_token='__UNK__', pad_token='__PAD__',x_col='Sentence', y_col='Sentiment',
             process=process_with_vocab, class_dict={'neutral':0, 'positive':1, 'negative':-1}):
    while not stop:
        index = 0
        print("Restarting Loop")
        if shuffle: df = df.sample(frac=1)
        itr = itertools.cycle(df.iterrows())
        pad_id = df_vocab.loc[pad_token, df_vocab.columns[0]]
        while index <= len(df):
            batch  = [next(itr) for i in range(batch_sz)]
            X,y = zip(*[(process_with_vocab(i[1][x_col], df_vocab=df_vocab, unknown_token=unknown_token), i[1][y_col])  for i in batch])
            inputs = np.array(pd.DataFrame(X).fillna(pad_id), dtype='int32')
            targets = np.array([class_dict[i] for i in y])
            
            index += batch_sz
            example_weights = np.array([1.0]*len(targets))
            yield inputs, targets, example_weights
        if loop: continue
        else: break
            
count = 0
g = data_generator(df_train[:10].copy(), 3, df_vocab)
# while count < 20:
#     batch = next(g)
#     count +=1

In [533]:
next(g)

Restarting Loop


(DeviceArray([[4869, 2664, 2478, 4233, 3262, 3262, 1336, 2877,  414, 4386,
               2744, 3359, 5280,  750,    0,    0,    0,    0,    0,    0,
                  0,    0,    0],
              [ 668, 4016, 4646, 3481, 1716, 2861, 3277, 2761, 4386, 1228,
                423, 5176, 3239, 3189,  423,  575, 2018, 3359,    0,    0,
                  0,    0,    0],
              [ 665, 1388, 5432,  601, 1870,  914,  116, 4680,  601, 5924,
               3968,  638, 5938,  990, 2468,  568, 2054, 5560, 5660, 4278,
               5924, 5938, 3283]], dtype=int32),
 DeviceArray([0, 0, 0], dtype=int32),
 DeviceArray([1., 1., 1.], dtype=float32))

In [534]:
df_train.iloc[0]['Sentence']

'CDP was established on the initiative of institutional investors ; however , the annually published results also interest an increasing number of customers and other interest groups of the reporting companies .'

In [535]:
df_vocab.shape

(5992, 1)

### Model Definition

In [536]:
def classifier(vocab_sz=5920, emb_dims=256, output_dims=3, mode='train'):
    model = tl.Serial(
        tl.Embedding(vocab_size=vocab_sz, d_feature=emb_dims),
        tl.Mean(axis=1),
        tl.Dense(n_units=output_dims),
        tl.LogSoftmax()
        
    )
    return model

inputs, targets, weights = next(data_generator(df_train[:10].copy(), 4, df_vocab))
# model = classifier()
# model(inputs)
inputs.shape, inputs

Restarting Loop


((4, 18),
 DeviceArray([[ 668, 4016, 4646, 3481, 1716, 2861, 3277, 2761, 4386, 1228,
                423, 5176, 3239, 3189,  423,  575, 2018, 3359],
              [5083, 1819, 1815,  256, 3504,  234,  895, 2100, 5673,    0,
                  0,    0,    0,    0,    0,    0,    0,    0],
              [1934, 4707,   40, 3698, 3921,  566,    0,    0,    0,    0,
                  0,    0,    0,    0,    0,    0,    0,    0],
              [4869, 2664, 2478, 4233, 3262, 3262, 1336, 2877,  414, 4386,
               2744, 3359, 5280,  750,    0,    0,    0,    0]],            dtype=int32))

In [537]:
vocab_sz = 5920
emb_dims = 256
embed_layer = tl.Embedding(vocab_size=vocab_sz, d_feature=emb_dims)
# embed_layer.init(trax.shapes.signature(inputs))
# embed_layer(inputs)

In [538]:
trax.shapes.signature(inputs)

ShapeDtype{shape:(4, 18), dtype:int32}

In [539]:
el = embed_layer.init(trax.shapes.signature(inputs)); 

el[0].shape

(5920, 256)

In [540]:
# An example of and embedding layer
# rnd.seed(31)
tmp_embed = tl.Embedding(d_feature=256, vocab_size=5920)

# tmp_in_arr = np.array([[0.0, 1,2],
#                     [3,2,0]
#                    ])

# random_key = trax.fastmath.random.get_prng(seed=0)
# tmp_in_arr = trax.fastmath.random.normal(key = random_key, shape = (4, 18))
tmp_in_arr = inputs
tmp_embed.init(trax.shapes.signature(tmp_in_arr))

# Embedding layer will return an array of shape (batch size, vocab size, d_feature)
tmp_embedded_arr = tmp_embed(tmp_in_arr)
print(f"Shape of returned array is {tmp_embedded_arr.shape}")
# display(tmp_embedded_arr)
# display(tmp_embed)
# display(inputs)

Shape of returned array is (4, 18, 256)


In [541]:
tmp_mean = tl.Mean(axis=1)
tmp_mean(tmp_embedded_arr).shape

(4, 256)

### Train Eval Task Definition

In [542]:
def get_train_eval_task(df_train, df_valid, 
                        df_vocab, loop, batch_sz=16):
    random.seed(271)
    train_task = training.TrainTask(
            labeled_data=data_generator(df_train, batch_sz, df_vocab,  stop=False, shuffle=True, loop=loop,
                     unknown_token='__UNK__', pad_token='__PAD__',x_col='Sentence', y_col='Sentiment',
                     process=process_with_vocab, class_dict={'neutral':0, 'positive':1, 'negative':-1}),
            loss_layer=tl.WeightedCategoryCrossEntropy(),
            optimizer=trax.optimizers.Adam(0.01),
            n_steps_per_checkpoint=10)
    eval_task = training.EvalTask(
            labeled_data=data_generator(df_valid, batch_sz, df_vocab,  stop=False, shuffle=True, loop=loop,
                 unknown_token='__UNK__', pad_token='__PAD__',x_col='Sentence', y_col='Sentiment',
                 process=process_with_vocab, class_dict={'neutral':0, 'positive':1, 'negative':-1}),
            metrics=[tl.WeightedCategoryCrossEntropy(), tl.WeightedCategoryAccuracy()]
        
    )
    return train_task, eval_task

    
get_train_eval_task(df_train, df_valid, df_vocab, loop=True)

Restarting Loop
Restarting Loop


(<trax.supervised.training.TrainTask at 0x101ae78eabb0>,
 <trax.supervised.training.EvalTask at 0x101ae791a3d0>)

In [543]:
dir_path = './model/'

try:
    shutil.rmtree(dir_path)
except OSError as e:
    pass


output_dir = './model/'
output_dir_expand = os.path.expanduser(output_dir)
print(output_dir_expand)

./model/


### Model Training

In [544]:
def train_model(classifier, train_task, eval_task, n_steps, output_dir):
    random.seed(1234)
    training_loop = training.Loop(
                        classifier, 
                        train_task,
                        eval_tasks=eval_task,
                        output_dir=output_dir, 
                        random_seed=31)
    training_loop.run(n_steps = n_steps)
    return training_loop
    
    

In [545]:
train_task, eval_task = get_train_eval_task(df_train, df_valid, df_vocab, loop=True)
model = classifier(vocab_sz=len(df_vocab), emb_dims=256, output_dims=3, mode='train')
training_loop = train_model(model, train_task, eval_task, n_steps=100, output_dir=output_dir)

Restarting Loop
Restarting Loop


  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:



Step      1: Total number of trainable weights: 1534723
Step      1: Ran 1 train steps in 1.16 secs
Step      1: train WeightedCategoryCrossEntropy |  1.02272236


  with gzip_lib.GzipFile(fileobj=f, compresslevel=2) as gzipf:


Step      1: eval  WeightedCategoryCrossEntropy |  0.70422697
Step      1: eval      WeightedCategoryAccuracy |  0.37500000

Step     10: Ran 9 train steps in 5.69 secs
Step     10: train WeightedCategoryCrossEntropy |  0.73040473
Step     10: eval  WeightedCategoryCrossEntropy |  0.69985682
Step     10: eval      WeightedCategoryAccuracy |  0.62500000

Step     20: Ran 10 train steps in 4.34 secs
Step     20: train WeightedCategoryCrossEntropy |  0.56657821
Step     20: eval  WeightedCategoryCrossEntropy |  0.60906959
Step     20: eval      WeightedCategoryAccuracy |  0.68750000

Step     30: Ran 10 train steps in 1.63 secs
Step     30: train WeightedCategoryCrossEntropy |  0.54628116
Step     30: eval  WeightedCategoryCrossEntropy |  0.41477004
Step     30: eval      WeightedCategoryAccuracy |  0.62500000

Step     40: Ran 10 train steps in 2.34 secs
Step     40: train WeightedCategoryCrossEntropy |  0.50867021
Step     40: eval  WeightedCategoryCrossEntropy |  0.58183014
Step     40