# Autocomplete

## Imports

import numpy as np 
import scipy as sp 
import matplotlib.pyplot as plt
import pandas as pd
from fastcore.all import *
import seaborn as sns
from collections import Counter
import nltk
sns.set()
## Calculating 
# Fetch the Punkt tokenizer models required by nltk.word_tokenize below.
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data]     /home/rahul.saraf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
# Toy training corpus (a single sentence); echoed for inspection.
corpus =  "I am happy because I am Learning"
corpus
'I am happy because I am Learning'
# Trigram setup: lowercase + tokenize the corpus, pad with n-1 start
# markers and a single end marker, then take every consecutive
# length-n window of tokens.
n = 3
tokens = nltk.word_tokenize(corpus.lower()); tokens
tokens = ["<s>"] * (n - 1) + tokens + ["<e>"]; tokens
# zip of n shifted views yields each length-n window exactly once.
ngrams = np.array(list(zip(*(tokens[k:] for k in range(n))))); ngrams
array([['<s>', '<s>', 'i'],
       ['<s>', 'i', 'am'],
       ['i', 'am', 'happy'],
       ['am', 'happy', 'because'],
       ['happy', 'because', 'i'],
       ['because', 'i', 'am'],
       ['i', 'am', 'learning'],
       ['am', 'learning', '<e>']], dtype='<U8')
# Context half of each trigram: everything but the last column (first n-1 words).
ns1gram = ngrams[:,:-1]; ns1gram
array([['<s>', '<s>'],
       ['<s>', 'i'],
       ['i', 'am'],
       ['am', 'happy'],
       ['happy', 'because'],
       ['because', 'i'],
       ['i', 'am'],
       ['am', 'learning']], dtype='<U8')
# Target half of each trigram: the final token of every window.
# NOTE(review): contains repeated words ('i' and 'am' twice) — these become
# column labels below, so the table will carry duplicate columns.
vocab = ngrams[:,-1]; vocab
array(['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '<e>'],
      dtype='<U8')
# Zero-initialized (context -> next-word) count table.
# NOTE(review): both the index (e.g. ('i', 'am') appears twice) and the
# columns ('i', 'am' twice) contain duplicate labels, so later label-based
# increments touch several cells at once — verify this is intended.
count_matrix = pd.DataFrame(0.0, index=ns1gram, columns = vocab); count_matrix
i am happy because i am learning <e>
(<s>, <s>) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
(<s>, i) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
(i, am) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
(am, happy) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
(happy, because) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
(because, i) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
(i, am) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
(am, learning) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Counter?
Init signature: Counter(iterable=None, /, **kwds)
Docstring:     
Dict subclass for counting hashable items.  Sometimes called a bag
or multiset.  Elements are stored as dictionary keys and their counts
are stored as dictionary values.

>>> c = Counter('abcdeabcdabcaba')  # count elements from a string

>>> c.most_common(3)                # three most common elements
[('a', 5), ('b', 4), ('c', 3)]
>>> sorted(c)                       # list all unique elements
['a', 'b', 'c', 'd', 'e']
>>> ''.join(sorted(c.elements()))   # list elements with repetitions
'aaaaabbbbcccdde'
>>> sum(c.values())                 # total of all counts
15

>>> c['a']                          # count of letter 'a'
5
>>> for elem in 'shazam':           # update counts from an iterable
...     c[elem] += 1                # by adding 1 to each element's count
>>> c['a']                          # now there are seven 'a'
7
>>> del c['b']                      # remove all 'b'
>>> c['b']                          # now there are zero 'b'
0

>>> d = Counter('simsalabim')       # make another counter
>>> c.update(d)                     # add in the second counter
>>> c['a']                          # now there are nine 'a'
9

>>> c.clear()                       # empty the counter
>>> c
Counter()

Note:  If a count is set to zero or reduced to zero, it will remain
in the counter until the entry is deleted or the counter is cleared:

>>> c = Counter('aaabbc')
>>> c['b'] -= 2                     # reduce the count of 'b' by two
>>> c.most_common()                 # 'b' is still in, but its count is zero
[('a', 3), ('c', 1), ('b', 0)]
Init docstring:
Create a new, empty Counter object.  And if given, count elements
from an input iterable.  Or, initialize the count from another mapping
of elements to their counts.

>>> c = Counter()                           # a new, empty counter
>>> c = Counter('gallahad')                 # a new counter from an iterable
>>> c = Counter({'a': 4, 'b': 2})           # a new counter from a mapping
>>> c = Counter(a=4, b=2)                   # a new counter from keyword args
File:           /opt/anaconda/envs/aiking/lib/python3.9/collections/__init__.py
Type:           type
Subclasses:     _OrderedCounter, FreqDist
# Print each (context, next-word) training pair side by side.
for context, target in zip(ns1gram, vocab):
    print(context, target)
['<s>' '<s>'] i
['<s>' 'i'] am
['i' 'am'] happy
['am' 'happy'] because
['happy' 'because'] i
['because' 'i'] am
['i' 'am'] learning
['am' 'learning'] <e>
# Peek at the first index label — a tuple of the two context words.
count_matrix.index[0] #count_matrix
('<s>', '<s>')
# Same pairing as above, but wrapped in pd.Index so each context
# prints as a plain tuple rather than a numpy row.
for prefix, nxt in zip(pd.Index(ns1gram), pd.Index(vocab)):
    print(prefix, nxt)
('<s>', '<s>') i
('<s>', 'i') am
('i', 'am') happy
('am', 'happy') because
('happy', 'because') i
('because', 'i') am
('i', 'am') learning
('am', 'learning') <e>
# Confirm a tuple-ified context can be looked up in the table's index.
pd.Index(ns1gram)[0] in count_matrix.index
True
# Quick sanity-check print (kernel liveness).
print("Hello")
Hello
# Grab the first context label for .loc-indexing experiments below.
a = pd.Index(ns1gram)[0]; a
# count_matrix.iloc[[a], :]
('<s>', '<s>')
# Tally each (context, next-word) pair into the count table.
# NOTE(review): because the index/columns hold duplicate labels, each
# .loc[[a], v] increment updates every row matching a and every column
# matching v — the doubled rows in the printout below show this effect.
for a, v in zip(pd.Index(ns1gram),pd.Index(vocab)):
    count_matrix.loc[[a], v] +=1
count_matrix
i am happy because i am learning <e>
(<s>, <s>) 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
(<s>, i) 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
(i, am) 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
(am, happy) 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
(happy, because) 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
(because, i) 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
(i, am) 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
(am, learning) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
# Conditional next-word probabilities P(next | context): normalize each ROW
# of the count matrix by its own total. The original divided by column sums
# (sum(axis=0)), which yields count(ctx, w) / count(w) — a P(context | word)-
# style quantity — instead of the language-model probability
# count(ctx, w) / count(ctx). .to_numpy() makes the division positional,
# sidestepping label alignment on the duplicate index.
prob_matrix = count_matrix.div(count_matrix.sum(axis=1).to_numpy(), axis=0); prob_matrix
i am happy because i am learning <e>
(<s>, <s>) 0.5 0.0 0.0 0.0 0.5 0.0 0.0 0.0
(<s>, i) 0.0 0.5 0.0 0.0 0.0 0.5 0.0 0.0
(i, am) 0.0 0.0 0.5 0.0 0.0 0.0 0.5 0.0
(am, happy) 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
(happy, because) 0.5 0.0 0.0 0.0 0.5 0.0 0.0 0.0
(because, i) 0.0 0.5 0.0 0.0 0.0 0.5 0.0 0.0
(i, am) 0.0 0.0 0.5 0.0 0.0 0.0 0.5 0.0
(am, learning) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
# End-to-end rebuild of the trigram model: tokenize, pad, window into
# trigrams, count (context -> next word), then row-normalize to get
# conditional probabilities P(next | context).
n = 3
tokens = nltk.word_tokenize(corpus.lower()); tokens
tokens = ["<s>"] * (n - 1) + tokens + ['<e>']; tokens
ngrams = np.array([tokens[i:i + n] for i in range(len(tokens) - n + 1)]); ngrams
# fix: echo the newly assigned ns1grams (the original echoed the stale ns1gram)
ns1grams = ngrams[:, :-1]; ns1grams
vocab = ngrams[:, -1]; vocab
# Deduplicate labels before building the table: with duplicate row/column
# labels (e.g. ('i', 'am') twice, 'i' twice) every .loc-based increment
# updates all matching cells at once and inflates the counts.
contexts = [tuple(row) for row in ns1grams]
uniq_contexts = list(dict.fromkeys(contexts))      # order-preserving unique
uniq_words = list(dict.fromkeys(vocab))
count_matrix = pd.DataFrame(0.0, index=uniq_contexts, columns=uniq_words)
for ctx, word in zip(contexts, vocab):
    count_matrix.loc[[ctx], word] += 1
count_matrix
# Conditional probabilities: each row divided by its own total (axis=1 sums),
# not by column sums as before; .to_numpy() keeps the division positional.
prob_matrix = count_matrix.div(count_matrix.sum(axis=1).to_numpy(), axis=0); prob_matrix
i am happy because i am learning <e>
(<s>, <s>) 0.5 0.0 0.0 0.0 0.5 0.0 0.0 0.0
(<s>, i) 0.0 0.5 0.0 0.0 0.0 0.5 0.0 0.0
(i, am) 0.0 0.0 0.5 0.0 0.0 0.0 0.5 0.0
(am, happy) 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
(happy, because) 0.5 0.0 0.0 0.0 0.5 0.0 0.0 0.0
(because, i) 0.0 0.5 0.0 0.0 0.0 0.5 0.0 0.0
(i, am) 0.0 0.0 0.5 0.0 0.0 0.0 0.5 0.0
(am, learning) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0