Autocomplete
Contents
Autocomplete#
Imports#
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from fastcore.all import *
import seaborn as sns
from collections import Counter
import nltk
# Apply seaborn's default theme settings (called with no arguments).
sns.set()
## Calculating
# Fetch NLTK's 'punkt' package into the local nltk_data directory.
# Presumably required by nltk.word_tokenize used further down — confirm.
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data] /home/rahul.saraf/nltk_data...
[nltk_data] Package punkt is already up-to-date!
True
# Toy one-sentence corpus that the n-gram cells below tokenize and count.
corpus = "I am happy because I am Learning"
corpus
'I am happy because I am Learning'
# Width of the sliding window (3 tokens per n-gram).
n = 3

# Lower-case and tokenize the corpus, then pad it with n-1 start markers
# in front and a single end marker behind.
tokens = ["<s>"] * (n - 1) + nltk.word_tokenize(corpus.lower()) + ["<e>"]

# One row per window of n consecutive tokens.
ngrams = np.array(
    [tokens[start:start + n] for start in range(len(tokens) - n + 1)]
)
ngrams
array([['<s>', '<s>', 'i'],
['<s>', 'i', 'am'],
['i', 'am', 'happy'],
['am', 'happy', 'because'],
['happy', 'because', 'i'],
['because', 'i', 'am'],
['i', 'am', 'learning'],
['am', 'learning', '<e>']], dtype='<U8')
# All columns except the last: the n-1 token prefix of every n-gram.
ns1gram = ngrams[:, :-1]
ns1gram
array([['<s>', '<s>'],
['<s>', 'i'],
['i', 'am'],
['am', 'happy'],
['happy', 'because'],
['because', 'i'],
['i', 'am'],
['am', 'learning']], dtype='<U8')
# Last column only: the token that closes each n-gram (repeats included).
vocab = ngrams[:, -1]
vocab
array(['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '<e>'],
dtype='<U8')
# Zero-filled table: one row per n-gram prefix, one column per closing token.
# NOTE(review): ns1gram and vocab are not de-duplicated, so labels repeat
# (e.g. the row ('i', 'am') and the columns 'i' and 'am' each appear twice).
count_matrix = pd.DataFrame(0.0, index=ns1gram, columns = vocab); count_matrix
i | am | happy | because | i | am | learning | <e> | |
---|---|---|---|---|---|---|---|---|
(<s>, <s>) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
(<s>, i) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
(i, am) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
(am, happy) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
(happy, because) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
(because, i) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
(i, am) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
(am, learning) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# IPython help query: shows the signature and docstring of collections.Counter.
Counter?
Init signature: Counter(iterable=None, /, **kwds)
Docstring:
Dict subclass for counting hashable items. Sometimes called a bag
or multiset. Elements are stored as dictionary keys and their counts
are stored as dictionary values.
>>> c = Counter('abcdeabcdabcaba') # count elements from a string
>>> c.most_common(3) # three most common elements
[('a', 5), ('b', 4), ('c', 3)]
>>> sorted(c) # list all unique elements
['a', 'b', 'c', 'd', 'e']
>>> ''.join(sorted(c.elements())) # list elements with repetitions
'aaaaabbbbcccdde'
>>> sum(c.values()) # total of all counts
15
>>> c['a'] # count of letter 'a'
5
>>> for elem in 'shazam': # update counts from an iterable
... c[elem] += 1 # by adding 1 to each element's count
>>> c['a'] # now there are seven 'a'
7
>>> del c['b'] # remove all 'b'
>>> c['b'] # now there are zero 'b'
0
>>> d = Counter('simsalabim') # make another counter
>>> c.update(d) # add in the second counter
>>> c['a'] # now there are nine 'a'
9
>>> c.clear() # empty the counter
>>> c
Counter()
Note: If a count is set to zero or reduced to zero, it will remain
in the counter until the entry is deleted or the counter is cleared:
>>> c = Counter('aaabbc')
>>> c['b'] -= 2 # reduce the count of 'b' by two
>>> c.most_common() # 'b' is still in, but its count is zero
[('a', 3), ('c', 1), ('b', 0)]
Init docstring:
Create a new, empty Counter object. And if given, count elements
from an input iterable. Or, initialize the count from another mapping
of elements to their counts.
>>> c = Counter() # a new, empty counter
>>> c = Counter('gallahad') # a new counter from an iterable
>>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping
>>> c = Counter(a=4, b=2) # a new counter from keyword args
File: /opt/anaconda/envs/aiking/lib/python3.9/collections/__init__.py
Type: type
Subclasses: _OrderedCounter, FreqDist
# Print each prefix row of ns1gram next to the token that follows it.
for i, v in zip(ns1gram, vocab):
    print(i, v)
    # Abandoned first attempt at incrementing the table directly:
    # print(count_matrix[i,v])
    # count_matrix[i,v] = count_matrix[i,v]+1
['<s>' '<s>'] i
['<s>' 'i'] am
['i' 'am'] happy
['am' 'happy'] because
['happy' 'because'] i
['because' 'i'] am
['i' 'am'] learning
['am' 'learning'] <e>
# Inspect the first row label of the table; it is shown as a tuple of tokens.
count_matrix.index[0] #count_matrix
('<s>', '<s>')
# Same walk as before, but through pd.Index wrappers: each prefix now
# prints as a tuple, matching the form of the table's row labels.
for i, c in zip(pd.Index(ns1gram), pd.Index(vocab)):
    print(i, c)
('<s>', '<s>') i
('<s>', 'i') am
('i', 'am') happy
('am', 'happy') because
('happy', 'because') i
('because', 'i') am
('i', 'am') learning
('am', 'learning') <e>
# Check that a prefix taken from pd.Index(ns1gram) is a valid row label of the table.
pd.Index(ns1gram)[0] in count_matrix.index
True
# Scratch cell: prints a fixed string.
print("Hello")
Hello
# Keep the first prefix label (a tuple of tokens) for lookup experiments.
a = pd.Index(ns1gram)[0]; a
# Abandoned positional-lookup attempt:
# count_matrix.iloc[[a], :]
('<s>', '<s>')
# Add one to the (prefix, following token) cell for every n-gram in the corpus.
for a, v in zip(pd.Index(ns1gram), pd.Index(vocab)):
    # NOTE(review): row and column labels repeat in count_matrix, so this
    # label-based increment hits every row/column that carries the label
    # (both 'i' columns and both ('i', 'am') rows end up with the same values).
    count_matrix.loc[[a], v] += 1
count_matrix
i | am | happy | because | i | am | learning | <e> | |
---|---|---|---|---|---|---|---|---|
(<s>, <s>) | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
(<s>, i) | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
(i, am) | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
(am, happy) | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
(happy, because) | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
(because, i) | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
(i, am) | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
(am, learning) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
# Turn the counts into relative frequencies.
# NOTE(review): sum(axis=0) produces per-column totals, so every column
# (following token) is scaled to sum to 1. A conditional P(token | prefix)
# would need per-row totals instead — confirm which is intended.
prob_matrix = count_matrix/count_matrix.sum(axis=0); prob_matrix
i | am | happy | because | i | am | learning | <e> | |
---|---|---|---|---|---|---|---|---|
(<s>, <s>) | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 |
(<s>, i) | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 |
(i, am) | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 |
(am, happy) | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
(happy, because) | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 |
(because, i) | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 |
(i, am) | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 |
(am, learning) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
# Consolidated pipeline: corpus -> padded tokens -> n-grams -> count matrix
# -> conditional-probability matrix P(word | context).
#
# Fixes over the earlier exploratory cells:
#   * every distinct context / word labels exactly one row / column
#     (previously repeated n-grams produced duplicated labels);
#   * probabilities are normalised per context row rather than per column,
#     so each row sums to 1;
#   * the stray reference to the older `ns1gram` variable is gone.
# NOTE: a table rendered from the previous column-normalised version may
# still be shown after this cell in a saved export; re-run to refresh it.
n = 3  # tokens per n-gram

# Lower-case and tokenize, then pad with n-1 start markers and one end marker.
tokens = nltk.word_tokenize(corpus.lower())
tokens = ["<s>"] * (n - 1) + tokens + ["<e>"]

# One row per window of n consecutive tokens.
ngrams = np.array([tokens[i:i + n] for i in range(len(tokens) - n + 1)])
ns1grams = ngrams[:, :-1]  # first n-1 tokens of each n-gram (the context)
vocab = ngrams[:, -1]      # last token of each n-gram (repeats included)

# Plain Python tuples/strings make hashable, de-duplicable labels.
contexts = [tuple(row) for row in ns1grams.tolist()]
next_words = vocab.tolist()

# dict.fromkeys keeps first-seen order while dropping repeats.
row_labels = list(dict.fromkeys(contexts))
col_labels = list(dict.fromkeys(next_words))
row_pos = {label: pos for pos, label in enumerate(row_labels)}
col_pos = {label: pos for pos, label in enumerate(col_labels)}

# Tally how often each word follows each context.
counts = np.zeros((len(row_labels), len(col_labels)))
for context, word in zip(contexts, next_words):
    counts[row_pos[context], col_pos[word]] += 1

count_matrix = pd.DataFrame(
    counts,
    # tupleize_cols=False keeps each context as a single tuple label
    # instead of expanding the tuples into a MultiIndex.
    index=pd.Index(row_labels, tupleize_cols=False),
    columns=col_labels,
)

# P(word | context) = count(context, word) / count(context):
# divide every row by its own total. Each context occurs at least once,
# so no row total is zero.
prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
prob_matrix
i | am | happy | because | i | am | learning | <e> | |
---|---|---|---|---|---|---|---|---|
(<s>, <s>) | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 |
(<s>, i) | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 |
(i, am) | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 |
(am, happy) | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
(happy, because) | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 |
(because, i) | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 |
(i, am) | 0.0 | 0.0 | 0.5 | 0.0 | 0.0 | 0.0 | 0.5 | 0.0 |
(am, learning) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |