Autocomplete

import numpy as np 
import scipy as sp 
import matplotlib.pyplot as plt
import pandas as pd
from fastcore.all import *
import seaborn as sns
from collections import Counter
import nltk

sns.set()

## Calculating

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/rahul.saraf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

True

corpus =  "I am happy because I am Learning"
corpus

'I am happy because I am Learning'

n = 3
tokens = nltk.word_tokenize(corpus.lower()); tokens
tokens = ["<s>"]*(n-1)+ tokens +['<e>']; tokens
ngrams = np.array([tokens[i:i+n] for i in range(len(tokens)-n+1)]); ngrams

array([['<s>', '<s>', 'i'],
       ['<s>', 'i', 'am'],
       ['i', 'am', 'happy'],
       ['am', 'happy', 'because'],
       ['happy', 'because', 'i'],
       ['because', 'i', 'am'],
       ['i', 'am', 'learning'],
       ['am', 'learning', '<e>']], dtype='<U8')

ns1gram = ngrams[:,:-1]; ns1gram

array([['<s>', '<s>'],
       ['<s>', 'i'],
       ['i', 'am'],
       ['am', 'happy'],
       ['happy', 'because'],
       ['because', 'i'],
       ['i', 'am'],
       ['am', 'learning']], dtype='<U8')

vocab = ngrams[:,-1]; vocab

array(['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '<e>'],
      dtype='<U8')

count_matrix = pd.DataFrame(0.0, index=ns1gram, columns = vocab); count_matrix

	i	am	happy	because	i	am	learning	<e>
(<s>, <s>)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
(<s>, i)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
(i, am)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
(am, happy)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
(happy, because)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
(because, i)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
(i, am)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
(am, learning)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

Counter?

Init signature: Counter(iterable=None, /, **kwds)
Docstring:     
Dict subclass for counting hashable items.  Sometimes called a bag
or multiset.  Elements are stored as dictionary keys and their counts
are stored as dictionary values.

>>> c = Counter('abcdeabcdabcaba')  # count elements from a string

>>> c.most_common(3)                # three most common elements
[('a', 5), ('b', 4), ('c', 3)]
>>> sorted(c)                       # list all unique elements
['a', 'b', 'c', 'd', 'e']
>>> ''.join(sorted(c.elements()))   # list elements with repetitions
'aaaaabbbbcccdde'
>>> sum(c.values())                 # total of all counts
15

>>> c['a']                          # count of letter 'a'
5
>>> for elem in 'shazam':           # update counts from an iterable
...     c[elem] += 1                # by adding 1 to each element's count
>>> c['a']                          # now there are seven 'a'
7
>>> del c['b']                      # remove all 'b'
>>> c['b']                          # now there are zero 'b'
0

>>> d = Counter('simsalabim')       # make another counter
>>> c.update(d)                     # add in the second counter
>>> c['a']                          # now there are nine 'a'
9

>>> c.clear()                       # empty the counter
>>> c
Counter()

Note:  If a count is set to zero or reduced to zero, it will remain
in the counter until the entry is deleted or the counter is cleared:

>>> c = Counter('aaabbc')
>>> c['b'] -= 2                     # reduce the count of 'b' by two
>>> c.most_common()                 # 'b' is still in, but its count is zero
[('a', 3), ('c', 1), ('b', 0)]
Init docstring:
Create a new, empty Counter object.  And if given, count elements
from an input iterable.  Or, initialize the count from another mapping
of elements to their counts.

>>> c = Counter()                           # a new, empty counter
>>> c = Counter('gallahad')                 # a new counter from an iterable
>>> c = Counter({'a': 4, 'b': 2})           # a new counter from a mapping
>>> c = Counter(a=4, b=2)                   # a new counter from keyword args
File:           /opt/anaconda/envs/aiking/lib/python3.9/collections/__init__.py
Type:           type
Subclasses:     _OrderedCounter, FreqDist

for i, v in zip(ns1gram,vocab):
    print(i,v)
    # print(count_matrix[i,v])
    # count_matrix[i,v] = count_matrix[i,v]+1

['<s>' '<s>'] i
['<s>' 'i'] am
['i' 'am'] happy
['am' 'happy'] because
['happy' 'because'] i
['because' 'i'] am
['i' 'am'] learning
['am' 'learning'] <e>

count_matrix.index[0] #count_matrix

('<s>', '<s>')

for i, c in zip(pd.Index(ns1gram), pd.Index(vocab)):
    print(i, c)

('<s>', '<s>') i
('<s>', 'i') am
('i', 'am') happy
('am', 'happy') because
('happy', 'because') i
('because', 'i') am
('i', 'am') learning
('am', 'learning') <e>

pd.Index(ns1gram)[0] in count_matrix.index

True

print("Hello")

Hello

a = pd.Index(ns1gram)[0]; a
# count_matrix.iloc[[a], :]

('<s>', '<s>')

for a, v in zip(pd.Index(ns1gram),pd.Index(vocab)):
    count_matrix.loc[[a], v] +=1

count_matrix

	i	am	happy	because	i	am	learning	<e>
(<s>, <s>)	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0
(<s>, i)	0.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0
(i, am)	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0
(am, happy)	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0
(happy, because)	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0
(because, i)	0.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0
(i, am)	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0
(am, learning)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0

# prob_matrix = 
prob_matrix = count_matrix/count_matrix.sum(axis=0); prob_matrix

	i	am	happy	because	i	am	learning	<e>
(<s>, <s>)	0.5	0.0	0.0	0.0	0.5	0.0	0.0	0.0
(<s>, i)	0.0	0.5	0.0	0.0	0.0	0.5	0.0	0.0
(i, am)	0.0	0.0	0.5	0.0	0.0	0.0	0.5	0.0
(am, happy)	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0
(happy, because)	0.5	0.0	0.0	0.0	0.5	0.0	0.0	0.0
(because, i)	0.0	0.5	0.0	0.0	0.0	0.5	0.0	0.0
(i, am)	0.0	0.0	0.5	0.0	0.0	0.0	0.5	0.0
(am, learning)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0

n = 3
tokens = nltk.word_tokenize(corpus.lower()); tokens
tokens = ["<s>"]*(n-1)+ tokens +['<e>']; tokens
ngrams = np.array([tokens[i:i+n] for i in range(len(tokens)-n+1)]); ngrams
ns1grams = ngrams[:,:-1]; ns1gram
vocab = ngrams[:,-1]; vocab
count_matrix = pd.DataFrame(0.0, index=ns1grams, columns = vocab); count_matrix
for a, v in zip(pd.Index(ns1grams),pd.Index(vocab)):count_matrix.loc[[a], v] +=1
prob_matrix = count_matrix/count_matrix.sum(axis=0); prob_matrix

	i	am	happy	because	i	am	learning	<e>
(<s>, <s>)	0.5	0.0	0.0	0.0	0.5	0.0	0.0	0.0
(<s>, i)	0.0	0.5	0.0	0.0	0.0	0.5	0.0	0.0
(i, am)	0.0	0.0	0.5	0.0	0.0	0.0	0.5	0.0
(am, happy)	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0
(happy, because)	0.5	0.0	0.0	0.0	0.5	0.0	0.0	0.0
(because, i)	0.0	0.5	0.0	0.0	0.0	0.5	0.0	0.0
(i, am)	0.0	0.0	0.5	0.0	0.0	0.0	0.5	0.0
(am, learning)	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0

AIBook

Contents

Autocomplete#

Imports#