{
"cells": [
{
"cell_type": "markdown",
"id": "dae44f67-3129-4313-a7d9-a6c11d243473",
"metadata": {},
"source": [
"# Autocomplete"
]
},
{
"cell_type": "markdown",
"id": "2758e72f-d107-44dc-9c47-185e07cacccc",
"metadata": {},
"source": [
"# Imports"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "0e1f39c9-101b-44f6-9e53-0390a61627db",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np \n",
"import scipy as sp \n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from fastcore.all import *\n",
"import seaborn as sns\n",
"from collections import Counter\n",
"import nltk"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "bb9c8620-9a30-4b29-b55f-d8fedf89493b",
"metadata": {},
"outputs": [],
"source": [
"sns.set()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "365dd204-58f6-4e55-86b4-9995d47cd0b1",
"metadata": {},
"outputs": [],
"source": [
"## Calculating "
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "f0e4d68f-a8b9-4018-9c94-15c3500493bd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] /home/rahul.saraf/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "4856fc38-8055-420a-9af5-7ba95e4c56e2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'I am happy because I am Learning'"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"corpus = \"I am happy because I am Learning\"\n",
"corpus"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "beb24ed2-993e-4225-aa51-d5d11cf241e6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([['<s>', '<s>', 'i'],\n",
" ['<s>', 'i', 'am'],\n",
" ['i', 'am', 'happy'],\n",
" ['am', 'happy', 'because'],\n",
" ['happy', 'because', 'i'],\n",
" ['because', 'i', 'am'],\n",
" ['i', 'am', 'learning'],\n",
" ['am', 'learning', '<e>']], dtype='<U8')"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n = 3\n",
"tokens = nltk.word_tokenize(corpus.lower()); tokens\n",
"tokens = [\"<s>\"]*(n-1)+ tokens +['<e>']; tokens\n",
"ngrams = np.array([tokens[i:i+n] for i in range(len(tokens)-n+1)]); ngrams"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "99cb9a3d-5449-467e-9963-bffc4c6b0e1a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([['<s>', '<s>'],\n",
" ['<s>', 'i'],\n",
" ['i', 'am'],\n",
" ['am', 'happy'],\n",
" ['happy', 'because'],\n",
" ['because', 'i'],\n",
" ['i', 'am'],\n",
" ['am', 'learning']], dtype='<U8')"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ns1gram = ngrams[:,:-1]; ns1gram"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "fd6f103a-1c20-4e79-96ed-b889ab771d0f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '<e>'],\n",
" dtype='<U8')"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab = ngrams[:,-1]; vocab"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "8fbd557c-6105-433a-930c-62ee0bda5635",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>i</th>\n",
" <th>am</th>\n",
" <th>happy</th>\n",
" <th>because</th>\n",
" <th>i</th>\n",
" <th>am</th>\n",
" <th>learning</th>\n",
" <th><e></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>(<s>, <s>)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(<s>, i)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(i, am)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(am, happy)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(happy, because)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(because, i)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(i, am)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(am, learning)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" i am happy because i am learning <e>\n",
"(<s>, <s>) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
"(<s>, i) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
"(i, am) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
"(am, happy) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
"(happy, because) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
"(because, i) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
"(i, am) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
"(am, learning) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count_matrix = pd.DataFrame(0.0, index=ns1gram, columns = vocab); count_matrix"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "8fcbb5f8-c9bb-4ff6-b944-bb6ce96970b2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[0;31mInit signature:\u001b[0m \u001b[0mCounter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterable\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m/\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m \n",
"Dict subclass for counting hashable items. Sometimes called a bag\n",
"or multiset. Elements are stored as dictionary keys and their counts\n",
"are stored as dictionary values.\n",
"\n",
">>> c = Counter('abcdeabcdabcaba') # count elements from a string\n",
"\n",
">>> c.most_common(3) # three most common elements\n",
"[('a', 5), ('b', 4), ('c', 3)]\n",
">>> sorted(c) # list all unique elements\n",
"['a', 'b', 'c', 'd', 'e']\n",
">>> ''.join(sorted(c.elements())) # list elements with repetitions\n",
"'aaaaabbbbcccdde'\n",
">>> sum(c.values()) # total of all counts\n",
"15\n",
"\n",
">>> c['a'] # count of letter 'a'\n",
"5\n",
">>> for elem in 'shazam': # update counts from an iterable\n",
"... c[elem] += 1 # by adding 1 to each element's count\n",
">>> c['a'] # now there are seven 'a'\n",
"7\n",
">>> del c['b'] # remove all 'b'\n",
">>> c['b'] # now there are zero 'b'\n",
"0\n",
"\n",
">>> d = Counter('simsalabim') # make another counter\n",
">>> c.update(d) # add in the second counter\n",
">>> c['a'] # now there are nine 'a'\n",
"9\n",
"\n",
">>> c.clear() # empty the counter\n",
">>> c\n",
"Counter()\n",
"\n",
"Note: If a count is set to zero or reduced to zero, it will remain\n",
"in the counter until the entry is deleted or the counter is cleared:\n",
"\n",
">>> c = Counter('aaabbc')\n",
">>> c['b'] -= 2 # reduce the count of 'b' by two\n",
">>> c.most_common() # 'b' is still in, but its count is zero\n",
"[('a', 3), ('c', 1), ('b', 0)]\n",
"\u001b[0;31mInit docstring:\u001b[0m\n",
"Create a new, empty Counter object. And if given, count elements\n",
"from an input iterable. Or, initialize the count from another mapping\n",
"of elements to their counts.\n",
"\n",
">>> c = Counter() # a new, empty counter\n",
">>> c = Counter('gallahad') # a new counter from an iterable\n",
">>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping\n",
">>> c = Counter(a=4, b=2) # a new counter from keyword args\n",
"\u001b[0;31mFile:\u001b[0m /opt/anaconda/envs/aiking/lib/python3.9/collections/__init__.py\n",
"\u001b[0;31mType:\u001b[0m type\n",
"\u001b[0;31mSubclasses:\u001b[0m _OrderedCounter, FreqDist\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"Counter?"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "ec079f36-b6e6-41fc-ae24-2ae33cf72790",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['<s>' '<s>'] i\n",
"['<s>' 'i'] am\n",
"['i' 'am'] happy\n",
"['am' 'happy'] because\n",
"['happy' 'because'] i\n",
"['because' 'i'] am\n",
"['i' 'am'] learning\n",
"['am' 'learning'] <e>\n"
]
}
],
"source": [
"for i, v in zip(ns1gram,vocab):\n",
" print(i,v)\n",
" # print(count_matrix[i,v])\n",
" # count_matrix[i,v] = count_matrix[i,v]+1"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "57a21f23-eefe-4ecc-bf5b-31f35a87b545",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('<s>', '<s>')"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count_matrix.index[0] #count_matrix"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "3ef45d9f-facf-4b1b-ac6a-6d46f17f881d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('<s>', '<s>') i\n",
"('<s>', 'i') am\n",
"('i', 'am') happy\n",
"('am', 'happy') because\n",
"('happy', 'because') i\n",
"('because', 'i') am\n",
"('i', 'am') learning\n",
"('am', 'learning') <e>\n"
]
}
],
"source": [
"for i, c in zip(pd.Index(ns1gram), pd.Index(vocab)):\n",
" print(i, c)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4845b200-2f99-4b14-b406-d235e902dc5f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 116,
"id": "9a2a6d17-5aa4-4d2d-93b7-a4915a3245cf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.Index(ns1gram)[0] in count_matrix.index"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "21d42586-6004-490b-a762-a80fff309d5a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hello\n"
]
}
],
"source": [
"print(\"Hello\")"
]
},
{
"cell_type": "code",
"execution_count": 123,
"id": "25feef6a-be2f-4013-8ec1-c375339c6773",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('<s>', '<s>')"
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = pd.Index(ns1gram)[0]; a\n",
"# count_matrix.iloc[[a], :]"
]
},
{
"cell_type": "code",
"execution_count": 132,
"id": "7f901d49-f0c4-4772-ab17-fd8c77248dee",
"metadata": {},
"outputs": [],
"source": [
"for a, v in zip(pd.Index(ns1gram),pd.Index(vocab)):\n",
" count_matrix.loc[[a], v] +=1"
]
},
{
"cell_type": "code",
"execution_count": 133,
"id": "5af3053d-2c38-44f8-853e-754f0a3594b9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>i</th>\n",
" <th>am</th>\n",
" <th>happy</th>\n",
" <th>because</th>\n",
" <th>i</th>\n",
" <th>am</th>\n",
" <th>learning</th>\n",
" <th><e></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>(<s>, <s>)</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(<s>, i)</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(i, am)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(am, happy)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(happy, because)</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(because, i)</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(i, am)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(am, learning)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" i am happy because i am learning <e>\n",
"(<s>, <s>) 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
"(<s>, i) 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0\n",
"(i, am) 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0\n",
"(am, happy) 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0\n",
"(happy, because) 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
"(because, i) 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0\n",
"(i, am) 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0\n",
"(am, learning) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0"
]
},
"execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count_matrix"
]
},
{
"cell_type": "code",
"execution_count": 137,
"id": "888170a1-431b-493f-b493-b564188b217b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>i</th>\n",
" <th>am</th>\n",
" <th>happy</th>\n",
" <th>because</th>\n",
" <th>i</th>\n",
" <th>am</th>\n",
" <th>learning</th>\n",
" <th><e></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>(<s>, <s>)</th>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(<s>, i)</th>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(i, am)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(am, happy)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(happy, because)</th>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(because, i)</th>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(i, am)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(am, learning)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" i am happy because i am learning <e>\n",
"(<s>, <s>) 0.5 0.0 0.0 0.0 0.5 0.0 0.0 0.0\n",
"(<s>, i) 0.0 0.5 0.0 0.0 0.0 0.5 0.0 0.0\n",
"(i, am) 0.0 0.0 0.5 0.0 0.0 0.0 0.5 0.0\n",
"(am, happy) 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0\n",
"(happy, because) 0.5 0.0 0.0 0.0 0.5 0.0 0.0 0.0\n",
"(because, i) 0.0 0.5 0.0 0.0 0.0 0.5 0.0 0.0\n",
"(i, am) 0.0 0.0 0.5 0.0 0.0 0.0 0.5 0.0\n",
"(am, learning) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0"
]
},
"execution_count": 137,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# prob_matrix = \n",
"prob_matrix = count_matrix/count_matrix.sum(axis=0); prob_matrix"
]
},
{
"cell_type": "code",
"execution_count": 146,
"id": "e933d1e1-6f35-44b9-a818-d6e9acceced4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>i</th>\n",
" <th>am</th>\n",
" <th>happy</th>\n",
" <th>because</th>\n",
" <th>i</th>\n",
" <th>am</th>\n",
" <th>learning</th>\n",
" <th><e></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>(<s>, <s>)</th>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(<s>, i)</th>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(i, am)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(am, happy)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(happy, because)</th>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(because, i)</th>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(i, am)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.5</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(am, learning)</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" i am happy because i am learning <e>\n",
"(<s>, <s>) 0.5 0.0 0.0 0.0 0.5 0.0 0.0 0.0\n",
"(<s>, i) 0.0 0.5 0.0 0.0 0.0 0.5 0.0 0.0\n",
"(i, am) 0.0 0.0 0.5 0.0 0.0 0.0 0.5 0.0\n",
"(am, happy) 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0\n",
"(happy, because) 0.5 0.0 0.0 0.0 0.5 0.0 0.0 0.0\n",
"(because, i) 0.0 0.5 0.0 0.0 0.0 0.5 0.0 0.0\n",
"(i, am) 0.0 0.0 0.5 0.0 0.0 0.0 0.5 0.0\n",
"(am, learning) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0"
]
},
"execution_count": 146,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n = 3\n",
"tokens = nltk.word_tokenize(corpus.lower()); tokens\n",
"tokens = [\"<s>\"]*(n-1)+ tokens +['<e>']; tokens\n",
"ngrams = np.array([tokens[i:i+n] for i in range(len(tokens)-n+1)]); ngrams\n",
"ns1grams = ngrams[:,:-1]; ns1gram\n",
"vocab = ngrams[:,-1]; vocab\n",
"count_matrix = pd.DataFrame(0.0, index=ns1grams, columns = vocab); count_matrix\n",
"for a, v in zip(pd.Index(ns1grams),pd.Index(vocab)):count_matrix.loc[[a], v] +=1\n",
"prob_matrix = count_matrix/count_matrix.sum(axis=0); prob_matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e968eba-6c77-4ac7-97be-b6548e94a4e4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}