{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dae44f67-3129-4313-a7d9-a6c11d243473",
   "metadata": {},
   "source": [
    "# Autocomplete\n",
    "\n",
    "Build an n-gram language model for a toy corpus: count how often each word follows each $(n-1)$-word context, then normalise the counts into conditional probabilities $P(w \\mid c)$."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2758e72f-d107-44dc-9c47-185e07cacccc",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e1f39c9-101b-44f6-9e53-0390a61627db",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import scipy as sp\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "# NOTE(review): nothing in this notebook visibly needs the star import; prefer explicit names.\n",
    "from fastcore.all import *\n",
    "import seaborn as sns\n",
    "from collections import Counter\n",
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb9c8620-9a30-4b29-b55f-d8fedf89493b",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "365dd204-58f6-4e55-86b4-9995d47cd0b1",
   "metadata": {},
   "source": [
    "## Calculating n-gram counts and probabilities\n",
    "\n",
    "Step by step on a single sentence, with $n = 3$ (trigrams)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0e4d68f-a8b9-4018-9c94-15c3500493bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenizer data for nltk.word_tokenize; quiet=True keeps local paths out of the saved output.\n",
    "nltk.download('punkt', quiet=True);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4856fc38-8055-420a-9af5-7ba95e4c56e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus = \"I am happy because I am Learning\"\n",
    "corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "beb24ed2-993e-4225-aa51-d5d11cf241e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "n = 3  # n-gram order: each context is the previous n-1 tokens\n",
    "tokens = nltk.word_tokenize(corpus.lower())\n",
    "# Pad with n-1 start markers so the first real word has a full context, plus one end marker.\n",
    "tokens = ['<s>'] * (n - 1) + tokens + ['<e>']\n",
    "ngrams = np.array([tokens[i:i + n] for i in range(len(tokens) - n + 1)])\n",
    "ngrams"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99cb9a3d-5449-467e-9963-bffc4c6b0e1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "ns1gram = ngrams[:, :-1]  # the (n-1)-word context of every n-gram\n",
    "ns1gram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd6f103a-1c20-4e79-96ed-b889ab771d0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab = ngrams[:, -1]  # the word following each context: one entry per n-gram, so words repeat\n",
    "vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ef45d9f-facf-4b1b-ac6a-6d46f17f881d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One (context, next word) observation per n-gram; tuples are hashable, so they can serve as labels.\n",
    "context_word_pairs = list(zip(map(tuple, ns1gram.tolist()), vocab.tolist()))\n",
    "context_word_pairs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fbd557c-6105-433a-930c-62ee0bda5635",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label rows/columns with each distinct context/word exactly once (first-seen order).\n",
    "# Repeated labels would make every label-based update touch all matching rows and columns.\n",
    "contexts = list(dict.fromkeys(context for context, _ in context_word_pairs))\n",
    "words = list(dict.fromkeys(word for _, word in context_word_pairs))\n",
    "count_matrix = pd.DataFrame(\n",
    "    0.0,\n",
    "    index=pd.Index(contexts, tupleize_cols=False),  # keep tuples as plain labels, not a MultiIndex\n",
    "    columns=words,\n",
    ")\n",
    "assert count_matrix.index.is_unique and count_matrix.columns.is_unique\n",
    "count_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f901d49-f0c4-4772-ab17-fd8c77248dee",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tally into a fresh array so re-running this cell never double-counts.\n",
    "row_of = {context: row for row, context in enumerate(contexts)}\n",
    "col_of = {word: col for col, word in enumerate(words)}\n",
    "counts = np.zeros(count_matrix.shape)\n",
    "for context, word in context_word_pairs:\n",
    "    counts[row_of[context], col_of[word]] += 1\n",
    "count_matrix = pd.DataFrame(counts, index=count_matrix.index, columns=count_matrix.columns)\n",
    "assert count_matrix.to_numpy().sum() == len(ngrams)  # every n-gram counted exactly once\n",
    "count_matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5af3053d-2c38-44f8-853e-754f0a3594b9",
   "metadata": {},
   "source": [
    "### From counts to probabilities\n",
    "\n",
    "Each row holds the counts for one context $c$, so the conditional probability of a word is its count divided by that row's total:\n",
    "\n",
    "$$P(w \\mid c) = \\frac{C(c, w)}{\\sum_{w'} C(c, w')}$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "888170a1-431b-493f-b493-b564188b217b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Divide each row by its own total (row sums, aligned on the row index) so every\n",
    "# context's distribution sums to 1. Dividing by column sums would give P(context | word) instead.\n",
    "prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)\n",
    "assert np.allclose(prob_matrix.sum(axis=1), 1.0)\n",
    "prob_matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "25feef6a-be2f-4013-8ec1-c375339c6773",
   "metadata": {},
   "source": [
    "## Reusable pipeline\n",
    "\n",
    "The same steps as functions, so other corpora and values of `n` can be tried without copying cells."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e933d1e1-6f35-44b9-a818-d6e9acceced4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_ngrams(tokens, n):\n",
    "    \"\"\"Return every n-gram of `tokens` as the rows of a 2-D string array.\n",
    "\n",
    "    The token sequence is padded with n-1 '<s>' markers in front and one '<e>' marker at the end.\n",
    "    Raises ValueError if n < 1.\n",
    "    \"\"\"\n",
    "    if n < 1:\n",
    "        raise ValueError('n must be at least 1')\n",
    "    padded = ['<s>'] * (n - 1) + list(tokens) + ['<e>']\n",
    "    return np.array([padded[i:i + n] for i in range(len(padded) - n + 1)])\n",
    "\n",
    "\n",
    "def count_next_words(ngrams):\n",
    "    \"\"\"Count how often each word follows each context.\n",
    "\n",
    "    `ngrams` is a 2-D array-like with one n-gram per row. The result is a DataFrame whose rows are\n",
    "    the distinct (n-1)-word contexts (as tuples) and whose columns are the distinct next words,\n",
    "    both in first-seen order.\n",
    "    \"\"\"\n",
    "    grams = np.asarray(ngrams).tolist()\n",
    "    pair_counts = Counter((tuple(gram[:-1]), gram[-1]) for gram in grams)\n",
    "    contexts = list(dict.fromkeys(context for context, _ in pair_counts))\n",
    "    words = list(dict.fromkeys(word for _, word in pair_counts))\n",
    "    row_of = {context: row for row, context in enumerate(contexts)}\n",
    "    col_of = {word: col for col, word in enumerate(words)}\n",
    "    counts = np.zeros((len(contexts), len(words)))\n",
    "    for (context, word), count in pair_counts.items():\n",
    "        counts[row_of[context], col_of[word]] = count\n",
    "    # tupleize_cols=False keeps each context tuple as a single label instead of building a MultiIndex.\n",
    "    return pd.DataFrame(counts, index=pd.Index(contexts, tupleize_cols=False), columns=words)\n",
    "\n",
    "\n",
    "def to_probabilities(count_matrix):\n",
    "    \"\"\"Row-normalise a context x word count matrix so each row is P(word | context).\"\"\"\n",
    "    return count_matrix.div(count_matrix.sum(axis=1), axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e968eba-6c77-4ac7-97be-b6548e94a4e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "n = 3\n",
    "ngrams = make_ngrams(nltk.word_tokenize(corpus.lower()), n)\n",
    "ns1grams = ngrams[:, :-1]\n",
    "vocab = ngrams[:, -1]\n",
    "count_matrix = count_next_words(ngrams)\n",
    "prob_matrix = to_probabilities(count_matrix)\n",
    "assert np.allclose(prob_matrix.sum(axis=1), 1.0)\n",
    "prob_matrix"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}