{
"cells": [
{
"cell_type": "markdown",
"id": "21847ae1-b8c4-4fa1-8b05-e5d045b0e71d",
"metadata": {},
"source": [
"# Word Embeddings"
]
},
{
"cell_type": "markdown",
"id": "81e5c8bf-485a-4ce4-9787-942ae301a6d0",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "c955e907-9fdc-4871-a34a-cd4abf93479b",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np \n",
"import scipy as sp \n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"\n",
"import seaborn as sns\n",
"from collections import Counter\n",
"import nltk\n",
"import re\n",
"import emoji\n",
"\n",
"from fastcore.all import *"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "e624b09a-b35e-4f73-8ff2-548cad7cdd69",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] /home/rahul.saraf/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nltk.download('punkt')"
]
},
{
"cell_type": "markdown",
"id": "b8c160c1-4e49-4098-b5bf-ee0057fdb8da",
"metadata": {},
"source": [
"## Define / Get Corpus"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "7e86c638-ae2b-4adb-bc48-c22351bd10b1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Who ❤️ \"word embeddings\" in 2020? I do!!!'"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Define a corpus\n",
"corpus = 'Who ❤️ \"word embeddings\" in 2020? I do!!!'; corpus"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "1fb68e40-f18f-430f-9f6e-94b9f58e907a",
"metadata": {},
"outputs": [],
"source": [
"# em = 'Hey 😷😷😷'\n",
"# em_split_emoji = emoji.get_emoji_regexp().split(em)\n",
"# em_split_emoji"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "f63b0033-2f7b-4208-844a-cf950aa47d74",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def tokenize(corpus):\n",
" data = re.sub(r'[,!?;-]+', '.', corpus); data\n",
" tokens = nltk.word_tokenize(data); tokens\n",
" return [token.lower() for token in tokens \n",
" if token.isalpha() \n",
" or token == \".\" \n",
" or token in emoji.get_emoji_unicode_dict('en').values()]\n",
"\n",
"tokenize(corpus)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "2d34931c-0fe3-4f80-beec-5e96ab1accad",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['i', 'am', 'happy', 'because', 'i', 'am', 'learning']"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenize('I am happy because I am learning')"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "1a961543-5e47-4179-90c6-3b84bcfe9350",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(['who', '❤️', 'embeddings', 'in'], 'word')"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def sliding_window(tokens, C=2):\n",
" for i in range(C, len(tokens)-C):\n",
" center_word = tokens[i]\n",
" context_words = tokens[i-C:i] + tokens[i+1:i+C+1]\n",
" yield context_words, center_word\n",
" return\n",
" # print(tokens[i-C:i], tokens[i], tokens[i+1:i+C+1])\n",
" \n",
"g = sliding_window(tokenize(corpus))\n",
"next(g)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "40e48f48-c879-4c57-811d-fd0208a66e43",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['who', '❤️', 'embeddings', 'in'] word\n",
"['❤️', 'word', 'in', '.'] embeddings\n",
"['word', 'embeddings', '.', 'i'] in\n",
"['embeddings', 'in', 'i', 'do'] .\n",
"['in', '.', 'do', '.'] i\n"
]
}
],
"source": [
"for context_words, center_word in sliding_window(tokenize(corpus)):\n",
" print(context_words, center_word)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "b4982306-c716-46fa-a3e8-df845282bb33",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>.</th>\n",
" <th>do</th>\n",
" <th>embeddings</th>\n",
" <th>i</th>\n",
" <th>in</th>\n",
" <th>who</th>\n",
" <th>word</th>\n",
" <th>❤️</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" . do embeddings i in who word ❤️\n",
"0 0 0 0 0 0 1 0 0\n",
"1 0 0 0 0 0 0 0 1\n",
"2 0 0 0 0 0 0 1 0\n",
"3 0 0 1 0 0 0 0 0\n",
"4 0 0 0 0 1 0 0 0\n",
"5 1 0 0 0 0 0 0 0\n",
"6 0 0 0 1 0 0 0 0\n",
"7 0 1 0 0 0 0 0 0\n",
"8 1 0 0 0 0 0 0 0"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokens = tokenize(corpus)\n",
"# Index the vocabulary (unique words), not token positions. Enumerating the\n",
"# raw token list gives a repeated word such as '.' several indices, so the\n",
"# inverted mapping keeps only the last one and the one-hot rows no longer\n",
"# correspond one-to-one with words.\n",
"vocab = sorted(set(tokens))\n",
"ind2word = dict(enumerate(vocab))\n",
"word2ind = {word: ind for ind, word in ind2word.items()}\n",
"# One row and one column per vocabulary word. The dtype is pinned so the\n",
"# matrix is numeric whatever pandas' default for dummy columns is.\n",
"one_hot = pd.get_dummies(pd.Series(ind2word, name='vocab'), dtype=np.uint8)\n",
"one_hot"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "c7aa73a2-5b27-45d4-8e09-64b507de4067",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([0.25, 0.25, 0. , 0.25, 0.25, 0. , 0. , 0. , 0. ]),\n",
" array([0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=uint8))"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g = sliding_window(tokenize(corpus))\n",
"context_words, center_word = next(g)\n",
"one_hot[context_words].mean(axis=1).values, one_hot[center_word].values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "23e35871-0421-48c5-ae3b-7d42a3173118",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}