# Word Embeddings
## Imports
```python
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter
import nltk
import re
import emoji
from fastcore.all import *

nltk.download('punkt')
```
```text
[nltk_data] Downloading package punkt to
[nltk_data]     /home/rahul.saraf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
```
## Define / Get Corpus
```python
# Define a corpus
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'; corpus
```
```text
'Who ❤️ "word embeddings" in 2020? I do!!!'
```
```python
# Earlier experiment (kept for reference): in older versions of the emoji
# library, get_emoji_regexp() could split a string around its emoji.
# em = 'Hey 😷😷😷'
# em_split_emoji = emoji.get_emoji_regexp().split(em)
# em_split_emoji
```
```python
def tokenize(corpus):
    # Collapse runs of punctuation into a single '.' sentence marker
    data = re.sub(r'[,!?;-]+', '.', corpus)
    tokens = nltk.word_tokenize(data)
    # Keep alphabetic tokens, the '.' marker, and emoji
    # (emoji.get_emoji_unicode_dict is available in emoji versions before 2.0)
    return [token.lower() for token in tokens
            if token.isalpha()
            or token == "."
            or token in emoji.get_emoji_unicode_dict('en').values()]

tokenize(corpus)
```
```text
['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']
```

Note that `2020` and the quotation marks drop out: `token.isalpha()` keeps only alphabetic tokens, and the runs of punctuation collapse into single `.` markers.
```python
tokenize('I am happy because I am learning')
```
```text
['i', 'am', 'happy', 'because', 'i', 'am', 'learning']
```
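`Counter` is already imported above, so a quick frequency check on the tokens is cheap. A minimal sketch (not used downstream), with the expected output shown as a comment:

```python
# Token frequencies for the toy corpus; '.' appears twice after punctuation collapsing
freqs = Counter(tokenize(corpus))
freqs.most_common(3)
# expected: [('.', 2), ('who', 1), ('❤️', 1)]
```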
```python
def sliding_window(tokens, C=2):
    """Yield (context_words, center_word) pairs with C words on each side."""
    for i in range(C, len(tokens) - C):
        center_word = tokens[i]
        context_words = tokens[i-C:i] + tokens[i+1:i+C+1]
        yield context_words, center_word
```
```python
g = sliding_window(tokenize(corpus))
next(g)
```
```text
(['who', '❤️', 'embeddings', 'in'], 'word')
```
```python
for context_words, center_word in sliding_window(tokenize(corpus)):
    print(context_words, center_word)
```
```text
['who', '❤️', 'embeddings', 'in'] word
['❤️', 'word', 'in', '.'] embeddings
['word', 'embeddings', '.', 'i'] in
['embeddings', 'in', 'i', 'do'] .
['in', '.', 'do', '.'] i
```
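The half-width `C` controls how much context each training pair carries. As a sanity check, here is the same loop over the second example sentence with `C=1` (expected output shown as comments):

```python
# Narrower window: a single context word on each side
for context_words, center_word in sliding_window(tokenize('I am happy because I am learning'), C=1):
    print(context_words, center_word)
# ['i', 'happy'] am
# ['am', 'because'] happy
# ['happy', 'i'] because
# ['because', 'am'] i
# ['i', 'learning'] am
```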
```python
tokens = tokenize(corpus)
# Map each token position to its token; these become the rows of the one-hot frame
ind2word = dict(enumerate(tokens))
# Note: duplicate tokens ('.' appears twice) collapse to a single key here
word2ind = {v: k for k, v in ind2word.items()}
one_hot = pd.get_dummies(pd.Series(ind2word, name='vocab')); one_hot
```
|   | . | do | embeddings | i | in | who | word | ❤️ |
|---|---|----|------------|---|----|-----|------|----|
| 0 | 0 | 0  | 0          | 0 | 0  | 1   | 0    | 0  |
| 1 | 0 | 0  | 0          | 0 | 0  | 0   | 0    | 1  |
| 2 | 0 | 0  | 0          | 0 | 0  | 0   | 1    | 0  |
| 3 | 0 | 0  | 1          | 0 | 0  | 0   | 0    | 0  |
| 4 | 0 | 0  | 0          | 0 | 1  | 0   | 0    | 0  |
| 5 | 1 | 0  | 0          | 0 | 0  | 0   | 0    | 0  |
| 6 | 0 | 0  | 0          | 1 | 0  | 0   | 0    | 0  |
| 7 | 0 | 1  | 0          | 0 | 0  | 0   | 0    | 0  |
| 8 | 1 | 0  | 0          | 0 | 0  | 0   | 0    | 0  |
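Because the rows index token positions rather than unique words, the `.` column carries two 1s (rows 5 and 8). If a vocabulary-level encoding were wanted instead, one sketch (using a hypothetical `one_hot_vocab` frame built from the deduplicated vocabulary) would be:

```python
# Hypothetical alternative: one row/column per unique word instead of per token position
vocab = sorted(set(tokens))
one_hot_vocab = pd.get_dummies(pd.Series(vocab, name='vocab'))
one_hot_vocab.shape
# expected: (8, 8) — an identity matrix over the 8-word vocabulary
```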
```python
g = sliding_window(tokenize(corpus))
context_words, center_word = next(g)
# CBOW-style training pair: averaged context one-hots (input), center one-hot (target)
one_hot[context_words].mean(axis=1).values, one_hot[center_word].values
```
```text
(array([0.25, 0.25, 0.  , 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ]),
 array([0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=uint8))
```
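Iterating the generator to the end stacks these pairs into a full training set. A minimal sketch, assuming the hypothetical helper `build_training_set` and the hypothetical arrays `X` (averaged context one-hots) and `Y` (center one-hots):

```python
# Hypothetical helper: collect every (context mean, center) pair as numpy arrays
def build_training_set(tokens, one_hot, C=2):
    X, Y = [], []
    for context_words, center_word in sliding_window(tokens, C):
        X.append(one_hot[context_words].mean(axis=1).values)  # input: averaged context one-hots
        Y.append(one_hot[center_word].values)                 # target: center-word one-hot
    return np.array(X), np.array(Y)

X, Y = build_training_set(tokens, one_hot)
X.shape, Y.shape
# expected: ((5, 9), (5, 9)) — five windows over the nine-token corpus
```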