Word Embeddings#

Imports#

# Core numerics / dataframe / plotting stack.
import numpy as np 
import scipy as sp 
import matplotlib.pyplot as plt
import pandas as pd

# Visualisation and text-processing helpers.
import seaborn as sns
from collections import Counter
import nltk
import re
import emoji

# NOTE(review): wildcard import pulls unknown names into scope — confirm it's needed.
from fastcore.all import *
# Fetch the Punkt models required by nltk.word_tokenize (no-op if cached).
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data]     /home/rahul.saraf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True

Define / Get Corpus#

# Tiny demo corpus: the emoji, quotes, digits and punctuation runs
# deliberately exercise every branch of the tokenizer below.
# (Trailing bare expression is a notebook-style echo of the value.)
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'; corpus
def tokenize(corpus):
    """Lowercase the tokens of *corpus*, keeping words, '.', and emoji.

    Runs of sentence punctuation (, ! ? ; -) are collapsed to a single
    '.' before tokenizing, so sentence boundaries survive as one token;
    digits, quotes and any other punctuation are dropped.

    Returns a list of lowercased token strings.
    """
    # Collapse punctuation runs into one '.' so e.g. '!!!' -> '.'.
    data = re.sub(r'[,!?;-]+', '.', corpus)
    tokens = nltk.word_tokenize(data)
    # Build the emoji lookup once as a set: O(1) membership per token
    # instead of a linear scan over the whole emoji dict per token.
    emoji_chars = set(emoji.get_emoji_unicode_dict('en').values())
    return [token.lower() for token in tokens
            if token.isalpha()
            or token == "."
            or token in emoji_chars]

tokenize(corpus)
['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']
tokenize('I am happy because I am learning')
['i', 'am', 'happy', 'because', 'i', 'am', 'learning']
def sliding_window(tokens, C=2):
    """Yield (context_words, center_word) pairs over *tokens*.

    For each position i in [C, len(tokens) - C) the center word is
    tokens[i] and the context is the C tokens on each side of it.
    Yields nothing when len(tokens) < 2 * C + 1 (no full window fits).
    """
    for i in range(C, len(tokens) - C):
        yield tokens[i - C:i] + tokens[i + 1:i + C + 1], tokens[i]

# Peek at the first (context, center) pair produced for the corpus.
window_gen = sliding_window(tokenize(corpus))
next(window_gen)
(['who', '❤️', 'embeddings', 'in'], 'word')
# Print every training pair the sliding window produces for the corpus.
for ctx, center in sliding_window(tokenize(corpus)):
    print(ctx, center)
['who', '❤️', 'embeddings', 'in'] word
['❤️', 'word', 'in', '.'] embeddings
['word', 'embeddings', '.', 'i'] in
['embeddings', 'in', 'i', 'do'] .
['in', '.', 'do', '.'] i
tokens = tokenize(corpus)
# Index the *unique* vocabulary, not raw token positions: enumerating the
# tokens directly would give duplicated words (e.g. '.') several rows, so
# one_hot[word] would be a multi-hot column with two 1s rather than a
# one-hot vector, and ind2word/word2ind would not round-trip.
vocab = sorted(set(tokens))
ind2word = dict(enumerate(vocab))
word2ind = {word: ind for ind, word in ind2word.items()}
# Identity-like frame: one_hot[word] is the |V|-dim one-hot column for word.
one_hot = pd.get_dummies(pd.Series(ind2word, name='vocab')); one_hot
. do embeddings i in who word ❤️
0 0 0 0 0 0 1 0 0
1 0 0 0 0 0 0 0 1
2 0 0 0 0 0 0 1 0
3 0 0 1 0 0 0 0 0
4 0 0 0 0 1 0 0 0
5 1 0 0 0 0 0 0 0
6 0 0 0 1 0 0 0 0
7 0 1 0 0 0 0 0 0
8 1 0 0 0 0 0 0 0
# One CBOW-style training example as vectors: the context is the mean of
# its one-hot columns (a bag-of-words average), the center its one-hot.
g = sliding_window(tokenize(corpus))
first_pair = next(g)
context_words, center_word = first_pair
one_hot[context_words].mean(axis=1).values, one_hot[center_word].values
(array([0.25, 0.25, 0.  , 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ]),
 array([0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=uint8))