# Vector Space Models

## Imports

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
import seaborn as sns
sns.set()

## PCA implementation

# Hand-rolled PCA on a small random matrix, in six steps (S1-S6).
np.random.seed(1)  # fixed seed so every run draws the same matrix
n_components = 2
A = np.random.random((3, 10))  # shape 3x10

# S1 - Mean centering: subtract each column's mean.
Am = A - np.mean(A, axis=0)

# S2 - Calculating covariance, with columns treated as the variables.
cv = np.cov(Am, rowvar=False)
cv.shape

# S3 - Calculating eigenvalues and eigenvectors of the covariance matrix.
ev, evecs = np.linalg.eigh(cv)
ev.shape, evecs.shape

# S4 - Sorting: eigenvalue indices ordered from largest to smallest,
# applied to the eigenvalues and to the eigenvector columns alike.
idx_sort = np.argsort(ev)[::-1]
ev_sort = np.take(ev, idx_sort)
evecs_sort = np.take(evecs, idx_sort, axis=1)

# S5 - Taking the column subset: keep the leading n_components eigenvectors.
evecs_subset = evecs_sort[:, :n_components]
evecs_subset

# S6 - Reducing components: project the centered data onto the kept
# eigenvectors, giving one row per row of A and one column per component.
comps = (evecs_subset.T @ Am.T).T
comps
array([[ 0.43437323,  0.49820384],
       [ 0.42077249, -0.50351448],
       [-0.85514571,  0.00531064]])
# Cross-check against scikit-learn's PCA on the same matrix A.
# The recorded output below has the same magnitudes as `comps` above but
# with every entry negated, i.e. the two results differ only in the sign of
# each component.
pca = PCA(n_components=2)
pca.fit_transform(A)
array([[-0.43437323, -0.49820384],
       [-0.42077249,  0.50351448],
       [ 0.85514571, -0.00531064]])

## Cosine similarity

def cos_sim(a, b):
    """Return the cosine similarity of two vectors.

    Computes ``a . b / (||a|| * ||b||)``: 1 for vectors pointing the same
    way, -1 for opposite directions, and values near 0 for (almost)
    orthogonal vectors.

    Args:
        a: First vector; a NumPy array or any array-like (list, tuple).
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``. If either vector
        has zero norm the division is 0/0 and the result is ``nan``.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Integer and boolean arrays compute the dot product in their own dtype,
    # so narrow types such as uint8 overflow silently; promote them to float.
    if a.dtype.kind in "iub":
        a = a.astype(float)
    if b.dtype.kind in "iub":
        b = b.astype(float)
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
def euclid_sim(a, b):
    """Return the Euclidean distance between two vectors.

    Despite the name, this is a distance, not a similarity: 0 means the
    vectors are identical and larger values mean they are further apart.

    Args:
        a: First vector; a NumPy array or any array-like (list, tuple).
        b: Second vector, same shape as ``a``.

    Returns:
        The norm of ``a - b`` as computed by ``np.linalg.norm``.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Subtracting integer arrays stays in the integer dtype, so unsigned or
    # narrow types wrap around (and booleans are rejected); promote to float.
    if a.dtype.kind in "iub":
        a = a.astype(float)
    if b.dtype.kind in "iub":
        b = b.astype(float)
    return np.linalg.norm(a - b)
# The second vector is -2 times the first, so the two point in exactly
# opposite directions and the cosine similarity is -1.
cos_sim(np.array([1,2,3]), -1*np.array([2,4,6]))
-1.0
# Compare two candidates against [3,1,4]: by cosine similarity (higher is
# closer) [1,2,3] is the better match.
cos_sim(np.array([1,2,3]), np.array([3,1,4])), cos_sim(np.array([4,7,2]), np.array([3,1,4]))
(0.8910421112136306, 0.6374594222773456)
# Same comparison with euclid_sim, which returns the norm of the difference
# (lower is closer); [1,2,3] is again the better match.
euclid_sim(np.array([1,2,3]), np.array([3,1,4])), euclid_sim(np.array([4,7,2]), np.array([3,1,4]))
(2.449489742783178, 6.4031242374328485)
# A pair with a small dot product: the similarity is close to 0.
cos_sim(np.array([1,0,-1]), np.array([2,8,1]))
0.08512565307587484
# Toy 2-D vectors, one per word, used for the analogy query further down.
usa, wash = np.array([5, 6]), np.array([10, 5])
turkey, ankara = np.array([3, 1]), np.array([9, 1])
russian, japan = np.array([5, 5]), np.array([4, 3])
usa, wash, turkey, ankara, russian, japan

# Offset that moves the `wash` vector onto the `usa` vector.
diff = usa - wash


# Analogy query: apply the usa - wash offset to ankara and score each
# candidate against the resulting vector.
# By cosine similarity (higher is closer) turkey scores highest.
cos_sim(turkey, ankara+diff), cos_sim(russian, ankara+diff), cos_sim(japan, ankara+diff)
(0.9899494936611664, 0.9486832980505138, 0.9838699100999074)
# By euclid_sim (norm of the difference, lower is closer) japan is nearest,
# so the two measures rank the candidates differently here.
euclid_sim(turkey, ankara+diff), euclid_sim(russian, ankara+diff), euclid_sim(japan, ankara+diff)
(1.4142135623730951, 3.1622776601683795, 1.0)