{
"cells": [
{
"cell_type": "markdown",
"id": "b95177df-3dc6-47fb-a5b7-6cb7c14f9a17",
"metadata": {},
"source": [
"# Vector Space Models"
]
},
{
"cell_type": "markdown",
"id": "ae53a908-a9f0-412a-9aba-a826bdcab72a",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8ae3a47e-aa53-487a-a7ec-6035ea621737",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import scipy as sp\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from sklearn.decomposition import PCA\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a696cb7-4a37-46c1-a9ab-5a36b82514db",
"metadata": {},
"outputs": [],
"source": [
"# Apply seaborn's default theme; set_theme() is the preferred name, sns.set() is only an alias for it.\n",
"sns.set_theme()"
]
},
{
"cell_type": "markdown",
"id": "6ec18e7a-b7dd-410f-9fd3-51bad6547480",
"metadata": {},
"source": [
"## PCA implementation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6a007c7-e5ae-465f-9c2b-a70993445f5f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.43437323, 0.49820384],\n",
" [ 0.42077249, -0.50351448],\n",
" [-0.85514571, 0.00531064]])"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.seed(1)  # legacy global seed, kept so the random matrix below is reproducible\n",
"n_components = 2\n",
"A = np.random.random([3, 10])  # 3 samples (rows) x 10 features (columns)\n",
"\n",
"# S1 - Mean-centre every feature (column).\n",
"Am = A - A.mean(axis=0)\n",
"# S2 - Feature covariance matrix, 10x10 (rowvar=False: columns are the variables).\n",
"cv = np.cov(Am, rowvar=False)\n",
"# S3 - Eigen-decomposition; eigh is meant for symmetric matrices such as a covariance matrix.\n",
"ev, evecs = np.linalg.eigh(cv)\n",
"# S4 - Reorder the eigenpairs by descending eigenvalue.\n",
"idx_sort = np.argsort(ev)[::-1]\n",
"ev_sort = ev[idx_sort]\n",
"evecs_sort = evecs[:, idx_sort]\n",
"# S5 - Keep the leading n_components eigenvectors (stored as columns).\n",
"evecs_subset = evecs_sort[:, :n_components]\n",
"# S6 - Project the centred data onto them, giving shape (3, n_components).\n",
"# The sign of each eigenvector is arbitrary, so other PCA implementations may return flipped columns.\n",
"comps = Am @ evecs_subset\n",
"comps"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75cc6dd1-f77d-462e-84f8-10df2570fa8b",
"metadata": {},
"outputs": [],
"source": [
"# Reuse n_components from the manual implementation so both results stay comparable.\n",
"pca = PCA(n_components=n_components)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe6f46f3-3514-4691-9cfc-8561258ff1ae",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.43437323, -0.49820384],\n",
" [-0.42077249, 0.50351448],\n",
" [ 0.85514571, -0.00531064]])"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reference projection from scikit-learn; the stored output equals the manual `comps` with both columns negated.\n",
"pca.fit_transform(A)"
]
},
{
"cell_type": "markdown",
"id": "dd587ebd-c4fc-4c10-8227-e72aedb9c90e",
"metadata": {},
"source": [
"## Cosine similarity and Euclidean distance"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38df67d6-9469-4156-8eef-2702e5030171",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-1.0"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def cos_sim(a, b):\n",
"    \"\"\"Cosine of the angle between vectors `a` and `b`.\n",
"\n",
"    1 means same direction, 0 orthogonal, -1 opposite direction. Intended for\n",
"    1-D array-likes of equal length (lists are accepted). Returns nan, without\n",
"    a RuntimeWarning, when either vector has zero norm, because the angle is\n",
"    undefined in that case.\n",
"    \"\"\"\n",
"    a, b = np.asarray(a), np.asarray(b)\n",
"    denom = np.linalg.norm(a) * np.linalg.norm(b)\n",
"    if denom == 0:\n",
"        return np.nan\n",
"    return a @ b / denom\n",
"\n",
"\n",
"def euclid_sim(a, b):\n",
"    \"\"\"Euclidean distance between `a` and `b` (array-likes of equal shape).\n",
"\n",
"    Despite the name this is a distance, not a similarity: 0 means identical\n",
"    and larger values mean less similar, the opposite direction to cos_sim.\n",
"    \"\"\"\n",
"    return np.linalg.norm(np.asarray(a) - np.asarray(b))\n",
"\n",
"\n",
"# Vectors pointing in exactly opposite directions give -1.0.\n",
"cos_sim(np.array([1, 2, 3]), -1 * np.array([2, 4, 6]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11b0221c-5141-4b52-ad76-a3d51fe697c8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.8910421112136306, 0.6374594222773456)"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cosine similarity of two different vectors against the same reference vector [3, 1, 4].\n",
"tuple(cos_sim(np.array(vec), np.array([3, 1, 4])) for vec in ([1, 2, 3], [4, 7, 2]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "613e4caf-50de-4be1-b0f3-e40f633f913d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2.449489742783178, 6.4031242374328485)"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Euclidean distance of the same two vectors from the reference vector [3, 1, 4].\n",
"tuple(euclid_sim(np.array(vec), np.array([3, 1, 4])) for vec in ([1, 2, 3], [4, 7, 2]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85db946d-0115-4568-94a5-89cfc26b1578",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.08512565307587484"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Nearly orthogonal pair: the dot product is only 1, so the similarity is close to 0.\n",
"cos_sim(\n",
"    np.array([1, 0, -1]),\n",
"    np.array([2, 8, 1]),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "878a1885-69ed-44a7-823b-4e8cb7130348",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.9899494936611664, 0.9486832980505138, 0.9838699100999074)"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Toy 2-D word vectors.\n",
"usa = np.array([5, 6])\n",
"wash = np.array([10, 5])\n",
"turkey = np.array([3, 1])\n",
"ankara = np.array([9, 1])\n",
"russian = np.array([5, 5])\n",
"japan = np.array([4, 3])\n",
"\n",
"# Offset that moves `wash` onto `usa`; adding it to `ankara` gives the analogy point\n",
"# (presumably a capital-to-country direction - confirm the intended reading).\n",
"diff = usa - wash\n",
"\n",
"# Cosine similarity of each candidate to that point, in the order turkey, russian, japan.\n",
"tuple(cos_sim(candidate, ankara + diff) for candidate in (turkey, russian, japan))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20464d78-2b85-45b3-a418-886093a26a41",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1.4142135623730951, 3.1622776601683795, 1.0)"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Euclidean distance from each candidate to the same analogy point (smaller means closer).\n",
"tuple(euclid_sim(candidate, ankara + diff) for candidate in (turkey, russian, japan))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c243a997-bb81-4110-89f0-50df6ea7f82f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}