{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b95177df-3dc6-47fb-a5b7-6cb7c14f9a17",
   "metadata": {},
   "source": [
    "# Vector Space Models\n",
    "\n",
    "Small NumPy experiments with vector-space ideas: a from-scratch PCA checked against scikit-learn, cosine similarity versus Euclidean distance, and a toy capital-to-country analogy."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae53a908-a9f0-412a-9aba-a826bdcab72a",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ae3a47e-aa53-487a-a7ec-6035ea621737",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import scipy as sp\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from sklearn.decomposition import PCA\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a696cb7-4a37-46c1-a9ab-5a36b82514db",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ec18e7a-b7dd-410f-9fd3-51bad6547480",
   "metadata": {},
   "source": [
    "## PCA implementation\n",
    "\n",
    "PCA through the eigendecomposition of the covariance matrix: centre the data, compute the covariance between features, keep the eigenvectors with the largest eigenvalues, and project the centred data onto them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c1d7e2a-9b3f-4a6e-8d15-2f7a0c9e3b61",
   "metadata": {},
   "outputs": [],
   "source": [
    "def pca_project(X, n_components):\n",
    "    \"\"\"Project the rows of X onto its leading principal axes.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X : array-like, shape (n_samples, n_features)\n",
    "        One observation per row.\n",
    "    n_components : int\n",
    "        Number of eigenvectors to keep, taken in order of decreasing eigenvalue.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray, shape (n_samples, n_components)\n",
    "        The mean-centred rows of X expressed in the kept eigenvector basis.\n",
    "        The sign of each column is arbitrary.\n",
    "    \"\"\"\n",
    "    X = np.asarray(X, dtype=float)\n",
    "    X_centered = X - X.mean(axis=0)  # S1 - centre every feature (column)\n",
    "    # S2 - covariance between features; atleast_2d keeps the single-feature case a matrix\n",
    "    cov = np.atleast_2d(np.cov(X_centered, rowvar=False))\n",
    "    eigvals, eigvecs = np.linalg.eigh(cov)  # S3 - eigh suits the symmetric covariance matrix\n",
    "    order = np.argsort(eigvals)[::-1]  # S4 - largest eigenvalue first\n",
    "    # S5 - eigenvectors are the COLUMNS of eigvecs, so the selection must index columns\n",
    "    top_axes = eigvecs[:, order[:n_components]]\n",
    "    return X_centered @ top_axes  # S6 - project onto the kept axes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6a007c7-e5ae-465f-9c2b-a70993445f5f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.43437323, 0.49820384],\n",
       " [ 0.42077249, -0.50351448],\n",
       " [-0.85514571, 0.00531064]])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(1)\n",
    "n_components = 2\n",
    "A = np.random.random([3, 10])  # 3 samples x 10 features\n",
    "comps = pca_project(A, n_components)\n",
    "comps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75cc6dd1-f77d-462e-84f8-10df2570fa8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "pca = PCA(n_components=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe6f46f3-3514-4691-9cfc-8561258ff1ae",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.43437323, -0.49820384],\n",
       " [-0.42077249, 0.50351448],\n",
       " [ 0.85514571, -0.00531064]])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "comps_sklearn = pca.fit_transform(A)\n",
    "comps_sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7e3f0b2-6c84-4d19-b5a2-91c0d8e47f36",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The two results differ only by the sign of each component, which PCA leaves\n",
    "# undetermined, so the check compares magnitudes.\n",
    "np.testing.assert_allclose(np.abs(comps), np.abs(comps_sklearn), atol=1e-8)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd587ebd-c4fc-4c10-8227-e72aedb9c90e",
   "metadata": {},
   "source": [
    "## Cosine similarity and Euclidean distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2b84f6c-1e57-4c3a-a09d-7f5e2b6c8a14",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cos_sim(a, b):\n",
    "    \"\"\"Return the cosine of the angle between vectors a and b.\n",
    "\n",
    "    1 means same direction, -1 opposite direction, 0 orthogonal.\n",
    "    If either vector has zero norm the division is 0/0 and the result is nan.\n",
    "    \"\"\"\n",
    "    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))\n",
    "\n",
    "\n",
    "def euclid_dist(a, b):\n",
    "    \"\"\"Return the Euclidean (L2) distance between a and b; smaller means closer.\"\"\"\n",
    "    return np.linalg.norm(a - b)\n",
    "\n",
    "\n",
    "# Kept so existing calls keep working; the value is a distance, not a similarity.\n",
    "euclid_sim = euclid_dist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38df67d6-9469-4156-8eef-2702e5030171",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-1.0"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cos_sim(np.array([1, 2, 3]), -1 * np.array([2, 4, 6]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11b0221c-5141-4b52-ad76-a3d51fe697c8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.8910421112136306, 0.6374594222773456)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cos_sim(np.array([1, 2, 3]), np.array([3, 1, 4])), cos_sim(np.array([4, 7, 2]), np.array([3, 1, 4]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "613e4caf-50de-4be1-b0f3-e40f633f913d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2.449489742783178, 6.4031242374328485)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "euclid_dist(np.array([1, 2, 3]), np.array([3, 1, 4])), euclid_dist(np.array([4, 7, 2]), np.array([3, 1, 4]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85db946d-0115-4568-94a5-89cfc26b1578",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.08512565307587484"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cos_sim(np.array([1, 0, -1]), np.array([2, 8, 1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e9a1c73-8f26-4b0d-9e41-c3a7d6f2b805",
   "metadata": {},
   "source": [
    "## Word analogy: capital to country\n",
    "\n",
    "Toy 2-D word vectors. The offset `usa - wash` is taken as the capital-to-country direction; adding it to `ankara` gives the point where Ankara's country is expected, and each candidate country is scored against that point."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "878a1885-69ed-44a7-823b-4e8cb7130348",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.9899494936611664, 0.9486832980505138, 0.9838699100999074)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "usa = np.array([5, 6])\n",
    "wash = np.array([10, 5])\n",
    "turkey = np.array([3, 1])\n",
    "ankara = np.array([9, 1])\n",
    "russia = np.array([5, 5])\n",
    "japan = np.array([4, 3])\n",
    "\n",
    "# Offset that takes a capital to its country, read off the (Washington, USA) pair.\n",
    "capital_to_country = usa - wash\n",
    "# Point where the country belonging to Ankara is expected to sit.\n",
    "predicted_country = ankara + capital_to_country\n",
    "\n",
    "cos_sim(turkey, predicted_country), cos_sim(russia, predicted_country), cos_sim(japan, predicted_country)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20464d78-2b85-45b3-a418-886093a26a41",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1.4142135623730951, 3.1622776601683795, 1.0)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "euclid_dist(turkey, predicted_country), euclid_dist(russia, predicted_country), euclid_dist(japan, predicted_country)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6f20d9e-3a71-4e58-8c2b-0d4f9a1e7c53",
   "metadata": {},
   "source": [
    "The two measures disagree on this toy data: cosine similarity ranks `turkey` highest (0.990, against 0.984 for `japan` and 0.949 for `russia`), while Euclidean distance puts `japan` closest (1.00, against 1.41 for `turkey` and 3.16 for `russia`)."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}