{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b95177df-3dc6-47fb-a5b7-6cb7c14f9a17",
   "metadata": {},
   "source": [
    "# Vector Space Models"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae53a908-a9f0-412a-9aba-a826bdcab72a",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ae3a47e-aa53-487a-a7ec-6035ea621737",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import scipy as sp\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from sklearn.decomposition import PCA\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a696cb7-4a37-46c1-a9ab-5a36b82514db",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ec18e7a-b7dd-410f-9fd3-51bad6547480",
   "metadata": {},
   "source": [
    "## PCA implementation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6a007c7-e5ae-465f-9c2b-a70993445f5f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.43437323,  0.49820384],\n",
       "       [ 0.42077249, -0.50351448],\n",
       "       [-0.85514571,  0.00531064]])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(1)\n",
    "n_components = 2\n",
    "\n",
    "# Toy data: 3 samples (rows) x 10 features (columns).\n",
    "A = np.random.random([3,10])\n",
    "\n",
    "# S1 - Mean centering: subtract each feature's (column's) mean.\n",
    "Am = A - A.mean(axis=0)\n",
    "\n",
    "# S2 - Covariance between features (10x10); rowvar=False treats columns as variables.\n",
    "cv = np.cov(Am, rowvar=False)\n",
    "\n",
    "# S3 - Eigen-decomposition; eigh is meant for symmetric matrices such as a\n",
    "# covariance matrix and returns eigenvalues in ascending order.\n",
    "ev, evecs = np.linalg.eigh(cv)\n",
    "\n",
    "# S4 - Sort eigenpairs by descending eigenvalue (largest variance first).\n",
    "idx_sort = np.argsort(ev)[::-1]\n",
    "ev_sort = ev[idx_sort]\n",
    "evecs_sort = evecs[:, idx_sort]\n",
    "\n",
    "# S5 - Keep the leading n_components eigenvectors (one per column).\n",
    "evecs_subset = evecs_sort[:, :n_components]\n",
    "\n",
    "# S6 - Project the centered data onto the retained axes -> shape (3, n_components).\n",
    "comps = Am @ evecs_subset\n",
    "assert comps.shape == (A.shape[0], n_components)\n",
    "comps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75cc6dd1-f77d-462e-84f8-10df2570fa8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reuse n_components from the manual implementation so both projections keep the same dimensionality.\n",
    "pca = PCA(n_components=n_components)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe6f46f3-3514-4691-9cfc-8561258ff1ae",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.43437323, -0.49820384],\n",
       "       [-0.42077249,  0.50351448],\n",
       "       [ 0.85514571, -0.00531064]])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sk_comps = pca.fit_transform(A)\n",
    "# Eigenvector orientation is arbitrary, so the manual and scikit-learn\n",
    "# projections may differ by a sign flip per column; compare magnitudes.\n",
    "assert np.allclose(np.abs(sk_comps), np.abs(comps))\n",
    "sk_comps"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd587ebd-c4fc-4c10-8227-e72aedb9c90e",
   "metadata": {},
   "source": [
    "## Cosine similarity and Euclidean distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38df67d6-9469-4156-8eef-2702e5030171",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-1.0"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def cos_sim(a,b):\n",
    "    \"\"\"Return the cosine of the angle between vectors ``a`` and ``b``.\n",
    "\n",
    "    Computed as the dot product divided by the product of the two\n",
    "    Euclidean norms. For NumPy inputs, a zero-length vector makes the\n",
    "    denominator zero, so the result is ``nan`` rather than a similarity.\n",
    "    \"\"\"\n",
    "    dot_product = a @ b\n",
    "    norm_product = np.linalg.norm(a) * np.linalg.norm(b)\n",
    "    return dot_product / norm_product\n",
    "def euclid_sim(a,b):\n",
    "    \"\"\"Return the Euclidean (L2) distance between vectors ``a`` and ``b``.\n",
    "\n",
    "    Despite the ``_sim`` suffix this is a distance: 0 means identical\n",
    "    vectors and larger values mean the vectors are further apart.\n",
    "    \"\"\"\n",
    "    offset = a - b\n",
    "    return np.linalg.norm(offset)\n",
    "# Vectors pointing in exactly opposite directions: the similarity should be -1.\n",
    "cos_sim(np.array([1,2,3]), -np.array([2,4,6]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11b0221c-5141-4b52-ad76-a3d51fe697c8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.8910421112136306, 0.6374594222773456)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cosine similarity of two candidate vectors against the same reference vector.\n",
    "reference = np.array([3,1,4])\n",
    "tuple(cos_sim(np.array(vec), reference) for vec in ([1,2,3], [4,7,2]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "613e4caf-50de-4be1-b0f3-e40f633f913d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2.449489742783178, 6.4031242374328485)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Euclidean distance of the same two candidates from the reference vector (lower = closer).\n",
    "reference = np.array([3,1,4])\n",
    "tuple(euclid_sim(np.array(vec), reference) for vec in ([1,2,3], [4,7,2]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85db946d-0115-4568-94a5-89cfc26b1578",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.08512565307587484"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Nearly orthogonal vectors: the similarity comes out close to 0.\n",
    "vec_a = np.array([1,0,-1])\n",
    "vec_b = np.array([2,8,1])\n",
    "cos_sim(vec_a, vec_b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "878a1885-69ed-44a7-823b-4e8cb7130348",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.9899494936611664, 0.9486832980505138, 0.9838699100999074)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Toy 2-D embeddings for a handful of countries and capitals.\n",
    "usa = np.array([5,6])\n",
    "wash = np.array([10,5])\n",
    "turkey = np.array([3,1])\n",
    "ankara = np.array([9,1])\n",
    "russian = np.array([5,5])\n",
    "japan = np.array([4,3])\n",
    "\n",
    "# Offset that maps the capital (wash) onto its country (usa).\n",
    "diff = usa - wash\n",
    "\n",
    "# Apply the same offset to ankara to estimate where its country should lie.\n",
    "predicted = ankara + diff\n",
    "\n",
    "# Cosine similarity of each candidate country to the estimate (higher = closer match).\n",
    "tuple(cos_sim(country, predicted) for country in (turkey, russian, japan))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20464d78-2b85-45b3-a418-886093a26a41",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1.4142135623730951, 3.1622776601683795, 1.0)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same comparison using Euclidean distance (lower = closer match).\n",
    "target = ankara + diff\n",
    "tuple(euclid_sim(country, target) for country in (turkey, russian, japan))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c243a997-bb81-4110-89f0-50df6ea7f82f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}