{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2c90de90-1e3e-4bf1-8705-b2b85ed8a387",
   "metadata": {},
   "source": [
    "# Regression : Understanding effect and cause"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "49d86280-47eb-471e-becf-33affb9f56c5",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cb2af73-246e-4a95-ad43-1816b79d3985",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import scipy as sp\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from sklearn.linear_model import LinearRegression, LogisticRegression\n",
    "from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder\n",
    "from sklearn.pipeline import Pipeline, make_pipeline\n",
    "import statsmodels.api as sm\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1e147e1-14c7-4dbf-beab-02a3ed42c760",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17a98d1e-3959-405c-bc52-ffcc9f53fb9a",
   "metadata": {},
   "source": [
    "## Credit Score Rating Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f142627-eaa6-40f9-830b-90594c450469",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Income</th>\n",
       "      <th>Rating</th>\n",
       "      <th>Cards</th>\n",
       "      <th>Age</th>\n",
       "      <th>Education</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Student</th>\n",
       "      <th>Married</th>\n",
       "      <th>Ethnicity</th>\n",
       "      <th>Balance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>14.891</td>\n",
       "      <td>283</td>\n",
       "      <td>2</td>\n",
       "      <td>34</td>\n",
       "      <td>11</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Caucasian</td>\n",
       "      <td>333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>106.025</td>\n",
       "      <td>483</td>\n",
       "      <td>3</td>\n",
       "      <td>82</td>\n",
       "      <td>15</td>\n",
       "      <td>Female</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Asian</td>\n",
       "      <td>903</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>104.593</td>\n",
       "      <td>514</td>\n",
       "      <td>4</td>\n",
       "      <td>71</td>\n",
       "      <td>11</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Asian</td>\n",
       "      <td>580</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>148.924</td>\n",
       "      <td>681</td>\n",
       "      <td>3</td>\n",
       "      <td>36</td>\n",
       "      <td>11</td>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>Asian</td>\n",
       "      <td>964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>55.882</td>\n",
       "      <td>357</td>\n",
       "      <td>2</td>\n",
       "      <td>68</td>\n",
       "      <td>16</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Caucasian</td>\n",
       "      <td>331</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Income  Rating  Cards  Age  Education  Gender Student Married  Ethnicity  \\\n",
       "0   14.891     283      2   34         11    Male      No     Yes  Caucasian   \n",
       "1  106.025     483      3   82         15  Female     Yes     Yes      Asian   \n",
       "2  104.593     514      4   71         11    Male      No      No      Asian   \n",
       "3  148.924     681      3   36         11  Female      No      No      Asian   \n",
       "4   55.882     357      2   68         16    Male      No     Yes  Caucasian   \n",
       "\n",
       "   Balance  \n",
       "0      333  \n",
       "1      903  \n",
       "2      580  \n",
       "3      964  \n",
       "4      331  "
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_credscore = pd.read_csv(\"DATA_3.01_CREDIT.csv\", dtype={'Gender':'category', \n",
    "                                                          'Student':'category',\n",
    "                                                          'Married':'category',\n",
    "                                                          'Ethnicity':'category'\n",
    "                                                         });df_credscore.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58c58a15-5422-4cd9-9ea0-11e2eb912f98",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 300 entries, 0 to 299\n",
      "Data columns (total 10 columns):\n",
      " #   Column     Non-Null Count  Dtype   \n",
      "---  ------     --------------  -----   \n",
      " 0   Income     300 non-null    float64 \n",
      " 1   Rating     300 non-null    int64   \n",
      " 2   Cards      300 non-null    int64   \n",
      " 3   Age        300 non-null    int64   \n",
      " 4   Education  300 non-null    int64   \n",
      " 5   Gender     300 non-null    category\n",
      " 6   Student    300 non-null    category\n",
      " 7   Married    300 non-null    category\n",
      " 8   Ethnicity  300 non-null    category\n",
      " 9   Balance    300 non-null    int64   \n",
      "dtypes: category(4), float64(1), int64(5)\n",
      "memory usage: 15.6 KB\n"
     ]
    }
   ],
   "source": [
    "df_credscore.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdb64a98-e23c-41ef-b0c0-5f672ac69e7b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Income</th>\n",
       "      <th>Rating</th>\n",
       "      <th>Cards</th>\n",
       "      <th>Age</th>\n",
       "      <th>Education</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Student</th>\n",
       "      <th>Married</th>\n",
       "      <th>Ethnicity</th>\n",
       "      <th>Balance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>300.000000</td>\n",
       "      <td>300.000000</td>\n",
       "      <td>300.000000</td>\n",
       "      <td>300.000000</td>\n",
       "      <td>300.000000</td>\n",
       "      <td>300</td>\n",
       "      <td>300</td>\n",
       "      <td>300</td>\n",
       "      <td>300</td>\n",
       "      <td>300.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Female</td>\n",
       "      <td>No</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Caucasian</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>168</td>\n",
       "      <td>268</td>\n",
       "      <td>183</td>\n",
       "      <td>141</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>44.054393</td>\n",
       "      <td>348.116667</td>\n",
       "      <td>3.026667</td>\n",
       "      <td>54.983333</td>\n",
       "      <td>13.393333</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>502.686667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>33.863066</td>\n",
       "      <td>150.871547</td>\n",
       "      <td>1.351064</td>\n",
       "      <td>17.216982</td>\n",
       "      <td>3.075193</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>466.991447</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>10.354000</td>\n",
       "      <td>93.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>21.027500</td>\n",
       "      <td>235.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>41.000000</td>\n",
       "      <td>11.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15.750000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>33.115500</td>\n",
       "      <td>339.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>55.000000</td>\n",
       "      <td>14.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>433.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>55.975500</td>\n",
       "      <td>433.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>69.000000</td>\n",
       "      <td>16.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>857.750000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>186.634000</td>\n",
       "      <td>949.000000</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>91.000000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1809.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Income      Rating       Cards         Age   Education  Gender  \\\n",
       "count   300.000000  300.000000  300.000000  300.000000  300.000000     300   \n",
       "unique         NaN         NaN         NaN         NaN         NaN       2   \n",
       "top            NaN         NaN         NaN         NaN         NaN  Female   \n",
       "freq           NaN         NaN         NaN         NaN         NaN     168   \n",
       "mean     44.054393  348.116667    3.026667   54.983333   13.393333     NaN   \n",
       "std      33.863066  150.871547    1.351064   17.216982    3.075193     NaN   \n",
       "min      10.354000   93.000000    1.000000   24.000000    5.000000     NaN   \n",
       "25%      21.027500  235.000000    2.000000   41.000000   11.000000     NaN   \n",
       "50%      33.115500  339.000000    3.000000   55.000000   14.000000     NaN   \n",
       "75%      55.975500  433.000000    4.000000   69.000000   16.000000     NaN   \n",
       "max     186.634000  949.000000    8.000000   91.000000   20.000000     NaN   \n",
       "\n",
       "       Student Married  Ethnicity      Balance  \n",
       "count      300     300        300   300.000000  \n",
       "unique       2       2          3          NaN  \n",
       "top         No     Yes  Caucasian          NaN  \n",
       "freq       268     183        141          NaN  \n",
       "mean       NaN     NaN        NaN   502.686667  \n",
       "std        NaN     NaN        NaN   466.991447  \n",
       "min        NaN     NaN        NaN     0.000000  \n",
       "25%        NaN     NaN        NaN    15.750000  \n",
       "50%        NaN     NaN        NaN   433.500000  \n",
       "75%        NaN     NaN        NaN   857.750000  \n",
       "max        NaN     NaN        NaN  1809.000000  "
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_credscore.describe(include='all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf537297-931e-4db9-bba1-c24983cdc2b2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Income</th>\n",
       "      <th>Rating</th>\n",
       "      <th>Cards</th>\n",
       "      <th>Age</th>\n",
       "      <th>Education</th>\n",
       "      <th>Balance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Income</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.771167</td>\n",
       "      <td>0.028875</td>\n",
       "      <td>0.123201</td>\n",
       "      <td>-0.070959</td>\n",
       "      <td>0.432327</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Rating</th>\n",
       "      <td>0.771167</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.095854</td>\n",
       "      <td>0.042377</td>\n",
       "      <td>-0.095433</td>\n",
       "      <td>0.859829</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cards</th>\n",
       "      <td>0.028875</td>\n",
       "      <td>0.095854</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.054655</td>\n",
       "      <td>0.015176</td>\n",
       "      <td>0.123846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Age</th>\n",
       "      <td>0.123201</td>\n",
       "      <td>0.042377</td>\n",
       "      <td>0.054655</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.046178</td>\n",
       "      <td>-0.052426</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Education</th>\n",
       "      <td>-0.070959</td>\n",
       "      <td>-0.095433</td>\n",
       "      <td>0.015176</td>\n",
       "      <td>-0.046178</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.073167</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Balance</th>\n",
       "      <td>0.432327</td>\n",
       "      <td>0.859829</td>\n",
       "      <td>0.123846</td>\n",
       "      <td>-0.052426</td>\n",
       "      <td>-0.073167</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Income    Rating     Cards       Age  Education   Balance\n",
       "Income     1.000000  0.771167  0.028875  0.123201  -0.070959  0.432327\n",
       "Rating     0.771167  1.000000  0.095854  0.042377  -0.095433  0.859829\n",
       "Cards      0.028875  0.095854  1.000000  0.054655   0.015176  0.123846\n",
       "Age        0.123201  0.042377  0.054655  1.000000  -0.046178 -0.052426\n",
       "Education -0.070959 -0.095433  0.015176 -0.046178   1.000000 -0.073167\n",
       "Balance    0.432327  0.859829  0.123846 -0.052426  -0.073167  1.000000"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_credscore.corr() # Individual correlations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4f86f79-7ab6-4fcb-8f4c-60fa461990bb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Income       0.771167\n",
       "Rating       1.000000\n",
       "Cards        0.095854\n",
       "Age          0.042377\n",
       "Education   -0.095433\n",
       "Balance      0.859829\n",
       "Name: Rating, dtype: float64"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_credscore.corr()[\"Rating\"] # We need to understand interactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71f8d364-b3c7-442b-9d1f-a0177b224326",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('ordinalencoder', OrdinalEncoder()),\n",
       "                ('linearregression', LinearRegression())])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline = make_pipeline(OrdinalEncoder(),LinearRegression()); pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49bddc88-7e6e-4b6a-aa93-fb20e5755f17",
   "metadata": {},
   "outputs": [],
   "source": [
    "y = df_credscore.pop('Rating')\n",
    "X = df_credscore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "738990d1-3603-4508-994f-6f7c9965e98d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 300 entries, 0 to 299\n",
      "Data columns (total 9 columns):\n",
      " #   Column     Non-Null Count  Dtype   \n",
      "---  ------     --------------  -----   \n",
      " 0   Income     300 non-null    float64 \n",
      " 1   Cards      300 non-null    int64   \n",
      " 2   Age        300 non-null    int64   \n",
      " 3   Education  300 non-null    int64   \n",
      " 4   Gender     300 non-null    category\n",
      " 5   Student    300 non-null    category\n",
      " 6   Married    300 non-null    category\n",
      " 7   Ethnicity  300 non-null    category\n",
      " 8   Balance    300 non-null    int64   \n",
      "dtypes: category(4), float64(1), int64(4)\n",
      "memory usage: 13.3 KB\n"
     ]
    }
   ],
   "source": [
    "X.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7483a8ff-7b00-482c-aa7e-a68aad62023b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('ordinalencoder', OrdinalEncoder()),\n",
       "                ('linearregression', LinearRegression())])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline.fit(X,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "074306df-d771-4564-a985-569c6512e2cf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression()"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = pipeline['linearregression']\n",
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23a58fef-1ec2-46a1-9e65-9e993981c9de",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  0.71532602,   1.41275992,   0.17419851,   0.61789045,\n",
       "         0.33006896, -91.64416173,   3.56809569,  -2.47231507,\n",
       "         1.6260681 ])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.coef_"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e75338a2-9086-461e-9f30-3a51fa87a542",
   "metadata": {},
   "source": [
    "### Statsmodel api"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc42cee7-4ae2-487c-ad8b-0e1d347f50ca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Income</th>\n",
       "      <th>Cards</th>\n",
       "      <th>Age</th>\n",
       "      <th>Education</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Student</th>\n",
       "      <th>Married</th>\n",
       "      <th>Ethnicity</th>\n",
       "      <th>Balance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>14.891</td>\n",
       "      <td>2</td>\n",
       "      <td>34</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>106.025</td>\n",
       "      <td>3</td>\n",
       "      <td>82</td>\n",
       "      <td>15</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>903</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>104.593</td>\n",
       "      <td>4</td>\n",
       "      <td>71</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>580</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>148.924</td>\n",
       "      <td>3</td>\n",
       "      <td>36</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>55.882</td>\n",
       "      <td>2</td>\n",
       "      <td>68</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>331</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295</th>\n",
       "      <td>27.272</td>\n",
       "      <td>5</td>\n",
       "      <td>67</td>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>296</th>\n",
       "      <td>65.896</td>\n",
       "      <td>1</td>\n",
       "      <td>49</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>293</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>297</th>\n",
       "      <td>55.054</td>\n",
       "      <td>3</td>\n",
       "      <td>74</td>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>298</th>\n",
       "      <td>20.791</td>\n",
       "      <td>1</td>\n",
       "      <td>70</td>\n",
       "      <td>18</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>299</th>\n",
       "      <td>24.919</td>\n",
       "      <td>3</td>\n",
       "      <td>76</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>711</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>300 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      Income  Cards  Age  Education  Gender  Student  Married  Ethnicity  \\\n",
       "0     14.891      2   34         11       0        0        1          2   \n",
       "1    106.025      3   82         15       1        1        1          1   \n",
       "2    104.593      4   71         11       0        0        0          1   \n",
       "3    148.924      3   36         11       1        0        0          1   \n",
       "4     55.882      2   68         16       0        0        1          2   \n",
       "..       ...    ...  ...        ...     ...      ...      ...        ...   \n",
       "295   27.272      5   67         10       1        0        1          2   \n",
       "296   65.896      1   49         17       1        0        1          2   \n",
       "297   55.054      3   74         17       0        0        1          1   \n",
       "298   20.791      1   70         18       1        0        0          0   \n",
       "299   24.919      3   76         11       1        0        1          0   \n",
       "\n",
       "     Balance  \n",
       "0        333  \n",
       "1        903  \n",
       "2        580  \n",
       "3        964  \n",
       "4        331  \n",
       "..       ...  \n",
       "295        0  \n",
       "296      293  \n",
       "297      188  \n",
       "298        0  \n",
       "299      711  \n",
       "\n",
       "[300 rows x 9 columns]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def preprocess_categories(df):\n",
    "    df_out = df.copy()\n",
    "    for col in df.dtypes[df.dtypes=='category'].index:\n",
    "        df_out[col] = df[col].cat.codes\n",
    "    return df_out\n",
    "\n",
    "preprocess_categories(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad73cf4b-6928-4a3a-94db-b9751777b364",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>const</th>\n",
       "      <th>Income</th>\n",
       "      <th>Cards</th>\n",
       "      <th>Age</th>\n",
       "      <th>Education</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Student</th>\n",
       "      <th>Married</th>\n",
       "      <th>Ethnicity</th>\n",
       "      <th>Balance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>14.891</td>\n",
       "      <td>2</td>\n",
       "      <td>34</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>106.025</td>\n",
       "      <td>3</td>\n",
       "      <td>82</td>\n",
       "      <td>15</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>903</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>104.593</td>\n",
       "      <td>4</td>\n",
       "      <td>71</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>580</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>148.924</td>\n",
       "      <td>3</td>\n",
       "      <td>36</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.0</td>\n",
       "      <td>55.882</td>\n",
       "      <td>2</td>\n",
       "      <td>68</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>331</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295</th>\n",
       "      <td>1.0</td>\n",
       "      <td>27.272</td>\n",
       "      <td>5</td>\n",
       "      <td>67</td>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>296</th>\n",
       "      <td>1.0</td>\n",
       "      <td>65.896</td>\n",
       "      <td>1</td>\n",
       "      <td>49</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>293</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>297</th>\n",
       "      <td>1.0</td>\n",
       "      <td>55.054</td>\n",
       "      <td>3</td>\n",
       "      <td>74</td>\n",
       "      <td>17</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>298</th>\n",
       "      <td>1.0</td>\n",
       "      <td>20.791</td>\n",
       "      <td>1</td>\n",
       "      <td>70</td>\n",
       "      <td>18</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>299</th>\n",
       "      <td>1.0</td>\n",
       "      <td>24.919</td>\n",
       "      <td>3</td>\n",
       "      <td>76</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>711</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>300 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     const   Income  Cards  Age  Education  Gender  Student  Married  \\\n",
       "0      1.0   14.891      2   34         11       0        0        1   \n",
       "1      1.0  106.025      3   82         15       1        1        1   \n",
       "2      1.0  104.593      4   71         11       0        0        0   \n",
       "3      1.0  148.924      3   36         11       1        0        0   \n",
       "4      1.0   55.882      2   68         16       0        0        1   \n",
       "..     ...      ...    ...  ...        ...     ...      ...      ...   \n",
       "295    1.0   27.272      5   67         10       1        0        1   \n",
       "296    1.0   65.896      1   49         17       1        0        1   \n",
       "297    1.0   55.054      3   74         17       0        0        1   \n",
       "298    1.0   20.791      1   70         18       1        0        0   \n",
       "299    1.0   24.919      3   76         11       1        0        1   \n",
       "\n",
       "     Ethnicity  Balance  \n",
       "0            2      333  \n",
       "1            1      903  \n",
       "2            1      580  \n",
       "3            1      964  \n",
       "4            2      331  \n",
       "..         ...      ...  \n",
       "295          2        0  \n",
       "296          2      293  \n",
       "297          1      188  \n",
       "298          0        0  \n",
       "299          0      711  \n",
       "\n",
       "[300 rows x 10 columns]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = sm.add_constant(preprocess_categories(X))\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b43d7bd-46bd-412a-9645-43364bc89b3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = sm.OLS(y, X).fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af0e37a0-e8ab-41f0-b1aa-8f9b4a73a630",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>         <td>Rating</td>      <th>  R-squared:         </th> <td>   0.974</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.973</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   1185.</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Fri, 27 May 2022</td> <th>  Prob (F-statistic):</th> <td>6.33e-223</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>08:02:31</td>     <th>  Log-Likelihood:    </th> <td> -1385.4</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>   300</td>      <th>  AIC:               </th> <td>   2791.</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>   290</td>      <th>  BIC:               </th> <td>   2828.</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     9</td>      <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "      <td></td>         <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>const</th>     <td>  139.4908</td> <td>    9.595</td> <td>   14.538</td> <td> 0.000</td> <td>  120.607</td> <td>  158.375</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Income</th>    <td>    2.0946</td> <td>    0.048</td> <td>   43.507</td> <td> 0.000</td> <td>    2.000</td> <td>    2.189</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Cards</th>     <td>   -0.7769</td> <td>    1.080</td> <td>   -0.719</td> <td> 0.473</td> <td>   -2.903</td> <td>    1.349</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Age</th>       <td>    0.1493</td> <td>    0.086</td> <td>    1.740</td> <td> 0.083</td> <td>   -0.020</td> <td>    0.318</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Education</th> <td>    0.1721</td> <td>    0.474</td> <td>    0.363</td> <td> 0.717</td> <td>   -0.761</td> <td>    1.105</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Gender</th>    <td>    1.8529</td> <td>    2.919</td> <td>    0.635</td> <td> 0.526</td> <td>   -3.891</td> <td>    7.597</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Student</th>   <td>  -99.2582</td> <td>    4.947</td> <td>  -20.066</td> <td> 0.000</td> <td> -108.994</td> <td>  -89.522</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Married</th>   <td>    2.7424</td> <td>    2.983</td> <td>    0.919</td> <td> 0.359</td> <td>   -3.129</td> <td>    8.614</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Ethnicity</th> <td>   -0.3005</td> <td>    1.745</td> <td>   -0.172</td> <td> 0.863</td> <td>   -3.735</td> <td>    3.134</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Balance</th>   <td>    0.2316</td> <td>    0.004</td> <td>   63.330</td> <td> 0.000</td> <td>    0.224</td> <td>    0.239</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>43.876</td> <th>  Durbin-Watson:     </th> <td>   1.851</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.000</td> <th>  Jarque-Bera (JB):  </th> <td>  59.049</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td>-0.999</td> <th>  Prob(JB):          </th> <td>1.51e-13</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 3.857</td> <th>  Cond. No.          </th> <td>4.61e+03</td>\n",
       "</tr>\n",
       "</table><br/><br/>Notes:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.<br/>[2] The condition number is large, 4.61e+03. This might indicate that there are<br/>strong multicollinearity or other numerical problems."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                 Rating   R-squared:                       0.974\n",
       "Model:                            OLS   Adj. R-squared:                  0.973\n",
       "Method:                 Least Squares   F-statistic:                     1185.\n",
       "Date:                Fri, 27 May 2022   Prob (F-statistic):          6.33e-223\n",
       "Time:                        08:02:31   Log-Likelihood:                -1385.4\n",
       "No. Observations:                 300   AIC:                             2791.\n",
       "Df Residuals:                     290   BIC:                             2828.\n",
       "Df Model:                           9                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "const        139.4908      9.595     14.538      0.000     120.607     158.375\n",
       "Income         2.0946      0.048     43.507      0.000       2.000       2.189\n",
       "Cards         -0.7769      1.080     -0.719      0.473      -2.903       1.349\n",
       "Age            0.1493      0.086      1.740      0.083      -0.020       0.318\n",
       "Education      0.1721      0.474      0.363      0.717      -0.761       1.105\n",
       "Gender         1.8529      2.919      0.635      0.526      -3.891       7.597\n",
       "Student      -99.2582      4.947    -20.066      0.000    -108.994     -89.522\n",
       "Married        2.7424      2.983      0.919      0.359      -3.129       8.614\n",
       "Ethnicity     -0.3005      1.745     -0.172      0.863      -3.735       3.134\n",
       "Balance        0.2316      0.004     63.330      0.000       0.224       0.239\n",
       "==============================================================================\n",
       "Omnibus:                       43.876   Durbin-Watson:                   1.851\n",
       "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               59.049\n",
       "Skew:                          -0.999   Prob(JB):                     1.51e-13\n",
       "Kurtosis:                       3.857   Cond. No.                     4.61e+03\n",
       "==============================================================================\n",
       "\n",
       "Notes:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "[2] The condition number is large, 4.61e+03. This might indicate that there are\n",
       "strong multicollinearity or other numerical problems.\n",
       "\"\"\""
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a08f7238-5ef6-409e-866b-5bd4d30c22d7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Balance      63.329758\n",
       "Income       43.507422\n",
       "Student      20.066064\n",
       "const        14.538499\n",
       "Age           1.740111\n",
       "Married       0.919321\n",
       "Cards         0.719127\n",
       "Gender        0.634877\n",
       "Education     0.363174\n",
       "Ethnicity     0.172217\n",
       "dtype: float64"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.tvalues.abs().sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b899f23-2fb1-423c-9212-33c5722bbde6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Balance    63.329758\n",
       "Income     43.507422\n",
       "Student   -20.066064\n",
       "const      14.538499\n",
       "dtype: float64"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Signed t-statistics of the terms with p <= 0.05, ordered by |t| descending.\n",
    "model.tvalues[model.pvalues <= 0.05].sort_values(key=abs, ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7be36d13-32e1-4274-bd06-862895be0302",
   "metadata": {},
   "outputs": [],
   "source": [
    "# np.corr(model.fittedvalues,y.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a09f1fed-3174-4e7e-8c6d-686a4cbc8de0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pearson correlation between the fitted values and the observed ratings.\n",
    "# np.correlate is a signal-processing cross-correlation: for two equal-length\n",
    "# inputs in its default mode it returns the raw sum of products, which is not\n",
    "# a correlation coefficient. np.corrcoef returns the 2x2 correlation matrix;\n",
    "# the off-diagonal entry [0, 1] is the coefficient we want.\n",
    "np.corrcoef(model.fittedvalues.values, y.values)[0, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20d2c6d5-1b33-43b4-95fa-d6b6e15fc023",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      255.364880\n",
       "1      488.244174\n",
       "2      501.990296\n",
       "3      681.185956\n",
       "4      346.698891\n",
       "          ...    \n",
       "295    208.450617\n",
       "296    358.837627\n",
       "297    312.436387\n",
       "298    197.666967\n",
       "299    371.866045\n",
       "Length: 300, dtype: float64"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fittedvalues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84e39447-13cf-464a-9c8a-ec2d1f9fc151",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.       , 0.9866719],\n",
       "       [0.9866719, 1.       ]])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.corrcoef(model.fittedvalues.values, y.values)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4ddb1b1-bf69-4d3f-ad5b-630cc70071a2",
   "metadata": {},
   "source": [
    "### Limited Variables: Income, Cards, Married"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45f58e37-fc62-440f-8a45-2568e0f2d7a5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>const</th>\n",
       "      <th>Income</th>\n",
       "      <th>Cards</th>\n",
       "      <th>Married</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>14.891</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>106.025</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>104.593</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>148.924</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.0</td>\n",
       "      <td>55.882</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295</th>\n",
       "      <td>1.0</td>\n",
       "      <td>27.272</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>296</th>\n",
       "      <td>1.0</td>\n",
       "      <td>65.896</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>297</th>\n",
       "      <td>1.0</td>\n",
       "      <td>55.054</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>298</th>\n",
       "      <td>1.0</td>\n",
       "      <td>20.791</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>299</th>\n",
       "      <td>1.0</td>\n",
       "      <td>24.919</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>300 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     const   Income  Cards  Married\n",
       "0      1.0   14.891      2        1\n",
       "1      1.0  106.025      3        1\n",
       "2      1.0  104.593      4        0\n",
       "3      1.0  148.924      3        0\n",
       "4      1.0   55.882      2        1\n",
       "..     ...      ...    ...      ...\n",
       "295    1.0   27.272      5        1\n",
       "296    1.0   65.896      1        1\n",
       "297    1.0   55.054      3        1\n",
       "298    1.0   20.791      1        0\n",
       "299    1.0   24.919      3        1\n",
       "\n",
       "[300 rows x 4 columns]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_red = X[['const', 'Income', 'Cards', 'Married']]\n",
    "X_red"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c07628d2-e64e-4b80-8adc-acbaf46aecea",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d18b567f-4f32-40bf-98ed-1088b5237f99",
   "metadata": {},
   "outputs": [],
   "source": [
    "model2 = sm.OLS(y, X_red).fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8869c58c-7201-4689-a268-8cb3a0d158b2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>         <td>Rating</td>      <th>  R-squared:         </th> <td>   0.602</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.598</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   149.0</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Fri, 27 May 2022</td> <th>  Prob (F-statistic):</th> <td>7.56e-59</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>08:02:39</td>     <th>  Log-Likelihood:    </th> <td> -1792.1</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>   300</td>      <th>  AIC:               </th> <td>   3592.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>   296</td>      <th>  BIC:               </th> <td>   3607.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     3</td>      <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "     <td></td>        <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>const</th>   <td>  165.2144</td> <td>   16.641</td> <td>    9.928</td> <td> 0.000</td> <td>  132.464</td> <td>  197.964</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Income</th>  <td>    3.4196</td> <td>    0.164</td> <td>   20.896</td> <td> 0.000</td> <td>    3.098</td> <td>    3.742</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Cards</th>   <td>    8.2699</td> <td>    4.099</td> <td>    2.018</td> <td> 0.045</td> <td>    0.203</td> <td>   16.336</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Married</th> <td>   11.8404</td> <td>   11.339</td> <td>    1.044</td> <td> 0.297</td> <td>  -10.474</td> <td>   34.155</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>133.940</td> <th>  Durbin-Watson:     </th> <td>   1.873</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.000</td>  <th>  Jarque-Bera (JB):  </th> <td>  17.170</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 0.044</td>  <th>  Prob(JB):          </th> <td>0.000187</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 1.831</td>  <th>  Cond. No.          </th> <td>    179.</td>\n",
       "</tr>\n",
       "</table><br/><br/>Notes:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                 Rating   R-squared:                       0.602\n",
       "Model:                            OLS   Adj. R-squared:                  0.598\n",
       "Method:                 Least Squares   F-statistic:                     149.0\n",
       "Date:                Fri, 27 May 2022   Prob (F-statistic):           7.56e-59\n",
       "Time:                        08:02:39   Log-Likelihood:                -1792.1\n",
       "No. Observations:                 300   AIC:                             3592.\n",
       "Df Residuals:                     296   BIC:                             3607.\n",
       "Df Model:                           3                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "const        165.2144     16.641      9.928      0.000     132.464     197.964\n",
       "Income         3.4196      0.164     20.896      0.000       3.098       3.742\n",
       "Cards          8.2699      4.099      2.018      0.045       0.203      16.336\n",
       "Married       11.8404     11.339      1.044      0.297     -10.474      34.155\n",
       "==============================================================================\n",
       "Omnibus:                      133.940   Durbin-Watson:                   1.873\n",
       "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               17.170\n",
       "Skew:                           0.044   Prob(JB):                     0.000187\n",
       "Kurtosis:                       1.831   Cond. No.                         179.\n",
       "==============================================================================\n",
       "\n",
       "Notes:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "\"\"\""
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model2.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c15f308-a5bd-420a-9060-aae8a3a558e4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Income    20.895784\n",
       "const      9.928059\n",
       "Cards      2.017633\n",
       "dtype: float64"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Signed t-statistics of the reduced model's terms with p <= 0.05, ordered by |t| descending.\n",
    "model2.tvalues[model2.pvalues <= 0.05].sort_values(key=abs, ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73e5e5eb-5d84-4cf4-917d-2224ee66fb5a",
   "metadata": {},
   "source": [
    "## HR Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8a4a2cb-5ae8-434d-a6e6-44d8f7f8e51e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>S</th>\n",
       "      <th>LPE</th>\n",
       "      <th>NP</th>\n",
       "      <th>ANH</th>\n",
       "      <th>TIC</th>\n",
       "      <th>Newborn</th>\n",
       "      <th>left</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      S   LPE  NP  ANH  TIC  Newborn  left\n",
       "0  0.38  0.53   2  157    3        0     1\n",
       "1  0.80  0.86   5  262    6        0     1\n",
       "2  0.11  0.88   7  272    4        0     1\n",
       "3  0.72  0.87   5  223    5        0     1\n",
       "4  0.37  0.52   2  159    3        0     1"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_hr = pd.read_csv(\"DATA_3.02_HR2.csv\"); df_hr.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3c97757-53c6-4c41-b6d3-8f8937d66370",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 12000 entries, 0 to 11999\n",
      "Data columns (total 7 columns):\n",
      " #   Column   Non-Null Count  Dtype  \n",
      "---  ------   --------------  -----  \n",
      " 0   S        12000 non-null  float64\n",
      " 1   LPE      12000 non-null  float64\n",
      " 2   NP       12000 non-null  int64  \n",
      " 3   ANH      12000 non-null  int64  \n",
      " 4   TIC      12000 non-null  int64  \n",
      " 5   Newborn  12000 non-null  int64  \n",
      " 6   left     12000 non-null  int64  \n",
      "dtypes: float64(2), int64(5)\n",
      "memory usage: 656.4 KB\n"
     ]
    }
   ],
   "source": [
    "df_hr.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b631c30c-cc89-40e0-8fb8-652392f52d36",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>S</th>\n",
       "      <th>LPE</th>\n",
       "      <th>NP</th>\n",
       "      <th>ANH</th>\n",
       "      <th>TIC</th>\n",
       "      <th>Newborn</th>\n",
       "      <th>left</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>12000.000000</td>\n",
       "      <td>12000.000000</td>\n",
       "      <td>12000.000000</td>\n",
       "      <td>12000.000000</td>\n",
       "      <td>12000.000000</td>\n",
       "      <td>12000.000000</td>\n",
       "      <td>12000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.629463</td>\n",
       "      <td>0.716558</td>\n",
       "      <td>3.801833</td>\n",
       "      <td>200.437917</td>\n",
       "      <td>3.228750</td>\n",
       "      <td>0.154167</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.241100</td>\n",
       "      <td>0.168368</td>\n",
       "      <td>1.163906</td>\n",
       "      <td>48.740178</td>\n",
       "      <td>1.056811</td>\n",
       "      <td>0.361123</td>\n",
       "      <td>0.372694</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.090000</td>\n",
       "      <td>0.360000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>96.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.480000</td>\n",
       "      <td>0.570000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>157.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.660000</td>\n",
       "      <td>0.720000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>199.500000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.820000</td>\n",
       "      <td>0.860000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>243.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>310.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  S           LPE            NP           ANH           TIC  \\\n",
       "count  12000.000000  12000.000000  12000.000000  12000.000000  12000.000000   \n",
       "mean       0.629463      0.716558      3.801833    200.437917      3.228750   \n",
       "std        0.241100      0.168368      1.163906     48.740178      1.056811   \n",
       "min        0.090000      0.360000      2.000000     96.000000      2.000000   \n",
       "25%        0.480000      0.570000      3.000000    157.000000      2.000000   \n",
       "50%        0.660000      0.720000      4.000000    199.500000      3.000000   \n",
       "75%        0.820000      0.860000      5.000000    243.000000      4.000000   \n",
       "max        1.000000      1.000000      7.000000    310.000000      6.000000   \n",
       "\n",
       "            Newborn          left  \n",
       "count  12000.000000  12000.000000  \n",
       "mean       0.154167      0.166667  \n",
       "std        0.361123      0.372694  \n",
       "min        0.000000      0.000000  \n",
       "25%        0.000000      0.000000  \n",
       "50%        0.000000      0.000000  \n",
       "75%        0.000000      0.000000  \n",
       "max        1.000000      1.000000  "
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_hr.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd2a4d7b-96db-42bf-86a3-ea8a1ff9153f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:ylabel='Frequency'>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZoAAAD7CAYAAABT2VIoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8/fFQqAAAACXBIWXMAAAsTAAALEwEAmpwYAAAX8ElEQVR4nO3de2xT9/3G8cc2kMIAhWRJMJcWoSFIhwZqwtAqUNcEMNAkrAgWlJato5RubcelHQOxHwkFxBouXbluMKYyrQi0SiuXtEoyytaMDTroChQCYYRLsyWQ4IBCKZBgn98fVTPYejmx/fUh9vslIeHztXOejyB+fI6dE5dlWZYAADDE7XQAAEBso2gAAEZRNAAAoygaAIBRFA0AwCiKBgBgFEUDADCqg9MB7laXL19TMNj2HzFKTu4qv/8jA4nuXswcH+Jt5nibVwpvZrfbpR49vvKZaxTN5wgGrZCK5tPHxhtmjg/xNnO8zSuZmZlTZwAAoygaAIBRFA0AwCiKBgBgVFSKpri4WFlZWRo4cKBOnTrVuv3s2bPKz8+Xz+dTfn6+zp07Z3QNABB9USma7Oxsbd26Vb17975je1FRkQoKClRWVqaCggIVFhYaXQMARF9UiiYzM1Ner/eObX6/X5WVlcrJyZEk5eTkqLKyUo2NjUbWAADOcOznaOrq6pSWliaPxyNJ8ng8Sk1NVV1dnSzLivhaUlJSm/IlJ3cNaa7mloBSUrqF9NhwNLcE1KmjJ+r7/ZQTMzuNmWNfvM0rmZmZH9j8HH7/RyH94FJKSjflvrDTQKIvtnvVBDU0XI36fqVPZnZq305h5tgXb/NK4c3sdrs+9wW6Y0Xj9Xp18eJFBQIBeTweBQIB1dfXy+v1yrKsiK8BAJzh2Mebk5OTlZ6erpKSEklSSUmJ0tPTlZSUZGQNAOAMl2VZxi/ms3TpUpWXl+vSpUvq0aOHEhMT9eabb6q6ulrz589XU1OTunfvruLiYvXv31+SjKy1BafO7OMUQ3yIt5njbV7J3KmzqBRNe0TR2Mc3ZHyIt5njbV7JXNFwZQAAgFEUDQDAKIoGAGAURQMAMIqiAQAYRdEAAIyiaAAARlE0AACjKBoAgFEUDQDAKIoGAGAURQMAMIqiAQAYRdEAAIyiaAAARlE0AACjKBoAgFEUDQDAKIoGAGAURQMAMIqiAQAYRdEAAIyiaAAARlE0AACjKBoAgFEUDQDAKIoGAGAURQMAMIqiAQAYRdEAAIyiaAAARlE0AACj7oqi+dOf/qTvfOc7mjBhgvLy8lReXi5JOnv2rPLz8+Xz+ZSfn69z5861PibUNQBAdDleNJZl6ac//amWL1+unTt3avny5Zo3b56CwaCKiopUUFCgsrIyFRQUqLCwsPVxoa4BAKLL8aKRJLfbratXr0qSrl69qtTUVF2+fFmVlZXKycmRJOXk5KiyslKNjY3y+/0hrQEAoq+D0wFcLpdeeeUVPfPMM+rSpYuuXbumTZs2qa6uTmlpafJ4PJIkj8ej1NRU1dXVybKskNaSkpIcmxMA4pXjRXPr1i1t3LhRGzZsUEZGht577z3Nnj1by5cvdzRXcnJXR/cfipSUbnG5b6cwc+yLt3klMzM7XjQnTpxQfX29MjIyJEkZGRnq3LmzEhISdPHiRQUCAXk8HgUCAdXX18vr9cqyrJDW2sLv/0jBoNXmeZz8j9nQcNWR/aakdHNs305h5tgXb/NK4c3sdrs+9wW64+/R9OzZUxcuXNCZM2ckSdXV1fL7/brvvvuUnp6ukpISSVJJSYnS09OVlJSk5OTkkNYAANHnsiyr7S/bI2zXrl369a9/LZfLJUmaOXOmRo0aperqas2fP19NTU3q3r27iouL1b9/f0kKec2ucI5ocl/Y2ebHhWv3qgkc0UQRM8e+eJtXMndEc1cUzd2IorGPb8j4EG8zx9u8UgyfOgMAxDaKBgBgFEUDADCKogEAGEXRAACMomgA
AEZRNAAAoygaAIBRFA0AwCiKBgBgFEUDADCKogEAGEXRAACMomgAAEZRNAAAoygaAIBRFA0AwCiKBgBgFEUDADCKogEAGEXRAACMomgAAEZRNAAAoygaAIBRtotmz549unXrlsksAIAYZLto1qxZoxEjRmjx4sU6cuSIyUwAgBhiu2h27dqlLVu2KCEhQT/+8Y/l8/m0YcMG/etf/zKZDwDQzrXpPZpBgwZp3rx5euedd1RUVKTS0lKNHj1ajz32mHbt2qVgMGgqJwCgnerQ1gd8+OGH2rVrl3bt2iWXy6WZM2fK6/Vq69atKi8v17p160zkBAC0U7aLZuvWrdq5c6fOnz+vcePGafny5Ro6dGjrus/n04MPPmgiIwCgHbNdNBUVFfrBD36g7OxsderU6X/WO3furLVr10Y0HACg/bNdNGvWrJHb7VbHjh1bt7W0tMiyrNbiGTFiROQTAgDaNdsfBpg2bZqOHz9+x7bjx4/rySefDDvEzZs3VVRUpDFjxig3N1cLFy6UJJ09e1b5+fny+XzKz8/XuXPnWh8T6hoAILpsF01VVZWGDBlyx7ZvfOMbOnnyZNghVqxYoYSEBJWVlWn37t2aNWuWJKmoqEgFBQUqKytTQUGBCgsLWx8T6hoAILpsF0337t116dKlO7ZdunRJnTt3DivAtWvXtGPHDs2aNUsul0uS9NWvflV+v1+VlZXKycmRJOXk5KiyslKNjY0hrwEAos920YwZM0YvvPCCTp06pevXr6uqqkrz5s3TuHHjwgpQU1OjxMRErVu3ThMnTtTUqVN16NAh1dXVKS0tTR6PR5Lk8XiUmpqqurq6kNcAANFn+8MAc+bM0UsvvaTJkyerublZCQkJmjhxop5//vmwAgQCAdXU1Oj+++/XvHnzdOTIEf3whz/U6tWrw/q64UpO7uro/kORktItLvftFGaOffE2r2RmZttFk5CQoKKiIhUWFury5cvq0aNH66mucHi9XnXo0KH1VNeQIUPUo0cP3XPPPbp48aICgYA8Ho8CgYDq6+vl9XplWVZIa23h93+kYNBq8zxO/sdsaLjqyH5TUro5tm+nMHPsi7d5pfBmdrtdn/sCvU2XoLl69ao++OADVVVV6cCBA9q/f7/2798fUqhPJSUlafjw4frrX/8q6ZNPjPn9fvXr10/p6ekqKSmRJJWUlCg9PV1JSUlKTk4OaQ0AEH0uy7JsvWz/wx/+oMWLF6tLly665557/vMFXC69/fbbYYWoqanRggULdOXKFXXo0EGzZ8/WQw89pOrqas2fP19NTU3q3r27iouL1b9/f0kKec2ucI5ocl/Y2ebHhWv3qgkc0UQRM8e+eJtXMndEY7toRo4cqaVLl+qhhx4KKUR7Q9HYxzdkfIi3meNtXukuOHUWCAT4yX8AQJvZLpqnnnpKv/zlL/lVAACANrH9qbMtW7bo0qVL2rx5sxITE+9Y+/Of/xzhWACAWGG7aFasWGEyBwAgRtkumm9+85smcwAAYpTt92iam5v1i1/8QtnZ2crIyJAk7du3T6+99pqxcACA9s920SxbtkynTp3SypUrW68IMGDAAG3bts1YOABA+2f71NmePXtUXl6uLl26yO3+pJ/S0tJ08eJFY+EAAO2f7SOajh07KhAI3LGtsbHxfz6BBgDA7WwXzdixYzVv3jzV1NRIkurr67V48WI98sgjxsIBANo/20UzZ84c9enTR3l5eWpqapLP51NqaqqeffZZk/kAAO2c7fdoOnXqpAULFmjBggVqbGyM2K8JAADENttF8+kps09du3at9e99+/aNXCIAQEyxXTSjR4+Wy+XS7Rd7/vSI5sSJE5FPBgCICbaL5uTJk3fcbmho0Lp165SZmRnxUACA2NGm37B5u5SUFP3sZz/Tyy+/HMk8AIAYE3LRSNKZM2d0/fr1SGUBAMQg26fOCgoK7viU2fXr13X69Gk+3gwA+EK2i2by5Ml33O7cubMGDRqkfv36RToTACCG
2C6aRx991GQOAECMsl00q1evtnW/WbNmhRwGABB7bBfN+fPnVV5ersGDB6t3796qra3VBx98oDFjxighIcFkRgBAO2a7aCzL0qpVq+Tz+Vq3lZeXq7S0VD//+c+NhAMAtH+2P95cUVGhUaNG3bEtKytL77zzTsRDAQBih+2iue+++7R169Y7tm3btk333ntvxEMBAGKH7VNnS5cu1XPPPafNmze3/mbNDh06aO3atSbzAQDaOdtFc//996usrExHjhxRfX29UlJSNHToUHXs2NFkPgBAOxfyJWiGDRumlpYWffzxx5HMAwCIMbaPaKqqqvSjH/1InTp10sWLFzV+/HgdPHhQb7zxhl555RWDEQEA7ZntI5pFixZp5syZKi0tVYcOn/TTsGHD9N577xkLBwBo/2wXzenTpzVhwgRJ//mFZ126dNHNmzfNJAMAxATbRdO7d28dO3bsjm1Hjx7l480AgC9k+z2aWbNm6emnn9aUKVPU0tKijRs3avv27VqyZInJfACAds72Ec3DDz+szZs3q7GxUcOGDdO///1vrV27ViNGjIhYmHXr1mngwIE6deqUJOnw4cPKy8uTz+fTtGnT5Pf7W+8b6hoAILpsFU0gENCoUaP0ta99TYsWLdKmTZu0ePFiDR48OGJBjh8/rsOHD6t3796SpGAwqLlz56qwsFBlZWXKzMzUypUrw1oDAESfraLxeDzyeDzG3vhvbm7W4sWLtWjRotZtx44dU0JCgjIzMyVJU6ZMUWlpaVhrAIDos/0ezfe+9z3Nnj1bTz/9tHr27HnHr3Xu27dvWCFWr16tvLw89enTp3VbXV2devXq1Xo7KSlJwWBQV65cCXktMTHRdqbk5K5hzeSElJRucblvpzBz7Iu3eSUzM39p0TQ0NCglJaX1Tf+//e1vsiyrdd3lcunEiRMhB3j//fd17Ngx/eQnPwn5a5jg93+kYND68jv+Fyf/YzY0XHVkvykp3Rzbt1OYOfbF27xSeDO73a7PfYH+pUXj8/n0j3/8QydPnpQkPfvss1q/fn1IQT7LwYMHVV1drezsbEnShQsX9OSTT2rq1Kmqra1tvV9jY6PcbrcSExPl9XpDWgMARN+Xvkdz+9GL9EkxRNKMGTO0b98+7d27V3v37lXPnj31m9/8RtOnT9eNGzd06NAhSdL27ds1duxYSdLgwYNDWgMARN+XHtHc/l6M9L/FY4rb7dby5ctVVFSkmzdvqnfv3lqxYkVYawCA6PvSogkEAjpw4EBrwfz3bUn61re+FbFAe/fubf37Aw88oN27d3/m/UJdAwBE15cWTXJyshYsWNB6OzEx8Y7bLpdLb7/9tpl0AIB270uL5vYjDAAA2irkX3wGAIAdFA0AwCiKBgBgFEUDADCKogEAGEXRAACMomgAAEZRNAAAoygaAIBRFA0AwCiKBgBgFEUDADCKogEAGEXRAACMomgAAEZRNAAAoygaAIBRFA0AwCiKBgBgFEUDADCKogEAGEXRAACMomgAAEZRNAAAoygaAIBRFA0AwCiKBgBgFEUDADCKogEAGEXRAACMcrxoLl++rKeeeko+n0+5ubl67rnn1NjYKEk6fPiw8vLy5PP5NG3aNPn9/tbHhboGAIgux4vG5XJp+vTpKisr0+7du9W3b1+tXLlSwWBQc+fOVWFhocrKypSZmamVK1dKUshrAIDoc7xoEhMTNXz48NbbQ4cOVW1trY4dO6aEhARlZmZKkqZMmaLS0lJJCnkNABB9jhfN7YLBoLZt26asrCzV1dWpV69erWtJSUkKBoO6cuVKyGsAgOjr4HSA2y1ZskRdunTR448/rj/+8Y+OZklO7uro/kORktItLvftFGaOfU7N29wSUKeOHkf2a2Lmu6ZoiouLdf78ef3qV7+S2+2W1+tVbW1t63pjY6PcbrcSExNDXmsLv/8jBYNWm+dw8huxoeGqI/tNSenm2L6dwsyxz8l5U1K6KfeFnVHf7+5VE0Ke2e12fe4L
9Lvi1NnLL7+sY8eOaf369erUqZMkafDgwbpx44YOHTokSdq+fbvGjh0b1hoAIPocP6L55z//qY0bN6pfv36aMmWKJKlPnz5av369li9frqKiIt28eVO9e/fWihUrJElutzukNQBA9DleNAMGDFBVVdVnrj3wwAPavXt3RNcAANF1V5w6AwDELooGAGAURQMAMIqiAQAYRdEAAIyiaAAARlE0AACjKBoAgFEUDQDAKIoGAGAURQMAMIqiAQAYRdEAAIyiaAAARlE0AACjKBoAgFEUDQDAKIoGAGAURQMAMIqiAQAYRdEAAIyiaAAARlE0AACjKBoAgFEUDQDAKIoGAGAURQMAMIqiAQAYRdEAAIyiaAAARlE0AACjKBoAgFEUDQDAqJgtmrNnzyo/P18+n0/5+fk6d+6c05EAIC7FbNEUFRWpoKBAZWVlKigoUGFhodORACAudXA6gAl+v1+VlZV69dVXJUk5OTlasmSJGhsblZSUZOtruN2ukPef2qNzyI8NRziZ2/O+ncLMsc/Jedvb88gXPS4mi6aurk5paWnyeDySJI/Ho9TUVNXV1dkumh49vhLy/n/zf2NCfmw4kpO7OrJfp/ftFGaOfU7OG0vPIzF76gwAcHeIyaLxer26ePGiAoGAJCkQCKi+vl5er9fhZAAQf2KyaJKTk5Wenq6SkhJJUklJidLT022fNgMARI7LsizL6RAmVFdXa/78+WpqalL37t1VXFys/v37Ox0LAOJOzBYNAODuEJOnzgAAdw+KBgBgFEUDADCKogEAGEXRhMDOBTsDgYBefPFFjRo1SqNHj9brr78e/aARZGfm9evX65FHHlFubq4mTpyov/zlL9EPGkFtuTDrmTNnNGTIEBUXF0cvoAF2Z37rrbeUm5urnJwc5ebm6tKlS9ENGiF25vX7/ZoxY4Zyc3M1btw4LVq0SLdu3Yp+2AgpLi5WVlaWBg4cqFOnTn3mfSL+/GWhzaZOnWrt2LHDsizL2rFjhzV16tT/uc8bb7xhTZs2zQoEApbf77dGjhxp1dTURDtqxNiZuaKiwvr4448ty7KsEydOWBkZGdb169ejmjOS7MxsWZZ169Yt6/HHH7eef/5566WXXopmxIizM/PRo0etcePGWfX19ZZlWVZTU5N148aNqOaMFDvzLl26tPXftbm52Zo0aZL15ptvRjVnJB08eNCqra21Hn74Yauqquoz7xPp5y+OaNro0wt25uTkSPrkgp2VlZVqbGy8435vvfWWJk+eLLfbraSkJI0aNUqlpaVORA6b3ZlHjhypzp0/uRDgwIEDZVmWrly5Eu24EWF3ZknatGmTvv3tb6tfv35RThlZdmfesmWLpk2bppSUFElSt27dlJCQEPW84bI7r8vl0rVr1xQMBtXc3KyWlhalpaU5ETkiMjMzv/QqKZF+/qJo2uiLLtj53/fr1atX622v16sLFy5ENWuk2J35djt27NC9996rnj17RitmRNmd+eTJk9q3b5+eeOIJB1JGlt2Zq6urVVNTo8cee0yPPvqoNmzYIKsd/jie3XmfeeYZnT17ViNGjGj9k5GR4UTkqIn08xdFg4j7+9//rtWrV2vVqlVORzGqpaVFCxcu1Isvvtj6ZBUPAoGAqqqq9Oqrr+p3v/udKioqtHPnTqdjGVNaWqqBAwdq3759qqio0KFDh9rt2QmnUDRtZPeCnV6vV7W1ta236+rq2u2r+7ZcpPT999/X3LlztX79+nZ9yR87Mzc0NOjDDz/UjBkzlJWVpd/+9rf6/e9/r4ULFzoVOyx2/5179eqlsWPHqlOnTuratauys7N19OhRJyKHxe68r732mvLy8uR2u9WtWzdlZWXp3XffdSJy1ET6+YuiaSO7F+wcO3asXn/9dQWDQTU2NmrPnj3y+XxORA6b3ZmPHj2qOXPmaM2aNfr617/uRNSIsTNzr1699O6772rv3r3au3evvv/97+u73/2ulixZ4lTssNj9d87JydG+fftk
WZZaWlp04MABDRo0yInIYbE7b58+fVRRUSFJam5u1v79+zVgwICo542miD9/hfwxgjh2+vRpa9KkSdaYMWOsSZMmWdXV1ZZlWdb06dOto0ePWpb1ySeRCgsLrezsbCs7O9vavn27k5HDZmfmiRMnWsOHD7fy8vJa/5w8edLJ2GGxM/Pt1qxZ0+4/dWZn5kAgYC1btswaO3asNX78eGvZsmVWIBBwMnbI7Mx7/vx564knnrBycnKscePGWYsWLbJaWlqcjB2WJUuWWCNHjrTS09OtBx980Bo/frxlWWafv7ioJgDAKE6dAQCMomgAAEZRNAAAoygaAIBRFA0AwCiKBgBgFEUDADCKogEAGPX/DOJu0hB5BZYAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df_hr['left'].plot.hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84e823bb-82e6-467b-b67b-24dedbda07f4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:ylabel='Frequency'>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df_hr['S'].plot.hist(bins=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e7d2e0f-b5dd-4d01-a159-64bd49d06e28",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the frame into the target (left) and the feature matrix.\n",
    "# NOTE: pop() removes 'left' from df_hr in place, so re-running this cell\n",
    "# without reloading df_hr raises KeyError.\n",
    "y = df_hr.pop('left')\n",
    "X = df_hr.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66b02a04-fd6c-44e5-8803-e25be209c3d7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>S</th>\n",
       "      <th>LPE</th>\n",
       "      <th>NP</th>\n",
       "      <th>ANH</th>\n",
       "      <th>TIC</th>\n",
       "      <th>Newborn</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11995</th>\n",
       "      <td>0.90</td>\n",
       "      <td>0.55</td>\n",
       "      <td>3</td>\n",
       "      <td>259</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11996</th>\n",
       "      <td>0.74</td>\n",
       "      <td>0.95</td>\n",
       "      <td>5</td>\n",
       "      <td>266</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11997</th>\n",
       "      <td>0.85</td>\n",
       "      <td>0.54</td>\n",
       "      <td>3</td>\n",
       "      <td>185</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11998</th>\n",
       "      <td>0.33</td>\n",
       "      <td>0.65</td>\n",
       "      <td>3</td>\n",
       "      <td>172</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11999</th>\n",
       "      <td>0.50</td>\n",
       "      <td>0.73</td>\n",
       "      <td>4</td>\n",
       "      <td>180</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>12000 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          S   LPE  NP  ANH  TIC  Newborn\n",
       "0      0.38  0.53   2  157    3        0\n",
       "1      0.80  0.86   5  262    6        0\n",
       "2      0.11  0.88   7  272    4        0\n",
       "3      0.72  0.87   5  223    5        0\n",
       "4      0.37  0.52   2  159    3        0\n",
       "...     ...   ...  ..  ...  ...      ...\n",
       "11995  0.90  0.55   3  259    2        1\n",
       "11996  0.74  0.95   5  266    4        0\n",
       "11997  0.85  0.54   3  185    3        0\n",
       "11998  0.33  0.65   3  172    5        0\n",
       "11999  0.50  0.73   4  180    3        0\n",
       "\n",
       "[12000 rows x 6 columns]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59f26d5b-6e3e-4fd4-b4a1-a065d06989a2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0        1\n",
       "1        1\n",
       "2        1\n",
       "3        1\n",
       "4        1\n",
       "        ..\n",
       "11995    0\n",
       "11996    0\n",
       "11997    0\n",
       "11998    0\n",
       "11999    0\n",
       "Name: left, Length: 12000, dtype: int64"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25f796be-6d89-4a65-a279-f23c79542383",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = sm.add_constant(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01477663-857c-4f90-b814-635cbb1bffb5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.354538\n",
      "         Iterations 7\n"
     ]
    }
   ],
   "source": [
    "# Fit a logistic regression of 'left' on all features plus the intercept (maximum likelihood).\n",
    "model_hr = sm.Logit(y, X).fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f327cbc-aa9e-4966-9b8b-a5f92f387168",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>Logit Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>         <td>left</td>       <th>  No. Observations:  </th>  <td> 12000</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                 <td>Logit</td>      <th>  Df Residuals:      </th>  <td> 11993</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>                 <td>MLE</td>       <th>  Df Model:          </th>  <td>     6</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>            <td>Fri, 27 May 2022</td> <th>  Pseudo R-squ.:     </th>  <td>0.2131</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                <td>08:04:49</td>     <th>  Log-Likelihood:    </th> <td> -4254.5</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>converged:</th>             <td>True</td>       <th>  LL-Null:           </th> <td> -5406.7</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>     <td>nonrobust</td>    <th>  LLR p-value:       </th>  <td> 0.000</td> \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "     <td></td>        <th>coef</th>     <th>std err</th>      <th>z</th>      <th>P>|z|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>const</th>   <td>   -1.2412</td> <td>    0.160</td> <td>   -7.751</td> <td> 0.000</td> <td>   -1.555</td> <td>   -0.927</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>S</th>       <td>   -3.8163</td> <td>    0.121</td> <td>  -31.607</td> <td> 0.000</td> <td>   -4.053</td> <td>   -3.580</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>LPE</th>     <td>    0.5044</td> <td>    0.181</td> <td>    2.788</td> <td> 0.005</td> <td>    0.150</td> <td>    0.859</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>NP</th>      <td>   -0.3592</td> <td>    0.026</td> <td>  -13.569</td> <td> 0.000</td> <td>   -0.411</td> <td>   -0.307</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>ANH</th>     <td>    0.0038</td> <td>    0.001</td> <td>    6.067</td> <td> 0.000</td> <td>    0.003</td> <td>    0.005</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>TIC</th>     <td>    0.6188</td> <td>    0.027</td> <td>   22.820</td> <td> 0.000</td> <td>    0.566</td> <td>    0.672</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Newborn</th> <td>   -1.4851</td> <td>    0.113</td> <td>  -13.157</td> <td> 0.000</td> <td>   -1.706</td> <td>   -1.264</td>\n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                           Logit Regression Results                           \n",
       "==============================================================================\n",
       "Dep. Variable:                   left   No. Observations:                12000\n",
       "Model:                          Logit   Df Residuals:                    11993\n",
       "Method:                           MLE   Df Model:                            6\n",
       "Date:                Fri, 27 May 2022   Pseudo R-squ.:                  0.2131\n",
       "Time:                        08:04:49   Log-Likelihood:                -4254.5\n",
       "converged:                       True   LL-Null:                       -5406.7\n",
       "Covariance Type:            nonrobust   LLR p-value:                     0.000\n",
       "==============================================================================\n",
       "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "const         -1.2412      0.160     -7.751      0.000      -1.555      -0.927\n",
       "S             -3.8163      0.121    -31.607      0.000      -4.053      -3.580\n",
       "LPE            0.5044      0.181      2.788      0.005       0.150       0.859\n",
       "NP            -0.3592      0.026    -13.569      0.000      -0.411      -0.307\n",
       "ANH            0.0038      0.001      6.067      0.000       0.003       0.005\n",
       "TIC            0.6188      0.027     22.820      0.000       0.566       0.672\n",
       "Newborn       -1.4851      0.113    -13.157      0.000      -1.706      -1.264\n",
       "==============================================================================\n",
       "\"\"\""
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_hr.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8046b296-fed4-47da-90f4-272f3593f146",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[0;31mSignature:\u001b[0m \u001b[0mmodel_hr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexog\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
       "\u001b[0;31mDocstring:\u001b[0m\n",
       "Call self.model.predict with self.params as the first argument.\n",
       "\n",
       "Parameters\n",
       "----------\n",
       "exog : array_like, optional\n",
       "    The values for which you want to predict. see Notes below.\n",
       "transform : bool, optional\n",
       "    If the model was fit via a formula, do you want to pass\n",
       "    exog through the formula. Default is True. E.g., if you fit\n",
       "    a model y ~ log(x1) + log(x2), and transform is True, then\n",
       "    you can pass a data structure that contains x1 and x2 in\n",
       "    their original form. Otherwise, you'd need to log the data\n",
       "    first.\n",
       "*args\n",
       "    Additional arguments to pass to the model, see the\n",
       "    predict method of the model for the details.\n",
       "**kwargs\n",
       "    Additional keywords arguments to pass to the model, see the\n",
       "    predict method of the model for the details.\n",
       "\n",
       "Returns\n",
       "-------\n",
       "array_like\n",
       "    See self.model.predict.\n",
       "\n",
       "Notes\n",
       "-----\n",
       "The types of exog that are supported depends on whether a formula\n",
       "was used in the specification of the model.\n",
       "\n",
       "If a formula was used, then exog is processed in the same way as\n",
       "the original data. This transformation needs to have key access to the\n",
       "same variable names, and can be a pandas DataFrame or a dict like\n",
       "object that contains numpy arrays.\n",
       "\n",
       "If no formula was used, then the provided exog needs to have the\n",
       "same number of columns as the original exog in the model. No\n",
       "transformation of the data is performed except converting it to\n",
       "a numpy array.\n",
       "\n",
       "Row indices as in pandas data frames are supported, and added to the\n",
       "returned prediction.\n",
       "\u001b[0;31mFile:\u001b[0m      /opt/anaconda/envs/aiking/lib/python3.9/site-packages/statsmodels/base/model.py\n",
       "\u001b[0;31mType:\u001b[0m      method\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "model_hr.predict?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "177f018a-d746-4fb8-b1dc-83f71699ef6d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7.641666666666667"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Percentage of observations whose predicted probability of left=1 exceeds the cutoff.\n",
    "cutoff = 0.5\n",
    "(model_hr.predict(X) > cutoff).sum()*100/len(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cc0ea1a-eae5-4979-af3d-c5b5e511e7fe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>left</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>row_0</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>9464</td>\n",
       "      <td>1619</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>536</td>\n",
       "      <td>381</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "left      0     1\n",
       "row_0            \n",
       "False  9464  1619\n",
       "True    536   381"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Confusion matrix: rows = predicted class (probability > cutoff), columns = actual 'left'.\n",
    "pd.crosstab(model_hr.predict(X) >cutoff, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3563e23-3c27-410d-8a72-81e32b2a60ce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.9464, 0.1905)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Counts copied from the confusion matrix above: TN=9464, FP=536, FN=1619, TP=381.\n",
    "# First value: specificity = TN / (TN + FP); second value: sensitivity = TP / (TP + FN).\n",
    "9464/(9464+536), 381/(1619+381)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55ab4840-1389-491a-aac8-7318dfffbaad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9235833333333333"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (TN + FN) / N: the share of observations predicted as left=0.\n",
    "# NOTE(review): this is not the accuracy; accuracy is (TN + TP) / N.\n",
    "(9464+1619)/12000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64ca6dff-f5c6-4d1f-bfa0-64b40d55d091",
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy = (9464+381)/(9464+381+526+1619); accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47df970f-6673-41e3-bf10-a06dbbbcfa55",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "S         -31.606505\n",
       "TIC        22.820109\n",
       "NP        -13.569440\n",
       "Newborn   -13.156788\n",
       "const      -7.751316\n",
       "ANH         6.067180\n",
       "LPE         2.788130\n",
       "dtype: float64"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# z-statistics of the coefficients with p-value <= 0.05, ordered by absolute size (largest first).\n",
    "model_hr.tvalues[model_hr.tvalues[model_hr.pvalues <= 0.05].abs().sort_values(ascending=False).index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d474039f-918e-4dcb-869a-782ebdef665d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>S</th>\n",
       "      <th>LPE</th>\n",
       "      <th>NP</th>\n",
       "      <th>ANH</th>\n",
       "      <th>TIC</th>\n",
       "      <th>Newborn</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11995</th>\n",
       "      <td>0.90</td>\n",
       "      <td>0.55</td>\n",
       "      <td>3</td>\n",
       "      <td>259</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11996</th>\n",
       "      <td>0.74</td>\n",
       "      <td>0.95</td>\n",
       "      <td>5</td>\n",
       "      <td>266</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11997</th>\n",
       "      <td>0.85</td>\n",
       "      <td>0.54</td>\n",
       "      <td>3</td>\n",
       "      <td>185</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11998</th>\n",
       "      <td>0.33</td>\n",
       "      <td>0.65</td>\n",
       "      <td>3</td>\n",
       "      <td>172</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11999</th>\n",
       "      <td>0.50</td>\n",
       "      <td>0.73</td>\n",
       "      <td>4</td>\n",
       "      <td>180</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>12000 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          S   LPE  NP  ANH  TIC  Newborn\n",
       "0      0.38  0.53   2  157    3        0\n",
       "1      0.80  0.86   5  262    6        0\n",
       "2      0.11  0.88   7  272    4        0\n",
       "3      0.72  0.87   5  223    5        0\n",
       "4      0.37  0.52   2  159    3        0\n",
       "...     ...   ...  ..  ...  ...      ...\n",
       "11995  0.90  0.55   3  259    2        1\n",
       "11996  0.74  0.95   5  266    4        0\n",
       "11997  0.85  0.54   3  185    3        0\n",
       "11998  0.33  0.65   3  172    5        0\n",
       "11999  0.50  0.73   4  180    3        0\n",
       "\n",
       "[12000 rows x 6 columns]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98ada868-95bb-4351-9c35-4f2678e89f76",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>S</th>\n",
       "      <th>LPE</th>\n",
       "      <th>NP</th>\n",
       "      <th>ANH</th>\n",
       "      <th>TIC</th>\n",
       "      <th>Newborn</th>\n",
       "      <th>left</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      S   LPE  NP  ANH  TIC  Newborn  left\n",
       "0  0.38  0.53   2  157    3        0     1\n",
       "1  0.80  0.86   5  262    6        0     1\n",
       "2  0.11  0.88   7  272    4        0     1\n",
       "3  0.72  0.87   5  223    5        0     1\n",
       "4  0.37  0.52   2  159    3        0     1"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the HR data set from the working directory, then preview its first rows.\n",
    "df_hr = pd.read_csv(\"DATA_3.02_HR2.csv\")\n",
    "df_hr.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fac951c6-1207-4d7a-95c1-90506764e53f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:xlabel='Time in Company', ylabel='Attrition'>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Mean of `left` for each TIC value; the marker size is taken from the per-group sum of `left`.\n",
    "(\n",
    "    df_hr.groupby(['TIC'])['left']\n",
    "    .agg(['mean', 'sum'])\n",
    "    .reset_index()\n",
    "    .plot.scatter(\n",
    "        x='TIC',\n",
    "        y='mean',\n",
    "        s='sum',\n",
    "        xlabel='Time in Company',\n",
    "        ylabel='Attrition',\n",
    "        ylim=(0, 0.6),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d62f30a-835b-44d1-a4c8-3c040797b727",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>TIC</th>\n",
       "      <th>mean</th>\n",
       "      <th>sum</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0.010262</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>0.165727</td>\n",
       "      <td>882</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4</td>\n",
       "      <td>0.240777</td>\n",
       "      <td>496</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5</td>\n",
       "      <td>0.444240</td>\n",
       "      <td>482</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6</td>\n",
       "      <td>0.212891</td>\n",
       "      <td>109</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   TIC      mean  sum\n",
       "0    2  0.010262   31\n",
       "1    3  0.165727  882\n",
       "2    4  0.240777  496\n",
       "3    5  0.444240  482\n",
       "4    6  0.212891  109"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean and sum of `left` for each TIC value, with TIC restored as an ordinary column.\n",
    "(\n",
    "    df_hr.groupby(['TIC'])['left']\n",
    "    .agg(['mean', 'sum'])\n",
    "    .reset_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7a3908c-8464-4e72-9136-5d868c082973",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bucket rows by their rank on S: ranks (ties share the highest rank of their group)\n",
    "# are divided by 600 and rounded up, then negated. Adds the S_ranked column to df_hr in place.\n",
    "# NOTE(review): 600 presumably gives 20 equal-sized buckets for the 12000-row frame, and the\n",
    "# minus sign presumably just reverses the plotting order -- confirm both.\n",
    "df_hr['S_ranked'] = -np.ceil(df_hr['S'].rank(method='max')/600)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5798cfc-4a2f-4b6d-bab7-3f16ecfccd20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean of `left` within each S_ranked bucket, broadcast back onto every row of that bucket.\n",
    "# Needs the S_ranked column to exist on df_hr; adds the attrition column in place.\n",
    "df_hr['attrition'] = df_hr.groupby('S_ranked')['left'].transform('mean')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd1af100-2d5a-4151-a5a2-f1b1dd1f28e1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:xlabel='S_ranked', ylabel='attrition'>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAEMCAYAAAAxoErWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8/fFQqAAAACXBIWXMAAAsTAAALEwEAmpwYAAAdVklEQVR4nO3dfVRUdeLH8Q8zSthPXIMAh54s3Yyt1LUHe9DWTAMLDuCGGD2ZR8pK2+20KGWhrC3mbr/aFnU7m6EZPUkpLqOpmbtlZZrWBifKY4ZZMoCCbj7gYsP9/eFP1hEuDMwj8H6d4znD8L1zPw4XPnO/987cEMMwDAEA0AJLoAMAAIIXJQEAMEVJAABMURIAAFOUBADAFCUBADBFSQAATPUIdABvO3DgiBob2//Wj8jI3qqtPeyDRN5BPs+Qz3PBnpF8HWOxhOiss/7H9PtdriQaG40OlcTJZYMZ+TxDPs8Fe0byeR/TTQAAU5QEAMAUJQEAMEVJAABMdbkD1wDQ3Ux+emPT7YLs0V59bPYkAKATO7UgWvraU5QEAHRSZoXgzaKgJAAApigJAIApSgIAOimzg9TePHhNSQBAJ3Z6IXj77CZOgQWATs7bxXAq9iQAAKYoCQCAKUoCAGCKkgAAmKIkAACmKAkAgClKAgBgipIAAJiiJAAApvz2juuKigplZ2fr4MGD6tu3r+bPn6/+/fu7jKmtrdVjjz0mh8Ohn376ScOHD9cTTzyhHj14YzgABILf9iRmz56tjIwMrVu3ThkZGcrJyWk25oUXXtCAAQNUUlKiv//97/ryyy+1fv16f0UEAJzGLyVRW1ur8vJyJSYmSpISExNVXl6uuro6l3EhISE6cuSIGhsb1dDQoOPHjysmJsYfEQEALfBLSTgcDsXExMhqtUqSrFaroqOj5XA4XMY9+OCDqqio0IgRI5r+XXHFFf6ICABoQVBN9q9du1aDBg3Syy+/rCNHjigzM1Nr165VQkKC248RGdm7w+uPigrv8LL+QD7PkM9zwZ6RfN7nl5Kw2Wyqrq6W0+mU1WqV0+lUTU2NbDaby7jCwkLl5eXJYrEoPDxco0eP1pYtW9pVErW1h9XYaLQ7Y1RUuPbtO9Tu5fyFfJ4hn+eCPSP5OsZiCWn1xbVfppsiIyMVFxcnu90uSbLb7YqLi1NERITLuHPPPVcffPCBJKmhoUGbN2/Wz3/+c39EBAC0wG9nN82ZM0eFhYWKj49XYWGhcnNzJUmZmZkqKyuTJD3++OPavn27kpKSlJKSov79+2vChAn+iggAOE2IYRjtn5sJYkw3BQb5PBPs+aTgz0i+jgmK6SYAQOdESQAATFESAABTlAQAwBQlAQAwRUkAAExREgAAU5QEAMAUJQEAMEVJAABMURIAAFOUBADAFCUBADBFSQAATAXV5UsBIBAmP72x6XZB9ugAJgk+7EkA6NZOLYiWvu7uKAkA3ZZZIVAU/0VJAABMURIAAFOUBIBuy+wgNQev/4uSANCtnV4IFIQrToEF0O1RDObYkwAAmKIkAACmKAkAgClKAgBgipIAAJiiJAAApigJAIApSgIAYIqSAACY4h3XADo9LhrkO+xJAOjUuGiQb1ESADotLhrke5QEAMAUJQEAMOW3kqioqFB6erri4+OVnp6u3bt3tzhuzZo1SkpKUmJiopKSkrR//35/RQTQyXDRIN/zW0nMnj1bGRkZWrdunTIyMpSTk9NsTFlZmRYsWKCCggLZ7Xa99tprCg8P91dEAJ0QFw3yLb+cAltbW6vy8nItWbJEkpSYmKi5c+eqrq5OERERTeOWLl2qyZMnKyoqSpIoCABuKcgeraiocO3bdyjQUbocv+xJOBwOxcTEyGq1SpKsVquio6PlcDhcxu3atUvff/+97rjjDqWmpmrRokUyDMMfEQEALQiqN9M5nU7t2LFDS5YsUUNDg6ZMmaLY2FilpKS4/RiRkb07vP6oqODecyGfZ8jn
uWDPSD7v80tJ2Gw2VVdXy+l0ymq1yul0qqamRjabzWVcbGysEhISFBoaqtDQUN10000qLS1tV0nU1h5WY2P79z6CfVeVfJ4hn+eCPSP5OsZiCWn1xbVfppsiIyMVFxcnu90uSbLb7YqLi3M5HiGdOFbx4YcfyjAMHT9+XJ988okuueQSf0QEALTAb2c3zZkzR4WFhYqPj1dhYaFyc3MlSZmZmSorK5Mk3XrrrYqMjNQtt9yilJQUDRw4ULfddpu/IgIAThNidLEjw0w3BQb5PBPs+aTgz0i+jgmK6SYAQOdESQAATFESAABTlAQAwBQlAQAwRUkAAExREgAAU5QEAMAUJQEAMNWuD/j79ttv9fXXX+vo0aMu9/PRGQDQNbldEi+88IIWLlyoSy65RGFhYU33h4SEUBIA0EW5XRIvv/yyioqK+FRWAOhG3D4mERYWposuusiXWQAAQcbtkvjNb36jp556SjU1NWpsbHT5BwDomtyebsrOzpYkFRUVNd1nGIZCQkL01VdfeT8ZACDg3C6J9957z5c5AABByO2SOOeccyRJjY2N2r9/v84++2xZLLzNAgC6Mrf/yh8+fFgzZszQ4MGDdcMNN2jw4MGaOXOmDh0KvistAQC8w+2SeOqpp1RfX6+SkhKVlpaqpKRE9fX1euqpp3yZDwAQQG5PN23atEkbNmxQr169JEkXXnih5s2bp7Fjx/osHAAgsNzekzjjjDNUV1fnct+BAwcUGhrq9VAAgODg9p7EbbfdpsmTJ2vSpEmKjY1VZWWlli5dqgkTJvgyHwAggNwuiQceeEDR0dGy2+2qqalRdHS0pkyZwuc2AUAX5nZJnPwgP0oBALqPVkuiuLhYKSkpkqS33nrLdBzFAQBdU6slsXr16qaSWLVqVYtj+KhwAOi6Wi2JF198sen2K6+84vMwAIDg4vYpsCf3KE43fvx4b2UBAAQZt0viu+++a3afYRj64YcfvBoIABA82jy7acaMGZKk48ePN90+ae/evRo4cKBvkgEAAq7Nkjj//PNbvC1Jw4YNU0JCgvdTAQCCQpslMW3aNEnSkCFDNHLkSJ8HAgAEj1ZL4tNPP9VVV111YmCPHtq8eXOL46699lrvJwMABFyrJZGbmyu73S5JmjVrVotjQkJCuGodAHRRrZbEyYKQpHfffVdWq9XngQAAwcOtU2CdTqd++ctfqqGhwdd5AABBxK2SsFqt6t+/vw4cONDhFVVUVCg9PV3x8fFKT0/X7t27Tcd+++23GjJkiObPn9/h9QEAPOf2p8AmJSVp6tSpuvvuu9WvXz+X77lz4Hr27NnKyMhQcnKyVq1apZycHC1btqzZOKfTqdmzZ2vMmDHuRgMA+IjbJfH6669LkvLz813ud+fAdW1trcrLy7VkyRJJUmJioubOnau6ujpFRES4jP3b3/6mUaNG6ejRozp69Ki78QAAPuB2SWzcuLHDK3E4HIqJiWk68G21WhUdHS2Hw+FSEl9//bU+/PBDLVu2TIsWLerw+gAA3tGuK9P99a9/bXb/tGnTtGDBAo+DHD9+XE8++aTmzZvn0VlUkZG9O7xsVFR4h5f1B/J5hnyeC/aM5PM+t0tiy5YtLd6/devWNpe12Wyqrq6W0+mU1WqV0+lUTU2NbDZb05h9+/Zpz549uu+++yRJP/74owzD0OHDhzV37lx3Y6q29rAaGw23x58UFRWuffsOtXs5fyGfZ8jnuWDPSL6OsVhCWn1x3WZJPP/885JOvNI/efuk77//XrGxsW2GiIyMVFxcnOx2u5KTk2W32xUXF+cy1RQbG+tSRPn5+Tp69KhmzpzZ5uMDAHyjzZKoqqqSdOJjwU/ePslms2n69OlurWjOnDnKzs7WokWL1KdPn6bTWzMzM/Xwww/r8ssvb292AICPhRiG4dbczPLlyzVhwgRf5/EY002BQT7PBHs+Kfgzkq9j2ppucvuiQ88880yL9/PhfgDQdbldEsePH2/xvsbGRq8GAgAE
jzaPSWRkZCgkJEQNDQ264447XL5XVVWloUOH+iobACDA2iyJtLQ0GYahsrIy3XbbbU33h4SEKDIyUtdcc41PAwIAAqfNkkhNTZV04sp0P/vZz1RaWqoDBw7IMAxVV1dr1apVLuUBAOg63H4zXUVFhbKysnTBBRfom2++0cCBA7Vz504NGzaMkgCALsrtkvjzn/+svLw8jRs3TldddZWKi4v19ttv65tvvvFlPgBAALl9dlNlZaXGjRvncl9qaqqKi4u9nQkAECTcLonIyEjt379fknTOOefo888/1549ezgFFgC6MLdLIi0tTdu3b5ckTZo0SXfffbeSk5N1++23+ywcACCw3D4mcfLTWSUpJSVFV199terr6zVgwACfBAMABJ7bJXE6dz79FQDQubk93QQA6H4oCQCAKUoCAGCKkgAAmKIkAACmKAkAgClKAgBgipIAAJiiJAAApigJAIApSgIAYIqSAACYoiQAAKYoCQCAKUoCAGCKkgAAmKIkAACmKAkAgClKAgBgipIAAJiiJAAApigJAIApSgIAYIqSAACY6uGvFVVUVCg7O1sHDx5U3759NX/+fPXv399lzMKFC7VmzRpZLBb17NlTjzzyiEaOHOmviACA0/itJGbPnq2MjAwlJydr1apVysnJ0bJly1zGDB48WJMnT1avXr309ddf684779SHH36osLAwf8UEAJzCL9NNtbW1Ki8vV2JioiQpMTFR5eXlqqurcxk3cuRI9erVS5I0aNAgGYahgwcP+iMiAKAFfikJh8OhmJgYWa1WSZLValV0dLQcDofpMsXFxTr//PPVr18/f0QEALTAb9NN7bF161Y9//zzKigoaPeykZG9O7zeqKjwDi/rD+TzDPk8F+wZyed9fikJm82m6upqOZ1OWa1WOZ1O1dTUyGazNRv7+eefKysrS4sWLdJFF13U7nXV1h5WY6PR7uWiosK1b9+hdi/nL+TzDPk8F+wZydcxFktIqy+u/TLdFBkZqbi4ONntdkmS3W5XXFycIiIiXMaVlpbqkUce0V/+8hddeuml/ogGAGiF394nMWfOHBUWFio+Pl6FhYXKzc2VJGVmZqqsrEySlJubq2PHjiknJ0fJyclKTk7Wjh07/BURAHAavx2TGDBggIqKiprd/+KLLzbdfvvtt/0VBwDgBt5xDQAwFZRnNwFAZzL56Y1NtwuyRwcwifexJwEAHji1IFr6urOjJKDJT29s+gfAfWa/M13pd4nppiDg6a6qJ8u39Cqoq+0uA+g49iQCzNNdVU+WD5ZXQezJAMGLkgggT/9IB8sfeU909flcdG1me91daW+ckkDAdIWSA04vhK5UEBLHJLq1guzRLf5B9uS4RiB+QQK9fqArb3fsSQSQp7uq3tjV9fRVUKCniwK9fqCroyQCzNM/0t7Y1S3IHt30rz08nS7ytOSYrgJ8j+mmIODprmpn3tU9fcqrM/9fgK6IkkDAUQxA8GK6CR0W6NP/Ar1+oDugJOCRQJ/+F+j1A10d003wWEH26IBempFiAHyHPQkAgClKAgBgipIAAJiiJAAApigJAIApSgIAYIqSAACYoiQAAKYoCQCAKd5xjW6PT6EFzLEngW6NixYBraMk0G1x0SKgbUw3AUCABfOUJ3sSABBAwT7lSUmg2+KiRQi0zjDlSUmgW+OiRUDrOCbhBcE8n4i28TMDzFESHmppPpE/OvA3T1+odPblO6uC7NEtTi0F03PAdJMHOsN8Inxv8tMbm/51dPmkR1d5tHxrX3f15Tu7YJ/ypCQADwT6D6SnL1Q6+/JdRUH26KZ/wcZvJVFRUaH09HTFx8crPT1du3fvbjbG6XQqNzdXY8aM0dixY1VUVOSXbJ6+kkP3xB9IdAd+K4nZs2crIyND69atU0ZGhnJycpqNKSkp0Z49e7R+/Xq9+eabys/P1w8//ODTXJ68kuMUSgBdnV9Kora2VuXl5UpMTJQkJSYmqry8XHV1dS7j1qxZo7S0NFks
FkVERGjMmDFau3atz3J545VcsM8nomvz9IVKZ18evueXs5scDodiYmJktVolSVarVdHR0XI4HIqIiHAZFxsb2/S1zWZTVVVVu9YVGdnbK5mjosLdHlvyv8leWWdb2pMpELpbvpL/TVbSo6tavN8fy5s9ji+Xb+k59Of629LdtkF/6HKnwNbWHlZjo+Hx4+zbd8gLabwnKio86DKdqrvmO/0UxoLs0e1aj6fLn7rcSb5avrXn0B/rb0t33QY9ZbGEtPri2i8lYbPZVF1dLafTKavVKqfTqZqaGtlstmbjKisrNXjwYEnN9yy8rTOco4zg5+n2UpA9Omj/gAB+OSYRGRmpuLg42e12SZLdbldcXJzLVJMkJSQkqKioSI2Njaqrq9OGDRsUHx/v02wcUwAAc36bbpozZ46ys7O1aNEi9enTR/Pnz5ckZWZm6uGHH9bll1+u5ORkffHFF7r55pslSQ899JDOO+88n2fjlRwAtCzEMAzPJ/CDSEePSQR7SZDPM+TzXLBnJF/HtHVMgndcAwBMURIAAFOUBADAVJd7n4TFEhKQZf2BfJ4hn+eCPSP52q+tTF3uwDUAwHuYbgIAmKIkAACmKAkAgClKAgBgipIAAJiiJAAApigJAIApSgIAYIqSAACY6nIfy9Ga3Nxcbd68WaGhoTrzzDM1a9YsXX755ZKk/fv3a8aMGdq7d6/OOOMMzZ07V0OGDGnxcRYuXKiVK1dKklJTU/XQQw95Jd+qVau0ePFi7dq1S48//rjuvPPOpu9NmjRJBw4ckCQ5nU7t3LlTq1at0iWXXOLyGFu2bNF9992n/v37S5JCQ0NVVFTk83zZ2dn6+OOPddZZZ0k6cQGpBx54oMXHCcTz19rP/lQrVqxQXl6ezjnnHEnSueeeq4ULF/o8X319vR577DF9+eWXslqtmjlzpm688cYWH2f58uV68cUXZRiGbrjhBj3xxBOyWLz7ei8YtrfWBMP21pZg2Oa8wuhGNm7caDQ0NDTdvummm5q+l52dbSxcuNAwDMP49NNPjbFjxxqNjY3NHmPr1q1GYmKiUV9fb9TX1xuJiYnG1q1bvZJvx44dxs6dO42srCzjlVdeMR337rvvGrfeemuL3/vkk0+M1NRUr+RpT76ZM2e2mvmkQD1/rf3sT/X2228b06dP90qe9uTLz883Zs2aZRiGYVRUVBjXXXedcfjw4WaPsWfPHmPkyJFGbW2t4XQ6jcmTJxsrV670Sd6TArW9tSYYtre2BMM25w3darrpxhtvVM+ePSVJQ4cOVVVVlRobGyVJa9eu1cSJEyVJV155pUJDQ1VWVtbsMdasWaOUlBSFhYUpLCxMKSkpWrNmjVfyXXzxxRo4cGCbrwrfeust/frXv/bKOtvD3XytCdTz19rP3l9ay/fOO+8oPT1dktS/f39ddtll+uCDD5qNW7duncaMGaOIiAhZLBalpaV57fkzE6jtzRt8ub21JRi2OW/oViVxqldffVWjRo2SxWLRgQMHZBiGyzW3bTabqqqqmi3ncDgUGxvrMs7hcPglsyTt27dPmzdvVnJysumY3bt3KzU1VWlpaU272f6wZMkSJSUl6cEHH9SuXbtaHBPo509y/dm3ZOvWrUpOTtYdd9yhf/7zn37JVFlZ2TTdILm//cXGxvr0+WN7845g3Obc1aWOSaSmpqqysrLF73388ceyWq2SpNWrV6ukpESvvvqqP+O5na81xcXFGjlypEuhnerSSy/V+++/r/DwcH3//fe69957FRMTo+uuu86n+R555BFFRUXJYrGouLhYU6ZM0YYNG9z6P7nLG89fWz/7UaNG6ZZbblFYWJjKy8uVmZmpZcuWacCAAX7J5y/uZvXl9uZJPn9sb55mdPfvjSfbnD90qZJw51XMu+++q+eee05Lly7V2WefLUlNB7/q6uqafhkcDof69evXbHmbzeayYTgcDtlsNq/la8uKFSs0Y8YM0+/37v3fa9Wed955GjNmjD777DO3
fmk9yRcTE9N0OyUlRfPmzVNVVZXLq2MpsM9fSz/70536x/AXv/iFhg0bptLSUrd+YT3JFxsbq71797psf8OHD2827vTnr7Ky0u3nryNZfbm9eZLPH9ubpxkl329z/tCtppv+8Y9/aN68eXrppZd07rnnunwvISFBb7zxhiRp27ZtOnbsmC677LJmj5GQkKDi4mIdO3ZMx44dU3FxscaNG+eX/J999pkOHTqkG264wXRMTU2NjP+/RMjBgwf10UcfNTsjxReqq6ubbm/atEkWi8XlF/mkQD1/rf3sT3Xq/2Pv3r3617/+pUGDBvk8X0JCgt58801JJ6ZvysrKNHLkyGbj4uPjtWHDBtXV1amxsVFFRUU+e/7Y3jwT7Nucu7rVRYeuueYa9ezZ06W5ly5dqrPOOkv79u1TVlaWKisrdcYZZyg3N1fDhg2TJM2aNUujR4/WTTfdJEnKz89XcXGxpBOvYqZPn+6VfHa7XX/84x/1448/qmfPnurVq5cKCgo0cOBASdITTzyhvn376ne/+53Lcs8//7yio6N1++23q7CwUK+//rp69Oghp9OplJQUTZkyxef5Jk2apNraWoWEhKh3796aMWOGhg4dKik4nr/Wfvan5nv22Wf13nvvNU0V3HvvvUpNTfV5vqNHjyo7O1tfffWVLBaLsrKyNGbMGEmuP19JeuONN7R48WJJ0vXXX6+cnByfTLMEentrTTBsb20Jhm3OG7pVSQAA2qdbTTcBANqHkgAAmKIkAACmKAkAgClKAgBgipIAAJiiJIAA27JlS6tvWGuPQYMG6bvvvvPKYwESJQFo27Ztmjhxoq644gpdffXVmjhxokpLSwMdCwgKXeqzm4D2Onz4sKZOnao5c+Zo3LhxOn78uLZt26bQ0FC3H+Onn35Sjx78KqFrYk8C3VpFRYUkKTExUVarVWFhYRoxYkSrnz+0YsUKTZw4UXl5eRo+fLjy8/O1Z88e3X333Ro+fLiGDx+uRx99VD/++GPTMqNHj9ZLL72kpKQkXXHFFfrtb3+r//znPy0+/rJly3TLLbeoqqpKDQ0Nmj9/vkaNGqXrrrtOOTk5OnbsWNPYxYsXa8SIERoxYoTeeustLz0rwH9REujWLrzwwqbLhb7//vv697//7dZypaWlOu+88/TRRx/pgQcekGEYuv/++7Vp0ya98847qqqqUn5+vssy77zzjhYvXqz33ntPO3bs0IoVK5o97oIFC7Ry5UoVFhaqX79+euaZZ1RRUaHi4mKtX79eNTU1TZe2/OCDD1RQUKCCggKtX79emzdv9vwJAU5DSaBb6927t1577TWFhIToySef1LXXXqupU6dq//79rS4XHR2tu+66Sz169FBYWJguuOACXX/99QoNDVVERITuvfdeffrppy7L3HXXXYqJiVHfvn1144036quvvmr6nmEYmjdvnj766CMtW7ZMERERMgxDy5cv1+OPP66+ffuqd+/euv/++7V69WpJJ0pn/Pjxuvjii3XmmWdq2rRp3n+C0O0xkYpub8CAAXr66aclSbt27VJWVpby8vL07LPPmi5z+rVG9u/frz/84Q/atm2bjhw5IsMw1KdPH5cxUVFRTbd79eqlmpqapq8PHTqk5cuX67nnnlN4eLikE9c3qa+v1/jx45vGGYbRdAnMmpoal4+zP/1aCoA3sCcBnGLAgAEaP368du7c2eq4kJAQl6+fffZZhYSEqKSkRJ999pn+9Kc/qT0fsNynTx+98MILeuyxx7R9+3ZJJy6GFRYWptWrV2vbtm3atm2btm/frs8//1zSib2ZUy/FaXaVNMATlAS6tV27dqmgoKDpetIOh0N2u11Dhgxp1+McOXJEZ555psLDw1VdXd10vYf2GD58uJ555hlNnz5dpaWlslgsSktLU15enmprayWduEDNpk2bJJ24oM7KlSv1zTffqL6+XgsWLGj3OoG2UBLo1nr37q0vvvhCaWlpGjp0qCZMmKCLL75Y2dnZ7Xqc
adOmqby8XFdeeaXuu+8+3XzzzR3Kc/311ysvL09Tp07Vl19+qaysLF1wwQWaMGGChg0bpkmTJjWdkfWrX/1K99xzj+655x6NHTtW11xzTYfWCbSGiw4BAEyxJwEAMMXZTUALcnJyVFJS0uz+pKQk/f73vw9AIiAwmG4CAJhiugkAYIqSAACYoiQAAKYoCQCAKUoCAGDq/wDoDk0Uz/GBNgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Scatter of the per-bucket attrition value against the S_ranked bucket.\n",
    "# One point is drawn per row, so rows of the same bucket land on the same spot.\n",
    "df_hr.plot.scatter(x='S_ranked', y='attrition')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f9b5e52-7e39-424d-b4f9-e07e98992658",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Newborn</th>\n",
       "      <th>mean</th>\n",
       "      <th>sum</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0.186700</td>\n",
       "      <td>1895</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0.056757</td>\n",
       "      <td>105</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Newborn      mean   sum\n",
       "0        0  0.186700  1895\n",
       "1        1  0.056757   105"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean and sum of `left` for each Newborn value, with Newborn restored as an ordinary column.\n",
    "(\n",
    "    df_hr.groupby(['Newborn'])['left']\n",
    "    .agg(['mean', 'sum'])\n",
    "    .reset_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3034bf7e-97a4-4159-8ded-8a8214ab8e86",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:xlabel='New Projects', ylabel='Attrition'>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Mean of `left` for each NP value; the marker size is taken from the per-group sum of `left`.\n",
    "(\n",
    "    df_hr.groupby(['NP'])['left']\n",
    "    .agg(['mean', 'sum'])\n",
    "    .reset_index()\n",
    "    .plot.scatter(\n",
    "        x='NP',\n",
    "        y='mean',\n",
    "        s='sum',\n",
    "        xlabel='New Projects',\n",
    "        ylabel='Attrition',\n",
    "        ylim=(0, 0.6),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91a3053d-25a9-4c9c-881a-78fc43ac486e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>Logit Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>         <td>left</td>       <th>  No. Observations:  </th>  <td> 12000</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                 <td>Logit</td>      <th>  Df Residuals:      </th>  <td> 11993</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>                 <td>MLE</td>       <th>  Df Model:          </th>  <td>     6</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>            <td>Fri, 27 May 2022</td> <th>  Pseudo R-squ.:     </th>  <td>0.2131</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                <td>08:46:06</td>     <th>  Log-Likelihood:    </th> <td> -4254.5</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>converged:</th>             <td>True</td>       <th>  LL-Null:           </th> <td> -5406.7</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>     <td>nonrobust</td>    <th>  LLR p-value:       </th>  <td> 0.000</td> \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "     <td></td>        <th>coef</th>     <th>std err</th>      <th>z</th>      <th>P>|z|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>const</th>   <td>   -1.2412</td> <td>    0.160</td> <td>   -7.751</td> <td> 0.000</td> <td>   -1.555</td> <td>   -0.927</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>S</th>       <td>   -3.8163</td> <td>    0.121</td> <td>  -31.607</td> <td> 0.000</td> <td>   -4.053</td> <td>   -3.580</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>LPE</th>     <td>    0.5044</td> <td>    0.181</td> <td>    2.788</td> <td> 0.005</td> <td>    0.150</td> <td>    0.859</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>NP</th>      <td>   -0.3592</td> <td>    0.026</td> <td>  -13.569</td> <td> 0.000</td> <td>   -0.411</td> <td>   -0.307</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>ANH</th>     <td>    0.0038</td> <td>    0.001</td> <td>    6.067</td> <td> 0.000</td> <td>    0.003</td> <td>    0.005</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>TIC</th>     <td>    0.6188</td> <td>    0.027</td> <td>   22.820</td> <td> 0.000</td> <td>    0.566</td> <td>    0.672</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Newborn</th> <td>   -1.4851</td> <td>    0.113</td> <td>  -13.157</td> <td> 0.000</td> <td>   -1.706</td> <td>   -1.264</td>\n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                           Logit Regression Results                           \n",
       "==============================================================================\n",
       "Dep. Variable:                   left   No. Observations:                12000\n",
       "Model:                          Logit   Df Residuals:                    11993\n",
       "Method:                           MLE   Df Model:                            6\n",
       "Date:                Fri, 27 May 2022   Pseudo R-squ.:                  0.2131\n",
       "Time:                        08:46:06   Log-Likelihood:                -4254.5\n",
       "converged:                       True   LL-Null:                       -5406.7\n",
       "Covariance Type:            nonrobust   LLR p-value:                     0.000\n",
       "==============================================================================\n",
       "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "const         -1.2412      0.160     -7.751      0.000      -1.555      -0.927\n",
       "S             -3.8163      0.121    -31.607      0.000      -4.053      -3.580\n",
       "LPE            0.5044      0.181      2.788      0.005       0.150       0.859\n",
       "NP            -0.3592      0.026    -13.569      0.000      -0.411      -0.307\n",
       "ANH            0.0038      0.001      6.067      0.000       0.003       0.005\n",
       "TIC            0.6188      0.027     22.820      0.000       0.566       0.672\n",
       "Newborn       -1.4851      0.113    -13.157      0.000      -1.706      -1.264\n",
       "==============================================================================\n",
       "\"\"\""
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Render the full fit report of model_hr: coefficients, standard errors, test statistics and confidence intervals.\n",
    "# NOTE(review): model_hr is not defined in this part of the notebook -- it must already exist from an earlier cell.\n",
    "model_hr.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01476cae-b429-48e3-bd59-6dac6d4294a9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}