{ "cells": [ { "cell_type": "markdown", "id": "2c90de90-1e3e-4bf1-8705-b2b85ed8a387", "metadata": {}, "source": [ "# Regression : Understanding effect and cause" ] }, { "cell_type": "markdown", "id": "49d86280-47eb-471e-becf-33affb9f56c5", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": null, "id": "5cb2af73-246e-4a95-ad43-1816b79d3985", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import scipy as sp\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "from sklearn.linear_model import LinearRegression, LogisticRegression\n", "from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder\n", "from sklearn.pipeline import Pipeline, make_pipeline\n", "import statsmodels.api as sm\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": null, "id": "f1e147e1-14c7-4dbf-beab-02a3ed42c760", "metadata": {}, "outputs": [], "source": [ "sns.set()" ] }, { "cell_type": "markdown", "id": "17a98d1e-3959-405c-bc52-ffcc9f53fb9a", "metadata": {}, "source": [ "## Credit Score Rating Example" ] }, { "cell_type": "code", "execution_count": null, "id": "3f142627-eaa6-40f9-830b-90594c450469", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Income</th>\n", " <th>Rating</th>\n", " <th>Cards</th>\n", " <th>Age</th>\n", " <th>Education</th>\n", " <th>Gender</th>\n", " <th>Student</th>\n", " <th>Married</th>\n", " <th>Ethnicity</th>\n", " <th>Balance</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>14.891</td>\n", " <td>283</td>\n", " <td>2</td>\n", " 
<td>34</td>\n", " <td>11</td>\n", " <td>Male</td>\n", " <td>No</td>\n", " <td>Yes</td>\n", " <td>Caucasian</td>\n", " <td>333</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>106.025</td>\n", " <td>483</td>\n", " <td>3</td>\n", " <td>82</td>\n", " <td>15</td>\n", " <td>Female</td>\n", " <td>Yes</td>\n", " <td>Yes</td>\n", " <td>Asian</td>\n", " <td>903</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>104.593</td>\n", " <td>514</td>\n", " <td>4</td>\n", " <td>71</td>\n", " <td>11</td>\n", " <td>Male</td>\n", " <td>No</td>\n", " <td>No</td>\n", " <td>Asian</td>\n", " <td>580</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>148.924</td>\n", " <td>681</td>\n", " <td>3</td>\n", " <td>36</td>\n", " <td>11</td>\n", " <td>Female</td>\n", " <td>No</td>\n", " <td>No</td>\n", " <td>Asian</td>\n", " <td>964</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>55.882</td>\n", " <td>357</td>\n", " <td>2</td>\n", " <td>68</td>\n", " <td>16</td>\n", " <td>Male</td>\n", " <td>No</td>\n", " <td>Yes</td>\n", " <td>Caucasian</td>\n", " <td>331</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Income Rating Cards Age Education Gender Student Married Ethnicity \\\n", "0 14.891 283 2 34 11 Male No Yes Caucasian \n", "1 106.025 483 3 82 15 Female Yes Yes Asian \n", "2 104.593 514 4 71 11 Male No No Asian \n", "3 148.924 681 3 36 11 Female No No Asian \n", "4 55.882 357 2 68 16 Male No Yes Caucasian \n", "\n", " Balance \n", "0 333 \n", "1 903 \n", "2 580 \n", "3 964 \n", "4 331 " ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_credscore = pd.read_csv(\"DATA_3.01_CREDIT.csv\", dtype={'Gender':'category', \n", " 'Student':'category',\n", " 'Married':'category',\n", " 'Ethnicity':'category'\n", " });df_credscore.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "58c58a15-5422-4cd9-9ea0-11e2eb912f98", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", 
"text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 300 entries, 0 to 299\n", "Data columns (total 10 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Income 300 non-null float64 \n", " 1 Rating 300 non-null int64 \n", " 2 Cards 300 non-null int64 \n", " 3 Age 300 non-null int64 \n", " 4 Education 300 non-null int64 \n", " 5 Gender 300 non-null category\n", " 6 Student 300 non-null category\n", " 7 Married 300 non-null category\n", " 8 Ethnicity 300 non-null category\n", " 9 Balance 300 non-null int64 \n", "dtypes: category(4), float64(1), int64(5)\n", "memory usage: 15.6 KB\n" ] } ], "source": [ "df_credscore.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "cdb64a98-e23c-41ef-b0c0-5f672ac69e7b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Income</th>\n", " <th>Rating</th>\n", " <th>Cards</th>\n", " <th>Age</th>\n", " <th>Education</th>\n", " <th>Gender</th>\n", " <th>Student</th>\n", " <th>Married</th>\n", " <th>Ethnicity</th>\n", " <th>Balance</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>300.000000</td>\n", " <td>300.000000</td>\n", " <td>300.000000</td>\n", " <td>300.000000</td>\n", " <td>300.000000</td>\n", " <td>300</td>\n", " <td>300</td>\n", " <td>300</td>\n", " <td>300</td>\n", " <td>300.000000</td>\n", " </tr>\n", " <tr>\n", " <th>unique</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>NaN</td>\n", " </tr>\n", " 
<tr>\n", " <th>top</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Female</td>\n", " <td>No</td>\n", " <td>Yes</td>\n", " <td>Caucasian</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>freq</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>168</td>\n", " <td>268</td>\n", " <td>183</td>\n", " <td>141</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>44.054393</td>\n", " <td>348.116667</td>\n", " <td>3.026667</td>\n", " <td>54.983333</td>\n", " <td>13.393333</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>502.686667</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>33.863066</td>\n", " <td>150.871547</td>\n", " <td>1.351064</td>\n", " <td>17.216982</td>\n", " <td>3.075193</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>466.991447</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>10.354000</td>\n", " <td>93.000000</td>\n", " <td>1.000000</td>\n", " <td>24.000000</td>\n", " <td>5.000000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>21.027500</td>\n", " <td>235.000000</td>\n", " <td>2.000000</td>\n", " <td>41.000000</td>\n", " <td>11.000000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>15.750000</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>33.115500</td>\n", " <td>339.000000</td>\n", " <td>3.000000</td>\n", " <td>55.000000</td>\n", " <td>14.000000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>433.500000</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>55.975500</td>\n", " <td>433.000000</td>\n", " <td>4.000000</td>\n", " <td>69.000000</td>\n", " <td>16.000000</td>\n", " <td>NaN</td>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>857.750000</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>186.634000</td>\n", " <td>949.000000</td>\n", " <td>8.000000</td>\n", " <td>91.000000</td>\n", " <td>20.000000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>1809.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Income Rating Cards Age Education Gender \\\n", "count 300.000000 300.000000 300.000000 300.000000 300.000000 300 \n", "unique NaN NaN NaN NaN NaN 2 \n", "top NaN NaN NaN NaN NaN Female \n", "freq NaN NaN NaN NaN NaN 168 \n", "mean 44.054393 348.116667 3.026667 54.983333 13.393333 NaN \n", "std 33.863066 150.871547 1.351064 17.216982 3.075193 NaN \n", "min 10.354000 93.000000 1.000000 24.000000 5.000000 NaN \n", "25% 21.027500 235.000000 2.000000 41.000000 11.000000 NaN \n", "50% 33.115500 339.000000 3.000000 55.000000 14.000000 NaN \n", "75% 55.975500 433.000000 4.000000 69.000000 16.000000 NaN \n", "max 186.634000 949.000000 8.000000 91.000000 20.000000 NaN \n", "\n", " Student Married Ethnicity Balance \n", "count 300 300 300 300.000000 \n", "unique 2 2 3 NaN \n", "top No Yes Caucasian NaN \n", "freq 268 183 141 NaN \n", "mean NaN NaN NaN 502.686667 \n", "std NaN NaN NaN 466.991447 \n", "min NaN NaN NaN 0.000000 \n", "25% NaN NaN NaN 15.750000 \n", "50% NaN NaN NaN 433.500000 \n", "75% NaN NaN NaN 857.750000 \n", "max NaN NaN NaN 1809.000000 " ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_credscore.describe(include='all')" ] }, { "cell_type": "code", "execution_count": null, "id": "cf537297-931e-4db9-bba1-c24983cdc2b2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " 
text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Income</th>\n", " <th>Rating</th>\n", " <th>Cards</th>\n", " <th>Age</th>\n", " <th>Education</th>\n", " <th>Balance</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Income</th>\n", " <td>1.000000</td>\n", " <td>0.771167</td>\n", " <td>0.028875</td>\n", " <td>0.123201</td>\n", " <td>-0.070959</td>\n", " <td>0.432327</td>\n", " </tr>\n", " <tr>\n", " <th>Rating</th>\n", " <td>0.771167</td>\n", " <td>1.000000</td>\n", " <td>0.095854</td>\n", " <td>0.042377</td>\n", " <td>-0.095433</td>\n", " <td>0.859829</td>\n", " </tr>\n", " <tr>\n", " <th>Cards</th>\n", " <td>0.028875</td>\n", " <td>0.095854</td>\n", " <td>1.000000</td>\n", " <td>0.054655</td>\n", " <td>0.015176</td>\n", " <td>0.123846</td>\n", " </tr>\n", " <tr>\n", " <th>Age</th>\n", " <td>0.123201</td>\n", " <td>0.042377</td>\n", " <td>0.054655</td>\n", " <td>1.000000</td>\n", " <td>-0.046178</td>\n", " <td>-0.052426</td>\n", " </tr>\n", " <tr>\n", " <th>Education</th>\n", " <td>-0.070959</td>\n", " <td>-0.095433</td>\n", " <td>0.015176</td>\n", " <td>-0.046178</td>\n", " <td>1.000000</td>\n", " <td>-0.073167</td>\n", " </tr>\n", " <tr>\n", " <th>Balance</th>\n", " <td>0.432327</td>\n", " <td>0.859829</td>\n", " <td>0.123846</td>\n", " <td>-0.052426</td>\n", " <td>-0.073167</td>\n", " <td>1.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Income Rating Cards Age Education Balance\n", "Income 1.000000 0.771167 0.028875 0.123201 -0.070959 0.432327\n", "Rating 0.771167 1.000000 0.095854 0.042377 -0.095433 0.859829\n", "Cards 0.028875 0.095854 1.000000 0.054655 0.015176 0.123846\n", "Age 0.123201 0.042377 0.054655 1.000000 -0.046178 -0.052426\n", "Education -0.070959 -0.095433 0.015176 -0.046178 1.000000 -0.073167\n", "Balance 0.432327 0.859829 0.123846 -0.052426 -0.073167 1.000000" ] }, 
"execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_credscore.corr() # Individual correlations" ] }, { "cell_type": "code", "execution_count": null, "id": "a4f86f79-7ab6-4fcb-8f4c-60fa461990bb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Income 0.771167\n", "Rating 1.000000\n", "Cards 0.095854\n", "Age 0.042377\n", "Education -0.095433\n", "Balance 0.859829\n", "Name: Rating, dtype: float64" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_credscore.corr()[\"Rating\"] # We need to understand interactions" ] }, { "cell_type": "code", "execution_count": null, "id": "71f8d364-b3c7-442b-9d1f-a0177b224326", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(steps=[('ordinalencoder', OrdinalEncoder()),\n", " ('linearregression', LinearRegression())])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline = make_pipeline(OrdinalEncoder(),LinearRegression()); pipeline" ] }, { "cell_type": "code", "execution_count": null, "id": "49bddc88-7e6e-4b6a-aa93-fb20e5755f17", "metadata": {}, "outputs": [], "source": [ "y = df_credscore.pop('Rating')\n", "X = df_credscore" ] }, { "cell_type": "code", "execution_count": null, "id": "738990d1-3603-4508-994f-6f7c9965e98d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 300 entries, 0 to 299\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Income 300 non-null float64 \n", " 1 Cards 300 non-null int64 \n", " 2 Age 300 non-null int64 \n", " 3 Education 300 non-null int64 \n", " 4 Gender 300 non-null category\n", " 5 Student 300 non-null category\n", " 6 Married 300 non-null category\n", " 7 Ethnicity 300 non-null category\n", " 8 Balance 300 non-null int64 \n", "dtypes: category(4), float64(1), int64(4)\n", 
"memory usage: 13.3 KB\n" ] } ], "source": [ "X.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "7483a8ff-7b00-482c-aa7e-a68aad62023b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(steps=[('ordinalencoder', OrdinalEncoder()),\n", " ('linearregression', LinearRegression())])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline.fit(X,y)" ] }, { "cell_type": "code", "execution_count": null, "id": "074306df-d771-4564-a985-569c6512e2cf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LinearRegression()" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = pipeline['linearregression']\n", "model" ] }, { "cell_type": "code", "execution_count": null, "id": "23a58fef-1ec2-46a1-9e65-9e993981c9de", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0.71532602, 1.41275992, 0.17419851, 0.61789045,\n", " 0.33006896, -91.64416173, 3.56809569, -2.47231507,\n", " 1.6260681 ])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.coef_" ] }, { "cell_type": "markdown", "id": "e75338a2-9086-461e-9f30-3a51fa87a542", "metadata": {}, "source": [ "### Statsmodel api" ] }, { "cell_type": "code", "execution_count": null, "id": "cc42cee7-4ae2-487c-ad8b-0e1d347f50ca", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Income</th>\n", " <th>Cards</th>\n", " <th>Age</th>\n", " <th>Education</th>\n", " <th>Gender</th>\n", " <th>Student</th>\n", " <th>Married</th>\n", " <th>Ethnicity</th>\n", " 
<th>Balance</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>14.891</td>\n", " <td>2</td>\n", " <td>34</td>\n", " <td>11</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>333</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>106.025</td>\n", " <td>3</td>\n", " <td>82</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>903</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>104.593</td>\n", " <td>4</td>\n", " <td>71</td>\n", " <td>11</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>580</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>148.924</td>\n", " <td>3</td>\n", " <td>36</td>\n", " <td>11</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>964</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>55.882</td>\n", " <td>2</td>\n", " <td>68</td>\n", " <td>16</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>331</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>295</th>\n", " <td>27.272</td>\n", " <td>5</td>\n", " <td>67</td>\n", " <td>10</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>296</th>\n", " <td>65.896</td>\n", " <td>1</td>\n", " <td>49</td>\n", " <td>17</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>293</td>\n", " </tr>\n", " <tr>\n", " <th>297</th>\n", " <td>55.054</td>\n", " <td>3</td>\n", " <td>74</td>\n", " <td>17</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>188</td>\n", " </tr>\n", " <tr>\n", " <th>298</th>\n", " <td>20.791</td>\n", " <td>1</td>\n", " <td>70</td>\n", " 
<td>18</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>299</th>\n", " <td>24.919</td>\n", " <td>3</td>\n", " <td>76</td>\n", " <td>11</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>711</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>300 rows × 9 columns</p>\n", "</div>" ], "text/plain": [ " Income Cards Age Education Gender Student Married Ethnicity \\\n", "0 14.891 2 34 11 0 0 1 2 \n", "1 106.025 3 82 15 1 1 1 1 \n", "2 104.593 4 71 11 0 0 0 1 \n", "3 148.924 3 36 11 1 0 0 1 \n", "4 55.882 2 68 16 0 0 1 2 \n", ".. ... ... ... ... ... ... ... ... \n", "295 27.272 5 67 10 1 0 1 2 \n", "296 65.896 1 49 17 1 0 1 2 \n", "297 55.054 3 74 17 0 0 1 1 \n", "298 20.791 1 70 18 1 0 0 0 \n", "299 24.919 3 76 11 1 0 1 0 \n", "\n", " Balance \n", "0 333 \n", "1 903 \n", "2 580 \n", "3 964 \n", "4 331 \n", ".. ... \n", "295 0 \n", "296 293 \n", "297 188 \n", "298 0 \n", "299 711 \n", "\n", "[300 rows x 9 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def preprocess_categories(df):\n", "    \"\"\"Return a copy of `df` with each category-dtype column replaced by its integer codes.\n", "\n", "    Codes follow the order of the column's categories. Missing values, which\n", "    `.cat.codes` reports as -1, become NaN so a downstream model cannot\n", "    mistake them for a real level. The input frame is not modified.\n", "    \"\"\"\n", "    df_out = df.copy()\n", "    for col in df.select_dtypes(include='category').columns:\n", "        codes = df[col].cat.codes\n", "        if (codes < 0).any():\n", "            # -1 is the pandas sentinel for a missing category, not a level.\n", "            codes = codes.where(codes >= 0)\n", "        df_out[col] = codes\n", "    return df_out\n", "\n", "preprocess_categories(X)" ] }, { "cell_type": "code", "execution_count": null, "id": "ad73cf4b-6928-4a3a-94db-b9751777b364", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>const</th>\n", " <th>Income</th>\n", " <th>Cards</th>\n", " <th>Age</th>\n", " <th>Education</th>\n", " 
<th>Gender</th>\n", " <th>Student</th>\n", " <th>Married</th>\n", " <th>Ethnicity</th>\n", " <th>Balance</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1.0</td>\n", " <td>14.891</td>\n", " <td>2</td>\n", " <td>34</td>\n", " <td>11</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>333</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1.0</td>\n", " <td>106.025</td>\n", " <td>3</td>\n", " <td>82</td>\n", " <td>15</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>903</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1.0</td>\n", " <td>104.593</td>\n", " <td>4</td>\n", " <td>71</td>\n", " <td>11</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>580</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1.0</td>\n", " <td>148.924</td>\n", " <td>3</td>\n", " <td>36</td>\n", " <td>11</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>964</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1.0</td>\n", " <td>55.882</td>\n", " <td>2</td>\n", " <td>68</td>\n", " <td>16</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>331</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>295</th>\n", " <td>1.0</td>\n", " <td>27.272</td>\n", " <td>5</td>\n", " <td>67</td>\n", " <td>10</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>296</th>\n", " <td>1.0</td>\n", " <td>65.896</td>\n", " <td>1</td>\n", " <td>49</td>\n", " <td>17</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>293</td>\n", " </tr>\n", " <tr>\n", " <th>297</th>\n", " <td>1.0</td>\n", " 
<td>55.054</td>\n", " <td>3</td>\n", " <td>74</td>\n", " <td>17</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>188</td>\n", " </tr>\n", " <tr>\n", " <th>298</th>\n", " <td>1.0</td>\n", " <td>20.791</td>\n", " <td>1</td>\n", " <td>70</td>\n", " <td>18</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>299</th>\n", " <td>1.0</td>\n", " <td>24.919</td>\n", " <td>3</td>\n", " <td>76</td>\n", " <td>11</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>711</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>300 rows × 10 columns</p>\n", "</div>" ], "text/plain": [ " const Income Cards Age Education Gender Student Married \\\n", "0 1.0 14.891 2 34 11 0 0 1 \n", "1 1.0 106.025 3 82 15 1 1 1 \n", "2 1.0 104.593 4 71 11 0 0 0 \n", "3 1.0 148.924 3 36 11 1 0 0 \n", "4 1.0 55.882 2 68 16 0 0 1 \n", ".. ... ... ... ... ... ... ... ... \n", "295 1.0 27.272 5 67 10 1 0 1 \n", "296 1.0 65.896 1 49 17 1 0 1 \n", "297 1.0 55.054 3 74 17 0 0 1 \n", "298 1.0 20.791 1 70 18 1 0 0 \n", "299 1.0 24.919 3 76 11 1 0 1 \n", "\n", " Ethnicity Balance \n", "0 2 333 \n", "1 1 903 \n", "2 1 580 \n", "3 1 964 \n", "4 2 331 \n", ".. ... ... \n", "295 2 0 \n", "296 2 293 \n", "297 1 188 \n", "298 0 0 \n", "299 0 711 \n", "\n", "[300 rows x 10 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = sm.add_constant(preprocess_categories(X))\n", "X" ] }, { "cell_type": "code", "execution_count": null, "id": "8b43d7bd-46bd-412a-9645-43364bc89b3e", "metadata": {}, "outputs": [], "source": [ "model = sm.OLS(y, X).fit()" ] }, { "cell_type": "code", "execution_count": null, "id": "af0e37a0-e8ab-41f0-b1aa-8f9b4a73a630", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<table class=\"simpletable\">\n", "<caption>OLS Regression Results</caption>\n", "<tr>\n", " <th>Dep. 
Variable:</th> <td>Rating</td> <th> R-squared: </th> <td> 0.974</td> \n", "</tr>\n", "<tr>\n", " <th>Model:</th> <td>OLS</td> <th> Adj. R-squared: </th> <td> 0.973</td> \n", "</tr>\n", "<tr>\n", " <th>Method:</th> <td>Least Squares</td> <th> F-statistic: </th> <td> 1185.</td> \n", "</tr>\n", "<tr>\n", " <th>Date:</th> <td>Fri, 27 May 2022</td> <th> Prob (F-statistic):</th> <td>6.33e-223</td>\n", "</tr>\n", "<tr>\n", " <th>Time:</th> <td>08:02:31</td> <th> Log-Likelihood: </th> <td> -1385.4</td> \n", "</tr>\n", "<tr>\n", " <th>No. Observations:</th> <td> 300</td> <th> AIC: </th> <td> 2791.</td> \n", "</tr>\n", "<tr>\n", " <th>Df Residuals:</th> <td> 290</td> <th> BIC: </th> <td> 2828.</td> \n", "</tr>\n", "<tr>\n", " <th>Df Model:</th> <td> 9</td> <th> </th> <td> </td> \n", "</tr>\n", "<tr>\n", " <th>Covariance Type:</th> <td>nonrobust</td> <th> </th> <td> </td> \n", "</tr>\n", "</table>\n", "<table class=\"simpletable\">\n", "<tr>\n", " <td></td> <th>coef</th> <th>std err</th> <th>t</th> <th>P>|t|</th> <th>[0.025</th> <th>0.975]</th> \n", "</tr>\n", "<tr>\n", " <th>const</th> <td> 139.4908</td> <td> 9.595</td> <td> 14.538</td> <td> 0.000</td> <td> 120.607</td> <td> 158.375</td>\n", "</tr>\n", "<tr>\n", " <th>Income</th> <td> 2.0946</td> <td> 0.048</td> <td> 43.507</td> <td> 0.000</td> <td> 2.000</td> <td> 2.189</td>\n", "</tr>\n", "<tr>\n", " <th>Cards</th> <td> -0.7769</td> <td> 1.080</td> <td> -0.719</td> <td> 0.473</td> <td> -2.903</td> <td> 1.349</td>\n", "</tr>\n", "<tr>\n", " <th>Age</th> <td> 0.1493</td> <td> 0.086</td> <td> 1.740</td> <td> 0.083</td> <td> -0.020</td> <td> 0.318</td>\n", "</tr>\n", "<tr>\n", " <th>Education</th> <td> 0.1721</td> <td> 0.474</td> <td> 0.363</td> <td> 0.717</td> <td> -0.761</td> <td> 1.105</td>\n", "</tr>\n", "<tr>\n", " <th>Gender</th> <td> 1.8529</td> <td> 2.919</td> <td> 0.635</td> <td> 0.526</td> <td> -3.891</td> <td> 7.597</td>\n", "</tr>\n", "<tr>\n", " <th>Student</th> <td> -99.2582</td> <td> 4.947</td> <td> -20.066</td> 
<td> 0.000</td> <td> -108.994</td> <td> -89.522</td>\n", "</tr>\n", "<tr>\n", " <th>Married</th> <td> 2.7424</td> <td> 2.983</td> <td> 0.919</td> <td> 0.359</td> <td> -3.129</td> <td> 8.614</td>\n", "</tr>\n", "<tr>\n", " <th>Ethnicity</th> <td> -0.3005</td> <td> 1.745</td> <td> -0.172</td> <td> 0.863</td> <td> -3.735</td> <td> 3.134</td>\n", "</tr>\n", "<tr>\n", " <th>Balance</th> <td> 0.2316</td> <td> 0.004</td> <td> 63.330</td> <td> 0.000</td> <td> 0.224</td> <td> 0.239</td>\n", "</tr>\n", "</table>\n", "<table class=\"simpletable\">\n", "<tr>\n", " <th>Omnibus:</th> <td>43.876</td> <th> Durbin-Watson: </th> <td> 1.851</td>\n", "</tr>\n", "<tr>\n", " <th>Prob(Omnibus):</th> <td> 0.000</td> <th> Jarque-Bera (JB): </th> <td> 59.049</td>\n", "</tr>\n", "<tr>\n", " <th>Skew:</th> <td>-0.999</td> <th> Prob(JB): </th> <td>1.51e-13</td>\n", "</tr>\n", "<tr>\n", " <th>Kurtosis:</th> <td> 3.857</td> <th> Cond. No. </th> <td>4.61e+03</td>\n", "</tr>\n", "</table><br/><br/>Notes:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.<br/>[2] The condition number is large, 4.61e+03. This might indicate that there are<br/>strong multicollinearity or other numerical problems." ], "text/plain": [ "<class 'statsmodels.iolib.summary.Summary'>\n", "\"\"\"\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: Rating R-squared: 0.974\n", "Model: OLS Adj. R-squared: 0.973\n", "Method: Least Squares F-statistic: 1185.\n", "Date: Fri, 27 May 2022 Prob (F-statistic): 6.33e-223\n", "Time: 08:02:31 Log-Likelihood: -1385.4\n", "No. 
Observations: 300 AIC: 2791.\n", "Df Residuals: 290 BIC: 2828.\n", "Df Model: 9 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const 139.4908 9.595 14.538 0.000 120.607 158.375\n", "Income 2.0946 0.048 43.507 0.000 2.000 2.189\n", "Cards -0.7769 1.080 -0.719 0.473 -2.903 1.349\n", "Age 0.1493 0.086 1.740 0.083 -0.020 0.318\n", "Education 0.1721 0.474 0.363 0.717 -0.761 1.105\n", "Gender 1.8529 2.919 0.635 0.526 -3.891 7.597\n", "Student -99.2582 4.947 -20.066 0.000 -108.994 -89.522\n", "Married 2.7424 2.983 0.919 0.359 -3.129 8.614\n", "Ethnicity -0.3005 1.745 -0.172 0.863 -3.735 3.134\n", "Balance 0.2316 0.004 63.330 0.000 0.224 0.239\n", "==============================================================================\n", "Omnibus: 43.876 Durbin-Watson: 1.851\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 59.049\n", "Skew: -0.999 Prob(JB): 1.51e-13\n", "Kurtosis: 3.857 Cond. No. 4.61e+03\n", "==============================================================================\n", "\n", "Notes:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 4.61e+03. 
This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n", "\"\"\"" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.summary()" ] }, { "cell_type": "code", "execution_count": null, "id": "a08f7238-5ef6-409e-866b-5bd4d30c22d7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Balance 63.329758\n", "Income 43.507422\n", "Student 20.066064\n", "const 14.538499\n", "Age 1.740111\n", "Married 0.919321\n", "Cards 0.719127\n", "Gender 0.634877\n", "Education 0.363174\n", "Ethnicity 0.172217\n", "dtype: float64" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.tvalues.abs().sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "8b899f23-2fb1-423c-9212-33c5722bbde6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Balance 63.329758\n", "Income 43.507422\n", "Student -20.066064\n", "const 14.538499\n", "dtype: float64" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.tvalues[model.tvalues[model.pvalues <= 0.05].abs().sort_values(ascending=False).index]" ] }, { "cell_type": "code", "execution_count": null, "id": "a09f1fed-3174-4e7e-8c6d-686a4cbc8de0", "metadata": {}, "outputs": [], "source": [ "# Pearson correlation between fitted and observed ratings.\n", "# np.correlate would return the raw sliding dot product, not a coefficient in [-1, 1].\n", "np.corrcoef(model.fittedvalues, y)[0, 1]" ] }, { "cell_type": "code", "execution_count": null, "id": "20d2c6d5-1b33-43b4-95fa-d6b6e15fc023", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 255.364880\n", "1 488.244174\n", "2 501.990296\n", "3 681.185956\n", "4 346.698891\n", " ... 
\n", "295 208.450617\n", "296 358.837627\n", "297 312.436387\n", "298 197.666967\n", "299 371.866045\n", "Length: 300, dtype: float64" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fittedvalues" ] }, { "cell_type": "code", "execution_count": null, "id": "84e39447-13cf-464a-9c8a-ec2d1f9fc151", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1. , 0.9866719],\n", " [0.9866719, 1. ]])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.corrcoef(model.fittedvalues.values, y.values)" ] }, { "cell_type": "markdown", "id": "d4ddb1b1-bf69-4d3f-ad5b-630cc70071a2", "metadata": {}, "source": [ "### Limited Variables Income, Cards, Married" ] }, { "cell_type": "code", "execution_count": null, "id": "45f58e37-fc62-440f-8a45-2568e0f2d7a5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>const</th>\n", " <th>Income</th>\n", " <th>Cards</th>\n", " <th>Married</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1.0</td>\n", " <td>14.891</td>\n", " <td>2</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1.0</td>\n", " <td>106.025</td>\n", " <td>3</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1.0</td>\n", " <td>104.593</td>\n", " <td>4</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1.0</td>\n", " <td>148.924</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1.0</td>\n", " <td>55.882</td>\n", " <td>2</td>\n", " <td>1</td>\n", " 
</tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>295</th>\n", " <td>1.0</td>\n", " <td>27.272</td>\n", " <td>5</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>296</th>\n", " <td>1.0</td>\n", " <td>65.896</td>\n", " <td>1</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>297</th>\n", " <td>1.0</td>\n", " <td>55.054</td>\n", " <td>3</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>298</th>\n", " <td>1.0</td>\n", " <td>20.791</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>299</th>\n", " <td>1.0</td>\n", " <td>24.919</td>\n", " <td>3</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>300 rows × 4 columns</p>\n", "</div>" ], "text/plain": [ " const Income Cards Married\n", "0 1.0 14.891 2 1\n", "1 1.0 106.025 3 1\n", "2 1.0 104.593 4 0\n", "3 1.0 148.924 3 0\n", "4 1.0 55.882 2 1\n", ".. ... ... ... ...\n", "295 1.0 27.272 5 1\n", "296 1.0 65.896 1 1\n", "297 1.0 55.054 3 1\n", "298 1.0 20.791 1 0\n", "299 1.0 24.919 3 1\n", "\n", "[300 rows x 4 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_red = X[['const', 'Income', 'Cards', 'Married']]\n", "X_red" ] }, { "cell_type": "code", "execution_count": null, "id": "c07628d2-e64e-4b80-8adc-acbaf46aecea", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "d18b567f-4f32-40bf-98ed-1088b5237f99", "metadata": {}, "outputs": [], "source": [ "model2 = sm.OLS(y, X_red).fit()" ] }, { "cell_type": "code", "execution_count": null, "id": "8869c58c-7201-4689-a268-8cb3a0d158b2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<table class=\"simpletable\">\n", "<caption>OLS Regression Results</caption>\n", "<tr>\n", " <th>Dep. Variable:</th> <td>Rating</td> <th> R-squared: </th> <td> 0.602</td>\n", "</tr>\n", "<tr>\n", " <th>Model:</th> <td>OLS</td> <th> Adj. 
R-squared: </th> <td> 0.598</td>\n", "</tr>\n", "<tr>\n", " <th>Method:</th> <td>Least Squares</td> <th> F-statistic: </th> <td> 149.0</td>\n", "</tr>\n", "<tr>\n", " <th>Date:</th> <td>Fri, 27 May 2022</td> <th> Prob (F-statistic):</th> <td>7.56e-59</td>\n", "</tr>\n", "<tr>\n", " <th>Time:</th> <td>08:02:39</td> <th> Log-Likelihood: </th> <td> -1792.1</td>\n", "</tr>\n", "<tr>\n", " <th>No. Observations:</th> <td> 300</td> <th> AIC: </th> <td> 3592.</td>\n", "</tr>\n", "<tr>\n", " <th>Df Residuals:</th> <td> 296</td> <th> BIC: </th> <td> 3607.</td>\n", "</tr>\n", "<tr>\n", " <th>Df Model:</th> <td> 3</td> <th> </th> <td> </td> \n", "</tr>\n", "<tr>\n", " <th>Covariance Type:</th> <td>nonrobust</td> <th> </th> <td> </td> \n", "</tr>\n", "</table>\n", "<table class=\"simpletable\">\n", "<tr>\n", " <td></td> <th>coef</th> <th>std err</th> <th>t</th> <th>P>|t|</th> <th>[0.025</th> <th>0.975]</th> \n", "</tr>\n", "<tr>\n", " <th>const</th> <td> 165.2144</td> <td> 16.641</td> <td> 9.928</td> <td> 0.000</td> <td> 132.464</td> <td> 197.964</td>\n", "</tr>\n", "<tr>\n", " <th>Income</th> <td> 3.4196</td> <td> 0.164</td> <td> 20.896</td> <td> 0.000</td> <td> 3.098</td> <td> 3.742</td>\n", "</tr>\n", "<tr>\n", " <th>Cards</th> <td> 8.2699</td> <td> 4.099</td> <td> 2.018</td> <td> 0.045</td> <td> 0.203</td> <td> 16.336</td>\n", "</tr>\n", "<tr>\n", " <th>Married</th> <td> 11.8404</td> <td> 11.339</td> <td> 1.044</td> <td> 0.297</td> <td> -10.474</td> <td> 34.155</td>\n", "</tr>\n", "</table>\n", "<table class=\"simpletable\">\n", "<tr>\n", " <th>Omnibus:</th> <td>133.940</td> <th> Durbin-Watson: </th> <td> 1.873</td>\n", "</tr>\n", "<tr>\n", " <th>Prob(Omnibus):</th> <td> 0.000</td> <th> Jarque-Bera (JB): </th> <td> 17.170</td>\n", "</tr>\n", "<tr>\n", " <th>Skew:</th> <td> 0.044</td> <th> Prob(JB): </th> <td>0.000187</td>\n", "</tr>\n", "<tr>\n", " <th>Kurtosis:</th> <td> 1.831</td> <th> Cond. No. 
</th> <td> 179.</td>\n", "</tr>\n", "</table><br/><br/>Notes:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." ], "text/plain": [ "<class 'statsmodels.iolib.summary.Summary'>\n", "\"\"\"\n", " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: Rating R-squared: 0.602\n", "Model: OLS Adj. R-squared: 0.598\n", "Method: Least Squares F-statistic: 149.0\n", "Date: Fri, 27 May 2022 Prob (F-statistic): 7.56e-59\n", "Time: 08:02:39 Log-Likelihood: -1792.1\n", "No. Observations: 300 AIC: 3592.\n", "Df Residuals: 296 BIC: 3607.\n", "Df Model: 3 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const 165.2144 16.641 9.928 0.000 132.464 197.964\n", "Income 3.4196 0.164 20.896 0.000 3.098 3.742\n", "Cards 8.2699 4.099 2.018 0.045 0.203 16.336\n", "Married 11.8404 11.339 1.044 0.297 -10.474 34.155\n", "==============================================================================\n", "Omnibus: 133.940 Durbin-Watson: 1.873\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 17.170\n", "Skew: 0.044 Prob(JB): 0.000187\n", "Kurtosis: 1.831 Cond. No. 
179.\n", "==============================================================================\n", "\n", "Notes:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "\"\"\"" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model2.summary()" ] }, { "cell_type": "code", "execution_count": null, "id": "8c15f308-a5bd-420a-9060-aae8a3a558e4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Income 20.895784\n", "const 9.928059\n", "Cards 2.017633\n", "dtype: float64" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model2.tvalues[model2.tvalues[model2.pvalues <= 0.05].abs().sort_values(ascending=False).index]" ] }, { "cell_type": "markdown", "id": "73e5e5eb-5d84-4cf4-917d-2224ee66fb5a", "metadata": {}, "source": [ "## HR Example" ] }, { "cell_type": "code", "execution_count": null, "id": "b8a4a2cb-5ae8-434d-a6e6-44d8f7f8e51e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>S</th>\n", " <th>LPE</th>\n", " <th>NP</th>\n", " <th>ANH</th>\n", " <th>TIC</th>\n", " <th>Newborn</th>\n", " <th>left</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.38</td>\n", " <td>0.53</td>\n", " <td>2</td>\n", " <td>157</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0.80</td>\n", " <td>0.86</td>\n", " <td>5</td>\n", " <td>262</td>\n", " <td>6</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0.11</td>\n", " <td>0.88</td>\n", " 
<td>7</td>\n", " <td>272</td>\n", " <td>4</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0.72</td>\n", " <td>0.87</td>\n", " <td>5</td>\n", " <td>223</td>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0.37</td>\n", " <td>0.52</td>\n", " <td>2</td>\n", " <td>159</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " S LPE NP ANH TIC Newborn left\n", "0 0.38 0.53 2 157 3 0 1\n", "1 0.80 0.86 5 262 6 0 1\n", "2 0.11 0.88 7 272 4 0 1\n", "3 0.72 0.87 5 223 5 0 1\n", "4 0.37 0.52 2 159 3 0 1" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_hr = pd.read_csv(\"DATA_3.02_HR2.csv\"); df_hr.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "b3c97757-53c6-4c41-b6d3-8f8937d66370", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 12000 entries, 0 to 11999\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 S 12000 non-null float64\n", " 1 LPE 12000 non-null float64\n", " 2 NP 12000 non-null int64 \n", " 3 ANH 12000 non-null int64 \n", " 4 TIC 12000 non-null int64 \n", " 5 Newborn 12000 non-null int64 \n", " 6 left 12000 non-null int64 \n", "dtypes: float64(2), int64(5)\n", "memory usage: 656.4 KB\n" ] } ], "source": [ "df_hr.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "b631c30c-cc89-40e0-8fb8-652392f52d36", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" 
class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>S</th>\n", " <th>LPE</th>\n", " <th>NP</th>\n", " <th>ANH</th>\n", " <th>TIC</th>\n", " <th>Newborn</th>\n", " <th>left</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>12000.000000</td>\n", " <td>12000.000000</td>\n", " <td>12000.000000</td>\n", " <td>12000.000000</td>\n", " <td>12000.000000</td>\n", " <td>12000.000000</td>\n", " <td>12000.000000</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>0.629463</td>\n", " <td>0.716558</td>\n", " <td>3.801833</td>\n", " <td>200.437917</td>\n", " <td>3.228750</td>\n", " <td>0.154167</td>\n", " <td>0.166667</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>0.241100</td>\n", " <td>0.168368</td>\n", " <td>1.163906</td>\n", " <td>48.740178</td>\n", " <td>1.056811</td>\n", " <td>0.361123</td>\n", " <td>0.372694</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>0.090000</td>\n", " <td>0.360000</td>\n", " <td>2.000000</td>\n", " <td>96.000000</td>\n", " <td>2.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>0.480000</td>\n", " <td>0.570000</td>\n", " <td>3.000000</td>\n", " <td>157.000000</td>\n", " <td>2.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>0.660000</td>\n", " <td>0.720000</td>\n", " <td>4.000000</td>\n", " <td>199.500000</td>\n", " <td>3.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>0.820000</td>\n", " <td>0.860000</td>\n", " <td>5.000000</td>\n", " <td>243.000000</td>\n", " <td>4.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>1.000000</td>\n", " <td>1.000000</td>\n", " <td>7.000000</td>\n", " <td>310.000000</td>\n", " <td>6.000000</td>\n", " <td>1.000000</td>\n", " <td>1.000000</td>\n", " </tr>\n", " 
</tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " S LPE NP ANH TIC \\\n", "count 12000.000000 12000.000000 12000.000000 12000.000000 12000.000000 \n", "mean 0.629463 0.716558 3.801833 200.437917 3.228750 \n", "std 0.241100 0.168368 1.163906 48.740178 1.056811 \n", "min 0.090000 0.360000 2.000000 96.000000 2.000000 \n", "25% 0.480000 0.570000 3.000000 157.000000 2.000000 \n", "50% 0.660000 0.720000 4.000000 199.500000 3.000000 \n", "75% 0.820000 0.860000 5.000000 243.000000 4.000000 \n", "max 1.000000 1.000000 7.000000 310.000000 6.000000 \n", "\n", " Newborn left \n", "count 12000.000000 12000.000000 \n", "mean 0.154167 0.166667 \n", "std 0.361123 0.372694 \n", "min 0.000000 0.000000 \n", "25% 0.000000 0.000000 \n", "50% 0.000000 0.000000 \n", "75% 0.000000 0.000000 \n", "max 1.000000 1.000000 " ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_hr.describe()" ] }, { "cell_type": "code", "execution_count": null, "id": "bd2a4d7b-96db-42bf-86a3-ea8a1ff9153f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<AxesSubplot:ylabel='Frequency'>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_hr['left'].plot.hist()" ] }, { "cell_type": "code", "execution_count": null, "id": "84e823bb-82e6-467b-b67b-24dedbda07f4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<AxesSubplot:ylabel='Frequency'>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_hr['S'].plot.hist(bins=20)" ] }, { "cell_type": "code", "execution_count": null, "id": "2e7d2e0f-b5dd-4d01-a159-64bd49d06e28", "metadata": {}, "outputs": [], "source": [ "y = 
df_hr['left']\n", "# drop() returns a new frame, so df_hr keeps its 'left' column and this cell can be re-run safely\n", "X = df_hr.drop(columns='left')" ] }, { "cell_type": "code", "execution_count": null, "id": "66b02a04-fd6c-44e5-8803-e25be209c3d7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>S</th>\n", " <th>LPE</th>\n", " <th>NP</th>\n", " <th>ANH</th>\n", " <th>TIC</th>\n", " <th>Newborn</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.38</td>\n", " <td>0.53</td>\n", " <td>2</td>\n", " <td>157</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0.80</td>\n", " <td>0.86</td>\n", " <td>5</td>\n", " <td>262</td>\n", " <td>6</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0.11</td>\n", " <td>0.88</td>\n", " <td>7</td>\n", " <td>272</td>\n", " <td>4</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0.72</td>\n", " <td>0.87</td>\n", " <td>5</td>\n", " <td>223</td>\n", " <td>5</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0.37</td>\n", " <td>0.52</td>\n", " <td>2</td>\n", " <td>159</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>11995</th>\n", " <td>0.90</td>\n", " <td>0.55</td>\n", " <td>3</td>\n", " <td>259</td>\n", " <td>2</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>11996</th>\n", " <td>0.74</td>\n", " <td>0.95</td>\n", " <td>5</td>\n", " <td>266</td>\n", " <td>4</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>11997</th>\n", " <td>0.85</td>\n", 
" <td>0.54</td>\n", " <td>3</td>\n", " <td>185</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>11998</th>\n", " <td>0.33</td>\n", " <td>0.65</td>\n", " <td>3</td>\n", " <td>172</td>\n", " <td>5</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>11999</th>\n", " <td>0.50</td>\n", " <td>0.73</td>\n", " <td>4</td>\n", " <td>180</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>12000 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ " S LPE NP ANH TIC Newborn\n", "0 0.38 0.53 2 157 3 0\n", "1 0.80 0.86 5 262 6 0\n", "2 0.11 0.88 7 272 4 0\n", "3 0.72 0.87 5 223 5 0\n", "4 0.37 0.52 2 159 3 0\n", "... ... ... .. ... ... ...\n", "11995 0.90 0.55 3 259 2 1\n", "11996 0.74 0.95 5 266 4 0\n", "11997 0.85 0.54 3 185 3 0\n", "11998 0.33 0.65 3 172 5 0\n", "11999 0.50 0.73 4 180 3 0\n", "\n", "[12000 rows x 6 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": null, "id": "59f26d5b-6e3e-4fd4-b4a1-a065d06989a2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1\n", "1 1\n", "2 1\n", "3 1\n", "4 1\n", " ..\n", "11995 0\n", "11996 0\n", "11997 0\n", "11998 0\n", "11999 0\n", "Name: left, Length: 12000, dtype: int64" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y" ] }, { "cell_type": "code", "execution_count": null, "id": "25f796be-6d89-4a65-a279-f23c79542383", "metadata": {}, "outputs": [], "source": [ "X = sm.add_constant(X)" ] }, { "cell_type": "code", "execution_count": null, "id": "01477663-857c-4f90-b814-635cbb1bffb5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 0.354538\n", " Iterations 7\n" ] } ], "source": [ "model_hr = sm.Logit(y, X).fit()" ] }, { "cell_type": "code", "execution_count": null, "id": 
"9f327cbc-aa9e-4966-9b8b-a5f92f387168", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<table class=\"simpletable\">\n", "<caption>Logit Regression Results</caption>\n", "<tr>\n", " <th>Dep. Variable:</th> <td>left</td> <th> No. Observations: </th> <td> 12000</td> \n", "</tr>\n", "<tr>\n", " <th>Model:</th> <td>Logit</td> <th> Df Residuals: </th> <td> 11993</td> \n", "</tr>\n", "<tr>\n", " <th>Method:</th> <td>MLE</td> <th> Df Model: </th> <td> 6</td> \n", "</tr>\n", "<tr>\n", " <th>Date:</th> <td>Fri, 27 May 2022</td> <th> Pseudo R-squ.: </th> <td>0.2131</td> \n", "</tr>\n", "<tr>\n", " <th>Time:</th> <td>08:04:49</td> <th> Log-Likelihood: </th> <td> -4254.5</td>\n", "</tr>\n", "<tr>\n", " <th>converged:</th> <td>True</td> <th> LL-Null: </th> <td> -5406.7</td>\n", "</tr>\n", "<tr>\n", " <th>Covariance Type:</th> <td>nonrobust</td> <th> LLR p-value: </th> <td> 0.000</td> \n", "</tr>\n", "</table>\n", "<table class=\"simpletable\">\n", "<tr>\n", " <td></td> <th>coef</th> <th>std err</th> <th>z</th> <th>P>|z|</th> <th>[0.025</th> <th>0.975]</th> \n", "</tr>\n", "<tr>\n", " <th>const</th> <td> -1.2412</td> <td> 0.160</td> <td> -7.751</td> <td> 0.000</td> <td> -1.555</td> <td> -0.927</td>\n", "</tr>\n", "<tr>\n", " <th>S</th> <td> -3.8163</td> <td> 0.121</td> <td> -31.607</td> <td> 0.000</td> <td> -4.053</td> <td> -3.580</td>\n", "</tr>\n", "<tr>\n", " <th>LPE</th> <td> 0.5044</td> <td> 0.181</td> <td> 2.788</td> <td> 0.005</td> <td> 0.150</td> <td> 0.859</td>\n", "</tr>\n", "<tr>\n", " <th>NP</th> <td> -0.3592</td> <td> 0.026</td> <td> -13.569</td> <td> 0.000</td> <td> -0.411</td> <td> -0.307</td>\n", "</tr>\n", "<tr>\n", " <th>ANH</th> <td> 0.0038</td> <td> 0.001</td> <td> 6.067</td> <td> 0.000</td> <td> 0.003</td> <td> 0.005</td>\n", "</tr>\n", "<tr>\n", " <th>TIC</th> <td> 0.6188</td> <td> 0.027</td> <td> 22.820</td> <td> 0.000</td> <td> 0.566</td> <td> 0.672</td>\n", "</tr>\n", "<tr>\n", " <th>Newborn</th> <td> -1.4851</td> <td> 0.113</td> <td> 
-13.157</td> <td> 0.000</td> <td> -1.706</td> <td> -1.264</td>\n", "</tr>\n", "</table>" ], "text/plain": [ "<class 'statsmodels.iolib.summary.Summary'>\n", "\"\"\"\n", " Logit Regression Results \n", "==============================================================================\n", "Dep. Variable: left No. Observations: 12000\n", "Model: Logit Df Residuals: 11993\n", "Method: MLE Df Model: 6\n", "Date: Fri, 27 May 2022 Pseudo R-squ.: 0.2131\n", "Time: 08:04:49 Log-Likelihood: -4254.5\n", "converged: True LL-Null: -5406.7\n", "Covariance Type: nonrobust LLR p-value: 0.000\n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -1.2412 0.160 -7.751 0.000 -1.555 -0.927\n", "S -3.8163 0.121 -31.607 0.000 -4.053 -3.580\n", "LPE 0.5044 0.181 2.788 0.005 0.150 0.859\n", "NP -0.3592 0.026 -13.569 0.000 -0.411 -0.307\n", "ANH 0.0038 0.001 6.067 0.000 0.003 0.005\n", "TIC 0.6188 0.027 22.820 0.000 0.566 0.672\n", "Newborn -1.4851 0.113 -13.157 0.000 -1.706 -1.264\n", "==============================================================================\n", "\"\"\"" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_hr.summary()" ] }, { "cell_type": "code", "execution_count": null, "id": "8046b296-fed4-47da-90f4-272f3593f146", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\u001b[0;31mSignature:\u001b[0m \u001b[0mmodel_hr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexog\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mDocstring:\u001b[0m\n", "Call self.model.predict with self.params as the first argument.\n", "\n", "Parameters\n", "----------\n", "exog : array_like, optional\n", " The values for which you want to predict. see Notes below.\n", "transform : bool, optional\n", " If the model was fit via a formula, do you want to pass\n", " exog through the formula. Default is True. E.g., if you fit\n", " a model y ~ log(x1) + log(x2), and transform is True, then\n", " you can pass a data structure that contains x1 and x2 in\n", " their original form. Otherwise, you'd need to log the data\n", " first.\n", "*args\n", " Additional arguments to pass to the model, see the\n", " predict method of the model for the details.\n", "**kwargs\n", " Additional keywords arguments to pass to the model, see the\n", " predict method of the model for the details.\n", "\n", "Returns\n", "-------\n", "array_like\n", " See self.model.predict.\n", "\n", "Notes\n", "-----\n", "The types of exog that are supported depends on whether a formula\n", "was used in the specification of the model.\n", "\n", "If a formula was used, then exog is processed in the same way as\n", "the original data. This transformation needs to have key access to the\n", "same variable names, and can be a pandas DataFrame or a dict like\n", "object that contains numpy arrays.\n", "\n", "If no formula was used, then the provided exog needs to have the\n", "same number of columns as the original exog in the model. 
No\n", "transformation of the data is performed except converting it to\n", "a numpy array.\n", "\n", "Row indices as in pandas data frames are supported, and added to the\n", "returned prediction.\n", "\u001b[0;31mFile:\u001b[0m /opt/anaconda/envs/aiking/lib/python3.9/site-packages/statsmodels/base/model.py\n", "\u001b[0;31mType:\u001b[0m method\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "model_hr.predict?" ] }, { "cell_type": "code", "execution_count": null, "id": "177f018a-d746-4fb8-b1dc-83f71699ef6d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7.641666666666667" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cutoff = 0.5\n", "(model_hr.predict(X) > cutoff).sum()*100/len(y)" ] }, { "cell_type": "code", "execution_count": null, "id": "5cc0ea1a-eae5-4979-af3d-c5b5e511e7fe", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th>left</th>\n", " <th>0</th>\n", " <th>1</th>\n", " </tr>\n", " <tr>\n", " <th>row_0</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>False</th>\n", " <td>9464</td>\n", " <td>1619</td>\n", " </tr>\n", " <tr>\n", " <th>True</th>\n", " <td>536</td>\n", " <td>381</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "left 0 1\n", "row_0 \n", "False 9464 1619\n", "True 536 381" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.crosstab(model_hr.predict(X) >cutoff, y)" ] }, { "cell_type": "code", "execution_count": null, "id": "b3563e23-3c27-410d-8a72-81e32b2a60ce", 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0.9464, 0.1905)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# (specificity, sensitivity) read off the crosstab above:\n", "# correctly predicted stayers / all stayers, correctly predicted leavers / all leavers\n", "9464/(9464+536), 381/(1619+381)" ] }, { "cell_type": "code", "execution_count": null, "id": "55ab4840-1389-491a-aac8-7318dfffbaad", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9235833333333333" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# share of employees predicted to stay (the 'False' row of the crosstab) -- this is not the accuracy\n", "(9464+1619)/12000" ] }, { "cell_type": "code", "execution_count": null, "id": "64ca6dff-f5c6-4d1f-bfa0-64b40d55d091", "metadata": {}, "outputs": [], "source": [ "# Accuracy = share of employees whose predicted class (at `cutoff`) matches the observed `left`.\n", "# Computed from the predictions instead of hand-copied crosstab counts, which are easy to mistype.\n", "accuracy = ((model_hr.predict(X) > cutoff).astype(int) == y).mean(); accuracy" ] }, { "cell_type": "code", "execution_count": null, "id": "47df970f-6673-41e3-bf10-a06dbbbcfa55", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "S -31.606505\n", "TIC 22.820109\n", "NP -13.569440\n", "Newborn -13.156788\n", "const -7.751316\n", "ANH 6.067180\n", "LPE 2.788130\n", "dtype: float64" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_hr.tvalues[model_hr.tvalues[model_hr.pvalues <= 0.05].abs().sort_values(ascending=False).index]" ] }, { "cell_type": "code", "execution_count": null, "id": "d474039f-918e-4dcb-869a-782ebdef665d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>S</th>\n", " <th>LPE</th>\n", " <th>NP</th>\n", " <th>ANH</th>\n", " <th>TIC</th>\n", " <th>Newborn</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.38</td>\n", " <td>0.53</td>\n", " <td>2</td>\n", " <td>157</td>\n", 
" <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0.80</td>\n", " <td>0.86</td>\n", " <td>5</td>\n", " <td>262</td>\n", " <td>6</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0.11</td>\n", " <td>0.88</td>\n", " <td>7</td>\n", " <td>272</td>\n", " <td>4</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0.72</td>\n", " <td>0.87</td>\n", " <td>5</td>\n", " <td>223</td>\n", " <td>5</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0.37</td>\n", " <td>0.52</td>\n", " <td>2</td>\n", " <td>159</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>11995</th>\n", " <td>0.90</td>\n", " <td>0.55</td>\n", " <td>3</td>\n", " <td>259</td>\n", " <td>2</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>11996</th>\n", " <td>0.74</td>\n", " <td>0.95</td>\n", " <td>5</td>\n", " <td>266</td>\n", " <td>4</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>11997</th>\n", " <td>0.85</td>\n", " <td>0.54</td>\n", " <td>3</td>\n", " <td>185</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>11998</th>\n", " <td>0.33</td>\n", " <td>0.65</td>\n", " <td>3</td>\n", " <td>172</td>\n", " <td>5</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>11999</th>\n", " <td>0.50</td>\n", " <td>0.73</td>\n", " <td>4</td>\n", " <td>180</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>12000 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ " S LPE NP ANH TIC Newborn\n", "0 0.38 0.53 2 157 3 0\n", "1 0.80 0.86 5 262 6 0\n", "2 0.11 0.88 7 272 4 0\n", "3 0.72 0.87 5 223 5 0\n", "4 0.37 0.52 2 159 3 0\n", "... ... ... .. ... ... 
...\n", "11995 0.90 0.55 3 259 2 1\n", "11996 0.74 0.95 5 266 4 0\n", "11997 0.85 0.54 3 185 3 0\n", "11998 0.33 0.65 3 172 5 0\n", "11999 0.50 0.73 4 180 3 0\n", "\n", "[12000 rows x 6 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "98ada868-95bb-4351-9c35-4f2678e89f76", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>S</th>\n", " <th>LPE</th>\n", " <th>NP</th>\n", " <th>ANH</th>\n", " <th>TIC</th>\n", " <th>Newborn</th>\n", " <th>left</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.38</td>\n", " <td>0.53</td>\n", " <td>2</td>\n", " <td>157</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0.80</td>\n", " <td>0.86</td>\n", " <td>5</td>\n", " <td>262</td>\n", " <td>6</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0.11</td>\n", " <td>0.88</td>\n", " <td>7</td>\n", " <td>272</td>\n", " <td>4</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0.72</td>\n", " <td>0.87</td>\n", " <td>5</td>\n", " <td>223</td>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0.37</td>\n", " <td>0.52</td>\n", " <td>2</td>\n", " <td>159</td>\n", " <td>3</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " S LPE NP ANH TIC Newborn left\n", "0 0.38 0.53 2 157 3 0 1\n", "1 0.80 0.86 5 262 6 0 1\n", "2 0.11 
0.88 7 272 4 0 1\n", "3 0.72 0.87 5 223 5 0 1\n", "4 0.37 0.52 2 159 3 0 1" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the HR data set; the preview lists the columns S, LPE, NP, ANH, TIC, Newborn and left.\n", "df_hr = pd.read_csv(\"DATA_3.02_HR2.csv\")\n", "df_hr.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "fac951c6-1207-4d7a-95c1-90506764e53f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.\n" ] }, { "data": { "text/plain": [ "<AxesSubplot:xlabel='Time in Company', ylabel='Attrition'>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Mean and sum of `left` for every TIC value, computed once and reused by the table cell below.\n", "attrition_by_tic = df_hr.groupby('TIC')['left'].agg(['mean', 'sum']).reset_index()\n", "# `color` is passed explicitly: without it pandas hands the theme RGB tuple to matplotlib as *c*,\n", "# which produced the stderr warning recorded from the previous run of this cell.\n", "# Marker size follows the 'sum' column; the trailing `;` keeps the Axes repr out of the output.\n", "attrition_by_tic.plot.scatter(\n", "    x='TIC',\n", "    y='mean',\n", "    s='sum',\n", "    color='C0',\n", "    xlabel='Time in Company',\n", "    ylabel='Attrition',\n", "    ylim=(0, 0.6),\n", ");" ] }, { "cell_type": "code", "execution_count": null, "id": "7d62f30a-835b-44d1-a4c8-3c040797b727", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>TIC</th>\n", " <th>mean</th>\n", " <th>sum</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2</td>\n", " <td>0.010262</td>\n", " <td>31</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", "
<td>3</td>\n", " <td>0.165727</td>\n", " <td>882</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>4</td>\n", " <td>0.240777</td>\n", " <td>496</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>5</td>\n", " <td>0.444240</td>\n", " <td>482</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>6</td>\n", " <td>0.212891</td>\n", " <td>109</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " TIC mean sum\n", "0 2 0.010262 31\n", "1 3 0.165727 882\n", "2 4 0.240777 496\n", "3 5 0.444240 482\n", "4 6 0.212891 109" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Tabular view of the per-TIC aggregate that feeds the scatter plot above.\n", "attrition_by_tic" ] }, { "cell_type": "code", "execution_count": null, "id": "f7a3908c-8464-4e72-9136-5d868c082973", "metadata": {}, "outputs": [], "source": [ "# Bucket rows by their rank on S. The bucket size is derived from the row count instead of the\n", "# former literal 600 (assumes the 12000 rows reported in the model summary, i.e. 12000 / 20).\n", "N_SATISFACTION_BINS = 20\n", "rows_per_bin = len(df_hr) / N_SATISFACTION_BINS\n", "# Ranks ascend with S and ties take the highest rank of their group (method='max'); the sign flip\n", "# means a more negative S_ranked corresponds to a higher S.\n", "# NOTE(review): this adds a column to df_hr in place; confirm no later cell treats every\n", "# non-target column of df_hr as a model feature.\n", "df_hr['S_ranked'] = -np.ceil(df_hr['S'].rank(method='max') / rows_per_bin)" ] }, { "cell_type": "code", "execution_count": null, "id": "a5798cfc-4a2f-4b6d-bab7-3f16ecfccd20", "metadata": {}, "outputs": [], "source": [ "# Broadcast the mean of `left` within each S_ranked bucket back onto every row of that bucket.\n", "df_hr['attrition'] = df_hr.groupby('S_ranked')['left'].transform('mean')" ] }, { "cell_type": "code", "execution_count": null, "id": "dd1af100-2d5a-4151-a5a2-f1b1dd1f28e1", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.
Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.\n" ] }, { "data": { "text/plain": [ "<AxesSubplot:xlabel='S_ranked', ylabel='attrition'>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# `attrition` is a per-bucket mean broadcast onto the rows, so every row of a bucket repeats the\n", "# same (S_ranked, attrition) pair; plot one marker per bucket instead of one per row.\n", "satisfaction_buckets = df_hr[['S_ranked', 'attrition']].drop_duplicates()\n", "# `color` is passed explicitly: without it pandas hands the theme RGB tuple to matplotlib as *c*,\n", "# which produced the stderr warning recorded from the previous run of this cell.\n", "# The trailing `;` keeps the Axes repr out of the output.\n", "satisfaction_buckets.plot.scatter(x='S_ranked', y='attrition', color='C0');" ] }, { "cell_type": "code", "execution_count": null, "id": "0f9b5e52-7e39-424d-b4f9-e07e98992658", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Newborn</th>\n", " <th>mean</th>\n", " <th>sum</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>0.186700</td>\n", " <td>1895</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>0.056757</td>\n", " <td>105</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Newborn mean sum\n", "0 0 0.186700 1895\n", "1 1 0.056757 105" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean and sum of `left`, split by the value of the Newborn column.\n", "df_hr.groupby('Newborn')['left'].agg(['mean', 'sum']).reset_index()" ] }, { "cell_type": "code", "execution_count": null, "id": "3034bf7e-97a4-4159-8ded-8a8214ab8e86", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* &
*y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.\n" ] }, { "data": { "text/plain": [ "<AxesSubplot:xlabel='New Projects', ylabel='Attrition'>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Mean and sum of `left` for every NP value.\n", "attrition_by_np = df_hr.groupby('NP')['left'].agg(['mean', 'sum']).reset_index()\n", "# `color` is passed explicitly: without it pandas hands the theme RGB tuple to matplotlib as *c*,\n", "# which produced the stderr warning recorded from the previous run of this cell.\n", "# Marker size follows the 'sum' column; the trailing `;` keeps the Axes repr out of the output.\n", "# NOTE(review): the x label reads 'New Projects'; confirm that this is what NP stands for.\n", "attrition_by_np.plot.scatter(\n", "    x='NP',\n", "    y='mean',\n", "    s='sum',\n", "    color='C0',\n", "    xlabel='New Projects',\n", "    ylabel='Attrition',\n", "    ylim=(0, 0.6),\n", ");" ] }, { "cell_type": "code", "execution_count": null, "id": "91a3053d-25a9-4c9c-881a-78fc43ac486e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<table class=\"simpletable\">\n", "<caption>Logit Regression Results</caption>\n", "<tr>\n", " <th>Dep. Variable:</th> <td>left</td> <th> No. Observations: </th> <td> 12000</td> \n", "</tr>\n", "<tr>\n", " <th>Model:</th> <td>Logit</td> <th> Df Residuals: </th> <td> 11993</td> \n", "</tr>\n", "<tr>\n", " <th>Method:</th> <td>MLE</td> <th> Df Model: </th> <td> 6</td> \n", "</tr>\n", "<tr>\n", " <th>Date:</th> <td>Fri, 27 May 2022</td> <th> Pseudo R-squ.: </th> <td>0.2131</td> \n", "</tr>\n", "<tr>\n", " <th>Time:</th> <td>08:46:06</td> <th> Log-Likelihood: </th> <td> -4254.5</td>\n", "</tr>\n", "<tr>\n", " <th>converged:</th> <td>True</td> <th> LL-Null: </th> <td> -5406.7</td>\n", "</tr>\n", "<tr>\n", " <th>Covariance Type:</th> <td>nonrobust</td> <th> LLR p-value: </th> <td> 0.000</td> \n", "</tr>\n", "</table>\n", "<table class=\"simpletable\">\n", "<tr>\n", " <td></td> <th>coef</th> <th>std err</th> <th>z</th> <th>P>|z|</th> <th>[0.025</th> <th>0.975]</th> \n", "</tr>\n", "<tr>\n", " <th>const</th> <td> -1.2412</td> <td> 0.160</td> <td> -7.751</td> <td> 0.000</td> <td> -1.555</td> <td> -0.927</td>\n", "</tr>\n", "<tr>\n", " <th>S</th> <td> -3.8163</td> <td> 0.121</td> <td>
-31.607</td> <td> 0.000</td> <td> -4.053</td> <td> -3.580</td>\n", "</tr>\n", "<tr>\n", " <th>LPE</th> <td> 0.5044</td> <td> 0.181</td> <td> 2.788</td> <td> 0.005</td> <td> 0.150</td> <td> 0.859</td>\n", "</tr>\n", "<tr>\n", " <th>NP</th> <td> -0.3592</td> <td> 0.026</td> <td> -13.569</td> <td> 0.000</td> <td> -0.411</td> <td> -0.307</td>\n", "</tr>\n", "<tr>\n", " <th>ANH</th> <td> 0.0038</td> <td> 0.001</td> <td> 6.067</td> <td> 0.000</td> <td> 0.003</td> <td> 0.005</td>\n", "</tr>\n", "<tr>\n", " <th>TIC</th> <td> 0.6188</td> <td> 0.027</td> <td> 22.820</td> <td> 0.000</td> <td> 0.566</td> <td> 0.672</td>\n", "</tr>\n", "<tr>\n", " <th>Newborn</th> <td> -1.4851</td> <td> 0.113</td> <td> -13.157</td> <td> 0.000</td> <td> -1.706</td> <td> -1.264</td>\n", "</tr>\n", "</table>" ], "text/plain": [ "<class 'statsmodels.iolib.summary.Summary'>\n", "\"\"\"\n", " Logit Regression Results \n", "==============================================================================\n", "Dep. Variable: left No. 
Observations: 12000\n", "Model: Logit Df Residuals: 11993\n", "Method: MLE Df Model: 6\n", "Date: Fri, 27 May 2022 Pseudo R-squ.: 0.2131\n", "Time: 08:46:06 Log-Likelihood: -4254.5\n", "converged: True LL-Null: -5406.7\n", "Covariance Type: nonrobust LLR p-value: 0.000\n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -1.2412 0.160 -7.751 0.000 -1.555 -0.927\n", "S -3.8163 0.121 -31.607 0.000 -4.053 -3.580\n", "LPE 0.5044 0.181 2.788 0.005 0.150 0.859\n", "NP -0.3592 0.026 -13.569 0.000 -0.411 -0.307\n", "ANH 0.0038 0.001 6.067 0.000 0.003 0.005\n", "TIC 0.6188 0.027 22.820 0.000 0.566 0.672\n", "Newborn -1.4851 0.113 -13.157 0.000 -1.706 -1.264\n", "==============================================================================\n", "\"\"\"" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Show the summary table of the model held in `model_hr`; the stored output is a Logit fit of\n", "# `left` on const, S, LPE, NP, ANH, TIC and Newborn.\n", "# NOTE(review): no cell visible in this part of the notebook assigns `model_hr`; confirm an\n", "# earlier cell fits it, otherwise Restart & Run All stops here with a NameError.\n", "model_hr.summary()" ] }, { "cell_type": "code", "execution_count": null, "id": "01476cae-b429-48e3-bd59-6dac6d4294a9", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 5 }