{ "cells": [ { "cell_type": "markdown", "id": "c8f169c2-e21c-40b6-b15f-7f6c849e9626", "metadata": {}, "source": [ "# Customer Analytics" ] }, { "cell_type": "markdown", "id": "4d541670-3435-46d1-aadd-a11b70fa4f04", "metadata": {}, "source": [ "## Introduction" ] }, { "cell_type": "markdown", "id": "7ce3aa9a-70da-4c94-9420-b0f39a346700", "metadata": {}, "source": [ "What will we cover in the course?\n", "- KYC\n", "- Purchase Probability\n", "- Brand Probability\n", "- Quantity to be purchased" ] }, { "cell_type": "code", "execution_count": null, "id": "d831c5ea-a7d1-4cda-a506-600750b6a63e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2022-05-15 08:13:50.540344: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n" ] } ], "source": [ "import tensorflow as tf\n", "import seaborn \n", "import pickle \n", "import sklearn\n", "import torch\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "id": "f4bf4332-8f23-4bf2-9ec2-ea76adbd3815", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "torch.cuda.is_available()" ] }, { "cell_type": "markdown", "id": "b5c16ed1-5e28-4543-99cf-c9b6bf1fcbd3", "metadata": {}, "source": [ "## Dataset" ] }, { "cell_type": "markdown", "id": "32a72d4e-6932-474c-a6f8-355f2ce50aff", "metadata": {}, "source": [ "### Demographic Data for Segmentation " ] }, { "cell_type": "code", "execution_count": null, "id": "6b29ea6a-02d0-4aa7-84ae-658314e1f030", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"segmentation data.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "215089ea-51f1-4ead-98e4-2d4e3d42301a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " 
.dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>ID</th>\n", " <th>Sex</th>\n", " <th>Marital status</th>\n", " <th>Age</th>\n", " <th>Education</th>\n", " <th>Income</th>\n", " <th>Occupation</th>\n", " <th>Settlement size</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>100000001</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>67</td>\n", " <td>2</td>\n", " <td>124670</td>\n", " <td>1</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>100000002</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>22</td>\n", " <td>1</td>\n", " <td>150773</td>\n", " <td>1</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>100000003</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>49</td>\n", " <td>1</td>\n", " <td>89210</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>100000004</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>45</td>\n", " <td>1</td>\n", " <td>171565</td>\n", " <td>1</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>100000005</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>53</td>\n", " <td>1</td>\n", " <td>149031</td>\n", " <td>1</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " ID Sex Marital status Age Education Income Occupation \\\n", "0 100000001 0 0 67 2 124670 1 \n", "1 100000002 1 1 22 1 150773 1 \n", "2 100000003 0 0 49 1 89210 0 \n", "3 100000004 0 0 45 1 171565 1 \n", "4 100000005 0 0 53 1 149031 1 \n", "\n", " Settlement size \n", "0 2 \n", "1 2 \n", "2 0 \n", "3 1 \n", "4 1 " ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "2c3eb7da-5fed-4af2-9de3-4d37f757c3b5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ID 2000\n", "Sex 2\n", "Marital status 2\n", "Age 58\n", "Education 4\n", "Income 1982\n", "Occupation 3\n", "Settlement size 3\n", "dtype: int64" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.nunique()" ] }, { "cell_type": "code", "execution_count": null, "id": "1ca56cc5-10e7-4f25-929d-5b68816a9cf4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<style type=\"text/css\">\n", "</style>\n", "<table id=\"T_8dfd8_\">\n", " <thead>\n", " <tr>\n", " <th class=\"col_heading level0 col0\" >Variable</th>\n", " <th class=\"col_heading level0 col1\" >Data type</th>\n", " <th class=\"col_heading level0 col2\" >Range</th>\n", " <th class=\"col_heading level0 col3\" >Description</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <td id=\"T_8dfd8_row0_col0\" class=\"data row0 col0\" >ID</td>\n", " <td id=\"T_8dfd8_row0_col1\" class=\"data row0 col1\" >numerical</td>\n", " <td id=\"T_8dfd8_row0_col2\" class=\"data row0 col2\" >Integer</td>\n", " <td id=\"T_8dfd8_row0_col3\" class=\"data row0 col3\" >Shows a unique identificator of a customer.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row1_col0\" class=\"data row1 col0\" >Sex</td>\n", " <td id=\"T_8dfd8_row1_col1\" class=\"data row1 col1\" >categorical</td>\n", " <td id=\"T_8dfd8_row1_col2\" class=\"data row1 col2\" >{0,1}</td>\n", " <td id=\"T_8dfd8_row1_col3\" class=\"data row1 col3\" >Biological sex (gender) of a customer. 
In this dataset there are only 2 different options.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row2_col0\" class=\"data row2 col0\" ></td>\n", " <td id=\"T_8dfd8_row2_col1\" class=\"data row2 col1\" ></td>\n", " <td id=\"T_8dfd8_row2_col2\" class=\"data row2 col2\" >0</td>\n", " <td id=\"T_8dfd8_row2_col3\" class=\"data row2 col3\" >male</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row3_col0\" class=\"data row3 col0\" ></td>\n", " <td id=\"T_8dfd8_row3_col1\" class=\"data row3 col1\" ></td>\n", " <td id=\"T_8dfd8_row3_col2\" class=\"data row3 col2\" >1</td>\n", " <td id=\"T_8dfd8_row3_col3\" class=\"data row3 col3\" >female</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row4_col0\" class=\"data row4 col0\" >Marital status</td>\n", " <td id=\"T_8dfd8_row4_col1\" class=\"data row4 col1\" >categorical</td>\n", " <td id=\"T_8dfd8_row4_col2\" class=\"data row4 col2\" >{0,1}</td>\n", " <td id=\"T_8dfd8_row4_col3\" class=\"data row4 col3\" >Marital status of a customer.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row5_col0\" class=\"data row5 col0\" ></td>\n", " <td id=\"T_8dfd8_row5_col1\" class=\"data row5 col1\" ></td>\n", " <td id=\"T_8dfd8_row5_col2\" class=\"data row5 col2\" >0</td>\n", " <td id=\"T_8dfd8_row5_col3\" class=\"data row5 col3\" >single</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row6_col0\" class=\"data row6 col0\" ></td>\n", " <td id=\"T_8dfd8_row6_col1\" class=\"data row6 col1\" ></td>\n", " <td id=\"T_8dfd8_row6_col2\" class=\"data row6 col2\" >1</td>\n", " <td id=\"T_8dfd8_row6_col3\" class=\"data row6 col3\" >non-single (divorced / separated / married / widowed)</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row7_col0\" class=\"data row7 col0\" >Age</td>\n", " <td id=\"T_8dfd8_row7_col1\" class=\"data row7 col1\" >numerical</td>\n", " <td id=\"T_8dfd8_row7_col2\" class=\"data row7 col2\" >Integer</td>\n", " <td id=\"T_8dfd8_row7_col3\" class=\"data row7 col3\" >The age of the customer in years, calculated as 
current year minus the year of birth of the customer at the time of creation of the dataset</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row8_col0\" class=\"data row8 col0\" ></td>\n", " <td id=\"T_8dfd8_row8_col1\" class=\"data row8 col1\" ></td>\n", " <td id=\"T_8dfd8_row8_col2\" class=\"data row8 col2\" >18</td>\n", " <td id=\"T_8dfd8_row8_col3\" class=\"data row8 col3\" >Min value (the lowest age observed in the dataset)</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row9_col0\" class=\"data row9 col0\" ></td>\n", " <td id=\"T_8dfd8_row9_col1\" class=\"data row9 col1\" ></td>\n", " <td id=\"T_8dfd8_row9_col2\" class=\"data row9 col2\" >76</td>\n", " <td id=\"T_8dfd8_row9_col3\" class=\"data row9 col3\" >Max value (the highest age observed in the dataset)</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row10_col0\" class=\"data row10 col0\" >Education</td>\n", " <td id=\"T_8dfd8_row10_col1\" class=\"data row10 col1\" >categorical</td>\n", " <td id=\"T_8dfd8_row10_col2\" class=\"data row10 col2\" >{0,1,2,3}</td>\n", " <td id=\"T_8dfd8_row10_col3\" class=\"data row10 col3\" >Level of education of the customer</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row11_col0\" class=\"data row11 col0\" ></td>\n", " <td id=\"T_8dfd8_row11_col1\" class=\"data row11 col1\" ></td>\n", " <td id=\"T_8dfd8_row11_col2\" class=\"data row11 col2\" >0</td>\n", " <td id=\"T_8dfd8_row11_col3\" class=\"data row11 col3\" >other / unknown</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row12_col0\" class=\"data row12 col0\" ></td>\n", " <td id=\"T_8dfd8_row12_col1\" class=\"data row12 col1\" ></td>\n", " <td id=\"T_8dfd8_row12_col2\" class=\"data row12 col2\" >1</td>\n", " <td id=\"T_8dfd8_row12_col3\" class=\"data row12 col3\" >high school</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row13_col0\" class=\"data row13 col0\" ></td>\n", " <td id=\"T_8dfd8_row13_col1\" class=\"data row13 col1\" ></td>\n", " <td id=\"T_8dfd8_row13_col2\" class=\"data row13 col2\" 
>2</td>\n", " <td id=\"T_8dfd8_row13_col3\" class=\"data row13 col3\" >university</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row14_col0\" class=\"data row14 col0\" ></td>\n", " <td id=\"T_8dfd8_row14_col1\" class=\"data row14 col1\" ></td>\n", " <td id=\"T_8dfd8_row14_col2\" class=\"data row14 col2\" >3</td>\n", " <td id=\"T_8dfd8_row14_col3\" class=\"data row14 col3\" >graduate school</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row15_col0\" class=\"data row15 col0\" >Income</td>\n", " <td id=\"T_8dfd8_row15_col1\" class=\"data row15 col1\" >numerical</td>\n", " <td id=\"T_8dfd8_row15_col2\" class=\"data row15 col2\" >Real</td>\n", " <td id=\"T_8dfd8_row15_col3\" class=\"data row15 col3\" >Self-reported annual income in US dollars of the customer.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row16_col0\" class=\"data row16 col0\" ></td>\n", " <td id=\"T_8dfd8_row16_col1\" class=\"data row16 col1\" ></td>\n", " <td id=\"T_8dfd8_row16_col2\" class=\"data row16 col2\" >35832</td>\n", " <td id=\"T_8dfd8_row16_col3\" class=\"data row16 col3\" >Min value (the lowest income observed in the dataset)</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row17_col0\" class=\"data row17 col0\" ></td>\n", " <td id=\"T_8dfd8_row17_col1\" class=\"data row17 col1\" ></td>\n", " <td id=\"T_8dfd8_row17_col2\" class=\"data row17 col2\" >309364</td>\n", " <td id=\"T_8dfd8_row17_col3\" class=\"data row17 col3\" >Max value (the highest income observed in the dataset)</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row18_col0\" class=\"data row18 col0\" >Occupation</td>\n", " <td id=\"T_8dfd8_row18_col1\" class=\"data row18 col1\" >categorical</td>\n", " <td id=\"T_8dfd8_row18_col2\" class=\"data row18 col2\" >{0,1,2}</td>\n", " <td id=\"T_8dfd8_row18_col3\" class=\"data row18 col3\" >Category of occupation of the customer.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row19_col0\" class=\"data row19 col0\" ></td>\n", " <td id=\"T_8dfd8_row19_col1\" 
class=\"data row19 col1\" ></td>\n", " <td id=\"T_8dfd8_row19_col2\" class=\"data row19 col2\" >0</td>\n", " <td id=\"T_8dfd8_row19_col3\" class=\"data row19 col3\" >unemployed / unskilled</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row20_col0\" class=\"data row20 col0\" ></td>\n", " <td id=\"T_8dfd8_row20_col1\" class=\"data row20 col1\" ></td>\n", " <td id=\"T_8dfd8_row20_col2\" class=\"data row20 col2\" >1</td>\n", " <td id=\"T_8dfd8_row20_col3\" class=\"data row20 col3\" >skilled employee / official</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row21_col0\" class=\"data row21 col0\" ></td>\n", " <td id=\"T_8dfd8_row21_col1\" class=\"data row21 col1\" ></td>\n", " <td id=\"T_8dfd8_row21_col2\" class=\"data row21 col2\" >2</td>\n", " <td id=\"T_8dfd8_row21_col3\" class=\"data row21 col3\" >management / self-employed / highly qualified employee / officer</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row22_col0\" class=\"data row22 col0\" >Settlement size</td>\n", " <td id=\"T_8dfd8_row22_col1\" class=\"data row22 col1\" >categorical</td>\n", " <td id=\"T_8dfd8_row22_col2\" class=\"data row22 col2\" >{0,1,2}</td>\n", " <td id=\"T_8dfd8_row22_col3\" class=\"data row22 col3\" >The size of the city that the customer lives in.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row23_col0\" class=\"data row23 col0\" ></td>\n", " <td id=\"T_8dfd8_row23_col1\" class=\"data row23 col1\" ></td>\n", " <td id=\"T_8dfd8_row23_col2\" class=\"data row23 col2\" >0</td>\n", " <td id=\"T_8dfd8_row23_col3\" class=\"data row23 col3\" >small city</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row24_col0\" class=\"data row24 col0\" ></td>\n", " <td id=\"T_8dfd8_row24_col1\" class=\"data row24 col1\" ></td>\n", " <td id=\"T_8dfd8_row24_col2\" class=\"data row24 col2\" >1</td>\n", " <td id=\"T_8dfd8_row24_col3\" class=\"data row24 col3\" >mid-sized city</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_8dfd8_row25_col0\" class=\"data row25 col0\" ></td>\n", " <td 
id=\"T_8dfd8_row25_col1\" class=\"data row25 col1\" ></td>\n", " <td id=\"T_8dfd8_row25_col2\" class=\"data row25 col2\" >2</td>\n", " <td id=\"T_8dfd8_row25_col3\" class=\"data row25 col3\" >big city</td>\n", " </tr>\n", " </tbody>\n", "</table>\n" ], "text/plain": [ "<pandas.io.formats.style.Styler at 0x12ff941a61c0>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option('display.max_colwidth', 0)\n", "df_legend=pd.read_excel(\"segmentation data legend.xlsx\", skiprows=2, header=1).dropna(how=\"all\", axis=1).dropna(how=\"all\", axis=0).fillna(\"\")\n", "df_legend.style.hide_index()" ] }, { "cell_type": "markdown", "id": "58d1c5fe-e9f1-4df2-864f-2b62d2a7bae0", "metadata": {}, "source": [ "### Purchase History" ] }, { "cell_type": "code", "execution_count": null, "id": "c7bd294f-0f6b-4701-a8ad-21718fb13f2c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>ID</th>\n", " <th>Day</th>\n", " <th>Incidence</th>\n", " <th>Brand</th>\n", " <th>Quantity</th>\n", " <th>Last_Inc_Brand</th>\n", " <th>Last_Inc_Quantity</th>\n", " <th>Price_1</th>\n", " <th>Price_2</th>\n", " <th>Price_3</th>\n", " <th>...</th>\n", " <th>Promotion_3</th>\n", " <th>Promotion_4</th>\n", " <th>Promotion_5</th>\n", " <th>Sex</th>\n", " <th>Marital status</th>\n", " <th>Age</th>\n", " <th>Education</th>\n", " <th>Income</th>\n", " <th>Occupation</th>\n", " <th>Settlement size</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>200000001</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " 
<td>0</td>\n", " <td>0</td>\n", " <td>1.59</td>\n", " <td>1.87</td>\n", " <td>2.01</td>\n", " <td>...</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>47</td>\n", " <td>1</td>\n", " <td>110866</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>200000001</td>\n", " <td>11</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1.51</td>\n", " <td>1.89</td>\n", " <td>1.99</td>\n", " <td>...</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>47</td>\n", " <td>1</td>\n", " <td>110866</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>200000001</td>\n", " <td>12</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1.51</td>\n", " <td>1.89</td>\n", " <td>1.99</td>\n", " <td>...</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>47</td>\n", " <td>1</td>\n", " <td>110866</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>200000001</td>\n", " <td>16</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1.52</td>\n", " <td>1.89</td>\n", " <td>1.98</td>\n", " <td>...</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>47</td>\n", " <td>1</td>\n", " <td>110866</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>200000001</td>\n", " <td>18</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1.52</td>\n", " <td>1.89</td>\n", " <td>1.99</td>\n", " <td>...</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>47</td>\n", " <td>1</td>\n", " <td>110866</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " 
</tbody>\n", "</table>\n", "<p>5 rows × 24 columns</p>\n", "</div>" ], "text/plain": [ " ID Day Incidence Brand Quantity Last_Inc_Brand \\\n", "0 200000001 1 0 0 0 0 \n", "1 200000001 11 0 0 0 0 \n", "2 200000001 12 0 0 0 0 \n", "3 200000001 16 0 0 0 0 \n", "4 200000001 18 0 0 0 0 \n", "\n", " Last_Inc_Quantity Price_1 Price_2 Price_3 ... Promotion_3 \\\n", "0 0 1.59 1.87 2.01 ... 0 \n", "1 0 1.51 1.89 1.99 ... 0 \n", "2 0 1.51 1.89 1.99 ... 0 \n", "3 0 1.52 1.89 1.98 ... 0 \n", "4 0 1.52 1.89 1.99 ... 0 \n", "\n", " Promotion_4 Promotion_5 Sex Marital status Age Education Income \\\n", "0 0 0 0 0 47 1 110866 \n", "1 0 0 0 0 47 1 110866 \n", "2 0 0 0 0 47 1 110866 \n", "3 0 0 0 0 47 1 110866 \n", "4 0 0 0 0 47 1 110866 \n", "\n", " Occupation Settlement size \n", "0 1 0 \n", "1 1 0 \n", "2 1 0 \n", "3 1 0 \n", "4 1 0 \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"purchase data.csv\"); df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "43601ba8-fc2f-4f62-9b4f-ae38ab3ad98c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(58693, 24)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "a842319a-405b-402b-9b38-60cd006451fd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ID 500\n", "Day 730\n", "Incidence 2 \n", "Brand 6 \n", "Quantity 16 \n", "Last_Inc_Brand 6 \n", "Last_Inc_Quantity 2 \n", "Price_1 37 \n", "Price_2 30 \n", "Price_3 21 \n", "Price_4 26 \n", "Price_5 44 \n", "Promotion_1 2 \n", "Promotion_2 2 \n", "Promotion_3 2 \n", "Promotion_4 2 \n", "Promotion_5 2 \n", "Sex 2 \n", "Marital status 2 \n", "Age 56 \n", "Education 4 \n", "Income 499\n", "Occupation 3 \n", "Settlement size 3 \n", "dtype: int64" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"df.nunique()" ] }, { "cell_type": "code", "execution_count": null, "id": "69701739-019d-45c5-92ce-1275f34b9c19", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<style type=\"text/css\">\n", "</style>\n", "<table id=\"T_24f5f_\">\n", " <thead>\n", " <tr>\n", " <th class=\"col_heading level0 col0\" >Variable</th>\n", " <th class=\"col_heading level0 col1\" >Data type</th>\n", " <th class=\"col_heading level0 col2\" >Range</th>\n", " <th class=\"col_heading level0 col3\" >Description</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <td id=\"T_24f5f_row0_col0\" class=\"data row0 col0\" >ID</td>\n", " <td id=\"T_24f5f_row0_col1\" class=\"data row0 col1\" >numerical</td>\n", " <td id=\"T_24f5f_row0_col2\" class=\"data row0 col2\" >Integer</td>\n", " <td id=\"T_24f5f_row0_col3\" class=\"data row0 col3\" >Shows a unique identificator of a customer.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row1_col0\" class=\"data row1 col0\" >Day</td>\n", " <td id=\"T_24f5f_row1_col1\" class=\"data row1 col1\" >numerical</td>\n", " <td id=\"T_24f5f_row1_col2\" class=\"data row1 col2\" >Integer</td>\n", " <td id=\"T_24f5f_row1_col3\" class=\"data row1 col3\" >Day when the customer has visited the store </td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row2_col0\" class=\"data row2 col0\" >Incidence</td>\n", " <td id=\"T_24f5f_row2_col1\" class=\"data row2 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row2_col2\" class=\"data row2 col2\" >{0,1}</td>\n", " <td id=\"T_24f5f_row2_col3\" class=\"data row2 col3\" >Purchase Incidence</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row3_col0\" class=\"data row3 col0\" ></td>\n", " <td id=\"T_24f5f_row3_col1\" class=\"data row3 col1\" ></td>\n", " <td id=\"T_24f5f_row3_col2\" class=\"data row3 col2\" >0</td>\n", " <td id=\"T_24f5f_row3_col3\" class=\"data row3 col3\" >The customer has not purchased an item from the category of interest </td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row4_col0\" 
class=\"data row4 col0\" ></td>\n", " <td id=\"T_24f5f_row4_col1\" class=\"data row4 col1\" ></td>\n", " <td id=\"T_24f5f_row4_col2\" class=\"data row4 col2\" >1</td>\n", " <td id=\"T_24f5f_row4_col3\" class=\"data row4 col3\" >The customer has purchased an item from the category of interest </td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row5_col0\" class=\"data row5 col0\" >Brand</td>\n", " <td id=\"T_24f5f_row5_col1\" class=\"data row5 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row5_col2\" class=\"data row5 col2\" >{0,1,2,3,4,5}</td>\n", " <td id=\"T_24f5f_row5_col3\" class=\"data row5 col3\" >Shows which brand the customer has purchased</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row6_col0\" class=\"data row6 col0\" ></td>\n", " <td id=\"T_24f5f_row6_col1\" class=\"data row6 col1\" ></td>\n", " <td id=\"T_24f5f_row6_col2\" class=\"data row6 col2\" >0</td>\n", " <td id=\"T_24f5f_row6_col3\" class=\"data row6 col3\" >No brand was purchased</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row7_col0\" class=\"data row7 col0\" ></td>\n", " <td id=\"T_24f5f_row7_col1\" class=\"data row7 col1\" ></td>\n", " <td id=\"T_24f5f_row7_col2\" class=\"data row7 col2\" >1,2,3,4,5</td>\n", " <td id=\"T_24f5f_row7_col3\" class=\"data row7 col3\" >Brand ID</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row8_col0\" class=\"data row8 col0\" >Quantity</td>\n", " <td id=\"T_24f5f_row8_col1\" class=\"data row8 col1\" >numerical</td>\n", " <td id=\"T_24f5f_row8_col2\" class=\"data row8 col2\" >integer</td>\n", " <td id=\"T_24f5f_row8_col3\" class=\"data row8 col3\" >Number of items bought by the customer from the product category of interest</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row9_col0\" class=\"data row9 col0\" >Last_Inc_Brand</td>\n", " <td id=\"T_24f5f_row9_col1\" class=\"data row9 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row9_col2\" class=\"data row9 col2\" >{0,1,2,3,4,5}</td>\n", " <td id=\"T_24f5f_row9_col3\" class=\"data row9 col3\" 
>Shows which brand the customer has purchased on their previous store visit</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row10_col0\" class=\"data row10 col0\" ></td>\n", " <td id=\"T_24f5f_row10_col1\" class=\"data row10 col1\" ></td>\n", " <td id=\"T_24f5f_row10_col2\" class=\"data row10 col2\" >0</td>\n", " <td id=\"T_24f5f_row10_col3\" class=\"data row10 col3\" >No brand was purchased</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row11_col0\" class=\"data row11 col0\" ></td>\n", " <td id=\"T_24f5f_row11_col1\" class=\"data row11 col1\" ></td>\n", " <td id=\"T_24f5f_row11_col2\" class=\"data row11 col2\" >1,2,3,4,5</td>\n", " <td id=\"T_24f5f_row11_col3\" class=\"data row11 col3\" >Brand ID</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row12_col0\" class=\"data row12 col0\" >Last_Inc_Quantity</td>\n", " <td id=\"T_24f5f_row12_col1\" class=\"data row12 col1\" >numerical</td>\n", " <td id=\"T_24f5f_row12_col2\" class=\"data row12 col2\" >integer</td>\n", " <td id=\"T_24f5f_row12_col3\" class=\"data row12 col3\" >Number of items bought by the customer from the product category of interest during their previous store visit</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row13_col0\" class=\"data row13 col0\" >Price_1</td>\n", " <td id=\"T_24f5f_row13_col1\" class=\"data row13 col1\" >numerical</td>\n", " <td id=\"T_24f5f_row13_col2\" class=\"data row13 col2\" >real</td>\n", " <td id=\"T_24f5f_row13_col3\" class=\"data row13 col3\" >Price of an item from Brand 1 on a particular day</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row14_col0\" class=\"data row14 col0\" >Price_2</td>\n", " <td id=\"T_24f5f_row14_col1\" class=\"data row14 col1\" >numerical</td>\n", " <td id=\"T_24f5f_row14_col2\" class=\"data row14 col2\" >real</td>\n", " <td id=\"T_24f5f_row14_col3\" class=\"data row14 col3\" >Price of an item from Brand 2 on a particular day</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row15_col0\" class=\"data row15 col0\" >Price_3</td>\n", 
" <td id=\"T_24f5f_row15_col1\" class=\"data row15 col1\" >numerical</td>\n", " <td id=\"T_24f5f_row15_col2\" class=\"data row15 col2\" >real</td>\n", " <td id=\"T_24f5f_row15_col3\" class=\"data row15 col3\" >Price of an item from Brand 3 on a particular day</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row16_col0\" class=\"data row16 col0\" >Price_4</td>\n", " <td id=\"T_24f5f_row16_col1\" class=\"data row16 col1\" >numerical</td>\n", " <td id=\"T_24f5f_row16_col2\" class=\"data row16 col2\" >real</td>\n", " <td id=\"T_24f5f_row16_col3\" class=\"data row16 col3\" >Price of an item from Brand 4 on a particular day</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row17_col0\" class=\"data row17 col0\" >Price_5</td>\n", " <td id=\"T_24f5f_row17_col1\" class=\"data row17 col1\" >numerical</td>\n", " <td id=\"T_24f5f_row17_col2\" class=\"data row17 col2\" >real</td>\n", " <td id=\"T_24f5f_row17_col3\" class=\"data row17 col3\" >Price of an item from Brand 5 on a particular day</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row18_col0\" class=\"data row18 col0\" >Promotion_1</td>\n", " <td id=\"T_24f5f_row18_col1\" class=\"data row18 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row18_col2\" class=\"data row18 col2\" >{0,1}</td>\n", " <td id=\"T_24f5f_row18_col3\" class=\"data row18 col3\" >Indicator whether Brand 1 was on promotion or not on a particular day</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row19_col0\" class=\"data row19 col0\" ></td>\n", " <td id=\"T_24f5f_row19_col1\" class=\"data row19 col1\" ></td>\n", " <td id=\"T_24f5f_row19_col2\" class=\"data row19 col2\" >0</td>\n", " <td id=\"T_24f5f_row19_col3\" class=\"data row19 col3\" >There is no promotion</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row20_col0\" class=\"data row20 col0\" ></td>\n", " <td id=\"T_24f5f_row20_col1\" class=\"data row20 col1\" ></td>\n", " <td id=\"T_24f5f_row20_col2\" class=\"data row20 col2\" >1</td>\n", " <td id=\"T_24f5f_row20_col3\" class=\"data 
row20 col3\" >There is promotion</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row21_col0\" class=\"data row21 col0\" >Promotion_2</td>\n", " <td id=\"T_24f5f_row21_col1\" class=\"data row21 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row21_col2\" class=\"data row21 col2\" >{0,1}</td>\n", " <td id=\"T_24f5f_row21_col3\" class=\"data row21 col3\" >Indicator of whether Brand 2 was on promotion or not on a particular day</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row22_col0\" class=\"data row22 col0\" ></td>\n", " <td id=\"T_24f5f_row22_col1\" class=\"data row22 col1\" ></td>\n", " <td id=\"T_24f5f_row22_col2\" class=\"data row22 col2\" >0</td>\n", " <td id=\"T_24f5f_row22_col3\" class=\"data row22 col3\" >There is no promotion</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row23_col0\" class=\"data row23 col0\" ></td>\n", " <td id=\"T_24f5f_row23_col1\" class=\"data row23 col1\" ></td>\n", " <td id=\"T_24f5f_row23_col2\" class=\"data row23 col2\" >1</td>\n", " <td id=\"T_24f5f_row23_col3\" class=\"data row23 col3\" >There is promotion</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row24_col0\" class=\"data row24 col0\" >Promotion_3</td>\n", " <td id=\"T_24f5f_row24_col1\" class=\"data row24 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row24_col2\" class=\"data row24 col2\" >{0,1}</td>\n", " <td id=\"T_24f5f_row24_col3\" class=\"data row24 col3\" >Indicator of whether Brand 3 was on promotion or not on a particular day</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row25_col0\" class=\"data row25 col0\" ></td>\n", " <td id=\"T_24f5f_row25_col1\" class=\"data row25 col1\" ></td>\n", " <td id=\"T_24f5f_row25_col2\" class=\"data row25 col2\" >0</td>\n", " <td id=\"T_24f5f_row25_col3\" class=\"data row25 col3\" >There is no promotion</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row26_col0\" class=\"data row26 col0\" ></td>\n", " <td id=\"T_24f5f_row26_col1\" class=\"data row26 col1\" ></td>\n", " <td id=\"T_24f5f_row26_col2\" 
class=\"data row26 col2\" >1</td>\n", " <td id=\"T_24f5f_row26_col3\" class=\"data row26 col3\" >There is promotion</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row27_col0\" class=\"data row27 col0\" >Promotion_4</td>\n", " <td id=\"T_24f5f_row27_col1\" class=\"data row27 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row27_col2\" class=\"data row27 col2\" >{0,1}</td>\n", " <td id=\"T_24f5f_row27_col3\" class=\"data row27 col3\" >Indicator of whether Brand 4 was on promotion or not on a particular day</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row28_col0\" class=\"data row28 col0\" ></td>\n", " <td id=\"T_24f5f_row28_col1\" class=\"data row28 col1\" ></td>\n", " <td id=\"T_24f5f_row28_col2\" class=\"data row28 col2\" >0</td>\n", " <td id=\"T_24f5f_row28_col3\" class=\"data row28 col3\" >There is no promotion</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row29_col0\" class=\"data row29 col0\" ></td>\n", " <td id=\"T_24f5f_row29_col1\" class=\"data row29 col1\" ></td>\n", " <td id=\"T_24f5f_row29_col2\" class=\"data row29 col2\" >1</td>\n", " <td id=\"T_24f5f_row29_col3\" class=\"data row29 col3\" >There is promotion</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row30_col0\" class=\"data row30 col0\" >Promotion_5</td>\n", " <td id=\"T_24f5f_row30_col1\" class=\"data row30 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row30_col2\" class=\"data row30 col2\" >{0,1}</td>\n", " <td id=\"T_24f5f_row30_col3\" class=\"data row30 col3\" >Indicator of whether Brand 5 was on promotion or not on a particular day</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row31_col0\" class=\"data row31 col0\" ></td>\n", " <td id=\"T_24f5f_row31_col1\" class=\"data row31 col1\" ></td>\n", " <td id=\"T_24f5f_row31_col2\" class=\"data row31 col2\" >0</td>\n", " <td id=\"T_24f5f_row31_col3\" class=\"data row31 col3\" >There is no promotion</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row32_col0\" class=\"data row32 col0\" ></td>\n", " <td 
id=\"T_24f5f_row32_col1\" class=\"data row32 col1\" ></td>\n", " <td id=\"T_24f5f_row32_col2\" class=\"data row32 col2\" >1</td>\n", " <td id=\"T_24f5f_row32_col3\" class=\"data row32 col3\" >There is promotion</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row33_col0\" class=\"data row33 col0\" >Sex</td>\n", " <td id=\"T_24f5f_row33_col1\" class=\"data row33 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row33_col2\" class=\"data row33 col2\" >{0,1}</td>\n", " <td id=\"T_24f5f_row33_col3\" class=\"data row33 col3\" >Biological sex (gender) of a customer. In this dataset there are only 2 different options.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row34_col0\" class=\"data row34 col0\" ></td>\n", " <td id=\"T_24f5f_row34_col1\" class=\"data row34 col1\" ></td>\n", " <td id=\"T_24f5f_row34_col2\" class=\"data row34 col2\" >0</td>\n", " <td id=\"T_24f5f_row34_col3\" class=\"data row34 col3\" >male</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row35_col0\" class=\"data row35 col0\" ></td>\n", " <td id=\"T_24f5f_row35_col1\" class=\"data row35 col1\" ></td>\n", " <td id=\"T_24f5f_row35_col2\" class=\"data row35 col2\" >1</td>\n", " <td id=\"T_24f5f_row35_col3\" class=\"data row35 col3\" >female</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row36_col0\" class=\"data row36 col0\" >Marital status</td>\n", " <td id=\"T_24f5f_row36_col1\" class=\"data row36 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row36_col2\" class=\"data row36 col2\" >{0,1}</td>\n", " <td id=\"T_24f5f_row36_col3\" class=\"data row36 col3\" >Marital status of a customer.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row37_col0\" class=\"data row37 col0\" ></td>\n", " <td id=\"T_24f5f_row37_col1\" class=\"data row37 col1\" ></td>\n", " <td id=\"T_24f5f_row37_col2\" class=\"data row37 col2\" >0</td>\n", " <td id=\"T_24f5f_row37_col3\" class=\"data row37 col3\" >single</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row38_col0\" class=\"data row38 col0\" ></td>\n", " <td 
id=\"T_24f5f_row38_col1\" class=\"data row38 col1\" ></td>\n", " <td id=\"T_24f5f_row38_col2\" class=\"data row38 col2\" >1</td>\n", " <td id=\"T_24f5f_row38_col3\" class=\"data row38 col3\" >non-single (divorced / separated / married / widowed)</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row39_col0\" class=\"data row39 col0\" >Age</td>\n", " <td id=\"T_24f5f_row39_col1\" class=\"data row39 col1\" >numerical</td>\n", " <td id=\"T_24f5f_row39_col2\" class=\"data row39 col2\" >Integer</td>\n", " <td id=\"T_24f5f_row39_col3\" class=\"data row39 col3\" >The age of the customer in years, calculated as current year minus the year of birth of the customer at the time of creation of the dataset</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row40_col0\" class=\"data row40 col0\" ></td>\n", " <td id=\"T_24f5f_row40_col1\" class=\"data row40 col1\" ></td>\n", " <td id=\"T_24f5f_row40_col2\" class=\"data row40 col2\" >18</td>\n", " <td id=\"T_24f5f_row40_col3\" class=\"data row40 col3\" >Min value (the lowest age observed in the dataset)</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row41_col0\" class=\"data row41 col0\" ></td>\n", " <td id=\"T_24f5f_row41_col1\" class=\"data row41 col1\" ></td>\n", " <td id=\"T_24f5f_row41_col2\" class=\"data row41 col2\" >75</td>\n", " <td id=\"T_24f5f_row41_col3\" class=\"data row41 col3\" >Max value (the highest age observed in the dataset)</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row42_col0\" class=\"data row42 col0\" >Education</td>\n", " <td id=\"T_24f5f_row42_col1\" class=\"data row42 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row42_col2\" class=\"data row42 col2\" >{0,1,2,3}</td>\n", " <td id=\"T_24f5f_row42_col3\" class=\"data row42 col3\" >Level of education of the customer</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row43_col0\" class=\"data row43 col0\" ></td>\n", " <td id=\"T_24f5f_row43_col1\" class=\"data row43 col1\" ></td>\n", " <td id=\"T_24f5f_row43_col2\" class=\"data row43 col2\" 
>0</td>\n", " <td id=\"T_24f5f_row43_col3\" class=\"data row43 col3\" >other / unknown</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row44_col0\" class=\"data row44 col0\" ></td>\n", " <td id=\"T_24f5f_row44_col1\" class=\"data row44 col1\" ></td>\n", " <td id=\"T_24f5f_row44_col2\" class=\"data row44 col2\" >1</td>\n", " <td id=\"T_24f5f_row44_col3\" class=\"data row44 col3\" >high school</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row45_col0\" class=\"data row45 col0\" ></td>\n", " <td id=\"T_24f5f_row45_col1\" class=\"data row45 col1\" ></td>\n", " <td id=\"T_24f5f_row45_col2\" class=\"data row45 col2\" >2</td>\n", " <td id=\"T_24f5f_row45_col3\" class=\"data row45 col3\" >university</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row46_col0\" class=\"data row46 col0\" ></td>\n", " <td id=\"T_24f5f_row46_col1\" class=\"data row46 col1\" ></td>\n", " <td id=\"T_24f5f_row46_col2\" class=\"data row46 col2\" >3</td>\n", " <td id=\"T_24f5f_row46_col3\" class=\"data row46 col3\" >graduate school</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row47_col0\" class=\"data row47 col0\" >Income</td>\n", " <td id=\"T_24f5f_row47_col1\" class=\"data row47 col1\" >numerical</td>\n", " <td id=\"T_24f5f_row47_col2\" class=\"data row47 col2\" >real</td>\n", " <td id=\"T_24f5f_row47_col3\" class=\"data row47 col3\" >Self-reported annual income in US dollars of the customer.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row48_col0\" class=\"data row48 col0\" ></td>\n", " <td id=\"T_24f5f_row48_col1\" class=\"data row48 col1\" ></td>\n", " <td id=\"T_24f5f_row48_col2\" class=\"data row48 col2\" >38247</td>\n", " <td id=\"T_24f5f_row48_col3\" class=\"data row48 col3\" >Min value (the lowest income observed in the dataset)</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row49_col0\" class=\"data row49 col0\" ></td>\n", " <td id=\"T_24f5f_row49_col1\" class=\"data row49 col1\" ></td>\n", " <td id=\"T_24f5f_row49_col2\" class=\"data row49 col2\" 
>309364</td>\n", " <td id=\"T_24f5f_row49_col3\" class=\"data row49 col3\" >Max value (the highest income observed in the dataset)</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row50_col0\" class=\"data row50 col0\" >Occupation</td>\n", " <td id=\"T_24f5f_row50_col1\" class=\"data row50 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row50_col2\" class=\"data row50 col2\" >{0,1,2}</td>\n", " <td id=\"T_24f5f_row50_col3\" class=\"data row50 col3\" >Category of occupation of the customer.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row51_col0\" class=\"data row51 col0\" ></td>\n", " <td id=\"T_24f5f_row51_col1\" class=\"data row51 col1\" ></td>\n", " <td id=\"T_24f5f_row51_col2\" class=\"data row51 col2\" >0</td>\n", " <td id=\"T_24f5f_row51_col3\" class=\"data row51 col3\" >unemployed / unskilled</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row52_col0\" class=\"data row52 col0\" ></td>\n", " <td id=\"T_24f5f_row52_col1\" class=\"data row52 col1\" ></td>\n", " <td id=\"T_24f5f_row52_col2\" class=\"data row52 col2\" >1</td>\n", " <td id=\"T_24f5f_row52_col3\" class=\"data row52 col3\" >skilled employee / official</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row53_col0\" class=\"data row53 col0\" ></td>\n", " <td id=\"T_24f5f_row53_col1\" class=\"data row53 col1\" ></td>\n", " <td id=\"T_24f5f_row53_col2\" class=\"data row53 col2\" >2</td>\n", " <td id=\"T_24f5f_row53_col3\" class=\"data row53 col3\" >management / self-employed / highly qualified employee / officer</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row54_col0\" class=\"data row54 col0\" >Settlement size</td>\n", " <td id=\"T_24f5f_row54_col1\" class=\"data row54 col1\" >categorical</td>\n", " <td id=\"T_24f5f_row54_col2\" class=\"data row54 col2\" >{0,1,2}</td>\n", " <td id=\"T_24f5f_row54_col3\" class=\"data row54 col3\" >The size of the city that the customer lives in.</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row55_col0\" class=\"data row55 col0\" ></td>\n", " <td 
id=\"T_24f5f_row55_col1\" class=\"data row55 col1\" ></td>\n", " <td id=\"T_24f5f_row55_col2\" class=\"data row55 col2\" >0</td>\n", " <td id=\"T_24f5f_row55_col3\" class=\"data row55 col3\" >small city</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row56_col0\" class=\"data row56 col0\" ></td>\n", " <td id=\"T_24f5f_row56_col1\" class=\"data row56 col1\" ></td>\n", " <td id=\"T_24f5f_row56_col2\" class=\"data row56 col2\" >1</td>\n", " <td id=\"T_24f5f_row56_col3\" class=\"data row56 col3\" >mid-sized city</td>\n", " </tr>\n", " <tr>\n", " <td id=\"T_24f5f_row57_col0\" class=\"data row57 col0\" ></td>\n", " <td id=\"T_24f5f_row57_col1\" class=\"data row57 col1\" ></td>\n", " <td id=\"T_24f5f_row57_col2\" class=\"data row57 col2\" >2</td>\n", " <td id=\"T_24f5f_row57_col3\" class=\"data row57 col3\" >big city</td>\n", " </tr>\n", " </tbody>\n", "</table>\n" ], "text/plain": [ "<pandas.io.formats.style.Styler at 0x12ff91492790>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option('display.max_colwidth', 0)\n", "# Load the legend sheet, drop fully-empty columns and rows, and blank out the remaining NaNs for display.\n", "df_legend=pd.read_excel(\"purchase data legend.xlsx\", skiprows=2, header=1).dropna(how=\"all\", axis=1).dropna(how=\"all\", axis=0).fillna(\"\")\n", "# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;\n", "# use Styler.hide(axis='index') when available and fall back on older pandas.\n", "legend_styler = df_legend.style\n", "legend_styler.hide(axis='index') if hasattr(legend_styler, 'hide') else legend_styler.hide_index()" ] }, { "cell_type": "code", "execution_count": null, "id": "567a19ba-3603-49cb-a8f3-248ab96617b1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ID 0\n", "Day 0\n", "Incidence 0\n", "Brand 0\n", "Quantity 0\n", "Last_Inc_Brand 0\n", "Last_Inc_Quantity 0\n", "Price_1 0\n", "Price_2 0\n", "Price_3 0\n", "Price_4 0\n", "Price_5 0\n", "Promotion_1 0\n", "Promotion_2 0\n", "Promotion_3 0\n", "Promotion_4 0\n", "Promotion_5 0\n", "Sex 0\n", "Marital status 0\n", "Age 0\n", "Education 0\n", "Income 0\n", "Occupation 0\n", "Settlement size 0\n", "dtype: int64" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().sum() # No missing information" ] }, { "cell_type": 
"code", "execution_count": null, "id": "36e93046-5cb1-4632-af8d-9275002b94d0", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 5 }