{
"cells": [
{
"cell_type": "markdown",
"id": "c8f169c2-e21c-40b6-b15f-7f6c849e9626",
"metadata": {},
"source": [
"# Customer Analytics"
]
},
{
"cell_type": "markdown",
"id": "4d541670-3435-46d1-aadd-a11b70fa4f04",
"metadata": {},
"source": [
"## Introduction"
]
},
{
"cell_type": "markdown",
"id": "7ce3aa9a-70da-4c94-9420-b0f39a346700",
"metadata": {},
"source": [
"What will we cover in the course?\n",
"- KYC\n",
"- Purchase Probability\n",
"- Brand Probability\n",
"- Quantity to be purchased"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d831c5ea-a7d1-4cda-a506-600750b6a63e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-05-15 08:13:50.540344: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"import seaborn \n",
"import pickle \n",
"import sklearn\n",
"import torch\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4bf4332-8f23-4bf2-9ec2-ea76adbd3815",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.cuda.is_available()"
]
},
{
"cell_type": "markdown",
"id": "b5c16ed1-5e28-4543-99cf-c9b6bf1fcbd3",
"metadata": {},
"source": [
"## Dataset"
]
},
{
"cell_type": "markdown",
"id": "32a72d4e-6932-474c-a6f8-355f2ce50aff",
"metadata": {},
"source": [
"### Demographic Data for Segmentation "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b29ea6a-02d0-4aa7-84ae-658314e1f030",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"segmentation data.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "215089ea-51f1-4ead-98e4-2d4e3d42301a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Sex</th>\n",
" <th>Marital status</th>\n",
" <th>Age</th>\n",
" <th>Education</th>\n",
" <th>Income</th>\n",
" <th>Occupation</th>\n",
" <th>Settlement size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100000001</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>67</td>\n",
" <td>2</td>\n",
" <td>124670</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>100000002</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>22</td>\n",
" <td>1</td>\n",
" <td>150773</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100000003</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>49</td>\n",
" <td>1</td>\n",
" <td>89210</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100000004</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>45</td>\n",
" <td>1</td>\n",
" <td>171565</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100000005</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>53</td>\n",
" <td>1</td>\n",
" <td>149031</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID Sex Marital status Age Education Income Occupation \\\n",
"0 100000001 0 0 67 2 124670 1 \n",
"1 100000002 1 1 22 1 150773 1 \n",
"2 100000003 0 0 49 1 89210 0 \n",
"3 100000004 0 0 45 1 171565 1 \n",
"4 100000005 0 0 53 1 149031 1 \n",
"\n",
" Settlement size \n",
"0 2 \n",
"1 2 \n",
"2 0 \n",
"3 1 \n",
"4 1 "
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c3eb7da-5fed-4af2-9de3-4d37f757c3b5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ID 2000\n",
"Sex 2\n",
"Marital status 2\n",
"Age 58\n",
"Education 4\n",
"Income 1982\n",
"Occupation 3\n",
"Settlement size 3\n",
"dtype: int64"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ca56cc5-10e7-4f25-929d-5b68816a9cf4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"</style>\n",
"<table id=\"T_8dfd8_\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"col_heading level0 col0\" >Variable</th>\n",
" <th class=\"col_heading level0 col1\" >Data type</th>\n",
" <th class=\"col_heading level0 col2\" >Range</th>\n",
" <th class=\"col_heading level0 col3\" >Description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row0_col0\" class=\"data row0 col0\" >ID</td>\n",
" <td id=\"T_8dfd8_row0_col1\" class=\"data row0 col1\" >numerical</td>\n",
" <td id=\"T_8dfd8_row0_col2\" class=\"data row0 col2\" >Integer</td>\n",
" <td id=\"T_8dfd8_row0_col3\" class=\"data row0 col3\" >Shows a unique identificator of a customer.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row1_col0\" class=\"data row1 col0\" >Sex</td>\n",
" <td id=\"T_8dfd8_row1_col1\" class=\"data row1 col1\" >categorical</td>\n",
" <td id=\"T_8dfd8_row1_col2\" class=\"data row1 col2\" >{0,1}</td>\n",
" <td id=\"T_8dfd8_row1_col3\" class=\"data row1 col3\" >Biological sex (gender) of a customer. In this dataset there are only 2 different options.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row2_col0\" class=\"data row2 col0\" ></td>\n",
" <td id=\"T_8dfd8_row2_col1\" class=\"data row2 col1\" ></td>\n",
" <td id=\"T_8dfd8_row2_col2\" class=\"data row2 col2\" >0</td>\n",
" <td id=\"T_8dfd8_row2_col3\" class=\"data row2 col3\" >male</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row3_col0\" class=\"data row3 col0\" ></td>\n",
" <td id=\"T_8dfd8_row3_col1\" class=\"data row3 col1\" ></td>\n",
" <td id=\"T_8dfd8_row3_col2\" class=\"data row3 col2\" >1</td>\n",
" <td id=\"T_8dfd8_row3_col3\" class=\"data row3 col3\" >female</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row4_col0\" class=\"data row4 col0\" >Marital status</td>\n",
" <td id=\"T_8dfd8_row4_col1\" class=\"data row4 col1\" >categorical</td>\n",
" <td id=\"T_8dfd8_row4_col2\" class=\"data row4 col2\" >{0,1}</td>\n",
" <td id=\"T_8dfd8_row4_col3\" class=\"data row4 col3\" >Marital status of a customer.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row5_col0\" class=\"data row5 col0\" ></td>\n",
" <td id=\"T_8dfd8_row5_col1\" class=\"data row5 col1\" ></td>\n",
" <td id=\"T_8dfd8_row5_col2\" class=\"data row5 col2\" >0</td>\n",
" <td id=\"T_8dfd8_row5_col3\" class=\"data row5 col3\" >single</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row6_col0\" class=\"data row6 col0\" ></td>\n",
" <td id=\"T_8dfd8_row6_col1\" class=\"data row6 col1\" ></td>\n",
" <td id=\"T_8dfd8_row6_col2\" class=\"data row6 col2\" >1</td>\n",
" <td id=\"T_8dfd8_row6_col3\" class=\"data row6 col3\" >non-single (divorced / separated / married / widowed)</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row7_col0\" class=\"data row7 col0\" >Age</td>\n",
" <td id=\"T_8dfd8_row7_col1\" class=\"data row7 col1\" >numerical</td>\n",
" <td id=\"T_8dfd8_row7_col2\" class=\"data row7 col2\" >Integer</td>\n",
" <td id=\"T_8dfd8_row7_col3\" class=\"data row7 col3\" >The age of the customer in years, calculated as current year minus the year of birth of the customer at the time of creation of the dataset</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row8_col0\" class=\"data row8 col0\" ></td>\n",
" <td id=\"T_8dfd8_row8_col1\" class=\"data row8 col1\" ></td>\n",
" <td id=\"T_8dfd8_row8_col2\" class=\"data row8 col2\" >18</td>\n",
" <td id=\"T_8dfd8_row8_col3\" class=\"data row8 col3\" >Min value (the lowest age observed in the dataset)</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row9_col0\" class=\"data row9 col0\" ></td>\n",
" <td id=\"T_8dfd8_row9_col1\" class=\"data row9 col1\" ></td>\n",
" <td id=\"T_8dfd8_row9_col2\" class=\"data row9 col2\" >76</td>\n",
" <td id=\"T_8dfd8_row9_col3\" class=\"data row9 col3\" >Max value (the highest age observed in the dataset)</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row10_col0\" class=\"data row10 col0\" >Education</td>\n",
" <td id=\"T_8dfd8_row10_col1\" class=\"data row10 col1\" >categorical</td>\n",
" <td id=\"T_8dfd8_row10_col2\" class=\"data row10 col2\" >{0,1,2,3}</td>\n",
" <td id=\"T_8dfd8_row10_col3\" class=\"data row10 col3\" >Level of education of the customer</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row11_col0\" class=\"data row11 col0\" ></td>\n",
" <td id=\"T_8dfd8_row11_col1\" class=\"data row11 col1\" ></td>\n",
" <td id=\"T_8dfd8_row11_col2\" class=\"data row11 col2\" >0</td>\n",
" <td id=\"T_8dfd8_row11_col3\" class=\"data row11 col3\" >other / unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row12_col0\" class=\"data row12 col0\" ></td>\n",
" <td id=\"T_8dfd8_row12_col1\" class=\"data row12 col1\" ></td>\n",
" <td id=\"T_8dfd8_row12_col2\" class=\"data row12 col2\" >1</td>\n",
" <td id=\"T_8dfd8_row12_col3\" class=\"data row12 col3\" >high school</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row13_col0\" class=\"data row13 col0\" ></td>\n",
" <td id=\"T_8dfd8_row13_col1\" class=\"data row13 col1\" ></td>\n",
" <td id=\"T_8dfd8_row13_col2\" class=\"data row13 col2\" >2</td>\n",
" <td id=\"T_8dfd8_row13_col3\" class=\"data row13 col3\" >university</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row14_col0\" class=\"data row14 col0\" ></td>\n",
" <td id=\"T_8dfd8_row14_col1\" class=\"data row14 col1\" ></td>\n",
" <td id=\"T_8dfd8_row14_col2\" class=\"data row14 col2\" >3</td>\n",
" <td id=\"T_8dfd8_row14_col3\" class=\"data row14 col3\" >graduate school</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row15_col0\" class=\"data row15 col0\" >Income</td>\n",
" <td id=\"T_8dfd8_row15_col1\" class=\"data row15 col1\" >numerical</td>\n",
" <td id=\"T_8dfd8_row15_col2\" class=\"data row15 col2\" >Real</td>\n",
" <td id=\"T_8dfd8_row15_col3\" class=\"data row15 col3\" >Self-reported annual income in US dollars of the customer.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row16_col0\" class=\"data row16 col0\" ></td>\n",
" <td id=\"T_8dfd8_row16_col1\" class=\"data row16 col1\" ></td>\n",
" <td id=\"T_8dfd8_row16_col2\" class=\"data row16 col2\" >35832</td>\n",
" <td id=\"T_8dfd8_row16_col3\" class=\"data row16 col3\" >Min value (the lowest income observed in the dataset)</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row17_col0\" class=\"data row17 col0\" ></td>\n",
" <td id=\"T_8dfd8_row17_col1\" class=\"data row17 col1\" ></td>\n",
" <td id=\"T_8dfd8_row17_col2\" class=\"data row17 col2\" >309364</td>\n",
" <td id=\"T_8dfd8_row17_col3\" class=\"data row17 col3\" >Max value (the highest income observed in the dataset)</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row18_col0\" class=\"data row18 col0\" >Occupation</td>\n",
" <td id=\"T_8dfd8_row18_col1\" class=\"data row18 col1\" >categorical</td>\n",
" <td id=\"T_8dfd8_row18_col2\" class=\"data row18 col2\" >{0,1,2}</td>\n",
" <td id=\"T_8dfd8_row18_col3\" class=\"data row18 col3\" >Category of occupation of the customer.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row19_col0\" class=\"data row19 col0\" ></td>\n",
" <td id=\"T_8dfd8_row19_col1\" class=\"data row19 col1\" ></td>\n",
" <td id=\"T_8dfd8_row19_col2\" class=\"data row19 col2\" >0</td>\n",
" <td id=\"T_8dfd8_row19_col3\" class=\"data row19 col3\" >unemployed / unskilled</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row20_col0\" class=\"data row20 col0\" ></td>\n",
" <td id=\"T_8dfd8_row20_col1\" class=\"data row20 col1\" ></td>\n",
" <td id=\"T_8dfd8_row20_col2\" class=\"data row20 col2\" >1</td>\n",
" <td id=\"T_8dfd8_row20_col3\" class=\"data row20 col3\" >skilled employee / official</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row21_col0\" class=\"data row21 col0\" ></td>\n",
" <td id=\"T_8dfd8_row21_col1\" class=\"data row21 col1\" ></td>\n",
" <td id=\"T_8dfd8_row21_col2\" class=\"data row21 col2\" >2</td>\n",
" <td id=\"T_8dfd8_row21_col3\" class=\"data row21 col3\" >management / self-employed / highly qualified employee / officer</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row22_col0\" class=\"data row22 col0\" >Settlement size</td>\n",
" <td id=\"T_8dfd8_row22_col1\" class=\"data row22 col1\" >categorical</td>\n",
" <td id=\"T_8dfd8_row22_col2\" class=\"data row22 col2\" >{0,1,2}</td>\n",
" <td id=\"T_8dfd8_row22_col3\" class=\"data row22 col3\" >The size of the city that the customer lives in.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row23_col0\" class=\"data row23 col0\" ></td>\n",
" <td id=\"T_8dfd8_row23_col1\" class=\"data row23 col1\" ></td>\n",
" <td id=\"T_8dfd8_row23_col2\" class=\"data row23 col2\" >0</td>\n",
" <td id=\"T_8dfd8_row23_col3\" class=\"data row23 col3\" >small city</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row24_col0\" class=\"data row24 col0\" ></td>\n",
" <td id=\"T_8dfd8_row24_col1\" class=\"data row24 col1\" ></td>\n",
" <td id=\"T_8dfd8_row24_col2\" class=\"data row24 col2\" >1</td>\n",
" <td id=\"T_8dfd8_row24_col3\" class=\"data row24 col3\" >mid-sized city</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_8dfd8_row25_col0\" class=\"data row25 col0\" ></td>\n",
" <td id=\"T_8dfd8_row25_col1\" class=\"data row25 col1\" ></td>\n",
" <td id=\"T_8dfd8_row25_col2\" class=\"data row25 col2\" >2</td>\n",
" <td id=\"T_8dfd8_row25_col3\" class=\"data row25 col3\" >big city</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x12ff941a61c0>"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show legend descriptions untruncated. None is the documented no-limit value for\n",
"# display.max_colwidth (the earlier 0 is undocumented). This option is global and\n",
"# therefore also affects every later cell.\n",
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"# Read the legend sheet (skiprows/header select the header row), then drop columns and\n",
"# rows that are entirely empty and replace the remaining NaNs with empty strings.\n",
"df_legend = (\n",
"    pd.read_excel(\"segmentation data legend.xlsx\", skiprows=2, header=1)\n",
"    .dropna(how=\"all\", axis=1)\n",
"    .dropna(how=\"all\", axis=0)\n",
"    .fillna(\"\")\n",
")\n",
"\n",
"# Render without the integer index. Styler.hide_index() was removed in pandas 2.0;\n",
"# Styler.hide(axis=\"index\") is its replacement (pandas >= 1.4), so prefer it when present.\n",
"legend_style = df_legend.style\n",
"legend_style.hide(axis=\"index\") if hasattr(legend_style, \"hide\") else legend_style.hide_index()"
]
},
{
"cell_type": "markdown",
"id": "58d1c5fe-e9f1-4df2-864f-2b62d2a7bae0",
"metadata": {},
"source": [
"### Purchase History"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7bd294f-0f6b-4701-a8ad-21718fb13f2c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Day</th>\n",
" <th>Incidence</th>\n",
" <th>Brand</th>\n",
" <th>Quantity</th>\n",
" <th>Last_Inc_Brand</th>\n",
" <th>Last_Inc_Quantity</th>\n",
" <th>Price_1</th>\n",
" <th>Price_2</th>\n",
" <th>Price_3</th>\n",
" <th>...</th>\n",
" <th>Promotion_3</th>\n",
" <th>Promotion_4</th>\n",
" <th>Promotion_5</th>\n",
" <th>Sex</th>\n",
" <th>Marital status</th>\n",
" <th>Age</th>\n",
" <th>Education</th>\n",
" <th>Income</th>\n",
" <th>Occupation</th>\n",
" <th>Settlement size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>200000001</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.59</td>\n",
" <td>1.87</td>\n",
" <td>2.01</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" <td>1</td>\n",
" <td>110866</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>200000001</td>\n",
" <td>11</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.51</td>\n",
" <td>1.89</td>\n",
" <td>1.99</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" <td>1</td>\n",
" <td>110866</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>200000001</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.51</td>\n",
" <td>1.89</td>\n",
" <td>1.99</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" <td>1</td>\n",
" <td>110866</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>200000001</td>\n",
" <td>16</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.52</td>\n",
" <td>1.89</td>\n",
" <td>1.98</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" <td>1</td>\n",
" <td>110866</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>200000001</td>\n",
" <td>18</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.52</td>\n",
" <td>1.89</td>\n",
" <td>1.99</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" <td>1</td>\n",
" <td>110866</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" ID Day Incidence Brand Quantity Last_Inc_Brand \\\n",
"0 200000001 1 0 0 0 0 \n",
"1 200000001 11 0 0 0 0 \n",
"2 200000001 12 0 0 0 0 \n",
"3 200000001 16 0 0 0 0 \n",
"4 200000001 18 0 0 0 0 \n",
"\n",
" Last_Inc_Quantity Price_1 Price_2 Price_3 ... Promotion_3 \\\n",
"0 0 1.59 1.87 2.01 ... 0 \n",
"1 0 1.51 1.89 1.99 ... 0 \n",
"2 0 1.51 1.89 1.99 ... 0 \n",
"3 0 1.52 1.89 1.98 ... 0 \n",
"4 0 1.52 1.89 1.99 ... 0 \n",
"\n",
" Promotion_4 Promotion_5 Sex Marital status Age Education Income \\\n",
"0 0 0 0 0 47 1 110866 \n",
"1 0 0 0 0 47 1 110866 \n",
"2 0 0 0 0 47 1 110866 \n",
"3 0 0 0 0 47 1 110866 \n",
"4 0 0 0 0 47 1 110866 \n",
"\n",
" Occupation Settlement size \n",
"0 1 0 \n",
"1 1 0 \n",
"2 1 0 \n",
"3 1 0 \n",
"4 1 0 \n",
"\n",
"[5 rows x 24 columns]"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the purchase-history table and preview its first rows.\n",
"# NOTE: this rebinds `df`, replacing the segmentation table loaded earlier in the notebook.\n",
"df = pd.read_csv(\"purchase data.csv\")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43601ba8-fc2f-4f62-9b4f-ae38ab3ad98c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(58693, 24)"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a842319a-405b-402b-9b38-60cd006451fd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ID 500\n",
"Day 730\n",
"Incidence 2 \n",
"Brand 6 \n",
"Quantity 16 \n",
"Last_Inc_Brand 6 \n",
"Last_Inc_Quantity 2 \n",
"Price_1 37 \n",
"Price_2 30 \n",
"Price_3 21 \n",
"Price_4 26 \n",
"Price_5 44 \n",
"Promotion_1 2 \n",
"Promotion_2 2 \n",
"Promotion_3 2 \n",
"Promotion_4 2 \n",
"Promotion_5 2 \n",
"Sex 2 \n",
"Marital status 2 \n",
"Age 56 \n",
"Education 4 \n",
"Income 499\n",
"Occupation 3 \n",
"Settlement size 3 \n",
"dtype: int64"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "69701739-019d-45c5-92ce-1275f34b9c19",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"</style>\n",
"<table id=\"T_24f5f_\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"col_heading level0 col0\" >Variable</th>\n",
" <th class=\"col_heading level0 col1\" >Data type</th>\n",
" <th class=\"col_heading level0 col2\" >Range</th>\n",
" <th class=\"col_heading level0 col3\" >Description</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td id=\"T_24f5f_row0_col0\" class=\"data row0 col0\" >ID</td>\n",
" <td id=\"T_24f5f_row0_col1\" class=\"data row0 col1\" >numerical</td>\n",
" <td id=\"T_24f5f_row0_col2\" class=\"data row0 col2\" >Integer</td>\n",
" <td id=\"T_24f5f_row0_col3\" class=\"data row0 col3\" >Shows a unique identificator of a customer.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row1_col0\" class=\"data row1 col0\" >Day</td>\n",
" <td id=\"T_24f5f_row1_col1\" class=\"data row1 col1\" >numerical</td>\n",
" <td id=\"T_24f5f_row1_col2\" class=\"data row1 col2\" >Integer</td>\n",
" <td id=\"T_24f5f_row1_col3\" class=\"data row1 col3\" >Day when the customer has visited the store </td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row2_col0\" class=\"data row2 col0\" >Incidence</td>\n",
" <td id=\"T_24f5f_row2_col1\" class=\"data row2 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row2_col2\" class=\"data row2 col2\" >{0,1}</td>\n",
" <td id=\"T_24f5f_row2_col3\" class=\"data row2 col3\" >Purchase Incidence</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row3_col0\" class=\"data row3 col0\" ></td>\n",
" <td id=\"T_24f5f_row3_col1\" class=\"data row3 col1\" ></td>\n",
" <td id=\"T_24f5f_row3_col2\" class=\"data row3 col2\" >0</td>\n",
" <td id=\"T_24f5f_row3_col3\" class=\"data row3 col3\" >The customer has not purchased an item from the category of interest </td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row4_col0\" class=\"data row4 col0\" ></td>\n",
" <td id=\"T_24f5f_row4_col1\" class=\"data row4 col1\" ></td>\n",
" <td id=\"T_24f5f_row4_col2\" class=\"data row4 col2\" >1</td>\n",
" <td id=\"T_24f5f_row4_col3\" class=\"data row4 col3\" >The customer has purchased an item from the category of interest </td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row5_col0\" class=\"data row5 col0\" >Brand</td>\n",
" <td id=\"T_24f5f_row5_col1\" class=\"data row5 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row5_col2\" class=\"data row5 col2\" >{0,1,2,3,4,5}</td>\n",
" <td id=\"T_24f5f_row5_col3\" class=\"data row5 col3\" >Shows which brand the customer has purchased</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row6_col0\" class=\"data row6 col0\" ></td>\n",
" <td id=\"T_24f5f_row6_col1\" class=\"data row6 col1\" ></td>\n",
" <td id=\"T_24f5f_row6_col2\" class=\"data row6 col2\" >0</td>\n",
" <td id=\"T_24f5f_row6_col3\" class=\"data row6 col3\" >No brand was purchased</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row7_col0\" class=\"data row7 col0\" ></td>\n",
" <td id=\"T_24f5f_row7_col1\" class=\"data row7 col1\" ></td>\n",
" <td id=\"T_24f5f_row7_col2\" class=\"data row7 col2\" >1,2,3,4,5</td>\n",
" <td id=\"T_24f5f_row7_col3\" class=\"data row7 col3\" >Brand ID</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row8_col0\" class=\"data row8 col0\" >Quantity</td>\n",
" <td id=\"T_24f5f_row8_col1\" class=\"data row8 col1\" >numerical</td>\n",
" <td id=\"T_24f5f_row8_col2\" class=\"data row8 col2\" >integer</td>\n",
" <td id=\"T_24f5f_row8_col3\" class=\"data row8 col3\" >Number of items bought by the customer from the product category of interest</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row9_col0\" class=\"data row9 col0\" >Last_Inc_Brand</td>\n",
" <td id=\"T_24f5f_row9_col1\" class=\"data row9 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row9_col2\" class=\"data row9 col2\" >{0,1,2,3,4,5}</td>\n",
" <td id=\"T_24f5f_row9_col3\" class=\"data row9 col3\" >Shows which brand the customer has purchased on their previous store visit</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row10_col0\" class=\"data row10 col0\" ></td>\n",
" <td id=\"T_24f5f_row10_col1\" class=\"data row10 col1\" ></td>\n",
" <td id=\"T_24f5f_row10_col2\" class=\"data row10 col2\" >0</td>\n",
" <td id=\"T_24f5f_row10_col3\" class=\"data row10 col3\" >No brand was purchased</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row11_col0\" class=\"data row11 col0\" ></td>\n",
" <td id=\"T_24f5f_row11_col1\" class=\"data row11 col1\" ></td>\n",
" <td id=\"T_24f5f_row11_col2\" class=\"data row11 col2\" >1,2,3,4,5</td>\n",
" <td id=\"T_24f5f_row11_col3\" class=\"data row11 col3\" >Brand ID</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row12_col0\" class=\"data row12 col0\" >Last_Inc_Quantity</td>\n",
" <td id=\"T_24f5f_row12_col1\" class=\"data row12 col1\" >numerical</td>\n",
" <td id=\"T_24f5f_row12_col2\" class=\"data row12 col2\" >integer</td>\n",
" <td id=\"T_24f5f_row12_col3\" class=\"data row12 col3\" >Number of items bought by the customer from the product category of interest during their previous store visit</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row13_col0\" class=\"data row13 col0\" >Price_1</td>\n",
" <td id=\"T_24f5f_row13_col1\" class=\"data row13 col1\" >numerical</td>\n",
" <td id=\"T_24f5f_row13_col2\" class=\"data row13 col2\" >real</td>\n",
" <td id=\"T_24f5f_row13_col3\" class=\"data row13 col3\" >Price of an item from Brand 1 on a particular day</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row14_col0\" class=\"data row14 col0\" >Price_2</td>\n",
" <td id=\"T_24f5f_row14_col1\" class=\"data row14 col1\" >numerical</td>\n",
" <td id=\"T_24f5f_row14_col2\" class=\"data row14 col2\" >real</td>\n",
" <td id=\"T_24f5f_row14_col3\" class=\"data row14 col3\" >Price of an item from Brand 2 on a particular day</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row15_col0\" class=\"data row15 col0\" >Price_3</td>\n",
" <td id=\"T_24f5f_row15_col1\" class=\"data row15 col1\" >numerical</td>\n",
" <td id=\"T_24f5f_row15_col2\" class=\"data row15 col2\" >real</td>\n",
" <td id=\"T_24f5f_row15_col3\" class=\"data row15 col3\" >Price of an item from Brand 3 on a particular day</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row16_col0\" class=\"data row16 col0\" >Price_4</td>\n",
" <td id=\"T_24f5f_row16_col1\" class=\"data row16 col1\" >numerical</td>\n",
" <td id=\"T_24f5f_row16_col2\" class=\"data row16 col2\" >real</td>\n",
" <td id=\"T_24f5f_row16_col3\" class=\"data row16 col3\" >Price of an item from Brand 4 on a particular day</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row17_col0\" class=\"data row17 col0\" >Price_5</td>\n",
" <td id=\"T_24f5f_row17_col1\" class=\"data row17 col1\" >numerical</td>\n",
" <td id=\"T_24f5f_row17_col2\" class=\"data row17 col2\" >real</td>\n",
" <td id=\"T_24f5f_row17_col3\" class=\"data row17 col3\" >Price of an item from Brand 5 on a particular day</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row18_col0\" class=\"data row18 col0\" >Promotion_1</td>\n",
" <td id=\"T_24f5f_row18_col1\" class=\"data row18 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row18_col2\" class=\"data row18 col2\" >{0,1}</td>\n",
" <td id=\"T_24f5f_row18_col3\" class=\"data row18 col3\" >Indicator whether Brand 1 was on promotion or not on a particular day</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row19_col0\" class=\"data row19 col0\" ></td>\n",
" <td id=\"T_24f5f_row19_col1\" class=\"data row19 col1\" ></td>\n",
" <td id=\"T_24f5f_row19_col2\" class=\"data row19 col2\" >0</td>\n",
" <td id=\"T_24f5f_row19_col3\" class=\"data row19 col3\" >There is no promotion</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row20_col0\" class=\"data row20 col0\" ></td>\n",
" <td id=\"T_24f5f_row20_col1\" class=\"data row20 col1\" ></td>\n",
" <td id=\"T_24f5f_row20_col2\" class=\"data row20 col2\" >1</td>\n",
" <td id=\"T_24f5f_row20_col3\" class=\"data row20 col3\" >There is promotion</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row21_col0\" class=\"data row21 col0\" >Promotion_2</td>\n",
" <td id=\"T_24f5f_row21_col1\" class=\"data row21 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row21_col2\" class=\"data row21 col2\" >{0,1}</td>\n",
" <td id=\"T_24f5f_row21_col3\" class=\"data row21 col3\" >Indicator of whether Brand 2 was on promotion or not on a particular day</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row22_col0\" class=\"data row22 col0\" ></td>\n",
" <td id=\"T_24f5f_row22_col1\" class=\"data row22 col1\" ></td>\n",
" <td id=\"T_24f5f_row22_col2\" class=\"data row22 col2\" >0</td>\n",
" <td id=\"T_24f5f_row22_col3\" class=\"data row22 col3\" >There is no promotion</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row23_col0\" class=\"data row23 col0\" ></td>\n",
" <td id=\"T_24f5f_row23_col1\" class=\"data row23 col1\" ></td>\n",
" <td id=\"T_24f5f_row23_col2\" class=\"data row23 col2\" >1</td>\n",
" <td id=\"T_24f5f_row23_col3\" class=\"data row23 col3\" >There is promotion</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row24_col0\" class=\"data row24 col0\" >Promotion_3</td>\n",
" <td id=\"T_24f5f_row24_col1\" class=\"data row24 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row24_col2\" class=\"data row24 col2\" >{0,1}</td>\n",
" <td id=\"T_24f5f_row24_col3\" class=\"data row24 col3\" >Indicator of whether Brand 3 was on promotion or not on a particular day</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row25_col0\" class=\"data row25 col0\" ></td>\n",
" <td id=\"T_24f5f_row25_col1\" class=\"data row25 col1\" ></td>\n",
" <td id=\"T_24f5f_row25_col2\" class=\"data row25 col2\" >0</td>\n",
" <td id=\"T_24f5f_row25_col3\" class=\"data row25 col3\" >There is no promotion</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row26_col0\" class=\"data row26 col0\" ></td>\n",
" <td id=\"T_24f5f_row26_col1\" class=\"data row26 col1\" ></td>\n",
" <td id=\"T_24f5f_row26_col2\" class=\"data row26 col2\" >1</td>\n",
" <td id=\"T_24f5f_row26_col3\" class=\"data row26 col3\" >There is promotion</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row27_col0\" class=\"data row27 col0\" >Promotion_4</td>\n",
" <td id=\"T_24f5f_row27_col1\" class=\"data row27 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row27_col2\" class=\"data row27 col2\" >{0,1}</td>\n",
" <td id=\"T_24f5f_row27_col3\" class=\"data row27 col3\" >Indicator of whether Brand 4 was on promotion or not on a particular day</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row28_col0\" class=\"data row28 col0\" ></td>\n",
" <td id=\"T_24f5f_row28_col1\" class=\"data row28 col1\" ></td>\n",
" <td id=\"T_24f5f_row28_col2\" class=\"data row28 col2\" >0</td>\n",
" <td id=\"T_24f5f_row28_col3\" class=\"data row28 col3\" >There is no promotion</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row29_col0\" class=\"data row29 col0\" ></td>\n",
" <td id=\"T_24f5f_row29_col1\" class=\"data row29 col1\" ></td>\n",
" <td id=\"T_24f5f_row29_col2\" class=\"data row29 col2\" >1</td>\n",
" <td id=\"T_24f5f_row29_col3\" class=\"data row29 col3\" >There is promotion</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row30_col0\" class=\"data row30 col0\" >Promotion_5</td>\n",
" <td id=\"T_24f5f_row30_col1\" class=\"data row30 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row30_col2\" class=\"data row30 col2\" >{0,1}</td>\n",
" <td id=\"T_24f5f_row30_col3\" class=\"data row30 col3\" >Indicator of whether Brand 5 was on promotion or not on a particular day</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row31_col0\" class=\"data row31 col0\" ></td>\n",
" <td id=\"T_24f5f_row31_col1\" class=\"data row31 col1\" ></td>\n",
" <td id=\"T_24f5f_row31_col2\" class=\"data row31 col2\" >0</td>\n",
" <td id=\"T_24f5f_row31_col3\" class=\"data row31 col3\" >There is no promotion</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row32_col0\" class=\"data row32 col0\" ></td>\n",
" <td id=\"T_24f5f_row32_col1\" class=\"data row32 col1\" ></td>\n",
" <td id=\"T_24f5f_row32_col2\" class=\"data row32 col2\" >1</td>\n",
" <td id=\"T_24f5f_row32_col3\" class=\"data row32 col3\" >There is promotion</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row33_col0\" class=\"data row33 col0\" >Sex</td>\n",
" <td id=\"T_24f5f_row33_col1\" class=\"data row33 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row33_col2\" class=\"data row33 col2\" >{0,1}</td>\n",
" <td id=\"T_24f5f_row33_col3\" class=\"data row33 col3\" >Biological sex (gender) of a customer. In this dataset there are only 2 different options.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row34_col0\" class=\"data row34 col0\" ></td>\n",
" <td id=\"T_24f5f_row34_col1\" class=\"data row34 col1\" ></td>\n",
" <td id=\"T_24f5f_row34_col2\" class=\"data row34 col2\" >0</td>\n",
" <td id=\"T_24f5f_row34_col3\" class=\"data row34 col3\" >male</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row35_col0\" class=\"data row35 col0\" ></td>\n",
" <td id=\"T_24f5f_row35_col1\" class=\"data row35 col1\" ></td>\n",
" <td id=\"T_24f5f_row35_col2\" class=\"data row35 col2\" >1</td>\n",
" <td id=\"T_24f5f_row35_col3\" class=\"data row35 col3\" >female</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row36_col0\" class=\"data row36 col0\" >Marital status</td>\n",
" <td id=\"T_24f5f_row36_col1\" class=\"data row36 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row36_col2\" class=\"data row36 col2\" >{0,1}</td>\n",
" <td id=\"T_24f5f_row36_col3\" class=\"data row36 col3\" >Marital status of a customer.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row37_col0\" class=\"data row37 col0\" ></td>\n",
" <td id=\"T_24f5f_row37_col1\" class=\"data row37 col1\" ></td>\n",
" <td id=\"T_24f5f_row37_col2\" class=\"data row37 col2\" >0</td>\n",
" <td id=\"T_24f5f_row37_col3\" class=\"data row37 col3\" >single</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row38_col0\" class=\"data row38 col0\" ></td>\n",
" <td id=\"T_24f5f_row38_col1\" class=\"data row38 col1\" ></td>\n",
" <td id=\"T_24f5f_row38_col2\" class=\"data row38 col2\" >1</td>\n",
" <td id=\"T_24f5f_row38_col3\" class=\"data row38 col3\" >non-single (divorced / separated / married / widowed)</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row39_col0\" class=\"data row39 col0\" >Age</td>\n",
" <td id=\"T_24f5f_row39_col1\" class=\"data row39 col1\" >numerical</td>\n",
" <td id=\"T_24f5f_row39_col2\" class=\"data row39 col2\" >Integer</td>\n",
" <td id=\"T_24f5f_row39_col3\" class=\"data row39 col3\" >The age of the customer in years, calculated as current year minus the year of birth of the customer at the time of creation of the dataset</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row40_col0\" class=\"data row40 col0\" ></td>\n",
" <td id=\"T_24f5f_row40_col1\" class=\"data row40 col1\" ></td>\n",
" <td id=\"T_24f5f_row40_col2\" class=\"data row40 col2\" >18</td>\n",
" <td id=\"T_24f5f_row40_col3\" class=\"data row40 col3\" >Min value (the lowest age observed in the dataset)</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row41_col0\" class=\"data row41 col0\" ></td>\n",
" <td id=\"T_24f5f_row41_col1\" class=\"data row41 col1\" ></td>\n",
" <td id=\"T_24f5f_row41_col2\" class=\"data row41 col2\" >75</td>\n",
" <td id=\"T_24f5f_row41_col3\" class=\"data row41 col3\" >Max value (the highest age observed in the dataset)</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row42_col0\" class=\"data row42 col0\" >Education</td>\n",
" <td id=\"T_24f5f_row42_col1\" class=\"data row42 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row42_col2\" class=\"data row42 col2\" >{0,1,2,3}</td>\n",
" <td id=\"T_24f5f_row42_col3\" class=\"data row42 col3\" >Level of education of the customer</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row43_col0\" class=\"data row43 col0\" ></td>\n",
" <td id=\"T_24f5f_row43_col1\" class=\"data row43 col1\" ></td>\n",
" <td id=\"T_24f5f_row43_col2\" class=\"data row43 col2\" >0</td>\n",
" <td id=\"T_24f5f_row43_col3\" class=\"data row43 col3\" >other / unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row44_col0\" class=\"data row44 col0\" ></td>\n",
" <td id=\"T_24f5f_row44_col1\" class=\"data row44 col1\" ></td>\n",
" <td id=\"T_24f5f_row44_col2\" class=\"data row44 col2\" >1</td>\n",
" <td id=\"T_24f5f_row44_col3\" class=\"data row44 col3\" >high school</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row45_col0\" class=\"data row45 col0\" ></td>\n",
" <td id=\"T_24f5f_row45_col1\" class=\"data row45 col1\" ></td>\n",
" <td id=\"T_24f5f_row45_col2\" class=\"data row45 col2\" >2</td>\n",
" <td id=\"T_24f5f_row45_col3\" class=\"data row45 col3\" >university</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row46_col0\" class=\"data row46 col0\" ></td>\n",
" <td id=\"T_24f5f_row46_col1\" class=\"data row46 col1\" ></td>\n",
" <td id=\"T_24f5f_row46_col2\" class=\"data row46 col2\" >3</td>\n",
" <td id=\"T_24f5f_row46_col3\" class=\"data row46 col3\" >graduate school</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row47_col0\" class=\"data row47 col0\" >Income</td>\n",
" <td id=\"T_24f5f_row47_col1\" class=\"data row47 col1\" >numerical</td>\n",
" <td id=\"T_24f5f_row47_col2\" class=\"data row47 col2\" >real</td>\n",
" <td id=\"T_24f5f_row47_col3\" class=\"data row47 col3\" >Self-reported annual income in US dollars of the customer.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row48_col0\" class=\"data row48 col0\" ></td>\n",
" <td id=\"T_24f5f_row48_col1\" class=\"data row48 col1\" ></td>\n",
" <td id=\"T_24f5f_row48_col2\" class=\"data row48 col2\" >38247</td>\n",
" <td id=\"T_24f5f_row48_col3\" class=\"data row48 col3\" >Min value (the lowest income observed in the dataset)</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row49_col0\" class=\"data row49 col0\" ></td>\n",
" <td id=\"T_24f5f_row49_col1\" class=\"data row49 col1\" ></td>\n",
" <td id=\"T_24f5f_row49_col2\" class=\"data row49 col2\" >309364</td>\n",
" <td id=\"T_24f5f_row49_col3\" class=\"data row49 col3\" >Max value (the highest income observed in the dataset)</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row50_col0\" class=\"data row50 col0\" >Occupation</td>\n",
" <td id=\"T_24f5f_row50_col1\" class=\"data row50 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row50_col2\" class=\"data row50 col2\" >{0,1,2}</td>\n",
" <td id=\"T_24f5f_row50_col3\" class=\"data row50 col3\" >Category of occupation of the customer.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row51_col0\" class=\"data row51 col0\" ></td>\n",
" <td id=\"T_24f5f_row51_col1\" class=\"data row51 col1\" ></td>\n",
" <td id=\"T_24f5f_row51_col2\" class=\"data row51 col2\" >0</td>\n",
" <td id=\"T_24f5f_row51_col3\" class=\"data row51 col3\" >unemployed / unskilled</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row52_col0\" class=\"data row52 col0\" ></td>\n",
" <td id=\"T_24f5f_row52_col1\" class=\"data row52 col1\" ></td>\n",
" <td id=\"T_24f5f_row52_col2\" class=\"data row52 col2\" >1</td>\n",
" <td id=\"T_24f5f_row52_col3\" class=\"data row52 col3\" >skilled employee / official</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row53_col0\" class=\"data row53 col0\" ></td>\n",
" <td id=\"T_24f5f_row53_col1\" class=\"data row53 col1\" ></td>\n",
" <td id=\"T_24f5f_row53_col2\" class=\"data row53 col2\" >2</td>\n",
" <td id=\"T_24f5f_row53_col3\" class=\"data row53 col3\" >management / self-employed / highly qualified employee / officer</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row54_col0\" class=\"data row54 col0\" >Settlement size</td>\n",
" <td id=\"T_24f5f_row54_col1\" class=\"data row54 col1\" >categorical</td>\n",
" <td id=\"T_24f5f_row54_col2\" class=\"data row54 col2\" >{0,1,2}</td>\n",
" <td id=\"T_24f5f_row54_col3\" class=\"data row54 col3\" >The size of the city that the customer lives in.</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row55_col0\" class=\"data row55 col0\" ></td>\n",
" <td id=\"T_24f5f_row55_col1\" class=\"data row55 col1\" ></td>\n",
" <td id=\"T_24f5f_row55_col2\" class=\"data row55 col2\" >0</td>\n",
" <td id=\"T_24f5f_row55_col3\" class=\"data row55 col3\" >small city</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row56_col0\" class=\"data row56 col0\" ></td>\n",
" <td id=\"T_24f5f_row56_col1\" class=\"data row56 col1\" ></td>\n",
" <td id=\"T_24f5f_row56_col2\" class=\"data row56 col2\" >1</td>\n",
" <td id=\"T_24f5f_row56_col3\" class=\"data row56 col3\" >mid-sized city</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_24f5f_row57_col0\" class=\"data row57 col0\" ></td>\n",
" <td id=\"T_24f5f_row57_col1\" class=\"data row57 col1\" ></td>\n",
" <td id=\"T_24f5f_row57_col2\" class=\"data row57 col2\" >2</td>\n",
" <td id=\"T_24f5f_row57_col3\" class=\"data row57 col3\" >big city</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x12ff91492790>"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.set_option('display.max_colwidth', 0)\n",
"df_legend=pd.read_excel(\"purchase data legend.xlsx\", skiprows=2, header=1).dropna(how=\"all\", axis=1).dropna(how=\"all\", axis=0).fillna(\"\")\n",
"df_legend.style.hide_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "567a19ba-3603-49cb-a8f3-248ab96617b1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ID 0\n",
"Day 0\n",
"Incidence 0\n",
"Brand 0\n",
"Quantity 0\n",
"Last_Inc_Brand 0\n",
"Last_Inc_Quantity 0\n",
"Price_1 0\n",
"Price_2 0\n",
"Price_3 0\n",
"Price_4 0\n",
"Price_5 0\n",
"Promotion_1 0\n",
"Promotion_2 0\n",
"Promotion_3 0\n",
"Promotion_4 0\n",
"Promotion_5 0\n",
"Sex 0\n",
"Marital status 0\n",
"Age 0\n",
"Education 0\n",
"Income 0\n",
"Occupation 0\n",
"Settlement size 0\n",
"dtype: int64"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum() # No missing information"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36e93046-5cb1-4632-af8d-9275002b94d0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}