EDA#

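This notebook is an exploratory review of the Ultra-MNIST Kaggle competition dataset: it inspects the labels and images, creates a stratified train/validation split, and builds a small sample for quick experiments.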

%load_ext autoreload
%autoreload 2
%matplotlib inline

Imports#

import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
from fastdownload import download_url
from IPython.display import display, Image
from PIL import Image as PILImage
import shutil
from ipywidgets.widgets import interact
from sklearn.model_selection import train_test_split
from aiking.data.external import * #We need to import this after fastai modules
# Apply seaborn's default theme; color_codes=True remaps matplotlib's
# shorthand color codes ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)
# Use seaborn's "muted" palette as the color cycle for the plots below.
sns.set_palette(sns.color_palette("muted"))
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices — consider a narrower filter.
warnings.filterwarnings("ignore")

Getting Dataset#

# Kaggle CLI equivalent: kaggle competitions download -c ultra-mnist
# Resolve the competition data through aiking's `untar_data` helper and keep
# the returned dataset folder in `path` for every later cell.
path = untar_data("kaggle_competitions::ultra-mnist")
# List the folder contents as the cell output.
path.ls()
(#5) [Path('/Landmark2/pdo/aiking/data/ultra-mnist/train'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/ultra-mnist.zip'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/sample_submission.csv')]
# Manual extraction command, kept for reference:
# !unzip /Landmark2/pdo/aiking/data/ultra-mnist/ultra-mnist.zip -d /Landmark2/pdo/aiking/data/ultra-mnist
# Bar chart of how many training rows carry each digit_sum label, ordered by
# label value.
(
    pd.read_csv(path/"train.csv")['digit_sum']
    .value_counts()
    .sort_index()
    .plot(kind='bar')
)
<AxesSubplot:>
../../_images/01_eda_9_1.png

Simple Data Visualization#

pd.read_csv(path/"train.csv")
id digit_sum
0 vyctxmodyu 12
1 kghlqhpshk 14
2 rfznywdgjo 19
3 zllfkikwrw 24
4 qeiqjoqakl 20
... ... ...
27995 tmsimiaoed 26
27996 zoewngbuwq 0
27997 edxdmumlvt 1
27998 vomwiagnxi 13
27999 uvogzysotf 21

28000 rows × 2 columns

(path/"test").ls()
(#28000) [Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/shrxqvnwni.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/ewandndkhm.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/gldzgpmcox.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/mzahedfegm.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/zfnsiamdih.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/axoyjjoraz.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/xcmrpmdwpm.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/xyuiqnuugg.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/nsybrzrmmb.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/immkpvyqyb.jpeg')...]
# Load the label table and index the training image files.
df_train = pd.read_csv(path/"train.csv")
images = (path/"train").ls()
# "shrxqvnwni" is the first file in the test/ listing, so it has no row in
# train.csv and this lookup prints an empty array.
print(df_train.loc[df_train['id'] == "shrxqvnwni", 'digit_sum'].values)
[]
# ipywidgets turns an (min, max) tuple into an integer slider whose upper
# bound is inclusive, so the last valid position is len(images) - 1.
@interact(idx=(0, len(images) - 1))
def display_img(idx):
    """Show the training image at position ``idx`` together with its label.

    Prints the ``digit_sum`` value(s) of the ``df_train`` row whose ``id``
    equals the image's file name without extension, then renders the image
    with matplotlib.

    Args:
        idx: Position in the module-level ``images`` listing
            (0 <= idx < len(images)).

    Returns:
        The object returned by ``plt.imshow`` for the rendered image.
    """
    img_loc = images[idx]
    # train.csv ids are the file names without the ".jpeg" extension.
    img_name = img_loc.stem
    print(df_train[df_train['id'] == img_name]['digit_sum'].values)

    photo = plt.imread(img_loc)
    # Cast kept from the original cell; presumably to give imshow plain
    # integer pixel values — TODO confirm it is still required.
    photo = photo.astype('int')
    return plt.imshow(photo)
# Index the test image files for the browser defined below.
test_images = (path/"test").ls()
# NOTE(review): bare expression over the *training* listing; it is only
# displayed if it is the last statement of its cell — possibly a leftover.
range(len(images))

# The slider's upper bound is inclusive, so stop at the last valid index
# instead of len(test_images), which would raise IndexError.
@interact(idx=(0, len(test_images) - 1))
def display_img(idx):
    """Show the test image at position ``idx`` at 400x400 display size.

    Args:
        idx: Position in the module-level ``test_images`` listing
            (0 <= idx < len(test_images)).

    Returns:
        An ``IPython.display.Image`` for the selected file.
    """
    return Image(test_images[idx], width=400, height=400)

Train Test Split#

# Reload the full label table (columns: id, digit_sum) as the source frame
# for the splits below.
df = pd.read_csv(path/"train.csv")
# Display the frame as the cell output.
df
id digit_sum
0 vyctxmodyu 12
1 kghlqhpshk 14
2 rfznywdgjo 19
3 zllfkikwrw 24
4 qeiqjoqakl 20
... ... ...
27995 tmsimiaoed 26
27996 zoewngbuwq 0
27997 edxdmumlvt 1
27998 vomwiagnxi 13
27999 uvogzysotf 21

28000 rows × 2 columns

# 80/20 split, stratified on digit_sum so both partitions keep the label
# distribution. A fixed random_state makes the split (and the CSV files
# written from it) identical on every run of the notebook.
df_train, df_valid = train_test_split(
    df,
    test_size=0.2,
    stratify=df['digit_sum'],
    random_state=42,
)
# Show the resulting partition sizes.
df_train.shape, df_valid.shape
((22400, 2), (5600, 2))
df_train['digit_sum'].value_counts().sort_index().plot(kind='bar')
<AxesSubplot:>
../../_images/01_eda_21_1.png
# Write both partitions into the dataset folder; index=False keeps the files
# in the same two-column layout (id, digit_sum) as train.csv.
df_train.to_csv(path/"train_train.csv", index=False)
df_valid.to_csv(path/"valid_train.csv", index=False)
# List the folder to confirm the two new CSV files are present.
path.ls()
(#7) [Path('/Landmark2/pdo/aiking/data/ultra-mnist/train'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/valid_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/ultra-mnist.zip'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/sample_submission.csv')]
!head {path}/"train.csv"
id,digit_sum
vyctxmodyu,12
kghlqhpshk,14
rfznywdgjo,19
zllfkikwrw,24
qeiqjoqakl,20
kgybaqqiuf,20
decsgvbjlk,5
vzinebdzrx,8
irjmhkxvxw,1
!head {path}/"train_train.csv"
id,digit_sum
fneyofrewl,5
qabcaosvod,21
ifgkabsbtd,26
amtrieynfn,19
ypdcmwqliz,21
gkxsysyloo,13
hlzhqfuoxw,27
pytumbeqxd,4
rolquggkcx,17
Image((path/"train").ls()[0])
../../_images/01_eda_27_0.jpg

Create a sample#

# Draw a 2% stratified sample of the labelled data. Only the held-out part of
# the split (index 1) is needed; the 98% remainder is discarded. The fixed
# random_state keeps the sample identical across notebook runs.
df_sample = train_test_split(
    df,
    test_size=0.02,
    stratify=df['digit_sum'],
    random_state=42,
)[1]
df_sample.to_csv(path/"sample.csv", index=False)

# Split the sample itself 80/20, again stratified and seeded.
# NOTE: this rebinds df_train / df_valid to the sample partitions.
df_train, df_valid = train_test_split(
    df_sample,
    test_size=0.2,
    stratify=df_sample['digit_sum'],
    random_state=42,
)
df_train.to_csv(path/"train_sample.csv", index=False)
df_valid.to_csv(path/"valid_sample.csv", index=False)
# List the folder to confirm the three new CSV files are present.
path.ls()
(#10) [Path('/Landmark2/pdo/aiking/data/ultra-mnist/train_sample.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/sample.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/valid_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/ultra-mnist.zip'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/valid_sample.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/sample_submission.csv')]
df_train.shape
(448, 2)
df_valid.shape
(112, 2)
def copyfile(fname):
    """Copy one image from the dataset's ``train`` folder into ``sample``.

    Both folders are resolved against the module-level ``path``. The
    destination folder is created on demand, so the function also works when
    ``sample`` has not been created beforehand.

    Args:
        fname: File name (including extension) of an image inside
            ``path/"train"``.

    Raises:
        FileNotFoundError: If the source image does not exist.
    """
    src = path/"train"/fname
    dest = path/"sample"/fname
    # shutil.copyfile does not create missing directories itself.
    dest.parent.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(src, dest)
# !rm -rf 
!rm -rf {path/"sample"}
!mkdir {path/"sample"}
# The ids in the CSV carry no extension while the image files end in ".jpeg",
# so append the suffix to turn each id into a file name.
# NOTE(review): this mutates df_sample in place; re-running the cell appends
# the suffix a second time, and the copy would then look for
# "<id>.jpeg.jpeg" — presumably failing on the missing file.
df_sample['id'] = df_sample['id']+".jpeg"
# Copy every sampled image via copyfile; the Series of None displayed as the
# cell output is just copyfile's return value for each row.
df_sample['id'].map(copyfile)
21387    None
15672    None
13161    None
7152     None
26522    None
         ... 
11771    None
15287    None
27202    None
19250    None
248      None
Name: id, Length: 560, dtype: object
Image((path/"sample").ls()[0])
../../_images/01_eda_40_0.jpg