EDA
Contents
EDA#
This notebook is an exploratory review of the Ultra-MNIST dataset.
%load_ext autoreload
%autoreload 2
%matplotlib inline
Imports#
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
from fastdownload import download_url
from IPython.display import display, Image
from PIL import Image as PILImage
import shutil
from ipywidgets.widgets import interact
from sklearn.model_selection import train_test_split
from aiking.data.external import * #We need to import this after fastai modules
# Apply seaborn's default theme; color_codes=True remaps the short matplotlib
# colour codes ("b", "g", ...) to the seaborn palette.
sns.set(color_codes=True)
# Use the "muted" palette for the plots that follow.
sns.set_palette(sns.color_palette("muted"))
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
Getting Dataset#
# Equivalent CLI command: kaggle competitions download -c ultra-mnist
# Fetch (or reuse) the competition data, then list the extracted contents.
path = untar_data("kaggle_competitions::ultra-mnist")
path.ls()
(#5) [Path('/Landmark2/pdo/aiking/data/ultra-mnist/train'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/ultra-mnist.zip'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/sample_submission.csv')]
# Manual extraction, if needed:
# !unzip /Landmark2/pdo/aiking/data/ultra-mnist/ultra-mnist.zip -d /Landmark2/pdo/aiking/data/ultra-mnist
# Bar chart of how many rows carry each digit_sum value, ordered by value.
(
    pd.read_csv(path/"train.csv")['digit_sum']
    .value_counts()
    .sort_index()
    .plot(kind='bar')
)
<AxesSubplot:>
Simple Data Visualization#
# Display the label table read from train.csv (columns: id, digit_sum).
pd.read_csv(path/"train.csv")
id | digit_sum | |
---|---|---|
0 | vyctxmodyu | 12 |
1 | kghlqhpshk | 14 |
2 | rfznywdgjo | 19 |
3 | zllfkikwrw | 24 |
4 | qeiqjoqakl | 20 |
... | ... | ... |
27995 | tmsimiaoed | 26 |
27996 | zoewngbuwq | 0 |
27997 | edxdmumlvt | 1 |
27998 | vomwiagnxi | 13 |
27999 | uvogzysotf | 21 |
28000 rows × 2 columns
# List the files in the test folder.
(path/"test").ls()
(#28000) [Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/shrxqvnwni.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/ewandndkhm.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/gldzgpmcox.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/mzahedfegm.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/zfnsiamdih.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/axoyjjoraz.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/xcmrpmdwpm.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/xyuiqnuugg.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/nsybrzrmmb.jpeg'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test/immkpvyqyb.jpeg')...]
# Collect the training image paths and load the label table.
images = (path/"train").ls()
df_train = pd.read_csv(path/"train.csv")
# Look up one id in the training labels. "shrxqvnwni" appears in the test
# folder listing, and this lookup prints an empty array.
print(df_train[df_train['id'] == "shrxqvnwni"]['digit_sum'].values)
[]
# The (min, max) slider bounds are inclusive, so the upper bound must be the
# last valid index rather than the length of the list.
@interact(idx=(0, len(images) - 1))
def display_img(idx):
    """Show the training image at position ``idx`` and print its label.

    Args:
        idx: Position in the module-level ``images`` list, chosen with the
            interactive slider.

    Returns:
        The object returned by ``plt.imshow`` for the displayed image.

    Note:
        The label is looked up in the module-level ``df_train``. Later cells
        rebind ``df_train`` to subsets of the table, after which images
        outside that subset print an empty array.
    """
    img_loc = images[idx]
    # Ids in the label table carry no file extension, so strip it first.
    img_name = os.path.splitext(img_loc.name)[0]
    print(df_train[df_train['id'] == img_name]['digit_sum'].values)
    photo = plt.imread(img_loc)
    photo = photo.astype('int')
    return plt.imshow(photo)
# Collect the test image paths.
test_images = (path/"test").ls()
# NOTE(review): this counts the training list (`images`), not `test_images` —
# presumably a leftover check; confirm which list was intended.
range(len(images))
# The (min, max) slider bounds are inclusive, so the upper bound must be the
# last valid index rather than the length of the list.
@interact(idx=(0, len(test_images) - 1))
def display_img(idx):
    """Show the test image at position ``idx``.

    This rebinds the name ``display_img`` that was previously used for the
    training-image browser.

    Args:
        idx: Position in the module-level ``test_images`` list, chosen with
            the interactive slider.

    Returns:
        An ``IPython.display.Image`` for the selected file, requested at
        400x400.
    """
    return Image(test_images[idx], width=400, height=400)
Train Test Split#
# Reload the full label table; it is the input of the splits below.
df = pd.read_csv(path/"train.csv")
df
id | digit_sum | |
---|---|---|
0 | vyctxmodyu | 12 |
1 | kghlqhpshk | 14 |
2 | rfznywdgjo | 19 |
3 | zllfkikwrw | 24 |
4 | qeiqjoqakl | 20 |
... | ... | ... |
27995 | tmsimiaoed | 26 |
27996 | zoewngbuwq | 0 |
27997 | edxdmumlvt | 1 |
27998 | vomwiagnxi | 13 |
27999 | uvogzysotf | 21 |
28000 rows × 2 columns
# Hold out 20% of the labelled rows for validation, stratified on digit_sum
# so both parts keep the same label distribution. The fixed random_state makes
# the split (and any files written from it) identical on every run.
df_train, df_valid = train_test_split(
    df, test_size=0.2, stratify=df['digit_sum'], random_state=42
)
df_train.shape, df_valid.shape
((22400, 2), (5600, 2))
# Bar chart of the digit_sum counts in the training part, ordered by value.
(
    df_train['digit_sum']
    .value_counts()
    .sort_index()
    .plot(kind='bar')
)
<AxesSubplot:>
# Write both parts of the split next to the original CSV (without the index
# column), then list the folder to confirm the new files.
df_train.to_csv(path/"train_train.csv", index=False)
df_valid.to_csv(path/"valid_train.csv", index=False)
path.ls()
(#7) [Path('/Landmark2/pdo/aiking/data/ultra-mnist/train'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/valid_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/ultra-mnist.zip'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/sample_submission.csv')]
# Show the first lines of the original label file.
!head {path}/"train.csv"
id,digit_sum
vyctxmodyu,12
kghlqhpshk,14
rfznywdgjo,19
zllfkikwrw,24
qeiqjoqakl,20
kgybaqqiuf,20
decsgvbjlk,5
vzinebdzrx,8
irjmhkxvxw,1
# Show the first lines of the training part written above.
!head {path}/"train_train.csv"
id,digit_sum
fneyofrewl,5
qabcaosvod,21
ifgkabsbtd,26
amtrieynfn,19
ypdcmwqliz,21
gkxsysyloo,13
hlzhqfuoxw,27
pytumbeqxd,4
rolquggkcx,17
# Display the first file returned by the train folder listing.
Image((path/"train").ls()[0])
Create a sample#
# Draw a 2% sample of the full table, stratified on digit_sum, for quick
# experiments. The other 98% lands in df_train only temporarily; that name is
# rebound a few lines below. Fixed random_state values keep the sample and its
# train/valid split identical on every run.
df_train, df_sample = train_test_split(
    df, test_size=0.02, stratify=df['digit_sum'], random_state=42
)
df_sample.to_csv(path/"sample.csv", index=False)
# Split the sample itself 80/20 into training and validation parts.
df_train, df_valid = train_test_split(
    df_sample, test_size=0.2, stratify=df_sample['digit_sum'], random_state=42
)
df_train.to_csv(path/"train_sample.csv", index=False)
df_valid.to_csv(path/"valid_sample.csv", index=False)
path.ls()
(#10) [Path('/Landmark2/pdo/aiking/data/ultra-mnist/train_sample.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/sample.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/valid_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/ultra-mnist.zip'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/valid_sample.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/sample_submission.csv')]
# Rows and columns of the sample's training part.
df_train.shape
(448, 2)
# Rows and columns of the sample's validation part.
df_valid.shape
(112, 2)
def copyfile(fname):
    """Copy one training image into the ``sample`` folder.

    Args:
        fname: File name, including extension, of an image located in
            ``path/"train"``.

    Returns:
        None.

    Raises:
        FileNotFoundError: If the source image does not exist.
    """
    src = path/"train"/fname
    dest = path/"sample"/fname
    # Create the target folder on demand so the copy does not depend on a
    # separate mkdir step having been run first.
    dest.parent.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(src, dest)
# Start from an empty sample folder: remove any previous copy, then recreate it.
!rm -rf {path/"sample"}
!mkdir {path/"sample"}
# Ids in the table have no extension; append ".jpeg" to get the file names.
# NOTE(review): this rewrites df_sample['id'] in place, so running the cell a
# second time without rebuilding df_sample appends the suffix twice.
df_sample['id'] = df_sample['id']+".jpeg"
# Copy every sampled image; map is used for its side effect only, since
# copyfile returns None.
df_sample['id'].map(copyfile)
21387 None
15672 None
13161 None
7152 None
26522 None
...
11771 None
15287 None
27202 None
19250 None
248 None
Name: id, Length: 560, dtype: object
# Display the first file returned by the sample folder listing.
Image((path/"sample").ls()[0])