# Digit Cleaner Idea

%load_ext autoreload
%autoreload 2

## Imports

import pandas as pd 
import numpy as np
import cv2
import matplotlib.pyplot as plt
import itertools
from IPython.display import display, Image
from aiking.data.external import * #We need to import this after fastai modules
from ipywidgets.widgets import interact
import warnings
import os
import dask.bag as db
from fastprogress.fastprogress import master_bar, progress_bar
from matplotlib import cm
import PIL
from dask.diagnostics import ProgressBar
path = untar_data("kaggle_competitions::ultra-mnist"); path
(path/"train").ls()[0]
Path('/Landmark2/pdo/aiking/data/ultra-mnist/train/ypuccwrtnt.jpeg')
Image((path/"train").ls()[0])
../../_images/03_image_clean_5_0.jpg
images = (path/"train").ls()
df_train = pd.read_csv(path/"train.csv")
# Valid indices are 0..len(images)-1; the interact tuple bound is inclusive,
# so using len(images) as the max would IndexError at the top of the slider.
@interact(idx=(0, len(images) - 1))
def display_img(idx):
    """Show an image's digit_sum label and the raw vs. 0-1 scaled image side by side."""
    img_loc = images[idx]
    img_name = os.path.splitext(img_loc.name)[0]
    # Look up this image's label row in train.csv by filename stem.
    print(df_train[df_train['id'] == img_name]['digit_sum'].values)

    img1 = cv2.imread(str(img_loc.resolve()), cv2.IMREAD_GRAYSCALE)
    img2 = img1/255
    # NOTE(review): concatenating uint8 (0-255) with float (0-1) promotes to
    # float and imshow autoscales to the joint range, so the right half
    # renders near-black — presumably intentional for comparison; confirm.
    Hori = np.concatenate((img1, img2), axis=1)
    plt.imshow(Hori, cmap='gray')
cv2.imread(str(images[0].resolve()), cv2.IMREAD_GRAYSCALE)
array([[255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255],
       ...,
       [255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255]], dtype=uint8)
img1 = cv2.imread(str(images[0].resolve()), cv2.IMREAD_GRAYSCALE)
Hori = np.concatenate((img1, img1), axis=1)
plt.imshow(Hori, cmap='gray')
<matplotlib.image.AxesImage at 0x11333050ed30>
../../_images/03_image_clean_8_1.png
img = cv2.imread(str(images[0].resolve()), cv2.IMREAD_GRAYSCALE)
img = img/255
plt.imshow(img, cmap='gray')
<matplotlib.image.AxesImage at 0x1133304858e0>
../../_images/03_image_clean_9_1.png
for i in range(4):
    for j in range(4):
        print(1000*i, 1000*(i+1), 1000*j, 1000*(j+1))
        print(np.mean(img[1000*i:1000*(i+1),1000*j:1000*(j+1)]))# >0.5
        # print(i,j)
0 1000 0 1000
1.0
0 1000 1000 2000
1.0
0 1000 2000 3000
0.0
0 1000 3000 4000
1.0
1000 2000 0 1000
0.0
1000 2000 1000 2000
0.005666682352941176
1000 2000 2000 3000
0.0484751725490196
1000 2000 3000 4000
0.0
2000 3000 0 1000
0.1405221647058823
2000 3000 1000 2000
0.8555160588235289
2000 3000 2000 3000
0.003584
2000 3000 3000 4000
1.0
3000 4000 0 1000
0.9356675725490194
3000 4000 1000 2000
0.10501367450980394
3000 4000 2000 3000
0.02856782352941177
3000 4000 3000 4000
1.0
list(itertools.product(range(4), range(4)))
[(0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (1, 0),
 (1, 1),
 (1, 2),
 (1, 3),
 (2, 0),
 (2, 1),
 (2, 2),
 (2, 3),
 (3, 0),
 (3, 1),
 (3, 2),
 (3, 3)]
def image_cleaner(img, threshold=0.5, tile=1000):
    """Invert mostly-white tiles of a grayscale image.

    The image is scaled to [0, 1] and split into a grid of ``tile`` x ``tile``
    squares; every square whose mean intensity exceeds ``threshold`` (i.e. is
    mostly white background) is inverted, so the result is uniformly dark
    background with light digits.

    Parameters
    ----------
    img : np.ndarray
        2-D uint8 grayscale image; each side should be a multiple of ``tile``.
    threshold : float
        Mean-intensity cutoff above which a tile is inverted.
    tile : int
        Tile side length.  The default (1000) reproduces the original 4x4
        grid on the 4000x4000 ultra-mnist images; the grid is now derived
        from the image shape instead of being hard-coded to 4x4.

    Returns
    -------
    np.ndarray
        Float image with values in [0, 1].
    """
    img = img / 255
    n_rows = img.shape[0] // tile
    n_cols = img.shape[1] // tile
    for i, j in itertools.product(range(n_rows), range(n_cols)):
        x1, x2 = tile * i, tile * (i + 1)
        y1, y2 = tile * j, tile * (j + 1)
        if np.mean(img[x1:x2, y1:y2]) > threshold:
            # |pixel - 1| flips white (1.0) to black (0.0) and vice versa.
            img[x1:x2, y1:y2] = np.abs(img[x1:x2, y1:y2] - 1)
    return img
def image_cleaner2(img, threshold=0.5):
    """Invert 1000x1000 tiles whose border pixels are mostly white.

    Scales the image to [0, 1] and walks the fixed 4x4 grid of 1000-pixel
    tiles of a 4000x4000 image.  For each tile the four 1000-pixel edge
    strips are summed (corners counted twice); when that sum reaches 2000 —
    at least half the border is white — the tile is treated as
    white-background and inverted.  ``threshold`` is kept for interface
    compatibility with image_cleaner but is unused here.
    """
    img = img / 255
    for row, col in itertools.product(range(4), range(4)):
        top, bottom = 1000 * row, 1000 * (row + 1)
        left, right = 1000 * col, 1000 * (col + 1)
        # Sum the four edges of this tile: left column, right column,
        # top row, bottom row.
        border = (
            img[top:bottom, left].sum()
            + img[top:bottom, right - 1].sum()
            + img[top, left:right].sum()
            + img[bottom - 1, left:right].sum()
        )
        if border >= 2000:
            img[top:bottom, left:right] = np.abs(img[top:bottom, left:right] - 1)
    return img

# img = cv2.imread(str(images[0].resolve()), cv2.IMREAD_GRAYSCALE)
# # plt.imshow(image_cleaner(img), cmap='gray')
# i, j = 2,2
# img = img/255
# x1 = 1000*i
# x2 = 1000*(i+1)
# y1 = 1000*j
# y2 = 1000*(j+1)
# max_xy = 4000
# border = img[x1:x2,min(y1,max_xy -1)].sum() + img[x1:x2, y2-1].sum() + img[min(x1, max_xy-1), y1:y2].sum() + img[x2-1, y1:y2].sum()
# if border >=2000: img[x1:x2, y1:y2] = np.abs(img[x1:x2, y1:y2] -1)
# print(border)
img = cv2.imread(str(images[0].resolve()), cv2.IMREAD_GRAYSCALE)
plt.imshow(image_cleaner2(img), cmap='gray')
<matplotlib.image.AxesImage at 0x11331fdb3e80>
../../_images/03_image_clean_14_1.png
plt.imshow(img, cmap='gray')
<matplotlib.image.AxesImage at 0x66ace94cdc0>
../../_images/03_image_clean_15_1.png
images = (path/"train").ls()
df_train = pd.read_csv(path/"train.csv")
from ipywidgets.widgets import IntSlider

# Two fixes: (1) the slider max must be len(images)-1 — the bound is
# inclusive, so len(images) IndexErrors at the top end; (2) passing
# continuous_update=False directly to @interact is interpreted as a widget
# abbreviation (it builds a checkbox and passes it to the function) — the
# flag belongs on the IntSlider itself.
@interact(idx=IntSlider(min=0, max=len(images) - 1, continuous_update=False))
def display_img(idx):
    """Show the label and, side by side: raw image, image_cleaner, image_cleaner2."""
    img_loc = images[idx]
    img_name = os.path.splitext(img_loc.name)[0]
    # Look up this image's label row in train.csv by filename stem.
    print(df_train[df_train['id'] == img_name]['digit_sum'].values)

    img1 = cv2.imread(str(img_loc.resolve()), cv2.IMREAD_GRAYSCALE)
    Hori = np.concatenate((img1 / 255, image_cleaner(img1), image_cleaner2(img1)), axis=1)
    plt.figure(figsize=(30, 10))
    plt.imshow(Hori, cmap='gray')
path.ls()
(#9) [Path('/Landmark2/pdo/aiking/data/ultra-mnist/test_black'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/valid_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/ultra-mnist.zip'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/sample_submission.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train_black')]
# Output folders for the cleaned train/test sets (mkdir is idempotent).
pb_train = path / "train_black"
pb_test = path / "test_black"
pb_train.mkdir(exist_ok=True)
pb_test.mkdir(exist_ok=True)
# Smoke-test the cleaning pipeline on a single training image.
for img in (path / "train").ls()[:1]:
    cleaned = image_cleaner2(cv2.imread(str(img.resolve()), cv2.IMREAD_GRAYSCALE))

    cv2.imwrite(str(pb_train / img.name), cleaned)
def create_clean_ds(folder="train"):
    """Serially clean every image in ``path/folder`` into ``path/{folder}_black``.

    Each image is border-inverted with image_cleaner2, rescaled from the
    cleaner's [0, 1] float output back to uint8 0-255, and written under its
    original filename.  Already-converted images are skipped so an
    interrupted run can resume (matching create_clean_ds2's behavior).
    """
    pb_folder = path / f"{folder}_black"
    for img in progress_bar((path / folder).ls()):
        out_loc = pb_folder / img.name
        if out_loc.exists():
            continue  # resume support: don't redo finished images
        data = image_cleaner2(cv2.imread(str(img.resolve()), cv2.IMREAD_GRAYSCALE))
        # image_cleaner2 returns floats in [0, 1]; map back to uint8 for imwrite.
        data = cv2.normalize(data, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F).astype(np.uint8)
        cv2.imwrite(str(out_loc), data)
create_clean_ds()
0.06% [17/28000 00:04<2:11:05]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/tmp/ipykernel_3631/51113177.py in <module>
----> 1 create_clean_ds()

/tmp/ipykernel_3631/538686461.py in create_clean_ds(folder)
      3     for img in progress_bar((path/folder).ls()):
      4         data = image_cleaner2(cv2.imread(str(img.resolve()), cv2.IMREAD_GRAYSCALE))
----> 5         cv2.imwrite(str(pb_folder/img.name), data)

KeyboardInterrupt: 
pb_train = (path/"train_black")
pb_test = (path/"test_black")
pb_train.mkdir(exist_ok=True)
pb_test.mkdir(exist_ok=True)
def create_clean_ds2(folder="train"):
    """Clean every image in ``path/folder`` into ``path/{folder}_black`` in parallel.

    Uses a dask bag to fan the per-image work out across workers; images that
    already exist in the output folder are skipped, so an interrupted run
    can be resumed by calling this again.
    """
    pb_folder = path / f"{folder}_black"

    def convert_image(img_loc):
        out_loc = pb_folder / img_loc.name
        # Skip files already written by a previous (partial) run.
        if not out_loc.exists():
            raw = cv2.imread(str(img_loc.resolve()), cv2.IMREAD_GRAYSCALE)
            cleaned = image_cleaner2(raw)
            # Rescale the cleaner's [0, 1] float output back to uint8 for imwrite.
            cleaned = cv2.normalize(cleaned, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F).astype(np.uint8)
            cv2.imwrite(str(out_loc), cleaned)
        return True

    bag = db.from_sequence((path / folder).ls()).map(convert_image)
    with ProgressBar():
        bag.compute()
create_clean_ds2(folder="train")
[########################################] | 100% Completed |  1hr 30min 26.4s
# !mv {path}/"test_black" {path}/"test_black2"
path.ls()
(#12) [Path('/Landmark2/pdo/aiking/data/ultra-mnist/train_sample.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/sample.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/test'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/train_black2'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/valid_train.csv'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/ultra-mnist.zip'),Path('/Landmark2/pdo/aiking/data/ultra-mnist/sample_submission.csv')...]

img = PIL.Image.open(str((path/"train_black2").ls()[2800])).convert("L")
imgarr = np.array(img) 
img = cv2.imread(str((path/"train_black2").ls()[2800]))
norm_image = cv2.normalize(img, None, alpha = 0, beta = 255, norm_type = cv2.NORM_MINMAX, dtype = cv2.CV_32F)
norm_image = norm_image.astype(np.uint8)
plt.imshow(norm_image)
<matplotlib.image.AxesImage at 0x11331d79b430>
../../_images/03_image_clean_31_1.png
img2 = cv2.imread(str((path/"train").ls()[2800]))
img2
array([[[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       ...,

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]]], dtype=uint8)
def create_norm_ds(inp_folder="train_black2", out_folder="train_black"):
    """Min-max normalize every image from ``inp_folder`` into ``out_folder``.

    Parallelized with a dask bag; destination files that already exist are
    left alone, so an interrupted run can be resumed.
    """
    def norm_img(img_loc):
        dest = path / out_folder / img_loc.name
        if not dest.exists():
            raw = cv2.imread(str(path / inp_folder / img_loc.name))
            # Stretch each image to the full 0-255 range, then store as uint8.
            scaled = cv2.normalize(raw, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
            cv2.imwrite(str(dest), scaled.astype(np.uint8))
        return True

    bag = db.from_sequence((path / inp_folder).ls()).map(norm_img)
    with ProgressBar():
        bag.compute()
create_norm_ds()
[                                        ] | 0% Completed |  1min 37.0s
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/opt/anaconda/envs/aiking/lib/python3.9/site-packages/dask/multiprocessing.py in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, pool, chunksize, **kwargs)
    218         # Run
--> 219         result = get_async(
    220             pool.submit,

/opt/anaconda/envs/aiking/lib/python3.9/site-packages/dask/local.py in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
    495                 fire_tasks(chunksize)
--> 496                 for key, res_info, failed in queue_get(queue).result():
    497                     if failed:

/opt/anaconda/envs/aiking/lib/python3.9/site-packages/dask/local.py in queue_get(q)
    133     def queue_get(q):
--> 134         return q.get()
    135 

/opt/anaconda/envs/aiking/lib/python3.9/queue.py in get(self, block, timeout)
    170                 while not self._qsize():
--> 171                     self.not_empty.wait()
    172             elif timeout < 0:

/opt/anaconda/envs/aiking/lib/python3.9/threading.py in wait(self, timeout)
    311             if timeout is None:
--> 312                 waiter.acquire()
    313                 gotit = True

KeyboardInterrupt: 

During handling of the above exception, another exception occurred:

KeyboardInterrupt                         Traceback (most recent call last)
/tmp/ipykernel_3631/992931216.py in <module>
----> 1 create_norm_ds()

/tmp/ipykernel_3631/4181445123.py in create_norm_ds(inp_folder, out_folder)
      9     res = db.from_sequence((path/inp_folder).ls()).map(norm_img)
     10     with ProgressBar():
---> 11         res.compute()

/opt/anaconda/envs/aiking/lib/python3.9/site-packages/dask/base.py in compute(self, **kwargs)
    286         dask.base.compute
    287         """
--> 288         (result,) = compute(self, traverse=False, **kwargs)
    289         return result
    290 

/opt/anaconda/envs/aiking/lib/python3.9/site-packages/dask/base.py in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
    569         postcomputes.append(x.__dask_postcompute__())
    570 
--> 571     results = schedule(dsk, keys, **kwargs)
    572     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    573 

/opt/anaconda/envs/aiking/lib/python3.9/site-packages/dask/multiprocessing.py in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, pool, chunksize, **kwargs)
    232     finally:
    233         if cleanup:
--> 234             pool.shutdown()
    235     return result
    236 

/opt/anaconda/envs/aiking/lib/python3.9/concurrent/futures/process.py in shutdown(self, wait, cancel_futures)
    738 
    739         if self._executor_manager_thread is not None and wait:
--> 740             self._executor_manager_thread.join()
    741         # To reduce the risk of opening too many files, remove references to
    742         # objects that use file descriptors.

/opt/anaconda/envs/aiking/lib/python3.9/threading.py in join(self, timeout)
   1051 
   1052         if timeout is None:
-> 1053             self._wait_for_tstate_lock()
   1054         else:
   1055             # the behavior of a negative timeout isn't documented, but

/opt/anaconda/envs/aiking/lib/python3.9/threading.py in _wait_for_tstate_lock(self, block, timeout)
   1067         if lock is None:  # already determined that the C code is done
   1068             assert self._is_stopped
-> 1069         elif lock.acquire(block, timeout):
   1070             lock.release()
   1071             self._stop()

KeyboardInterrupt: