Lecture2: Evidence and p-value

%config IPCompleter.greedy=True
# from dotenv import load_dotenv
# load_dotenv()
from PIL import Image

Introduction

Synopsis: What did we learn in this lecture

  • ML Problems: Classification & Regression

  • Validation Set, Overfitting and hints of selecting a representative set

  • Metrics vs Loss [Important question: does training loss going down while validation loss goes up always indicate overfitting?]

    • Not always true: follow the metric, not the loss

  • Transfer Learning and Fine-tuning

  • Why Transfer Learning Works (Zeiler & Fergus visualization paper)

  • State of DL now [What works and what doesn’t]

  • Filter/Feature & catastrophic forgetting [if you want your model to keep performing on old data => train on old (samples) + new data]

  • Model Zoo

  • Interpreting p-value , utilities

    • Almost always confuse the issue

    • Multivariate p-value more robust[ t-statistics]?

    • Choose the reverse hypothesis as the null hypothesis and see if you have sufficient data; otherwise no decision can be made

  • Drive train approach

    • Objectives => Levers => Data Collection => Model

    • Strategy (Sources of value, Levers) => Data (Availability, Suitability) => Analytics (Predictions, Insights) => Implementation (IT, Human Capital) => Maintenance (Environment Changes)

    • Identify & Manage Constraints across spectrum

  • Prior Belief, Evidence and Utility View

  • Data Curation

    • Bing image search

    • L object

    • verify_images

  • Datablock Api

    • blocks

    • get_x, get_y

    • item_tfms

  • Model Export

  • Inference

  • Covid Paper

    • Seasonality

    • Transmissibility

    • Group of cities

    • p-value

    • Visualization

  • How to read paper & criticism

Implementation Plan

  • Curate Dataset

  • Apply Datablock api on multiple datasets

  • Explore more on p-value, t2 stats

  • Read Drivetrain paper in detail and understand modeler, simulator and optimizer concepts

Datablock Api

PETS problem using datablock api

%load_ext autoreload
%autoreload 2
from fastai.basics import *
from fastai.vision.all import *
from nlphero.data.external import *
from fastai.vision.widgets import *
path = untar_data(URLs.PETS); path
Path('/Landmark2/pdo/.nlphero/data/oxford-iiit-pet')
o = (path/"images").ls()[0]
o
Path('/Landmark2/pdo/.nlphero/data/oxford-iiit-pet/images/wheaten_terrier_112.jpg')
def get_breed(o):
    """Return the breed label encoded in an image file's name.

    Everything before the last underscore of ``o.name`` is kept, so
    ``wheaten_terrier_112.jpg`` yields ``wheaten_terrier``. A name that
    contains no underscore is returned unchanged.
    """
    breed, *_ = o.name.rsplit("_", 1)
    return breed
# Pets pipeline: image inputs, categorical labels taken from the file name.
pets = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    get_y=get_breed,  # label = file name up to the last underscore
    splitter=RandomSplitter(valid_pct=0.2, seed=42),  # fixed seed => same split on every run
    item_tfms=Resize(224),
)
pets
<fastai.data.block.DataBlock at 0x7f170bfa2dc0>
dls = pets.dataloaders(path); dls
<fastai.data.core.DataLoaders at 0x7f170bfa2fd0>
dls.valid.show_batch(max_n=4, nrows=1)
../_images/Lecture2_19_0.png
learn = cnn_learner(dls, resnet34, metrics=[error_rate, accuracy]); learn
<fastai.learner.Learner at 0x7f170bf7ebe0>
learn.fine_tune(2, freeze_epochs=3)
epoch train_loss valid_loss error_rate accuracy time
0 3.252484 2.097097 0.566644 0.433356 07:20
1 2.311346 1.852842 0.519621 0.480379 08:34
2 2.041009 1.734501 0.502030 0.497970 08:36
0.00% [0/2 00:00<00:00]
epoch train_loss valid_loss error_rate accuracy time

22.28% [41/184 02:38<09:13 1.8422]

Data Curation

Google

#from google_images_download import google_images_download
#response = google_images_download.googleimagesdownload(); response
#arguments = {"keywords":"polar bear, grizzly bear, teddy Bear","limit":20,"print_urls":True}
#response.download(arguments)

Bing

# from azure.cognitiveservices.search.imagesearch import ImageSearchClient as api
# from msrest.authentication import CognitiveServicesCredentials as auth
# def search_images_bing(key, term, count=150, min_sz=128):
#     client = api("https://api.cognitive.microsoft.com", auth(key))
#     return L(client.images.search(query=term, count=count, min_height=min_sz, min_width=min_sz).value)
results = search_images_bing(key, "Sarah Shahi", count=1)
imgs = results.attrgot("content_url")
imgs
(#1) ['https://celebmafia.com/wp-content/uploads/2018/05/sarah-shahi-2018-environmental-media-association-awards-in-beverly-hills-1.jpg']
download_url(imgs[0], dest=Path("Sarah_Shahi.jpg"))
im = Image.open(Path("Sarah_Shahi.jpg"))
im.to_thumb(128,128)
../_images/Lecture2_34_0.png

Datasets

Bears

bear_types = ('grizzly', 'black', 'teddy')
path = Path('bears')
# Only query the image search when the dataset folder is absent, so re-running
# the cell does not download everything again.
if not path.exists():
    path.mkdir(exist_ok=True)
    for kind in bear_types:
        # One sub-folder per bear type.
        kind_dir = path / kind
        kind_dir.mkdir(exist_ok=True)
        hits = search_images_bing(key, f"{kind} bear", count=150)
        download_images(kind_dir, urls=hits.attrgot("content_url"))
for o in bear_types:
    print((path/o))
bears/grizzly
bears/black
bears/teddy
fns = get_image_files(path)
fns
(#425) [Path('bears/teddy/00000046.jpg'),Path('bears/teddy/00000119.jpg'),Path('bears/teddy/00000041.jpg'),Path('bears/teddy/00000090.jpg'),Path('bears/teddy/00000117.jpg'),Path('bears/teddy/00000069.png'),Path('bears/teddy/00000033.jpg'),Path('bears/teddy/00000034.jpg'),Path('bears/teddy/00000110.jpg'),Path('bears/teddy/00000048.jpg')...]
failed = verify_images(fns)
failed
(#0) []
failed.map(Path.unlink)
(#0) []

DoppelGanger

# def construct_image_dataset(clstypes, dest,key=key, count=150):
#     path = Path(dest)
#     if not path.exists():
#         path.mkdir(exist_ok=True)
#         for o in clstypes:
#             d = o.replace(" ", "_")
#             dest = (path/d)
#             print(f"Dowloading images in {d}")
#             dest.mkdir(exist_ok=True)

#             results = search_images_bing(key, o, count=count)
#             download_images(dest, urls=results.attrgot("content_url"))
#             print(f"Finished Dowloading images in {d}")

#         for i in range(3):
#             fns = get_image_files(path)
#             failed = verify_images(fns)
#             print(failed)
#             failed.map(Path.unlink)
#     return path
    
# Classes for the first look-alike experiment and the folder to store them in.
clstypes = "Sarah Shahi", "Mercedes Masohn"
dest = "Doppelganger"
count = 150
# Pass `count` by keyword: in the signature sketched above
# (clstypes, dest, key=key, count=150) the third positional slot is `key`,
# so a positional 150 would be taken as the API key instead of the image count.
path = construct_image_dataset(clstypes, dest, count=count); path
Path('/Landmark2/pdo/.nlphero/data/Doppelganger')
# Look-alike actresses pipeline: the label is the name of each image's parent folder.
actresses = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    get_y=parent_label,
    splitter=RandomSplitter(valid_pct=0.2, seed=42),  # fixed seed => same split on every run
    item_tfms=Resize(224),
)
actresses
<fastai.data.block.DataBlock at 0x7fb613b65520>
dls = actresses.dataloaders(path); dls
<fastai.data.core.DataLoaders at 0x7fb50707b3d0>
dls.valid.show_batch(max_n=4, nrows=1)
../_images/Lecture2_50_0.png
learn = cnn_learner(dls, resnet34, metrics=[error_rate, accuracy]); learn
<fastai.learner.Learner at 0x7f8ca4195790>
learn.fine_tune(2, freeze_epochs=5)
epoch train_loss valid_loss error_rate accuracy time
0 1.023837 0.849942 0.431035 0.568965 00:15
1 1.047065 0.581088 0.327586 0.672414 00:21
2 0.912259 0.429392 0.189655 0.810345 00:21
3 0.820823 0.318312 0.137931 0.862069 00:20
4 0.715490 0.220695 0.120690 0.879310 00:22
epoch train_loss valid_loss error_rate accuracy time
0 0.283033 0.150566 0.017241 0.982759 00:24
1 0.237740 0.141855 0.051724 0.948276 00:27
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()
../_images/Lecture2_54_0.png
interp.plot_top_losses(3, nrows=3)
../_images/Lecture2_55_0.png

DoppelGanger2

clstypes = "Jeffrey Dean Morgan", "Javier Bardem"
dest = "Doppelganger2"
# count=150
path = construct_image_dataset(clstypes,dest); path
Dowloading images in Jeffrey_Dean_Morgan
Finished Dowloading images in Jeffrey_Dean_Morgan
Dowloading images in Javier_Bardem
Finished Dowloading images in Javier_Bardem
(#13) [Path('Doppelganger2/Jeffrey_Dean_Morgan/00000041.jpg'),Path('Doppelganger2/Jeffrey_Dean_Morgan/00000063.jpg'),Path('Doppelganger2/Jeffrey_Dean_Morgan/00000111.jpg'),Path('Doppelganger2/Jeffrey_Dean_Morgan/00000116.jpg'),Path('Doppelganger2/Jeffrey_Dean_Morgan/00000007.jpg'),Path('Doppelganger2/Javier_Bardem/00000027.jpg'),Path('Doppelganger2/Javier_Bardem/00000029.jpg'),Path('Doppelganger2/Javier_Bardem/00000034.jpg'),Path('Doppelganger2/Javier_Bardem/00000041.jpg'),Path('Doppelganger2/Javier_Bardem/00000065.jpg')...]
(#0) []
(#0) []
Path('Doppelganger2')
# !rm -rf Doppelganger2
# Doppelganger2 pipeline: folder name is the label; 30% of the images are held out.
actors = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    get_y=parent_label,
    splitter=RandomSplitter(valid_pct=0.3, seed=42),
    item_tfms=Resize(224),
)
dls = actors.dataloaders(path)
learn = cnn_learner(dls, resnet34, metrics=[accuracy, error_rate])
learn
<fastai.learner.Learner at 0x7f8c8499cfd0>
learn.fine_tune(3,freeze_epochs=5)
epoch train_loss valid_loss accuracy error_rate time
0 1.216884 0.837433 0.571429 0.428571 00:25
1 1.125102 0.537283 0.785714 0.214286 00:25
2 0.942884 0.531028 0.821429 0.178571 00:27
3 0.814877 0.549914 0.833333 0.166667 00:24
4 0.693545 0.521625 0.857143 0.142857 00:24
epoch train_loss valid_loss accuracy error_rate time
0 0.247736 0.401182 0.892857 0.107143 00:30
1 0.171536 0.322162 0.916667 0.083333 00:30
2 0.128251 0.308142 0.928571 0.071429 00:30
dls.valid.show_batch()
../_images/Lecture2_63_0.png
interp = ClassificationInterpretation.from_learner(learn)
interp
<fastai.interpret.ClassificationInterpretation at 0x7f8bc75299a0>
interp.plot_confusion_matrix()
../_images/Lecture2_65_0.png
interp.plot_top_losses(6,nrows=6)
../_images/Lecture2_66_0.png

Doppelganger3

clstypes = "Benicio Del Toro", "Brad Pitt"
dest = "Doppelganger3"
path = construct_image_dataset(clstypes,dest, count=150); path
Dowloading images in Benicio_Del_Toro
Finished Dowloading images in Benicio_Del_Toro
Dowloading images in Brad_Pitt
 Download of https://dailystormer.name/wp-content/uploads/2019/11/brad-pitt-2.jpg has failed after 5 retries
 Fix the download manually:
$ mkdir -p Doppelganger3/Brad_Pitt
$ cd Doppelganger3/Brad_Pitt
$ wget -c https://dailystormer.name/wp-content/uploads/2019/11/brad-pitt-2.jpg
$ tar xf brad-pitt-2.jpg
 And re-run your code once the download is successful

Finished Dowloading images in Brad_Pitt
(#14) [Path('Doppelganger3/Brad_Pitt/00000040.jpg'),Path('Doppelganger3/Brad_Pitt/00000149.jpg'),Path('Doppelganger3/Brad_Pitt/00000125.jpg'),Path('Doppelganger3/Brad_Pitt/00000059.jpg'),Path('Doppelganger3/Brad_Pitt/00000130.jpg'),Path('Doppelganger3/Brad_Pitt/00000137.jpg'),Path('Doppelganger3/Brad_Pitt/00000013.jpg'),Path('Doppelganger3/Brad_Pitt/00000037.jpg'),Path('Doppelganger3/Brad_Pitt/00000138.jpg'),Path('Doppelganger3/Brad_Pitt/00000060.jpg')...]
(#0) []
(#0) []
Path('Doppelganger3')
# Doppelganger3 pipeline: folder name is the label; 30% of the images are held out.
actors = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    get_y=parent_label,
    splitter=RandomSplitter(valid_pct=0.3, seed=43),
    item_tfms=Resize(224),
)
dls = actors.dataloaders(path)
dls.valid.show_batch(nrows=1)
../_images/Lecture2_71_0.png
learn = cnn_learner(dls, resnet34, metrics=[error_rate, accuracy])
learn
<fastai.learner.Learner at 0x7f8c84857a90>
learn.fine_tune(3, freeze_epochs=5)
epoch train_loss valid_loss error_rate accuracy time
0 1.391564 0.935234 0.464286 0.535714 00:25
1 1.172075 0.878080 0.452381 0.547619 00:24
2 1.053020 0.914704 0.392857 0.607143 00:25
3 0.870856 0.782933 0.297619 0.702381 00:24
4 0.721680 0.565612 0.238095 0.761905 00:24
epoch train_loss valid_loss error_rate accuracy time
0 0.180159 0.497945 0.190476 0.809524 00:29
1 0.166392 0.430486 0.142857 0.857143 00:30
2 0.134516 0.393905 0.130952 0.869048 00:29
interp = ClassificationInterpretation.from_learner(learn)
interp
<fastai.interpret.ClassificationInterpretation at 0x7f8c81efcdc0>
interp.plot_confusion_matrix()
../_images/Lecture2_75_0.png
interp.plot_top_losses(11, nrows=7)
../_images/Lecture2_76_0.png

DoppelgangerMixed

os.getenv("NLPHERO_HOME")
'/Landmark2/pdo/.nlphero'
list_ds()
(#13) ['av-healthcare-analytics-ii','adult_sample','DoppelgangerMixed','twitter-airline-sentiment','glove840b300dtxt','Bears','oxford-iiit-pet','spooky','60k-stack-overflow-questions-with-quality-rate','imdb_tok'...]
path = get_ds("DoppelgangerMixed"); path
Path('/Landmark2/pdo/.nlphero/data/DoppelgangerMixed')
# DoppelgangerMixed pipeline: folder name is the label; 30% of the images are held out.
actors = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    splitter=RandomSplitter(valid_pct=0.3, seed=43),
    get_y=parent_label,
    item_tfms=Resize(224),
)
dls = actors.dataloaders(path); dls
<fastai.data.core.DataLoaders at 0x7f9e2dc0fa00>
learn = cnn_learner(dls, resnet18, metrics=[accuracy, error_rate])
learn
<fastai.learner.Learner at 0x7f9cde1651f0>
learn.fine_tune(4, freeze_epochs=4)
epoch train_loss valid_loss accuracy error_rate time
0 0.093747 0.474693 0.830116 0.169884 00:27
1 0.105986 0.514394 0.818533 0.181467 00:25
2 0.120812 0.532366 0.849421 0.150579 00:27
3 0.122049 0.643733 0.826255 0.173745 00:26
epoch train_loss valid_loss accuracy error_rate time
0 0.096170 0.538056 0.837838 0.162162 00:32
1 0.085646 0.545626 0.857143 0.142857 00:29
2 0.075069 0.524330 0.872587 0.127413 00:31
3 0.074985 0.526975 0.868726 0.131274 00:28
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()
../_images/Lecture2_85_1.png
interp.plot_top_losses(k=40, nrows=40)
../_images/Lecture2_86_0.png
interp
<fastai.interpret.ClassificationInterpretation at 0x7f9d1aaab9d0>
learn.export(fname='DoppelGanger.pkl')
# learn.export??
learn.dls.train.vocab
['Benicio_Del_Toro', 'Brad_Pitt', 'Javier_Bardem', 'Jeffrey_Dean_Morgan', 'Mercedes_Masohn', 'Sarah_Shahi']
cleaner = ImageClassifierCleaner(learn)
cleaner
# cleaner??
# interp.top_losses()
learn_inf = load_learner?

Model Inference

learn_inf = load_learner(fname='DoppelGanger.pkl'); learn_inf
<fastai.learner.Learner at 0x7f9e2cd25490>
im = Image.open(Path("Sarah_Shahi.jpg"))
im.to_thumb(128,128)
../_images/Lecture2_97_0.png
learn_inf.predict("Sarah_Shahi.jpg")
('Sarah_Shahi',
 tensor(5),
 tensor([3.1567e-09, 4.7201e-06, 5.2629e-09, 1.3037e-10, 1.9874e-05, 9.9998e-01]))

UI

btn_upload = widgets.FileUpload(); btn_upload
#hide
# For the book, we can't actually click an upload button, so we fake it
btn_upload = SimpleNamespace(data = ['Sarah_Shahi.jpg'])
img = PILImage.create(btn_upload.data[-1])
img.to_thumb(256,256)
../_images/Lecture2_103_0.png
#hide_output
out_pl = widgets.Output()
out_pl.clear_output()
with out_pl: display(img.to_thumb(128,128))
out_pl