[PyTorch] 4. Dataset & DataLoader
피라_노트
2022. 1. 28. 23:44
In [1]:
from IPython.display import Image
import numpy as np
import torch
from torch import nn
from torch import Tensor
In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    # Define how the initial data is set up
    def __init__(self, text, labels):
        self.labels = labels
        self.data = text

    # Total length of the dataset
    def __len__(self):
        return len(self.labels)

    # The form of the data returned for a given index: (X, y)
    def __getitem__(self, idx):
        label = self.labels[idx]
        text = self.data[idx]
        sample = {"Text": text, "Class": label}
        return sample
DataLoader class
- A class that creates batches from the data.
- A Dataset defines how to fetch a single sample; a DataLoader splits and groups those samples by batch size.
- Responsible for converting the data (to tensors) right before training, i.e. before feeding the GPU.
- Its main jobs are tensor conversion plus batch handling.
- It is also where parallel data preprocessing (overlapping CPU work with GPU work) needs to be considered; see the sketch below.
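For reference, a minimal sketch of the parallel-loading knobs (num_workers, pin_memory, and drop_last are standard DataLoader parameters; the dataset name here is just a placeholder):
In [ ]:
# Illustrative only: overlap CPU-side loading with GPU-side training.
loader = DataLoader(
    some_dataset,     # placeholder: any Dataset instance
    batch_size=64,
    shuffle=True,
    num_workers=4,    # worker processes preprocess batches in parallel on the CPU
    pin_memory=True,  # page-locked memory speeds up host-to-GPU copies
    drop_last=True,   # drop the final incomplete batch
)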
In [3]:
# Create the dataset
text = ['Happy', 'Amazing', 'Sad', 'Unhappy', 'Glum']
labels = ['Positive', 'Positive', 'Negative', 'Negative', 'Negative']
MyDataset = CustomDataset(text, labels)
In [4]:
type(MyDataset)
Out[4]:
__main__.CustomDataset
In [5]:
MyDataLoader = DataLoader(MyDataset, batch_size=2, shuffle=True)
next(iter(MyDataLoader))
Out[5]:
{'Text': ['Glum', 'Happy'], 'Class': ['Negative', 'Positive']}
In [7]:
next(iter(MyDataLoader))
Out[7]:
{'Text': ['Unhappy', 'Happy'], 'Class': ['Negative', 'Positive']}
In [16]:
for dataset in MyDataLoader:
    # with 5 samples and batch_size=2, the batches come out as 2, 2, 1
    print(dataset)
{'Text': ['Amazing', 'Happy'], 'Class': ['Positive', 'Positive']}
{'Text': ['Sad', 'Glum'], 'Class': ['Negative', 'Negative']}
{'Text': ['Unhappy'], 'Class': ['Negative']}
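Note that the batches above are plain Python lists because each sample is a string; when __getitem__ returns tensors, the default collate function stacks them into one batched tensor. A small sketch (the toy tensors are illustrative):
In [ ]:
import torch
from torch.utils.data import TensorDataset, DataLoader

xs = torch.arange(10, dtype=torch.float32).reshape(5, 2)  # 5 samples, 2 features
ys = torch.tensor([0, 0, 1, 1, 1])
loader = DataLoader(TensorDataset(xs, ys), batch_size=2)
x_batch, y_batch = next(iter(loader))
print(x_batch.shape)  # torch.Size([2, 2]): default collate stacked the tensors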
DataLoader(dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None, *, prefetch_factor=2, persistent_workers=False)
DataLoader reference: https://subinium.github.io/pytorch-dataloader/
collate_fn: often used to handle variable-length samples (e.g., padding sequences within a batch); a sketch follows below.
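For example, a minimal sketch of a custom collate_fn that pads variable-length sequences (pad_sequence is a real torch utility; the toy data is illustrative):
In [ ]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def pad_collate(batch):
    # batch: list of (sequence_tensor, label) pairs of differing lengths
    seqs, labels = zip(*batch)
    padded = pad_sequence(seqs, batch_first=True)  # pad to the longest in the batch
    return padded, torch.tensor(labels)

toy = [(torch.tensor([1, 2, 3]), 0), (torch.tensor([4, 5]), 1)]
loader = DataLoader(toy, batch_size=2, collate_fn=pad_collate)
x, y = next(iter(loader))  # x has shape (2, 3), padded with zeros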
Training on the NotMNIST data
- including automating the NotMNIST download
In [2]:
from torchvision.datasets import VisionDataset
from typing import Any, Callable, Dict, List, Optional, Tuple
import os
import sys
from tqdm import tqdm
from pathlib import Path
import requests
from skimage import io, transform
import matplotlib.pyplot as plt
In [3]:
import tarfile

class NotMNIST(VisionDataset):
    resource_url = 'http://yaroslavvb.com/upload/notMNIST/notMNIST_large.tar.gz'

    def __init__(
        self,
        root: str,
        train: bool = True,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,  # let the user decide whether to download
    ) -> None:
        super(NotMNIST, self).__init__(root, transform=transform,
                                       target_transform=target_transform)

        if not self._check_exists() or download:  # download if the data is missing or explicitly requested
            self.download()

        self.data, self.targets = self._load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        image_name = self.data[index]
        image = io.imread(image_name)
        label = self.targets[index]
        if self.transform:  # read the image and pass it through the transform
            image = self.transform(image)
        return image, label

    def _load_data(self):
        filepath = self.image_folder
        data = []
        targets = []
        for target in os.listdir(filepath):  # build the file list from this path
            filenames = [os.path.abspath(
                os.path.join(filepath, target, x)) for x in os.listdir(
                    os.path.join(filepath, target))]
            targets.extend([target] * len(filenames))
            data.extend(filenames)
        return data, targets

    @property
    def raw_folder(self) -> str:
        return os.path.join(self.root, self.__class__.__name__, 'raw')

    @property
    def image_folder(self) -> str:
        return os.path.join(self.root, 'notMNIST_large')

    def download(self) -> None:
        os.makedirs(self.raw_folder, exist_ok=True)
        fname = self.resource_url.split("/")[-1]
        chunk_size = 1024

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/80.0.3987.122 Safari/537.36"
        }

        r = requests.get(self.resource_url, headers=headers)
        filesize = int(r.headers["Content-Length"])

        with requests.get(self.resource_url, stream=True, headers=headers) as r, open(
                os.path.join(self.raw_folder, fname), "wb") as f, tqdm(
                unit="B",           # unit string to be displayed
                unit_scale=True,    # let tqdm determine the scale (kilo, mega, ...)
                unit_divisor=1024,  # used when unit_scale is true
                total=filesize,     # the total iteration
                file=sys.stdout,    # default goes to stderr; display on the console instead
                desc=fname          # prefix displayed on the progress bar
        ) as progress:
            for chunk in r.iter_content(chunk_size=chunk_size):
                # download the file chunk by chunk
                datasize = f.write(chunk)
                # on each chunk, update the progress bar
                progress.update(datasize)

        # extract the archive
        self._extract_file(os.path.join(self.raw_folder, fname), target_path=self.root)

    def _extract_file(self, fname, target_path) -> None:
        if fname.endswith("tar.gz"):
            tag = "r:gz"
        elif fname.endswith("tar"):
            tag = "r:"
        tar = tarfile.open(fname, tag)
        tar.extractall(path=target_path)
        tar.close()

    def _check_exists(self) -> bool:
        return os.path.exists(self.raw_folder)
In [ ]:
dataset = NotMNIST("data", download=True)
In [ ]:
fig = plt.figure()

for i in range(8):
    sample = dataset[i]

    ax = plt.subplot(1, 4, i + 1)
    plt.tight_layout()
    ax.set_title('Sample #{}'.format(i))
    ax.axis('off')
    plt.imshow(sample[0])

    if i == 3:
        plt.show()
        break
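As a follow-up, a minimal, untested sketch of feeding NotMNIST into a DataLoader for training. Note that __getitem__ above applies only self.transform, so the string labels ('A'..'J', the notMNIST class folders) pass through unchanged; mapping them to integers is done here in a custom collate_fn (an illustrative choice, not from the original post):
In [ ]:
import torch
from torchvision import transforms

def notmnist_collate(batch):
    images, labels = zip(*batch)
    images = torch.stack(images)                                # (B, 1, H, W)
    labels = torch.tensor([ord(l) - ord('A') for l in labels])  # 'A'..'J' -> 0..9
    return images, labels

dataset = NotMNIST("data", transform=transforms.ToTensor())
loader = DataLoader(dataset, batch_size=128, shuffle=True,
                    collate_fn=notmnist_collate)
images, labels = next(iter(loader))  # ready to feed into a model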