├── .gitignore ├── 1958-perceptron ├── README.md ├── binary_adaline_minibatch.py ├── binary_multilayer_minibatch.py └── iris.data ├── LICENSE ├── README.md ├── classifiers └── xgb │ ├── iris.ipynb │ ├── requirements.txt │ └── train.py ├── convnet ├── README ├── config.py ├── sweep.yml ├── train.py ├── train_wandb.py └── trainer.py ├── datasets └── upload_iris.py ├── preprocessing ├── README.md ├── multi_file_example.py ├── save_v_mmap.py └── single_file_example.py ├── requirements.txt └── thoughts ├── README ├── bench ├── dataloader │ ├── Makefile │ ├── load_from_disk.py │ ├── make_example_tensor.py │ ├── naive.py │ └── preallocate_on_gpu.py └── preprocessing │ ├── .gitignore │ ├── convert_imagenet_to_hdf5.py │ ├── hdf5_dataloader.py │ ├── hdf5_handwritten_loader.py │ ├── load_hf_imagenet_1k.py │ ├── load_imagefolder.py │ ├── np_memmap_loader.py │ └── read_hdf5.py └── view_model.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | source_* 163 | data 164 | wandb -------------------------------------------------------------------------------- /1958-perceptron/README.md: -------------------------------------------------------------------------------- 1 | Rosenblatt’s perceptron is built around a nonlinear 2 | neuron, namely, the McCulloch–Pitts model of a neuron. 3 | 4 | The perceptron + adaline first introduced the idea that we 5 | could learn simple linear functions based on data. It was 6 | able to achieve impressive results (at the time) for 7 | solving binary classification problems. 8 | 9 | The iris dataset is a classic starting point for anyone 10 | starting off with classification tasks because it has only 11 | 4 features and 3 output classes. 12 | 13 | https://archive.ics.uci.edu/ml/machine-learning-databases/iris/ 14 | 15 | In my implementation of adaline, I modified the dataset so 16 | that one of the classes is excluded as to turn the problem 17 | back into a binary classification task. 18 | 19 | For the sake of comparison, I've also included 20 | implementations for MLP and multiclass classification 21 | 22 | - Data Normalization 23 | - Initialization for weights and biases 24 | - Minibatch vs. full batch 25 | -------------------------------------------------------------------------------- /1958-perceptron/binary_adaline_minibatch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import pandas as pd 4 | from pathlib import Path 5 | 6 | current_dir = Path(__file__).parent 7 | 8 | # 0. 
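# The block below drops the Iris-virginica rows so the task stays binary,
# keeps sepal_width and petal_length as the only two features, standardizes
# them to zero mean / unit variance, and encodes the label as
# 1 = Iris-versicolor, 0 = Iris-setosa.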
Prepare data 9 | df = pd.read_csv( 10 | current_dir / "iris.data", 11 | names=["sepal_length", "sepal_width", "petal_length", "petal_width", "class"] 12 | ) 13 | df = df.loc[df["class"] != "Iris-virginica"] 14 | labels = df.iloc[:,4] 15 | 16 | xs = torch.from_numpy(df.iloc[:,:4][["sepal_width", "petal_length"]].to_numpy()).float() 17 | 18 | # center data around mean 19 | xs = (xs - xs.mean(dim=0)) / xs.std(dim=0) 20 | ys = torch.tensor(list(map(lambda x: 1 if x == 'Iris-versicolor' else 0, labels)), dtype=torch.long) 21 | 22 | # split data into train and test 23 | from sklearn.model_selection import train_test_split 24 | xs, xs_test, ys, ys_test = train_test_split(xs, ys, test_size=0.3, random_state=42) 25 | 26 | from torch.utils.data import Dataset, DataLoader 27 | class Data(Dataset): 28 | 29 | def __init__(self, X_train, y_train): 30 | self.X = X_train 31 | self.y = y_train 32 | self.len = self.X.shape[0] 33 | 34 | def __getitem__(self, index): 35 | return self.X[index], self.y[index] 36 | 37 | def __len__(self): 38 | return self.len 39 | 40 | traindata = Data(xs, ys) 41 | 42 | # 1. Define the model 43 | class Perceptron(nn.Module): 44 | def __init__(self, input_dim): 45 | super(Perceptron, self).__init__() 46 | self.linear = nn.Linear(input_dim, 1, dtype=torch.float) 47 | self.linear.weight.detach().zero_() 48 | self.linear.bias.detach().zero_() 49 | 50 | def forward(self, x_in): 51 | x = self.linear(x_in) 52 | x = x.view(-1) 53 | return x 54 | 55 | # 2. Instantiate the model with hyperparameters 56 | model = Perceptron(input_dim=xs.shape[1]) 57 | 58 | # 3. define loss function 59 | criterion = nn.MSELoss() 60 | 61 | # 4. Instantiate the optimizer 62 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 63 | 64 | BATCH_SIZE = 10 65 | 66 | trainloader = DataLoader( 67 | traindata, 68 | batch_size=BATCH_SIZE, 69 | shuffle=True, 70 | num_workers=1 71 | ) 72 | 73 | 74 | # 5. Iterate through the dataset 75 | for epoch in range(20): 76 | 77 | running_loss = 0.0 78 | for i, data in enumerate(trainloader, 0): 79 | inputs, labels = data 80 | 81 | # Forward pass 82 | y_pred = model(inputs) 83 | 84 | # Compute Loss 85 | loss = criterion(y_pred, labels.float()) 86 | 87 | # Zero gradients 88 | optimizer.zero_grad() 89 | 90 | # perform a backward pass (backpropagation) 91 | loss.backward() 92 | 93 | # Update the parameters 94 | optimizer.step() 95 | 96 | with torch.no_grad(): 97 | eval_preds = model(xs_test) 98 | eval_loss = criterion(eval_preds, ys_test.float()) 99 | 100 | print(f'[{epoch + 1:3d} {i + 1:3d}] minibatch loss: {loss.item():.5f} eval loss: {eval_loss.item():.5f}') 101 | 102 | # 6. Make predictions 103 | with torch.no_grad(): 104 | y_pred = model(xs_test) 105 | ones = torch.ones(ys_test.size()) 106 | zeros = torch.zeros(ys_test.size()) 107 | test_acc = torch.mean((torch.where(y_pred > 0.5, ones, zeros).int() == ys_test).float()) 108 | print(f"Test accuracy: {test_acc:.5f}") -------------------------------------------------------------------------------- /1958-perceptron/binary_multilayer_minibatch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import pandas as pd 4 | from pathlib import Path 5 | 6 | current_dir = Path(__file__).parent 7 | 8 | # 0. 
Prepare data 9 | df = pd.read_csv( 10 | current_dir / "iris.data", 11 | names=["sepal_length", "sepal_width", "petal_length", "petal_width", "class"] 12 | ) 13 | df = df.loc[df["class"] != "Iris-virginica"] 14 | labels = df.iloc[:,4] 15 | 16 | xs = torch.from_numpy(df.iloc[:,:4][["sepal_width", "petal_length"]].to_numpy()).float() 17 | 18 | # center data around mean 19 | xs = (xs - xs.mean(dim=0)) / xs.std(dim=0) 20 | ys = torch.tensor(list(map(lambda x: 1 if x == 'Iris-versicolor' else 0, labels)), dtype=torch.long) 21 | 22 | # split data into train and test 23 | from sklearn.model_selection import train_test_split 24 | xs, xs_test, ys, ys_test = train_test_split(xs, ys, test_size=0.3, random_state=42) 25 | 26 | from torch.utils.data import Dataset, DataLoader 27 | class Data(Dataset): 28 | 29 | def __init__(self, X_train, y_train): 30 | self.X = X_train 31 | self.y = y_train 32 | self.len = self.X.shape[0] 33 | 34 | def __getitem__(self, index): 35 | return self.X[index], self.y[index] 36 | 37 | def __len__(self): 38 | return self.len 39 | 40 | traindata = Data(xs, ys) 41 | 42 | # 1. Define the model 43 | class Perceptron(nn.Module): 44 | def __init__(self, input_dim, output_dim): 45 | super(Perceptron, self).__init__() 46 | self.sequential = nn.Sequential( 47 | nn.Linear(input_dim, 25, dtype=torch.float), 48 | nn.Sigmoid(), 49 | nn.Linear(25, output_dim, dtype=torch.float), 50 | ) 51 | 52 | def forward(self, x_in): 53 | x = self.sequential(x_in) 54 | return x.view(-1) 55 | 56 | # 2. Instantiate the model with hyperparameters 57 | 58 | model = Perceptron(input_dim=xs.shape[1], output_dim=1) 59 | 60 | # 3. Instantiate the loss 61 | criterion = nn.MSELoss() 62 | 63 | # 4. Instantiate the optimizer 64 | optimizer = torch.optim.SGD(model.parameters(), lr=0.1) 65 | # optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4) 66 | 67 | BATCH_SIZE = 10 68 | 69 | trainloader = DataLoader( 70 | traindata, 71 | batch_size=BATCH_SIZE, 72 | shuffle=True, 73 | num_workers=1 74 | ) 75 | 76 | # 5. Iterate through the dataset 77 | for epoch in range(20): 78 | 79 | running_loss = 0.0 80 | for i, data in enumerate(trainloader, 0): 81 | inputs, labels = data 82 | 83 | # Forward pass 84 | y_pred = model(inputs) 85 | 86 | # Compute Loss 87 | loss = criterion(y_pred, labels.float()) 88 | 89 | # Zero gradients 90 | optimizer.zero_grad() 91 | 92 | # perform a backward pass (backpropagation) 93 | loss.backward() 94 | 95 | # Update the parameters 96 | optimizer.step() 97 | 98 | with torch.no_grad(): 99 | eval_preds = model(xs_test) 100 | eval_loss = criterion(eval_preds, ys_test.float()) 101 | 102 | print(f'[{epoch + 1:3d} {i + 1:3d}] minibatch loss: {loss.item():.5f} eval loss: {eval_loss.item():.5f}') 103 | 104 | # 6. 
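# Note on the 0.5 threshold used below: the network ends in a plain Linear layer
# with no output activation, so model(xs_test) returns raw scores that were
# regressed toward the 0/1 targets with MSE; 0.5 is simply the midpoint between
# those targets, not a calibrated probability cutoff.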
Make predictions 105 | with torch.no_grad(): 106 | y_pred = model(xs_test) 107 | ones = torch.ones(ys_test.size()) 108 | zeros = torch.zeros(ys_test.size()) 109 | test_acc = torch.mean((torch.where(y_pred > 0.5, ones, zeros).int() == ys_test).float()) 110 | print(f"Test accuracy: {test_acc:.5f}") -------------------------------------------------------------------------------- /1958-perceptron/iris.data: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 
5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 bocchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | NOTE: paused work on this until i can do quick, expressive training runs 3 | ``` 4 | 5 | # papers 6 | Now that I have a 3090, I think it'll be an interesting 7 | exercise to go through key papers in deep learning 8 | history. 9 | 10 | In an effort to cover my bases, I have the deep learning 11 | book with me. 12 | 13 | This is not a repo about the contents of these papers, 14 | instead it's a log of things I personally learned along 15 | the way. Everything will be done in `torch`. 16 | 17 | ![](https://github.com/hitorilabs/papers/assets/131238467/52a1e456-dd13-402a-a2ce-3c8fb35105cb) 18 | *Deep Learning (Ian J. Goodfellow, Yoshua Bengio and Aaron Courville), MIT Press, 2016.* 19 | 20 | ## Setup 21 | 22 | ```bash 23 | pip install torch numpy pandas 24 | ``` 25 | 26 | - cybernetics + "model of a neuron" (McCulloch and Pitts, 1943; Hebb, 1949) 27 | - perceptron (Rosenblatt, 1958) 28 | - adaptive linear element (ADALINE) 29 | - back-propagation (Rumelhart et al., 1986) 30 | - deep learning (Hinton et al., 2006; Bengio et al., 2007; Ranzato et al., 2007) 31 | 32 | Important Papers: 33 | - Paper linked in [PyTorch SGD Implementation](https://pytorch.org/docs/stable/generated/torch.optim.SGD.html) - Nesterov momentum [On the importance of initialization and momentum in deep learning](http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf) 34 | - AdamW (the goal is not exactly convergence) 35 | - Early Stopping https://github.com/Bjarten/early-stopping-pytorch# 36 | 37 | # 1958 - perceptron + adaline 38 | 39 | The usecase was mostly for simple binary classifiers. To 40 | demonstrate the perceptron + adaline: 41 | - use a single linear layer 42 | - zero initialized weights + biases 43 | - stochastic gradient descent (SGD) optimizer 44 | - mean squared error (MSE) loss 45 | 46 | This is probably the simplest form of backpropagation 47 | 48 | Deep learning was heavily inspired by the brain, but most 49 | advancements were made by engineering. 50 | 51 | - 1975 - 1980 introduced the neocognitron 52 | - 1986 - connectionism / parallel distributed processing https://stanford.edu/~jlmcc/papers/PDP/Chapter1.pdf 53 | - distributed representation https://web.stanford.edu/~jlmcc/papers/PDP/Chapter3.pdf 54 | 55 | > This is the idea that each input to a system should be represented by many features, and each feature should be involved in the representation of many possible inputs 56 | 57 | 1990s progress in modeling sequences with neural networks. 58 | 59 | - Hochreiter (1991) and Bengio et al. (1994) identified some of thge fundamental mathematical difficulties in modeling long sequences. 60 | - Hochreiter and Schmidhuber (1997) introduced long short-term memory (LSTM) network to resolve some difficulties. 61 | - Kernel machines (Boser et al., 199; Cortes and Vapnik, 1995; Scholkopf et al., 1999) and graphical models (Jordan, 1998) achieved good results on many important tasks. (led to a decline in popularity with neural networks) 62 | - Canadian Institute for Advanced Research (CIFAR) played a key role in keeping neural network research alive. 
This united machine learning groups led by Geoffrey Hinton, Yoshua Bengio, Yann LeCun. 63 | 64 | 1. Perceptron (Rosenblatt, 1958, 1962) 65 | 2. Adaptive linear element (Widrow and Hoff, 1960) 66 | 3. Neocognitron (Fukushima, 1980) 67 | 4. Early back-propagation network (Rumelhart et al., 1986b) 68 | 5. Recurrent neural network for speech recognition (Robinson and Fallside, 1991) 69 | 6. Multilayer perceptron for speech recognition (Bengio et al., 1991) 70 | 7. Mean field sigmoid belief network (Saul et al., 1996) 71 | 8. LeNet-5 (LeCun et al., 1998b) 72 | 9. Echo state network (Jaeger and Haas, 2004) 73 | 10. Deep belief network (Hinton et al., 2006) 74 | 11. GPU-accelerated convolutional network (Chellapilla et al., 2006) 75 | 12. Deep Boltzmann machine (Salakhutdinov and Hinton, 2009a) 76 | 13. GPU-accelerated deep belief network (Raina et al., 2009) 77 | 14. Unsupervised convolutional network (Jarrett et al., 2009) 78 | 15. GPU-accelerated multilayer perceptron (Ciresan et al., 2010) 79 | 16. OMP-1 network (Coates and Ng, 2011) 80 | 17. Distributed autoencoder (Le et al., 2012) 81 | 18. Multi-GPU convolutional network (Krizhevsky et al., 2012) 82 | 19. COTS HPC unsupervised convolutional network (Coates et al., 2013) 83 | 20. GoogLeNet (Szegedy et al., 2014a) 84 | 85 | LSTMs were thought to revolutionize machine translation 86 | (Sutskever et al., 2014; Bahdanau et al., 2015) when the 87 | book was published back in 2016 88 | -------------------------------------------------------------------------------- /classifiers/xgb/iris.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 136, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Map: 100%|██████████| 15/15 [00:00<00:00, 4549.47 examples/s]\n", 13 | "Map: 100%|██████████| 135/135 [00:00<00:00, 11127.44 examples/s]" 14 | ] 15 | }, 16 | { 17 | "name": "stdout", 18 | "output_type": "stream", 19 | "text": [ 20 | "94.07% (127/135)\n" 21 | ] 22 | }, 23 | { 24 | "name": "stderr", 25 | "output_type": "stream", 26 | "text": [ 27 | "\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "from datasets import load_dataset\n", 33 | "import numpy as np\n", 34 | "import xgboost as xgb\n", 35 | "\n", 36 | "dataset = load_dataset(\n", 37 | " \"hitorilabs/iris\", \n", 38 | " split=\"train\", \n", 39 | " )\n", 40 | "\n", 41 | "# keep the int2str mapping to retrieve string labels\n", 42 | "itos = dataset.features[\"species\"].int2str\n", 43 | "\n", 44 | "dataset = dataset.train_test_split(test_size=0.9, stratify_by_column=\"species\")\n", 45 | "X_train = dataset[\"train\"].map(remove_columns=[\"species\"]).to_pandas().to_numpy()\n", 46 | "y_train = np.array(dataset[\"train\"][\"species\"])\n", 47 | "\n", 48 | "X_test = dataset[\"test\"].map(remove_columns=[\"species\"]).to_pandas().to_numpy()\n", 49 | "y_test = np.array(dataset[\"test\"][\"species\"])\n", 50 | "\n", 51 | "# Create DMatrix for train and test\n", 52 | "dtrain = xgb.DMatrix(X_train, label=y_train)\n", 53 | "dtest = xgb.DMatrix(X_test, label=y_test)\n", 54 | "\n", 55 | "NUM_CLASSES = 3\n", 56 | "# Set hyperparameters\n", 57 | "params = {\n", 58 | " 'objective': 'multi:softprob',\n", 59 | " 'max_depth': 15,\n", 60 | " 'learning_rate': 0.1,\n", 61 | " 'num_class': NUM_CLASSES,\n", 62 | "}\n", 63 | "\n", 64 | "# Train the model\n", 65 | "num_rounds = 100\n", 66 | "bst = xgb.train(params, dtrain, num_rounds)\n", 67 | "\n", 68 | 
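    "# With objective='multi:softprob', bst.predict(dtest) returns an (n_samples, num_class) array of class probabilities, hence the argmax over axis=1 below.\n",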
"# Make predictions\n", 69 | "preds = bst.predict(dtest)\n", 70 | "\n", 71 | "acc = sum(dataset[\"test\"][\"species\"] == preds.argmax(axis=1)) / len(dataset[\"test\"])\n", 72 | "\n", 73 | "print(f\"\"\"{acc:.2%} ({sum(dataset[\"test\"][\"species\"] == preds.argmax(axis=1))}/{len(dataset[\"test\"])})\"\"\")" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 126, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfIAAAG1CAYAAAACzHYEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAA6EUlEQVR4nO3de3wU9b3/8fdsLpsA2UCA3GRBBASRgBUB89Mil0gAi1BovR8BPaCegEK8NRUFrQpFj+IlorUIthpRUVCxgkBNUipRCFKgRxEiSqwkXKqEBLMJyfz+QLauiZjNbrKXeT15zIPMd2a+89lu5ZPPd74zY5imaQoAAIQkW6ADAAAAzUciBwAghJHIAQAIYSRyAABCGIkcAIAQRiIHACCEkcgBAAhhJHIAAEIYiRwAgBBGIgcAIISRyENMbm6uTj/9dMXExGjIkCH68MMPAx0SWkBhYaHGjRun1NRUGYahVatWBToktJD58+dr0KBBiouLU2JioiZMmKBdu3YFOiyEEBJ5CHn55ZeVnZ2tuXPnauvWrRowYIAyMzN14MCBQIcGP6uqqtKAAQOUm5sb6FDQwgoKCpSVlaWioiKtW7dOtbW1GjVqlKqqqgIdGkKEwUtTQseQIUM0aNAgPfnkk5Kk+vp6OZ1OzZw5U7/5zW8CHB1aimEYWrlypSZMmBDoUNAKDh48qMTERBUUFGjo0KGBDgchgIo8RNTU1Ki4uFgZGRnuNpvNpoyMDG3atCmAkQHwpyNHjkiSEhISAhwJQgWJPEQcOnRIdXV1SkpK8mhPSkpSWVlZgKIC4E/19fWaNWuWLrjgAvXr1y/Q4SBERAY6AADACVlZWdq5c6c2btwY6FAQQkjkIaJTp06KiIhQeXm5R3t5ebmSk5MDFBUAf5kxY4ZWr16twsJCdenSJdDhIIQwtB4ioqOjNXDgQG3YsMHdVl9frw0bNig9PT2AkQHwhWmamjFjhlauXKm//vWv6t69e6BDQoihIg8h2dnZmjx5ss477zwNHjxYixYtUlVVlaZOnRro0OBnlZWV2rNnj3t979692rZtmxISEtS1a9cARgZ/y8rKUl5ent544w3FxcW557zEx8crNjY2wNEhFHD7WYh58skn9dBDD6msrEznnHOOHn/8cQ0ZMiTQYcHP8vPzNXz48AbtkydP1rJly1o/ILQYwzAabV+6dKmmTJnSusEgJJHIAQAIYVwjBwAghJHIAQAIYSRyAABCGIkcAIAQRiIHACCEkcgBAAhhJPIQ5HK5NG/ePLlcrkCHghbGd20dfNdoLhJ5CHK5XLr33nv5D94C+K6tg+86vC1YsECGYWjWrFnutmHDhskwDI/lxhtv9LpvHtEKAEAL2rx5s5555hn179+/wbZp06bpvvvuc6+3adPG6/6pyAEAaCGVlZW6+uqr9eyzz6pDhw4Ntrdp00bJycnuxeFweH2OkK7I6+vr9dVXXykuLu5Hn1ccjioqKjz+Rvjiu7YOq37Xpmnq6NGjSk1Nlc3WcrVldXW1ampqfO7HNM0G+cZut8tutze6f1ZWli655BJlZGTo/vvvb7D9xRdf1AsvvKDk5GSNGzdOd999t9dVeUgn8q+++kpOpzPQYQSMlT+71fBdW4dVv+vS0tIWew97dXW1YuM6SseP+dxXu3btVFlZ6dE2d+5czZs3r8G+y5cv19atW7V58+ZG+7rqqqvUrVs3paamavv27brzzju1a9cuvf76617FFNKJPC4uTpIU3XeyjIjoAEeDlrYv/+FAhwDAz45WVKhnd6f73/OWUFNTIx0/JvvZUyVfckVdjSr/uVSlpaUeQ+CNVeOlpaW65ZZbtG7dOsXExDTa3fTp090/p6WlKSUlRSNHjlRJSYl69OjR5LBCOpGfHN4wIqJJ5BbQnGtHAEJDq1we9TFXnHxVqMPh+Ml/j4qLi3XgwAGde+657ra6ujoVFhbqySeflMvlUkREhMcxJ19JvWfPHuskcgAAmsyQ5MsvDF4cOnLkSO3YscOjberUqerTp4/uvPPOBklckrZt2yZJSklJ8SosEjkAwBoM24nFl+ObKC4uTv369fNoa9u2rTp27Kh+/fqppKREeXl5Gjt2rDp27Kjt27dr9uzZGjp0aKO3qZ0KiRwAgFYWHR2t9evXa9GiRaqqqpLT6dSkSZM0Z84cr/sikQMArMEwfBxa9+06fn5+vvtnp9OpgoICn/o7iUQOALCGVhxab03BGRUAAGgSKnIAgDUEeGi9pZDIAQAW4ePQepAOYgdnVAAAoEmoyAEA1sDQOgAAISxMZ62TyAEA1hCmFXlw/noBAACahIocAGANDK0DABDCGFoHAADBhoocAGANDK0DABDCDMPHRM7QOgAA8DMqcgCANdiME4svxwchEjkAwBrC9Bp5cEYFAACahIocAGANYXofOYkcAGANYTq0TiIHAFhDmFbkwfnrBQAAaBIqcgCANTC0DgBACGNoHQAABBsqcgCANTC0DgBACGNoHQAABBsqcgCARfg4tB6ktS+JHABgDQytAwCA5liwYIEMw9CsWbPcbdXV1crKylLHjh3Vrl07TZo0SeXl5V73TSIHAFiDYfxn5nqzluZV5Js3b9Yzzzyj/v37e7TPnj1bb731ll599VUVFBToq6++0sSJE73un0QOALAGn5J4866vV1ZW6uqrr9azzz6rDh06uNuPHDmiJUuW6JFHHtGIESM0cOBALV26VO+//76Kioq8OgeJHABgDSevkfuySKqoqPBYXC7Xj54yKytLl1xyiTIyMjzai4uLVVtb69Hep08fde3aVZs2bfLqY5HIAQDwgtPpVHx8vHuZP39+o/stX75cW7dubXR7WVmZoqOj1b59e4/2pKQklZWVeRUPs9YBANbgpye7lZaWyuFwuJvtdnuDXUtLS3XLLbdo3bp1iomJaf45m4CKHABgDX4aWnc4HB5LY4m8uLhYBw4c0LnnnqvIyEhFRkaqoKBAjz/+uCIjI5WUlKSamhp98803HseVl5crOTnZq49FRQ4AgJ+NHDlSO3bs8GibOnWq+vTpozvvvFNOp1NRUVHasGGDJk2aJEnatWuX9u3bp/T0
dK/ORSIHAFhDK740JS4uTv369fNoa9u2rTp27Ohuv/7665Wdna2EhAQ5HA7NnDlT6enpOv/8870Ki0QOALCGIHuy26OPPiqbzaZJkybJ5XIpMzNTTz31lNf9kMgBAGgF+fn5HusxMTHKzc1Vbm6uT/2SyAEAlmAYhowgqsj9hUQOALCEcE3k3H4GAEAIoyIHAFiD8d3iy/FBiEQOALCEcB1aJ5EDACwhXBM518gBAAhhVOQAAEsI14qcRB4ijpcX6/j+IkV06q+oLj+XJLl2r5RZ9ZXHfhEdz1aUc1gAIgSA4EYiR8DUHytX3eF/yojp2GBbRMe+ikwe/J8GW1QrRgYACLSguEaem5ur008/XTExMRoyZIg+/PDDQIcUNMy6GtV+sU6RzuFSRMNX5cmIlBHV9j9LRHTrBwkAocDwwxKEAp7IX375ZWVnZ2vu3LnaunWrBgwYoMzMTB04cCDQoQWF2i8LZXOcrog4Z6Pb677+VNU7lsj1yUuq/WqTzPraVo4QAELDyaF1X5ZgFPBE/sgjj2jatGmaOnWq+vbtq6efflpt2rTRc889F+jQAq7u690yvz2oyJTGX2kX0eFMRXW7WNE9Jygy8VzVfb1LtV+sb+UoAQCBFNBr5DU1NSouLlZOTo67zWazKSMjQ5s2bWqwv8vlksvlcq9XVFS0SpyBYNYcVe2//qboHpfKsDX+NUV2Ovs/K7Edpai2qi15Q/WuI7LZ41spUgAIDSfeYurLZDf/xeJPAU3khw4dUl1dnZKSkjzak5KS9MknnzTYf/78+br33ntbK7yAqj92UDr+rWp2vfK9VlN1VV+p7tAO2QfcKOMHL7m3tTnxv6PpOiKRyAHAgyFfh8eDM5OH1Kz1nJwcZWdnu9crKirkdDZ+7TjU2eK6KLr3FR5ttfv+KiOmvSITz22QxCXJ/PaQJMmIatMqMQIAAi+gibxTp06KiIhQeXm5R3t5ebmSk5Mb7G+322W3NzJzOwwZEdEyYn9wu5ktUkZEjGyxHVXvOqL6rz+VzdFNioiRWX1Yx/+1UUbbVNliOwUmaAAIYuF6H3lAJ7tFR0dr4MCB2rBhg7utvr5eGzZsUHp6egAjC36GYVPd0S9VU/Kmaj7J0/F//V229j0UfcYlgQ4NAIJTmN5+FvCh9ezsbE2ePFnnnXeeBg8erEWLFqmqqkpTp04NdGhBx97rl+6fjeg4j3UAwE/wsSI3g7QiD3giv/zyy3Xw4EHdc889Kisr0znnnKM1a9Y0mAAHAAAaCngil6QZM2ZoxowZgQ4DABDGfL1GHqwPhAmKRA4AQEsL10Qe8Ce7AQCA5qMiBwBYg68zz4OzICeRAwCsgaF1AAAQdKjIAQCWEK4VOYkcAGAJ4ZrIGVoHACCEUZEDACyBihwAgFDWyi9NWbx4sfr37y+HwyGHw6H09HS988477u3Dhg1z/3Jxcrnxxhu9/lhU5AAAS2jtirxLly5asGCBevXqJdM09fzzz2v8+PH66KOPdPbZZ0uSpk2bpvvuu899TJs2bbyOi0QOAEALGDdunMf6Aw88oMWLF6uoqMidyNu0aaPk5GSfzsPQOgDAEn44jN2cRZIqKio8FpfL9ZPnrqur0/Lly1VVVaX09HR3+4svvqhOnTqpX79+ysnJ0bFjx7z+XFTkAABL8NfQutPp9GifO3eu5s2b1+gxO3bsUHp6uqqrq9WuXTutXLlSffv2lSRdddVV6tatm1JTU7V9+3bdeeed2rVrl15//XWv4iKRAwDghdLSUjkcDve63W7/0X179+6tbdu26ciRI1qxYoUmT56sgoIC9e3bV9OnT3fvl5aWppSUFI0cOVIlJSXq0aNHk+MhkQMArMFPL005OQu9KaKjo9WzZ09J0sCBA7V582Y99thjeuaZZxrsO2TIEEnSnj17SOQAAPxQMNxHXl9f/6PX1Ldt2yZJSklJ8apPEjkAAC0gJydHY8aMUdeuXXX06FHl5eUpPz9fa9euVUlJifLy8jR27Fh17NhR27dv1+zZszV06FD179/fq/OQyAEAltDaFfmBAwd07bXXav/+/YqPj1f//v21du1aXXzxxSotLdX69eu1aNEiVVVVyel0atKkSZozZ47XcZHIAQCWYMjHRO7lBfYlS5b86Dan06mCgoJmx/J9JHIAgCUEwzXylsADYQAACGFU5AAAa/DT7WfBhkQOALAEhtYBAEDQoSIHAFhCuFbkJHIAgCUYxonFl+ODEUPrAACEMCpyAIAlnKjIfRla92MwfkQiBwBYg49D68F6+xlD6wAAhDAqcgCAJTBrHQCAEBaus9ZJ5AAAS7DZDNlszc/Gpg/HtiSukQMAEMKoyAEAlsDQOgAAISxcJ7sxtA4AQAijIgcAWAJD6wAAhDCG1gEAQNChIgcAWEK4VuQkcgCAJYTrNXKG1gEACGFU5AAASzDk49B6kL7HlEQOALCEcB1aJ5EDACwhXCe7cY0cAIAQRkUOALAEhtYBAAhhDK0DAIAmW7x4sfr37y+HwyGHw6H09HS988477u3V1dXKyspSx44d1a5dO02aNEnl5eVen4dEDgCwhJND674s3ujSpYsWLFig4uJibdmyRSNGjND48eP1z3/+U5I0e/ZsvfXWW3r11VdVUFCgr776ShMnTvT6czG0DgCwhNYeWh83bpzH+gMPPKDFixerqKhIXbp00ZIlS5SXl6cRI0ZIkpYuXaqzzjpLRUVFOv/885t8HipyAAC8UFFR4bG4XK6fPKaurk7Lly9XVVWV0tPTVVxcrNraWmVkZLj36dOnj7p27apNmzZ5FU9YVOT78h+Ww+EIdBhoYQlXPBfoENCKdjx1ZaBDQCs4evTb1juZj7PWTz7Yzel0ejTPnTtX8+bNa/SQHTt2KD09XdXV1WrXrp1Wrlypvn37atu2bYqOjlb79u099k9KSlJZWZlXYYVFIgcA4Kf4a2i9tLTUo3i02+0/ekzv3r21bds2HTlyRCtWrNDkyZNVUFDQ7BgaQyIHAMALJ2ehN0V0dLR69uwpSRo4cKA2b96sxx57TJdffrlqamr0zTffeFTl5eXlSk5O9ioerpEDACyhtWetN6a+vl4ul0sDBw5UVFSUNmzY4N62a9cu7du3T+np6V71SUUOALCE1p61npOTozFjxqhr1646evSo8vLylJ+fr7Vr1yo+Pl7XX3+9srOzlZCQIIfDoZkzZyo9Pd2rGesSiRwAYBGt/YjWAwcO6Nprr9X+/fsVHx+v/v37a+3atbr44oslSY8++qhsNpsmTZokl8ulzMxMPfXUU17HRSIHAKAFLFmy5JTbY2JilJubq9zcXJ/OQyIHAFhCuD5rnUQOALCEcE3kzFoHACCEUZEDACyB95EDABDCGFoHAABBh4ocAGAJDK0DABDCGFoHAABBh4ocAGAJhnwcWvdbJP5FIgcAWILNMGTzIZP7cmxLIpEDACwhXCe7cY0cAIAQRkUOALCEcJ21TiIHAFiCzTix+HJ8MPJ6aH3NmjXauHGjez03N1fnnHOOrrrqKn399dd+DQ4AAJya14n89ttvV0VFhSR
px44duvXWWzV27Fjt3btX2dnZfg8QAAC/MP4zvN6cJVjvP/N6aH3v3r3q27evJOm1117TL37xCz344IPaunWrxo4d6/cAAQDwB2atfyc6OlrHjh2TJK1fv16jRo2SJCUkJLgrdQAA0Dq8rsgvvPBCZWdn64ILLtCHH36ol19+WZL06aefqkuXLn4PEAAAfzC+++PL8cHI64r8ySefVGRkpFasWKHFixfrtNNOkyS98847Gj16tN8DBADAH07OWvdlCUZeV+Rdu3bV6tWrG7Q/+uijfgkIAAA0ndcV+datW7Vjxw73+htvvKEJEybot7/9rWpqavwaHAAA/uLLjHVfHybTkrxO5DfccIM+/fRTSdJnn32mK664Qm3atNGrr76qO+64w+8BAgDgDydnrfuyBCOvE/mnn36qc845R5L06quvaujQocrLy9OyZcv02muv+Ts+AAD84uTbz3xZgpHXidw0TdXX10s6cfvZyXvHnU6nDh065N/oAADAKXk92e28887T/fffr4yMDBUUFGjx4sWSTjwoJikpye8BAgDgD+H6QBivE/miRYt09dVXa9WqVbrrrrvUs2dPSdKKFSv0//7f//N7gAAA+ANvP/tO//79PWatn/TQQw8pIiLCL0EBAICm8dtrTGNiYvzVFQAAfheuQ+teT3arq6vTww8/rMGDBys5OVkJCQkeCwAAwai1Z63Pnz9fgwYNUlxcnBITEzVhwgTt2rXLY59hw4Y1uFf9xhtv9O5zebW3pHvvvVePPPKILr/8ch05ckTZ2dmaOHGibDab5s2b5213AACEpYKCAmVlZamoqEjr1q1TbW2tRo0apaqqKo/9pk2bpv3797uXhQsXenUer4fWX3zxRT377LO65JJLNG/ePF155ZXq0aOH+vfvr6KiIt18883edgkAQIsz5Nsrxb09ds2aNR7ry5YtU2JiooqLizV06FB3e5s2bZScnNzsuLyuyMvKypSWliZJateunY4cOSJJ+sUvfqG333672YEAANCS/PWI1oqKCo/F5XI16fwn8+UPL0O/+OKL6tSpk/r166ecnBz3q8KbyutE3qVLF+3fv1+S1KNHD7377ruSpM2bN8tut3vbHQAAIcXpdCo+Pt69zJ8//yePqa+v16xZs3TBBReoX79+7varrrpKL7zwgt577z3l5OToz3/+s6655hqv4vF6aP2Xv/ylNmzYoCFDhmjmzJm65pprtGTJEu3bt0+zZ8/2tjsAAFqFr68iPXlsaWmpHA6Hu70pRWxWVpZ27typjRs3erRPnz7d/XNaWppSUlI0cuRIlZSUqEePHk2Ky+tEvmDBAvfPl19+ubp27apNmzapV69eGjdunLfdAQDQKvz1QBiHw+GRyH/KjBkztHr1ahUWFqpLly6n3HfIkCGSpD179rRcIv+h9PR0paen+9oNAAAtrjXvBTdNUzNnztTKlSuVn5+v7t27/+Qx27ZtkySlpKQ0+TxNSuRvvvlmkzu89NJLm7wvAADhKisrS3l5eXrjjTcUFxensrIySVJ8fLxiY2NVUlKivLw8jR07Vh07dtT27ds1e/ZsDR06VP3792/yeZqUyCdMmNCkzgzDUF1dXZNPDgBAa2ntZ62ffKnYsGHDPNqXLl2qKVOmKDo6WuvXr9eiRYtUVVUlp9OpSZMmac6cOV6dp0mJ/ORrSwEACFX+muzWVKZpnnK70+lUQUFB8wP6jte3nwEAgODR5ET+17/+VX379lVFRUWDbUeOHNHZZ5+twsJCvwYHAIC/+OuBMMGmyYl80aJFmjZtWqNT7uPj43XDDTfo0Ucf9WtwAAD4i+GHJRg1OZH/4x//0OjRo390+6hRo1RcXOyXoAAAQNM0+T7y8vJyRUVF/XhHkZE6ePCgX4ICAMDfmvMq0h8eH4yaXJGfdtpp2rlz549u3759u1c3sAMA0JoMw/clGDU5kY8dO1Z33323qqurG2z79ttvNXfuXP3iF7/wa3AAAODUmjy0PmfOHL3++us688wzNWPGDPXu3VuS9Mknnyg3N1d1dXW66667WixQAAB80doPhGktTU7kSUlJev/993XTTTcpJyfHfaO7YRjKzMxUbm6ukpKSWixQAAB84evweJDmce9emtKtWzf95S9/0ddff609e/bINE316tVLHTp0aKn4gLB3vOQ9Hf8sX+axQ5Ikw5GqqLMuVURymiSpvvKAane8ovpDu6X644pI6qeoc66SERMfyLDhR2X7/6WFv7tbhX99V99+e0zdTu+h3z/2tNLOGRjo0MKK5Se7fV+HDh00aNAgDR482KckXlhYqHHjxik1NVWGYWjVqlXN7gsIVUZsB0X1myT7iHtkH3G3IjqfpZr3n1B9xb9kHnepZuMjkgzZh94u+7AcmfXH5Xr/CZkmj04OB0e++VqXjxupqKhILclbqTWFW5Vz73w52lMgoWl8fo2pL6qqqjRgwABdd911mjhxYiBDAQImIvUcj3Vbv4k6/tl7qj/8mYxvv5ZZdUj2kXNlRMVKkqIHXa/qN29W/YFPFJHUNwARw5+eeeIRpaR20e8f+4O7zdnt9MAFFMYYWm8BY8aM0ZgxYwIZAhBUTLNedV9ulupqZOvYQ2blgRP/eti+95+qLUoyDNUf3k0iDwMb3n1bPx+WoRn/fbU+fH+jklJSdPWU6briv64LdGhhx/KT3YKBy+WSy+Vyrzf23HcgFNUf+VKu9x6U6mulSLuiz8+SzZEq0x4nRdhVu3OFos4+MWpVu3OFZNbLrD4S4KjhD6Vf7FXe88/quhtm6qZbbteOj4r1uzm3KTo6WhMvvybQ4SEEhFQinz9/vu69995AhwH4nRGXLHvGXKn2W9X9q1g1W5bIftGdsjlSFX3+jar96AVV79kgGYYinINltO+m4H3yM7xh1ter34Bzddtd90mSzk47R59+8n/Ke/6PJHI/s8m3V34G6+tCm5TI33zzzSZ3eOmllzY7mJ+Sk5Oj7Oxs93pFRYWcTmeLnQ9oLYYtUka7E7dv2jqcrvp/79XxPesVfe61ikjqp4jRC2S6jkpGhIzoNvp29WwZXQYHOGr4Q+ekZPU8s49HW48ze2vt26sCE1AYs/TQ+oQJE5rUmWEYqqur8yWeU7Lb7bLb7S3WPxA8zBPD7N9j2OMkSXUHPpZcRxtMkkNoGjgoXXtLdnu07S3Zo9QuXQMUEUJNkxJ5fT23uQAtpXbna7Il9ZPRpqN0vFp1pR+o/uAuRV84W5J0/PONssWlSPY41f+7RLX/eEmRvS6WLS45wJHDH6beMEOX/WKEnlq0UGPHT9L2rVv08p+f0/0PPxno0MKOYUg2Zq37V2Vlpfbs2eNe37t3r7Zt26aEhAR17cpvo7AG01Wh2i1LTkxei4qVzdFF0RfOVkTS2Se2Hy2Ta+drUk2VjLadFNn7EkX2GhXgqOEv/X92np5aulwPPzBXTz4yX86up+uu3y3U+F9dEejQwo7Nx0Tuy7EtqVmJvKqqSgUFBdq3b59qamo8tt18881N7mfLli0aPny4e/3k9e/Jkydr2bJlzQkNCDnRA6eecn
tU2q8UlfarVooGgTBi1FiNGDU20GEgRHmdyD/66CONHTtWx44dU1VVlRISEnTo0CG1adNGiYmJXiXyYcOGuZ/ZDgBASwrXyW5ez6afPXu2xo0bp6+//lqxsbEqKirSF198oYEDB+rhhx9uiRgBAPDZyaF1X5Zg5HUi37Ztm2699VbZbDZFRETI5XLJ6XRq4cKF+u1vf9sSMQIA4LOTj2j1ZQlGXifyqKgo2WwnDktMTNS+ffskSfHx8SotLfVvdAAA4JS8vkb+s5/9TJs3b1avXr100UUX6Z577tGhQ4f05z//Wf369WuJGAEA8BmvMf3Ogw8+qJSUFEnSAw88oA4dOuimm27SwYMH9Yc//OEnjgYAIDBsfliCkdcV+Xnnnef+OTExUWvWrPFrQAAAoOlC6qUpAAA0F+8j/0737t1PeS/dZ5995lNAAAC0BJt8vEYepG8c9DqRz5o1y2O9trZWH330kdasWaPbb7/dX3EBAIAm8DqR33LLLY225+bmasuWLT4HBABASwjXoXW/TcIbM2aMXnvtNX91BwCAX7X2k93mz5+vQYMGKS4uTomJiZowYYJ27drlsU91dbWysrLUsWNHtWvXTpMmTVJ5ebl3n8u7sH7cihUrlJCQ4K/uAAAIaQUFBcrKylJRUZHWrVun2tpajRo1SlVVVe59Zs+erbfeekuvvvqqCgoK9NVXX2nixIlenadZD4T5/mQ30zRVVlamgwcP6qmnnvK2OwAAWsWJ95H78tKUE39XVFR4tNvtdtnt9gb7//D27GXLlikxMVHFxcUaOnSojhw5oiVLligvL08jRoyQJC1dulRnnXWWioqKdP755zcpLq8T+fjx4z0Suc1mU+fOnTVs2DD16dPH2+4AAGgV/rpG7nQ6Pdrnzp2refPm/eTxR44ckST36HVxcbFqa2uVkZHh3qdPnz7q2rWrNm3a1HKJvCnBAgAQbHx9g9nJY0tLS+VwONztjVXjP1RfX69Zs2bpggsucD/OvKysTNHR0Wrfvr3HvklJSSorK2tyXF4n8oiICO3fv1+JiYke7YcPH1ZiYqLq6uq87RIAgJDhcDg8EnlTZGVlaefOndq4caPf4/F6sptpmo22u1wuRUdH+xwQAAAtwfDDn+aYMWOGVq9erffee09dunRxtycnJ6umpkbffPONx/7l5eVKTk5ucv9Nrsgff/xxSZJhGPrjH/+odu3aubfV1dWpsLCQa+QAgKDlr6H1pjJNUzNnztTKlSuVn5+v7t27e2wfOHCgoqKitGHDBk2aNEmStGvXLu3bt0/p6elNPk+TE/mjjz7qDuzpp59WRESEe1t0dLROP/10Pf30000+MQAA4SwrK0t5eXl64403FBcX577uHR8fr9jYWMXHx+v6669Xdna2EhIS5HA4NHPmTKWnpzd5opvkRSLfu3evJGn48OF6/fXX1aFDBy8/EgAAgdPaFfnixYslScOGDfNoX7p0qaZMmSLpRJFss9k0adIkuVwuZWZmen0rt9eT3d577z1vDwEAIOAMwzjlS7+acrw3fmxO2ffFxMQoNzdXubm5zQ3L+8lukyZN0u9///sG7QsXLtSvf/3rZgcCAAC853UiLyws1NixYxu0jxkzRoWFhX4JCgAAf2vtZ623Fq+H1isrKxu9zSwqKqrBY+sAAAgWvP3sO2lpaXr55ZcbtC9fvlx9+/b1S1AAAKBpvK7I7777bk2cOFElJSXuh7xv2LBBL730kl599VW/BwgAgD/YDMOnl6b4cmxL8jqRjxs3TqtWrdKDDz6oFStWKDY2Vv3799f69et10UUXtUSMAAD4rLVvP2stXidySbrkkkt0ySWXNGjfuXOn+2HwAAAEFR+vkTfzCa0tzutr5D909OhR/eEPf9DgwYM1YMAAf8QEAACaqNmJvLCwUNdee61SUlL08MMPa8SIESoqKvJnbAAA+I1Nhs9LMPJqaL2srEzLli3TkiVLVFFRocsuu0wul0urVq1ixjoAIKhZ/vazcePGqXfv3tq+fbsWLVqkr776Sk888URLxgYAAH5Ckyvyd955RzfffLNuuukm9erVqyVjAgDA78J11nqTK/KNGzfq6NGjGjhwoIYMGaInn3xShw4dasnYAADwm5P3kfuyBKMmJ/Lzzz9fzz77rPbv368bbrhBy5cvV2pqqurr67Vu3TodPXq0JeMEAACN8HrWetu2bXXddddp48aN2rFjh2699VYtWLBAiYmJuvTSS1siRgAAfHZyspsvSzDy6T7y3r17a+HChfryyy/10ksv+SsmAAD8ziYfh9aD9PYznx8II0kRERGaMGGC3nzzTX90BwAAmqhZj2gFACDUhOt95CRyAIAl2OTbMLRfhrBbAIkcAGAJhmHI8KGs9uXYlhSsv2AAAIAmoCIHAFiCId/eRBqc9TiJHABgEb4+nS3kn+wGAACCDxU5AMAygrOm9g2JHABgCeF6HzlD6wAAhDAqcgCAJYTrfeQkcgCAJYTrk92CNS4AANAEJHIAgCWcHFr3ZfFGYWGhxo0bp9TUVBmGoVWrVnlsnzJlSoP+R48e7fXnIpEDACzB8MPijaqqKg0YMEC5ubk/us/o0aO1f/9+9/LSSy95eRaukQMALMJfk90qKio82u12u+x2e4P9x4wZozFjxpyyT7vdruTk5GbHJJHIEUIOvDgl0CGgFXX++e2BDgGtwKxzBToErzmdTo/1uXPnat68ec3qKz8/X4mJierQoYNGjBih+++/Xx07dvSqDxI5AMAS/DVrvbS0VA6Hw93eWDXeFKNHj9bEiRPVvXt3lZSU6Le//a3GjBmjTZs2KSIiosn9kMgBAJbgr6F1h8Phkcib64orrnD/nJaWpv79+6tHjx7Kz8/XyJEjm9wPk90AAAgCZ5xxhjp16qQ9e/Z4dRwVOQDAEoL9feRffvmlDh8+rJSUFK+OI5EDACyhtV+aUllZ6VFd7927V9u2bVNCQoISEhJ07733atKkSUpOTlZJSYnuuOMO9ezZU5mZmV6dh0QOAEAL2LJli4YPH+5ez87OliRNnjxZixcv1vbt2/X888/rm2++UWpqqkaNGqXf/e53Xk+eI5EDACzBJkM2HwbIvT122LBhMk3zR7evXbu22bF8H4kcAGAJvI8cAAAEHSpyAIAlGN/98eX4YEQiBwBYQrgOrZPIAQCWYPg42S1YK3KukQMAEMKoyAEAlsDQOgAAISxcEzlD6wAAhDAqcgCAJXD7GQAAIcxmnFh8OT4YMbQOAEAIoyIHAFgCQ+sAAIQwZq0DAICgQ0UOALAEQ74NjwdpQU4iBwBYQ7jOWieRAwAsIVwnu3GNHACAEEZFDgCwhHCdtU4iBwBYgiHfJqwFaR5naB0AgFBGRQ4AsASbDNl8GB+3BWlNTiIHAFgCQ+sAACDoUJEDAKwhTEtyEjkAwBJ4IAwAAAg6VOQAAGvw8YEwQVqQk8gBANYQppfIGVoHAFiE4YfFC4WFhRo3bpxSU1NlGIZWrVrlsd00Td1zzz1KSUlRbGysMjIytHv3bq8/FokcAIAWUFVVpQEDBig3N7fR7
QsXLtTjjz+up59+Wh988IHatm2rzMxMVVdXe3UehtYBAJbgr1nrFRUVHu12u112u73B/mPGjNGYMWMa7cs0TS1atEhz5szR+PHjJUl/+tOflJSUpFWrVumKK65oclxU5AAASzj59jNfFklyOp2Kj493L/Pnz/c6lr1796qsrEwZGRnutvj4eA0ZMkSbNm3yqi8qcgAAvFBaWiqHw+Feb6wa/yllZWWSpKSkJI/2pKQk97amIpEDACzBX7PWHQ6HRyIPNIbWAQDW0Mqz1k8lOTlZklReXu7RXl5e7t7WVCRyAABaWffu3ZWcnKwNGza42yoqKvTBBx8oPT3dq74YWgcAWEJrP2u9srJSe/bsca/v3btX27ZtU0JCgrp27apZs2bp/vvvV69evdS9e3fdfffdSk1N1YQJE7w6D4kcAGAJ35953tzjvbFlyxYNHz7cvZ6dnS1Jmjx5spYtW6Y77rhDVVVVmj59ur755htdeOGFWrNmjWJiYrw6D4kcAIAWMGzYMJmm+aPbDcPQfffdp/vuu8+n85DIAQCWEK7PWieRAwCsIUwzOYkcAGAJrT3ZrbVw+xkAACGMihwAYAmtPWu9tZDIAQCWEKaXyBlaBwAglFGRAwCsIUxLcipyIAht/Fuhfj3xUvXq3kVxMRF6681VgQ4JLeD4/g9VvfkR1e57r8E20zRV8+nrqt78iOq+3tPI0fCW4Yc/wSigiXz+/PkaNGiQ4uLilJiYqAkTJmjXrl2BDAkICseOVSktbYD+d9ETgQ4FLaS+skx1B7bLiO3U6Pa68q2tHBFCVUATeUFBgbKyslRUVKR169aptrZWo0aNUlVVVSDDAgJuVOYY3XPv73Tp+F8GOhS0ALOuRrWf/UWRp18sRTZ8rnb9sQM6XlasqO6ZAYgufJ2cte7LEowCeo18zZo1HuvLli1TYmKiiouLNXTo0ABFBQAtq/aLv8rW/gxFxHfT8f0feGwz62pVW/IXRXUbISOqbYAiDE9heok8uCa7HTlyRJKUkJDQ6HaXyyWXy+Ver6ioaJW4AMBf6g5/IvNYuaL6Xt3o9uOl+bK1S1VEh56tGxhCVtBMdquvr9esWbN0wQUXqF+/fo3uM3/+fMXHx7sXp9PZylECQPOZrqOq3ZevqDPGyrA1rKPqvi5RfUWpIrsOa/3grMDwwxKEgqYiz8rK0s6dO7Vx48Yf3ScnJ8f9PlfpREVOMgcQKuqPlUvHj6nmny98r9VU3dEvVVe+TRGJA2S6vpFra67HcbV73tLxuNNk73NZ6wYcZsL1WetBkchnzJih1atXq7CwUF26dPnR/ex2u+x2eytGBgD+Y3N0VfTZ13q01e5dKyM2QZHJg2RExSqic3+P7TX//JMiu14kW/serRlqWOIRrS3ANE3NnDlTK1euVH5+vrp37x7IcICgUVlZqc9K/nPv8Beff67t/9imDh0S5OzaNYCRwRdGRLSMNj+43SwiSkZkjGzftTc2wc2Idshmj2+NEBGCAprIs7KylJeXpzfeeENxcXEqKyuTJMXHxys2NjaQoQEB9VHxFo3NHOlez7njVknSVddcq2f+uDRQYQEhjVnrLWDx4sWSpGHDhnm0L126VFOmTGn9gIAg8fOLhulodV2gw0Ar+Knr3jGDsk+5HV4I00we8KF1AADQfEEx2Q0AgJbGrHUAAEKZr49ZDc48HjwPhAEAAN6jIgcAWEKYznUjkQMALCJMMzmJHABgCeE62Y1r5AAAhDAqcgCAJfCsdQAAQliYXiJnaB0AgFBGIgcAWIPhh8UL8+bNk2EYHkufPn3881m+h6F1AIAlBGLW+tlnn63169e71yMj/Z92SeQAALSQyMhIJScnt+g5GFoHAFiCof/MXG/W8l0/FRUVHovL5frRc+7evVupqak644wzdPXVV2vfvn1+/1wkcgCAJfjrErnT6VR8fLx7mT9/fqPnGzJkiJYtW6Y1a9Zo8eLF2rt3r37+85/r6NGjfv1cDK0DAOCF0tJSORwO97rdbm90vzFjxrh/7t+/v4YMGaJu3brplVde0fXXX++3eEjkAABL8NcDYRwOh0cib6r27dvrzDPP1J49e5ofRCMYWgcAWEQr33/2A5WVlSopKVFKSopP/fwQiRwAYAk+TXRrRjV/2223qaCgQJ9//rnef/99/fKXv1RERISuvPJKv34uhtYBAGgBX375pa688kodPnxYnTt31oUXXqiioiJ17tzZr+chkQMALKG1n7W+fPlyH87WdCRyAIAlhOvbz7hGDgBACKMiBwBYQiCetd4aSOQAAGsI0xeSM7QOAEAIoyIHAFhCmBbkJHIAgDUwax0AAAQdKnIAgCUwax0AgFAWphfJSeQAAEsI0zzONXIAAEIZFTkAwBLCddY6iRwAYBG+TXYL1sF1htYBAAhhVOQAAEsI16F1KnIAAEIYiRwAgBDG0DoAwBLCdWidRA4AsIRwfUQrQ+sAAIQwKnIAgCUwtA4AQAgL12etk8gBANYQppmca+QAAIQwKnIAgCWE66x1EjkAwBLCdbIbQ+sAAIQwKnIAgCWE6Vw3EjkAwCLCNJMztA4AQAvKzc3V6aefrpiYGA0ZMkQffvihX/snkQMALMHwwx9vvfzyy8rOztbcuXO1detWDRgwQJmZmTpw4IDfPheJHABgCSdnrfuyeOuRRx7RtGnTNHXqVPXt21dPP/202rRpo+eee85vnyukr5GbpilJOlpREeBI0BqO19UHOgS0IrPOFegQ0ArMupoTf3/373lLqvAxV5w8/of92O122e32BvvX1NSouLhYOTk57jabzaaMjAxt2rTJp1i+L6QT+dGjRyVJPbs7AxwJAMAXR48eVXx8fIv0HR0dreTkZPXyQ65o166dnE7PfubOnat58+Y12PfQoUOqq6tTUlKSR3tSUpI++eQTn2M5KaQTeWpqqkpLSxUXFycjWO/UbwEVFRVyOp0qLS2Vw+EIdDhoQXzX1mHV79o0TR09elSpqaktdo6YmBjt3btXNTU1PvdlmmaDfNNYNd6aQjqR22w2denSJdBhBIzD4bDUf/BWxndtHVb8rluqEv++mJgYxcTEtPh5vq9Tp06KiIhQeXm5R3t5ebmSk5P9dh4muwEA0AKio6M1cOBAbdiwwd1WX1+vDRs2KD093W/nCemKHACAYJadna3JkyfrvPPO0+DBg7Vo0SJVVVVp6tSpfjsHiTwE2e12zZ07N+DXZdDy+K6tg+86PF1++eU6ePCg7rnnHpWVlemcc87RmjVrGkyA84VhtsacfwAA0CK4Rg4AQAgjkQMAEMJI5AAAhDASOQAAIYxEDvhgypQpmjBhgnt92LBhmjVrVqvHkZ+fL8Mw9M033wRFPwBaD4kcYWfKlCkyDEOGYSg6Olo9e/bUfffdp+PHj7f4uV9//XX97ne/a9K+gUiaH330kX79618rKSlJMTEx6tWrl6ZNm6ZPP/201WIA4F8kcoSl0aNHa//+/dq9e7duvfVWzZs3Tw899FCj+/rj+csnJSQkKC4uzm/9+dPq1at1/vnny+Vy6cUXX9THH3+sF154
QfHx8br77rsDHR6AZiKRIyzZ7XYlJyerW7duuummm5SRkaE333xT0n+Gwx944AGlpqaqd+/ekqTS0lJddtllat++vRISEjR+/Hh9/vnn7j7r6uqUnZ2t9u3bq2PHjrrjjjsavHrxh0PrLpdLd955p5xOp+x2u3r27KklS5bo888/1/DhwyVJHTp0kGEYmjJliqQTj3CcP3++unfvrtjYWA0YMEArVqzwOM9f/vIXnXnmmYqNjdXw4cM94mzMsWPHNHXqVI0dO1ZvvvmmMjIy1L17dw0ZMkQPP/ywnnnmmUaPO3z4sK688kqddtppatOmjdLS0vTSSy957LNixQqlpaUpNjZWHTt2VEZGhqqqqiSdGHUYPHiw2rZtq/bt2+uCCy7QF198ccpYAXiHRA5LiI2N9ai8N2zYoF27dmndunVavXq1amtrlZmZqbi4OP3tb3/T3//+d7Vr106jR492H/e///u/WrZsmZ577jlt3LhR//73v7Vy5cpTnvfaa6/VSy+9pMcff1wff/yxnnnmGfdrEF977TVJ0q5du7R//3499thjkqT58+frT3/6k55++mn985//1OzZs3XNNdeooKBA0olfOCZOnKhx48Zp27Zt+u///m/95je/OWUca9eu1aFDh3THHXc0ur19+/aNtldXV2vgwIF6++23tXPnTk2fPl3/9V//pQ8//FCStH//fl155ZW67rrr9PHHHys/P18TJ06UaZo6fvy4JkyYoIsuukjbt2/Xpk2bNH36dEu9qRBoFSYQZiZPnmyOHz/eNE3TrK+vN9etW2fa7Xbztttuc29PSkoyXS6X+5g///nPZu/evc36+np3m8vlMmNjY821a9eapmmaKSkp5sKFC93ba2trzS5durjPZZqmedFFF5m33HKLaZqmuWvXLlOSuW7dukbjfO+990xJ5tdff+1uq66uNtu0aWO+//77Hvtef/315pVXXmmapmnm5OSYffv29dh+5513Nujr+37/+9+bksx///vfjW4/VUw/dMkll5i33nqraZqmWVxcbEoyP//88wb7HT582JRk5ufnn/KcAHzDs9YRllavXq127dqptrZW9fX1uuqqqzRv3jz39rS0NEVHR7vX//GPf2jPnj0Nrm9XV1erpKRER44c0f79+zVkyBD3tsjISJ133nkNhtdP2rZtmyIiInTRRRc1Oe49e/bo2LFjuvjiiz3aa2pq9LOf/UyS9PHHH3vEIekn36T0YzH+lLq6Oj344IN65ZVX9K9//Us1NTVyuVxq06aNJGnAgAEaOXKk0tLSlJmZqVGjRulXv/qVOnTooISEBE2ZMkWZmZm6+OKLlZGRocsuu0wpKSnNigVA40jkCEvDhw/X4sWLFR0drdTUVEVGev5fvW3bth7rlZWVGjhwoF588cUGfXXu3LlZMcTGxnp9TGVlpSTp7bff1mmnneaxzZeXaZx55pmSpE8++cSr1yc+9NBDeuyxx7Ro0SKlpaWpbdu2mjVrlvtyQ0REhNatW6f3339f7777rp544gnddddd+uCDD9S9e3ctXbpUN998s9asWaOXX35Zc+bM0bp163T++ec3+7MA8MQ1coSltm3bqmfPnuratWuDJN6Yc889V7t371ZiYqJ69uzpscTHxys+Pl4pKSn64IMP3MccP35cxcXFP9pnWlqa6uvr3de2f+jkiEBdXZ27rW/fvrLb7dq3b1+DOJxOpyTprLPOcl+jPqmoqOiUn2/UqFHq1KmTFi5c2Oj2H7sF7u9//7vGjx+va665RgMGDNAZZ5zR4FY1wzB0wQUX6N5779VHH32k6Ohoj7kDP/vZz5STk6P3339f/fr1U15e3iljBeAdEjkg6eqrr1anTp00fvx4/e1vf9PevXuVn5+vm2++WV9++aUk6ZZbbtGCBQu0atUqffLJJ/qf//mfU94Dfvrpp2vy5Mm67rrrtGrVKnefr7zyiiSpW7duMgxDq1ev1sGDB1VZWam4uDjddtttmj17tp5//nmVlJRo69ateuKJJ/T8889Lkm688Ubt3r1bt99+u3bt2qW8vDwtW7bslJ+vbdu2+uMf/6i3335bl156qdavX6/PP/9cW7Zs0R133KEbb7yx0eN69erlrrg//vhj3XDDDSovL3dv/+CDD/Tggw9qy5Yt2rdvn15//XUdPHhQZ511lvbu3aucnBxt2rRJX3zxhd59913t3r1bZ511lhffDICfFOiL9IC/fX+ymzfb9+/fb1577bVmp06dTLvdbp5xxhnmtGnTzCNHjpimeWJy2y233GI6HA6zffv2ZnZ2tnnttdf+6GQ30zTNb7/91pw9e7aZkpJiRkdHmz179jSfe+459/b77rvPTE5ONg3DMCdPnmya5okJeosWLTJ79+5tRkVFmZ07dzYzMzPNgoIC93FvvfWW2bNnT9Nut5s///nPzeeee+4nJ6mZpmlu3rzZnDhxotm5c2fTbrebPXv2NKdPn27u3r3bNM2Gk90OHz5sjh8/3mzXrp2ZmJhozpkzx+Mz/9///Z+ZmZnp7u/MM880n3jiCdM0TbOsrMycMGGC+7N369bNvOeee8y6urpTxgjAO7yPHACAEMbQOgAAIYxEDgBACCORAwAQwkjkAACEMBI5AAAhjEQOAEAII5EDABDCSOQAAIQwEjkAACGMRA4AQAgjkQMAEML+P3/TNmM+esJRAAAAAElFTkSuQmCC", 84 | "text/plain": [ 85 | "
" 86 | ] 87 | }, 88 | "metadata": {}, 89 | "output_type": "display_data" 90 | } 91 | ], 92 | "source": [ 93 | "import matplotlib.pyplot as plt\n", 94 | "\n", 95 | "from collections import Counter\n", 96 | "\n", 97 | "fig, ax = plt.subplots()\n", 98 | "\n", 99 | "pred_matrix = np.zeros((NUM_CLASSES, NUM_CLASSES))\n", 100 | "for [x,y], count in Counter(zip(y_test,preds.argmax(axis=1))).items():\n", 101 | " pred_matrix[x,y] = count\n", 102 | " ax.text(y,x, count, ha='center', va='center')\n", 103 | "\n", 104 | "ax.set_xlabel('Predicted Class')\n", 105 | "ax.set_ylabel('Actual Class')\n", 106 | "fig.colorbar(ax.matshow(pred_matrix, cmap='Blues'))\n", 107 | "fig.show()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 120, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "Class Iris-setosa Precision: 100.00% Recall: 100.00% F1: 100.00% \n", 120 | "Class Iris-versicolor Precision: 94.81% Recall: 94.81% F1: 94.81% \n", 121 | "Class Iris-virginica Precision: 94.81% Recall: 94.81% F1: 94.81% \n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "def precision_recall_f1(y_true, y_pred, class_label):\n", 127 | " TP = np.sum((y_true == class_label) == (y_pred == class_label))\n", 128 | " FP = np.sum((y_true != class_label) == (y_pred == class_label))\n", 129 | " FN = np.sum((y_true == class_label) == (y_pred != class_label))\n", 130 | " \n", 131 | " precision = TP / (TP + FP) if (TP + FP) > 0 else 0\n", 132 | " recall = TP / (TP + FN) if (TP + FN) > 0 else 0\n", 133 | " f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0\n", 134 | " \n", 135 | " return class_label, precision, recall, f1\n", 136 | "\n", 137 | "results = (precision_recall_f1(y_test, preds.argmax(axis=1), class_label) for class_label in range(NUM_CLASSES))\n", 138 | "\n", 139 | "for class_label, precision, recall, f1 in results:\n", 140 | " print(f\"Class {itos(class_label):<20} Precision: {precision:<8.2%} Recall: {recall:<8.2%} F1: {f1:<8.2%}\")" 141 | ] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": ".venv", 147 | "language": "python", 148 | "name": "python3" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 3 154 | }, 155 | "file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython3", 160 | "version": "3.11.4" 161 | }, 162 | "orig_nbformat": 4 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 2 166 | } 167 | -------------------------------------------------------------------------------- /classifiers/xgb/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.5 2 | aiosignal==1.3.1 3 | asttokens==2.2.1 4 | async-timeout==4.0.3 5 | attrs==23.1.0 6 | backcall==0.2.0 7 | certifi==2023.7.22 8 | charset-normalizer==3.2.0 9 | comm==0.1.4 10 | contourpy==1.1.0 11 | cycler==0.11.0 12 | datasets==2.14.4 13 | debugpy==1.6.7.post1 14 | decorator==5.1.1 15 | dill==0.3.7 16 | executing==1.2.0 17 | filelock==3.12.2 18 | fonttools==4.42.1 19 | frozenlist==1.4.0 20 | fsspec==2023.6.0 21 | huggingface-hub==0.16.4 22 | idna==3.4 23 | ipykernel==6.25.1 24 | ipython==8.14.0 25 | jedi==0.19.0 26 | jupyter_client==8.3.0 27 | jupyter_core==5.3.1 28 | kiwisolver==1.4.5 29 | matplotlib==3.7.2 30 | matplotlib-inline==0.1.6 31 | multidict==6.0.4 32 | multiprocess==0.70.15 33 | 
nest-asyncio==1.5.7 34 | numpy==1.25.2 35 | packaging==23.1 36 | pandas==2.0.3 37 | parso==0.8.3 38 | pexpect==4.8.0 39 | pickleshare==0.7.5 40 | Pillow==10.0.0 41 | platformdirs==3.10.0 42 | prompt-toolkit==3.0.39 43 | psutil==5.9.5 44 | ptyprocess==0.7.0 45 | pure-eval==0.2.2 46 | pyarrow==13.0.0 47 | Pygments==2.16.1 48 | pyparsing==3.0.9 49 | python-dateutil==2.8.2 50 | pytz==2023.3 51 | PyYAML==6.0.1 52 | pyzmq==25.1.1 53 | requests==2.31.0 54 | scipy==1.11.2 55 | six==1.16.0 56 | stack-data==0.6.2 57 | tornado==6.3.3 58 | tqdm==4.66.1 59 | traitlets==5.9.0 60 | typing_extensions==4.7.1 61 | tzdata==2023.3 62 | urllib3==2.0.4 63 | wcwidth==0.2.6 64 | xgboost==1.7.6 65 | xxhash==3.3.0 66 | yarl==1.9.2 67 | -------------------------------------------------------------------------------- /classifiers/xgb/train.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | from datasets import load_dataset 3 | import numpy as np 4 | 5 | dataset = load_dataset( 6 | "hitorilabs/iris", 7 | split="train", 8 | ) 9 | 10 | # keep the int2str mapping to retrieve string labels 11 | itos = dataset.features["species"].int2str 12 | 13 | dataset = dataset.train_test_split(test_size=0.9, stratify_by_column="species") 14 | X_train = dataset["train"].map(remove_columns=["species"]).to_pandas().to_numpy() 15 | y_train = np.array(dataset["train"]["species"]) 16 | 17 | X_test = dataset["test"].map(remove_columns=["species"]).to_pandas().to_numpy() 18 | y_test = np.array(dataset["test"]["species"]) 19 | 20 | # Create DMatrix for train and test 21 | dtrain = xgb.DMatrix(X_train, label=y_train) 22 | dtest = xgb.DMatrix(X_test, label=y_test) 23 | 24 | NUM_CLASSES = 3 25 | # Set hyperparameters 26 | params = { 27 | 'objective': 'multi:softprob', 28 | 'max_depth': 15, 29 | 'learning_rate': 0.1, 30 | 'num_class': NUM_CLASSES, 31 | } 32 | 33 | # Train the model 34 | num_rounds = 100 35 | bst = xgb.train(params, dtrain, num_rounds) 36 | 37 | # Make predictions 38 | preds = bst.predict(dtest) 39 | acc = sum(dataset["test"]["species"] == preds.argmax(axis=1)) / len(dataset["test"]) 40 | print(f"""{acc:.2%} ({sum(dataset["test"]["species"] == preds.argmax(axis=1))}/{len(dataset["test"])})""") 41 | 42 | def precision_recall_f1(y_true, y_pred, class_label): 43 | TP = np.sum((y_true == class_label) == (y_pred == class_label)) 44 | FP = np.sum((y_true != class_label) == (y_pred == class_label)) 45 | FN = np.sum((y_true == class_label) == (y_pred != class_label)) 46 | 47 | precision = TP / (TP + FP) if (TP + FP) > 0 else 0 48 | recall = TP / (TP + FN) if (TP + FN) > 0 else 0 49 | f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0 50 | 51 | return class_label, precision, recall, f1 52 | 53 | results = (precision_recall_f1(y_test, preds.argmax(axis=1), class_label) for class_label in range(NUM_CLASSES)) 54 | 55 | for class_label, precision, recall, f1 in results: 56 | print(f"Class {itos(class_label):<20} Precision: {precision:<8.2%} Recall: {recall:<8.2%} F1: {f1:<8.2%}") 57 | 58 | bst.save_model("iris.model") -------------------------------------------------------------------------------- /convnet/README: -------------------------------------------------------------------------------- 1 | Usage: 2 | wandb sweep --project sweep.yaml -------------------------------------------------------------------------------- /convnet/config.py: -------------------------------------------------------------------------------- 1 | from typing import 
Tuple, Optional 2 | from dataclasses import dataclass, asdict 3 | 4 | @dataclass 5 | class CIFAR10Config: 6 | batch_size: int = 256 7 | num_epochs: int = 100 8 | learning_rate: int = 0.0068 9 | num_classes: int = 10 10 | momentum: float = 0.86583 11 | weight_decay: float = 0.00834863 12 | 13 | config = CIFAR10Config() -------------------------------------------------------------------------------- /convnet/sweep.yml: -------------------------------------------------------------------------------- 1 | program: train_wandb.py 2 | method: bayes 3 | metric: 4 | name: val_accuracy 5 | goal: maximize 6 | parameters: 7 | learning_rate: 8 | min: 0.0001 9 | max: 0.01 10 | batch_size: {'values': [16, 32, 64, 128, 256]} 11 | num_epochs: {'values': [100, 200, 300]} 12 | momentum: 13 | min: 0.8 14 | max: 0.99 15 | weight_decay: 16 | min: 0.0 17 | max: 0.01 18 | early_terminate: 19 | type: hyperband 20 | min_iter: 3 21 | eta: 3 -------------------------------------------------------------------------------- /convnet/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from config import config 3 | from dataclasses import asdict 4 | import trainer 5 | 6 | 7 | if __name__ == '__main__': 8 | 9 | with torch.device("cuda:0"): 10 | model = trainer.ConvNet() 11 | 12 | trainloader, testloader = trainer.get_dataloaders(config) 13 | 14 | trainer.train( 15 | model=model, 16 | trainloader=trainloader, 17 | testloader=testloader, 18 | config=config, 19 | logger_fn=print 20 | ) -------------------------------------------------------------------------------- /convnet/train_wandb.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import wandb 3 | from config import config 4 | from dataclasses import asdict 5 | import trainer 6 | 7 | 8 | if __name__ == '__main__': 9 | 10 | run = wandb.init( 11 | project="cifar10", 12 | config=asdict(config) 13 | ) 14 | 15 | with torch.device("cuda:0"): 16 | model = trainer.ConvNet() 17 | 18 | wandb.watch(model) 19 | 20 | trainloader, testloader = trainer.get_dataloaders(config) 21 | 22 | trainer.train( 23 | model=model, 24 | trainloader=trainloader, 25 | testloader=testloader, 26 | config=wandb.config, 27 | logger_fn=wandb.log 28 | ) -------------------------------------------------------------------------------- /convnet/trainer.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torchvision 7 | 8 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 9 | 10 | class ConvNet(nn.Module): 11 | def __init__(self): 12 | super().__init__() 13 | self.conv_pool1 = nn.Sequential( 14 | nn.Conv2d(3, 6, 5), 15 | nn.ReLU(), 16 | nn.MaxPool2d(2, 2), 17 | ) 18 | self.conv_pool2 = nn.Sequential( 19 | nn.Conv2d(6, 16, 5), 20 | nn.ReLU(), 21 | nn.MaxPool2d(2, 2), 22 | ) 23 | self.fc1 = nn.Sequential( 24 | nn.Linear(16 * 5 * 5, 120), 25 | nn.ReLU(), 26 | ) 27 | self.fc2 = nn.Sequential( 28 | nn.Linear(120, 84), 29 | nn.ReLU(), 30 | ) 31 | self.fc3 = nn.Linear(84, 10) 32 | 33 | def forward(self, x): 34 | x = self.conv_pool1(x) # (3, 32, 32) => (6, 28, 28) => (6, 14, 14) 35 | x = self.conv_pool2(x) # (6, 14, 14) => (16, 10, 10) => (16, 5, 5) 36 | x = torch.flatten(x, 1) # flatten all dimensions except batch 37 | x = self.fc1(x) # (16, 5, 5) => (16 * 5 * 5) => (120) 38 | x = self.fc2(x) # (120) => (84) 39 | x = 
self.fc3(x) # (84) => (10) 40 | return x 41 | 42 | def train(model, trainloader, testloader, config, logger_fn): 43 | 44 | criterion = nn.CrossEntropyLoss() 45 | optimizer = torch.optim.SGD( 46 | model.parameters(), 47 | lr=config.learning_rate, 48 | momentum=config.momentum, 49 | weight_decay=config.weight_decay, 50 | ) 51 | 52 | for _ in range(config.num_epochs): 53 | 54 | start = time.monotonic() 55 | running_loss = 0. 56 | last_loss = 0. 57 | for idx, [data, targets] in enumerate(trainloader): 58 | optimizer.zero_grad() 59 | loss = criterion(model(data.to(device, non_blocking=True)), targets.to(device, non_blocking=True)) 60 | loss.backward() 61 | optimizer.step() 62 | 63 | # Gather data and report 64 | running_loss += loss.item() 65 | if idx % 1000 == 999: 66 | last_loss = running_loss / 1000 # loss per mini-batch 67 | running_loss = 0.0 68 | 69 | last_loss = running_loss / (idx % 1000) 70 | end = time.monotonic() 71 | 72 | with torch.no_grad(): 73 | correct = 0 74 | total = 0 75 | 76 | for data, targets in testloader: 77 | outputs = model(data.to(device, non_blocking=True)) 78 | pred = torch.argmax(outputs, 1) 79 | total += pred.size(0) 80 | correct += (pred == targets.to(device, non_blocking=True)).sum().item() 81 | 82 | logger_fn({"val_accuracy": correct / total, "train_loss": last_loss, "epoch_time": end - start}) 83 | 84 | def get_dataloaders(config) -> Tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]: 85 | transform = torchvision.transforms.Compose([ 86 | torchvision.transforms.ToTensor(), 87 | torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) 88 | ]) 89 | 90 | trainset = torchvision.datasets.CIFAR10(root='./data', train=True, 91 | download=True, transform=transform) 92 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=config.batch_size, 93 | shuffle=True, num_workers=2, 94 | pin_memory=True) 95 | 96 | testset = torchvision.datasets.CIFAR10(root='./data', train=False, 97 | download=True, transform=transform) 98 | testloader = torch.utils.data.DataLoader(testset, batch_size=config.batch_size, 99 | shuffle=False, num_workers=2, 100 | pin_memory=True) 101 | 102 | return trainloader, testloader -------------------------------------------------------------------------------- /datasets/upload_iris.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, ClassLabel, Value, Features 2 | 3 | dataset = load_dataset( 4 | "scikit-learn/iris", 5 | split="train", 6 | ) 7 | 8 | dataset = dataset \ 9 | .map(remove_columns=["Id"]) \ 10 | .rename_columns({ 11 | "SepalLengthCm": "sepal_length", 12 | "SepalWidthCm": "sepal_width", 13 | "PetalLengthCm": "petal_length", 14 | "PetalWidthCm": "petal_width", 15 | "Species": "species" 16 | }) 17 | 18 | 19 | names = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"] 20 | 21 | new_features = Features({ 22 | "petal_length": Value(dtype="float32", id=None), 23 | "petal_width": Value(dtype="float32", id=None), 24 | "sepal_length": Value(dtype="float32", id=None), 25 | "sepal_width": Value(dtype="float32", id=None), 26 | "species": ClassLabel(names=names) 27 | }) 28 | 29 | dataset = dataset.cast(new_features) 30 | 31 | dataset.push_to_hub("hitorilabs/iris") -------------------------------------------------------------------------------- /preprocessing/README.md: -------------------------------------------------------------------------------- 1 | # Preprocessing Techniques 2 | 3 | On the surface, pre-processing is quite straightforward. 
4 | All we're doing to taking data and transforming it into 5 | some format we want (i.e. binary, chunks, numerical, 6 | etc.). 7 | 8 | However, the problem is that it's such a simple idea that 9 | everyone just rolls their own pre-processing method. I 10 | don't think we should standardize it, but we should all 11 | have some idea of the most elegant path. 12 | 13 | ![image](https://github.com/hitorilabs/papers/assets/131238467/9597fe82-0d20-4af7-bf91-acfff08d68d9) 14 | 15 | 16 | https://gist.github.com/ZijiaLewisLu/eabdca955110833c0ce984d34eb7ff39?permalink_comment_id=3417135 17 | 18 | just use `np.memmap` for everything - it's all tensors. -------------------------------------------------------------------------------- /preprocessing/multi_file_example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pathlib 4 | import os 5 | from tqdm import tqdm 6 | 7 | # one-liner for copying data 8 | # for((i=1; i <= 100; i++)); do cp source_iris/iris.data "iris_dataset/iris_${i}.data"; done 9 | 10 | source_path = os.getenv("SOURCE_PATH") 11 | if source_path is None: raise Exception("Missing SOURCE_PATH variable") 12 | 13 | pattern = os.getenv("PATTERN") 14 | if pattern is None: raise Exception("Missing PATTERN variable") 15 | 16 | source_path = pathlib.Path(".") / source_path 17 | data_path = pathlib.Path("datasets") / source_path.stem 18 | data_path.mkdir(exist_ok=True) 19 | 20 | rows = 0 21 | cols = 0 22 | for i, file in enumerate(tqdm(list(source_path.glob(pattern)))): 23 | df = pd.read_csv(file, header=None) 24 | df_rows, df_cols = df.shape 25 | rows += df_rows 26 | cols = df_cols 27 | 28 | train_file = np.memmap(data_path / "train.memmap", dtype='float32', mode='w+', shape=(rows, cols -1)) 29 | target_file = np.memmap(data_path / "target.memmap", dtype='int64', mode='w+', shape=(rows)) 30 | 31 | current_rows = 0 32 | for i, file in enumerate(tqdm(list(source_path.glob(pattern)))): 33 | df = pd.read_csv( 34 | file, 35 | header=None, 36 | names=[ 37 | "sepal_length", 38 | "sepal_width", 39 | "petal_length", 40 | "petal_width", 41 | "class" 42 | ], 43 | dtype={ 44 | "sepal_length": np.float32, 45 | "sepal_width": np.float32, 46 | "petal_length": np.float32, 47 | "petal_width": np.float32, 48 | "class": "category", 49 | }) 50 | xs, ys = ( 51 | df.loc[:, df.columns != "class"].to_numpy(), 52 | df["class"].cat.codes.to_numpy(dtype="int64"), 53 | ) 54 | 55 | rows, cols = xs.shape 56 | left_pos = current_rows 57 | right_pos = current_rows + rows 58 | 59 | train_file[left_pos:right_pos,:cols] = xs[:rows,:cols] 60 | train_file.flush() 61 | 62 | target_file[left_pos:right_pos] = ys[:rows] 63 | target_file.flush() 64 | 65 | current_rows += rows -------------------------------------------------------------------------------- /preprocessing/save_v_mmap.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pathlib 4 | import os 5 | 6 | source_file = os.getenv("SOURCE_FILE") 7 | if source_file is None: raise Exception("Missing SOURCE_FILE variable") 8 | source_path = pathlib.Path(".") / source_file 9 | data_path = pathlib.Path("datasets") / source_path.stem 10 | data_path.mkdir(exist_ok=True) 11 | 12 | df = pd.read_csv( 13 | source_path, 14 | header=None, 15 | names=[ 16 | "sepal_length", 17 | "sepal_width", 18 | "petal_length", 19 | "petal_width", 20 | "class" 21 | ], 22 | dtype={ 23 | "sepal_length": np.float32, 24 | 
"sepal_width": np.float32, 25 | "petal_length": np.float32, 26 | "petal_width": np.float32, 27 | "class": "category", 28 | }) 29 | 30 | xs, ys = ( 31 | np.repeat(df.loc[:, df.columns != "class"].to_numpy(), 1000000), 32 | np.repeat(df["class"].cat.codes.to_numpy(dtype="int64"), 1000000), 33 | ) 34 | 35 | print(xs.shape) 36 | print(ys.shape) 37 | 38 | np.save(data_path / "train_save.npy", xs) 39 | np.save(data_path / "target_save.npy", ys) 40 | 41 | rows, *_ = xs.shape 42 | fp = np.memmap(data_path / "train_memmap.npy", dtype='float32', mode='w+', shape=xs.shape) 43 | fp[:rows] = xs[:rows] 44 | print(fp.shape) 45 | fp.flush() 46 | 47 | rows, *_ = ys.shape 48 | fp = np.memmap(data_path / "target_memmap.npy", dtype="int64", mode='w+', shape=ys.shape) 49 | print(fp.shape) 50 | fp[:rows] = ys[:rows] 51 | fp.flush() -------------------------------------------------------------------------------- /preprocessing/single_file_example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pathlib 4 | import os 5 | 6 | source_file = os.getenv("SOURCE_FILE") 7 | if source_file is None: raise Exception("Missing SOURCE_FILE variable") 8 | 9 | source_path = pathlib.Path(".") / source_file 10 | data_path = pathlib.Path("datasets") / source_path.stem 11 | data_path.mkdir(exist_ok=True) 12 | 13 | df = pd.read_csv( 14 | source_path, 15 | header=None, 16 | names=[ 17 | "sepal_length", 18 | "sepal_width", 19 | "petal_length", 20 | "petal_width", 21 | "class" 22 | ], 23 | dtype={ 24 | "sepal_length": np.float32, 25 | "sepal_width": np.float32, 26 | "petal_length": np.float32, 27 | "petal_width": np.float32, 28 | "class": "category", 29 | }) 30 | 31 | xs, ys = ( 32 | df.loc[:, df.columns != "class"].to_numpy(), 33 | df["class"].cat.codes.to_numpy(dtype="int64"), 34 | ) 35 | 36 | rows, *_ = xs.shape 37 | fp = np.memmap(data_path / "train.memmap", dtype='float32', mode='w+', shape=xs.shape) 38 | fp[:rows] = xs[:rows] 39 | fp.flush() 40 | 41 | rows, *_ = ys.shape 42 | fp = np.memmap(data_path / "target.memmap", dtype="int64", mode='w+', shape=ys.shape) 43 | fp[:rows] = ys[:rows] 44 | fp.flush() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cmake==3.26.3 2 | contourpy==1.0.7 3 | cycler==0.11.0 4 | filelock==3.12.0 5 | fonttools==4.39.4 6 | Jinja2==3.1.2 7 | joblib==1.2.0 8 | kiwisolver==1.4.4 9 | lit==16.0.3 10 | MarkupSafe==2.1.2 11 | matplotlib==3.7.1 12 | mpmath==1.3.0 13 | networkx==3.1 14 | numpy==1.24.3 15 | nvidia-cublas-cu11==11.10.3.66 16 | nvidia-cuda-cupti-cu11==11.7.101 17 | nvidia-cuda-nvrtc-cu11==11.7.99 18 | nvidia-cuda-runtime-cu11==11.7.99 19 | nvidia-cudnn-cu11==8.5.0.96 20 | nvidia-cufft-cu11==10.9.0.58 21 | nvidia-curand-cu11==10.2.10.91 22 | nvidia-cusolver-cu11==11.4.0.1 23 | nvidia-cusparse-cu11==11.7.4.91 24 | nvidia-nccl-cu11==2.14.3 25 | nvidia-nvtx-cu11==11.7.91 26 | packaging==23.1 27 | pandas==2.0.1 28 | Pillow==9.5.0 29 | pyarrow==12.0.0 30 | pyparsing==3.0.9 31 | python-dateutil==2.8.2 32 | pytz==2023.3 33 | scikit-learn==1.2.2 34 | scipy==1.10.1 35 | six==1.16.0 36 | sympy==1.12 37 | threadpoolctl==3.1.0 38 | torch==2.0.1 39 | triton==2.0.0 40 | typing_extensions==4.5.0 41 | tzdata==2023.3 42 | -------------------------------------------------------------------------------- /thoughts/README: 
-------------------------------------------------------------------------------- 1 | My general intuition tells me that you actually just need to understand what 2 | kind of problem you are trying to solve, break it down into sub-problems and 3 | traverse the graph of sub-problems until you have either found an answer or you 4 | have clearly identified a roadblock that is too challenging or tedious to 5 | overcome. 6 |
-------------------------------------------------------------------------------- /thoughts/bench/dataloader/Makefile: -------------------------------------------------------------------------------- 1 | all: naive preallocate save_tensor load_from_disk 2 | 3 | .PHONY: naive 4 | naive: 5 | python3 naive.py 6 | 7 | .PHONY: preallocate 8 | preallocate: 9 | python3 preallocate_on_gpu.py 10 | 11 | .PHONY: save_tensor 12 | save_tensor: 13 | python3 make_example_tensor.py 14 | 15 | .PHONY: load_from_disk 16 | load_from_disk: 17 | python3 load_from_disk.py 18 |
-------------------------------------------------------------------------------- /thoughts/bench/dataloader/load_from_disk.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | 4 | BATCH_SIZE = 32 5 | DATA_SHAPE = (3, 224, 224) 6 | 7 | # Preallocate memory on the GPU (unused below; torch.load allocates its own tensor) 8 | preallocated_data = torch.empty(BATCH_SIZE, *DATA_SHAPE, device="cuda:0") 9 | 10 | for _ in range(10): 11 | start = time.perf_counter() 12 | 13 | # Load the saved tensor from disk directly onto the GPU 14 | gpu_data = torch.load("batch.pt", map_location=torch.device("cuda:0")) 15 | 16 | end = time.perf_counter() 17 | 18 | elapsed = end - start 19 | data_size = gpu_data.nelement() * gpu_data.element_size() 20 | throughput = data_size / elapsed / (1 << 30) 21 | print(f"completed in {elapsed:.8f}s | {throughput=}GiB/s") 22 | 23 |
-------------------------------------------------------------------------------- /thoughts/bench/dataloader/make_example_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | BATCH_SIZE = 32 4 | DATA_SHAPE = (3, 224, 224) 5 | 6 | preallocated_data = torch.randn(BATCH_SIZE, *DATA_SHAPE, dtype=torch.float16) 7 | torch.save(preallocated_data, "batch.pt") 8 |
-------------------------------------------------------------------------------- /thoughts/bench/dataloader/naive.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader, Dataset 3 | import time 4 | 5 | BATCH_SIZE = 32 6 | NUM_WORKERS = 4 7 | DATA_SHAPE = (3, 224, 224) 8 | 9 | 10 | for _ in range(10): 11 | # BATCH * CHANNELS * H * W 12 | data = torch.randn(BATCH_SIZE, *DATA_SHAPE, dtype=torch.float16) 13 | start = time.perf_counter() 14 | data.to("cuda:0") 15 | end = time.perf_counter() 16 | 17 | elapsed = end - start 18 | data_size = data.nelement() * data.element_size() 19 | throughput = data_size / elapsed / (1<<30) 20 | print(f"completed in {elapsed:.8f}s | {throughput=}GiB/s") 21 |
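Both benchmarks above stage the batch in ordinary pageable host memory. A third variant worth timing is a page-locked (pinned) staging buffer with a non-blocking copy, which is roughly what a DataLoader with pin_memory=True sets up for you; a minimal sketch (the file name is made up and it is not wired into the Makefile):

```python
# pinned_memory.py - same timing loop as naive.py, but the host->device copy
# goes through a reusable pinned buffer so it can run asynchronously
import torch
import time

BATCH_SIZE = 32
DATA_SHAPE = (3, 224, 224)

# page-locked staging buffer on the host, allocated once
pinned = torch.empty(BATCH_SIZE, *DATA_SHAPE, dtype=torch.float16).pin_memory()

for _ in range(10):
    data = torch.randn(BATCH_SIZE, *DATA_SHAPE, dtype=torch.float16)
    start = time.perf_counter()
    pinned.copy_(data)                                  # pageable host -> pinned host
    gpu_data = pinned.to("cuda:0", non_blocking=True)   # pinned host -> device, async
    torch.cuda.synchronize()                            # wait for the copy so the timing is honest
    end = time.perf_counter()

    elapsed = end - start
    data_size = data.nelement() * data.element_size()
    throughput = data_size / elapsed / (1 << 30)
    print(f"completed in {elapsed:.8f}s | {throughput=}GiB/s")
```

-------------------------------------------------------------------------------- /thoughts/bench/dataloader/preallocate_on_gpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | 4 | BATCH_SIZE = 32 5 | DATA_SHAPE = (3, 224, 224) 6 | 7 | # Preallocate memory on the GPU 8 | preallocated_data = torch.empty(BATCH_SIZE, *DATA_SHAPE, dtype=torch.float16, device="cuda:0") 9 | 10 | for _ in range(10): 11 | data = torch.randn(BATCH_SIZE, *DATA_SHAPE, 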
dtype=torch.float16) 12 | 13 | start = time.perf_counter() 14 | # Copy data to the preallocated space on the GPU 15 | preallocated_data.copy_(data) 16 | 17 | end = time.perf_counter() 18 | elapsed = end - start 19 | data_size = data.nelement() * data.element_size() 20 | throughput = data_size / elapsed / (1 << 30) 21 | print(f"completed in {elapsed:.8f}s | {throughput=}GiB/s") 22 | -------------------------------------------------------------------------------- /thoughts/bench/preprocessing/.gitignore: -------------------------------------------------------------------------------- 1 | *.hdf5 2 | -------------------------------------------------------------------------------- /thoughts/bench/preprocessing/convert_imagenet_to_hdf5.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from pathlib import Path 3 | from argparse import ArgumentParser 4 | import time 5 | 6 | import torch 7 | import numpy as np 8 | from torch.utils.data import DataLoader 9 | from torchvision.transforms import v2 as T 10 | 11 | device = torch.device("cuda:0") 12 | 13 | 14 | parser = ArgumentParser() 15 | parser.add_argument('--workers', type=int, help='number of data loading workers', default=4) 16 | parser.add_argument('--imageSize', type=int, default=64, help='the height / width of the input image to network') 17 | parser.add_argument('--batchSize', type=int, default=32, help='input batch size') 18 | parser.add_argument('--csv',action='store_true',help='print format') 19 | 20 | opt = parser.parse_args() 21 | 22 | DATA_PATH = Path.home() / "datasets" / "huggingface" 23 | 24 | dataset = load_dataset("imagenet-1k", cache_dir=DATA_PATH.as_posix(), trust_remote_code=False) 25 | 26 | tv_transforms = T.Compose([ 27 | T.Lambda(lambda x: x.convert("RGB")), 28 | T.Resize(opt.imageSize), 29 | T.CenterCrop(opt.imageSize), 30 | T.ToImage(), 31 | T.ToDtype(torch.float32, scale=True), 32 | T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), 33 | ]) 34 | 35 | def transforms(batch): 36 | batch["image"] = tv_transforms(batch["image"]) 37 | return batch 38 | 39 | dataset.set_transform(transforms) 40 | 41 | itos = dataset["train"].features["label"].int2str 42 | stoi = dataset["train"].features["label"].str2int 43 | 44 | 45 | 46 | NUM_CHANNELS = 3 47 | TRANSFORMED_SHAPE = (NUM_CHANNELS, opt.imageSize, opt.imageSize) 48 | 49 | import h5py 50 | from tqdm import tqdm 51 | 52 | with h5py.File('bench_imagenet.hdf5', 'a') as h5f: 53 | for split in dataset: 54 | # Count number of samples in the split 55 | num_samples = len(dataset[split]) 56 | 57 | # Create a group for each split in the H5PY file 58 | group = h5f[split] if split in h5f else h5f.create_group(split) 59 | 60 | # Assuming a fixed image size, adjust as per actual size 61 | image_shape = TRANSFORMED_SHAPE # Example shape, modify as needed 62 | dtype = np.float32 # Modify as per actual data type 63 | 64 | # Preallocate datasets for images and labels 65 | MAX_IMAGE_SHAPE = (num_samples, *TRANSFORMED_SHAPE) 66 | images_ds = group['images'] if 'images' in group else group.create_dataset('images', shape=MAX_IMAGE_SHAPE, maxshape=MAX_IMAGE_SHAPE, dtype=dtype) 67 | 68 | MAX_LABEL_SHAPE = (num_samples, ) 69 | labels_ds = group['labels'] if 'labels' in group else group.create_dataset('labels', shape=MAX_LABEL_SHAPE, maxshape=MAX_LABEL_SHAPE, dtype=np.int64, compression='gzip') 70 | 71 | dataloader = DataLoader(dataset[split], batch_size=opt.batchSize, num_workers=opt.workers) 72 | start = time.perf_counter() 73 | if 
opt.csv: 74 | print("batch_start, batch_end, time_s, throughput_s, process_s, write_s") 75 | # Fill the datasets 76 | for i, batch in enumerate(dataloader): 77 | batch_start = i * opt.batchSize 78 | batch_end = min(batch_start + opt.batchSize, num_samples) 79 | process_s = f"{(time.perf_counter() - start)}" 80 | 81 | images_ds[batch_start:batch_end] = batch["image"].numpy() 82 | labels_ds[batch_start:batch_end] = batch["label"].numpy() 83 | write_s = f"{(time.perf_counter() - start)}" 84 | 85 | if opt.csv: 86 | print(f"{batch_start}, {batch_end}, {(time.perf_counter() - start)}, {batch_end / (time.perf_counter() - start)}, {process_s}, {write_s}") 87 | else: 88 | print(f"filling batch between {batch_start=} {batch_end=}") 89 | 90 | 91 |
-------------------------------------------------------------------------------- /thoughts/bench/preprocessing/hdf5_dataloader.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | from torch.utils.data import DataLoader, Dataset 3 | import time 4 | import torch 5 | 6 | from argparse import ArgumentParser 7 | 8 | device = torch.device("cuda:0") 9 | 10 | parser = ArgumentParser() 11 | parser.add_argument("--workers", type=int, help="number of workers", default=4) 12 | parser.add_argument("--batch_size", type=int, help="batch_size", default=32) 13 | parser.add_argument("--file", type=str, help="hdf5 file", default="preprocessed_imagenet.hdf5") 14 | parser.add_argument("--group", type=str, help="hdf5 group", default="train") 15 | opt = parser.parse_args() 16 | 17 | class H5Dataset(Dataset): 18 | def __init__(self, file_name, split="train"): 19 | with h5py.File(file_name, 'r') as f: 20 | self.length = len(f[split]["images"]) 21 | self.img_hdf5 = h5py.File(file_name, 'r') 22 | self.dataset = self.img_hdf5[split] # if you want dataset. 23 | 24 | def __getitem__(self, index): 25 | # index straight into the hdf5 group opened in __init__ 26 | return ( 27 | self.dataset["images"][index], 28 | self.dataset["labels"][index], 29 | ) 30 | 31 | def __len__(self): 32 | return self.length 33 | 34 | class LXRTDataLoader(Dataset): 35 | def __init__(self, file_name, split="train"): 36 | with h5py.File(file_name, 'r') as f: 37 | self.length = len(f[split]["images"]) 38 | self.img_hdf5 = h5py.File(file_name, 'r') 39 | self.dataset = self.img_hdf5[split] # if you want dataset.
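# Note on the handle opened above: an h5py.File object cannot safely be shared across the
# worker processes a DataLoader forks when num_workers > 0 (reads may fail or return
# corrupted data). The loader at the bottom of this file uses num_workers=0, which
# sidesteps the problem. A common alternative (an assumption, not something this script
# does) is to store only the path in __init__ and open the file lazily in each worker:
#
#     def __getitem__(self, index):
#         if self.img_hdf5 is None:  # assumes __init__ saved file_name/split and set this to None
#             self.img_hdf5 = h5py.File(self.file_name, "r")
#             self.dataset = self.img_hdf5[self.split]
#         return self.dataset["images"][index], self.dataset["labels"][index]
#
# so every worker process ends up with its own handle.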
40 | 41 | def __getitem__(self, index: int): 42 | img0 = self.dataset["images"][index] # Do loading here 43 | img1 = self.dataset["labels"][index] 44 | return img0, img1 45 | 46 | def __len__(self): 47 | return self.length 48 | 49 | train_ds = LXRTDataLoader(opt.file) 50 | train_loader = torch.utils.data.DataLoader( 51 | dataset=train_ds, 52 | batch_size=32, 53 | num_workers=0 54 | ) 55 | 56 | for i in train_loader: 57 | print(i) 58 | -------------------------------------------------------------------------------- /thoughts/bench/preprocessing/hdf5_handwritten_loader.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | from torch.utils.data import DataLoader, Dataset 3 | import time 4 | import torch 5 | 6 | from argparse import ArgumentParser 7 | 8 | device = torch.device("cuda:0") 9 | 10 | parser = ArgumentParser() 11 | parser.add_argument("--workers", type=int, help="number of workers", default=4) 12 | parser.add_argument("--batch_size", type=int, help="batch_size", default=32) 13 | parser.add_argument("--file", type=str, help="hdf5 file", default="preprocessed_imagenet.hdf5") 14 | parser.add_argument("--group", type=str, help="hdf5 group", default="train") 15 | parser.add_argument("--cuda", action="store_true", help="move images onto device") 16 | opt = parser.parse_args() 17 | 18 | with h5py.File(opt.file, "r") as f: 19 | start = time.perf_counter() 20 | print(f"bs={opt.batch_size}-{'cuda' if opt.cuda else 'cpu'}") 21 | print(f"idx, time_s, throughput_s") 22 | for i in range(0, f[opt.group]['images'].shape[0], opt.batch_size): 23 | image = torch.from_numpy(f[opt.group]["images"][i:i + opt.batch_size]) 24 | if opt.cuda: 25 | image.to(device) 26 | label = f[opt.group]["labels"][i:i + opt.batch_size] 27 | 28 | print(f"{i}, {(time.perf_counter() - start)}, {(i + opt.batch_size) / (time.perf_counter() - start)}") 29 | -------------------------------------------------------------------------------- /thoughts/bench/preprocessing/load_hf_imagenet_1k.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from pathlib import Path 3 | from argparse import ArgumentParser 4 | import time 5 | 6 | import torch 7 | from torch.utils.data import DataLoader 8 | from torchvision.transforms import v2 as T 9 | 10 | device = torch.device("cuda:0") 11 | 12 | parser = ArgumentParser() 13 | parser.add_argument('--workers', type=int, help='number of data loading workers', default=4) 14 | parser.add_argument('--imageSize', type=int, default=64, help='the height / width of the input image to network') 15 | parser.add_argument('--batchSize', type=int, default=32, help='input batch size') 16 | parser.add_argument('--csv',action='store_true',help='print format') 17 | 18 | opt = parser.parse_args() 19 | 20 | DATA_PATH = Path.home() / "datasets" / "huggingface" 21 | 22 | dataset = load_dataset("imagenet-1k", cache_dir=DATA_PATH.as_posix()) 23 | 24 | tv_transforms = T.Compose([ 25 | T.Lambda(lambda x: x.convert("RGB")), 26 | T.Resize(opt.imageSize), 27 | T.CenterCrop(opt.imageSize), 28 | T.ToImage(), 29 | T.ToDtype(torch.float32, scale=True), 30 | T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), 31 | ]) 32 | 33 | def transforms(batch): 34 | batch["image"] = tv_transforms(batch["image"]) 35 | return batch 36 | dataset.set_transform(transforms) 37 | 38 | dataloader = DataLoader(dataset["train"], batch_size=opt.batchSize, num_workers=opt.workers) 39 | # print(dataset) 40 | 41 | if opt.csv: 42 | 
print("idx,images,img/s") 43 | 44 | start = time.perf_counter() 45 | for idx, batch in enumerate(dataloader): 46 | # batch["image"].to(device) 47 | # if idx > 0 and idx % 10 == 0: 48 | # if opt.csv: 49 | # print(f"{idx},{idx * opt.batchSize},{(idx * opt.batchSize) / (time.perf_counter() - start)}") 50 | # else: 51 | # print(f"idx: {idx} | images: {idx * opt.batchSize} | throughput {(idx * opt.batchSize) / (time.perf_counter() - start)} img/s") 52 | if opt.csv: 53 | print(f"{idx},{idx * opt.batchSize},{(idx * opt.batchSize) / (time.perf_counter() - start)}") 54 | else: 55 | print(f"idx: {idx} | images: {idx * opt.batchSize} | throughput {(idx * opt.batchSize) / (time.perf_counter() - start)} img/s") 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /thoughts/bench/preprocessing/load_imagefolder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision.transforms import v2 as T 3 | import torchvision.datasets as ds 4 | from argparse import ArgumentParser 5 | import time 6 | 7 | parser = ArgumentParser() 8 | parser.add_argument('--dataroot', required=True, help='path to dataset') 9 | parser.add_argument('--workers', type=int, help='number of data loading workers', default=4) 10 | parser.add_argument('--batchSize', type=int, default=32, help='input batch size') 11 | parser.add_argument('--imageSize', type=int, default=64, help='the height / width of the input image to network') 12 | parser.add_argument('--niter', type=int, default=25, help='number of epochs to train for') 13 | parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002') 14 | parser.add_argument('--cuda', action='store_true', default=False, help='enables cuda') 15 | parser.add_argument('--dry-run', action='store_true', help='check a single training cycle works') 16 | parser.add_argument('--ngpu', type=int, default=1, help='number of GPUs to use') 17 | parser.add_argument('--outf', default='.', help='folder to output images and model checkpoints') 18 | parser.add_argument('--manualSeed', type=int, help='manual seed') 19 | parser.add_argument('--csv',action='store_true',help='print format') 20 | 21 | opt = parser.parse_args() 22 | 23 | dataset = ds.ImageFolder(root=opt.dataroot, 24 | transform=T.Compose([ 25 | T.Resize(opt.imageSize), 26 | T.CenterCrop(opt.imageSize), 27 | T.ToImage(), 28 | T.ToDtype(torch.float32, scale=True), 29 | T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), 30 | ])) 31 | 32 | dataloader = torch.utils.data.DataLoader(dataset, batch_size=opt.batchSize, 33 | shuffle=True, num_workers=int(opt.workers)) 34 | 35 | 36 | start = time.perf_counter() 37 | if opt.csv: 38 | print("idx,images,img/s") 39 | for idx, batch in enumerate(dataloader): 40 | if idx > 0 and idx % 10 == 0: 41 | if opt.csv: 42 | print(f"{idx},{idx * opt.batchSize},{(idx * opt.batchSize) / (time.perf_counter() - start)}") 43 | else: 44 | print(f"idx: {idx} | images: {idx * opt.batchSize} | throughput {(idx * opt.batchSize) / (time.perf_counter() - start)} img/s") 45 | end = time.perf_counter() 46 | 47 | elapsed = end - start 48 | item = batch[0] 49 | data_size = item.nelement() * item.element_size() 50 | print(f"{data_size=} | {item.shape=} | {item.nelement()=} | {item.element_size()=}") 51 | throughput = (data_size * idx) / elapsed / (1<<30) 52 | 53 | print(f"{elapsed}s | {(opt.batchSize * idx) / elapsed} img/s | {idx / elapsed} it/s | {throughput} GiB/s") 54 | 55 | 
-------------------------------------------------------------------------------- /thoughts/bench/preprocessing/np_memmap_loader.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader, Dataset 2 | import time 3 | import torch 4 | import numpy as np 5 | 6 | from argparse import ArgumentParser 7 | 8 | device = torch.device("cuda:0") 9 | 10 | parser = ArgumentParser() 11 | parser.add_argument("--workers", type=int, help="number of workers", default=4) 12 | parser.add_argument("--batch_size", type=int, help="batch_size", default=32) 13 | parser.add_argument("--file", type=str, help="hdf5 file", default="data/train_images.dat") 14 | parser.add_argument("--group", type=str, help="hdf5 group", default="train") 15 | parser.add_argument("--cuda", action="store_true", help="move images onto device") 16 | opt = parser.parse_args() 17 | 18 | fp = np.memmap(opt.file, dtype="float32", mode="r", shape=(1281167, 3, 64, 64)) 19 | loader = DataLoader(fp, num_workers=opt.workers, batch_size=opt.batch_size, pin_memory=True) 20 | 21 | print(f"bs={opt.batch_size}-{'cuda' if opt.cuda else 'cpu'}") 22 | print(f"idx, time_s, throughput_s") 23 | start = time.perf_counter() 24 | for i, batch in enumerate(loader, 1): 25 | image = batch 26 | if opt.cuda: 27 | out = image.to(device) * 2 28 | torch.cuda.synchronize() 29 | print(f"{i}, {(time.perf_counter() - start)}, {(i * opt.batch_size) / (time.perf_counter() - start)}") 30 | torch.cuda.synchronize() 31 | print(f"{i}, {(time.perf_counter() - start)}, {(i * opt.batch_size) / (time.perf_counter() - start)}") 32 | -------------------------------------------------------------------------------- /thoughts/bench/preprocessing/read_hdf5.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | from argparse import ArgumentParser 3 | 4 | parser = ArgumentParser() 5 | parser.add_argument("--index", type=int, help="retrieve index from hdf5") 6 | parser.add_argument("--file", type=str, help="hdf5 file", default="imagenet.hdf5") 7 | parser.add_argument("--group", type=str, help="hdf5 group", default="train") 8 | opt = parser.parse_args() 9 | 10 | filename = opt.file 11 | with h5py.File(filename, "r") as f: 12 | # Print all root level object names (aka keys) 13 | # these can be group or dataset names 14 | print("="*60) 15 | print(f"Group Keys: {f.keys()}") 16 | print(f"Selected: {opt.group}") 17 | print(f"""{f[opt.group]["images"].shape=}""") 18 | print(f"""{f[opt.group]["labels"].shape=}""") 19 | print("="*60 + "\n") 20 | image = f[opt.group]["images"][opt.index] 21 | print(image) 22 | print(f"{image.shape=}") 23 | label = f[opt.group]["labels"][opt.index] 24 | print(f"{label=}") 25 | -------------------------------------------------------------------------------- /thoughts/view_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pathlib import Path 3 | 4 | from safetensors import safe_open 5 | from enum import StrEnum 6 | 7 | class ModelType(StrEnum): 8 | SAFETENSORS = ".safetensors" 9 | PTH = ".pth" 10 | CKPT = ".ckpt" 11 | 12 | 13 | class ModelViewer: 14 | def __init__(self, model_path: Path): 15 | if not model_path.exists(): raise FileExistsError(f"Can't find file {model_file}") 16 | 17 | self.model_path = model_path 18 | 19 | def view_weights(self): 20 | match self.model_path.suffix: 21 | case ModelType.SAFETENSORS: 22 | with safe_open(self.model_path.as_posix(), framework="pt", device="cpu") 
as f: # type: ignore 23 | table = "{key:<{key_width}} {dtype:<15} {shape:<20}" 24 | longest_key = max(map(lambda x: len(x), f.keys())) 25 | for key in f.keys(): 26 | value = f.get_tensor(key) 27 | print(table.format( 28 | key=key, 29 | dtype=str(value.dtype), 30 | shape=str(value.shape), 31 | key_width=longest_key + 5 32 | )) 33 | 34 | case ModelType.PTH: 35 | state_dict = torch.load(self.model_path.as_posix(), map_location="meta", mmap=True, weights_only=True) # meta device: inspect dtypes/shapes without materializing the weights 36 | table = "{key:<{key_width}} {dtype:<15} {shape:<20}" 37 | longest_key = max(map(lambda x: len(x), state_dict.keys())) 38 | 39 | for key, value in state_dict.items(): 40 | print(table.format( 41 | key=key, 42 | dtype=str(value.dtype), 43 | shape=str(value.shape), 44 | key_width=longest_key + 5 45 | )) 46 | case _: 47 | print(f"unknown suffix: {self.model_path.suffix}") 48 | 49 | def get_file_metadata(self): 50 | full_path = self.model_path.resolve().as_posix() 51 | stats = self.model_path.stat() 52 | TABLE_FORMAT = "{:<{label_size}} {:<{field_size}}" 53 | table_data = [ 54 | ("file_size", sizeof_fmt(stats.st_size)), 55 | ("full_path", full_path), 56 | ("file_name", self.model_path.name), 57 | ("suffix", self.model_path.suffix), 58 | ] 59 | print("="*60) 60 | for label, field in table_data: 61 | print(TABLE_FORMAT.format( 62 | label + ":", field, 63 | label_size = 12, 64 | field_size = 20) 65 | ) 66 | print("="*60) 67 | 68 | def sizeof_fmt(num, suffix="B"): 69 | """ Returns a human readable string representation of bytes """ 70 | for unit in ("", "Ki", "Mi", "Gi", "Ti"): 71 | if abs(num) < 1024.0: 72 | return f"{num:3.1f}{unit}{suffix}" 73 | num /= 1024.0 74 | return f"{num:.1f}Ti{suffix}" 75 | 76 | 77 | if __name__ == "__main__": 78 | import argparse 79 | parser = argparse.ArgumentParser( 80 | prog='ModelViewer', 81 | description="Reads model files and prints visual representations of the layout") 82 | 83 | parser.add_argument('FILENAME') 84 | parser.add_argument('-v', '--verbose', 85 | action='store_true') 86 | 87 | args = parser.parse_args() 88 | 89 | model_file = Path(args.FILENAME) 90 | model = ModelViewer(model_file) 91 | model.get_file_metadata() 92 | model.view_weights() 93 | --------------------------------------------------------------------------------
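ModelType declares a CKPT member, but view_weights only matches SAFETENSORS and PTH, so a .ckpt file currently falls through to the unknown-suffix branch. A sketch of how that case could be handled, assuming a Lightning-style checkpoint that nests the weights under a "state_dict" key (the helper name is made up):

```python
import torch
from pathlib import Path

def view_ckpt_weights(model_path: Path) -> None:
    # .ckpt checkpoints usually wrap the weights in a dict under "state_dict";
    # fall back to treating the whole object as a state dict if the key is absent.
    # weights_only=True refuses arbitrary pickled objects; drop it for checkpoints
    # that store extra metadata such as hyperparameter namespaces.
    checkpoint = torch.load(model_path.as_posix(), map_location="cpu", weights_only=True)
    state_dict = checkpoint.get("state_dict", checkpoint)

    table = "{key:<{key_width}} {dtype:<15} {shape:<20}"
    longest_key = max(len(key) for key in state_dict)
    for key, value in state_dict.items():
        print(table.format(
            key=key,
            dtype=str(value.dtype),
            shape=str(value.shape),
            key_width=longest_key + 5,
        ))
```

Invocation stays the same as for the other formats, e.g. `python view_model.py path/to/last.ckpt`, once a `case ModelType.CKPT:` branch dispatches to something like this helper.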