├── src
│   ├── downloader.py
│   └── dataloader.py
└── README.md

/src/downloader.py:
--------------------------------------------------------------------------------
import argparse
import os
import time
import zipfile
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm


class CLDatasets:
    """
    A class for downloading continual learning datasets from Google Cloud Storage.
    """

    def __init__(self, dataset: str, directory: str, unzip: bool = True):
        """
        Initialize the CLDatasets object.

        Args:
            dataset (str): The name of the dataset to download.
                One of 'CGLM', 'CLOC', or 'ImageNet2K'.
            directory (str): The directory where the dataset will be saved.
            unzip (bool): Whether to extract the downloaded zip files.

        Raises:
            ValueError: If the dataset name is not recognized.
        """
        if dataset not in ('CGLM', 'CLOC', 'ImageNet2K'):
            raise ValueError(
                f"Dataset '{dataset}' not found! "
                "Choose one of 'CGLM', 'CLOC', or 'ImageNet2K'.")

        self.dataset = dataset
        self.directory = directory

        os.makedirs(self.directory, exist_ok=True)

        print("Dataset selected:", dataset)
        self.download_dataset()

        if unzip:
            self.unzip_data_files(
                os.path.join(self.directory, self.dataset, 'data'))

    def download_dataset(self):
        """
        Download the dataset files from Google Cloud Storage.
        """
        print("Dataset files are being downloaded...")
        start_time = time.time()
        download_command = f"gsutil -m cp -r gs://cl-datasets/{self.dataset} {self.directory}/"
        os.system(download_command)
        elapsed_time = time.time() - start_time
        print(f"Elapsed time: {elapsed_time:.2f} seconds")

    def unzip_data_files(self, directory: str) -> None:
        """
        Extract the contents of the zip files in a directory into folders
        named after each archive, then delete the archives.

        Args:
            directory: The path to the directory containing the zip files.

        Returns:
            None
        """
        zip_files = [file for file in os.listdir(directory)
                     if file.endswith('.zip')]

        def extract_single_zip(zip_file: str) -> None:
            zip_path = os.path.join(directory, zip_file)
            output_dir = os.path.join(directory, os.path.splitext(zip_file)[0])

            os.makedirs(output_dir, exist_ok=True)

            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(output_dir)

        # Extract archives in parallel, advancing the progress bar as each finishes.
        with ThreadPoolExecutor() as executor, tqdm(total=len(zip_files)) as pbar:
            futures_list = []
            for zip_file in zip_files:
                future = executor.submit(extract_single_zip, zip_file)
                future.add_done_callback(lambda _: pbar.update(1))
                futures_list.append(future)

            # Wait for all tasks to complete and surface any extraction errors.
            for future in futures_list:
                future.result()

        # Remove the zip files once extraction has succeeded.
        for zip_file in zip_files:
            os.remove(os.path.join(directory, zip_file))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Download datasets from Google Cloud Storage.')
    parser.add_argument('--dataset', type=str, default='CGLM',
                        help='The name of the dataset to download.')
    parser.add_argument('--directory', type=str, default='/data/cl_datasets/files/CGLM/',
                        help='The directory where the dataset will be saved.')
    parser.add_argument('--unzip', action='store_true',
                        help='Whether to unzip the downloaded files.')

    args = parser.parse_args()

    gcp_cl_datasets = CLDatasets(
        dataset=args.dataset,
        directory=args.directory,
        unzip=args.unzip)

/src/dataloader.py:
--------------------------------------------------------------------------------
import os
from typing import Callable, Optional, Tuple

import h5py
from PIL import Image


class BaseDataClass:
    """Base class for a data class."""

    def __init__(self, dataset: str, directory: str):
        """
        Initialize the BaseDataClass.

        Args:
            dataset (str): Name of the dataset.
            directory (str): Path to the directory containing the data.

        Raises:
            FileNotFoundError: If the 'order_files' or 'data' directory is not found.
        """
        self.dataset = dataset
        self.directory = directory

        # Check that the 'order_files' and 'data' directories exist in the directory.
        if not os.path.exists(os.path.join(self.directory, 'order_files')):
            raise FileNotFoundError("order_files directory not found!")

        if not os.path.exists(os.path.join(self.directory, 'data')):
            raise FileNotFoundError("data directory not found!")

        print(f"Found 'order_files' and 'data' directories for {self.dataset}!")

    def __getitem__(self, index):
        """
        Get an item from the data class.

        Args:
            index: Index of the item to retrieve.

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError

    def __len__(self):
        """
        Get the length of the data class.

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError


class H5Dataset(BaseDataClass):
    def __init__(self, dataset: str, directory: str, partition: str,
                 transform: Optional[Callable] = None):
        """
        Initialize the H5Dataset.

        Args:
            dataset (str): Dataset name.
            directory (str): Directory path.
            partition (str): One of 'train', 'test', or 'pretrain' (all datasets),
                or 'pretest', 'preval', 'cls_inc', 'data_inc' (ImageNet2K only).
            transform (callable, optional): Transform to apply to the samples.
                Defaults to None.

        Raises:
            FileNotFoundError: If any of the required HDF5 files is not found.
        """
        super().__init__(dataset=dataset, directory=directory)
        self.directory = directory
        # The HDF5 files are kept open so entries can be read lazily per index.
        self.image_paths = h5py.File(
            f"{directory}/order_files/{partition}_image_paths.hdf5", "r")["store_list"]
        self.labels = h5py.File(
            f"{directory}/order_files/{partition}_labels.hdf5", "r")["store_list"]
        self.transform = transform

        assert len(self.image_paths) == len(self.labels)

    def __getitem__(self, index: int) -> Tuple[Image.Image, int]:
        """
        Get an item from the H5Dataset.

        Args:
            index (int): Index of the item to retrieve.

        Returns:
            tuple: A tuple containing the sample and its label.
        """
        img_path = os.path.join(
            self.directory, 'data',
            self.image_paths[index].decode("utf-8").strip())
        label = int(self.labels[index])
        sample = pil_loader(img_path)

        if self.transform is not None:
            sample = self.transform(sample)

        return sample, label

    def __len__(self) -> int:
        """
        Get the length of the H5Dataset.

        Returns:
            int: Length of the dataset.
        """
        return len(self.image_paths)


def pil_loader(path: str) -> Image.Image:
    # Open the path as a file to avoid a ResourceWarning
    # (https://github.com/python-pillow/Pillow/issues/835).
    with open(path, "rb") as f:
        img = Image.open(f)
        return img.convert("RGB")


if __name__ == "__main__":
    BaseDataClass(dataset='ImageNet2K',
                  directory='/data/cl_datasets/files/ImageNet2K/')

    dataset = H5Dataset(
        dataset='ImageNet2K',
        directory="/data/cl_datasets/files/ImageNet2K/",
        partition='data_inc')
    print(len(dataset))
    dataset[1][0].show()

/README.md:
--------------------------------------------------------------------------------
# Continual Learning Datasets 📚

Welcome to the Continual Learning Datasets repository! Here, we aim to make large-scale continual learning datasets easily accessible to everyone. This repository provides a convenient way to download three large-scale, diverse datasets: CLOC, CGLM, and ImageNet2K. Feel free to explore, experiment, and contribute!
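
## Quick Start 🚀

A minimal usage sketch (paths are illustrative and follow the defaults in `src/downloader.py`; downloading requires `gsutil` on your `PATH`, plus the `tqdm`, `h5py`, and `Pillow` Python packages):

```python
from src.downloader import CLDatasets
from src.dataloader import H5Dataset

# Download gs://cl-datasets/CGLM into the target directory and extract the zips.
CLDatasets(dataset='CGLM', directory='/data/cl_datasets/files/CGLM/', unzip=True)

# Point H5Dataset at the folder that contains the 'order_files' and 'data'
# directories; 'train' is one of the partitions listed in its docstring.
dataset = H5Dataset(dataset='CGLM',
                    directory='/data/cl_datasets/files/CGLM/CGLM/',
                    partition='train')
print(len(dataset))        # number of (image, label) pairs
image, label = dataset[0]  # a PIL image and its integer label
```

Alternatively, `src/downloader.py` can be run directly as a script; see its `argparse` options for the dataset, target directory, and unzip flags.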