├── tests ├── test_data │ ├── .gitignore │ ├── example_metadata_id_102000022.json │ ├── example_metadata_tissue_154.json │ └── section_data_set_100055044_metadata.json └── test_aba_mouse_utils.py ├── src └── open_dataset_tools │ ├── __init__.py │ ├── aws_utils.py │ ├── ivy_gap_utils.py │ └── aba_mouse_utils.py ├── CONTRIBUTING.md ├── setup.py ├── LICENSE ├── .gitignore └── README.md /tests/test_data/.gitignore: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /src/open_dataset_tools/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" -------------------------------------------------------------------------------- /src/open_dataset_tools/aws_utils.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from botocore import UNSIGNED 3 | from botocore.client import Config 4 | 5 | 6 | def get_public_boto3_client(): 7 | """Convenience function to return a boto3 client that can access 8 | publically available AWS services (like public S3 buckets) without 9 | any AWS credentials. 10 | 11 | Returns 12 | ------- 13 | A boto3 client instance. 14 | """ 15 | public_client = boto3.client( 16 | 's3', config=Config(signature_version=UNSIGNED) 17 | ) 18 | return public_client 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Allen Institute Contribution Agreement 2 | 3 | This document describes the terms under which you may make “Contributions” — 4 | which may include without limitation, software additions, revisions, bug fixes, configuration changes, 5 | documentation, or any other materials — to any of the projects owned or managed by the Allen Institute. 6 | If you have questions about these terms, please contact us at terms@alleninstitute.org. 7 | 8 | You certify that: 9 | 10 | • Your Contributions are either: 11 | 12 | 1. Created in whole or in part by you and you have the right to submit them under the designated license 13 | (described below); or 14 | 2. Based upon previous work that, to the best of your knowledge, is covered under an appropriate 15 | open source license and you have the right under that license to submit that work with modifications, 16 | whether created in whole or in part by you, under the designated license; or 17 | 18 | 3. Provided directly to you by some other person who certified (1) or (2) and you have not modified them. 19 | 20 | • You are granting your Contributions to the Allen Institute under the terms of the [2-Clause BSD license](https://opensource.org/licenses/BSD-2-Clause) 21 | (the “designated license”). 22 | 23 | • You understand and agree that the Allen Institute projects and your Contributions are public and that 24 | a record of the Contributions (including all metadata and personal information you submit with them) is 25 | maintained indefinitely and may be redistributed consistent with the Allen Institute’s mission and the 26 | 2-Clause BSD license. 
27 | -------------------------------------------------------------------------------- /tests/test_data/example_metadata_id_102000022.json: -------------------------------------------------------------------------------- 1 | { 2 | "bits_per_component": 8, 3 | "id": 102000022, 4 | "resolution": 1.049, 5 | "section_number": 90, 6 | "downsampling": { 7 | "downsample_0": { 8 | "width": 14001, 9 | "height": 7601, 10 | "x": 0, 11 | "y": 0, 12 | "image_file_width": 14001, 13 | "image_file_height": 7601 14 | }, 15 | "downsample_1": { 16 | "width": 7000, 17 | "height": 3800, 18 | "x": 0, 19 | "y": 0, 20 | "image_file_width": 7000, 21 | "image_file_height": 3800 22 | }, 23 | "downsample_2": { 24 | "width": 3500, 25 | "height": 1900, 26 | "x": 0, 27 | "y": 0, 28 | "image_file_width": 3500, 29 | "image_file_height": 1900 30 | }, 31 | "downsample_3": { 32 | "width": 1750, 33 | "height": 950, 34 | "x": 0, 35 | "y": 0, 36 | "image_file_width": 1750, 37 | "image_file_height": 950 38 | }, 39 | "downsample_4": { 40 | "width": 875, 41 | "height": 475, 42 | "x": 0, 43 | "y": 0, 44 | "image_file_width": 875, 45 | "image_file_height": 475 46 | }, 47 | "downsample_5": { 48 | "width": 437, 49 | "height": 237, 50 | "x": 0, 51 | "y": 0, 52 | "image_file_width": 437, 53 | "image_file_height": 237 54 | }, 55 | "downsample_6": { 56 | "width": 218, 57 | "height": 118, 58 | "x": 0, 59 | "y": 0, 60 | "image_file_width": 218, 61 | "image_file_height": 118 62 | } 63 | }, 64 | "image_file_name": "section_image_102000022.tiff" 65 | } 66 | -------------------------------------------------------------------------------- /tests/test_data/example_metadata_tissue_154.json: -------------------------------------------------------------------------------- 1 | { 2 | "bits_per_component": 8, 3 | "id": 102000038, 4 | "resolution": 1.049, 5 | "section_number": 154, 6 | "downsampling": { 7 | "downsample_0": { 8 | "width": 15297, 9 | "height": 8097, 10 | "x": 0, 11 | "y": 0, 12 | "image_file_width": 15297, 13 | "image_file_height": 8097 14 | }, 15 | "downsample_1": { 16 | "width": 7648, 17 | "height": 4048, 18 | "x": 0, 19 | "y": 0, 20 | "image_file_width": 7648, 21 | "image_file_height": 4048 22 | }, 23 | "downsample_2": { 24 | "width": 3824, 25 | "height": 2024, 26 | "x": 0, 27 | "y": 0, 28 | "image_file_width": 3824, 29 | "image_file_height": 2024 30 | }, 31 | "downsample_3": { 32 | "width": 1912, 33 | "height": 1012, 34 | "x": 0, 35 | "y": 0, 36 | "image_file_width": 1912, 37 | "image_file_height": 1012 38 | }, 39 | "downsample_4": { 40 | "width": 956, 41 | "height": 506, 42 | "x": 0, 43 | "y": 0, 44 | "image_file_width": 956, 45 | "image_file_height": 506 46 | }, 47 | "downsample_5": { 48 | "width": 478, 49 | "height": 253, 50 | "x": 0, 51 | "y": 0, 52 | "image_file_width": 478, 53 | "image_file_height": 253 54 | }, 55 | "downsample_6": { 56 | "width": 239, 57 | "height": 126, 58 | "x": 0, 59 | "y": 0, 60 | "image_file_width": 239, 61 | "image_file_height": 126 62 | } 63 | }, 64 | "image_file_name": "section_image_102000038.tiff" 65 | } 66 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pathlib import Path 3 | from setuptools import setup, find_packages 4 | 5 | minimum_requirements = [ 6 | "Pillow", 7 | "pandas", 8 | "boto3", 9 | "matplotlib" 10 | ] 11 | 12 | full_requirements = minimum_requirements + [ 13 | "notebook" 14 | ] 15 | 16 | test_requirements = minimum_requirements + [ 17 | 
"pytest" 18 | ] 19 | 20 | version_file_path = ( 21 | Path(__file__).parent / "src/open_dataset_tools/__init__.py" 22 | ) 23 | 24 | def find_version(version_file_path: Path): 25 | with version_file_path.open("r") as f: 26 | version_file_contents = f.read() 27 | version_match = re.search( 28 | r"^__version__ = ['\"]([^'\"]*)['\"]", version_file_contents, re.M 29 | ) 30 | if version_match: 31 | return version_match.group(1) 32 | raise RuntimeError("Unable to find version string.") 33 | 34 | 35 | setup ( 36 | version=find_version(version_file_path), 37 | name="aibs_open_dataset_tools", 38 | description=( 39 | "An open source package containing example code and Ipython notebooks " 40 | "that demonstrate how to access open datasets such as the " 41 | "Allen Mouse Brain Atlas or the Ivy Glioblastoma Atlas" 42 | ), 43 | author="Scott Daniel, Nicholas Mei, Wayne Wakeman", 44 | author_email="waynew@alleninstitute.org", 45 | url="https://github.com/AllenInstitute/open_dataset_tools", 46 | package_dir={"": "src"}, 47 | packages=find_packages(where="src"), 48 | python_requires=">=3.8", 49 | install_requires=minimum_requirements, 50 | extras_require={ 51 | "full": full_requirements, 52 | "test": test_requirements 53 | } 54 | ) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Allen Institute Software License – This software license is the 2-clause BSD 2 | license plus a third clause that prohibits redistribution and use for 3 | commercial purposes without further permission. 4 | 5 | Copyright © 2020. Allen Institute. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | 3. Redistributions and use for commercial purposes are not permitted without 18 | the Allen Institute’s written permission. For purposes of this license, 19 | commercial purposes are the incorporation of the Allen Institute's software 20 | into anything for which you will charge fees or other compensation or use of 21 | the software to perform a commercial service for a third party. Contact 22 | terms@alleninstitute.org for commercial licensing opportunities. 23 | 24 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 25 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 26 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 27 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 28 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 30 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 31 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 32 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 33 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | This repository contains code to support the download of images and metadata 4 | from two free and publically available datasets. 5 | 6 | Example notebooks for how to use the provided code can be found under the 7 | [example_notebooks subfolder](example_notebooks/) of this repository. 
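
Once the package is installed (see below), a typical session looks roughly like the
following minimal sketch. It assumes anonymous access to the public S3 buckets and an
arbitrary scratch directory (`./scratch`) for downloaded metadata; the example notebooks
cover the same workflow in much more detail.

```
from open_dataset_tools.aws_utils import get_public_boto3_client
from open_dataset_tools.aba_mouse_utils import get_atlas_metadata

# Anonymous (unsigned) S3 client -- no AWS credentials required
client = get_public_boto3_client()

# Download section_data_sets.json into ./scratch and parse it into a list of dicts
atlas_metadata = get_atlas_metadata(download_directory="./scratch", s3_client=client)
print(len(atlas_metadata))
```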
8 | 9 | ## Installing tools 10 | 11 | To install the tools provided by this repository simply: 12 | 13 | ``` 14 | # Feel free to change /tmp/open_dataset_tools to whatever location works best for you 15 | 16 | > git clone https://github.com/AllenInstitute/open_dataset_tools /tmp/open_dataset_tools 17 | 18 | # Don't forget to activate the python environment you want to install this package into 19 | # The following example creates an environment using Anaconda Python 20 | # See: https://docs.conda.io/en/latest/miniconda.html 21 | 22 | > conda create --yes --name open_dataset_tools python=3.8 23 | > conda activate open_dataset_tools 24 | > cd /tmp/open_dataset_tools 25 | > pip install .[full] 26 | ``` 27 | 28 | ## Running the example notebooks for yourself 29 | 30 | Once you've installed the package you can access and try running the example 31 | notebooks with: 32 | 33 | ``` 34 | > conda activate open_dataset_tools 35 | > cd /tmp/open_dataset_tools/example_notebooks 36 | > jupyter notebook 37 | ``` 38 | 39 | ## Allen Mouse Brain Atlas 40 | The [Allen Mouse Brain Atlas](https://registry.opendata.aws/allen-mouse-brain-atlas/) 41 | is now hosted on AWS. 42 | 43 | The Jupyter notebook [Accessing_Allen_Mouse_Brain_Atlas_Data.ipynb](example_notebooks/Accessing_Allen_Mouse_Brain_Atlas_Data.ipynb) 44 | demonstrates all of the helper functions necessary to download and view 45 | images from the atlas. It also demonstrates functions for downloading 46 | metadata and loading them as python data structures for easy searching, 47 | so that users can identify the images that will be most helpful in their 48 | research. 49 | 50 | This module uses the open source [boto3](https://github.com/boto/boto3) API 51 | to interface with AWS S3. 52 | 53 | ## Ivy Glioblastoma Atlas Project 54 | 55 | The Jupyter notebook [Accessing_Ivy_Glioblastoma_Atlas_Project_Data.ipynb](example_notebooks/Accessing_Ivy_Glioblastoma_Atlas_Project_Data.ipynb) demonstrates how to 56 | programmatically search, download, and interact with images from the 57 | [Ivy Glioblastoma Atlas Project](https://glioblastoma.alleninstitute.org/). 58 | 59 | # Level of Support 60 | 61 | This module is provided as a means to give the broader neuroscience community 62 | access to the data generated by the Allen Institute. As such, we are very 63 | interested in maintaining the code so that it remains useful. Please file 64 | any bug reports through [GitHub](https://github.com/AllenInstitute/open_dataset_tools/issues). 65 | We will consider pull requests, so long as they do not conflict with internal 66 | Institute policies regarding software. 67 | -------------------------------------------------------------------------------- /src/open_dataset_tools/ivy_gap_utils.py: -------------------------------------------------------------------------------- 1 | import io 2 | import warnings 3 | from typing import Optional 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import matplotlib.image as mp_img 9 | from PIL import Image 10 | 11 | from open_dataset_tools.aws_utils import get_public_boto3_client 12 | 13 | 14 | def load_s3_json_as_dataframe(client, bucket: str, key: str) -> pd.DataFrame: 15 | """Given a boto3 S3 client, an S3 bucket name, and a key for a 16 | JSON file to be downloaded, download it and parse it as a pandas 17 | DataFrame. 
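
    For example (an illustrative sketch; the bucket and key shown are the real
    values used by `get_donor_metadata` later in this module):

        client = get_public_boto3_client()
        donor_df = load_s3_json_as_dataframe(
            client, "allen-ivy-glioblastoma-atlas", "donor_metadata.json"
        )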
18 | 19 | Parameters 20 | ---------- 21 | client : Boto3 S3 Client 22 | A boto3 s3 Client object 23 | bucket : str 24 | The name of the bucket to download JSON file from 25 | key : str 26 | The S3 key for accessing the JSON file 27 | 28 | Returns 29 | ------- 30 | pd.DataFrame 31 | A pandas DataFrame 32 | """ 33 | obj = client.get_object(Bucket=bucket, Key=key)["Body"] 34 | json_data = io.BytesIO(obj.read()) 35 | return pd.read_json(json_data) 36 | 37 | 38 | def get_donor_metadata(client) -> pd.DataFrame: 39 | bucket = "allen-ivy-glioblastoma-atlas" 40 | key = "donor_metadata.json" 41 | return load_s3_json_as_dataframe(client, bucket, key) 42 | 43 | 44 | def get_specimen_metadata(client) -> pd.DataFrame: 45 | bucket = "allen-ivy-glioblastoma-atlas" 46 | key = "specimen_metadata.json" 47 | return load_s3_json_as_dataframe(client, bucket, key) 48 | 49 | 50 | def get_section_metadata(client) -> pd.DataFrame: 51 | bucket = "allen-ivy-glioblastoma-atlas" 52 | key = "section_metadata.json" 53 | return load_s3_json_as_dataframe(client, bucket, key) 54 | 55 | 56 | class ImagePromise(object): 57 | """The ImagePromise class is intended to defer loading IVY GAP image 58 | data until absolutely necessary, because the images take up a potentially 59 | huge amount of memory. 60 | 61 | This class is instantiated with image width/height in pixels as well as an 62 | s3 object url string which specifies a specific image: 63 | s3://allen-ivy-glioblastoma-atlas/{specific_image_resource_key} 64 | 65 | Optionally, a local save directory can also be provided that contains 66 | the specific image to have the class load images from local disk 67 | instead of downloading from S3. 68 | 69 | When the load() method is called, if only an s3_obj_url was provided 70 | then the image repreasented by the ImagePromise will be downloaded from 71 | S3. If both s3_obj_url and local_save_directory were provided, then 72 | the image will be loaded from local disk. 73 | """ 74 | 75 | def __init__( 76 | self, 77 | s3_obj_url: str, 78 | image_width: int, 79 | image_height: int, 80 | local_save_directory: Optional[Path] = None, 81 | s3_client=None, 82 | verbose: bool = False 83 | ): 84 | # Absolutely necessary parameters 85 | self._s3_obj_url = s3_obj_url 86 | self._image_width = image_width 87 | self._image_height = image_height 88 | 89 | # Need either local_save_directory or s3_client 90 | # Give a warning if both are provided and prefer local_save_directory 91 | if local_save_directory and s3_client: 92 | warnings.warn( 93 | "Both an s3_client as well as local_save_directory parameter " 94 | "were provided, the s3_client parameter will be ignored!" 95 | ) 96 | 97 | if local_save_directory is None and s3_client is None: 98 | raise RuntimeError( 99 | "Must provide either an s3_client parameter or a " 100 | "local_save_directory parameter!" 
            )
        self._s3_client = s3_client
        self._local_save_dir = local_save_directory

        # Completely optional
        self._verbose = verbose

    @property
    def num_pixels(self) -> int:
        return self._image_width * self._image_height

    def load(self) -> Image.Image:

        # slice off the bucket prefix to recover the S3 key
        # (str.lstrip() would strip a character set, not a prefix)
        rel_path = self._s3_obj_url[len("s3://allen-ivy-glioblastoma-atlas/"):]

        if self._local_save_dir is not None:
            file_object = (self._local_save_dir / rel_path).resolve()
            if self._verbose:
                print(f"Loading image from: {str(file_object)}")
        else:
            file_object = io.BytesIO()
            if self._verbose:
                print(f"Downloading image from: {self._s3_obj_url}")
            self._s3_client.download_fileobj(
                Bucket="allen-ivy-glioblastoma-atlas",
                Key=rel_path,
                Fileobj=file_object
            )

        Image.MAX_IMAGE_PIXELS = self.num_pixels
        with Image.open(file_object, "r") as img:
            img.load()

        return img


def section_image_loader(
    section_meta_table: pd.DataFrame,
    section_data_set_id: int,
    local_save_directory: Optional[Path] = None,
    verbose: bool = False
) -> pd.DataFrame:
    """
    Given a section metadata DataFrame and a specific `section_data_set_id`
    return a DataFrame containing image data and metadata associated with
    the requested `section_data_set_id`.

    Parameters
    ----------
    section_meta_table : pd.DataFrame
        A section metadata table (can be obtained from `get_section_metadata()`
        or `local_section_metadata_loader`)
    section_data_set_id : int
        A unique identifier for a section entry in the section_meta_table
    local_save_directory: Optional[Path]
        A Python pathlib.Path object pointing to a user local directory
        where IVY GAP images have already been downloaded. If this parameter
        is left as None, then images will be downloaded from AWS S3.
    verbose:
        Whether this function should print updates on what it is downloading

    Returns
    -------
    pd.DataFrame
        A section image table that contains metadata about images associated
        with a section as well as "ImagePromise" objects which will
        yield PIL "Image" object instances when ImagePromise.load() is called.
    """

    try:
        sub_images = section_meta_table[
            section_meta_table["section_data_set_id"] == section_data_set_id
        ]["sub_images"].iloc[0]

    except IndexError as e:
        e_msg = (
            f"Could not find the `section_data_set_id` specified "
            f"({section_data_set_id}) in the `section_meta_table` provided!"
179 | ) 180 | raise RuntimeError(e_msg) from e 181 | 182 | if local_save_directory is None: 183 | s3_client = get_public_boto3_client() 184 | 185 | loaded_sub_images = [] 186 | for img_dict in sub_images: 187 | new_img_dict = {**img_dict} 188 | 189 | for k, v in new_img_dict["s3_data"].items(): 190 | 191 | # If user hasn't provided a local_save_directory then assume 192 | # we need to download from S3 193 | if local_save_directory is None: 194 | image_promise = ImagePromise( 195 | s3_obj_url=v, 196 | image_width=img_dict["width"], 197 | image_height=img_dict["height"], 198 | s3_client=s3_client, 199 | verbose=verbose 200 | ) 201 | else: 202 | image_promise = ImagePromise( 203 | s3_obj_url=v, 204 | image_width=img_dict["width"], 205 | image_height=img_dict["height"], 206 | local_save_directory=local_save_directory, 207 | verbose=verbose 208 | ) 209 | 210 | new_img_dict[k] = image_promise 211 | 212 | del new_img_dict["s3_data"] 213 | loaded_sub_images.append(new_img_dict) 214 | 215 | return pd.DataFrame(loaded_sub_images) 216 | 217 | 218 | def local_section_metadata_loader( 219 | local_save_directory: Path, 220 | verbose: bool = False 221 | ) -> pd.DataFrame: 222 | """ 223 | Download and save the section metadata file to a local save directory. 224 | If local_save_directory contains a previously downloaded section metadata file, 225 | that file will be loaded in lieu of a download. 226 | 227 | Parameters 228 | ---------- 229 | local_save_directory : Path 230 | A local path where the Ivy GAP dataset should be downloaded 231 | verbose : bool 232 | Whether detailed information about what files are being 233 | downloaded or loaded should be shown. 234 | 235 | Returns 236 | ------- 237 | pd.DataFrame 238 | A section metadata table 239 | """ 240 | 241 | s3_client = get_public_boto3_client() 242 | 243 | if not local_save_directory.exists(): 244 | local_save_directory.mkdir(parents=True, exist_ok=True) 245 | 246 | section_metadata_cache_loc = local_save_directory / "section_metadata.json" 247 | if not section_metadata_cache_loc.exists(): 248 | # Download and cache section_metadata table since it is the 249 | # only metadata of appreciable size (~56MB) 250 | if verbose: 251 | print( 252 | f"Downloading section_metadata.json to " 253 | f"{str(section_metadata_cache_loc)}\n" 254 | ) 255 | section_metadata = get_section_metadata(s3_client) 256 | section_metadata.to_json( 257 | section_metadata_cache_loc, 258 | orient="records", 259 | indent=4 260 | ) 261 | else: 262 | if verbose: 263 | print( 264 | f"Loading section_metadata.json from " 265 | f"{str(section_metadata_cache_loc)}\n" 266 | ) 267 | section_metadata = pd.read_json(section_metadata_cache_loc) 268 | 269 | return section_metadata 270 | 271 | 272 | def section_image_downloader( 273 | local_save_directory: Path, 274 | section_data_set_id: int, 275 | verbose: bool = False): 276 | """ 277 | Given a `local_save_directory` and a specific `section_data_set_id` 278 | download image data associated with the requested `section_data_set_id` 279 | and save it to the `local_save_directory` 280 | 281 | Parameters 282 | ---------- 283 | local_save_directory : Path 284 | Path to the desired local directory where downloaded images should 285 | be saved 286 | section_data_set_id : int 287 | A unique identifier for a section entry in the section_meta_table 288 | verbose: 289 | Whether this function should print updates on what it is downloading 290 | """ 291 | 292 | s3_client = get_public_boto3_client() 293 | 294 | section_metadata = 
local_section_metadata_loader(
        local_save_directory, verbose=verbose
    )

    try:
        sub_images = section_metadata[
            section_metadata["section_data_set_id"] == section_data_set_id
        ]["sub_images"].iloc[0]
    except IndexError as e:
        e_msg = (
            f"Could not find the `section_data_set_id` specified "
            f"({section_data_set_id}) in the `section_meta_table` provided!"
        )
        raise RuntimeError(e_msg) from e

    for img_dict in sub_images:

        for k, v in img_dict["s3_data"].items():
            # slice off the bucket prefix to recover the S3 key
            # (str.lstrip() would strip a character set, not a prefix)
            rel_path = v[len("s3://allen-ivy-glioblastoma-atlas/"):]
            local_save_path = local_save_directory / rel_path

            # Create parent directories if they don't exist
            local_save_path.parent.mkdir(parents=True, exist_ok=True)

            if verbose:
                print(f"Saving image from {v} to {str(local_save_path)}\n")

            with local_save_path.open('wb') as fp:
                s3_client.download_fileobj(
                    Bucket="allen-ivy-glioblastoma-atlas",
                    Key=rel_path,
                    Fileobj=fp
                )

--------------------------------------------------------------------------------
/src/open_dataset_tools/aba_mouse_utils.py:
--------------------------------------------------------------------------------
from typing import List, Tuple, Union
from pathlib import Path
import tempfile
import hashlib
import json
import copy
import warnings

from PIL import Image
from PIL import ImageFile

from open_dataset_tools.aws_utils import get_public_boto3_client


ImageFile.LOAD_TRUNCATED_IMAGES = True


def _get_aws_md5(
    fname: str, s3_client, bucket_name='allen-mouse-brain-atlas'
) -> str:
    """
    Get and return the md5 checksum (str) of a file in AWS
    """
    # get the md5sum of the file in S3 to determine
    # whether the local copy must be (re)downloaded
    obj_list = s3_client.list_objects(Bucket=bucket_name,
                                      Prefix=fname)['Contents']
    if len(obj_list) != 1:
        msg = '\nquerying bucket for %s ' % fname
        msg += 'returned %d results\n' % len(obj_list)
        raise RuntimeError(msg)

    return obj_list[0]['ETag'].replace('"','')


def _compare_md5(fname: Path, target: str) -> bool:
    """
    Compare the md5 checksum of the file specified by fname to the
    string specified by target. Return boolean result indicating if
    they are equal.
    """
    md5_obj = hashlib.md5()
    with open(fname, 'rb') as in_file:
        for line in in_file:
            md5_obj.update(line)
    return md5_obj.hexdigest() == target


def _need_to_download(
    aws_key: str, local_filename: Path, s3_client,
    bucket_name='allen-mouse-brain-atlas'
) -> Tuple[bool, str]:
    """
    Check whether or not aws_key needs to be downloaded to keep
    local_filename up-to-date.

    Parameters
    ----------
    aws_key is the Key of the file in S3

    local_filename is the name of the local file corresponding to aws_key

    s3_client is a boto3 client for the AWS S3 service

    bucket_name is the name of the S3 bucket where the file resides
    (Default: 'allen-mouse-brain-atlas')

    Returns
    -------
    A boolean that is True if the file needs to be downloaded from S3

    A string containing the md5 checksum of the file
    """
    target_md5 = _get_aws_md5(aws_key, s3_client, bucket_name=bucket_name)
    must_download = False
    if not local_filename.exists():
        must_download = True
    else:
        if not local_filename.is_file():
            raise RuntimeError(
                '\n%s\nexists but is not a file' % local_filename
            )

        if not _compare_md5(local_filename, target_md5):
            must_download = True
    return must_download, target_md5


def _get_aws_file(aws_key, local_filename: Path, s3_client,
                  bucket_name='allen-mouse-brain-atlas'):
    """
    Download the AWS file specified by bucket_name:aws_key to
    local_filename, but only if necessary

    Parameters
    ----------
    aws_key is the Key of the file in S3

    local_filename is the name of the local file corresponding to aws_key

    s3_client is a boto3 client for the AWS S3 service

    bucket_name is the name of the S3 bucket where the file resides
    (Default: 'allen-mouse-brain-atlas')

    Returns
    -------
    None; just download the file to the specified local_filename
    """
    (must_download, target_md5) = _need_to_download(
        aws_key, local_filename, s3_client, bucket_name=bucket_name
    )

    if must_download:
        print('Downloading %s' % aws_key)
        s3_client.download_file(
            Bucket=bucket_name,
            Key=aws_key,
            Filename=str(local_filename)
        )

        if not _compare_md5(local_filename, target_md5):
            msg = '\nDownloaded %s; ' % aws_key
            msg += 'md5 checksum != %s\n' % target_md5
            raise RuntimeError(msg)

    return None


def download_s3_metadata_file(
    download_directory: Union[str, Path],
    downloaded_local_fname: str,
    metadata_s3_key: str,
    s3_client=None,
    bucket_name: str = "allen-mouse-brain-atlas"
) -> Union[dict, List[dict]]:
    """Download and parse a metadata *.json file for the Allen Mouse Brain
    Atlas dataset.

    Parameters
    ----------
    download_directory : Union[str, Path]
        The desired local directory path to save downloaded metadata. If the
        provided directory does not exist, it and any necessary parent
        directories will be created automatically.
    downloaded_local_fname : str
        The file name that the downloaded metadata should have.
    metadata_s3_key : str
        The S3 key of the file to download.
    s3_client :
        A boto3.Client of the S3 variety. If None, an s3 client with
        anonymous credentials will be automatically created.
    bucket_name : str, optional
        The name of the bucket to download the metadata file from,
        by default "allen-mouse-brain-atlas"

    Returns
    -------
    Union[dict, List[dict]]
        The parsed contents of a *.json file. Can be a list of dicts or
        just a dict.
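
    Example
    -------
    An illustrative sketch (the key below is the real atlas-level metadata
    file; the download directory is an arbitrary choice):

        client = get_public_boto3_client()
        metadata = download_s3_metadata_file(
            download_directory="/tmp/aba_metadata",
            downloaded_local_fname="section_data_sets.json",
            metadata_s3_key="section_data_sets.json",
            s3_client=client
        )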
    """
    if type(download_directory) is str:
        download_directory = Path(download_directory).resolve()

    if not download_directory.exists():
        download_directory.mkdir(parents=True, exist_ok=True)

    if s3_client is None:
        s3_client = get_public_boto3_client()

    local_metadata_path = download_directory / downloaded_local_fname

    _get_aws_file(
        aws_key=metadata_s3_key,
        local_filename=local_metadata_path,
        s3_client=s3_client,
        bucket_name=bucket_name
    )

    with local_metadata_path.open('rb') as f:
        metadata = json.load(f)

    return metadata


def get_atlas_metadata(
    download_directory: Union[str, Path],
    s3_client=None,
    bucket_name: str = 'allen-mouse-brain-atlas'
) -> List[dict]:
    """
    Load the metadata for the entire atlas into memory.
    If you have not already downloaded this file, it will
    be downloaded to the specified `download_directory`.

    Parameters
    ----------
    s3_client
        A boto3.Client of the S3 variety. If None, this function will
        try to create an s3 client with anonymous credentials which is
        sufficient to access public AWS services
    download_directory : Union[str, Path]
        The desired local directory path to save downloaded metadata. If the
        provided directory does not exist, it and any necessary parent
        directories will be created automatically.
    bucket_name : str
        The name of the S3 bucket to download metadata from.

    Returns
    -------
    A list of dicts containing the metadata for the atlas.
    This is the result of running json.load on section_data_sets.json
    """

    metadata = download_s3_metadata_file(
        download_directory=download_directory,
        metadata_s3_key="section_data_sets.json",
        downloaded_local_fname="section_data_sets.json",
        s3_client=s3_client,
        bucket_name=bucket_name
    )

    return metadata


def get_section_metadata(
    section_id: int,
    download_directory: Union[str, Path],
    s3_client=None,
    bucket_name: str = 'allen-mouse-brain-atlas'
) -> dict:
    """
    Get the dict representing the metadata for a specific image series.

    Parameters
    ----------
    section_id : int
        An integer representing the section whose metadata should be loaded
    s3_client
        A boto3.Client of the S3 variety. If None, this function will
        try to create an s3 client with anonymous credentials which is
        sufficient to access public AWS services
    download_directory : Union[str, Path]
        The desired local directory path to save downloaded metadata. If the
        provided directory does not exist, it and any necessary parent
        directories will be created automatically.
    bucket_name : str
        The name of the S3 bucket to download metadata from.

    Returns
    -------
    A dict containing the metadata for the specified section_id.
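
    Example
    -------
    An illustrative sketch (100055044 is the section_data_set id used by the
    sample metadata in tests/test_data; the download directory is an arbitrary
    choice):

        section_meta = get_section_metadata(
            section_id=100055044,
            download_directory="/tmp/aba_metadata"
        )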
    """

    metadata_s3_key = f"section_data_set_{section_id}/section_data_set.json"
    local_fname = f"section_data_set_{section_id}_metadata.json"

    metadata = download_s3_metadata_file(
        download_directory=download_directory,
        metadata_s3_key=metadata_s3_key,
        downloaded_local_fname=local_fname,
        s3_client=s3_client,
        bucket_name=bucket_name
    )

    return metadata


class SectionDataSet(object):

    def __init__(
        self,
        section_id: int,
        download_directory: Union[str, Path],
        s3_client=None
    ):
        """
        Load and store the metadata for the section_data_set specified
        by section_id. Use the boto3 s3_client provided as a kwarg.

        Parameters
        ----------
        section_id :
            An int indicating which section_data_set to load
        download_directory : Union[str, Path]
            The desired local directory path to save downloaded metadata. If
            the provided directory does not exist, it and any necessary parent
            directories will be created automatically.
        s3_client :
            A boto3.Client of the S3 variety. If None, an s3 client with
            anonymous credentials will be automatically created.
        """

        if type(download_directory) is str:
            download_directory = Path(download_directory).resolve()

        if not download_directory.exists():
            download_directory.mkdir(parents=True, exist_ok=True)

        if s3_client is None:
            s3_client = get_public_boto3_client()

        self.download_dir = download_directory
        self.section_id = section_id
        self.s3_client = s3_client
        self.metadata = get_section_metadata(
            section_id=section_id,
            download_directory=download_directory,
            s3_client=s3_client
        )

        # remove section images and construct dicts keyed on
        # tissue_index and sub_image_id
        tmp_section_images = self.metadata.pop('section_images')

        self.tissue_index_to_section_img = {}
        self.subimg_to_tissue_index = {}
        self.tissue_index_to_subimg = {}
        for img in tmp_section_images:
            tissue_index = img['section_number']
            assert tissue_index not in self.tissue_index_to_section_img
            self.tissue_index_to_section_img[tissue_index] = img
            subimg_id = img['id']
            assert subimg_id not in self.subimg_to_tissue_index
            self.subimg_to_tissue_index[subimg_id] = tissue_index
            assert tissue_index not in self.tissue_index_to_subimg
            self.tissue_index_to_subimg[tissue_index] = subimg_id

        self._tissue_indices = list(self.tissue_index_to_section_img.keys())
        self._tissue_indices.sort()
        self._subimg_ids = list(self.subimg_to_tissue_index.keys())
        self._subimg_ids.sort()

    @property
    def tissue_indices(self):
        """
        Return a sorted list of all of the tissue index values
        available for the section_data_set
        """
        return self._tissue_indices

    @property
    def sub_image_ids(self):
        """
        Return a sorted list of all the sub-image ID values
        for the section_data_set
        """
        return self._subimg_ids

    def image_metadata_from_tissue_index(self, tissue_index):
        """
        Return the metadata of the section_image associated with the
        specified tissue_index.
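
        For example, the sample metadata checked into tests/test_data
        (example_metadata_tissue_154.json) shows the shape of the returned
        dict: keys such as 'id', 'section_number', 'resolution',
        'downsampling', and 'image_file_name'.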
352 | 353 | Returns None if an invalid tissue_index is specified 354 | """ 355 | if tissue_index not in self.tissue_index_to_section_img: 356 | warnings.warn("tissue_index %d does not " 357 | "exist in section_data_set_%d" % 358 | (tissue_index, self.section_id)) 359 | return None 360 | 361 | return copy.deepcopy(self.tissue_index_to_section_img[tissue_index]) 362 | 363 | def image_metadata_from_sub_image(self, sub_image): 364 | """ 365 | Return the metadata of the section_image associated with the 366 | specified subimage ID 367 | 368 | Returns None if an invalid subimage ID is specified 369 | """ 370 | if sub_image not in self.subimg_to_tissue_index: 371 | warnings.warn("sub_image %d does not exist " 372 | "in section_data_set_%d" % 373 | (sub_image, self.section_id)) 374 | 375 | return None 376 | 377 | tissue_index = self.subimg_to_tissue_index[sub_image] 378 | return self.image_metadata_from_tissue_index(tissue_index) 379 | 380 | def _download_img( 381 | self, tissue_index: int, downsample: int, 382 | local_savepath: Path, clobber: bool = False 383 | ): 384 | """ 385 | Download the TIFF file specified by fname and downsample 386 | 387 | Parameters 388 | ---------- 389 | tissue_index is the tissue index of the sub-image whose TIFF file 390 | we are to download 391 | 392 | downsample is an integer denoting the downsampling tier to download 393 | 394 | local_savepath is the file path where the TIFF file will be saved 395 | 396 | clobber is a boolean. If True, overwrite pre-existing local_savepath. 397 | Otherwise, throw a warning and exit if local_savepath already exists 398 | 399 | Returns 400 | ------- 401 | True if the TIFF file was successfully downloaded to local_savepath; 402 | False if not. 403 | """ 404 | 405 | if local_savepath.exists(): 406 | if not local_savepath.is_file(): 407 | warnings.warn( 408 | '%s already exists but is not a file' % local_savepath 409 | ) 410 | return False 411 | if not clobber: 412 | warnings.warn("%s already exists; re-run with " 413 | "clobber=True to overwrite" % local_savepath) 414 | return False 415 | 416 | img_metadata = self.image_metadata_from_tissue_index(tissue_index) 417 | fname = img_metadata['image_file_name'] 418 | 419 | downsample_key = 'downsample_%d' % downsample 420 | if downsample_key not in img_metadata['downsampling'].keys(): 421 | warnings.warn("%d is not a valid downsampling tier for %s" 422 | % (downsample, fname)) 423 | return False 424 | aws_key = 'section_data_set_%d/%s/%s' % ( 425 | self.section_id, downsample_key, fname 426 | ) 427 | 428 | # Download the TIFF into a temporary location 429 | # then use PIL to crop the image to only include 430 | # the specified section of brain. 
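        # The (x, y, width, height) entries of the chosen downsampling tier
        # give the pixel bounding box of the tissue section within the stored
        # TIFF (whose full extent is image_file_width x image_file_height), so
        # cropping to that box keeps only the requested section of brain.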
431 | 432 | with tempfile.NamedTemporaryFile( 433 | mode="wb", 434 | dir=self.download_dir, 435 | prefix="tmp_before_crop_", 436 | suffix=".tiff" 437 | ) as f: 438 | 439 | self.s3_client.download_fileobj( 440 | Bucket='allen-mouse-brain-atlas', 441 | Key=aws_key, 442 | Fileobj=f 443 | ) 444 | 445 | tier_metadata = img_metadata['downsampling'][downsample_key] 446 | x0 = tier_metadata['x'] 447 | y0 = tier_metadata['y'] 448 | x1 = x0 + tier_metadata['width'] 449 | y1 = y0 + tier_metadata['height'] 450 | 451 | with Image.open(Path(f.name).resolve(), "r") as img: 452 | cropped_img = img.crop((x0, y0, x1, y1)) 453 | cropped_img.save(str(local_savepath)) 454 | cropped_img.close() 455 | 456 | return True 457 | 458 | def download_image_from_tissue_index( 459 | self, tissue_index: int, downsample: int, 460 | local_savepath: Path, clobber: bool = False 461 | ): 462 | """ 463 | Download a TIFF file specified by its tissue_index and downsampling 464 | tier. 465 | 466 | Parameters 467 | ---------- 468 | tissue_index is an integer corressponding to the 469 | tissue_index/section_number of the TIFF to be downloaded 470 | 471 | downsample is an integer denoting the downsampling 472 | tier of the TIFF to be downloaded 473 | 474 | local_savepath is the file path where the downloaded 475 | TIFF file should be saved 476 | 477 | clobber is a boolean. If True, overwrite pre-existing 478 | local_savepath. If False, raise a warning and exit in 479 | the case where local_savepath already exists 480 | 481 | Returns 482 | ------- 483 | True if the TIFF was successfully downloaded to local_savepath; 484 | False if not 485 | """ 486 | if tissue_index not in self.tissue_index_to_section_img: 487 | warnings.warn("tissue_index %d does not exist in " 488 | "section_data_set_%d" % 489 | (tissue_index, self.section_id)) 490 | return False 491 | return self._download_img( 492 | tissue_index, downsample, local_savepath, clobber=clobber 493 | ) 494 | 495 | def download_image_from_sub_image( 496 | self, sub_image: int, downsample: int, 497 | local_savepath: str, clobber: bool = False 498 | ): 499 | """ 500 | Download a TIFF file specified by its sub-image ID and downsampling 501 | tier. 502 | 503 | Parameters 504 | ---------- 505 | sub_image is an integer corressponding to the sub-image ID 506 | of the TIFF to be downloaded 507 | 508 | downsample is an integer denoting the downsampling tier of 509 | the TIFF to be downloaded 510 | 511 | local_savepath is the file name where the downloaded TIFF 512 | file should be saved 513 | 514 | clobber is a boolean. If True, overwrite pre-existing 515 | local_savepath. 
If False, raise a warning and exit in the 516 | case where local_savepath already exists 517 | 518 | Returns 519 | ------- 520 | True if the TIFF was successfully downloaded to local_savepath; 521 | False if not 522 | """ 523 | if sub_image not in self.subimg_to_tissue_index: 524 | warnings.warn("sub_image %d does not exist " 525 | "in section_data_set_%d" % 526 | (sub_image, self.section_id)) 527 | return False 528 | tissue_index = self.subimg_to_tissue_index[sub_image] 529 | return self.download_image_from_tissue_index( 530 | tissue_index, downsample, local_savepath, clobber=clobber 531 | ) 532 | 533 | def section_url(self): 534 | """ 535 | Return the URL for the brain-map.org viewer for this SectionDataSet 536 | """ 537 | return "http://mouse.brain-map.org/experiment/show/{id}".format(id=self.section_id) 538 | 539 | def sub_image_url(self, sub_image_id: int) -> str: 540 | """ 541 | Return URL for a high quality image of a specific sub-image, 542 | specified by sub_image_id 543 | """ 544 | base = "http://mouse.brain-map.org/experiment/siv?id={sect}&imageId={img}&initImage=ish" 545 | return base.format(sect=self.section_id, img=sub_image_id) 546 | 547 | def tissue_index_url(self, tissue_index: int) -> str: 548 | """ 549 | Return URL for a high quality image of a specific sub-image, 550 | specified by tissued_index 551 | """ 552 | sub_img = self.tissue_index_to_subimg[tissue_index] 553 | return self.sub_image_url(sub_img) 554 | -------------------------------------------------------------------------------- /tests/test_aba_mouse_utils.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import hashlib 3 | import json 4 | import os 5 | import shutil 6 | import sys 7 | import tempfile 8 | import time 9 | import unittest 10 | import warnings 11 | from pathlib import Path 12 | 13 | from PIL import Image 14 | 15 | import open_dataset_tools.aba_mouse_utils as mouse_utils 16 | from open_dataset_tools.aws_utils import get_public_boto3_client 17 | 18 | 19 | @contextlib.contextmanager 20 | def make_tmp_dir(auto_delete: bool = True): 21 | tmp_dir_base = Path(__file__).resolve().parent / 'test_tmp' 22 | tmp_dir_base.mkdir(parents=True, exist_ok=True) 23 | 24 | tmp_dir = Path(tempfile.mkdtemp(dir=tmp_dir_base)) 25 | 26 | try: 27 | yield tmp_dir 28 | finally: 29 | if auto_delete: 30 | shutil.rmtree(tmp_dir) 31 | 32 | class MetadataTestCase(unittest.TestCase): 33 | 34 | @classmethod 35 | def setUpClass(cls): 36 | # regarding warnings filter, see 37 | # https://github.com/boto/boto3/issues/454#issuecomment-380900404 38 | warnings.filterwarnings("ignore", category=ResourceWarning, 39 | message='unclosed