├── example
│   ├── TA232-mask.png
│   ├── TA232-source.md
│   ├── performance-EfficientNetV2S.py
│   ├── performance-detection.py
│   ├── performance-mnist.py
│   ├── performance-EfficientNet_V2_S_Weights.IMAGENET1K_V1.py
│   ├── tensorflow_stream.ipynb
│   └── pytorch_stream.ipynb
├── documentation
│   ├── runtime.png
│   ├── H&E_chunk.png
│   └── slide_chunk_tile.png
├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── build-test-package.yml
├── .git-blame-ignore-revs
├── .gitignore
├── pyproject.toml
├── histomics_stream
│   ├── __init__.py
│   ├── codecs.py
│   ├── pytorch.py
│   ├── tensorflow.py
│   └── configure.py
├── test
│   ├── test_find_imports.py
│   ├── test_mask.py
│   └── test_create_study.py
├── StudyObject.md
├── LICENSE.txt
└── README.md

/example/TA232-mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DigitalSlideArchive/HistomicsStream/HEAD/example/TA232-mask.png
--------------------------------------------------------------------------------

/documentation/runtime.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DigitalSlideArchive/HistomicsStream/HEAD/documentation/runtime.png
--------------------------------------------------------------------------------

/documentation/H&E_chunk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DigitalSlideArchive/HistomicsStream/HEAD/documentation/H&E_chunk.png
--------------------------------------------------------------------------------

/documentation/slide_chunk_tile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DigitalSlideArchive/HistomicsStream/HEAD/documentation/slide_chunk_tile.png
--------------------------------------------------------------------------------

/.github/dependabot.yml:
--------------------------------------------------------------------------------
---
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"
--------------------------------------------------------------------------------

/example/TA232-source.md:
--------------------------------------------------------------------------------
# Data source
The file `example/TA232.svs` comes from the zip file available as
```
https://stanfordmedicine.box.com/s/ub8e0wlhsdenyhdsuuzp6zhj0i82xrb1
```
from the web page
```
https://github.com/stanfordmlgroup/DLBCL-Morph
```
It is in that zip file as
```
DLBCL-Morph/TMA/MYC/TA232.svs
```

The corresponding mask `example/TA232-mask.png` is randomly generated in Python with
```python
import numpy as np
from PIL import Image

# Placeholder dimensions; substitute the mask size appropriate for your slide.
mask_height, mask_width = 512, 512
# Use uint8 rather than int8; PIL cannot serialize signed 8-bit arrays.
arr = np.random.randint(0, 2, (mask_height, mask_width), dtype=np.uint8)
im = Image.fromarray(arr)
im.save("TA232-mask.png")
```
--------------------------------------------------------------------------------
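For context, this sketch (not itself a repository file) shows how a mask like `TA232-mask.png` is consumed; it mirrors the configure calls in `test/test_mask.py`, and the tile, overlap, chunk, and magnification values here are illustrative assumptions.

```python
import histomics_stream as hs

study = dict(
    version="version-1",
    tile_height=256,
    tile_width=256,
    overlap_height=0,
    overlap_width=0,
    slides=dict(
        Slide_0=dict(
            filename="example/TA232.svs",
            slide_name="TA232",
            slide_group="example",
            chunk_height=2048,
            chunk_width=2048,
        )
    ),
)
find_slide_resolution = hs.configure.FindResolutionForSlide(
    study, target_magnification=20, magnification_source="native"
)
tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
    study, mask_filename="example/TA232-mask.png"
)
for slide in study["slides"].values():
    find_slide_resolution(slide)
    tiles_by_grid_and_mask(slide)  # restricts the tile grid using the mask
```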
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
#
# This file lists revisions that should be ignored when considering
# attribution for the actual code written. Code style changes should
# not be considered as modifications with regards to attribution.
#
# To see clean and meaningful blame information:
#   $ git blame important.py --ignore-revs-file .git-blame-ignore-revs
#
# To configure git to automatically ignore revisions listed in a file on
# every call to git blame:
#   $ git config blame.ignoreRevsFile .git-blame-ignore-revs
#
# Ignore changes introduced when doing global file format changes
# STYLE: change camelCase to snake_case
6acf4583eb1362af40cb03db068e139bb29d6b96
# STYLE: Apply `black` formatting.
649e8d0577431734481590c83651adefbce31777
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
# Do not add ExternalData module staging files
.ExternalData*

# back-up files
*~
*.bak
# vim swp files
*.swp
## Ignore files that are used for auto-completion with clang
*.clang_complete
## YouCompleteMe vim plugin configuration file
.ycm_extra_conf.py


# KWStyle hook output
*.kws

# compiled python files
*.pyc

# Binary directory
BUILD*
build*

# qtcreator
CMakeLists.txt.user*

# kdevelop
*.kdev*
.kdev*

# back-up files when conflicts occur
*.orig

# Clion editor internal project information
.idea

# Visual Studio
.vs

# Mac System File
.DS_Store

# Ignore testing temporary files
Testing/Temporary/

# Checkpoint files for Jupyter
.ipynb_checkpoints/

# Compiled Python files
__pycache__/
--------------------------------------------------------------------------------

/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["flit_core >=3.4,<4"]
build-backend = "flit_core.buildapi"

[project]
name = "histomics_stream"
readme = "README.md"
requires-python = ">=3.6"
authors = [{name = "Lee A. Newberg", email = "lee.newberg@kitware.com"}]
maintainers = [{name = "Lee A. Newberg", email = "lee.newberg@kitware.com"}]
keywords = ["tensorflow", "torch", "whole slide image", "stream", "machine learning"]
classifiers = ["License :: OSI Approved :: Apache Software License"]
dependencies = [
    "imagecodecs",
    "itk",
    "numcodecs",
    "numpy",
    "scipy",
]
dynamic = ["version", "description"]

[project.optional-dependencies]
tensorflow = [
    "tensorflow<3.0.0",
    "keras",
]
torch = [
    "torch<2.0.0",
]
zarr = [
    "zarr",
]

[project.urls]
Source = "https://github.com/DigitalSlideArchive/HistomicsStream"

[project.scripts]
flit = "flit:main"
--------------------------------------------------------------------------------

/histomics_stream/__init__.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

"""Whole-slide image streamer for machine learning frameworks.

This module supports efficient whole-slide reading and processing for a machine
learning execution graph.
"""

__version__ = "2.5.3"

from . import configure, codecs  # noqa: F401,E402
--------------------------------------------------------------------------------

/test/test_find_imports.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================


def test_imports_can_be_found():
    """Purpose: Check that each required import can be found"""

    import imagecodecs  # noqa: F401
    import itk  # noqa: F401
    import numcodecs  # noqa: F401
    import numpy  # noqa: F401
    import scipy.interpolate  # noqa: F401
    import tensorflow  # noqa: F401
    import torch  # noqa: F401
    import zarr  # noqa: F401


if __name__ == "__main__":
    test_imports_can_be_found()
--------------------------------------------------------------------------------

/.github/workflows/build-test-package.yml:
--------------------------------------------------------------------------------
name: Build, test, package

on: [push, pull_request]

jobs:
  test-python:
    runs-on: ubuntu-20.04
    strategy:
      max-parallel: 2
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]

        include:
          - flake8-python-git-tag: ""
          - pooch-python-git-tag: ""
          - pytest-python-git-tag: ""

    steps:
      - uses: actions/checkout@v6
      - name: 'Free up disk space'
        run: |
          # Workaround for https://github.com/actions/virtual-environments/issues/709
          df -h
          sudo apt-get clean
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          df -h

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies
        run: |
          sudo apt update
          sudo apt install openslide-tools python3-openslide
          python -m pip install --upgrade pip setuptools wheel
          pip install 'flake8${{ matrix.flake8-python-git-tag }}' 'pooch${{ matrix.pooch-python-git-tag }}' 'pytest${{ matrix.pytest-python-git-tag }}'
          pip install 'large-image[bioformats,ometiff,openjpeg,openslide,tiff]' 'scikit_image' --find-links https://girder.github.io/large_image_wheels

      - name: Install histomics_stream
        run: |
          pip install .[tensorflow,torch,zarr]
          # With Python 3.8, tensorflow downgrades typing-extensions, which appears to
          # be unnecessary and breaks a dependency of large_image, so we overrule that
          # next.
          pip install --upgrade typing-extensions

      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics

      - name: Test with pytest
        run: |
          cd test
          pytest
        shell: bash

  build-n-publish:
    name: Build and publish Python 🐍 distributions 📦 to PyPI
    runs-on: ubuntu-20.04
    permissions:
      id-token: write
    steps:
      - uses: actions/checkout@master
      - name: Set up Python "3.9"
        uses: actions/setup-python@v6
        with:
          python-version: "3.9"
      - name: Install pypa/build
        run: >-
          python -m pip install build --user
      - name: Build a binary wheel and a source tarball
        run: >-
          python -m build --sdist --wheel --outdir dist/ .
      - name: Publish to Test PyPI
        if: github.event.repository.fork == false
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
          repository-url: https://test.pypi.org/legacy/
          skip-existing: true
      - name: Publish distribution 📦 to PyPI
        if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
--------------------------------------------------------------------------------

/histomics_stream/codecs.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

"""Whole-slide image streamer for machine learning frameworks.

The histomics_stream.codecs module supplies codecs that are useful for Zarr file storage
with jpeg or jpeg2k compression.

"""

from imagecodecs import jpeg2k_decode, jpeg2k_encode, jpeg_decode, jpeg_encode
from numcodecs.abc import Codec
from numcodecs.compat import ensure_contiguous_ndarray, ensure_ndarray, ndarray_copy
from numcodecs.registry import register_codec


class jpeg(Codec):
    """Codec providing jpeg compression via imagecodecs.

    Parameters
    ----------
    quality : int
        Compression level.

    Notes
    -----
    For the code that uses Zarr data storage for jpeg images, we need to supply codecs.
    Note that we use this codec instead of that available from the zarr_jpeg package.
    The latter collapses dimensions by default, can require us to transpose dimensions,
    and can miss optimizations based upon RGB data.

    """

    codec_id = "jpeg"

    def __init__(self, quality=100):
        self.quality = quality
        assert 0 < self.quality <= 100 and isinstance(self.quality, int)
        super().__init__()

    def encode(self, buf):
        """The method to encode a raw image into jpeg format.

        Parameters
        ----------
        buf : ndarray
            The raw image to be encoded into jpeg format

        Returns
        -------
        ndarray
            The image in jpeg format

        """

        bufa = ensure_ndarray(buf)
        assert 2 <= bufa.ndim <= 3
        return jpeg_encode(bufa, level=self.quality)

    def decode(self, buf, out=None):
        """The method to decode a jpeg image into a raw format.

        Parameters
        ----------
        buf : contiguous_ndarray
            The jpeg image to be decoded into raw format.
        out : contiguous_ndarray, optional
            Another location to write the raw image to.

        Returns
        -------
        ndarray
            The image in raw format

        """

        buf = ensure_contiguous_ndarray(buf)
        if out is not None:
            out = ensure_contiguous_ndarray(out)
        tiled = jpeg_decode(buf)
        return ndarray_copy(tiled, out)


register_codec(jpeg)


class jpeg2k(Codec):
    """Codec providing jpeg2k compression via imagecodecs.

    Parameters
    ----------
    quality : int
        Compression level.

    """

    codec_id = "jpeg2k"

    def __init__(self, quality=100):
        self.quality = quality
        assert 0 < self.quality <= 100 and isinstance(self.quality, int)
        super().__init__()

    def encode(self, buf):
        """The method to encode a raw image into jpeg2k format.

        Parameters
        ----------
        buf : ndarray
            The raw image to be encoded into jpeg2k format

        Returns
        -------
        ndarray
            The image in jpeg2k format

        """

        bufa = ensure_ndarray(buf)
        assert 2 <= bufa.ndim <= 3
        return jpeg2k_encode(bufa, level=self.quality)

    def decode(self, buf, out=None):
        """The method to decode a jpeg2k image into a raw format.

        Parameters
        ----------
        buf : contiguous_ndarray
            The jpeg2k image to be decoded into raw format.
        out : contiguous_ndarray, optional
            Another location to write the raw image to.

        Returns
        -------
        ndarray
            The image in raw format

        """

        buf = ensure_contiguous_ndarray(buf)
        if out is not None:
            out = ensure_contiguous_ndarray(out)
        tiled = jpeg2k_decode(buf)
        return ndarray_copy(tiled, out)


register_codec(jpeg2k)
--------------------------------------------------------------------------------
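One way the codecs above can be used: a sketch assuming the zarr 2.x API, where a numcodecs `Codec` instance is passed as `compressor`; the shape, chunking, and quality below are illustrative.

```python
import numpy as np
import zarr

from histomics_stream.codecs import jpeg  # importing the module registers the codec

# Store an RGB image chunk-by-chunk with (lossy) jpeg compression.
data = np.random.randint(0, 256, (1024, 1024, 3), dtype=np.uint8)
z = zarr.array(data, chunks=(256, 256, 3), compressor=jpeg(quality=90))

# Reading decodes through jpeg.decode; values are close to, but not exactly,
# the originals because jpeg at quality 90 is lossy.
print(z[:].shape)  # (1024, 1024, 3)
```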
/test/test_mask.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================


def test_mask_threshold():
    import histomics_stream as hs
    import os
    import pooch

    wsi_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.svs",
        url=(
            "https://drive.usercontent.google.com/download"
            "?export=download"
            "&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV"
            "&confirm=t"
        ),
        known_hash="d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Have {wsi_path}")

    # download binary mask image
    mask_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.mask.png",
        url=(
            "https://drive.usercontent.google.com/download"
            "?export=download"
            "&id=17GOOHbL8Bo3933rdIui82akr7stbRfta"
            "&confirm=t"
        ),
        known_hash="bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Have {mask_path}")

    my_study = dict(
        version="version-1",
        number_pixel_columns_for_tile=5471,
        number_pixel_rows_for_tile=5743,
        overlap_width=127,
        overlap_height=101,
        slides=dict(
            Slide_0=dict(
                filename=wsi_path,
                slide_name=os.path.splitext(os.path.split(wsi_path)[1])[0],
                slide_group="test_mask_threshold",
                chunk_width=31,
                chunk_height=37,
            )
        ),
    )
    find_slide_resolution = hs.configure.FindResolutionForSlide(
        my_study, target_magnification=20, magnification_source="native"
    )
    for slide in my_study["slides"].values():
        find_slide_resolution(slide)

    tiler_thresholds = (0.00, 0.20, 0.50, 0.80, 1.00)
    tilers = [
        hs.configure.TilesByGridAndMask(
            my_study,
            mask_filename=mask_path,
            mask_threshold=threshold,
            number_pixel_overlap_rows_for_tile=101,
            number_pixel_overlap_columns_for_tile=127,
        )
        for threshold in tiler_thresholds
    ]

    def run_tiler(study, tiler):
        for slide in study["slides"].values():
            tiler(slide)
        return [
            (
                value["filename"],
                [
                    (tile["tile_top"], tile["tile_left"])
                    for tile in value["tiles"].values()
                ],
            )
            for value in study["slides"].values()
        ]

    found_tiles = [run_tiler(my_study, tiler) for tiler in tilers]

    # print(f"    expected_tiles = {repr(found_tiles)}")
    expected_tiles = [
        [
            (
                wsi_path,
                [(0, 10688), (0, 16032), (0, 21376)]
                + [(5642, 5344), (5642, 10688), (5642, 16032), (5642, 21376)]
                + [(11284, 5344), (11284, 10688), (11284, 16032), (11284, 21376)],
            )
        ],
        [
            (
                wsi_path,
                [(0, 16032), (0, 21376)]
                + [(5642, 5344), (5642, 10688), (5642, 16032), (5642, 21376)]
                + [(11284, 5344), (11284, 10688), (11284, 16032), (11284, 21376)],
            )
        ],
        [
            (
                wsi_path,
                [(0, 16032), (0, 21376)]
                + [(5642, 10688), (5642, 16032), (5642, 21376)]
                + [(11284, 10688), (11284, 16032)],
            )
        ],
        [(wsi_path, [(5642, 10688), (5642, 16032), (11284, 10688), (11284, 16032)])],
        [(wsi_path, [(5642, 16032), (11284, 16032)])],
    ]

    # Raising the mask threshold can only remove tiles, so each threshold's
    # selection must be a subset of the previous (lower) threshold's selection.
    for i in range(len(found_tiles) - 1):
        assert set(found_tiles[i + 1][0][1]).issubset(set(found_tiles[i][0][1]))
    for i in range(len(found_tiles)):
        assert found_tiles[i] == expected_tiles[i]
    print("Test succeeded")


if __name__ == "__main__":
    test_mask_threshold()
--------------------------------------------------------------------------------

/test/test_create_study.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================


def test_create_study():
    """
    Purpose: Exercise the basic steps for creating a study dict, which is the precursor
    step to creating a dataset/dataloader for a machine learning framework such as
    TensorFlow or Torch.
    """
    import copy
    import histomics_stream as hs

    # Create a study and insert study-wide information
    my_study0 = {"version": "version-1"}
    my_study0["tile_height"] = 256
    my_study0["tile_width"] = 256
    my_slides = my_study0["slides"] = {}

    # Add a slide to the study, including slide-wide information with it.
    my_slide0 = my_slides["Slide_0"] = {}
    my_slide0["filename"] = (
        "/tf/notebooks/histomics_stream/example/"
        "TCGA-BH-A0BZ-01Z-00-DX1.45EB3E93-A871-49C6-9EAE-90D98AE01913.svs"
    )
    my_slide0["slide_name"] = "TCGA-BH-A0BZ-01Z-00-DX1"
    my_slide0["slide_group"] = "TCGA-BH-A0BZ"
    my_slide0["chunk_height"] = 2048
    my_slide0["chunk_width"] = 2048

    if False:
        # For each slide, find the appropriate resolution given the
        # target_magnification and magnification_source. In this example, we use
        # the same parameters for each slide, but this is not required generally.
        find_slide_resolution = hs.configure.FindResolutionForSlide(
            my_study0, target_magnification=20, magnification_source="native"
        )
        for slide in my_study0["slides"].values():
            find_slide_resolution(slide)
    else:
        # Because we don't actually have the image available, make up some numbers.
        my_slide0["level"] = 0
        my_slide0["factor"] = 0.5
        my_slide0["slide_width"] = 85047
        my_slide0["slide_height"] = 112334

    # We are going to demonstrate several approaches to choosing tiles. Each approach
    # will start with its own copy of the my_study0 that we have built so far.

    # Demonstrate TilesByGridAndMask without a mask
    my_study_by_grid = copy.deepcopy(my_study0)
    tiles_by_grid = hs.configure.TilesByGridAndMask(
        my_study_by_grid, overlap_height=32, overlap_width=32, randomly_select=100
    )
    # We could apply this to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in my_study_by_grid["slides"].values():
        tiles_by_grid(slide)

    if False:
        # Skip this test for now because we don't have the mask file available.
        # Demonstrate TilesByGridAndMask with a mask
        my_study_by_grid_and_mask = copy.deepcopy(my_study0)
        tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
            my_study_by_grid_and_mask,
            overlap_height=0,
            overlap_width=0,
            mask_filename="/tf/notebooks/histomics_stream/example/"
            "TCGA-BH-A0BZ-01Z-00-DX1.45EB3E93-A871-49C6-9EAE-90D98AE01913-mask.png",
            randomly_select=100,
        )
        # We could apply this to a subset of the slides, but we will apply it to all
        # slides in this example.
        for slide in my_study_by_grid_and_mask["slides"].values():
            tiles_by_grid_and_mask(slide)

    # Demonstrate TilesByList
    my_study_by_list = copy.deepcopy(my_study0)
    tiles_by_list = hs.configure.TilesByList(
        my_study_by_list,
        randomly_select=5,
        tiles_dictionary=my_study_by_grid["slides"]["Slide_0"]["tiles"],
    )
    # We could apply this to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in my_study_by_list["slides"].values():
        tiles_by_list(slide)

    # Demonstrate TilesRandomly
    my_study_randomly = copy.deepcopy(my_study0)
    tiles_randomly = hs.configure.TilesRandomly(my_study_randomly, randomly_select=3)
    # We could apply this to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in my_study_randomly["slides"].values():
        tiles_randomly(slide)

    # The next step would be creating a dataset/dataloader for a machine learning
    # framework such as TensorFlow or Torch. However, we will not do that in this test.


if __name__ == "__main__":
    test_create_study()
--------------------------------------------------------------------------------

/example/performance-EfficientNetV2S.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

import os
import time

import pooch
import tensorflow as tf

import histomics_stream as hs
import histomics_stream.tensorflow


"""
This is a script that is used to make timings of histomics_stream. To some extent, it
may be specific to the computer / docker image it is used with and may need minor
tweaks to run on another computer.
"""

"""
# If you've just started a fresh docker container you may need some of this:
apt update ; apt install -y git emacs ; \
rm -rf /.local ; \
pip install -U pip setuptools wheel ; \
pip install \
    'batchbald_redux' \
    'black[jupyter]' \
    'large_image[openslide,tiff]' \
    'nbformat>=5.2.0' \
    'pooch' \
    'protobuf<3.20' \
    'tensorflow_datasets' \
    'torch==1.12.1+cu113' \
    '/tf/notebooks/histomics_stream' \
    --extra-index-url https://download.pytorch.org/whl/cu113 \
    --find-links https://girder.github.io/large_image_wheels
"""


def get_data():
    start_time = time.time()
    wsi_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.svs",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV"
        "&confirm=t"
        "&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c"
        "&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632",
        known_hash="d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {wsi_path} in {time.time() - start_time}s", flush=True)

    # download binary mask image
    start_time = time.time()
    mask_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.mask.png",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=17GOOHbL8Bo3933rdIui82akr7stbRfta",
        known_hash="bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {mask_path} in {time.time() - start_time}s", flush=True)
    return wsi_path, mask_path


class WrappedModel(tf.keras.Model):
    def __init__(self, unwrapped_model, *args, **kwargs):
        super(WrappedModel, self).__init__(*args, **kwargs)
        self.unwrapped_model = unwrapped_model

    def call(self, element):
        return self.unwrapped_model(element[0]), element[1]


def normalize_img(image, label):
    """Normalizes images: `uint8` -> `float32`."""
    return tf.cast(image, tf.float32) / 255.0, label


def build_model(training_batch, epochs):
    start_time = time.time()
    unwrapped_model = tf.keras.applications.efficientnet_v2.EfficientNetV2S(
        include_top=False, weights="imagenet", input_shape=(224, 224, 3), pooling="avg"
    )
    unwrapped_model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )
    # unwrapped_model.fit(ds_train, epochs=epochs, validation_data=ds_test)

    wrapped_model = WrappedModel(unwrapped_model)

    print(f"Finished model in {time.time() - start_time}s", flush=True)
    return unwrapped_model, wrapped_model


def create_study(wsi_path, mask_path, chunk_size):
    start_time = time.time()
    slide_name = os.path.splitext(os.path.split(wsi_path)[1])[0]
    slide_group = "Group 3"

    study = dict(
        version="version-1",
        tile_height=224,
        tile_width=224,
        overlap_height=0,
        overlap_width=0,
        slides=dict(
            Slide_0=dict(
                filename=wsi_path,
                slide_name=slide_name,
                slide_group=slide_group,
                chunk_height=chunk_size,
                chunk_width=chunk_size,
            )
        ),
    )

    find_slide_resolution = hs.configure.FindResolutionForSlide(
        study, target_magnification=20, magnification_source="exact"
    )
    tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
        study, mask_filename=mask_path
    )
    # We could apply these to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in study["slides"].values():
        find_slide_resolution(slide)
        tiles_by_grid_and_mask(slide)
    print(f"Masked study in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    create_tensorflow_dataset = hs.tensorflow.CreateTensorFlowDataset()
    tiles = create_tensorflow_dataset(study, num_workers=1, worker_index=0)
    print(f"#tiles = {len(create_tensorflow_dataset.get_tiles(study)[0][1])}")
    print(f"Chunked study in {time.time() - start_time}s", flush=True)

    return study, tiles


def predict(take_predictions, prediction_batch, model, tiles):
    start_time = time.time()
    tiles = tiles.batch(prediction_batch)
    if take_predictions > 0:
        predictions = model.predict(
            tiles.take(1 + (take_predictions - 1) // prediction_batch)
        )
    else:
        predictions = model.predict(tiles)
    print(f"predictions[0].shape = {predictions[0].shape}")
    print(f"Made predictions in {time.time() - start_time}s", flush=True)
    return predictions


if True:
    gpus = [gpu.name for gpu in tf.config.list_logical_devices("GPU")]
    print(f"gpus = {repr(gpus)}")

# if __name__ == "__main__":
with tf.device(gpus[0]):
    device = "gpu" if True else "cpu"
    print(f"***** device = {device} *****")
    training_batch = 2**7
    num_epochs = 6
    take_predictions = 2**10 if False else 0

    wsi_path, mask_path = get_data()
    unwrapped_model, model = build_model(training_batch, num_epochs)

    for prediction_batch in [2**j for j in range(5, 11)]:
        for chunk_size in [256] + [2**j for j in range(8, 14)]:
            print(
                f"***** chunk_size = {chunk_size},"
                f" prediction_batch = {prediction_batch},"
                f" take_predictions = {take_predictions} *****",
                flush=True,
            )
            study, tiles = create_study(wsi_path, mask_path, chunk_size)
            predictions = predict(take_predictions, prediction_batch, model, tiles)
    print(f"***** Finished with device = {device} *****")
--------------------------------------------------------------------------------
/example/performance-detection.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

import os
import time

import pooch
import tensorflow as tf

import histomics_stream as hs
import histomics_stream.tensorflow


"""
This is a script that is used to make timings of histomics_stream. To some extent, it
may be specific to the computer / docker image it is used with and may need minor
tweaks to run on another computer.
"""

"""
# If you've just started a fresh docker container you may need some of this:
apt update ; apt install -y git emacs ; \
rm -rf /.local ; \
pip install -U pip setuptools wheel ; \
pip install \
    'batchbald_redux' \
    'black[jupyter]' \
    'large_image[openslide,tiff]' \
    'nbformat>=5.2.0' \
    'pooch' \
    'protobuf<3.20' \
    'tensorflow_datasets' \
    'torch==1.12.1+cu113' \
    '/tf/notebooks/histomics_stream' \
    '/tf/notebooks/histomics_detect' \
    --extra-index-url https://download.pytorch.org/whl/cu113 \
    --find-links https://girder.github.io/large_image_wheels
"""


def get_data():
    start_time = time.time()
    wsi_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.svs",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV"
        "&confirm=t"
        "&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c"
        "&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632",
        known_hash="d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {wsi_path} in {time.time() - start_time}s", flush=True)

    # download binary mask image
    start_time = time.time()
    mask_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.mask.png",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=17GOOHbL8Bo3933rdIui82akr7stbRfta",
        known_hash="bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {mask_path} in {time.time() - start_time}s", flush=True)
    return wsi_path, mask_path


class WrappedModel(tf.keras.Model):
    def __init__(self, unwrapped_model, *args, **kwargs):
        super(WrappedModel, self).__init__(*args, **kwargs)
        self.unwrapped_model = unwrapped_model

    def call(self, element):
        return self.unwrapped_model(element[0]), element[1]


def build_model():
    start_time = time.time()
    model_path = pooch.retrieve(
        fname="tcga_brca_model",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=1KxB6iAn9j2Wp7oyFlV4T1Kli-mR8-35G"
        "&confirm=t"
        "&uuid=c5df8dfd-ed48-4cef-81a0-19df97677fe5"
        "&at=ALgDtswWzs0BEdkVNgFrp83p9NDO:1679111246793",
        known_hash="b5b5444cc8874d17811a89261abeafd9b9603e7891a8b2a98d8f13e2846a6689",
        path=str(pooch.os_cache("pooch")) + os.sep + "model",
        processor=pooch.Unzip(),
    )
    model_path = os.path.split(model_path[0])[0]
    print(f"Have {model_path}.")

    # restore keras model
    from histomics_detect.models import FasterRCNN

    model = tf.keras.models.load_model(
        model_path, custom_objects={"FasterRCNN": FasterRCNN}
    )

    unwrapped_model = model
    model = WrappedModel(unwrapped_model)

    print(f"Finished model in {time.time() - start_time}s", flush=True)
    return unwrapped_model, model


def create_study(wsi_path, mask_path, chunk_size):
    start_time = time.time()
    slide_name = os.path.splitext(os.path.split(wsi_path)[1])[0]
    slide_group = "Group 3"

    study = dict(
        version="version-1",
        tile_height=256,
        tile_width=256,
        overlap_height=64,
        overlap_width=64,
        slides=dict(
            Slide_0=dict(
                filename=wsi_path,
                slide_name=slide_name,
                slide_group=slide_group,
                chunk_height=chunk_size,
                chunk_width=chunk_size,
            )
        ),
    )

    find_slide_resolution = hs.configure.FindResolutionForSlide(
        study, target_magnification=20, magnification_source="exact"
    )
    tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
        study, mask_filename=mask_path
    )
    # We could apply these to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in study["slides"].values():
        find_slide_resolution(slide)
        tiles_by_grid_and_mask(slide)
    print(f"Masked study in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    create_tensorflow_dataset = hs.tensorflow.CreateTensorFlowDataset()
    tiles = create_tensorflow_dataset(study, num_workers=1, worker_index=0)
    print(f"#tiles = {len(create_tensorflow_dataset.get_tiles(study)[0][1])}")
    print(f"Chunked study in {time.time() - start_time}s", flush=True)

    return study, tiles


def predict(take_predictions, prediction_batch, model, tiles):
    start_time = time.time()
    tiles = tiles.batch(prediction_batch)
    if take_predictions > 0:
        predictions = model.predict(
            tiles.take(1 + (take_predictions - 1) // prediction_batch)
        )
    else:
        predictions = model.predict(tiles)
    print(f"predictions[0].shape = {predictions[0].shape}")
    print(f"Made predictions in {time.time() - start_time}s", flush=True)
    return predictions


if True:
    gpus = [gpu.name for gpu in tf.config.list_logical_devices("GPU")]
    print(f"gpus = {repr(gpus)}")

# if __name__ == "__main__":
with tf.device(gpus[0]):
    device = "cuda"
    print(f"***** device = {device} *****")
    take_predictions = 2**17 if False else 0
    wsi_path, mask_path = get_data()
    unwrapped_model, model = build_model()

    for prediction_batch in (1,):
        for chunk_size in [256] + [2**j for j in range(8, 14)]:
            print(
                f"***** chunk_size = {chunk_size},"
                f" prediction_batch = {prediction_batch},"
                f" take_predictions = {take_predictions} *****",
                flush=True,
            )
            study, tiles = create_study(wsi_path, mask_path, chunk_size)
            predictions = predict(take_predictions, prediction_batch, model, tiles)
    print(f"***** Finished with device = {device} *****")
--------------------------------------------------------------------------------
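An aside on the expression `tiles.take(1 + (take_predictions - 1) // prediction_batch)` used by `predict()` in these performance scripts: it is a ceiling division that takes just enough batches to cover `take_predictions` tiles. A standalone illustration with a toy dataset:

```python
import tensorflow as tf

take_predictions = 10
prediction_batch = 4

ds = tf.data.Dataset.range(100).batch(prediction_batch)
num_batches = 1 + (take_predictions - 1) // prediction_batch  # ceil(10 / 4) == 3
for batch in ds.take(num_batches):
    print(batch.numpy())  # [0 1 2 3], then [4 5 6 7], then [8 9 10 11]
```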
/example/performance-mnist.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

import os
import time

import pooch
import tensorflow as tf
import tensorflow_datasets as tfds

import histomics_stream as hs
import histomics_stream.tensorflow


"""
This is a script that is used to make timings of histomics_stream. To some extent, it
may be specific to the computer / docker image it is used with and may need minor
tweaks to run on another computer.
"""

"""
# If you've just started a fresh docker container you may need some of this:
apt update ; apt install -y git emacs ; \
rm -rf /.local ; \
pip install -U pip setuptools wheel ; \
pip install \
    'batchbald_redux' \
    'black[jupyter]' \
    'large_image[openslide,tiff]' \
    'nbformat>=5.2.0' \
    'pooch' \
    'protobuf<3.20' \
    'tensorflow_datasets' \
    'torch==1.12.1+cu113' \
    '/tf/notebooks/histomics_stream' \
    --extra-index-url https://download.pytorch.org/whl/cu113 \
    --find-links https://girder.github.io/large_image_wheels
"""


def get_data():
    start_time = time.time()
    wsi_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.svs",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV"
        "&confirm=t"
        "&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c"
        "&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632",
        known_hash="d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {wsi_path} in {time.time() - start_time}s", flush=True)

    # download binary mask image
    start_time = time.time()
    mask_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.mask.png",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=17GOOHbL8Bo3933rdIui82akr7stbRfta",
        known_hash="bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {mask_path} in {time.time() - start_time}s", flush=True)
    return wsi_path, mask_path


class WrappedModel(tf.keras.Model):
    def __init__(self, model, *args, **kwargs):
        super(WrappedModel, self).__init__(*args, **kwargs)
        self.model = model

    def call(self, element):
        # Use just the red channel of the color image
        return (self.model(element[0][..., 0]), element[1])


def normalize_img(image, label):
    """Normalizes images: `uint8` -> `float32`."""
    return tf.cast(image, tf.float32) / 255.0, label


def build_model(training_batch, epochs):
    start_time = time.time()
    (ds_train, ds_test), ds_info = tfds.load(
        "mnist",
        split=["train", "test"],
        shuffle_files=True,
        as_supervised=True,
        with_info=True,
    )
    print(f"Finished tfds.load in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    ds_train = ds_train.map(normalize_img, num_parallel_calls=tf.data.AUTOTUNE)
    ds_train = ds_train.cache()
    ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples)
    ds_train = ds_train.batch(training_batch)
    ds_train = ds_train.prefetch(tf.data.AUTOTUNE)
    ds_test = ds_test.map(normalize_img, num_parallel_calls=tf.data.AUTOTUNE)
    ds_test = ds_test.batch(training_batch)
    ds_test = ds_test.cache()
    ds_test = ds_test.prefetch(tf.data.AUTOTUNE)
    print(f"Finished (ds_train, ds_test) in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    model = tf.keras.models.Sequential(
        [
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(10),
        ]
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )
    model.fit(ds_train, epochs=epochs, validation_data=ds_test)

    unwrapped_model = model
    model = WrappedModel(unwrapped_model)

    print(f"Finished model in {time.time() - start_time}s", flush=True)
    return unwrapped_model, model


def create_study(wsi_path, mask_path, chunk_size):
    start_time = time.time()
    slide_name = os.path.splitext(os.path.split(wsi_path)[1])[0]
    slide_group = "Group 3"

    study = dict(
        version="version-1",
        tile_height=28,
        tile_width=28,
        overlap_height=14,
        overlap_width=14,
        slides=dict(
            Slide_0=dict(
                filename=wsi_path,
                slide_name=slide_name,
                slide_group=slide_group,
                chunk_height=chunk_size,
                chunk_width=chunk_size,
            )
        ),
    )

    find_slide_resolution = hs.configure.FindResolutionForSlide(
        study, target_magnification=20, magnification_source="exact"
    )
    tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
        study, mask_filename=mask_path
    )
    # We could apply these to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in study["slides"].values():
        find_slide_resolution(slide)
        tiles_by_grid_and_mask(slide)
    print(f"Masked study in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    create_tensorflow_dataset = hs.tensorflow.CreateTensorFlowDataset()
    tiles = create_tensorflow_dataset(study, num_workers=1, worker_index=0)
    print(f"#tiles = {len(create_tensorflow_dataset.get_tiles(study)[0][1])}")
    print(f"Chunked study in {time.time() - start_time}s", flush=True)

    return study, tiles


def predict(take_predictions, prediction_batch, model, tiles):
    start_time = time.time()
    tiles = tiles.batch(prediction_batch)
    if take_predictions > 0:
        predictions = model.predict(
            tiles.take(1 + (take_predictions - 1) // prediction_batch)
        )
    else:
        predictions = model.predict(tiles)
    print(f"predictions[0].shape = {predictions[0].shape}")
    print(f"Made predictions in {time.time() - start_time}s", flush=True)
    return predictions


if True:
    gpus = [gpu.name for gpu in tf.config.list_logical_devices("GPU")]
    print(f"gpus = {repr(gpus)}")

# if __name__ == "__main__":
with tf.device(gpus[0]):
    device = "cuda"
    print(f"***** device = {device} *****")
    training_batch = 2**7
    num_epochs = 6
    take_predictions = 2**17 if False else 0

    wsi_path, mask_path = get_data()
    unwrapped_model, model = build_model(training_batch, num_epochs)

    for prediction_batch in [2**j for j in range(5, 11)]:
        for chunk_size in [28] + [2**j for j in range(6, 14)]:
            print(
                f"***** chunk_size = {chunk_size},"
                f" prediction_batch = {prediction_batch},"
                f" take_predictions = {take_predictions} *****",
                flush=True,
            )
            study, tiles = create_study(wsi_path, mask_path, chunk_size)
            predictions = predict(take_predictions, prediction_batch, model, tiles)
    print(f"***** Finished with device = {device} *****")
--------------------------------------------------------------------------------

/example/performance-EfficientNet_V2_S_Weights.IMAGENET1K_V1.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

import argparse
import itertools
import os
import time

import pooch
import torch
import torchvision

import histomics_stream as hs
import histomics_stream.pytorch


"""
This is a script that is used to make timings of histomics_stream. To some extent, it
may be specific to the computer / docker image it is used with and may need minor
tweaks to run on another computer.
"""

"""
# If you've just started a fresh docker container you may need some of this:
apt update ; apt install -y git emacs ; \
rm -rf /.local ; \
pip install -U pip setuptools wheel pillow ; \
pip install \
    'black[jupyter]' \
    'large_image[openslide,tiff]' \
    'monai[pillow,tqdm,ignite,gdown]' \
    'nbformat>=5.2.0' \
    'pooch' \
    'protobuf' \
    '/tf/notebooks/histomics_stream' \
    --find-links https://girder.github.io/large_image_wheels
"""


def get_data():
    start_time = time.time()
    wsi_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.svs",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV"
        "&confirm=t"
        "&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c"
        "&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632",
        known_hash="d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {wsi_path} in {time.time() - start_time}s", flush=True)

    # download binary mask image
    start_time = time.time()
    mask_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.mask.png",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=17GOOHbL8Bo3933rdIui82akr7stbRfta",
        known_hash="bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {mask_path} in {time.time() - start_time}s", flush=True)
    return wsi_path, mask_path


class WrappedModel(torch.nn.modules.module.Module):
    def __init__(self, model, preprocess_fn, *args, device="cuda", **kwargs):
        super(WrappedModel, self).__init__(*args, **kwargs)
        self.device = torch.device(device)
        self.model = model.to(self.device)
        self.preprocess_fn = preprocess_fn.to(self.device)

    def forward(self, x):
        p = self.model(self.preprocess_fn(x[0].to(self.device)))
        return p, x[1]


def build_model(device="cuda"):
    start_time = time.time()
    # print(f"available_models = {repr(sorted(torchvision.models.list_models()))}")
    weights = torchvision.models.EfficientNet_V2_S_Weights.DEFAULT
    model = torchvision.models.efficientnet_v2_s(weights=weights)
    _ = model.eval()
    preprocess_fn = weights.transforms()

    unwrapped_model = model
    model = WrappedModel(unwrapped_model, preprocess_fn, device=device).to(device)

    print(f"Finished model in {time.time() - start_time}s", flush=True)
    return unwrapped_model, model


def create_study(wsi_path, mask_path, chunk_size):
    start_time = time.time()
    slide_name = os.path.splitext(os.path.split(wsi_path)[1])[0]
    slide_group = "Group 3"

    study = dict(
        version="version-1",
        tile_height=224,
        tile_width=224,
        overlap_height=0,
        overlap_width=0,
        slides=dict(
            Slide_0=dict(
                filename=wsi_path,
                slide_name=slide_name,
                slide_group=slide_group,
                chunk_height=chunk_size,
                chunk_width=chunk_size,
            )
        ),
    )

    find_slide_resolution = hs.configure.FindResolutionForSlide(
        study, target_magnification=20, magnification_source="exact"
    )
    tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
        study, mask_filename=mask_path
    )
    # We could apply these to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in study["slides"].values():
        find_slide_resolution(slide)
        tiles_by_grid_and_mask(slide)
    print(f"Masked study in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    create_torch_dataloader = hs.pytorch.CreateTorchDataloader()
    tiles = create_torch_dataloader(study)
    print(f"#tiles = {len(create_torch_dataloader.get_tiles(study)[0][1])}")
    print(f"Chunked study in {time.time() - start_time}s", flush=True)
    return study, tiles


def show_structure(x):
    if isinstance(x, list):
        if len(x) > 0:
            return f"[{len(x)} of {show_structure(x[0])}]"
        else:
            return repr(list())
    if isinstance(x, tuple):
        if len(x) > 0:
            return f"({len(x)} of {show_structure(x[0])})"
        else:
            return repr(tuple())
    if isinstance(x, set):
        if len(x) > 0:
            return f"{{{len(x)} of {show_structure(next(iter(x)))}}}"
        else:
            return repr(set())
    if isinstance(x, dict):
        if len(x) > 0:
            return f"{{{len(x)} of {show_structure(next(iter(x.keys())))}: {show_structure(next(iter(x.values())))}}}"
        else:
            return repr(dict())
    return repr(type(x))


"""
!!! Probably we should be using torch.utils.data.DataLoader batch_size option instead of
!!! this batched() function.
"""


def batched(iterable, batch_size):
    """
    Batch data into lists of length batch_size. The last batch may be shorter:
    batched('ABCDEFG', 3) --> ABC DEF G
    """
    iterator = iter(iterable)
    # !!! Can we get rid of `list` here and a few lines below?  It is used so that we
    # !!! can detect an empty list with `while`.
    batch = list(itertools.islice(iterator, batch_size))
    while batch:
        # Yield `batch` in such a way that this iterator does not keep a reference count
        # for it.
        batch_in_list = [batch]
        del batch
        yield batch_in_list.pop()
        batch = list(itertools.islice(iterator, batch_size))


def predict_and_detach(model, item):
    predict = model(item)
    return predict[0].detach().cpu().numpy(), predict[1]


def predict(take_predictions, prediction_batch, model, tiles):
    start_time = time.time()
    if take_predictions > 0:
        tiles = itertools.islice(tiles, take_predictions)
    batched_tiles = (
        batched(tiles, prediction_batch) if prediction_batch > 0 else [tiles]
    )
    predictions = list()
    for batch in batched_tiles:
        batch_predictions = [predict_and_detach(model, item) for item in batch]
        predictions.extend(batch_predictions)
        del batch_predictions, batch
    print(f"Made predictions in {time.time() - start_time}s", flush=True)
    return predictions


def create_and_predict(
    wsi_path, mask_path, chunk_size, take_predictions, prediction_batch, model
):
    study, tiles = create_study(wsi_path, mask_path, chunk_size)
    predictions = predict(take_predictions, prediction_batch, model, tiles)
    print(f"show_structure(predictions) = {show_structure(predictions)}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("device")
    args = parser.parse_args()
    # device = "cuda" if True else "cpu"
    device = args.device
    print(f"***** device = {device} *****")
    take_predictions = 2**8 if True else 0

    wsi_path, mask_path = get_data()
    unwrapped_model, model = build_model(device=device)

    # for prediction_batch in [2**j for j in range(0, 6)]:
    for prediction_batch in [0]:
        for chunk_size in [1024] + [2**j for j in range(8, 14)]:
            print(
                f"***** chunk_size = {chunk_size},"
                f" prediction_batch = {prediction_batch},"
                f" take_predictions = {take_predictions} *****",
                flush=True,
            )
            create_and_predict(
                wsi_path,
                mask_path,
                chunk_size,
                take_predictions,
                prediction_batch,
                model,
            )
    print(f"***** Finished with device = {device} *****")
--------------------------------------------------------------------------------
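Regarding the `!!!` note above about `torch.utils.data.DataLoader`'s `batch_size` option: here is a minimal sketch of that alternative, using a hypothetical stand-in dataset rather than the real one behind `hs.pytorch.CreateTorchDataloader`.

```python
import torch
from torch.utils.data import DataLoader, IterableDataset


class TileStream(IterableDataset):
    """Hypothetical stand-in that yields (tile, label) pairs like the real stream."""

    def __init__(self, num_tiles):
        self.num_tiles = num_tiles

    def __iter__(self):
        for i in range(self.num_tiles):
            yield torch.zeros(3, 224, 224), i  # fake RGB tile plus a label


# DataLoader performs the batching that batched() above does by hand; the last
# batch is simply shorter when num_tiles is not a multiple of batch_size.
loader = DataLoader(TileStream(10), batch_size=4)
for tiles, labels in loader:
    print(tiles.shape, labels)  # torch.Size([4, 3, 224, 224]) for full batches
```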
If overlap_height is not supplied, it is set to zero. Zero indicates that there is no overlap between adjacent tiles; they are abutting.
19 | + ***overlap_width*** (int):
20 |   Specifies the desired amount of horizontal overlap between adjacent tiles, measured in pixels using the `target_magnification` (described below). If overlap_width is not supplied, it is set to zero. Zero indicates that there is no overlap between adjacent tiles; they are abutting.
21 | + ***slides*** (Python dict):
22 |   Contains information about the study's slides. The distinct keys of this Python dict are set by the user for their own convenience, one per slide.
23 |   + *user-selected key for slide* (Python dict):
24 |     Contains information about this slide. The keys and values for this Python dict are:
25 |     + ***filename*** (str):
26 |       The path to the file containing the pixel data for this slide.
27 |     + ***slide_name*** (str):
28 |       A user-supplied name for this slide.
29 |     + ***slide_group*** (str):
30 |       A user-supplied name for the group to which this slide belongs.
31 |     + ***chunk_height*** (int):
32 |       For read efficiency, how high a chunk of data read in a single read operation should be, measured in pixels using the `target_magnification` (described below).
33 |     + ***chunk_width*** (int):
34 |       For read efficiency, how wide a chunk of data read in a single read operation should be, measured in pixels using the `target_magnification` (described below).
35 |     + **target_magnification** (float):
36 |       The image magnification that the user wishes to use for the slide, if available given other restrictions. A value of 10 corresponds to a pixel resolution of approximately 1 micron; magnification 40 is approximately 0.25 microns per pixel.
37 |     + **scan_magnification** (float):
38 |       The highest magnification directly available from the file storing the image.
39 |     + **read_magnification** (float):
40 |       The magnification directly read from the file storing the image. This will be the smallest magnification directly available that is at least as large as the `target_magnification` if `magnification_source in ("exact", "native")`; it will be the `scan_magnification` if `magnification_source="scan"` is selected.
41 |     + **returned_magnification** (float):
42 |       The magnification of the pixel data returned by `histomics_stream`. This will be the `target_magnification` if `magnification_source="exact"` is selected; it will be the `read_magnification` if `magnification_source="native"` is selected; it will be the `scan_magnification` if `magnification_source="scan"` is selected.
43 |     + **level** (float):
44 |       The internal `large_image` level that defines the `returned_magnification`.
45 |     + **slide_width** (int):
46 |       How wide is the slide, measured in pixels using the `target_magnification` (described above).
47 |     + **slide_height** (int):
48 |       How high is the slide, measured in pixels using the `target_magnification` (described above).
49 |     + **slide_height_tiles** (int):
50 |       How many (possibly overlapping) tiles fit into the height of the slide.
51 |     + **slide_width_tiles** (int):
52 |       How many (possibly overlapping) tiles fit into the width of the slide.
53 |     + **mask_height** (int):
54 |       If a mask is supplied, this is the mask's height at its scan resolution.
55 |     + **mask_width** (int):
56 |       If a mask is supplied, this is the mask's width at its scan resolution.
57 |     + **tiles** (Python dict):
58 |       Contains information about the slide's tiles. The keys of this Python dict are set by the user for their own convenience, one per tile.
59 |       + *user-selected key for tile* (Python dict):
60 |         Contains information about this tile. The keys and values for this Python dict are:
61 |         + **tile_top** (int):
62 |           The index of the top row of the tile, where 0 is the top row of the slide, measured in pixels using the `target_magnification` (described above).
63 |         + **tile_left** (int):
64 |           The index of the leftmost column of the tile, where 0 is the leftmost column of the slide, measured in pixels using the `target_magnification` (described above).
65 |     + **chunks** (Python dict):
66 |       Contains information about the slide's read chunks. The keys of this Python dict are set by `histomics_stream` for its own convenience, one per chunk.
67 |       + *key for chunk* (Python dict):
68 |         Contains information about this chunk. The keys and values for this Python dict are:
69 |         + **chunk_top** (int):
70 |           The index of the top row of the chunk, where 0 is the top row of the slide, measured in pixels using the `target_magnification` (described above).
71 |         + **chunk_left** (int):
72 |           The index of the leftmost column of the chunk, where 0 is the leftmost column of the slide, measured in pixels using the `target_magnification` (described above).
73 |         + **chunk_bottom** (int):
74 |           The index of the bottom row of the chunk, where 0 is the top row of the slide, measured in pixels using the `target_magnification` (described above).
75 |         + **chunk_right** (int):
76 |           The index of the rightmost column of the chunk, where 0 is the leftmost column of the slide, measured in pixels using the `target_magnification` (described above).
77 |         + **tiles** (Python dict):
78 |           The tiles that will be read together when this chunk is read; `chunk["tiles"][tile_key]` is a reference to the corresponding `slide["tiles"][tile_key]` value.
79 | 
80 | ## Arguments for `histomics_stream` function objects
81 | 
82 | + ***target_magnification*** (float):
83 |   The image magnification that the user wishes to use for the slide, if available given other restrictions. A value of 10 corresponds to a pixel resolution of approximately 1 micron; magnification 40 is approximately 0.25 microns per pixel.
84 | 
85 | + ***magnification_source*** (str in ["scan", "native", "exact"]):
86 |   "scan" will produce tiles from the highest magnification available. This is typically the slide scanner's objective magnification.
87 | 
88 |   "native" will produce tiles from the nearest available magnification equal to or greater than target_magnification (within a 2% tolerance). The "native" option is useful when you want to handle resizing of tiles to target_magnification on your own.
89 | 
90 |   "exact" will produce tiles using the "native" option and then resize these tiles to match target_magnification. Resizing is handled by PIL using the Lanczos antialiasing filter since the resizing shrinks the tile by definition.
91 | 
92 |   For either "scan" or "native", the size of the read and returned tiles will be (tile_height * returned_magnification / target_magnification, tile_width * returned_magnification / target_magnification). For example, with tile_height = tile_width = 256, target_magnification = 20, and returned_magnification = 40, the returned tiles are 512 × 512 pixels. For "exact" the size of the returned tiles will be (tile_height, tile_width).
93 | 
94 |   This procedure sets values in the Python dict for this slide to capture the scan, read, and returned magnification of the tiles. This is helpful, for example, to resize results to the scan magnification for visualization in HistomicsUI, or to resize between native and target magnification when using "native".
"scan_magnification" is the highest magnification from the source file; "read_magnification" is the magnification read from the source file; "returned_magnification" is the magnification of the returned tiles which is same as "read_magnification" in the case of "scan" or "native" or is the same as "target_magnification" in the case of "exact". 95 | 96 | + ***randomly_select*** (int): 97 | The number of tiles to be randomly selected from the list that would otherwise be written to the Python dict for this slide. A value of `-1` is the default and means that all tiles should be written, except that the default is `+1` for `TilesRandomly`. 98 | 99 | + ***overlap_height*** (int): 100 | Specifies the desired amount of vertical overlap between adjacent tiles, measured in pixels using the `target_magnification` (described above). If overlap_height is not supplied, it is read from the study dictionary, if available, otherwise it is set to zero. Zero indicates that there is no overlap between adjacent tiles; they are abutting. 101 | 102 | + ***overlap_width*** (int): 103 | Specifies the desired amount of horizontal overlap between adjacent tiles, measured in pixels using the `target_magnification` (described above). If overlap_width is not supplied, it is read from the study dictionary, if available, otherwise it is set to zero. Zero indicates that there is no overlap between adjacent tiles; they are abutting. 104 | 105 | + ***mask_filename*** (str): 106 | The path of the image file to be read and used as a mask. The aspect ratio of the mask (in terms of its pixel dimensions) is expected to be about the same as the aspect ratio of the main image ( in terms of its grid of tiles). A non-zero value in the mask indicates that the tile should be retained. The default is "", which means that there is no masking. 107 | 108 | + ***mask_threshold*** (float): 109 | A value in [0.0, 1.0]. A tile is retained if the fraction of the tile overlapping non-zero pixels in the mask is at least the mask_threshold. 110 | 111 | + ***tiles_dictionary*** (Python dict): 112 | For example, `{'AB234': {'tile_top': top0, 'tile_left': left0}, 'CD43': {'tile_top': top1, 'tile_left': left1}, ...}`. Tiles from this list will copied into the Python dict for this slide if they are randomly selected. 113 | -------------------------------------------------------------------------------- /histomics_stream/pytorch.py: -------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # 3 | # Copyright NumFOCUS 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0.txt 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # ========================================================================= 18 | 19 | """Whole-slide image streamer for machine learning frameworks.""" 20 | 21 | import numpy as np 22 | import torch 23 | 24 | from . import configure 25 | 26 | 27 | """ 28 | See: How to load a list of numpy arrays to pytorch dataset loader? 
29 | https://stackoverflow.com/questions/44429199/how-to-load-a-list-of-numpy-arrays-to-pytorch-dataset-loader
30 | 
31 | torchvision.transforms.ToTensor transforms a numpy array or PIL image to a torch tensor
32 | torch.LongTensor maybe stacks tensors
33 | Subclassing torch.utils.data.Dataset maybe provides something like a tensorflow
34 | dataset's interface
35 | Using a torch.utils.data.DataLoader on the torch.utils.data.Dataset subclass is maybe
36 | like creating the actual dataset.
37 | """
38 | 
39 | """
40 | See: A Comprehensive Guide to the DataLoader Class and Abstractions in PyTorch
41 | https://blog.paperspace.com/dataloaders-abstractions-pytorch/
42 | """
43 | 
44 | """
45 | Notes 5/31/2023: For multi-processing, torch seems to like a single shared
46 | torch.utils.data.IterableDataset, but one torch.utils.data.DataLoader per worker. If we
47 | are to avoid loading all workers' pixel data in each worker, the Dataset should not
48 | be loading in the pixel data, just creating the associated dictionary. There should be
49 | one dictionary per *chunk*, which includes its list of tiles, and the loading of the
50 | pixel data per chunk should somehow be deferred to the DataLoader.
51 | 
52 | If we create the dataset in an eager fashion, which may be reasonable if it is not
53 | including the pixel data, then it can instead be a (map-style rather than iterable-style)
54 | torch.utils.data.Dataset. Especially if we compute worker_index = chunk_index %
55 | num_workers as part of the annotation, it might be quite easy to use a DataLoader's
56 | `num_workers` and `sampler` parameters to direct that pixel data are read only for those
57 | chunks that belong to a given worker, at the time that the DataLoader is created.
58 | 
59 | Ultimately the goal is to have the pixel data read and predicted within a single worker
60 | before it is grouped back together to return to the user. If the above doesn't work for
61 | that, instead of using `num_workers` in the DataLoader constructor, we might
62 | explicitly use num_worker instances of a DataLoader, created using num_worker calls to
63 | torch.multiprocessing.Process(target=DataLoader, args=) or similar. These are started
64 | in one loop and then joined in another loop. See
65 | https://pytorch.org/docs/stable/notes/multiprocessing.html. We'll probably need a
66 | torch.multiprocessing.Queue to collect outputs, similarly to but not quite the same as
67 | https://teddykoker.com/2020/12/dataloader/.
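A possible sketch of the sharding idea above (an assumption, not implemented here):
record worker_index = chunk_index % num_workers on each chunk dictionary when the
study is configured, then give each worker's DataLoader a sampler that keeps only
the chunks whose worker_index matches its own, so that each worker reads pixel
data only for its own chunks.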
68 | """ 69 | 70 | 71 | class CreateTorchDataloader(configure.ChunkLocations): 72 | class MyDataset(torch.utils.data.IterableDataset, configure._TilesByCommon): 73 | def __init__(self, study_description): 74 | configure._TilesByCommon.__init__(self) 75 | torch.utils.data.IterableDataset.__init__(self) 76 | """Store in self the data or pointers to it""" 77 | # Update keys of the dictionary from deprecated names 78 | self._update_dict(study_description) 79 | for slide_description in study_description["slides"].values(): 80 | self._update_dict(slide_description) 81 | for chunk_description in slide_description["chunks"].values(): 82 | self._update_dict(chunk_description) 83 | for tile_description in chunk_description["tiles"].values(): 84 | self._update_dict(tile_description) 85 | 86 | self.study_description = study_description 87 | 88 | def __iter__(self): 89 | """Return an iterable that yields tiles=(pixel data, annotation_dict)""" 90 | 91 | def my_iterable(): 92 | """This is the iterable that we will return""" 93 | study_description = self.study_description 94 | study_dict = { 95 | # !!! Is it better to have the dictionary values be length-one 96 | # !!! lists, here and below? 97 | # !!! Or use 98 | # !!! {key: torch.from_numpy(np.array(study_description[key]))}? 99 | key: study_description[key] 100 | for key in study_description.keys() 101 | if key != "slides" 102 | } 103 | for slide_description in study_description["slides"].values(): 104 | slide_dict = { 105 | **study_dict, 106 | **{ 107 | key: slide_description[key] 108 | for key in slide_description.keys() 109 | if key not in ["tiles", "chunks"] 110 | }, 111 | } 112 | 113 | filename = slide_dict["filename"] 114 | returned_magnification = slide_dict["returned_magnification"] 115 | factor = slide_dict["target_magnification"] / returned_magnification 116 | scaled_tile_height = configure.ChunkLocations.scale_it( 117 | slide_dict["tile_height"], factor 118 | ) 119 | scaled_tile_width = configure.ChunkLocations.scale_it( 120 | slide_dict["tile_width"], factor 121 | ) 122 | 123 | for chunk_description in slide_description["chunks"].values(): 124 | chunk_dict = { 125 | **slide_dict, 126 | **{ 127 | key: chunk_description[key] 128 | for key in chunk_description.keys() 129 | if key != "tiles" 130 | }, 131 | } 132 | 133 | # Call to the superclass to get the pixel data for this chunk. 134 | # Keep only first 3 colors. Convert to np.uint8. 135 | scaled_chunk_top = configure.ChunkLocations.scale_it( 136 | chunk_dict["chunk_top"], factor 137 | ) 138 | scaled_chunk_left = configure.ChunkLocations.scale_it( 139 | chunk_dict["chunk_left"], factor 140 | ) 141 | scaled_chunk_bottom = configure.ChunkLocations.scale_it( 142 | chunk_dict["chunk_bottom"], factor 143 | ) 144 | scaled_chunk_right = configure.ChunkLocations.scale_it( 145 | chunk_dict["chunk_right"], factor 146 | ) 147 | 148 | # Use `:3` to change RGBA (if applicable) to RGB. 149 | scaled_chunk_pixels = configure.ChunkLocations.read_large_image( 150 | filename, 151 | scaled_chunk_top, 152 | scaled_chunk_left, 153 | scaled_chunk_bottom, 154 | scaled_chunk_right, 155 | returned_magnification, 156 | )[..., :3].astype(dtype=np.float32) 157 | # Color is the last/fastest dimension for images read with 158 | # large_image, but channel is the first/slowest for Torch 159 | # tensors. 
160 | scaled_chunk_pixels = np.moveaxis(scaled_chunk_pixels, -1, 0) 161 | scaled_chunk_pixels = torch.from_numpy(scaled_chunk_pixels) 162 | 163 | for tile_description in chunk_description["tiles"].values(): 164 | tile_dict = { 165 | **chunk_dict, 166 | **{ 167 | key: tile_description[key] 168 | for key in tile_description.keys() 169 | }, 170 | } 171 | scaled_tile_top = ( 172 | configure.ChunkLocations.scale_it( 173 | tile_dict["tile_top"], factor 174 | ) 175 | - scaled_chunk_top 176 | ) 177 | scaled_tile_left = ( 178 | configure.ChunkLocations.scale_it( 179 | tile_dict["tile_left"], factor 180 | ) 181 | - scaled_chunk_left 182 | ) 183 | scaled_tile_bottom = scaled_tile_top + scaled_tile_height 184 | scaled_tile_right = scaled_tile_left + scaled_tile_width 185 | scaled_tile_pixels = scaled_chunk_pixels[ 186 | :, 187 | scaled_tile_top:scaled_tile_bottom, 188 | scaled_tile_left:scaled_tile_right, 189 | ] 190 | 191 | # Yield the pixel data as a tensor and the Python dict of 192 | # associated information. Rather than `yield 193 | # scaled_tile_pixels, tile_dict` we use lists and pop() so 194 | # that this iterator does not maintain a reference count for 195 | # the returned objects. 196 | pixels_in_list = [scaled_tile_pixels] 197 | dict_in_list = [tile_dict] 198 | del scaled_tile_pixels, tile_dict 199 | yield pixels_in_list.pop(), dict_in_list.pop() 200 | 201 | """Return this generator (iterable) over the tiles""" 202 | return my_iterable() 203 | 204 | def __init__(self): 205 | """Set global options""" 206 | configure.ChunkLocations.__init__(self) 207 | # !!! Instead, get `batch_size` from somewhere 208 | self.batch_size = 1 209 | 210 | def __call__(self, study_description): 211 | """ 212 | From scratch, creates a torch dataloader with one torch element per tile 213 | """ 214 | # Call to superclass to find the locations for the chunks 215 | super().__call__(study_description) 216 | 217 | my_dataset = self.MyDataset(study_description) 218 | # !!! DataLoader has additional parameters that we may wish to use 219 | my_data_loader = torch.utils.data.DataLoader( 220 | my_dataset, batch_size=self.batch_size 221 | ) 222 | 223 | return my_data_loader 224 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | https://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | https://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HistomicsStream
2 | 
3 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/InsightSoftwareConsortium/ITK/blob/master/LICENSE) [![PyPI Version](https://img.shields.io/pypi/v/histomics_stream.svg)](https://pypi.python.org/pypi/histomics_stream) [![GitHub repository](https://img.shields.io/badge/Powered%20by-HistomicsStream-blue.svg)](https://github.com/DigitalSlideArchive/HistomicsStream) [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DigitalSlideArchive/HistomicsStream/blob/master/example/tensorflow_stream.ipynb)
4 | 
5 | ## Overview
6 | 
7 | The goal of this project is to create a whole-slide image file reader for machine learning. This reader allows users to extract pixel data from whole-slide image formats, and supports reading paradigms that are commonly used during machine learning training and inference. The package currently supports TensorFlow 2 and PyTorch.
8 | 
9 | ## Installation for Python
10 | 
11 | `histomics_stream` can be easily installed with Python wheels. If you do not want the installation to be to your current Python environment, you should first create and activate a [Python virtual environment (venv)](https://docs.python.org/3/tutorial/venv.html) to work in. Then, run the following from the command line:
12 | 
13 | ```shell-script
14 | sudo apt update
15 | sudo apt install -y python3-openslide openslide-tools
16 | pip install histomics_stream 'large_image[openslide]' \
17 |     scikit_image --find-links https://girder.github.io/large_image_wheels
18 | ```
19 | Additional functionality is supported with subpackages, e.g., `histomics_stream[tensorflow,torch,zarr]`. These packages are optional when histomics_stream is used only for masking and/or organizing image tiles into larger image chunks that are more efficient to read than individual image tiles. However, if you are creating a tensorflow `Dataset` or a pytorch `DataLoader` then you will need the corresponding packages.
20 | 
21 | Additional image readers can be supported by using, e.g., `large_image[openslide,ometiff,openjpeg,bioformats]` instead of `large_image[openslide]`.
22 | 
23 | After launching `python3`, import the `histomics_stream` package with:
24 | 
25 | ```python
26 | import histomics_stream as hs
27 | ```
28 | 
29 | This has been tested with `tensorflow:2.6.2-gpu` and `tensorflow:2.8.0-gpu`.
30 | 
31 | ## History
32 | 
33 | Through version 1.0.6, this project was known as `tensorflow_reader`.
34 | 
35 | ## Study representation
36 | 
37 | `histomics_stream` works in two steps. It first builds an object that represents the study. Second, from that study object, it builds a `tensorflow` `Dataset` or `pytorch` `DataLoader` object, which efficiently reads the pixel data from files. The study object is described in [StudyObject.md](StudyObject.md).
38 | 
39 | ## Introduction
40 | 
41 | ![This is a chunk of an H&E stained slide that is about 0.5 mm by 0.3 mm, which is 1821 × 1196 pixels or about 7 × 4 tiles.](documentation/H&E_chunk.png)
42 | 
43 | Histopathology is the study of biopsied tissues under the microscope for the purpose of diagnosing disease. Glass slides of tissue specimens are prepared by staining thin tissue slices with chemicals to highlight cellular structures for examination.
Traditionally pathologists have examined glass slides to look for telltale signs of disease, but recently whole-slide images (WSIs) that digitize the entire slide at high magnification are being used in diagnosis. A single research study may involve thousands of WSIs, each containing several billion pixels that need to be analyzed by medical personnel. Computer vision algorithms based on machine learning are also increasingly being used to detect, classify, and measure structures in WSIs, both in research and clinical practice. Developing algorithms to analyze WSIs is challenging, since popular machine learning frameworks and computing hardware are built for analyzing much smaller images. For example, a typical WSI with 120,000 × 80,000 pixels contains the equivalent of 191 thousand 224 × 224 images, a typical size used in machine learning frameworks. 44 | 45 | We are producing software tools to simplify the development of computer vision algorithms for WSIs. These tools make working with WSI data more approachable for computer vision and machine learning experts, and will significantly accelerate research by attracting more people to the field of computational pathology. The National Institutes of Health-funded work, a collaboration of Kitware, Inc., Northwestern University, Wake Forest School of Medicine, and Emory University, uses machine learning to find regions of interest. `histomics_stream` sits at the start of the workflow. Specifically, `histomics_stream` is responsible for efficient access to the input image data that will be used to fit a new machine learning model or will be used to predict regions of interest in novel inputs using an already learned model. 46 | 47 | A histopathology tissue sample that is 25 mm × 25 mm (1 inch × 1 inch) and is imaged at a typical 40x magnification will be approximately 100,000 × 100,000 pixels, which is 30 gigabytes of uncompressed RGB data for a single image. A research study may have 10-10,000 such whole slide images. For machine learning purposes such as proposing regions of diagnostic value, these images are usually broken up into tiles, for example 256 × 256 pixels each, and there may be millions to billions of such tiles to be processed in machine learning operations. Especially with the prediction step of machine learning, simply reading these data from disk can be the biggest determinant of runtime performance. 48 | 49 | Several Python libraries, such as [`openslide`](https://openslide.org/api/python/) and [`large_image`](https://girder.github.io/large_image/), are capable of reading whole-slide images with efficiency. Additional power comes from packages such as [`Zarr`](https://www.nature.com/articles/s41592-021-01326-w), which distributes a single image’s data across multiple files. These packages are able to efficiently read a tile from anywhere within a whole-slide image without having to read the entire image. These work well in single-threaded CPU-based applications. However, machine learning involves massive parallelization and sophisticated scheduling, GPU-based computations, and relatively limited GPU-accessible memory. 
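
As a concrete point of reference, the single-threaded reading that these libraries provide looks roughly like the following sketch, which fetches one 256 × 256 tile from an arbitrary location in a slide without decoding the whole image. (The filename and coordinates here are placeholders, not part of `histomics_stream`.)

```python
import large_image
from large_image.constants import TILE_FORMAT_NUMPY

# Open a whole-slide image; only the metadata is read at this point.
source = large_image.getTileSource("example-slide.svs")  # hypothetical file

# Read a single 256 x 256 tile at full scan resolution.
tile, _ = source.getRegion(
    region=dict(left=10240, top=10240, width=256, height=256, units="base_pixels"),
    format=TILE_FORMAT_NUMPY,
)
print(tile.shape)  # e.g., (256, 256, 3) or (256, 256, 4), depending on the file
```

Machine learning workloads instead need many such reads scheduled in parallel, which is the gap that `histomics_stream` addresses.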
50 | 
51 | ## Methods
52 | 
53 | ![A whole-slide image (blue boundary, on order of 100,000 × 100,000) is broken up into chunks (orange boundary, on order of 2048 × 2048) that are read in a single I/O operation and are split into tiles (magenta border, on order of 256 × 256) that are analyzed.](documentation/slide_chunk_tile.png)
54 | 
55 | `histomics_stream` is a Python package that enables efficient access to large datasets of whole slide images for use in machine learning. In the first step, the user specifies the details of the data set and the desired operating parameters. The user specifies which images will be processed, where they can be found, what metadata is associated with each (e.g., cohort, subject identifier), a “chunk” size for each image, and a desired magnification to be used. A chunk size of 2048 × 2048 pixels works well in many scenarios we tested, but other values can be specified by the user; the chunk size indicates how much pixel data should be read from disk in each read operation, and in the default case it means that an 8 × 8 grid of tiles, each 256 × 256 pixels, is read efficiently with each disk read. In some image formats, such as TIFF and SVS, the image file includes the image data at multiple resolutions. `histomics_stream` selects which native resolution to use based upon the user-specified desired magnification.
56 | 
57 | In the first step the user also specifies the operating parameters. What size should each tile be? Should tiles be chosen uniformly in a grid fashion and, if so, how much overlap, if any, should there be between adjacent tiles? The user can supply a mask indicating which tiles from the grid should be used. Alternatively, the user can supply an explicit list of tiles to be used, whether or not they are on a grid. The user can indicate that a random subset of the otherwise allowable tiles should be selected.
58 | 
59 | As its second step, `histomics_stream` creates a TensorFlow Dataset object from the study description. As is the paradigm for TensorFlow, the creation is done in a lazy, non-eager fashion. By invoking the TensorFlow commands, `histomics_stream` creates a TensorFlow execution graph that specifies the dependencies within the data workflow. Together with TensorFlow’s scheduling and parallelism functionality, this execution graph simply and efficiently directs the reading of tiles from disk for direct use in TensorFlow model operations. The TensorFlow Dataset created by `histomics_stream` is then used directly in TensorFlow operations for machine learning, whether for model fitting, model evaluation, or use of a model to make predictions in novel input data.
60 | 
61 | ## Results
62 | 
63 | `histomics_stream` increases runtime performance and eases the construction of the needed TensorFlow execution graph.
64 | 
65 | ### Performance
66 | 
67 | ![histomics_stream is 65% faster in a typical example.](documentation/runtime.png)
68 | 
69 | The `histomics_stream` package significantly improves runtime performance. In a typical example, reading a single whole-slide image that is 19,784 × 27,888 pixels as non-overlapping tiles that are 256 × 256 pixels produces a 77 × 108 grid of 8316 tiles. The `large_image` package is impressive in its ability to seamlessly read multiple file formats and to efficiently read tiles from within large images; with `large_image` the runtime is a quick 16.9 tiles per second including reading and machine learning prediction, using a single GeForce RTX 2080 Ti.
With `histomics_stream` this workflow throughput is increased to 27.9 tiles per second, which is a 65% performance improvement. Much of the performance gain comes from reading data one chunk at a time rather than one tile at a time. Additional performance gain comes from the reliance on TensorFlow for the scheduling of reads; TensorFlow’s graph execution schedules each read to optimize the performance of the workflow as a whole.
70 | 
71 | ### Implementation
72 | 
73 | The steps of `histomics_stream` are demonstrated in the JupyterLab notebook [`example/tensorflow_stream.ipynb`](https://github.com/DigitalSlideArchive/HistomicsStream/blob/master/example/tensorflow_stream.ipynb), which is also available in [Google Colab](https://colab.research.google.com/github/DigitalSlideArchive/HistomicsStream/blob/master/example/tensorflow_stream.ipynb). Construction of a Python dictionary that describes the study data set is straightforward and key steps are implemented by `histomics_stream`. Complexities from TensorFlow are seamlessly handled. For example, the syntax for parallelizable for loops in TensorFlow, which are often essential for runtime performance, is non-intuitive; `histomics_stream` provides the desired parallelism without exposing this complexity. Similarly, TensorFlow can be temperamental about conditional control flows, requiring that its graph execution construction routines can prove that alternative execution branches that should be producing objects of the same shape actually do so; the design of `histomics_stream` gives the user the power to, e.g., efficiently select tiles under several alternative strategies, without exposing this graph execution complexity to the user.
74 | 
75 | ## Conclusions
76 | 
77 | The TensorFlow graph execution interface can be challenging and unintuitive. Instead, bioinformatics model creators can use `histomics_stream` to specify the dataset that is to be analyzed. `histomics_stream` takes care of TensorFlow execution graph creation and provides a significant runtime performance improvement.
78 | 
79 | ## Acknowledgments
80 | 
81 | This work was funded by the National Institutes of Health National Cancer Institute Informatics Technologies for Cancer Research (NIH NCI ITCR) U01 grant [5U01CA220401-04](https://reporter.nih.gov/search/dyu6NCTti06k6svCyr7--Q/project-details/9929565) entitled “Informatics Tools for Quantitative Digital Pathology Profiling and Integrated Prognostic Modeling” with Lee A. D. Cooper (Northwestern University), Metin N. Gurcan (Wake Forest School of Medicine), and Christopher R. Flowers (Emory University) as principal investigators and Kitware, Inc. as a subcontractor. Implementation is primarily by Lee A. Newberg (Kitware, Inc.).
82 | 
--------------------------------------------------------------------------------
/histomics_stream/tensorflow.py:
--------------------------------------------------------------------------------
1 | # =========================================================================
2 | #
3 | # Copyright NumFOCUS
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # https://www.apache.org/licenses/LICENSE-2.0.txt
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # ========================================================================= 18 | 19 | """Whole-slide image streamer for machine learning frameworks.""" 20 | 21 | import math 22 | 23 | import tensorflow as tf 24 | 25 | from . import configure 26 | 27 | 28 | class CreateTensorFlowDataset(configure.ChunkLocations): 29 | def __init__(self): 30 | configure.ChunkLocations.__init__(self) 31 | self.dataset_map_options = { 32 | "num_parallel_calls": tf.data.experimental.AUTOTUNE, 33 | "deterministic": False, 34 | } 35 | 36 | def __call__( 37 | self, 38 | study_description, 39 | num_workers=None, 40 | worker_index=None, 41 | private_threadpool_size=None, 42 | ): 43 | """ 44 | From scratch, creates a tensorflow dataset with one tensorflow element per tile 45 | """ 46 | num_workers = num_workers if num_workers is not None else 1 47 | worker_index = worker_index if worker_index is not None else 0 48 | private_threadpool_size = ( 49 | private_threadpool_size if private_threadpool_size is not None else 1 50 | ) 51 | 52 | # Call to superclass to find the locations for the chunks 53 | # print(f"Build chunks: begin {datetime.datetime.now()}") 54 | configure.ChunkLocations.__call__(self, study_description) 55 | # print(f"Build chunks: end {datetime.datetime.now()}") 56 | 57 | # print(f"Build one_chunk_per_slice: begin {datetime.datetime.now()}") 58 | study_keys = study_description 59 | slide_keys = next(iter(study_keys["slides"].values())) 60 | chunk_keys = next(iter(slide_keys["chunks"].values())) 61 | tile_keys = {"tiles_top": "tile_top", "tiles_left": "tile_left"} 62 | one_chunk_per_slice = { 63 | **{ 64 | key: tf.constant( 65 | [ 66 | study_description[key] 67 | for slide_description in study_description["slides"].values() 68 | for chunk_description in slide_description["chunks"].values() 69 | ] 70 | ) 71 | for key in study_keys 72 | if key != "slides" 73 | }, 74 | **{ 75 | key: tf.constant( 76 | [ 77 | slide_description[key] 78 | for slide_description in study_description["slides"].values() 79 | for chunk_description in slide_description["chunks"].values() 80 | ] 81 | ) 82 | for key in slide_keys 83 | if key not in ("tiles", "chunks") 84 | }, 85 | **{ 86 | key: tf.constant( 87 | [ 88 | chunk_description[key] 89 | for slide_description in study_description["slides"].values() 90 | for chunk_description in slide_description["chunks"].values() 91 | ] 92 | ) 93 | for key in chunk_keys 94 | if key != "tiles" 95 | }, 96 | **{ 97 | plural: tf.ragged.constant( 98 | [ 99 | [ 100 | tile_description[singular] 101 | for tile_description in chunk_description["tiles"].values() 102 | ] 103 | for slide_description in study_description["slides"].values() 104 | for chunk_description in slide_description["chunks"].values() 105 | ] 106 | ) 107 | for plural, singular in tile_keys.items() 108 | }, 109 | } 110 | # print(f"Build one_chunk_per_slice: end {datetime.datetime.now()}") 111 | 112 | # print( 113 | # "Build study_dataset from_tensor_slices: begin " 114 | # f"{datetime.datetime.now()}" 115 | # ) 116 | study_dataset = tf.data.Dataset.from_tensor_slices(one_chunk_per_slice) 117 | del one_chunk_per_slice 118 | # print( 119 | # f"Build study_dataset from_tensor_slices: end {datetime.datetime.now()}" 120 | # ) 121 | 122 | # print(f"study_dataset.element_spec = {study_dataset.element_spec}") 123 | 124 | # Shard the dataset before we have broken chunks into tiles so that all a 125 | # chunk's tiles stay 
together. 126 | if num_workers != 1 or worker_index != 0: 127 | study_dataset = study_dataset.shard(num_workers, worker_index) 128 | 129 | # We have accumulated the chunk datasets into a study_dataset where each element 130 | # is a chunk. Read in the chunk pixel data and split it into tiles. 131 | # print(f"Build study_dataset map: begin {datetime.datetime.now()}") 132 | study_dataset = study_dataset.map( 133 | CreateTensorFlowDataset._read_and_split_chunk, **self.dataset_map_options 134 | ) 135 | # print(f"Build study_dataset map: end {datetime.datetime.now()}") 136 | 137 | # Change study_dataset so that each element is a tile. 138 | study_dataset = study_dataset.unbatch() 139 | 140 | # Make the tile pixels easier to find in each study_dataset element. Also, tack 141 | # on additional elements to the tuple so that the form is (inputs, targets, 142 | # sample_weights). 143 | # print(f"Build study_dataset pop: begin {datetime.datetime.now()}") 144 | study_dataset = study_dataset.map( 145 | lambda elem: ((elem.pop("tile_pixels"), elem),), **self.dataset_map_options 146 | ) 147 | study_dataset = study_dataset.map( 148 | lambda elem: (elem, None, None), **self.dataset_map_options 149 | ) 150 | # print(f"Build study_dataset pop: end {datetime.datetime.now()}") 151 | 152 | # By default `private_threadpool_size` is set to 0, which means that Tensorflow 153 | # is free to choose the number without limit. However, Tensorflow can grind to 154 | # a halt when processing a large dataset with this default behavior on GPU. A 155 | # value of 1 for `private_threadpool_size` runs more quickly than other values 156 | # on some tests we tried. Changing `private_threadpool_size` here is achieved 157 | # as a transformation of the dataset with an `options` object. 158 | options = tf.data.Options() 159 | options.threading.private_threadpool_size = private_threadpool_size 160 | study_dataset = study_dataset.with_options(options) 161 | 162 | return study_dataset 163 | 164 | @staticmethod 165 | def _read_and_split_chunk(elem): 166 | # Get chunk's pixel data from disk and load it into chunk_as_tensor. 167 | # Note that if elem["factor"] differs from 1.0 then this chunk will have 168 | # num_rows ((chunk_bottom - chunk_top) / factor, and num_columns = 169 | # ((chunk_right - chunk_left) / factor. 
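        # (E.g., target_magnification 20 read at returned_magnification 40 gives
        # factor 0.5, so a 256 x 256 tile at target magnification is cropped here
        # as 512 x 512.)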
170 | # tf.print("#_read_and_split_chunk begin") 171 | zero = tf.constant(0, dtype=tf.int32) 172 | one = tf.constant(1, dtype=tf.int32) 173 | epsilon = tf.constant(0.01, dtype=tf.float32) 174 | 175 | factor = tf.cast(elem["target_magnification"], dtype=tf.float32) / tf.cast( 176 | elem["returned_magnification"], dtype=tf.float32 177 | ) 178 | chunk_as_tensor = tf.py_function( 179 | func=CreateTensorFlowDataset._py_read_chunk, 180 | inp=[ 181 | elem["chunk_top"], 182 | elem["chunk_left"], 183 | elem["chunk_bottom"], 184 | elem["chunk_right"], 185 | elem["filename"], 186 | elem["returned_magnification"], 187 | factor, 188 | ], 189 | Tout=tf.uint8, 190 | ) 191 | num_tiles = tf.size(elem["tiles_top"]) 192 | tiles = tf.TensorArray(dtype=tf.uint8, size=num_tiles) 193 | 194 | scaled_tile_height = tf.cast( 195 | tf.math.floor( 196 | tf.cast(elem["tile_height"], dtype=tf.float32) / factor + epsilon 197 | ), 198 | dtype=tf.int32, 199 | ) 200 | scaled_tile_width = tf.cast( 201 | tf.math.floor( 202 | tf.cast(elem["tile_width"], dtype=tf.float32) / factor + epsilon 203 | ), 204 | dtype=tf.int32, 205 | ) 206 | scaled_chunk_top = tf.cast( 207 | tf.math.floor( 208 | tf.cast(elem["chunk_top"], dtype=tf.float32) / factor + epsilon 209 | ), 210 | dtype=tf.int32, 211 | ) 212 | scaled_chunk_left = tf.cast( 213 | tf.math.floor( 214 | tf.cast(elem["chunk_left"], dtype=tf.float32) / factor + epsilon 215 | ), 216 | dtype=tf.int32, 217 | ) 218 | 219 | def condition(i, _): 220 | return tf.less(i, num_tiles) 221 | 222 | def body(i, tiles): 223 | return ( 224 | i + one, 225 | tiles.write( 226 | i, 227 | tf.image.crop_to_bounding_box( 228 | chunk_as_tensor, 229 | tf.cast( 230 | tf.math.floor( 231 | tf.cast( 232 | tf.gather(elem["tiles_top"], i), dtype=tf.float32 233 | ) 234 | / factor 235 | + epsilon 236 | ), 237 | dtype=tf.int32, 238 | ) 239 | - scaled_chunk_top, 240 | tf.cast( 241 | tf.math.floor( 242 | tf.cast( 243 | tf.gather(elem["tiles_left"], i), dtype=tf.float32 244 | ) 245 | / factor 246 | + epsilon 247 | ), 248 | dtype=tf.int32, 249 | ) 250 | - scaled_chunk_left, 251 | scaled_tile_height, 252 | scaled_tile_width, 253 | ), 254 | ), 255 | ) 256 | 257 | _, tiles = tf.while_loop(condition, body, [zero, tiles]) 258 | tiles = tiles.stack() 259 | 260 | response = { 261 | **{ 262 | key: tf.repeat(elem[key], num_tiles) 263 | for key in elem.keys() 264 | if key not in ("tiles_top", "tiles_left") 265 | }, 266 | "tile_top": elem["tiles_top"], 267 | "tile_left": elem["tiles_left"], 268 | "tile_pixels": tiles, 269 | } 270 | 271 | # tf.print("#_read_and_split_chunk end") 272 | return response 273 | 274 | @staticmethod 275 | def _py_read_chunk( 276 | chunk_top, 277 | chunk_left, 278 | chunk_bottom, 279 | chunk_right, 280 | filename, 281 | returned_magnification, 282 | factor, 283 | ): 284 | """ 285 | Read from disk all the pixel data for a specific chunk of the 286 | whole slide. 
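
        Coordinates are supplied at the target magnification and are divided by
        `factor` to index pixels at the magnification actually read.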
287 | """ 288 | 289 | # if "_num_chunks" not in CreateTensorFlowDataset._py_read_chunk.__dict__: 290 | # CreateTensorFlowDataset._py_read_chunk._num_chunks = 0 291 | # chunk_name = ( 292 | # f"#_py_read_chunk {CreateTensorFlowDataset._py_read_chunk._num_chunks:06}" 293 | # ) 294 | # CreateTensorFlowDataset._py_read_chunk._num_chunks += 1 295 | 296 | # print(f"{chunk_name} begin {datetime.datetime.now()}") 297 | filename = filename.numpy().decode("utf-8") 298 | chunk_top = math.floor(chunk_top.numpy() / factor.numpy() + 0.01) 299 | chunk_left = math.floor(chunk_left.numpy() / factor.numpy() + 0.01) 300 | chunk_bottom = math.floor(chunk_bottom.numpy() / factor.numpy() + 0.01) 301 | chunk_right = math.floor(chunk_right.numpy() / factor.numpy() + 0.01) 302 | returned_magnification = returned_magnification.numpy() 303 | 304 | # print(f"{chunk_name} begin1 {datetime.datetime.now()}") 305 | # Call to the superclass to get the pixel data for this chunk 306 | chunk = configure.ChunkLocations.read_large_image( 307 | filename, 308 | chunk_top, 309 | chunk_left, 310 | chunk_bottom, 311 | chunk_right, 312 | returned_magnification, 313 | ) 314 | # print(f"{chunk_name} begin2 {datetime.datetime.now()}") 315 | 316 | # Do we want to support other than RGB?!!! 317 | chunk = chunk[..., :3] 318 | # print(f"{chunk_name} end {datetime.datetime.now()}") 319 | return chunk 320 | -------------------------------------------------------------------------------- /example/tensorflow_stream.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b0c50c61", 6 | "metadata": {}, 7 | "source": [ 8 | "# Demonstration of histomics_stream\n", 9 | "\n", 10 | "Click to open in [[GitHub](https://github.com/DigitalSlideArchive/HistomicsStream/tree/master/example/tensorflow_stream.ipynb)] [[Google Colab](https://colab.research.google.com/github/DigitalSlideArchive/HistomicsStream/blob/master/example/tensorflow_stream.ipynb)]\n", 11 | "\n", 12 | "The `histomics_stream` Python package sits at the start of any machine learning workflow that is built on the TensorFlow machine learning library. The package is responsible for efficient access to the input image data that will be used to fit a new machine learning model or will be used to predict regions of interest in novel inputs using an already learned model." 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "10f22613", 18 | "metadata": {}, 19 | "source": [ 20 | "## Installation\n", 21 | "\n", 22 | "If you are running this notebook on Google Colab or another system where `histomics_stream` and its dependencies are not yet installed then they can be installed with the following commands. Note that image readers in addition to openslide are also supported by using, e.g., `large_image[bioformats,ometiff,openjpeg,openslide,tiff]` on the below pip install command line." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "9ac13166-ba70-495b-be71-43036afc5cb7", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Get histomics_stream and its dependencies\n", 33 | "!apt update\n", 34 | "!apt install -y python3-openslide openslide-tools\n", 35 | "!pip install 'large_image[openslide,tiff]' --find-links https://girder.github.io/large_image_wheels\n", 36 | "!pip install histomics_stream[tensorflow]\n", 37 | "\n", 38 | "# Get other packages used in this notebook\n", 39 | "# N.B. 
itkwidgets works with jupyter<=3.0.0\n", 40 | "!apt install libcudnn8 libcudnn8-dev\n", 41 | "!pip install histomics_detect pooch itkwidgets\n", 42 | "!jupyter labextension install @jupyter-widgets/jupyterlab-manager jupyter-matplotlib jupyterlab-datawidgets itkwidgets\n", 43 | "\n", 44 | "print(\n", 45 | " \"\\nNOTE!: On Google Colab you may need to choose 'Runtime->Restart runtime' for these updates to take effect.\"\n", 46 | ")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "1b4b1fd0", 52 | "metadata": {}, 53 | "source": [ 54 | "## Fetching and creating the test data\n", 55 | "This notebook has demonstrations that use the files `TCGA-AN-A0G0-01Z-00-DX1.svs` (365 MB) and `TCGA-AN-A0G0-01Z-00-DX1.mask.png` (4 kB), The pooch commands will fetch them if they are not already available." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "8b9784b2", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "import os\n", 66 | "import pooch\n", 67 | "\n", 68 | "# download whole slide image\n", 69 | "wsi_path = pooch.retrieve(\n", 70 | " fname=\"TCGA-AN-A0G0-01Z-00-DX1.svs\",\n", 71 | " url=\"https://drive.google.com/uc?export=download&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV&confirm=t&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632\",\n", 72 | " known_hash=\"d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f\",\n", 73 | " path=str(pooch.os_cache(\"pooch\")) + os.sep + \"wsi\",\n", 74 | ")\n", 75 | "print(f\"Have {wsi_path}\")\n", 76 | "\n", 77 | "# download binary mask image\n", 78 | "mask_path = pooch.retrieve(\n", 79 | " fname=\"TCGA-AN-A0G0-01Z-00-DX1.mask.png\",\n", 80 | " url=\"https://drive.google.com/uc?export=download&id=17GOOHbL8Bo3933rdIui82akr7stbRfta\",\n", 81 | " known_hash=\"bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171\",\n", 82 | " path=str(pooch.os_cache(\"pooch\")) + os.sep + \"wsi\",\n", 83 | ")\n", 84 | "print(f\"Have {mask_path}\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "cb4179b8", 90 | "metadata": {}, 91 | "source": [ 92 | "## Creating a study for use with histomics_stream\n", 93 | "\n", 94 | "We describe the input and desired parameters using standard Python lists and dictionaries. Here we give a high-level configuration; selection of tiles is done subsequently.\n", 95 | "\n", 96 | "N.B.: __*all*__ values that are number of pixels are based upon the `target_magnification` that is supplied to `FindResolutionForSlide`. This includes pixel sizes of a slide, chunk, or tile and it includes the pixel coordinates for a chunk or tile. It applies whether the numbers are supplied to histomics_stream or returned by histomics_stream. However, if the `magnification_source` is not `exact` the `returned_magnification` may not equal the `target_magnification`; to get the number of pixels that is relevant for the `returned_magnification`, typically these numbers of pixels are multiplied by the ratio `returned_magnification / target_magnification`. In particular, the *pixel size of the returned tiles* will be the requested size times this ratio." 
97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "0de1e5a5-58ed-4cc9-9348-9e22e0c9fa23", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "import histomics_stream as hs\n", 107 | "import histomics_stream.tensorflow\n", 108 | "import tensorflow as tf" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "08cfbc01-1b50-426e-ac4e-9c73916329d4", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Create a study and insert study-wide information.\n", 119 | "# Add a slide to the study, including slide-wide information with it.\n", 120 | "my_study0 = dict(\n", 121 | " version=\"version-1\",\n", 122 | " tile_height=256,\n", 123 | " tile_width=256,\n", 124 | " overlap_height=0,\n", 125 | " overlap_width=0,\n", 126 | " slides=dict(\n", 127 | " Slide_0=dict(\n", 128 | " filename=wsi_path,\n", 129 | " slide_name=os.path.splitext(os.path.split(wsi_path)[1])[0],\n", 130 | " slide_group=\"Group 3\",\n", 131 | " chunk_height=2048,\n", 132 | " chunk_width=2048,\n", 133 | " )\n", 134 | " ),\n", 135 | ")\n", 136 | "\n", 137 | "# For each slide, find the appropriate resolution given the target_magnification and\n", 138 | "# magnification_tolerance. In this example, we use the same parameters for each slide,\n", 139 | "# but this is not required generally.\n", 140 | "find_slide_resolution = hs.configure.FindResolutionForSlide(\n", 141 | " my_study0, target_magnification=20, magnification_source=\"native\"\n", 142 | ")\n", 143 | "for slide in my_study0[\"slides\"].values():\n", 144 | " find_slide_resolution(slide)\n", 145 | "print(f\"my_study0 = {my_study0}\")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "dd18bd4e", 151 | "metadata": {}, 152 | "source": [ 153 | "## Tile selection\n", 154 | "\n", 155 | "We are going to demonstrate several approaches to choosing tiles. Each approach will start with its own copy of the `my_study0` that we have built so far." 
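,
"\n",
"\n",
"Each selector below follows the same pattern, sketched here with names assumed from this notebook (the real, runnable calls are in the cells that follow):\n",
"\n",
"```python\n",
"import copy\n",
"\n",
"study_copy = copy.deepcopy(my_study0)  # each approach gets a fresh copy\n",
"selector = hs.configure.TilesRandomly(study_copy, randomly_select=5)\n",
"for slide in study_copy[\"slides\"].values():\n",
"    selector(slide)  # writes the selected tiles into the slide dictionary\n",
"```"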
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "4b4e5990", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "import copy" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "56e2d816", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Demonstrate TilesByGridAndMask without a mask\n", 176 | "my_study_by_grid = copy.deepcopy(my_study0)\n", 177 | "tiles_by_grid = hs.configure.TilesByGridAndMask(\n", 178 | " my_study_by_grid, overlap_height=32, overlap_width=32, randomly_select=5\n", 179 | ")\n", 180 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 181 | "# this example.\n", 182 | "for slide in my_study_by_grid[\"slides\"].values():\n", 183 | " tiles_by_grid(slide)\n", 184 | "# Take a look at what we have made\n", 185 | "print(f\"==== The entire dictionary is now ==== \\nmy_study_by_grid = {my_study_by_grid}\")\n", 186 | "just_tiles = tiles_by_grid.get_tiles(my_study_by_grid)\n", 187 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "id": "018d44a8", 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# Demonstrate TilesByGridAndMask with a mask\n", 198 | "my_study_by_grid_and_mask = copy.deepcopy(my_study0)\n", 199 | "tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(\n", 200 | " my_study_by_grid_and_mask, mask_filename=mask_path, randomly_select=10\n", 201 | ")\n", 202 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 203 | "# this example.\n", 204 | "for slide in my_study_by_grid_and_mask[\"slides\"].values():\n", 205 | " tiles_by_grid_and_mask(slide)\n", 206 | "# Take a look at what we have made\n", 207 | "print(\n", 208 | " f\"==== The entire dictionary is now ==== \\nmy_study_by_grid_and_mask = {my_study_by_grid_and_mask}\"\n", 209 | ")\n", 210 | "just_tiles = tiles_by_grid_and_mask.get_tiles(my_study_by_grid_and_mask)\n", 211 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "id": "91970864", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# Demonstrate TilesByList\n", 222 | "my_study_by_list = copy.deepcopy(my_study0)\n", 223 | "tiles_by_list = hs.configure.TilesByList(\n", 224 | " my_study_by_list,\n", 225 | " randomly_select=5,\n", 226 | " tiles_dictionary=my_study_by_grid[\"slides\"][\"Slide_0\"][\"tiles\"],\n", 227 | ")\n", 228 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 229 | "# this example.\n", 230 | "for slide in my_study_by_list[\"slides\"].values():\n", 231 | " tiles_by_list(slide)\n", 232 | "# Take a look at what we have made\n", 233 | "print(f\"==== The entire dictionary is now ==== \\nmy_study_by_list = {my_study_by_list}\")\n", 234 | "just_tiles = tiles_by_list.get_tiles(my_study_by_list)\n", 235 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "id": "e120014f", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "# Demonstrate TilesRandomly\n", 246 | "my_study_randomly = copy.deepcopy(my_study0)\n", 247 | "tiles_randomly = 
hs.configure.TilesRandomly(my_study_randomly, randomly_select=10)\n", 248 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 249 | "# this example.\n", 250 | "for slide in my_study_randomly[\"slides\"].values():\n", 251 | " tiles_randomly(slide)\n", 252 | "# Take a look at what we have made\n", 253 | "print(\n", 254 | " f\"==== The entire dictionary is now ==== \\nmy_study_randomly = {my_study_randomly}\"\n", 255 | ")\n", 256 | "just_tiles = tiles_randomly.get_tiles(my_study_randomly)\n", 257 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "905bcb07", 263 | "metadata": {}, 264 | "source": [ 265 | "## Creating a TensorFlow Dataset\n", 266 | "\n", 267 | "We request tiles indicated by the mask and create a tensorflow Dataset that has the image data for these tiles as well as associated parameters for each tile, such as its location." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "6618f2e1", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "# Demonstrate TilesByGridAndMask with a mask\n", 278 | "my_study = copy.deepcopy(my_study0)\n", 279 | "tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(\n", 280 | " my_study, mask_filename=mask_path, mask_threshold=0.5, randomly_select=100\n", 281 | ")\n", 282 | "for slide in my_study[\"slides\"].values():\n", 283 | " tiles_by_grid_and_mask(slide)\n", 284 | "print(\"Finished selecting tiles.\")\n", 285 | "\n", 286 | "create_tensorflow_dataset = hs.tensorflow.CreateTensorFlowDataset()\n", 287 | "tiles = create_tensorflow_dataset(my_study)\n", 288 | "print(\"Finished with CreateTensorFlowDataset\")\n", 289 | "print(f\"... with tile shape = {tiles.take(1).get_single_element()[0][0].shape}\")" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "id": "72421b0a", 295 | "metadata": {}, 296 | "source": [ 297 | "## Fetch a model for prediction\n", 298 | "\n", 299 | "We fetch a model (840 MB compressed, 1.3 GB decompressed) that we will use to make predictions.\n", 300 | "\n", 301 | "Because each element of our Dataset is a tuple `(rgb_image_data, dictionary_of_annotation)`, a typical model that accepts only the former as its input needs to be wrapped.\n", 302 | "\n", 303 | "Note that this model assumes that the tiles/images are not batched, with the understanding that if there is enough memory to do batching then one should instead choose a larger tile size. 
" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "id": "dfbf1170", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "# download trained model.\n", 314 | "model_path = pooch.retrieve(\n", 315 | " fname=\"tcga_brca_model\",\n", 316 | " url=\"https://drive.google.com/uc?export=download&id=1KxB6iAn9j2Wp7oyFlV4T1Kli-mR8-35G&confirm=t&uuid=c5df8dfd-ed48-4cef-81a0-19df97677fe5&at=ALgDtswWzs0BEdkVNgFrp83p9NDO:1679111246793\",\n", 317 | " known_hash=\"b5b5444cc8874d17811a89261abeafd9b9603e7891a8b2a98d8f13e2846a6689\",\n", 318 | " path=str(pooch.os_cache(\"pooch\")) + os.sep + \"model\",\n", 319 | " processor=pooch.Unzip(),\n", 320 | ")\n", 321 | "model_path = os.path.split(model_path[0])[0]\n", 322 | "print(f\"Have {model_path}.\")\n", 323 | "\n", 324 | "# restore keras model\n", 325 | "from histomics_detect.models import FasterRCNN\n", 326 | "\n", 327 | "model = tf.keras.models.load_model(\n", 328 | " model_path, custom_objects={\"FasterRCNN\": FasterRCNN}\n", 329 | ")\n", 330 | "\n", 331 | "\n", 332 | "# Each element of the `tiles` tensorflow Dataset is a (rgb_image_data, dictionary_of_annotation) pair.\n", 333 | "# Wrap the unwrapped_model so that it knows to use the image.\n", 334 | "class WrappedModel(tf.keras.Model):\n", 335 | " def __init__(self, model, *args, **kwargs):\n", 336 | " super(WrappedModel, self).__init__(*args, **kwargs)\n", 337 | " self.model = model\n", 338 | "\n", 339 | " def call(self, element):\n", 340 | " return (self.model(element[0]), element[1])\n", 341 | "\n", 342 | "\n", 343 | "unwrapped_model = model\n", 344 | "model = WrappedModel(unwrapped_model)\n", 345 | "print(\"Model built and wrapped.\")" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "id": "4614c2a3", 351 | "metadata": {}, 352 | "source": [ 353 | "## Make predictions" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "id": "b050a930", 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "import time\n", 364 | "\n", 365 | "print(\"Starting predictions\")\n", 366 | "start_time = time.time()\n", 367 | "# This model assumes that the tiles are not batched. 
Do not use, e.g., tiles.batch(32).\n", 368 | "predictions = model.predict(tiles)\n", 369 | "end_time = time.time()\n", 370 | "num_inputs = len([0 for tile in tiles])\n", 371 | "num_predictions = predictions[0].shape[0]\n", 372 | "print(\n", 373 | " f\"Made {num_predictions} predictions for {num_inputs} tiles in {end_time - start_time} s.\"\n", 374 | ")\n", 375 | "print(f\"Average of {(end_time - start_time) / num_inputs} s per tile.\")" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "id": "09fc739b", 381 | "metadata": {}, 382 | "source": [ 383 | "## Look at internals" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "id": "1144f373", 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "my_element = tiles.take(1).get_single_element()\n", 394 | "my_pair = my_element[0]\n", 395 | "my_target = my_element[1]\n", 396 | "my_weight = my_element[2]\n", 397 | "my_image = my_pair[0]\n", 398 | "my_annotation = my_pair[1]\n", 399 | "\n", 400 | "print(f\" type(my_element) = {type(my_element)}\")\n", 401 | "print(f\" len(my_element) = {len(my_element)}\")\n", 402 | "print(f\" type(my_pair) = {type(my_pair)}\")\n", 403 | "print(f\" len(my_pair) = {len(my_pair)}\")\n", 404 | "print(f\" type(my_target) = {type(my_target)}\")\n", 405 | "print(f\" type(my_weight) = {type(my_weight)}\")\n", 406 | "print(f\" type(my_image) = {type(my_image)}\")\n", 407 | "print(f\" my_image.shape = {my_image.shape}\")\n", 408 | "print(f\"type(my_annotation) = {type(my_annotation)}\")" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "id": "d492e513", 414 | "metadata": {}, 415 | "source": [ 416 | "## Display a tile" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "id": "9531e48d", 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "import itk, itkwidgets\n", 427 | "\n", 428 | "itkwidgets.view(itk.image_from_array(my_image.numpy(), is_vector=True))" 429 | ] 430 | } 431 | ], 432 | "metadata": { 433 | "kernelspec": { 434 | "display_name": "Python 3 (ipykernel)", 435 | "language": "python", 436 | "name": "python3" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 3 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython3", 448 | "version": "3.8.10" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 5 453 | } 454 | -------------------------------------------------------------------------------- /example/pytorch_stream.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f87613a9", 6 | "metadata": {}, 7 | "source": [ 8 | "# Demonstration of histomics_stream\n", 9 | "\n", 10 | "Click to open in [[GitHub](https://github.com/DigitalSlideArchive/HistomicsStream/tree/master/example/pytorch_stream.ipynb)] [[Google Colab](https://colab.research.google.com/github/DigitalSlideArchive/HistomicsStream/blob/master/example/pytorch_stream.ipynb)]\n", 11 | "\n", 12 | "The `histomics_stream` Python package sits at the start of any machine learning workflow that is built on the PyTorch machine learning library. The package is responsible for efficient access to the input image data that will be used to fit a new machine learning model or will be used to predict regions of interest in novel inputs using an already learned model."
13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "a8490f25", 18 | "metadata": {}, 19 | "source": [ 20 | "## Installation\n", 21 | "\n", 22 | "If you are running this notebook on Google Colab or another system where `histomics_stream` and its dependencies are not yet installed then they can be installed with the following commands. Note that image readers in addition to openslide are also supported by using, e.g., `large_image[bioformats,ometiff,openjpeg,openslide,tiff]` on the below pip install command line." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "5aa174fa-c59b-42a5-ae59-7b28d3b3c50d", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Get histomics_stream and its dependencies\n", 33 | "!apt update\n", 34 | "!apt install -y python3-openslide openslide-tools\n", 35 | "!pip install 'large_image[openslide,tiff]' --find-links https://girder.github.io/large_image_wheels\n", 36 | "!pip install histomics_stream[torch]\n", 37 | "\n", 38 | "# Get other packages used in this notebook\n", 39 | "# N.B. itkwidgets works with jupyter<=3.0.0\n", 40 | "!apt install libcudnn8 libcudnn8-dev\n", 41 | "!pip install pooch itkwidgets\n", 42 | "!jupyter labextension install @jupyter-widgets/jupyterlab-manager jupyter-matplotlib jupyterlab-datawidgets itkwidgets\n", 43 | "\n", 44 | "print(\n", 45 | " \"\\nNOTE!: On Google Colab you may need to choose 'Runtime->Restart runtime' for these updates to take effect.\"\n", 46 | ")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "ed2efd66", 52 | "metadata": {}, 53 | "source": [ 54 | "## Fetching and creating the test data\n", 55 | "This notebook has demonstrations that use the files `TCGA-AN-A0G0-01Z-00-DX1.svs` (365 MB) and `TCGA-AN-A0G0-01Z-00-DX1.mask.png` (4 kB). The pooch commands will fetch them if they are not already available." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "b2ea3c60", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "import os\n", 66 | "import pooch\n", 67 | "\n", 68 | "# download whole slide image\n", 69 | "wsi_path = pooch.retrieve(\n", 70 | " fname=\"TCGA-AN-A0G0-01Z-00-DX1.svs\",\n", 71 | " url=\"https://drive.google.com/uc?export=download&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV&confirm=t&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632\",\n", 72 | " known_hash=\"d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f\",\n", 73 | " path=str(pooch.os_cache(\"pooch\")) + os.sep + \"wsi\",\n", 74 | ")\n", 75 | "print(f\"Have {wsi_path}\")\n", 76 | "\n", 77 | "# download binary mask image\n", 78 | "mask_path = pooch.retrieve(\n", 79 | " fname=\"TCGA-AN-A0G0-01Z-00-DX1.mask.png\",\n", 80 | " url=\"https://drive.google.com/uc?export=download&id=17GOOHbL8Bo3933rdIui82akr7stbRfta\",\n", 81 | " known_hash=\"bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171\",\n", 82 | " path=str(pooch.os_cache(\"pooch\")) + os.sep + \"wsi\",\n", 83 | ")\n", 84 | "print(f\"Have {mask_path}\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "4274b5d6", 90 | "metadata": {}, 91 | "source": [ 92 | "## Creating a study for use with histomics_stream\n", 93 | "\n", 94 | "We describe the input and desired parameters using standard Python lists and dictionaries. 
Here we give a high-level configuration; selection of tiles is done subsequently.\n", 95 | "\n", 96 | "N.B.: __*all*__ values that are number of pixels are based upon the `target_magnification` that is supplied to `FindResolutionForSlide`. This includes pixel sizes of a slide, chunk, or tile and it includes the pixel coordinates for a chunk or tile. It applies whether the numbers are supplied to histomics_stream or returned by histomics_stream. However, if the `magnification_source` is not `exact` the `returned_magnification` may not equal the `target_magnification`; to get the number of pixels that is relevant for the `returned_magnification`, typically these numbers of pixels are multiplied by the ratio `returned_magnification / target_magnification`. In particular, the *pixel size of the returned tiles* will be the requested size times this ratio." 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "1e17612d-0216-4652-92cd-d8ea5e0ac6d7", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "import histomics_stream as hs\n", 107 | "import histomics_stream.pytorch\n", 108 | "import torch" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "effa3803-fc82-4bd2-93f1-538de00d7607", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Create a study and insert study-wide information.\n", 119 | "# Add a slide to the study, including slide-wide information with it.\n", 120 | "my_study0 = dict(\n", 121 | " version=\"version-1\",\n", 122 | " tile_height=256,\n", 123 | " tile_width=256,\n", 124 | " overlap_height=0,\n", 125 | " overlap_width=0,\n", 126 | " slides=dict(\n", 127 | " Slide_0=dict(\n", 128 | " filename=wsi_path,\n", 129 | " slide_name=os.path.splitext(os.path.split(wsi_path)[1])[0],\n", 130 | " slide_group=\"Group 3\",\n", 131 | " chunk_height=2048,\n", 132 | " chunk_width=2048,\n", 133 | " )\n", 134 | " ),\n", 135 | ")\n", 136 | "\n", 137 | "# For each slide, find the appropriate resolution given the target_magnification and\n", 138 | "# magnification_tolerance. In this example, we use the same parameters for each slide,\n", 139 | "# but this is not required generally.\n", 140 | "find_slide_resolution = hs.configure.FindResolutionForSlide(\n", 141 | " my_study0, target_magnification=20, magnification_source=\"exact\"\n", 142 | ")\n", 143 | "for slide in my_study0[\"slides\"].values():\n", 144 | " find_slide_resolution(slide)\n", 145 | "print(f\"my_study0 = {my_study0}\")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "0fde9d2e", 151 | "metadata": {}, 152 | "source": [ 153 | "## Tile selection\n", 154 | "\n", 155 | "We are going to demonstrate several approaches to choosing tiles. Each approach will start with its own copy of the `my_study0` that we have built so far." 
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "4ca79608", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "import copy" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "cba3ab43", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Demonstrate TilesByGridAndMask without a mask\n", 176 | "my_study_by_grid = copy.deepcopy(my_study0)\n", 177 | "tiles_by_grid = hs.configure.TilesByGridAndMask(\n", 178 | " my_study_by_grid, overlap_height=32, overlap_width=32, randomly_select=5\n", 179 | ")\n", 180 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 181 | "# this example.\n", 182 | "for slide in my_study_by_grid[\"slides\"].values():\n", 183 | " tiles_by_grid(slide)\n", 184 | "# Take a look at what we have made\n", 185 | "print(f\"==== The entire dictionary is now ==== \\nmy_study_by_grid = {my_study_by_grid}\")\n", 186 | "just_tiles = tiles_by_grid.get_tiles(my_study_by_grid)\n", 187 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "id": "953ebb17", 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# Demonstrate TilesByGridAndMask with a mask\n", 198 | "my_study_by_grid_and_mask = copy.deepcopy(my_study0)\n", 199 | "tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(\n", 200 | " my_study_by_grid_and_mask, mask_filename=mask_path, randomly_select=10\n", 201 | ")\n", 202 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 203 | "# this example.\n", 204 | "for slide in my_study_by_grid_and_mask[\"slides\"].values():\n", 205 | " tiles_by_grid_and_mask(slide)\n", 206 | "# Take a look at what we have made\n", 207 | "print(\n", 208 | " f\"==== The entire dictionary is now ==== \\nmy_study_by_grid_and_mask = {my_study_by_grid_and_mask}\"\n", 209 | ")\n", 210 | "just_tiles = tiles_by_grid_and_mask.get_tiles(my_study_by_grid_and_mask)\n", 211 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "id": "f341e882", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# Demonstrate TilesByList\n", 222 | "my_study_by_list = copy.deepcopy(my_study0)\n", 223 | "tiles_by_list = hs.configure.TilesByList(\n", 224 | " my_study_by_list,\n", 225 | " randomly_select=5,\n", 226 | " tiles_dictionary=my_study_by_grid[\"slides\"][\"Slide_0\"][\"tiles\"],\n", 227 | ")\n", 228 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 229 | "# this example.\n", 230 | "for slide in my_study_by_list[\"slides\"].values():\n", 231 | " tiles_by_list(slide)\n", 232 | "# Take a look at what we have made\n", 233 | "print(f\"==== The entire dictionary is now ==== \\nmy_study_by_list = {my_study_by_list}\")\n", 234 | "just_tiles = tiles_by_list.get_tiles(my_study_by_list)\n", 235 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "id": "9bc2770f", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "# Demonstrate TilesRandomly\n", 246 | "my_study_randomly = copy.deepcopy(my_study0)\n", 247 | "tiles_randomly = 
hs.configure.TilesRandomly(my_study_randomly, randomly_select=10)\n", 248 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 249 | "# this example.\n", 250 | "for slide in my_study_randomly[\"slides\"].values():\n", 251 | " tiles_randomly(slide)\n", 252 | "# Take a look at what we have made\n", 253 | "print(\n", 254 | " f\"==== The entire dictionary is now ==== \\nmy_study_randomly = {my_study_randomly}\"\n", 255 | ")\n", 256 | "just_tiles = tiles_randomly.get_tiles(my_study_randomly)\n", 257 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "e35fe040", 263 | "metadata": {}, 264 | "source": [ 265 | "## Creating a PyTorch DataLoader\n", 266 | "\n", 267 | "We request tiles indicated by the mask and create a PyTorch `DataLoader` that yields the image data for these tiles as well as associated parameters for each tile, such as its location." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "0d272866", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "# Demonstrate TilesByGridAndMask with a mask\n", 278 | "my_study = copy.deepcopy(my_study0)\n", 279 | "tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(\n", 280 | " my_study, mask_filename=mask_path, mask_threshold=0.5, randomly_select=100\n", 281 | ")\n", 282 | "for slide in my_study[\"slides\"].values():\n", 283 | " tiles_by_grid_and_mask(slide)\n", 284 | "print(\"Finished selecting tiles.\")\n", 285 | "\n", 286 | "create_pytorch_dataloader = hs.pytorch.CreateTorchDataloader()\n", 287 | "tiles = create_pytorch_dataloader(my_study)\n", 288 | "print(\"Finished with CreateTorchDataloader\")\n", 289 | "# print(f\"{tile = }\")\n", 290 | "# print(f\"... with tile shape = {tiles.take(1).get_single_element()[0][0].shape}\")" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "id": "800f2502", 296 | "metadata": {}, 297 | "source": [ 298 | "## Build a model for prediction\n", 299 | "\n", 300 | "We build an arbitrary but reasonable model for demonstration purposes.\n", 301 | "\n", 302 | "Because each element of our Dataset is a tuple `(rgb_image_data, dictionary_of_annotation)`, a typical model that accepts only the former as its input needs to be wrapped."
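,
"\n",
"\n",
"The wrapping idea in miniature (a sketch only; the full `WrapModel` used in this notebook follows in the next cell):\n",
"\n",
"```python\n",
"class Wrapped(torch.nn.Module):\n",
"    def __init__(self, inner):\n",
"        super().__init__()\n",
"        self.inner = inner\n",
"\n",
"    def forward(self, element):  # element is (rgb_image_data, annotation)\n",
"        return self.inner(element[0]), element[1]\n",
"```"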
303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "id": "fd890cf5", 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "class MyTorchModel(torch.nn.modules.module.Module):\n", 313 | " def __init__(\n", 314 | " self, in_channels, tile_height, tile_width, num_categories, kernel_size\n", 315 | " ):\n", 316 | " print(f\"{in_channels = }\")\n", 317 | " print(f\"{tile_height = }\")\n", 318 | " print(f\"{tile_width = }\")\n", 319 | " print(f\"{num_categories = }\")\n", 320 | " print(f\"{kernel_size = }\")\n", 321 | " super(MyTorchModel, self).__init__()\n", 322 | " out1_channels = 2 * in_channels\n", 323 | " padding = tuple(int((k - 1) // 2) for k in kernel_size)\n", 324 | " self.conv1 = torch.nn.Conv2d(\n", 325 | " in_channels, out1_channels, kernel_size, padding=padding\n", 326 | " )\n", 327 | " out2_channels = 4 * in_channels\n", 328 | " self.conv2 = torch.nn.Conv2d(\n", 329 | " out1_channels, out2_channels, kernel_size, padding=padding\n", 330 | " )\n", 331 | " self.relu = torch.nn.ReLU()\n", 332 | " self.pool = torch.nn.MaxPool2d(2, 2)\n", 333 | " self.flat_size = int(\n", 334 | " in_channels * tile_height * tile_width / (out2_channels / in_channels)\n", 335 | " )\n", 336 | " self.fc1 = torch.nn.Linear(self.flat_size, 128)\n", 337 | " self.fc2 = torch.nn.Linear(128, num_categories)\n", 338 | "\n", 339 | " def forward(self, x):\n", 340 | " x = self.pool(self.relu(self.conv1(x)))\n", 341 | " x = self.pool(self.relu(self.conv2(x)))\n", 342 | " x = x.view(-1, self.flat_size)\n", 343 | " x = self.relu(self.fc1(x))\n", 344 | " x = self.relu(self.fc2(x))\n", 345 | " return x\n", 346 | "\n", 347 | "\n", 348 | "unwrapped_model = MyTorchModel(\n", 349 | " in_channels=3,\n", 350 | " tile_height=my_study_randomly[\"tile_height\"],\n", 351 | " tile_width=my_study_randomly[\"tile_width\"],\n", 352 | " num_categories=2,\n", 353 | " kernel_size=(5, 5),\n", 354 | ")\n", 355 | "\n", 356 | "# At this point it would be standard to train the model. 
This example is so dumb that\n", 357 | "# we won't do that here.\n", 358 | "\n", 359 | "\n", 360 | "class WrapModel(torch.nn.modules.module.Module):\n", 361 | " def __init__(self, model, *args, **kwargs):\n", 362 | " super(WrapModel, self).__init__(*args, **kwargs)\n", 363 | " self.model = unwrapped_model\n", 364 | "\n", 365 | " def forward(self, x):\n", 366 | " p = self.model(x[0])\n", 367 | " return p, x[1]\n", 368 | "\n", 369 | "\n", 370 | "model = WrapModel(unwrapped_model)\n", 371 | "print(\"Model created\")" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "id": "6e687409", 377 | "metadata": {}, 378 | "source": [ 379 | "## Make predictions" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "id": "e1e890c9-9400-4324-ba6d-22d3aae90669", 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "import time\n", 390 | "\n", 391 | "print(\"Starting predictions\")\n", 392 | "start_time = time.time()\n", 393 | "# Consider adding a batch factor to the data loader\n", 394 | "predictions = [model(tile) for tile in tiles]\n", 395 | "end_time = time.time()\n", 396 | "print(\"Done predicting\")\n", 397 | "num_inputs = len([0 for tile in tiles])\n", 398 | "num_predictions = len(predictions)\n", 399 | "print(\n", 400 | " f\"Made {num_predictions} predictions for {num_inputs} tiles \"\n", 401 | " f\"in {end_time - start_time} s.\"\n", 402 | ")\n", 403 | "print(f\"Average of {(end_time - start_time) / num_inputs} s per tile.\")" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "id": "1f16b044", 409 | "metadata": {}, 410 | "source": [ 411 | "## Look at internals" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "id": "c277a4c8", 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "tile_iter = iter(tiles)\n", 422 | "tile = next(tile_iter)\n", 423 | "print(f\" {type(tiles) = }\")\n", 424 | "print(f\" {type(tiles.dataset) = }\")\n", 425 | "print(f\" {type(iter(tiles.dataset)) = }\")\n", 426 | "print(f\" {type(tile_iter) = }\")\n", 427 | "print(f\" {type(tile) = }\")\n", 428 | "print(f\" {len(tile) = }\")\n", 429 | "print(f\" {type(tile[0]) = }\")\n", 430 | "print(f\" {tile[0].shape = }\")\n", 431 | "print(f\" {type(tile[1]) = }\")\n", 432 | "print(f\"{tile[0][0,0,0,0].to(torch.float32) = }\")\n", 433 | "pred = predictions[0]\n", 434 | "print(f\" {type(predictions) = }\")\n", 435 | "print(f\" {len(predictions) = }\")\n", 436 | "print(f\" {type(pred) = }\")\n", 437 | "print(f\" {len(pred) = }\")\n", 438 | "print(f\" {type(pred[0]) = }\")\n", 439 | "print(f\" {pred[0].shape = }\")\n", 440 | "print(f\" {pred[0] = }\")\n", 441 | "print(f\" {type(pred[1]) = }\")\n", 442 | "print(f\" {pred[1].keys() = }\")" 443 | ] 444 | } 445 | ], 446 | "metadata": { 447 | "kernelspec": { 448 | "display_name": "Python 3 (ipykernel)", 449 | "language": "python", 450 | "name": "python3" 451 | }, 452 | "language_info": { 453 | "codemirror_mode": { 454 | "name": "ipython", 455 | "version": 3 456 | }, 457 | "file_extension": ".py", 458 | "mimetype": "text/x-python", 459 | "name": "python", 460 | "nbconvert_exporter": "python", 461 | "pygments_lexer": "ipython3", 462 | "version": "3.8.10" 463 | } 464 | }, 465 | "nbformat": 4, 466 | "nbformat_minor": 5 467 | } 468 | -------------------------------------------------------------------------------- /histomics_stream/configure.py: -------------------------------------------------------------------------------- 1 | # 
========================================================================= 2 | # 3 | # Copyright NumFOCUS 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0.txt 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # ========================================================================= 18 | 19 | """Whole-slide image streamer for machine learning frameworks.""" 20 | 21 | import copy 22 | import itertools 23 | import math 24 | import os 25 | import random 26 | import re 27 | 28 | import itk 29 | import numpy as np 30 | import scipy.interpolate 31 | 32 | 33 | class _TilesByCommon: 34 | def __init__(self): 35 | self._key_mapping = { 36 | "number_pixel_columns_for_chunk": "chunk_width", 37 | "number_pixel_columns_for_mask": "mask_width", 38 | "number_pixel_columns_for_slide": "slide_width", 39 | "number_pixel_columns_for_tile": "tile_width", 40 | "number_pixel_overlap_columns_for_tile": "tile_overlap_width", 41 | "number_pixel_overlap_rows_for_tile": "tile_overlap_height", 42 | "number_pixel_rows_for_chunk": "chunk_height", 43 | "number_pixel_rows_for_mask": "mask_height", 44 | "number_pixel_rows_for_slide": "slide_height", 45 | "number_pixel_rows_for_tile": "tile_height", 46 | "number_tile_columns_for_slide": "slide_width_tiles", 47 | "number_tile_rows_for_slide": "slide_height_tiles", 48 | "tile_overlap_height": "overlap_height", 49 | "tile_overlap_width": "overlap_width", 50 | } 51 | 52 | self._keys_warned = set() 53 | 54 | # For each filename, select just upper-left corner for each tile. 55 | # Note that each upper-left corner is returned as (top, left), not (left, top). 56 | @staticmethod 57 | def get_tiles(study): 58 | return [ 59 | ( 60 | slide["filename"], 61 | [ 62 | (tile["tile_top"], tile["tile_left"]) 63 | for tile in slide["tiles"].values() 64 | ], 65 | ) 66 | for slide in study["slides"].values() 67 | ] 68 | 69 | # Private function to map old key names to their current equivalent 70 | def _update_dict(self, d): 71 | for old_key in d.keys() & self._key_mapping.keys(): 72 | # An old key is in use in `d`. 73 | new_key = self._key_mapping[old_key] 74 | while new_key in self._key_mapping: 75 | # Multiple, serial name changes 76 | new_key = self._key_mapping[new_key] 77 | if new_key in d: 78 | # Both the old and new key are used. 79 | raise ValueError( 80 | f"Cannot use both {repr(old_key)} key (deprecated) " 81 | f"and its replacement {repr(new_key)}" 82 | ) 83 | if old_key not in self._keys_warned: 84 | print( 85 | f"Warning: updating deprecated key {repr(old_key)} " 86 | f"to new name {repr(new_key)}" 87 | ) 88 | # Comment out the next line so we do have repeated warnings, in case a 89 | # second study comes in with deprecated keys. 90 | # self._keys_warned.add(old_key) 91 | d[new_key] = d[old_key] 92 | del d[old_key] 93 | 94 | 95 | class FindResolutionForSlide(_TilesByCommon): 96 | """ 97 | A class that computes read parameters for slides. 
98 | 99 | An instance of class FindResolutionForSlide is a callable that will add level, 100 | target_magnification, scan_magnification, read_magnification, 101 | returned_magnification, slide_height, and slide_width fields to a slide dictionary. 102 | 103 | Parameters for the constructor 104 | ------------------------------ 105 | 106 | study : dictionary 107 | The study dictionary from which to read parameters about the study. 108 | 109 | target_magnification : float 110 | The desired objective magnification for generated tiles. For example, a value 111 | of 10 corresponds to about 1 micron per pixel and a value of 20 corresponds to 112 | about 0.5 microns per pixel. 113 | 114 | magnification_source : str in ["scan", "native", "exact"] 115 | "scan" will produce tiles from the highest magnification available. This is 116 | typically the slide scanner's objective magnification. 117 | 118 | "native" will produce tiles from the nearest available magnification equal to or 119 | greater than target_magnification (within a 2% tolerance). The "native" option 120 | is useful when you want to handle resizing of tiles to target_magnification on 121 | your own. 122 | 123 | "exact" will produce tiles using the "native" option and then resize these tiles to 124 | match target_magnification. Resizing is handled by PIL using the Lanczos 125 | antialiasing filter since the resizing shrinks the tile by definition. 126 | 127 | For either "scan" or "native", the size of the read and returned tiles will be 128 | (tile_height * returned_magnification / target_magnification, tile_width * 129 | returned_magnification / target_magnification). For "exact" the size of the 130 | returned tiles will be (tile_height, tile_width). 131 | 132 | This procedure sets values in the slide dictionary to capture the scan, read, 133 | and returned magnification of the tiles. This is helpful for example to resize 134 | results to the scan magnification for visualization in HistomicsUI, or to resize 135 | between native and target magnification when using 136 | "native". "scan_magnification" is the highest magnification from the source 137 | file; "read_magnification" is the magnification read from the source file; 138 | "returned_magnification" is the magnification of the returned tiles, which is the 139 | same as "read_magnification" in the case of "scan" or "native" or 140 | "target_magnification" in the case of "exact". 141 | """ 142 | 143 | def __init__(self, study, target_magnification, magnification_source): 144 | """ 145 | Sanity check the supplied parameters and store them for later use. 146 | """ 147 | _TilesByCommon.__init__(self) 148 | # Check values. 149 | if not ("version" in study and study["version"] == "version-1"): 150 | raise ValueError('study["version"] must exist and be equal to "version-1".') 151 | if not ( 152 | isinstance(target_magnification, (int, np.integer, float, np.floating)) 153 | and 0 < target_magnification 154 | ): 155 | raise ValueError( 156 | f"target_magnification ({target_magnification})" 157 | " must be a positive number" 158 | ) 159 | if not ( 160 | isinstance(magnification_source, str) 161 | and magnification_source in ["scan", "native", "exact"] 162 | ): 163 | raise ValueError( 164 | f"magnification_source ({magnification_source})" 165 | " must be one of ['scan', 'native', 'exact']." 166 | ) 167 | 168 | # Save values.
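        # Store target_magnification as a float so that the value later written
        # to slide["target_magnification"] has a consistent type whether the
        # caller passed an int (e.g., 20) or a float (e.g., 20.0).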
169 | self.target_magnification = float(target_magnification) 170 | self.magnification_source = magnification_source 171 | 172 | def __call__(self, slide): 173 | """ 174 | Add level, target_magnification, scan_magnification, read_magnification, 175 | returned_magnification, slide_height, and slide_width fields to a slide 176 | dictionary. 177 | """ 178 | 179 | # Check values. 180 | if "filename" not in slide: 181 | raise ValueError('slide["filename"] must be already set.') 182 | filename = slide["filename"] 183 | 184 | # Do the work. 185 | if not re.compile(r"\.zarr$").search(filename): 186 | # create large_image, prioritizing tiff source over openslide 187 | try: 188 | import large_image_source_tiff 189 | 190 | ts = large_image_source_tiff.open(filename) 191 | except Exception: 192 | import large_image 193 | 194 | ts = large_image.open(filename) 195 | 196 | # scan_magnification = highest available magnification from source 197 | scan_magnification = float(ts.getNativeMagnification()["magnification"]) 198 | 199 | if self.magnification_source == "exact": 200 | # Use the tile-source level that large_image is willing to interpolate 201 | # for us. 202 | preferred_levels = [ 203 | ts.getLevelForMagnification( 204 | self.target_magnification, rounding=False 205 | ) 206 | ] 207 | else: # self.magnification_source in ["scan", "native"] 208 | # Use one of the tile-source levels that is stored in the image file. 209 | preferred_levels = list( 210 | set(ts.getPreferredLevel(level) for level in range(ts.levels)) 211 | ) 212 | preferred_levels.sort(reverse=True) 213 | if self.magnification_source == "scan": 214 | # Keep only the maximum tile-source level 215 | preferred_levels = preferred_levels[0:1] 216 | 217 | estimated_magnifications = np.array( 218 | [ 219 | float(ts.getMagnificationForLevel(level)["magnification"]) 220 | for level in preferred_levels 221 | ] 222 | ) 223 | 224 | # Find best tile-source level to use 225 | (level, returned_magnification) = self._get_level_and_magnifications( 226 | self.target_magnification, estimated_magnifications 227 | ) 228 | # Rather than as the index into preferred_levels, change level to be the 229 | # value that large_image uses 230 | level = preferred_levels[level] 231 | 232 | # If large_image is resampling a native level for us, it starts with the 233 | # least preferred level that is not smaller than the 234 | # resampled level. 235 | read_magnification = float( 236 | ts.getMagnificationForLevel( 237 | min( 238 | [ 239 | ts.getPreferredLevel(i) 240 | for i in range(ts.levels) 241 | if i >= level 242 | ] 243 | ) 244 | )["magnification"] 245 | ) 246 | 247 | slide["target_magnification"] = self.target_magnification 248 | slide["scan_magnification"] = scan_magnification 249 | slide["read_magnification"] = read_magnification 250 | slide["returned_magnification"] = returned_magnification 251 | 252 | # We don't want to walk off the right or bottom of the slide so we are 253 | # conservative as to how many pixels large_image will return for us. 254 | # 1) large_image starts with an image that is of 255 | # read_magnification; we compute the dimensions for read_magnification 256 | # with math.floor from the dimensions of scan_magnification (i.e., 257 | # ts.sizeX and ts.sizeY) to be conservative. 258 | # 2) large_image or external software may resample from the 259 | # read_magnification to the target_magnification; we compute dimensions 260 | # for the target_magnification with math.floor from the 261 | # read_magnification to be conservative.
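            # Illustrative numbers (hypothetical, not from any particular
            # slide): with ts.sizeY == 10001 at scan_magnification 40 and
            # read_magnification 20, step (1) gives floor(10001 * 20 / 40) ==
            # 5000; with target_magnification 10, step (2) then gives
            # floor(5000 * 10 / 20) == 2500 pixels of slide height.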
262 | slide_height = ts.sizeY 263 | slide_width = ts.sizeX 264 | if scan_magnification != read_magnification: 265 | slide_height = math.floor( 266 | slide_height * read_magnification / scan_magnification 267 | ) 268 | slide_width = math.floor( 269 | slide_width * read_magnification / scan_magnification 270 | ) 271 | if read_magnification != self.target_magnification: 272 | slide_height = math.floor( 273 | slide_height * self.target_magnification / read_magnification 274 | ) 275 | slide_width = math.floor( 276 | slide_width * self.target_magnification / read_magnification 277 | ) 278 | 279 | else: 280 | import zarr 281 | import openslide # not aliased, to avoid shadowing the os module 282 | 283 | # read whole-slide image and create zarr objects 284 | store = zarr.DirectoryStore(filename) 285 | source_group = zarr.open(store, mode="r") 286 | 287 | # scan_magnification = highest available magnification from source 288 | scan_magnification = float( 289 | source_group.attrs[openslide.PROPERTY_NAME_OBJECTIVE_POWER] 290 | ) 291 | 292 | preferred_levels = list(range(len(source_group.attrs["level_downsamples"]))) 293 | if self.magnification_source == "scan": 294 | preferred_levels = [np.argmin(source_group.attrs["level_downsamples"])] 295 | 296 | # calculate magnifications of levels 297 | estimated_magnifications = np.array( 298 | [scan_magnification / source_group.attrs["level_downsamples"][level] 299 | for level in preferred_levels] 300 | ) 301 | 302 | # Find best native level to use 303 | (level, returned_magnification) = self._get_level_and_magnifications( 304 | self.target_magnification, estimated_magnifications 305 | ) 306 | # Rather than as the index into preferred_levels, change level to be the 307 | # value that zarr uses 308 | level = preferred_levels[level] 309 | 310 | slide["target_magnification"] = self.target_magnification 311 | slide["scan_magnification"] = scan_magnification 312 | slide["read_magnification"] = returned_magnification 313 | slide["returned_magnification"] = returned_magnification 314 | 315 | # get the slide's slide_height and slide_width at the 316 | # desired magnification. (Note that slide_width comes before 317 | # slide_height.) 318 | slide_width, slide_height = source_group[format(level)].shape[0:2] 319 | 320 | if ( 321 | self.magnification_source == "exact" 322 | and self.target_magnification != returned_magnification 323 | ): 324 | raise ValueError( 325 | f"Couldn't find magnification {self.target_magnification}X " 326 | "in Zarr storage." 327 | ) 328 | 329 | int_level = int(round(level)) 330 | slide["level"] = int_level if abs(level - int_level) < 1e-4 else level 331 | # Note that slide size is defined by the requested magnification, which may not 332 | # be the same as the magnification for the selected level. To get the slide 333 | # size for the magnification that we are using, these values must later be 334 | # multiplied by returned_magnification / target_magnification. 335 | slide["slide_height"] = slide_height 336 | slide["slide_width"] = slide_width 337 | 338 | @staticmethod 339 | def _get_level_and_magnifications(target_magnification, estimated_magnifications): 340 | """ 341 | A private subroutine that computes level and magnifications.
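
        For example (hypothetical inputs): with target_magnification=20 and
        estimated_magnifications=[40.0, 20.1, 10.0], the relative difference
        for 20.1 is 0.5%, within the 2% tolerance used below, so level 1 and
        magnification 20.1 are returned.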
342 | """ 343 | # calculate difference with magnification levels 344 | 345 | magnification_tolerance = 0.02 346 | delta = target_magnification - estimated_magnifications 347 | 348 | # match to existing levels 349 | if ( 350 | np.min(np.abs(np.divide(delta, target_magnification))) 351 | < magnification_tolerance 352 | ): # match 353 | level = np.squeeze(np.argmin(np.abs(delta))) 354 | elif np.any(delta < 0): 355 | value = np.max(delta[delta < 0]) 356 | level = np.squeeze(np.argwhere(delta == value)[0]) 357 | else: # desired magnification above base level - throw error 358 | raise ValueError("Cannot interpolate above scan magnification.") 359 | 360 | returned_magnification = estimated_magnifications[level] 361 | 362 | return level, returned_magnification 363 | 364 | 365 | class TilesByGridAndMask(_TilesByCommon): 366 | """ 367 | Select tiles according to a regular grid. Optionally, restrict the list by a mask 368 | that is read from a file. Optionally, further select a random subset of them. 369 | 370 | An instance of class TilesByGridAndMask is a callable that will select the 371 | coordinates of tiles to be taken from a slide. The selected tiles will be written 372 | to the slide dictionary. 373 | 374 | Parameters for the constructor 375 | ------------------------------ 376 | study : dictionary 377 | The study dictionary from which to read parameters about the study. 378 | randomly_select: int 379 | The number of tiles to be randomly selected from the list that would otherwise 380 | be written to the slide dictionary. A value of -1 is the default and means that 381 | all tiles should be written. 382 | overlap_height 383 | Specifies the desired amount of vertical overlap between adjacent tiles, 384 | measured in pixels using the `target_magnification`. If overlap_height is not 385 | supplied, it is read from the study dictionary, if available, otherwise it is 386 | set to zero. Zero indicates that there is no overlap between adjacent tiles; 387 | they are abutting. 388 | overlap_width 389 | Specifies the desired amount of horizontal overlap between adjacent tiles, 390 | measured in pixels using the `target_magnification`. If overlap_width is not 391 | supplied, it is read from the study dictionary, if available, otherwise it is 392 | set to zero. Zero indicates that there is no overlap between adjacent tiles; 393 | they are abutting. 394 | mask_filename: string 395 | The path of the image file to be read and used as a mask. The aspect ratio of 396 | the mask (in terms of its pixel dimensions) is expected to be about the same as 397 | the aspect ratio of the main image (in terms of its grid of tiles). A non-zero 398 | value in the mask indicates that the tile should be retained. The default is 399 | "", which means that there is no masking. 400 | mask_threshold : float 401 | A value in [0.0, 1.0]. A tile is retained if the fraction of the tile 402 | overlapping non-zero pixels in the mask is at least the mask_threshold. The 403 | fraction must be strictly positive when the threshold is zero; the fraction has 404 | to be greater than or equal to the threshold when the threshold is not zero. 405 | 406 | """ 407 | 408 | def __init__(self, study, **kwargs): 409 | """ 410 | Sanity check the supplied parameters and store them for later use.
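
        A hypothetical invocation (the mask path and counts are illustrative;
        this mirrors the example notebooks):

            selector = TilesByGridAndMask(
                study, mask_filename="mask.png", mask_threshold=0.5,
                randomly_select=100,
            )
            for slide in study["slides"].values():
                selector(slide)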
411 | """ 412 | _TilesByCommon.__init__(self) 413 | # Update keys of the dictionary from deprecated names 414 | self._update_dict(kwargs) 415 | bad_keys = kwargs.keys() - { 416 | "randomly_select", 417 | "overlap_height", 418 | "overlap_width", 419 | "mask_filename", 420 | "mask_threshold", 421 | } 422 | if bad_keys: 423 | raise ValueError( 424 | f"Unrecognized parameters {repr(bad_keys)} in " 425 | "TilesByGridAndMask.__init__" 426 | ) 427 | 428 | # randomly_select defaults to select all 429 | randomly_select = ( 430 | kwargs["randomly_select"] if "randomly_select" in kwargs else -1 431 | ) 432 | # Defaults to no masking 433 | mask_filename = kwargs["mask_filename"] if "mask_filename" in kwargs else "" 434 | # Defaults to any overlap with the mask 435 | mask_threshold = kwargs["mask_threshold"] if "mask_threshold" in kwargs else 0.0 436 | 437 | # Update keys of the dictionary from deprecated names 438 | self._update_dict(study) 439 | 440 | # If overlap is not supplied, it is read from the study dictionary, if 441 | # available, otherwise it is set to zero, which is no overlap. 442 | overlap_height = ( 443 | kwargs["overlap_height"] 444 | if "overlap_height" in kwargs 445 | else study["overlap_height"] if "overlap_height" in study else 0 446 | ) 447 | overlap_width = ( 448 | kwargs["overlap_width"] 449 | if "overlap_width" in kwargs 450 | else study["overlap_width"] if "overlap_width" in study else 0 451 | ) 452 | 453 | # Check values. 454 | if not ("version" in study and study["version"] == "version-1"): 455 | raise ValueError('study["version"] must exist and be equal to "version-1".') 456 | if not ( 457 | "tile_height" in study 458 | and isinstance(study["tile_height"], (int, np.integer)) 459 | and study["tile_height"] > 0 460 | ): 461 | raise ValueError( 462 | 'study["tile_height"]' " must exist and be a positive integer" 463 | ) 464 | if not ( 465 | "tile_width" in study 466 | and isinstance(study["tile_width"], (int, np.integer)) 467 | and study["tile_width"] > 0 468 | ): 469 | raise ValueError( 470 | 'study["tile_width"]' " must exist and be a positive integer" 471 | ) 472 | if not ( 473 | isinstance(randomly_select, (int, np.integer)) and -1 <= randomly_select 474 | ): 475 | raise ValueError( 476 | f"randomly_select ({randomly_select})" 477 | " must be a non-negative integer or -1." 478 | ) 479 | if not ( 480 | isinstance(overlap_height, (int, np.integer)) 481 | and overlap_height < study["tile_height"] 482 | ): 483 | raise ValueError( 484 | f"overlap_height ({overlap_height})" 485 | " must be less than" 486 | f' tile_height ({study["tile_height"]}).' 487 | ) 488 | if not ( 489 | isinstance(overlap_width, (int, np.integer)) 490 | and overlap_width < study["tile_width"] 491 | ): 492 | raise ValueError( 493 | f"overlap_width ({overlap_width})" 494 | " must be less than" 495 | f' tile_width ({study["tile_width"]}).' 496 | ) 497 | if mask_filename != "": 498 | mask_itk = self.check_mask_filename(mask_filename) 499 | if not ( 500 | isinstance(mask_threshold, (float, np.floating)) 501 | and mask_threshold >= 0.0 502 | and mask_threshold <= 1.0 503 | ): 504 | raise ValueError( 505 | f"mask_threshold ({mask_threshold}) must be between 0 and 1 inclusive." 506 | ) 507 | 508 | # Save values. To keep garbage collection efficient don't save all of `study`. 
509 | self.tile_height = study["tile_height"] 510 | self.tile_width = study["tile_width"] 511 | self.randomly_select = randomly_select 512 | self.overlap_height = overlap_height 513 | self.overlap_width = overlap_width 514 | self.mask_filename = mask_filename 515 | if self.mask_filename != "": 516 | self.mask_itk = mask_itk 517 | self.mask_threshold = mask_threshold 518 | # If the user hasn't put the overlap information into the top-level study 519 | # dictionary then place it there. 520 | if "overlap_height" not in study: 521 | study["overlap_height"] = self.overlap_height 522 | if "overlap_width" not in study: 523 | study["overlap_width"] = self.overlap_width 524 | self.studywide_overlap_height = study["overlap_height"] 525 | self.studywide_overlap_width = study["overlap_width"] 526 | 527 | def __call__(self, slide): 528 | """ 529 | Select tiles according to a regular grid. Optionally, restrict the list by a 530 | mask. Optionally, select a random subset of them. 531 | """ 532 | 533 | # Update keys of the dictionary from deprecated names 534 | self._update_dict(slide) 535 | 536 | # Check values. 537 | if "slide_height" not in slide: 538 | raise ValueError('slide["slide_height"] must be already set.') 539 | self.slide_height = slide["slide_height"] 540 | if "slide_width" not in slide: 541 | raise ValueError('slide["slide_width"] must be already set.') 542 | self.slide_width = slide["slide_width"] 543 | 544 | slide["overlap_height"] = self.overlap_height 545 | slide["overlap_width"] = self.overlap_width 546 | # 547 | # Do the work. 548 | # 549 | height_stride = self.tile_height - self.overlap_height 550 | width_stride = self.tile_width - self.overlap_width 551 | 552 | # Return information to the user 553 | slide["slide_height_tiles"] = math.floor( 554 | (self.slide_height - self.overlap_height) / height_stride 555 | ) 556 | slide["slide_width_tiles"] = math.floor( 557 | (self.slide_width - self.overlap_width) / width_stride 558 | ) 559 | 560 | # Find the coordinates of each tile 561 | top_too_large = self.slide_height - self.tile_height + 1 562 | left_too_large = self.slide_width - self.tile_width + 1 563 | top_left = np.array( 564 | [ 565 | pair 566 | for pair in itertools.product( 567 | np.arange(0, top_too_large, height_stride), 568 | np.arange(0, left_too_large, width_stride), 569 | ) 570 | ], 571 | dtype=np.int64, 572 | ) 573 | 574 | if hasattr(self, "mask_itk"): 575 | # There is a mask that we will have to check 576 | (self.mask_height, self.mask_width) = self.mask_itk.shape 577 | # Let the user know 578 | slide["mask_height"] = self.mask_height 579 | slide["mask_width"] = self.mask_width 580 | slide["tiles"] = self.compute_from_mask(top_left) 581 | 582 | else: 583 | # There is no mask to check 584 | slide["tiles"] = { 585 | f"tile_{i}": {"tile_top": int(corner[0]), "tile_left": int(corner[1])} 586 | for i, corner in enumerate(top_left) 587 | } 588 | 589 | if 0 <= self.randomly_select < len(slide["tiles"]): 590 | # Choose a subset of the tiles randomly 591 | slide["tiles"] = dict( 592 | random.sample(sorted(slide["tiles"].items()), self.randomly_select) 593 | ) 594 | 595 | def check_mask_filename(self, mask_filename): 596 | mask_itk = itk.imread(mask_filename) # May throw exception 597 | if mask_itk.GetImageDimension() != 2: 598 | raise ValueError( 599 | f"The mask ({mask_filename}) should be a 2-dimensional image." 
600 | ) 601 | return mask_itk 602 | 603 | def compute_from_mask(self, top_left): 604 | # Check that the input and output aspect ratios are pretty close 605 | if ( 606 | abs( 607 | math.log( 608 | (self.slide_height / self.slide_width) 609 | / (self.mask_height / self.mask_width) 610 | ) 611 | ) 612 | > 0.20 613 | ): 614 | raise ValueError( 615 | "The mask aspect ratio does not match " 616 | "that for the whole slide image." 617 | ) 618 | 619 | # cumulative_mask[row, column] will be the number of mask_itk[r, c] (i.e., 620 | # mask_itk.GetPixel((c,r))) values that are nonzero among all those with 621 | # both r < row and c < column; note the strict inequalities. We have added 622 | # a boundary on all sides of this array -- zeros on the top and left, and a 623 | # duplicate row (column) on the bottom (right) -- so that we do not need to 624 | # do extra testing in our code at the borders. We use int64 in case there 625 | # are 2^31 (~2 billion = ~ 46k by 46k) or more non-zero pixel values in our 626 | # mask. 627 | cumulative_mask = np.zeros( 628 | (self.mask_height + 2, self.mask_width + 2), dtype=np.int64 629 | ) 630 | cumulative_mask[1 : self.mask_height + 1, 1 : self.mask_width + 1] = ( 631 | itk.GetArrayViewFromImage(self.mask_itk).astype(bool).astype(np.int64) 632 | ) 633 | cumulative_mask = np.cumsum(np.cumsum(cumulative_mask, axis=0), axis=1) 634 | 635 | # Define the grid for the cumulative_mask using slide (not mask!) 636 | # coordinates. 637 | grid_points = ( 638 | np.arange(cumulative_mask.shape[0]) 639 | * (self.slide_height / self.mask_height), 640 | np.arange(cumulative_mask.shape[1]) * (self.slide_width / self.mask_width), 641 | ) 642 | 643 | # Tile boundaries may not line up with mask pixels, so we will need a 644 | # bi-linear interpolator. 645 | method = "linear" # bi-linear 646 | interpolator = scipy.interpolate.RegularGridInterpolator( 647 | grid_points, cumulative_mask, method 648 | ) 649 | # Find the coordinates of each tile 650 | top_right = top_left + np.array((0, self.tile_width)) 651 | bottom_left = top_left + np.array((self.tile_height, 0)) 652 | bottom_right = bottom_left + np.array((0, self.tile_width)) 653 | # Compute the total number of mask pixels (both whole and fractional) that 654 | # overlap each tile. 655 | cumulative_by_tile = ( 656 | interpolator(bottom_right) 657 | - interpolator(bottom_left) 658 | - interpolator(top_right) 659 | + interpolator(top_left) 660 | ) 661 | # When the threshold is greater than zero, any `cumulative_by_tile` that is 662 | # greater than or equal to `threshold` is accepted. Because we are worried 663 | # about rounding error, we'll use `epsilon` to let very close cases be 664 | # accepted. When the threshold is exactly zero, any cumulative_by_tile that 665 | # is strictly greater than zero is accepted. As `cumulative_by_tile` is, 666 | # `threshold` is a count of whole and fractional mask pixels. 667 | epsilon = 1e-6 668 | threshold = max( 669 | 0.0, 670 | self.mask_threshold 671 | * (self.tile_height * self.mask_height / self.slide_height) 672 | * (self.tile_width * self.mask_width / self.slide_width) 673 | - epsilon, 674 | ) 675 | return { 676 | f"tile_{i}": {"tile_top": int(corner[0]), "tile_left": int(corner[1])} 677 | for i, corner in enumerate(top_left) 678 | if cumulative_by_tile[i] > threshold 679 | } 680 | 681 | 682 | class TilesByList(_TilesByCommon): 683 | """ 684 | Select the tiles supplied by the user. Optionally, select a random subset of them. 
685 | 
686 |     An instance of class TilesByList is a callable that will select the coordinates of
687 |     tiles to be taken from a slide.  The selected tiles will be written to the slide
688 |     dictionary.
689 | 
690 |     Parameters for the constructor
691 |     ------------------------------
692 |     study : dictionary
693 |         The study dictionary from which to read parameters about the study.
694 |     randomly_select: int
695 |         The number of tiles to be randomly selected from the list that would otherwise
696 |         be written to the slide dictionary.  A value of -1 is the default and means that
697 |         all tiles should be written.
698 |     tiles_dictionary: dictionary
699 |         For example, {'AB234': {'tile_top': top0, 'tile_left': left0}, 'CD43':
700 |         {'tile_top': top1, 'tile_left': left1}, ...}.  Tiles from this list will be
701 |         copied into the slide dictionary if they are randomly selected.
702 | 
703 |     """
704 | 
705 |     def __init__(self, study, randomly_select=-1, tiles_dictionary={}):
706 |         """
707 |         Sanity check the supplied parameters and store them for later use.
708 | 
709 |         randomly_select defaults to "select all".
710 | 
711 |         For example,
712 |         tiles_dictionary = {
713 |             "AB234": {"tile_top": top0, "tile_left": left0},
714 |             "CD43": {"tile_top": top1, "tile_left": left1},
715 |             ...
716 |         }
717 |         """
718 |         _TilesByCommon.__init__(self)
719 | 
720 |         # Update keys of the dictionary from deprecated names
721 |         self._update_dict(study)
722 | 
723 |         # Check values
724 |         if not ("version" in study and study["version"] == "version-1"):
725 |             raise ValueError('study["version"] must exist and be equal to "version-1".')
726 |         if not (
727 |             "tile_height" in study
728 |             and isinstance(study["tile_height"], (int, np.integer))
729 |             and study["tile_height"] > 0
730 |         ):
731 |             raise ValueError(
732 |                 'study["tile_height"] must exist and be a positive integer'
733 |             )
734 |         if not (
735 |             "tile_width" in study
736 |             and isinstance(study["tile_width"], (int, np.integer))
737 |             and study["tile_width"] > 0
738 |         ):
739 |             raise ValueError(
740 |                 'study["tile_width"] must exist and be a positive integer'
741 |             )
742 |         if not (
743 |             isinstance(randomly_select, (int, np.integer)) and -1 <= randomly_select
744 |         ):
745 |             raise ValueError(
746 |                 f"randomly_select ({randomly_select})"
747 |                 " must be a non-negative integer or -1."
748 |             )
749 |         if not isinstance(tiles_dictionary, dict):
750 |             raise ValueError("tiles_dictionary must be a dictionary.")
751 |         for tile_corner in tiles_dictionary.values():
752 |             # Update keys of the dictionary from deprecated names
753 |             self._update_dict(tile_corner)
754 |         if not (
755 |             all(
756 |                 [
757 |                     isinstance(tile_corner, dict)
758 |                     for tile_corner in tiles_dictionary.values()
759 |                 ]
760 |             )
761 |             and all(
762 |                 [
763 |                     key in tile_corner.keys()
764 |                     for tile_corner in tiles_dictionary.values()
765 |                     for key in ("tile_top", "tile_left")
766 |                 ]
767 |             )
768 |             and all(
769 |                 [
770 |                     isinstance(tile_corner[key], (int, np.integer))
771 |                     for tile_corner in tiles_dictionary.values()
772 |                     for key in ("tile_top", "tile_left")
773 |                 ]
774 |             )
775 |             and all(
776 |                 [
777 |                     tile_corner[key] >= 0
778 |                     for tile_corner in tiles_dictionary.values()
779 |                     for key in ("tile_top", "tile_left")
780 |                 ]
781 |             )
782 |         ):
783 |             raise ValueError(
784 |                 "tiles_dictionary must be a dictionary of tiles."
785 |                 ' Each tile is a dictionary, with keys "tile_top" and "tile_left"'
786 |                 " and with values that are non-negative integers."
787 |             )
788 | 
789 |         # Save values.  To keep garbage collection efficient, don't save all of `study`,
790 |         # just the parts that we need.
791 |         self.tile_height = study["tile_height"]
792 |         self.tile_width = study["tile_width"]
793 |         self.randomly_select = randomly_select
794 |         self.tiles_dictionary = copy.deepcopy(
795 |             tiles_dictionary
796 |         )  # in case user changes it later
797 | 
798 |     def __call__(self, slide):
799 |         """
800 |         Select the tiles supplied by the user.  Optionally, select a random subset of
801 |         them.
802 |         """
803 |         slide["tiles"] = copy.deepcopy(
804 |             self.tiles_dictionary
805 |         )  # in case __call__ is called again.
806 |         if 0 <= self.randomly_select < len(slide["tiles"]):
807 |             # Choose a subset of the tiles randomly
808 |             slide["tiles"] = dict(
809 |                 random.sample(sorted(slide["tiles"].items()), self.randomly_select)
810 |             )
811 | 
812 | 
813 | class TilesRandomly(_TilesByCommon):
814 |     """
815 |     Select a random subset of all possible tiles.
816 | 
817 |     An instance of class TilesRandomly is a callable that will select the coordinates of
818 |     tiles to be taken from a slide.  The selected tiles will be written to the slide
819 |     dictionary.
820 | 
821 |     Parameters for the constructor
822 |     ------------------------------
823 |     study : dictionary
824 |         The study dictionary from which to read parameters about the study.
825 |     randomly_select: int
826 |         The number of tiles to be randomly selected from the slide.  The value must be
827 |         a non-negative integer; 0 selects no tiles.  A value of 1 is the default.
828 | 
829 |     """
830 | 
831 |     def __init__(self, study, randomly_select=1):  # Defaults to select one
832 |         """
833 |         Sanity check the supplied parameters and store them for later use.
834 |         """
835 |         _TilesByCommon.__init__(self)
836 | 
837 |         # Update keys of the dictionary from deprecated names
838 |         self._update_dict(study)
839 | 
840 |         # Check values.
841 |         if not ("version" in study and study["version"] == "version-1"):
842 |             raise ValueError('study["version"] must exist and be equal to "version-1".')
843 |         if not (
844 |             "tile_height" in study
845 |             and isinstance(study["tile_height"], (int, np.integer))
846 |             and study["tile_height"] > 0
847 |         ):
848 |             raise ValueError(
849 |                 'study["tile_height"] must exist and be a positive integer'
850 |             )
851 |         if not (
852 |             "tile_width" in study
853 |             and isinstance(study["tile_width"], (int, np.integer))
854 |             and study["tile_width"] > 0
855 |         ):
856 |             raise ValueError(
857 |                 'study["tile_width"] must exist and be a positive integer'
858 |             )
859 |         if not (
860 |             isinstance(randomly_select, (int, np.integer)) and 0 <= randomly_select
861 |         ):
862 |             raise ValueError(
863 |                 f"randomly_select ({randomly_select})"
864 |                 " must be a non-negative integer."
865 |             )
866 | 
867 |         # Save values.  To keep garbage collection efficient, don't save all of `study`.
868 |         self.tile_height = study["tile_height"]
869 |         self.tile_width = study["tile_width"]
870 |         self.randomly_select = randomly_select
871 | 
872 |     def __call__(self, slide):
873 |         """
874 |         Select a random subset of all possible tiles.
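    | 
    |         For example (hypothetical sizes), a slide with slide_height = 1024 and
    |         tile_height = 256 yields tile_top values drawn from range(0, 769); each
    |         tile's position is drawn independently, so repeated positions are
    |         possible.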
875 | """ 876 | 877 | # Update keys of the dictionary from deprecated names 878 | self._update_dict(slide) 879 | 880 | if "slide_height" not in slide: 881 | raise ValueError('slide["slide_height"] must be already set.') 882 | if "slide_width" not in slide: 883 | raise ValueError('slide["slide_width"] must be already set.') 884 | 885 | top_too_large = slide["slide_height"] - self.tile_height + 1 886 | left_too_large = slide["slide_width"] - self.tile_width + 1 887 | slide["tiles"] = { 888 | f"tile_{i}": { 889 | "tile_top": random.randrange(0, top_too_large), 890 | "tile_left": random.randrange(0, left_too_large), 891 | } 892 | for i in range(self.randomly_select) 893 | } 894 | 895 | 896 | class ChunkLocations(_TilesByCommon): 897 | def __init__(self): 898 | _TilesByCommon.__init__(self) 899 | self.no_indices = np.array((), dtype=np.int64) 900 | 901 | def __call__(self, study_description): 902 | """ 903 | Given the list of desired tile locations, computes the locations of chunks to be 904 | read 905 | """ 906 | 907 | # Update keys of the dictionary from deprecated names 908 | self._update_dict(study_description) 909 | 910 | if not ( 911 | "version" in study_description 912 | and study_description["version"] == "version-1" 913 | ): 914 | raise ValueError( 915 | 'study_description["version"] must exist and be equal to "version-1".' 916 | ) 917 | if not ( 918 | "tile_height" in study_description 919 | and isinstance(study_description["tile_height"], (int, np.integer)) 920 | and study_description["tile_height"] > 0 921 | ): 922 | raise ValueError( 923 | 'study_description["tile_height"]' 924 | " must exist and be a positive integer" 925 | ) 926 | if not ( 927 | "tile_width" in study_description 928 | and isinstance(study_description["tile_width"], (int, np.integer)) 929 | and study_description["tile_width"] > 0 930 | ): 931 | raise ValueError( 932 | 'study_description["tile_width"]' 933 | " must exist and be a positive integer" 934 | ) 935 | for slide in study_description["slides"].values(): 936 | # Update keys of the dictionary from deprecated names 937 | self._update_dict(slide) 938 | 939 | if not ( 940 | "returned_magnification" in slide 941 | and isinstance( 942 | slide["returned_magnification"], 943 | (int, np.integer, float, np.floating), 944 | ) 945 | and slide["returned_magnification"] > 0 946 | ): 947 | raise ValueError( 948 | 'slide["returned_magnification"]' 949 | " must exist and be a positive number" 950 | ) 951 | # Check that other necessary keys are also present!!! 952 | 953 | # Partition the set of tiles into chunks. 
954 |         self._designate_chunks_for_tiles(study_description)
955 |         # cProfile.runctx(
956 |         #     "self._designate_chunks_for_tiles(study_description)",
957 |         #     globals=globals(),
958 |         #     locals=locals(),
959 |         #     sort="cumulative",
960 |         # )
961 | 
962 |     def _designate_chunks_for_tiles(self, study_description):
963 |         # Update keys of the dictionary from deprecated names
964 |         self._update_dict(study_description)
965 | 
966 |         tile_height = study_description["tile_height"]
967 |         tile_width = study_description["tile_width"]
968 | 
969 |         for slide in study_description["slides"].values():
970 |             # Update keys of the dictionary from deprecated names
971 |             self._update_dict(slide)
972 | 
973 |             if not (
974 |                 "chunk_height" in slide
975 |                 and isinstance(slide["chunk_height"], (int, np.integer))
976 |                 and slide["chunk_height"] > 0
977 |             ):
978 |                 raise ValueError(
979 |                     'slide["chunk_height"] must exist and be a positive integer'
980 |                 )
981 |             if not (
982 |                 "chunk_width" in slide
983 |                 and isinstance(slide["chunk_width"], (int, np.integer))
984 |                 and slide["chunk_width"] > 0
985 |             ):
986 |                 raise ValueError(
987 |                     'slide["chunk_width"] must exist and be a positive integer'
988 |                 )
989 |             chunk_height = slide["chunk_height"]
990 |             chunk_width = slide["chunk_width"]
991 | 
992 |             tiles_names = list(slide["tiles"].keys())
993 |             tiles_data = np.array(
994 |                 [
995 |                     [
996 |                         slide["tiles"][tile]["tile_top"],
997 |                         slide["tiles"][tile]["tile_left"],
998 |                     ]
999 |                     for tile in tiles_names
1000 |                 ],
1001 |                 dtype=np.int64,
1002 |             )
1003 |             self.build_tree(tiles_data)
1004 |             chunks = slide["chunks"] = {}
1005 |             num_chunks = 0
1006 |             while self.get_tree() is not None:
1007 |                 tile = self.get_topmost()
1008 |                 chunk = chunks[f"chunk_{num_chunks}"] = {
1009 |                     "chunk_top": tile[0],
1010 |                     "chunk_left": tile[1],
1011 |                     "chunk_bottom": tile[0] + chunk_height,
1012 |                     "chunk_right": tile[1] + chunk_width,
1013 |                 }
1014 |                 num_chunks += 1
1015 | 
1016 |                 mins = tile.copy()
1017 |                 maxs = tile.copy()
1018 |                 maxs[0] += chunk_height - tile_height + 1
1019 |                 maxs[1] += chunk_width - tile_width + 1
1020 |                 indices = self.find_in_range_and_delete(mins, maxs)
1021 |                 tiles = chunk["tiles"] = {
1022 |                     tiles_names[i]: {
1023 |                         "tile_top": tiles_data[i][0],
1024 |                         "tile_left": tiles_data[i][1],
1025 |                     }
1026 |                     for i in indices
1027 |                 }
1028 |                 # Make the chunk as small as possible given the tiles that it must
1029 |                 # support.  Note that this also ensures that the pixels that are read do
1030 |                 # not run over the bottom or right border of the slide (assuming that
1031 |                 # the tiles do not go over those borders).
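    |                 # For example (hypothetical numbers), if tile_height is 256 and
    |                 # this chunk's tiles have tile_top values 0 and 128, then
    |                 # chunk_top becomes 0 and chunk_bottom becomes 128 + 256 = 384.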
1032 | chunk["chunk_top"] = min([tile["tile_top"] for tile in tiles.values()]) 1033 | chunk["chunk_left"] = min( 1034 | [tile["tile_left"] for tile in tiles.values()] 1035 | ) 1036 | chunk["chunk_bottom"] = ( 1037 | max([tile["tile_top"] for tile in tiles.values()]) + tile_height 1038 | ) 1039 | chunk["chunk_right"] = ( 1040 | max([tile["tile_left"] for tile in tiles.values()]) + tile_width 1041 | ) 1042 | 1043 | @staticmethod 1044 | def read_large_image( 1045 | filename, 1046 | chunk_top, 1047 | chunk_left, 1048 | chunk_bottom, 1049 | chunk_right, 1050 | returned_magnification, 1051 | ): 1052 | # if "_num_chunks" not in ChunkLocations.read_large_image.__dict__: 1053 | # ChunkLocations.read_large_image._num_chunks = 0 1054 | # chunk_name = ( 1055 | # f"#read_large_image {ChunkLocations.read_large_image._num_chunks:06}" 1056 | # ) 1057 | # ChunkLocations.read_large_image._num_chunks += 1 1058 | 1059 | # print(f"{chunk_name} begin {datetime.datetime.now()}") 1060 | import large_image 1061 | import large_image_source_tiff 1062 | 1063 | ts = ( 1064 | large_image_source_tiff.open(filename) 1065 | if os.path.splitext(filename)[1] in (".tif", ".tiff", ".svs") 1066 | else large_image.open(filename) 1067 | ) 1068 | chunk = ts.getRegion( 1069 | scale=dict(magnification=returned_magnification), 1070 | format=large_image.constants.TILE_FORMAT_NUMPY, 1071 | region=dict( 1072 | left=chunk_left, 1073 | top=chunk_top, 1074 | width=chunk_right - chunk_left, 1075 | height=chunk_bottom - chunk_top, 1076 | units="mag_pixels", 1077 | ), 1078 | )[0] 1079 | # print(f"{chunk_name} end {datetime.datetime.now()}") 1080 | return chunk 1081 | 1082 | @staticmethod 1083 | def scale_it(value, factor): 1084 | return math.floor(value / factor + 0.01) 1085 | 1086 | def build_tree(self, data): 1087 | self.data = data 1088 | self.tree = self._build(np.arange(self.data.shape[0])) 1089 | 1090 | def get_data(self): 1091 | return self.data 1092 | 1093 | def get_tree(self): 1094 | return self.tree 1095 | 1096 | def get_topmost(self): 1097 | return self.tree["topmost"] 1098 | 1099 | def find_in_range_and_delete(self, mins, maxs): 1100 | self.mins = mins 1101 | self.maxs = maxs 1102 | indices, newtree = self._find_in_range_and_delete(subtree=self.tree) 1103 | self.tree = newtree 1104 | return indices 1105 | 1106 | def _build(self, indices): 1107 | # Split this subset of the data based upon its coordinate means 1108 | subset = self.data[indices, :] 1109 | means = np.mean(subset, axis=0) 1110 | # Calculate the quadrant (in range(2**m)) for each point 1111 | rants = (subset[:, 0] >= means[0]) + 0 1112 | for col in range(1, self.data.shape[1]): 1113 | rants = (rants * 2) + (subset[:, col] >= means[col]) 1114 | 1115 | # How to process this depends upon how many quadrants are used 1116 | occur = np.unique(rants) 1117 | if len(occur) == 1: 1118 | return {"means": means, "topmost": means, "indices": indices} 1119 | else: 1120 | recurse = {rant: self._build(indices[rants == rant]) for rant in occur} 1121 | qvalues = list(recurse.values()) 1122 | # Find the the topmost, in dictionary order 1123 | topmost = self._compute_topmost(qvalues) 1124 | 1125 | # Return what we have found 1126 | return {"means": means, "topmost": topmost, "quadrants": recurse} 1127 | 1128 | @staticmethod 1129 | def _compute_topmost(qvalues): 1130 | topmost = qvalues[0]["topmost"] 1131 | for k in range(1, len(qvalues)): 1132 | test_key = qvalues[k]["topmost"] 1133 | for c in range(len(topmost)): 1134 | if test_key[c] != topmost[c]: 1135 | break 1136 | if test_key[c] < 
topmost[c]: 1137 | topmost = test_key 1138 | return topmost 1139 | 1140 | def _find_in_range_and_delete(self, subtree): 1141 | if "indices" in subtree: 1142 | # Process this leaf node 1143 | if all(subtree["means"] >= self.mins) and all(subtree["means"] < self.maxs): 1144 | # Return these indices and remove the subtree 1145 | return subtree["indices"], None 1146 | else: 1147 | # Return no indices and remove nothing from the subtree 1148 | return self.no_indices, subtree 1149 | else: 1150 | # Process this internal node 1151 | means = subtree["means"] 1152 | recurse = dict( 1153 | ( 1154 | (qkey, self._find_in_range_and_delete(qvalue)) 1155 | if all( 1156 | ( 1157 | ( 1158 | self.maxs[col] > means[col] 1159 | if qkey & 2 ** (self.data.shape[1] - 1 - col) 1160 | else self.mins[col] < means[col] 1161 | ) 1162 | for col in range(self.data.shape[1]) 1163 | ) 1164 | ) 1165 | else (qkey, (self.no_indices, qvalue)) 1166 | ) 1167 | for qkey, qvalue in subtree["quadrants"].items() 1168 | ) 1169 | indices = np.array( 1170 | [index for pair in recurse.values() for index in pair[0]], 1171 | dtype=np.int64, 1172 | ) 1173 | quadrants = { 1174 | qkey: pair[1] for qkey, pair in recurse.items() if pair[1] is not None 1175 | } 1176 | if len(quadrants) == 0: 1177 | return indices, None 1178 | topmost = self._compute_topmost(list(quadrants.values())) 1179 | return indices, { 1180 | "means": subtree["means"], 1181 | "topmost": topmost, 1182 | "quadrants": quadrants, 1183 | } 1184 | --------------------------------------------------------------------------------