├── example
│   ├── TA232-mask.png
│   ├── TA232-source.md
│   ├── performance-EfficientNetV2S.py
│   ├── performance-detection.py
│   ├── performance-mnist.py
│   ├── performance-EfficientNet_V2_S_Weights.IMAGENET1K_V1.py
│   ├── tensorflow_stream.ipynb
│   └── pytorch_stream.ipynb
├── documentation
│   ├── runtime.png
│   ├── H&E_chunk.png
│   └── slide_chunk_tile.png
├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── build-test-package.yml
├── .git-blame-ignore-revs
├── .gitignore
├── pyproject.toml
├── histomics_stream
│   ├── __init__.py
│   ├── codecs.py
│   ├── pytorch.py
│   ├── tensorflow.py
│   └── configure.py
├── test
│   ├── test_find_imports.py
│   ├── test_mask.py
│   └── test_create_study.py
├── StudyObject.md
├── LICENSE.txt
└── README.md

/example/TA232-mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DigitalSlideArchive/HistomicsStream/HEAD/example/TA232-mask.png
--------------------------------------------------------------------------------

/documentation/runtime.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DigitalSlideArchive/HistomicsStream/HEAD/documentation/runtime.png
--------------------------------------------------------------------------------

/documentation/H&E_chunk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DigitalSlideArchive/HistomicsStream/HEAD/documentation/H&E_chunk.png
--------------------------------------------------------------------------------

/documentation/slide_chunk_tile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DigitalSlideArchive/HistomicsStream/HEAD/documentation/slide_chunk_tile.png
--------------------------------------------------------------------------------

/.github/dependabot.yml:
--------------------------------------------------------------------------------
---
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"
--------------------------------------------------------------------------------

/example/TA232-source.md:
--------------------------------------------------------------------------------
# Data source
The file `example/TA232.svs` comes from the zip file available as
```
https://stanfordmedicine.box.com/s/ub8e0wlhsdenyhdsuuzp6zhj0i82xrb1
```
from the web page
```
https://github.com/stanfordmlgroup/DLBCL-Morph
```
It is in that zip file as
```
DLBCL-Morph/TMA/MYC/TA232.svs
```

The corresponding mask `example/TA232-mask.png` is randomly generated in Python with
```python
import numpy as np
from PIL import Image

# Placeholder dimensions; substitute the mask size appropriate for your slide.
mask_height, mask_width = 512, 512
# Use uint8 rather than int8; PIL cannot serialize signed 8-bit arrays.
arr = np.random.randint(0, 2, (mask_height, mask_width), dtype=np.uint8)
im = Image.fromarray(arr)
im.save("TA232-mask.png")
```
--------------------------------------------------------------------------------
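For context, this sketch (not itself a repository file) shows how a mask like `TA232-mask.png` is consumed; it mirrors the configure calls in `test/test_mask.py`, and the tile, overlap, chunk, and magnification values here are illustrative assumptions.

```python
import histomics_stream as hs

study = dict(
    version="version-1",
    tile_height=256,
    tile_width=256,
    overlap_height=0,
    overlap_width=0,
    slides=dict(
        Slide_0=dict(
            filename="example/TA232.svs",
            slide_name="TA232",
            slide_group="example",
            chunk_height=2048,
            chunk_width=2048,
        )
    ),
)
find_slide_resolution = hs.configure.FindResolutionForSlide(
    study, target_magnification=20, magnification_source="native"
)
tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
    study, mask_filename="example/TA232-mask.png"
)
for slide in study["slides"].values():
    find_slide_resolution(slide)
    tiles_by_grid_and_mask(slide)  # restricts the tile grid using the mask
```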
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
#
# This file lists revisions that should be ignored when considering
# attribution for the actual code written. Code style changes should
# not be considered as modifications with regards to attribution.
#
# To see clean and meaningful blame information:
#   $ git blame important.py --ignore-revs-file .git-blame-ignore-revs
#
# To configure git to automatically ignore revisions listed in a file on
# every call to git blame:
#   $ git config blame.ignoreRevsFile .git-blame-ignore-revs
#
# Ignore changes introduced when doing global file format changes
# STYLE: change camelCase to snake_case
6acf4583eb1362af40cb03db068e139bb29d6b96
# STYLE: Apply `black` formatting.
649e8d0577431734481590c83651adefbce31777
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
# Do not add ExternalData module staging files
.ExternalData*

# back-up files
*~
*.bak
# vim swp files
*.swp
## Ignore files that are used for auto-completion with clang
*.clang_complete
## YouCompleteMe vim plugin configuration file
.ycm_extra_conf.py


# KWStyle hook output
*.kws

# compiled python files
*.pyc

# Binary directory
BUILD*
build*

# qtcreator
CMakeLists.txt.user*

# kdevelop
*.kdev*
.kdev*

# back-up files when conflicts occur
*.orig

# Clion editor internal project information
.idea

# Visual Studio
.vs

# Mac System File
.DS_Store

# Ignore testing temporary files
Testing/Temporary/

# Checkpoint files for Jupyter
.ipynb_checkpoints/

# Compiled Python files
__pycache__/
--------------------------------------------------------------------------------

/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["flit_core >=3.4,<4"]
build-backend = "flit_core.buildapi"

[project]
name = "histomics_stream"
readme = "README.md"
requires-python = ">=3.6"
authors = [{name = "Lee A. Newberg", email = "lee.newberg@kitware.com"}]
maintainers = [{name = "Lee A. Newberg", email = "lee.newberg@kitware.com"}]
keywords = ["tensorflow", "torch", "whole slide image", "stream", "machine learning"]
classifiers = ["License :: OSI Approved :: Apache Software License"]
dependencies = [
    "imagecodecs",
    "itk",
    "numcodecs",
    "numpy",
    "scipy",
]
dynamic = ["version", "description"]

[project.optional-dependencies]
tensorflow = [
    "tensorflow<3.0.0",
    "keras",
]
torch = [
    "torch<2.0.0",
]
zarr = [
    "zarr",
]

[project.urls]
Source = "https://github.com/DigitalSlideArchive/HistomicsStream"

[project.scripts]
flit = "flit:main"
--------------------------------------------------------------------------------

/histomics_stream/__init__.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

"""Whole-slide image streamer for machine learning frameworks.

This module supports efficient whole-slide reading and processing for a machine
learning execution graph.
"""

__version__ = "2.5.3"

from . import configure, codecs  # noqa: F401,E402
--------------------------------------------------------------------------------

/test/test_find_imports.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================


def test_imports_can_be_found():
    """Purpose: Check that each required import can be found"""

    import imagecodecs  # noqa: F401
    import itk  # noqa: F401
    import numcodecs  # noqa: F401
    import numpy  # noqa: F401
    import scipy.interpolate  # noqa: F401
    import tensorflow  # noqa: F401
    import torch  # noqa: F401
    import zarr  # noqa: F401


if __name__ == "__main__":
    test_imports_can_be_found()
--------------------------------------------------------------------------------

/.github/workflows/build-test-package.yml:
--------------------------------------------------------------------------------
name: Build, test, package

on: [push, pull_request]

jobs:
  test-python:
    runs-on: ubuntu-20.04
    strategy:
      max-parallel: 2
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]

        include:
          - flake8-python-git-tag: ""
          - pooch-python-git-tag: ""
          - pytest-python-git-tag: ""

    steps:
      - uses: actions/checkout@v6
      - name: 'Free up disk space'
        run: |
          # Workaround for https://github.com/actions/virtual-environments/issues/709
          df -h
          sudo apt-get clean
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          df -h

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies
        run: |
          sudo apt update
          sudo apt install openslide-tools python3-openslide
          python -m pip install --upgrade pip setuptools wheel
          pip install 'flake8${{ matrix.flake8-python-git-tag }}' 'pooch${{ matrix.pooch-python-git-tag }}' 'pytest${{ matrix.pytest-python-git-tag }}'
          pip install 'large-image[bioformats,ometiff,openjpeg,openslide,tiff]' 'scikit_image' --find-links https://girder.github.io/large_image_wheels

      - name: Install histomics_stream
        run: |
          pip install .[tensorflow,torch,zarr]
          # With Python 3.8, tensorflow downgrades typing-extensions, which appears to
          # be unnecessary and breaks a dependency of large_image, so we overrule that
          # next.
          pip install --upgrade typing-extensions

      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics

      - name: Test with pytest
        run: |
          cd test
          pytest
        shell: bash

  build-n-publish:
    name: Build and publish Python 🐍 distributions 📦 to PyPI
    runs-on: ubuntu-20.04
    permissions:
      id-token: write
    steps:
      - uses: actions/checkout@master
      - name: Set up Python "3.9"
        uses: actions/setup-python@v6
        with:
          python-version: "3.9"
      - name: Install pypa/build
        run: >-
          python -m pip install build --user
      - name: Build a binary wheel and a source tarball
        run: >-
          python -m build --sdist --wheel --outdir dist/ .
      - name: Publish to Test PyPI
        if: github.event.repository.fork == false
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
          repository-url: https://test.pypi.org/legacy/
          skip-existing: true
      - name: Publish distribution 📦 to PyPI
        if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
--------------------------------------------------------------------------------

/histomics_stream/codecs.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

"""Whole-slide image streamer for machine learning frameworks.

The histomics_stream.codecs module supplies codecs that are useful for Zarr file storage
with jpeg or jpeg2k compression.

"""

from imagecodecs import jpeg2k_decode, jpeg2k_encode, jpeg_decode, jpeg_encode
from numcodecs.abc import Codec
from numcodecs.compat import ensure_contiguous_ndarray, ensure_ndarray, ndarray_copy
from numcodecs.registry import register_codec


class jpeg(Codec):
    """Codec providing jpeg compression via imagecodecs.

    Parameters
    ----------
    quality : int
        Compression level.

    Notes
    -----
    For the code that uses Zarr data storage for jpeg images, we need to supply codecs.
    Note that we use this codec instead of that available from the zarr_jpeg package.
    The latter collapses dimensions by default, can require us to transpose dimensions,
    and can miss optimizations based upon RGB data.

    """

    codec_id = "jpeg"

    def __init__(self, quality=100):
        self.quality = quality
        assert 0 < self.quality <= 100 and isinstance(self.quality, int)
        super().__init__()

    def encode(self, buf):
        """The method to encode a raw image into jpeg format.

        Parameters
        ----------
        buf : ndarray
            The raw image to be encoded into jpeg format

        Returns
        -------
        ndarray
            The image in jpeg format

        """

        bufa = ensure_ndarray(buf)
        assert 2 <= bufa.ndim <= 3
        return jpeg_encode(bufa, level=self.quality)

    def decode(self, buf, out=None):
        """The method to decode a jpeg image into a raw format.

        Parameters
        ----------
        buf : contiguous_ndarray
            The jpeg image to be decoded into raw format.
        out : contiguous_ndarray, optional
            Another location to write the raw image to.

        Returns
        -------
        ndarray
            The image in raw format

        """

        buf = ensure_contiguous_ndarray(buf)
        if out is not None:
            out = ensure_contiguous_ndarray(out)
        tiled = jpeg_decode(buf)
        return ndarray_copy(tiled, out)


register_codec(jpeg)


class jpeg2k(Codec):
    """Codec providing jpeg2k compression via imagecodecs.

    Parameters
    ----------
    quality : int
        Compression level.

    """

    codec_id = "jpeg2k"

    def __init__(self, quality=100):
        self.quality = quality
        assert 0 < self.quality <= 100 and isinstance(self.quality, int)
        super().__init__()

    def encode(self, buf):
        """The method to encode a raw image into jpeg2k format.

        Parameters
        ----------
        buf : ndarray
            The raw image to be encoded into jpeg2k format

        Returns
        -------
        ndarray
            The image in jpeg2k format

        """

        bufa = ensure_ndarray(buf)
        assert 2 <= bufa.ndim <= 3
        return jpeg2k_encode(bufa, level=self.quality)

    def decode(self, buf, out=None):
        """The method to decode a jpeg2k image into a raw format.

        Parameters
        ----------
        buf : contiguous_ndarray
            The jpeg2k image to be decoded into raw format.
        out : contiguous_ndarray, optional
            Another location to write the raw image to.

        Returns
        -------
        ndarray
            The image in raw format

        """

        buf = ensure_contiguous_ndarray(buf)
        if out is not None:
            out = ensure_contiguous_ndarray(out)
        tiled = jpeg2k_decode(buf)
        return ndarray_copy(tiled, out)


register_codec(jpeg2k)
--------------------------------------------------------------------------------
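One way the codecs above can be used: a sketch assuming the zarr 2.x API, where a numcodecs `Codec` instance is passed as `compressor`; the shape, chunking, and quality below are illustrative.

```python
import numpy as np
import zarr

from histomics_stream.codecs import jpeg  # importing the module registers the codec

# Store an RGB image chunk-by-chunk with (lossy) jpeg compression.
data = np.random.randint(0, 256, (1024, 1024, 3), dtype=np.uint8)
z = zarr.array(data, chunks=(256, 256, 3), compressor=jpeg(quality=90))

# Reading decodes through jpeg.decode; values are close to, but not exactly,
# the originals because jpeg at quality 90 is lossy.
print(z[:].shape)  # (1024, 1024, 3)
```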
/test/test_mask.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================


def test_mask_threshold():
    import histomics_stream as hs
    import os
    import pooch

    wsi_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.svs",
        url=(
            "https://drive.usercontent.google.com/download"
            "?export=download"
            "&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV"
            "&confirm=t"
        ),
        known_hash="d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Have {wsi_path}")

    # download binary mask image
    mask_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.mask.png",
        url=(
            "https://drive.usercontent.google.com/download"
            "?export=download"
            "&id=17GOOHbL8Bo3933rdIui82akr7stbRfta"
            "&confirm=t"
        ),
        known_hash="bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Have {mask_path}")

    my_study = dict(
        version="version-1",
        number_pixel_columns_for_tile=5471,
        number_pixel_rows_for_tile=5743,
        overlap_width=127,
        overlap_height=101,
        slides=dict(
            Slide_0=dict(
                filename=wsi_path,
                slide_name=os.path.splitext(os.path.split(wsi_path)[1])[0],
                slide_group="test_mask_threshold",
                chunk_width=31,
                chunk_height=37,
            )
        ),
    )
    find_slide_resolution = hs.configure.FindResolutionForSlide(
        my_study, target_magnification=20, magnification_source="native"
    )
    for slide in my_study["slides"].values():
        find_slide_resolution(slide)

    tiler_thresholds = (0.00, 0.20, 0.50, 0.80, 1.00)
    tilers = [
        hs.configure.TilesByGridAndMask(
            my_study,
            mask_filename=mask_path,
            mask_threshold=threshold,
            number_pixel_overlap_rows_for_tile=101,
            number_pixel_overlap_columns_for_tile=127,
        )
        for threshold in tiler_thresholds
    ]

    def run_tiler(study, tiler):
        for slide in study["slides"].values():
            tiler(slide)
        return [
            (
                value["filename"],
                [
                    (tile["tile_top"], tile["tile_left"])
                    for tile in value["tiles"].values()
                ],
            )
            for value in study["slides"].values()
        ]

    found_tiles = [run_tiler(my_study, tiler) for tiler in tilers]

    # print(f"    expected_tiles = {repr(found_tiles)}")
    expected_tiles = [
        [
            (
                wsi_path,
                [(0, 10688), (0, 16032), (0, 21376)]
                + [(5642, 5344), (5642, 10688), (5642, 16032), (5642, 21376)]
                + [(11284, 5344), (11284, 10688), (11284, 16032), (11284, 21376)],
            )
        ],
        [
            (
                wsi_path,
                [(0, 16032), (0, 21376)]
                + [(5642, 5344), (5642, 10688), (5642, 16032), (5642, 21376)]
                + [(11284, 5344), (11284, 10688), (11284, 16032), (11284, 21376)],
            )
        ],
        [
            (
                wsi_path,
                [(0, 16032), (0, 21376)]
                + [(5642, 10688), (5642, 16032), (5642, 21376)]
                + [(11284, 10688), (11284, 16032)],
            )
        ],
        [(wsi_path, [(5642, 10688), (5642, 16032), (11284, 10688), (11284, 16032)])],
        [(wsi_path, [(5642, 16032), (11284, 16032)])],
    ]

    # Raising the mask threshold can only remove tiles, so each threshold's
    # selection must be a subset of the previous (lower) threshold's selection.
    for i in range(len(found_tiles) - 1):
        assert set(found_tiles[i + 1][0][1]).issubset(set(found_tiles[i][0][1]))
    for i in range(len(found_tiles)):
        assert found_tiles[i] == expected_tiles[i]
    print("Test succeeded")


if __name__ == "__main__":
    test_mask_threshold()
--------------------------------------------------------------------------------

/test/test_create_study.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================


def test_create_study():
    """
    Purpose: Exercise the basic steps for creating a study dict, which is the precursor
    step to creating a dataset/dataloader for a machine learning framework such as
    TensorFlow or Torch.
    """
    import copy
    import histomics_stream as hs

    # Create a study and insert study-wide information
    my_study0 = {"version": "version-1"}
    my_study0["tile_height"] = 256
    my_study0["tile_width"] = 256
    my_slides = my_study0["slides"] = {}

    # Add a slide to the study, including slide-wide information with it.
    my_slide0 = my_slides["Slide_0"] = {}
    my_slide0["filename"] = (
        "/tf/notebooks/histomics_stream/example/"
        "TCGA-BH-A0BZ-01Z-00-DX1.45EB3E93-A871-49C6-9EAE-90D98AE01913.svs"
    )
    my_slide0["slide_name"] = "TCGA-BH-A0BZ-01Z-00-DX1"
    my_slide0["slide_group"] = "TCGA-BH-A0BZ"
    my_slide0["chunk_height"] = 2048
    my_slide0["chunk_width"] = 2048

    if False:
        # For each slide, find the appropriate resolution given the
        # target_magnification and magnification_source. In this example, we use
        # the same parameters for each slide, but this is not required generally.
        find_slide_resolution = hs.configure.FindResolutionForSlide(
            my_study0, target_magnification=20, magnification_source="native"
        )
        for slide in my_study0["slides"].values():
            find_slide_resolution(slide)
    else:
        # Because we don't actually have the image available, make up some numbers.
        my_slide0["level"] = 0
        my_slide0["factor"] = 0.5
        my_slide0["slide_width"] = 85047
        my_slide0["slide_height"] = 112334

    # We are going to demonstrate several approaches to choosing tiles. Each approach
    # will start with its own copy of the my_study0 that we have built so far.

    # Demonstrate TilesByGridAndMask without a mask
    my_study_by_grid = copy.deepcopy(my_study0)
    tiles_by_grid = hs.configure.TilesByGridAndMask(
        my_study_by_grid, overlap_height=32, overlap_width=32, randomly_select=100
    )
    # We could apply this to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in my_study_by_grid["slides"].values():
        tiles_by_grid(slide)

    if False:
        # Skip this test for now because we don't have the mask file available.
        # Demonstrate TilesByGridAndMask with a mask
        my_study_by_grid_and_mask = copy.deepcopy(my_study0)
        tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
            my_study_by_grid_and_mask,
            overlap_height=0,
            overlap_width=0,
            mask_filename="/tf/notebooks/histomics_stream/example/"
            "TCGA-BH-A0BZ-01Z-00-DX1.45EB3E93-A871-49C6-9EAE-90D98AE01913-mask.png",
            randomly_select=100,
        )
        # We could apply this to a subset of the slides, but we will apply it to all
        # slides in this example.
        for slide in my_study_by_grid_and_mask["slides"].values():
            tiles_by_grid_and_mask(slide)

    # Demonstrate TilesByList
    my_study_by_list = copy.deepcopy(my_study0)
    tiles_by_list = hs.configure.TilesByList(
        my_study_by_list,
        randomly_select=5,
        tiles_dictionary=my_study_by_grid["slides"]["Slide_0"]["tiles"],
    )
    # We could apply this to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in my_study_by_list["slides"].values():
        tiles_by_list(slide)

    # Demonstrate TilesRandomly
    my_study_randomly = copy.deepcopy(my_study0)
    tiles_randomly = hs.configure.TilesRandomly(my_study_randomly, randomly_select=3)
    # We could apply this to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in my_study_randomly["slides"].values():
        tiles_randomly(slide)

    # The next step would be creating a dataset/dataloader for a machine learning
    # framework such as TensorFlow or Torch. However, we will not do that in this test.


if __name__ == "__main__":
    test_create_study()
--------------------------------------------------------------------------------

/example/performance-EfficientNetV2S.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

import os
import time

import pooch
import tensorflow as tf

import histomics_stream as hs
import histomics_stream.tensorflow


"""
This is a script that is used to make timings of histomics_stream. To some extent, it
may be specific to the computer / docker image it is used with and may need minor
tweaks to run on another computer.
"""

"""
# If you've just started a fresh docker container you may need some of this:
apt update ; apt install -y git emacs ; \
rm -rf /.local ; \
pip install -U pip setuptools wheel ; \
pip install \
    'batchbald_redux' \
    'black[jupyter]' \
    'large_image[openslide,tiff]' \
    'nbformat>=5.2.0' \
    'pooch' \
    'protobuf<3.20' \
    'tensorflow_datasets' \
    'torch==1.12.1+cu113' \
    '/tf/notebooks/histomics_stream' \
    --extra-index-url https://download.pytorch.org/whl/cu113 \
    --find-links https://girder.github.io/large_image_wheels
"""


def get_data():
    start_time = time.time()
    wsi_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.svs",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV"
        "&confirm=t"
        "&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c"
        "&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632",
        known_hash="d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {wsi_path} in {time.time() - start_time}s", flush=True)

    # download binary mask image
    start_time = time.time()
    mask_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.mask.png",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=17GOOHbL8Bo3933rdIui82akr7stbRfta",
        known_hash="bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {mask_path} in {time.time() - start_time}s", flush=True)
    return wsi_path, mask_path


class WrappedModel(tf.keras.Model):
    def __init__(self, unwrapped_model, *args, **kwargs):
        super(WrappedModel, self).__init__(*args, **kwargs)
        self.unwrapped_model = unwrapped_model

    def call(self, element):
        return self.unwrapped_model(element[0]), element[1]


def normalize_img(image, label):
    """Normalizes images: `uint8` -> `float32`."""
    return tf.cast(image, tf.float32) / 255.0, label


def build_model(training_batch, epochs):
    start_time = time.time()
    unwrapped_model = tf.keras.applications.efficientnet_v2.EfficientNetV2S(
        include_top=False, weights="imagenet", input_shape=(224, 224, 3), pooling="avg"
    )
    unwrapped_model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )
    # unwrapped_model.fit(ds_train, epochs=epochs, validation_data=ds_test)

    wrapped_model = WrappedModel(unwrapped_model)

    print(f"Finished model in {time.time() - start_time}s", flush=True)
    return unwrapped_model, wrapped_model


def create_study(wsi_path, mask_path, chunk_size):
    start_time = time.time()
    slide_name = os.path.splitext(os.path.split(wsi_path)[1])[0]
    slide_group = "Group 3"

    study = dict(
        version="version-1",
        tile_height=224,
        tile_width=224,
        overlap_height=0,
        overlap_width=0,
        slides=dict(
            Slide_0=dict(
                filename=wsi_path,
                slide_name=slide_name,
                slide_group=slide_group,
                chunk_height=chunk_size,
                chunk_width=chunk_size,
            )
        ),
    )

    find_slide_resolution = hs.configure.FindResolutionForSlide(
        study, target_magnification=20, magnification_source="exact"
    )
    tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
        study, mask_filename=mask_path
    )
    # We could apply these to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in study["slides"].values():
        find_slide_resolution(slide)
        tiles_by_grid_and_mask(slide)
    print(f"Masked study in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    create_tensorflow_dataset = hs.tensorflow.CreateTensorFlowDataset()
    tiles = create_tensorflow_dataset(study, num_workers=1, worker_index=0)
    print(f"#tiles = {len(create_tensorflow_dataset.get_tiles(study)[0][1])}")
    print(f"Chunked study in {time.time() - start_time}s", flush=True)

    return study, tiles


def predict(take_predictions, prediction_batch, model, tiles):
    start_time = time.time()
    tiles = tiles.batch(prediction_batch)
    if take_predictions > 0:
        predictions = model.predict(
            tiles.take(1 + (take_predictions - 1) // prediction_batch)
        )
    else:
        predictions = model.predict(tiles)
    print(f"predictions[0].shape = {predictions[0].shape}")
    print(f"Made predictions in {time.time() - start_time}s", flush=True)
    return predictions


if True:
    gpus = [gpu.name for gpu in tf.config.list_logical_devices("GPU")]
    print(f"gpus = {repr(gpus)}")

# if __name__ == "__main__":
with tf.device(gpus[0]):
    device = "gpu" if True else "cpu"
    print(f"***** device = {device} *****")
    training_batch = 2**7
    num_epochs = 6
    take_predictions = 2**10 if False else 0

    wsi_path, mask_path = get_data()
    unwrapped_model, model = build_model(training_batch, num_epochs)

    for prediction_batch in [2**j for j in range(5, 11)]:
        for chunk_size in [256] + [2**j for j in range(8, 14)]:
            print(
                f"***** chunk_size = {chunk_size},"
                f" prediction_batch = {prediction_batch},"
                f" take_predictions = {take_predictions} *****",
                flush=True,
            )
            study, tiles = create_study(wsi_path, mask_path, chunk_size)
            predictions = predict(take_predictions, prediction_batch, model, tiles)
    print(f"***** Finished with device = {device} *****")
--------------------------------------------------------------------------------
/example/performance-detection.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

import os
import time

import pooch
import tensorflow as tf

import histomics_stream as hs
import histomics_stream.tensorflow


"""
This is a script that is used to make timings of histomics_stream. To some extent, it
may be specific to the computer / docker image it is used with and may need minor
tweaks to run on another computer.
"""

"""
# If you've just started a fresh docker container you may need some of this:
apt update ; apt install -y git emacs ; \
rm -rf /.local ; \
pip install -U pip setuptools wheel ; \
pip install \
    'batchbald_redux' \
    'black[jupyter]' \
    'large_image[openslide,tiff]' \
    'nbformat>=5.2.0' \
    'pooch' \
    'protobuf<3.20' \
    'tensorflow_datasets' \
    'torch==1.12.1+cu113' \
    '/tf/notebooks/histomics_stream' \
    '/tf/notebooks/histomics_detect' \
    --extra-index-url https://download.pytorch.org/whl/cu113 \
    --find-links https://girder.github.io/large_image_wheels
"""


def get_data():
    start_time = time.time()
    wsi_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.svs",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV"
        "&confirm=t"
        "&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c"
        "&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632",
        known_hash="d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {wsi_path} in {time.time() - start_time}s", flush=True)

    # download binary mask image
    start_time = time.time()
    mask_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.mask.png",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=17GOOHbL8Bo3933rdIui82akr7stbRfta",
        known_hash="bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {mask_path} in {time.time() - start_time}s", flush=True)
    return wsi_path, mask_path


class WrappedModel(tf.keras.Model):
    def __init__(self, unwrapped_model, *args, **kwargs):
        super(WrappedModel, self).__init__(*args, **kwargs)
        self.unwrapped_model = unwrapped_model

    def call(self, element):
        return self.unwrapped_model(element[0]), element[1]


def build_model():
    start_time = time.time()
    model_path = pooch.retrieve(
        fname="tcga_brca_model",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=1KxB6iAn9j2Wp7oyFlV4T1Kli-mR8-35G"
        "&confirm=t"
        "&uuid=c5df8dfd-ed48-4cef-81a0-19df97677fe5"
        "&at=ALgDtswWzs0BEdkVNgFrp83p9NDO:1679111246793",
        known_hash="b5b5444cc8874d17811a89261abeafd9b9603e7891a8b2a98d8f13e2846a6689",
        path=str(pooch.os_cache("pooch")) + os.sep + "model",
        processor=pooch.Unzip(),
    )
    model_path = os.path.split(model_path[0])[0]
    print(f"Have {model_path}.")

    # restore keras model
    from histomics_detect.models import FasterRCNN

    model = tf.keras.models.load_model(
        model_path, custom_objects={"FasterRCNN": FasterRCNN}
    )

    unwrapped_model = model
    model = WrappedModel(unwrapped_model)

    print(f"Finished model in {time.time() - start_time}s", flush=True)
    return unwrapped_model, model


def create_study(wsi_path, mask_path, chunk_size):
    start_time = time.time()
    slide_name = os.path.splitext(os.path.split(wsi_path)[1])[0]
    slide_group = "Group 3"

    study = dict(
        version="version-1",
        tile_height=256,
        tile_width=256,
        overlap_height=64,
        overlap_width=64,
        slides=dict(
            Slide_0=dict(
                filename=wsi_path,
                slide_name=slide_name,
                slide_group=slide_group,
                chunk_height=chunk_size,
                chunk_width=chunk_size,
            )
        ),
    )

    find_slide_resolution = hs.configure.FindResolutionForSlide(
        study, target_magnification=20, magnification_source="exact"
    )
    tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
        study, mask_filename=mask_path
    )
    # We could apply these to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in study["slides"].values():
        find_slide_resolution(slide)
        tiles_by_grid_and_mask(slide)
    print(f"Masked study in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    create_tensorflow_dataset = hs.tensorflow.CreateTensorFlowDataset()
    tiles = create_tensorflow_dataset(study, num_workers=1, worker_index=0)
    print(f"#tiles = {len(create_tensorflow_dataset.get_tiles(study)[0][1])}")
    print(f"Chunked study in {time.time() - start_time}s", flush=True)

    return study, tiles


def predict(take_predictions, prediction_batch, model, tiles):
    start_time = time.time()
    tiles = tiles.batch(prediction_batch)
    if take_predictions > 0:
        predictions = model.predict(
            tiles.take(1 + (take_predictions - 1) // prediction_batch)
        )
    else:
        predictions = model.predict(tiles)
    print(f"predictions[0].shape = {predictions[0].shape}")
    print(f"Made predictions in {time.time() - start_time}s", flush=True)
    return predictions


if True:
    gpus = [gpu.name for gpu in tf.config.list_logical_devices("GPU")]
    print(f"gpus = {repr(gpus)}")

# if __name__ == "__main__":
with tf.device(gpus[0]):
    device = "cuda"
    print(f"***** device = {device} *****")
    take_predictions = 2**17 if False else 0
    wsi_path, mask_path = get_data()
    unwrapped_model, model = build_model()

    for prediction_batch in (1,):
        for chunk_size in [256] + [2**j for j in range(8, 14)]:
            print(
                f"***** chunk_size = {chunk_size},"
                f" prediction_batch = {prediction_batch},"
                f" take_predictions = {take_predictions} *****",
                flush=True,
            )
            study, tiles = create_study(wsi_path, mask_path, chunk_size)
            predictions = predict(take_predictions, prediction_batch, model, tiles)
    print(f"***** Finished with device = {device} *****")
--------------------------------------------------------------------------------
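An aside on the expression `tiles.take(1 + (take_predictions - 1) // prediction_batch)` used by `predict()` in these performance scripts: it is a ceiling division that takes just enough batches to cover `take_predictions` tiles. A standalone illustration with a toy dataset:

```python
import tensorflow as tf

take_predictions = 10
prediction_batch = 4

ds = tf.data.Dataset.range(100).batch(prediction_batch)
num_batches = 1 + (take_predictions - 1) // prediction_batch  # ceil(10 / 4) == 3
for batch in ds.take(num_batches):
    print(batch.numpy())  # [0 1 2 3], then [4 5 6 7], then [8 9 10 11]
```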
/example/performance-mnist.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

import os
import time

import pooch
import tensorflow as tf
import tensorflow_datasets as tfds

import histomics_stream as hs
import histomics_stream.tensorflow


"""
This is a script that is used to make timings of histomics_stream. To some extent, it
may be specific to the computer / docker image it is used with and may need minor
tweaks to run on another computer.
"""

"""
# If you've just started a fresh docker container you may need some of this:
apt update ; apt install -y git emacs ; \
rm -rf /.local ; \
pip install -U pip setuptools wheel ; \
pip install \
    'batchbald_redux' \
    'black[jupyter]' \
    'large_image[openslide,tiff]' \
    'nbformat>=5.2.0' \
    'pooch' \
    'protobuf<3.20' \
    'tensorflow_datasets' \
    'torch==1.12.1+cu113' \
    '/tf/notebooks/histomics_stream' \
    --extra-index-url https://download.pytorch.org/whl/cu113 \
    --find-links https://girder.github.io/large_image_wheels
"""


def get_data():
    start_time = time.time()
    wsi_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.svs",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV"
        "&confirm=t"
        "&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c"
        "&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632",
        known_hash="d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {wsi_path} in {time.time() - start_time}s", flush=True)

    # download binary mask image
    start_time = time.time()
    mask_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.mask.png",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=17GOOHbL8Bo3933rdIui82akr7stbRfta",
        known_hash="bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {mask_path} in {time.time() - start_time}s", flush=True)
    return wsi_path, mask_path


class WrappedModel(tf.keras.Model):
    def __init__(self, model, *args, **kwargs):
        super(WrappedModel, self).__init__(*args, **kwargs)
        self.model = model

    def call(self, element):
        # Use just the red channel of the color image
        return (self.model(element[0][..., 0]), element[1])


def normalize_img(image, label):
    """Normalizes images: `uint8` -> `float32`."""
    return tf.cast(image, tf.float32) / 255.0, label


def build_model(training_batch, epochs):
    start_time = time.time()
    (ds_train, ds_test), ds_info = tfds.load(
        "mnist",
        split=["train", "test"],
        shuffle_files=True,
        as_supervised=True,
        with_info=True,
    )
    print(f"Finished tfds.load in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    ds_train = ds_train.map(normalize_img, num_parallel_calls=tf.data.AUTOTUNE)
    ds_train = ds_train.cache()
    ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples)
    ds_train = ds_train.batch(training_batch)
    ds_train = ds_train.prefetch(tf.data.AUTOTUNE)
    ds_test = ds_test.map(normalize_img, num_parallel_calls=tf.data.AUTOTUNE)
    ds_test = ds_test.batch(training_batch)
    ds_test = ds_test.cache()
    ds_test = ds_test.prefetch(tf.data.AUTOTUNE)
    print(f"Finished (ds_train, ds_test) in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    model = tf.keras.models.Sequential(
        [
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(10),
        ]
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )
    model.fit(ds_train, epochs=epochs, validation_data=ds_test)

    unwrapped_model = model
    model = WrappedModel(unwrapped_model)

    print(f"Finished model in {time.time() - start_time}s", flush=True)
    return unwrapped_model, model


def create_study(wsi_path, mask_path, chunk_size):
    start_time = time.time()
    slide_name = os.path.splitext(os.path.split(wsi_path)[1])[0]
    slide_group = "Group 3"

    study = dict(
        version="version-1",
        tile_height=28,
        tile_width=28,
        overlap_height=14,
        overlap_width=14,
        slides=dict(
            Slide_0=dict(
                filename=wsi_path,
                slide_name=slide_name,
                slide_group=slide_group,
                chunk_height=chunk_size,
                chunk_width=chunk_size,
            )
        ),
    )

    find_slide_resolution = hs.configure.FindResolutionForSlide(
        study, target_magnification=20, magnification_source="exact"
    )
    tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
        study, mask_filename=mask_path
    )
    # We could apply these to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in study["slides"].values():
        find_slide_resolution(slide)
        tiles_by_grid_and_mask(slide)
    print(f"Masked study in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    create_tensorflow_dataset = hs.tensorflow.CreateTensorFlowDataset()
    tiles = create_tensorflow_dataset(study, num_workers=1, worker_index=0)
    print(f"#tiles = {len(create_tensorflow_dataset.get_tiles(study)[0][1])}")
    print(f"Chunked study in {time.time() - start_time}s", flush=True)

    return study, tiles


def predict(take_predictions, prediction_batch, model, tiles):
    start_time = time.time()
    tiles = tiles.batch(prediction_batch)
    if take_predictions > 0:
        predictions = model.predict(
            tiles.take(1 + (take_predictions - 1) // prediction_batch)
        )
    else:
        predictions = model.predict(tiles)
    print(f"predictions[0].shape = {predictions[0].shape}")
    print(f"Made predictions in {time.time() - start_time}s", flush=True)
    return predictions


if True:
    gpus = [gpu.name for gpu in tf.config.list_logical_devices("GPU")]
    print(f"gpus = {repr(gpus)}")

# if __name__ == "__main__":
with tf.device(gpus[0]):
    device = "cuda"
    print(f"***** device = {device} *****")
    training_batch = 2**7
    num_epochs = 6
    take_predictions = 2**17 if False else 0

    wsi_path, mask_path = get_data()
    unwrapped_model, model = build_model(training_batch, num_epochs)

    for prediction_batch in [2**j for j in range(5, 11)]:
        for chunk_size in [28] + [2**j for j in range(6, 14)]:
            print(
                f"***** chunk_size = {chunk_size},"
                f" prediction_batch = {prediction_batch},"
                f" take_predictions = {take_predictions} *****",
                flush=True,
            )
            study, tiles = create_study(wsi_path, mask_path, chunk_size)
            predictions = predict(take_predictions, prediction_batch, model, tiles)
    print(f"***** Finished with device = {device} *****")
--------------------------------------------------------------------------------

/example/performance-EfficientNet_V2_S_Weights.IMAGENET1K_V1.py:
--------------------------------------------------------------------------------
# =========================================================================
#
# Copyright NumFOCUS
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# =========================================================================

import argparse
import itertools
import os
import time

import pooch
import torch
import torchvision

import histomics_stream as hs
import histomics_stream.pytorch


"""
This is a script that is used to make timings of histomics_stream. To some extent, it
may be specific to the computer / docker image it is used with and may need minor
tweaks to run on another computer.
"""

"""
# If you've just started a fresh docker container you may need some of this:
apt update ; apt install -y git emacs ; \
rm -rf /.local ; \
pip install -U pip setuptools wheel pillow ; \
pip install \
    'black[jupyter]' \
    'large_image[openslide,tiff]' \
    'monai[pillow,tqdm,ignite,gdown]' \
    'nbformat>=5.2.0' \
    'pooch' \
    'protobuf' \
    '/tf/notebooks/histomics_stream' \
    --find-links https://girder.github.io/large_image_wheels
"""


def get_data():
    start_time = time.time()
    wsi_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.svs",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV"
        "&confirm=t"
        "&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c"
        "&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632",
        known_hash="d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {wsi_path} in {time.time() - start_time}s", flush=True)

    # download binary mask image
    start_time = time.time()
    mask_path = pooch.retrieve(
        fname="TCGA-AN-A0G0-01Z-00-DX1.mask.png",
        url="https://drive.google.com/uc"
        "?export=download"
        "&id=17GOOHbL8Bo3933rdIui82akr7stbRfta",
        known_hash="bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171",
        path=str(pooch.os_cache("pooch")) + os.sep + "wsi",
    )
    print(f"Retrieved {mask_path} in {time.time() - start_time}s", flush=True)
    return wsi_path, mask_path


class WrappedModel(torch.nn.modules.module.Module):
    def __init__(self, model, preprocess_fn, *args, device="cuda", **kwargs):
        super(WrappedModel, self).__init__(*args, **kwargs)
        self.device = torch.device(device)
        self.model = model.to(self.device)
        self.preprocess_fn = preprocess_fn.to(self.device)

    def forward(self, x):
        p = self.model(self.preprocess_fn(x[0].to(self.device)))
        return p, x[1]


def build_model(device="cuda"):
    start_time = time.time()
    # print(f"available_models = {repr(sorted(torchvision.models.list_models()))}")
    weights = torchvision.models.EfficientNet_V2_S_Weights.DEFAULT
    model = torchvision.models.efficientnet_v2_s(weights=weights)
    _ = model.eval()
    preprocess_fn = weights.transforms()

    unwrapped_model = model
    model = WrappedModel(unwrapped_model, preprocess_fn, device=device).to(device)

    print(f"Finished model in {time.time() - start_time}s", flush=True)
    return unwrapped_model, model


def create_study(wsi_path, mask_path, chunk_size):
    start_time = time.time()
    slide_name = os.path.splitext(os.path.split(wsi_path)[1])[0]
    slide_group = "Group 3"

    study = dict(
        version="version-1",
        tile_height=224,
        tile_width=224,
        overlap_height=0,
        overlap_width=0,
        slides=dict(
            Slide_0=dict(
                filename=wsi_path,
                slide_name=slide_name,
                slide_group=slide_group,
                chunk_height=chunk_size,
                chunk_width=chunk_size,
            )
        ),
    )

    find_slide_resolution = hs.configure.FindResolutionForSlide(
        study, target_magnification=20, magnification_source="exact"
    )
    tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
        study, mask_filename=mask_path
    )
    # We could apply these to a subset of the slides, but we will apply it to all slides
    # in this example.
    for slide in study["slides"].values():
        find_slide_resolution(slide)
        tiles_by_grid_and_mask(slide)
    print(f"Masked study in {time.time() - start_time}s", flush=True)

    start_time = time.time()
    create_torch_dataloader = hs.pytorch.CreateTorchDataloader()
    tiles = create_torch_dataloader(study)
    print(f"#tiles = {len(create_torch_dataloader.get_tiles(study)[0][1])}")
    print(f"Chunked study in {time.time() - start_time}s", flush=True)
    return study, tiles


def show_structure(x):
    if isinstance(x, list):
        if len(x) > 0:
            return f"[{len(x)} of {show_structure(x[0])}]"
        else:
            return repr(list())
    if isinstance(x, tuple):
        if len(x) > 0:
            return f"({len(x)} of {show_structure(x[0])})"
        else:
            return repr(tuple())
    if isinstance(x, set):
        if len(x) > 0:
            return f"{{{len(x)} of {show_structure(next(iter(x)))}}}"
        else:
            return repr(set())
    if isinstance(x, dict):
        if len(x) > 0:
            return f"{{{len(x)} of {show_structure(next(iter(x.keys())))}: {show_structure(next(iter(x.values())))}}}"
        else:
            return repr(dict())
    return repr(type(x))


"""
!!! Probably we should be using torch.utils.data.DataLoader batch_size option instead of
!!! this batched() function.
"""


def batched(iterable, batch_size):
    """
    Batch data into lists of length batch_size. The last batch may be shorter:
    batched('ABCDEFG', 3) --> ABC DEF G
    """
    iterator = iter(iterable)
    # !!! Can we get rid of `list` here and a few lines below?  It is used so that we
    # !!! can detect an empty list with `while`.
    batch = list(itertools.islice(iterator, batch_size))
    while batch:
        # Yield `batch` in such a way that this iterator does not keep a reference count
        # for it.
        batch_in_list = [batch]
        del batch
        yield batch_in_list.pop()
        batch = list(itertools.islice(iterator, batch_size))


def predict_and_detach(model, item):
    predict = model(item)
    return predict[0].detach().cpu().numpy(), predict[1]


def predict(take_predictions, prediction_batch, model, tiles):
    start_time = time.time()
    if take_predictions > 0:
        tiles = itertools.islice(tiles, take_predictions)
    batched_tiles = (
        batched(tiles, prediction_batch) if prediction_batch > 0 else [tiles]
    )
    predictions = list()
    for batch in batched_tiles:
        batch_predictions = [predict_and_detach(model, item) for item in batch]
        predictions.extend(batch_predictions)
        del batch_predictions, batch
    print(f"Made predictions in {time.time() - start_time}s", flush=True)
    return predictions


def create_and_predict(
    wsi_path, mask_path, chunk_size, take_predictions, prediction_batch, model
):
    study, tiles = create_study(wsi_path, mask_path, chunk_size)
    predictions = predict(take_predictions, prediction_batch, model, tiles)
    print(f"show_structure(predictions) = {show_structure(predictions)}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("device")
    args = parser.parse_args()
    # device = "cuda" if True else "cpu"
    device = args.device
    print(f"***** device = {device} *****")
    take_predictions = 2**8 if True else 0

    wsi_path, mask_path = get_data()
    unwrapped_model, model = build_model(device=device)

    # for prediction_batch in [2**j for j in range(0, 6)]:
    for prediction_batch in [0]:
        for chunk_size in [1024] + [2**j for j in range(8, 14)]:
            print(
                f"***** chunk_size = {chunk_size},"
                f" prediction_batch = {prediction_batch},"
                f" take_predictions = {take_predictions} *****",
                flush=True,
            )
            create_and_predict(
                wsi_path,
                mask_path,
                chunk_size,
                take_predictions,
                prediction_batch,
                model,
            )
    print(f"***** Finished with device = {device} *****")
--------------------------------------------------------------------------------
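Regarding the `!!!` note above about `torch.utils.data.DataLoader`'s `batch_size` option: here is a minimal sketch of that alternative, using a hypothetical stand-in dataset rather than the real one behind `hs.pytorch.CreateTorchDataloader`.

```python
import torch
from torch.utils.data import DataLoader, IterableDataset


class TileStream(IterableDataset):
    """Hypothetical stand-in that yields (tile, label) pairs like the real stream."""

    def __init__(self, num_tiles):
        self.num_tiles = num_tiles

    def __iter__(self):
        for i in range(self.num_tiles):
            yield torch.zeros(3, 224, 224), i  # fake RGB tile plus a label


# DataLoader performs the batching that batched() above does by hand; the last
# batch is simply shorter when num_tiles is not a multiple of batch_size.
loader = DataLoader(TileStream(10), batch_size=4)
for tiles, labels in loader:
    print(tiles.shape, labels)  # torch.Size([4, 3, 224, 224]) for full batches
```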
If overlap_height is not supplied, it is set to zero. Zero indicates that there is no overlap between adjacent tiles; they are abutting.
19 | + ***overlap_width*** (int):
20 |   Specifies the desired amount of horizontal overlap between adjacent tiles, measured in pixels using the `target_magnification` (described below). If overlap_width is not supplied, it is set to zero. Zero indicates that there is no overlap between adjacent tiles; they are abutting.
21 | + ***slides*** (Python dict):
22 |   Contains information about the study's slides. The distinct keys of this Python dict are set by the user for their own convenience, one per slide.
23 |   + *user-selected key for slide* (Python dict):
24 |     Contains information about this slide. The keys and values for this Python dict are:
25 |     + ***filename*** (str):
26 |       The path to the file containing the pixel data for this slide.
27 |     + ***slide_name*** (str):
28 |       A user-supplied name for this slide.
29 |     + ***slide_group*** (str):
30 |       A user-supplied name for the group to which this slide belongs.
31 |     + ***chunk_height*** (int):
32 |       For read efficiency, how high a chunk of data read in a single read operation should be, measured in pixels using the `target_magnification` (described below).
33 |     + ***chunk_width*** (int):
34 |       For read efficiency, how wide a chunk of data read in a single read operation should be, measured in pixels using the `target_magnification` (described below).
35 |     + **target_magnification** (float):
36 |       The image magnification that the user wishes to use for the slide, if available given other restrictions. A value of 10 corresponds to a pixel resolution of approximately 1 micron; magnification 40 is approximately 0.25 microns per pixel.
37 |     + **scan_magnification** (float):
38 |       The highest magnification directly available from the file storing the image.
39 |     + **read_magnification** (float):
40 |       The magnification directly read from the file storing the image. This will be the smallest magnification directly available that is at least as large as the `target_magnification` if `magnification_source in ("exact", "native")`; it will be the `scan_magnification` if `magnification_source="scan"` is selected.
41 |     + **returned_magnification** (float):
42 |       The magnification of the pixel data returned by `histomics_stream`. This will be the `target_magnification` if `magnification_source="exact"` is selected; it will be the `read_magnification` if `magnification_source="native"` is selected; it will be the `scan_magnification` if `magnification_source="scan"` is selected.
43 |     + **level** (float):
44 |       The internal `large_image` level that defines the `returned_magnification`.
45 |     + **slide_width** (int):
46 |       How wide is the slide, measured in pixels using the `target_magnification` (described above).
47 |     + **slide_height** (int):
48 |       How high is the slide, measured in pixels using the `target_magnification` (described above).
49 |     + **slide_height_tiles** (int):
50 |       How many (possibly overlapping) tiles fit into the height of the slide.
51 |     + **slide_width_tiles** (int):
52 |       How many (possibly overlapping) tiles fit into the width of the slide.
53 |     + **mask_height** (int):
54 |       If a mask is supplied, this is the mask's height at its scan resolution.
55 |     + **mask_width** (int):
56 |       If a mask is supplied, this is the mask's width at its scan resolution.
57 |     + **tiles** (Python dict):
58 |       Contains information about the slide's tiles. The keys of this Python dict are set by the user for their own convenience, one per tile.
59 |       + *user-selected key for tile* (Python dict):
60 |         Contains information about this tile. The keys and values for this Python dict are:
61 |         + **tile_top** (int):
62 |           The index of the top row of the tile, where 0 is the top row of the slide, measured in pixels using the `target_magnification` (described above).
63 |         + **tile_left** (int):
64 |           The index of the leftmost column of the tile, where 0 is the leftmost column of the slide, measured in pixels using the `target_magnification` (described above).
65 |     + **chunks** (Python dict):
66 |       Contains information about the slide's read chunks. The keys of this Python dict are set by `histomics_stream` for its own convenience, one per chunk.
67 |       + *key for chunk* (Python dict):
68 |         Contains information about this chunk. The keys and values for this Python dict are:
69 |         + **chunk_top** (int):
70 |           The index of the top row of the chunk, where 0 is the top row of the slide, measured in pixels using the `target_magnification` (described above).
71 |         + **chunk_left** (int):
72 |           The index of the leftmost column of the chunk, where 0 is the leftmost column of the slide, measured in pixels using the `target_magnification` (described above).
73 |         + **chunk_bottom** (int):
74 |           The index of the bottom row of the chunk, where 0 is the top row of the slide, measured in pixels using the `target_magnification` (described above).
75 |         + **chunk_right** (int):
76 |           The index of the rightmost column of the chunk, where 0 is the leftmost column of the slide, measured in pixels using the `target_magnification` (described above).
77 |         + **tiles** (Python dict):
78 |           The tiles that will be read together when this chunk is read; `chunk["tiles"][tile_key]` is a reference to the corresponding `slide["tiles"][tile_key]` value.
79 | 
80 | ## Arguments for `histomics_stream` function objects
81 | 
82 | + ***target_magnification*** (float):
83 |   The image magnification that the user wishes to use for the slide, if available given other restrictions. A value of 10 corresponds to a pixel resolution of approximately 1 micron; magnification 40 is approximately 0.25 microns per pixel.
84 | 
85 | + ***magnification_source*** (str in ["scan", "native", "exact"]):
86 |   "scan" will produce tiles from the highest magnification available. This is typically the slide scanner's objective magnification.
87 | 
88 |   "native" will produce tiles from the nearest available magnification equal to or greater than target_magnification (within a 2% tolerance). The "native" option is useful when you want to handle resizing of tiles to target_magnification on your own.
89 | 
90 |   "exact" will produce tiles using the "native" option and then resize these tiles to match target_magnification. Resizing is handled by PIL using the Lanczos antialiasing filter since the resizing shrinks the tile by definition.
91 | 
92 |   For either "scan" or "native", the size of the read and returned tiles will be (tile_height * returned_magnification / target_magnification, tile_width * returned_magnification / target_magnification). For example, with tile_height = tile_width = 256, target_magnification = 20, and returned_magnification = 40, the returned tiles are 512 × 512 pixels. For "exact" the size of the returned tiles will be (tile_height, tile_width).
93 | 
94 |   This procedure sets values in the Python dict for this slide to capture the scan, read, and returned magnification of the tiles. This is helpful, for example, to resize results to the scan magnification for visualization in HistomicsUI, or to resize between native and target magnification when using "native".
"scan_magnification" is the highest magnification from the source file; "read_magnification" is the magnification read from the source file; "returned_magnification" is the magnification of the returned tiles which is same as "read_magnification" in the case of "scan" or "native" or is the same as "target_magnification" in the case of "exact". 95 | 96 | + ***randomly_select*** (int): 97 | The number of tiles to be randomly selected from the list that would otherwise be written to the Python dict for this slide. A value of `-1` is the default and means that all tiles should be written, except that the default is `+1` for `TilesRandomly`. 98 | 99 | + ***overlap_height*** (int): 100 | Specifies the desired amount of vertical overlap between adjacent tiles, measured in pixels using the `target_magnification` (described above). If overlap_height is not supplied, it is read from the study dictionary, if available, otherwise it is set to zero. Zero indicates that there is no overlap between adjacent tiles; they are abutting. 101 | 102 | + ***overlap_width*** (int): 103 | Specifies the desired amount of horizontal overlap between adjacent tiles, measured in pixels using the `target_magnification` (described above). If overlap_width is not supplied, it is read from the study dictionary, if available, otherwise it is set to zero. Zero indicates that there is no overlap between adjacent tiles; they are abutting. 104 | 105 | + ***mask_filename*** (str): 106 | The path of the image file to be read and used as a mask. The aspect ratio of the mask (in terms of its pixel dimensions) is expected to be about the same as the aspect ratio of the main image ( in terms of its grid of tiles). A non-zero value in the mask indicates that the tile should be retained. The default is "", which means that there is no masking. 107 | 108 | + ***mask_threshold*** (float): 109 | A value in [0.0, 1.0]. A tile is retained if the fraction of the tile overlapping non-zero pixels in the mask is at least the mask_threshold. 110 | 111 | + ***tiles_dictionary*** (Python dict): 112 | For example, `{'AB234': {'tile_top': top0, 'tile_left': left0}, 'CD43': {'tile_top': top1, 'tile_left': left1}, ...}`. Tiles from this list will copied into the Python dict for this slide if they are randomly selected. 113 | -------------------------------------------------------------------------------- /histomics_stream/pytorch.py: -------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # 3 | # Copyright NumFOCUS 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0.txt 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # ========================================================================= 18 | 19 | """Whole-slide image streamer for machine learning frameworks.""" 20 | 21 | import numpy as np 22 | import torch 23 | 24 | from . import configure 25 | 26 | 27 | """ 28 | See: How to load a list of numpy arrays to pytorch dataset loader? 
29 | https://stackoverflow.com/questions/44429199/how-to-load-a-list-of-numpy-arrays-to-pytorch-dataset-loader
30 | 
31 | torchvision.transforms.ToTensor transforms a numpy array or PIL image to a torch tensor
32 | torch.LongTensor maybe stacks tensors
33 | Subclassing torch.utils.data.Dataset maybe provides something like a tensorflow
34 | dataset's interface
35 | Using a torch.utils.data.DataLoader on the torch.utils.data.Dataset subclass is maybe
36 | like creating the actual dataset.
37 | """
38 | 
39 | """
40 | See: A Comprehensive Guide to the DataLoader Class and Abstractions in PyTorch
41 | https://blog.paperspace.com/dataloaders-abstractions-pytorch/
42 | """
43 | 
44 | """
45 | Notes 5/31/2023: For multi-processing, torch seems to like a single shared
46 | torch.utils.data.IterableDataset, but one torch.utils.data.DataLoader per worker. If we
47 | are to avoid loading all workers' pixel data in each worker, the Dataset should not
48 | be loading in the pixel data, just creating the associated dictionary. There should be
49 | one dictionary per *chunk*, which includes its list of tiles, and the loading of the
50 | pixel data per chunk should somehow be deferred to the DataLoader.
51 | 
52 | If we create the dataset in an eager fashion, which may be reasonable if it is not
53 | including the pixel data, then it can instead be a (map-style rather than iterable-style)
54 | torch.utils.data.Dataset. Especially if we compute worker_index = chunk_index %
55 | num_workers as part of the annotation, it might be quite easy to use a DataLoader's
56 | `num_workers` and `sampler` parameters to direct that pixel data are read only for those
57 | chunks that belong to a given worker, at the time that the DataLoader is created.
58 | 
59 | Ultimately the goal is to have the pixel data read and predicted within a single worker
60 | before it is grouped back together to return to the user. If the above doesn't work for
61 | that, instead of using `num_workers` in the DataLoader constructor, we might
62 | explicitly use num_worker instances of a DataLoader, created using num_worker calls to
63 | torch.multiprocessing.Process(target=DataLoader, args=) or similar. These are started
64 | in one loop and then joined in another loop. See
65 | https://pytorch.org/docs/stable/notes/multiprocessing.html. We'll probably need a
66 | torch.multiprocessing.Queue to collect outputs, similarly to but not quite the same as
67 | https://teddykoker.com/2020/12/dataloader/.
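A possible sketch of the sharding idea above (an assumption, not implemented here):
record worker_index = chunk_index % num_workers on each chunk dictionary when the
study is configured, then give each worker's DataLoader a sampler that keeps only
the chunks whose worker_index matches its own, so that each worker reads pixel
data only for its own chunks.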
68 | """ 69 | 70 | 71 | class CreateTorchDataloader(configure.ChunkLocations): 72 | class MyDataset(torch.utils.data.IterableDataset, configure._TilesByCommon): 73 | def __init__(self, study_description): 74 | configure._TilesByCommon.__init__(self) 75 | torch.utils.data.IterableDataset.__init__(self) 76 | """Store in self the data or pointers to it""" 77 | # Update keys of the dictionary from deprecated names 78 | self._update_dict(study_description) 79 | for slide_description in study_description["slides"].values(): 80 | self._update_dict(slide_description) 81 | for chunk_description in slide_description["chunks"].values(): 82 | self._update_dict(chunk_description) 83 | for tile_description in chunk_description["tiles"].values(): 84 | self._update_dict(tile_description) 85 | 86 | self.study_description = study_description 87 | 88 | def __iter__(self): 89 | """Return an iterable that yields tiles=(pixel data, annotation_dict)""" 90 | 91 | def my_iterable(): 92 | """This is the iterable that we will return""" 93 | study_description = self.study_description 94 | study_dict = { 95 | # !!! Is it better to have the dictionary values be length-one 96 | # !!! lists, here and below? 97 | # !!! Or use 98 | # !!! {key: torch.from_numpy(np.array(study_description[key]))}? 99 | key: study_description[key] 100 | for key in study_description.keys() 101 | if key != "slides" 102 | } 103 | for slide_description in study_description["slides"].values(): 104 | slide_dict = { 105 | **study_dict, 106 | **{ 107 | key: slide_description[key] 108 | for key in slide_description.keys() 109 | if key not in ["tiles", "chunks"] 110 | }, 111 | } 112 | 113 | filename = slide_dict["filename"] 114 | returned_magnification = slide_dict["returned_magnification"] 115 | factor = slide_dict["target_magnification"] / returned_magnification 116 | scaled_tile_height = configure.ChunkLocations.scale_it( 117 | slide_dict["tile_height"], factor 118 | ) 119 | scaled_tile_width = configure.ChunkLocations.scale_it( 120 | slide_dict["tile_width"], factor 121 | ) 122 | 123 | for chunk_description in slide_description["chunks"].values(): 124 | chunk_dict = { 125 | **slide_dict, 126 | **{ 127 | key: chunk_description[key] 128 | for key in chunk_description.keys() 129 | if key != "tiles" 130 | }, 131 | } 132 | 133 | # Call to the superclass to get the pixel data for this chunk. 134 | # Keep only first 3 colors. Convert to np.uint8. 135 | scaled_chunk_top = configure.ChunkLocations.scale_it( 136 | chunk_dict["chunk_top"], factor 137 | ) 138 | scaled_chunk_left = configure.ChunkLocations.scale_it( 139 | chunk_dict["chunk_left"], factor 140 | ) 141 | scaled_chunk_bottom = configure.ChunkLocations.scale_it( 142 | chunk_dict["chunk_bottom"], factor 143 | ) 144 | scaled_chunk_right = configure.ChunkLocations.scale_it( 145 | chunk_dict["chunk_right"], factor 146 | ) 147 | 148 | # Use `:3` to change RGBA (if applicable) to RGB. 149 | scaled_chunk_pixels = configure.ChunkLocations.read_large_image( 150 | filename, 151 | scaled_chunk_top, 152 | scaled_chunk_left, 153 | scaled_chunk_bottom, 154 | scaled_chunk_right, 155 | returned_magnification, 156 | )[..., :3].astype(dtype=np.float32) 157 | # Color is the last/fastest dimension for images read with 158 | # large_image, but channel is the first/slowest for Torch 159 | # tensors. 
160 | scaled_chunk_pixels = np.moveaxis(scaled_chunk_pixels, -1, 0) 161 | scaled_chunk_pixels = torch.from_numpy(scaled_chunk_pixels) 162 | 163 | for tile_description in chunk_description["tiles"].values(): 164 | tile_dict = { 165 | **chunk_dict, 166 | **{ 167 | key: tile_description[key] 168 | for key in tile_description.keys() 169 | }, 170 | } 171 | scaled_tile_top = ( 172 | configure.ChunkLocations.scale_it( 173 | tile_dict["tile_top"], factor 174 | ) 175 | - scaled_chunk_top 176 | ) 177 | scaled_tile_left = ( 178 | configure.ChunkLocations.scale_it( 179 | tile_dict["tile_left"], factor 180 | ) 181 | - scaled_chunk_left 182 | ) 183 | scaled_tile_bottom = scaled_tile_top + scaled_tile_height 184 | scaled_tile_right = scaled_tile_left + scaled_tile_width 185 | scaled_tile_pixels = scaled_chunk_pixels[ 186 | :, 187 | scaled_tile_top:scaled_tile_bottom, 188 | scaled_tile_left:scaled_tile_right, 189 | ] 190 | 191 | # Yield the pixel data as a tensor and the Python dict of 192 | # associated information. Rather than `yield 193 | # scaled_tile_pixels, tile_dict` we use lists and pop() so 194 | # that this iterator does not maintain a reference count for 195 | # the returned objects. 196 | pixels_in_list = [scaled_tile_pixels] 197 | dict_in_list = [tile_dict] 198 | del scaled_tile_pixels, tile_dict 199 | yield pixels_in_list.pop(), dict_in_list.pop() 200 | 201 | """Return this generator (iterable) over the tiles""" 202 | return my_iterable() 203 | 204 | def __init__(self): 205 | """Set global options""" 206 | configure.ChunkLocations.__init__(self) 207 | # !!! Instead, get `batch_size` from somewhere 208 | self.batch_size = 1 209 | 210 | def __call__(self, study_description): 211 | """ 212 | From scratch, creates a torch dataloader with one torch element per tile 213 | """ 214 | # Call to superclass to find the locations for the chunks 215 | super().__call__(study_description) 216 | 217 | my_dataset = self.MyDataset(study_description) 218 | # !!! DataLoader has additional parameters that we may wish to use 219 | my_data_loader = torch.utils.data.DataLoader( 220 | my_dataset, batch_size=self.batch_size 221 | ) 222 | 223 | return my_data_loader 224 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | https://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | https://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HistomicsStream
2 | 
3 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/InsightSoftwareConsortium/ITK/blob/master/LICENSE) [![PyPI Version](https://img.shields.io/pypi/v/histomics_stream.svg)](https://pypi.python.org/pypi/histomics_stream) [![GitHub repository](https://img.shields.io/badge/Powered%20by-HistomicsStream-blue.svg)](https://github.com/DigitalSlideArchive/HistomicsStream) [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DigitalSlideArchive/HistomicsStream/blob/master/example/tensorflow_stream.ipynb)
4 | 
5 | ## Overview
6 | 
7 | The goal of this project is to create a whole-slide image file reader for machine learning. This reader allows users to extract pixel data from whole-slide image formats, and supports reading paradigms that are commonly used during machine learning training and inference. The package currently supports TensorFlow 2 and PyTorch.
8 | 
9 | ## Installation for Python
10 | 
11 | `histomics_stream` can be easily installed with Python wheels. If you do not want the installation to be to your current Python environment, you should first create and activate a [Python virtual environment (venv)](https://docs.python.org/3/tutorial/venv.html) to work in. Then, run the following from the command line:
12 | 
13 | ```shell-script
14 | sudo apt update
15 | sudo apt install -y python3-openslide openslide-tools
16 | pip install histomics_stream 'large_image[openslide]' \
17 |     scikit_image --find-links https://girder.github.io/large_image_wheels
18 | ```
19 | Additional functionality is supported with subpackages, e.g., `histomics_stream[tensorflow,torch,zarr]`. These packages are optional when histomics_stream is used only for masking and/or organizing image tiles into larger image chunks that are more efficient to read than individual image tiles. However, if you are creating a tensorflow `Dataset` or a pytorch `DataLoader` then you will need the corresponding packages.
20 | 
21 | Additional image readers can be supported by using, e.g., `large_image[openslide,ometiff,openjpeg,bioformats]` instead of `large_image[openslide]`.
22 | 
23 | After launching `python3`, import the `histomics_stream` package with:
24 | 
25 | ```python
26 | import histomics_stream as hs
27 | ```
28 | 
29 | This has been tested with `tensorflow:2.6.2-gpu` and `tensorflow:2.8.0-gpu`.
30 | 
31 | ## History
32 | 
33 | Through version 1.0.6, this project was known as `tensorflow_reader`.
34 | 
35 | ## Study representation
36 | 
37 | `histomics_stream` works in two steps. It first builds an object that represents the study. Second, from that study object, it builds a `tensorflow` `Dataset` or `pytorch` `DataLoader` object, which efficiently reads the pixel data from files. The study object is described in [StudyObject.md](StudyObject.md).
38 | 
39 | ## Introduction
40 | 
41 | ![This is a chunk of an H&E stained slide that is about 0.5 mm by 0.3 mm, which is 1821 × 1196 pixels or about 7 × 4 tiles.](documentation/H&E_chunk.png)
42 | 
43 | Histopathology is the study of biopsied tissues under the microscope for the purpose of diagnosing disease. Glass slides of tissue specimens are prepared by staining thin tissue slices with chemicals to highlight cellular structures for examination.
Traditionally pathologists have examined glass slides to look for telltale signs of disease, but recently whole-slide images (WSIs) that digitize the entire slide at high magnification are being used in diagnosis. A single research study may involve thousands of WSIs, each containing several billion pixels that need to be analyzed by medical personnel. Computer vision algorithms based on machine learning are also increasingly being used to detect, classify, and measure structures in WSIs, both in research and clinical practice. Developing algorithms to analyze WSIs is challenging, since popular machine learning frameworks and computing hardware are built for analyzing much smaller images. For example, a typical WSI with 120,000 × 80,000 pixels contains the equivalent of 191 thousand 224 × 224 images, a typical size used in machine learning frameworks. 44 | 45 | We are producing software tools to simplify the development of computer vision algorithms for WSIs. These tools make working with WSI data more approachable for computer vision and machine learning experts, and will significantly accelerate research by attracting more people to the field of computational pathology. The National Institutes of Health-funded work, a collaboration of Kitware, Inc., Northwestern University, Wake Forest School of Medicine, and Emory University, uses machine learning to find regions of interest. `histomics_stream` sits at the start of the workflow. Specifically, `histomics_stream` is responsible for efficient access to the input image data that will be used to fit a new machine learning model or will be used to predict regions of interest in novel inputs using an already learned model. 46 | 47 | A histopathology tissue sample that is 25 mm × 25 mm (1 inch × 1 inch) and is imaged at a typical 40x magnification will be approximately 100,000 × 100,000 pixels, which is 30 gigabytes of uncompressed RGB data for a single image. A research study may have 10-10,000 such whole slide images. For machine learning purposes such as proposing regions of diagnostic value, these images are usually broken up into tiles, for example 256 × 256 pixels each, and there may be millions to billions of such tiles to be processed in machine learning operations. Especially with the prediction step of machine learning, simply reading these data from disk can be the biggest determinant of runtime performance. 48 | 49 | Several Python libraries, such as [`openslide`](https://openslide.org/api/python/) and [`large_image`](https://girder.github.io/large_image/), are capable of reading whole-slide images with efficiency. Additional power comes from packages such as [`Zarr`](https://www.nature.com/articles/s41592-021-01326-w), which distributes a single image’s data across multiple files. These packages are able to efficiently read a tile from anywhere within a whole-slide image without having to read the entire image. These work well in single-threaded CPU-based applications. However, machine learning involves massive parallelization and sophisticated scheduling, GPU-based computations, and relatively limited GPU-accessible memory. 
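
As a concrete point of reference, the single-threaded reading that these libraries provide looks roughly like the following sketch, which fetches one 256 × 256 tile from an arbitrary location in a slide without decoding the whole image. (The filename and coordinates here are placeholders, not part of `histomics_stream`.)

```python
import large_image
from large_image.constants import TILE_FORMAT_NUMPY

# Open a whole-slide image; only the metadata is read at this point.
source = large_image.getTileSource("example-slide.svs")  # hypothetical file

# Read a single 256 x 256 tile at full scan resolution.
tile, _ = source.getRegion(
    region=dict(left=10240, top=10240, width=256, height=256, units="base_pixels"),
    format=TILE_FORMAT_NUMPY,
)
print(tile.shape)  # e.g., (256, 256, 3) or (256, 256, 4), depending on the file
```

Machine learning workloads instead need many such reads scheduled in parallel, which is the gap that `histomics_stream` addresses.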
50 | 
51 | ## Methods
52 | 
53 | ![A whole-slide image (blue boundary, on order of 100,000 × 100,000) is broken up into chunks (orange boundary, on order of 2048 × 2048) that are read in a single I/O operation and are split into tiles (magenta border, on order of 256 × 256) that are analyzed.](documentation/slide_chunk_tile.png)
54 | 
55 | `histomics_stream` is a Python package that enables efficient access to large datasets of whole slide images for use in machine learning. In the first step, the user specifies the details of the data set and the desired operating parameters. The user specifies which images will be processed, where they can be found, what metadata is associated with each (e.g., cohort, subject identifier), a “chunk” size for each image, and a desired magnification to be used. A chunk size of 2048 × 2048 pixels works well in many scenarios we tested, but other values can be specified by the user; the chunk size indicates how much pixel data should be read from disk in each read operation, and in the default case it means that an 8 × 8 grid of tiles, each 256 × 256 pixels, is read efficiently with each disk read. In some image formats, such as TIFF and SVS, the image file includes the image data at multiple resolutions. `histomics_stream` selects which native resolution to use based upon the user-specified desired magnification.
56 | 
57 | In the first step the user also specifies the operating parameters. What size should each tile be? Should tiles be chosen uniformly in a grid fashion and, if so, how much overlap, if any, should there be between adjacent tiles? The user can supply a mask indicating which tiles from the grid should be used. Alternatively, the user can supply an explicit list of tiles to be used, whether or not they are on a grid. The user can indicate that a random subset of the otherwise allowable tiles should be selected.
58 | 
59 | As its second step, `histomics_stream` creates a TensorFlow Dataset object from the study description. As is the paradigm for TensorFlow, the creation is done in a lazy, non-eager fashion. By invoking the TensorFlow commands, `histomics_stream` creates a TensorFlow execution graph that specifies the dependencies within the data workflow. Together with TensorFlow’s scheduling and parallelism functionality, this execution graph simply and efficiently directs the reading of tiles from disk for direct use in TensorFlow model operations. The TensorFlow Dataset created by `histomics_stream` is then used directly in TensorFlow operations for machine learning, whether for model fitting, model evaluation, or use of a model to make predictions in novel input data.
60 | 
61 | ## Results
62 | 
63 | `histomics_stream` increases runtime performance and eases the construction of the needed TensorFlow execution graph.
64 | 
65 | ### Performance
66 | 
67 | ![histomics_stream is 65% faster in a typical example.](documentation/runtime.png)
68 | 
69 | The `histomics_stream` package significantly improves runtime performance. In a typical example, reading a single whole-slide image that is 19,784 × 27,888 pixels as non-overlapping tiles that are 256 × 256 pixels produces a 77 × 108 grid of 8316 tiles. The `large_image` package is impressive in its ability to seamlessly read multiple file formats and to efficiently read tiles from within large images; with `large_image` the runtime is a quick 16.9 tiles per second including reading and machine learning prediction, using a single GeForce RTX 2080 Ti.
With `histomics_stream` this workflow throughput is increased to 27.9 tiles per second, which is a 65% performance improvement. Much of the performance gain comes from reading data one chunk at a time rather than one tile at a time. Additional performance gain comes from the reliance on TensorFlow for the scheduling of reads; TensorFlow’s graph execution schedules each read to optimize the performance of the workflow as a whole.
70 | 
71 | ### Implementation
72 | 
73 | The steps of `histomics_stream` are demonstrated in the JupyterLab notebook [`example/tensorflow_stream.ipynb`](https://github.com/DigitalSlideArchive/HistomicsStream/blob/master/example/tensorflow_stream.ipynb), which is also available in [Google Colab](https://colab.research.google.com/github/DigitalSlideArchive/HistomicsStream/blob/master/example/tensorflow_stream.ipynb). Construction of a Python dictionary that describes the study data set is straightforward and key steps are implemented by `histomics_stream`. Complexities from TensorFlow are seamlessly handled. For example, the syntax for parallelizable for loops in TensorFlow, which are often essential for runtime performance, is non-intuitive; `histomics_stream` provides the desired parallelism without exposing this complexity. Similarly, TensorFlow can be temperamental about conditional control flows, requiring that its graph execution construction routines can prove that alternative execution branches that should be producing objects of the same shape actually do so; the design of `histomics_stream` gives the user the power to, e.g., efficiently select tiles under several alternative strategies, without exposing this graph execution complexity to the user.
74 | 
75 | ## Conclusions
76 | 
77 | The TensorFlow graph execution interface can be challenging and unintuitive. Instead, bioinformatics model creators can use `histomics_stream` to specify the dataset that is to be analyzed. `histomics_stream` takes care of TensorFlow execution graph creation and provides a significant runtime performance improvement.
78 | 
79 | ## Acknowledgments
80 | 
81 | This work was funded by the National Institutes of Health National Cancer Institute Informatics Technologies for Cancer Research (NIH NCI ITCR) U01 grant [5U01CA220401-04](https://reporter.nih.gov/search/dyu6NCTti06k6svCyr7--Q/project-details/9929565) entitled “Informatics Tools for Quantitative Digital Pathology Profiling and Integrated Prognostic Modeling” with Lee A. D. Cooper (Northwestern University), Metin N. Gurcan (Wake Forest School of Medicine), and Christopher R. Flowers (Emory University) as principal investigators and Kitware, Inc. as a subcontractor. Implementation is primarily by Lee A. Newberg (Kitware, Inc.).
82 | 
--------------------------------------------------------------------------------
/histomics_stream/tensorflow.py:
--------------------------------------------------------------------------------
1 | # =========================================================================
2 | #
3 | # Copyright NumFOCUS
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # https://www.apache.org/licenses/LICENSE-2.0.txt
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # ========================================================================= 18 | 19 | """Whole-slide image streamer for machine learning frameworks.""" 20 | 21 | import math 22 | 23 | import tensorflow as tf 24 | 25 | from . import configure 26 | 27 | 28 | class CreateTensorFlowDataset(configure.ChunkLocations): 29 | def __init__(self): 30 | configure.ChunkLocations.__init__(self) 31 | self.dataset_map_options = { 32 | "num_parallel_calls": tf.data.experimental.AUTOTUNE, 33 | "deterministic": False, 34 | } 35 | 36 | def __call__( 37 | self, 38 | study_description, 39 | num_workers=None, 40 | worker_index=None, 41 | private_threadpool_size=None, 42 | ): 43 | """ 44 | From scratch, creates a tensorflow dataset with one tensorflow element per tile 45 | """ 46 | num_workers = num_workers if num_workers is not None else 1 47 | worker_index = worker_index if worker_index is not None else 0 48 | private_threadpool_size = ( 49 | private_threadpool_size if private_threadpool_size is not None else 1 50 | ) 51 | 52 | # Call to superclass to find the locations for the chunks 53 | # print(f"Build chunks: begin {datetime.datetime.now()}") 54 | configure.ChunkLocations.__call__(self, study_description) 55 | # print(f"Build chunks: end {datetime.datetime.now()}") 56 | 57 | # print(f"Build one_chunk_per_slice: begin {datetime.datetime.now()}") 58 | study_keys = study_description 59 | slide_keys = next(iter(study_keys["slides"].values())) 60 | chunk_keys = next(iter(slide_keys["chunks"].values())) 61 | tile_keys = {"tiles_top": "tile_top", "tiles_left": "tile_left"} 62 | one_chunk_per_slice = { 63 | **{ 64 | key: tf.constant( 65 | [ 66 | study_description[key] 67 | for slide_description in study_description["slides"].values() 68 | for chunk_description in slide_description["chunks"].values() 69 | ] 70 | ) 71 | for key in study_keys 72 | if key != "slides" 73 | }, 74 | **{ 75 | key: tf.constant( 76 | [ 77 | slide_description[key] 78 | for slide_description in study_description["slides"].values() 79 | for chunk_description in slide_description["chunks"].values() 80 | ] 81 | ) 82 | for key in slide_keys 83 | if key not in ("tiles", "chunks") 84 | }, 85 | **{ 86 | key: tf.constant( 87 | [ 88 | chunk_description[key] 89 | for slide_description in study_description["slides"].values() 90 | for chunk_description in slide_description["chunks"].values() 91 | ] 92 | ) 93 | for key in chunk_keys 94 | if key != "tiles" 95 | }, 96 | **{ 97 | plural: tf.ragged.constant( 98 | [ 99 | [ 100 | tile_description[singular] 101 | for tile_description in chunk_description["tiles"].values() 102 | ] 103 | for slide_description in study_description["slides"].values() 104 | for chunk_description in slide_description["chunks"].values() 105 | ] 106 | ) 107 | for plural, singular in tile_keys.items() 108 | }, 109 | } 110 | # print(f"Build one_chunk_per_slice: end {datetime.datetime.now()}") 111 | 112 | # print( 113 | # "Build study_dataset from_tensor_slices: begin " 114 | # f"{datetime.datetime.now()}" 115 | # ) 116 | study_dataset = tf.data.Dataset.from_tensor_slices(one_chunk_per_slice) 117 | del one_chunk_per_slice 118 | # print( 119 | # f"Build study_dataset from_tensor_slices: end {datetime.datetime.now()}" 120 | # ) 121 | 122 | # print(f"study_dataset.element_spec = {study_dataset.element_spec}") 123 | 124 | # Shard the dataset before we have broken chunks into tiles so that all a 125 | # chunk's tiles stay 
together. 126 | if num_workers != 1 or worker_index != 0: 127 | study_dataset = study_dataset.shard(num_workers, worker_index) 128 | 129 | # We have accumulated the chunk datasets into a study_dataset where each element 130 | # is a chunk. Read in the chunk pixel data and split it into tiles. 131 | # print(f"Build study_dataset map: begin {datetime.datetime.now()}") 132 | study_dataset = study_dataset.map( 133 | CreateTensorFlowDataset._read_and_split_chunk, **self.dataset_map_options 134 | ) 135 | # print(f"Build study_dataset map: end {datetime.datetime.now()}") 136 | 137 | # Change study_dataset so that each element is a tile. 138 | study_dataset = study_dataset.unbatch() 139 | 140 | # Make the tile pixels easier to find in each study_dataset element. Also, tack 141 | # on additional elements to the tuple so that the form is (inputs, targets, 142 | # sample_weights). 143 | # print(f"Build study_dataset pop: begin {datetime.datetime.now()}") 144 | study_dataset = study_dataset.map( 145 | lambda elem: ((elem.pop("tile_pixels"), elem),), **self.dataset_map_options 146 | ) 147 | study_dataset = study_dataset.map( 148 | lambda elem: (elem, None, None), **self.dataset_map_options 149 | ) 150 | # print(f"Build study_dataset pop: end {datetime.datetime.now()}") 151 | 152 | # By default `private_threadpool_size` is set to 0, which means that Tensorflow 153 | # is free to choose the number without limit. However, Tensorflow can grind to 154 | # a halt when processing a large dataset with this default behavior on GPU. A 155 | # value of 1 for `private_threadpool_size` runs more quickly than other values 156 | # on some tests we tried. Changing `private_threadpool_size` here is achieved 157 | # as a transformation of the dataset with an `options` object. 158 | options = tf.data.Options() 159 | options.threading.private_threadpool_size = private_threadpool_size 160 | study_dataset = study_dataset.with_options(options) 161 | 162 | return study_dataset 163 | 164 | @staticmethod 165 | def _read_and_split_chunk(elem): 166 | # Get chunk's pixel data from disk and load it into chunk_as_tensor. 167 | # Note that if elem["factor"] differs from 1.0 then this chunk will have 168 | # num_rows ((chunk_bottom - chunk_top) / factor, and num_columns = 169 | # ((chunk_right - chunk_left) / factor. 
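        # (E.g., target_magnification 20 read at returned_magnification 40 gives
        # factor 0.5, so a 256 x 256 tile at target magnification is cropped here
        # as 512 x 512.)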
170 | # tf.print("#_read_and_split_chunk begin") 171 | zero = tf.constant(0, dtype=tf.int32) 172 | one = tf.constant(1, dtype=tf.int32) 173 | epsilon = tf.constant(0.01, dtype=tf.float32) 174 | 175 | factor = tf.cast(elem["target_magnification"], dtype=tf.float32) / tf.cast( 176 | elem["returned_magnification"], dtype=tf.float32 177 | ) 178 | chunk_as_tensor = tf.py_function( 179 | func=CreateTensorFlowDataset._py_read_chunk, 180 | inp=[ 181 | elem["chunk_top"], 182 | elem["chunk_left"], 183 | elem["chunk_bottom"], 184 | elem["chunk_right"], 185 | elem["filename"], 186 | elem["returned_magnification"], 187 | factor, 188 | ], 189 | Tout=tf.uint8, 190 | ) 191 | num_tiles = tf.size(elem["tiles_top"]) 192 | tiles = tf.TensorArray(dtype=tf.uint8, size=num_tiles) 193 | 194 | scaled_tile_height = tf.cast( 195 | tf.math.floor( 196 | tf.cast(elem["tile_height"], dtype=tf.float32) / factor + epsilon 197 | ), 198 | dtype=tf.int32, 199 | ) 200 | scaled_tile_width = tf.cast( 201 | tf.math.floor( 202 | tf.cast(elem["tile_width"], dtype=tf.float32) / factor + epsilon 203 | ), 204 | dtype=tf.int32, 205 | ) 206 | scaled_chunk_top = tf.cast( 207 | tf.math.floor( 208 | tf.cast(elem["chunk_top"], dtype=tf.float32) / factor + epsilon 209 | ), 210 | dtype=tf.int32, 211 | ) 212 | scaled_chunk_left = tf.cast( 213 | tf.math.floor( 214 | tf.cast(elem["chunk_left"], dtype=tf.float32) / factor + epsilon 215 | ), 216 | dtype=tf.int32, 217 | ) 218 | 219 | def condition(i, _): 220 | return tf.less(i, num_tiles) 221 | 222 | def body(i, tiles): 223 | return ( 224 | i + one, 225 | tiles.write( 226 | i, 227 | tf.image.crop_to_bounding_box( 228 | chunk_as_tensor, 229 | tf.cast( 230 | tf.math.floor( 231 | tf.cast( 232 | tf.gather(elem["tiles_top"], i), dtype=tf.float32 233 | ) 234 | / factor 235 | + epsilon 236 | ), 237 | dtype=tf.int32, 238 | ) 239 | - scaled_chunk_top, 240 | tf.cast( 241 | tf.math.floor( 242 | tf.cast( 243 | tf.gather(elem["tiles_left"], i), dtype=tf.float32 244 | ) 245 | / factor 246 | + epsilon 247 | ), 248 | dtype=tf.int32, 249 | ) 250 | - scaled_chunk_left, 251 | scaled_tile_height, 252 | scaled_tile_width, 253 | ), 254 | ), 255 | ) 256 | 257 | _, tiles = tf.while_loop(condition, body, [zero, tiles]) 258 | tiles = tiles.stack() 259 | 260 | response = { 261 | **{ 262 | key: tf.repeat(elem[key], num_tiles) 263 | for key in elem.keys() 264 | if key not in ("tiles_top", "tiles_left") 265 | }, 266 | "tile_top": elem["tiles_top"], 267 | "tile_left": elem["tiles_left"], 268 | "tile_pixels": tiles, 269 | } 270 | 271 | # tf.print("#_read_and_split_chunk end") 272 | return response 273 | 274 | @staticmethod 275 | def _py_read_chunk( 276 | chunk_top, 277 | chunk_left, 278 | chunk_bottom, 279 | chunk_right, 280 | filename, 281 | returned_magnification, 282 | factor, 283 | ): 284 | """ 285 | Read from disk all the pixel data for a specific chunk of the 286 | whole slide. 
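
        Coordinates are supplied at the target magnification and are divided by
        `factor` to index pixels at the magnification actually read.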
287 | """ 288 | 289 | # if "_num_chunks" not in CreateTensorFlowDataset._py_read_chunk.__dict__: 290 | # CreateTensorFlowDataset._py_read_chunk._num_chunks = 0 291 | # chunk_name = ( 292 | # f"#_py_read_chunk {CreateTensorFlowDataset._py_read_chunk._num_chunks:06}" 293 | # ) 294 | # CreateTensorFlowDataset._py_read_chunk._num_chunks += 1 295 | 296 | # print(f"{chunk_name} begin {datetime.datetime.now()}") 297 | filename = filename.numpy().decode("utf-8") 298 | chunk_top = math.floor(chunk_top.numpy() / factor.numpy() + 0.01) 299 | chunk_left = math.floor(chunk_left.numpy() / factor.numpy() + 0.01) 300 | chunk_bottom = math.floor(chunk_bottom.numpy() / factor.numpy() + 0.01) 301 | chunk_right = math.floor(chunk_right.numpy() / factor.numpy() + 0.01) 302 | returned_magnification = returned_magnification.numpy() 303 | 304 | # print(f"{chunk_name} begin1 {datetime.datetime.now()}") 305 | # Call to the superclass to get the pixel data for this chunk 306 | chunk = configure.ChunkLocations.read_large_image( 307 | filename, 308 | chunk_top, 309 | chunk_left, 310 | chunk_bottom, 311 | chunk_right, 312 | returned_magnification, 313 | ) 314 | # print(f"{chunk_name} begin2 {datetime.datetime.now()}") 315 | 316 | # Do we want to support other than RGB?!!! 317 | chunk = chunk[..., :3] 318 | # print(f"{chunk_name} end {datetime.datetime.now()}") 319 | return chunk 320 | -------------------------------------------------------------------------------- /example/tensorflow_stream.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b0c50c61", 6 | "metadata": {}, 7 | "source": [ 8 | "# Demonstration of histomics_stream\n", 9 | "\n", 10 | "Click to open in [[GitHub](https://github.com/DigitalSlideArchive/HistomicsStream/tree/master/example/tensorflow_stream.ipynb)] [[Google Colab](https://colab.research.google.com/github/DigitalSlideArchive/HistomicsStream/blob/master/example/tensorflow_stream.ipynb)]\n", 11 | "\n", 12 | "The `histomics_stream` Python package sits at the start of any machine learning workflow that is built on the TensorFlow machine learning library. The package is responsible for efficient access to the input image data that will be used to fit a new machine learning model or will be used to predict regions of interest in novel inputs using an already learned model." 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "10f22613", 18 | "metadata": {}, 19 | "source": [ 20 | "## Installation\n", 21 | "\n", 22 | "If you are running this notebook on Google Colab or another system where `histomics_stream` and its dependencies are not yet installed then they can be installed with the following commands. Note that image readers in addition to openslide are also supported by using, e.g., `large_image[bioformats,ometiff,openjpeg,openslide,tiff]` on the below pip install command line." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "9ac13166-ba70-495b-be71-43036afc5cb7", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Get histomics_stream and its dependencies\n", 33 | "!apt update\n", 34 | "!apt install -y python3-openslide openslide-tools\n", 35 | "!pip install 'large_image[openslide,tiff]' --find-links https://girder.github.io/large_image_wheels\n", 36 | "!pip install histomics_stream[tensorflow]\n", 37 | "\n", 38 | "# Get other packages used in this notebook\n", 39 | "# N.B. 
itkwidgets works with jupyter<=3.0.0\n", 40 | "!apt install libcudnn8 libcudnn8-dev\n", 41 | "!pip install histomics_detect pooch itkwidgets\n", 42 | "!jupyter labextension install @jupyter-widgets/jupyterlab-manager jupyter-matplotlib jupyterlab-datawidgets itkwidgets\n", 43 | "\n", 44 | "print(\n", 45 | " \"\\nNOTE!: On Google Colab you may need to choose 'Runtime->Restart runtime' for these updates to take effect.\"\n", 46 | ")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "1b4b1fd0", 52 | "metadata": {}, 53 | "source": [ 54 | "## Fetching and creating the test data\n", 55 | "This notebook has demonstrations that use the files `TCGA-AN-A0G0-01Z-00-DX1.svs` (365 MB) and `TCGA-AN-A0G0-01Z-00-DX1.mask.png` (4 kB), The pooch commands will fetch them if they are not already available." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "8b9784b2", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "import os\n", 66 | "import pooch\n", 67 | "\n", 68 | "# download whole slide image\n", 69 | "wsi_path = pooch.retrieve(\n", 70 | " fname=\"TCGA-AN-A0G0-01Z-00-DX1.svs\",\n", 71 | " url=\"https://drive.google.com/uc?export=download&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV&confirm=t&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632\",\n", 72 | " known_hash=\"d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f\",\n", 73 | " path=str(pooch.os_cache(\"pooch\")) + os.sep + \"wsi\",\n", 74 | ")\n", 75 | "print(f\"Have {wsi_path}\")\n", 76 | "\n", 77 | "# download binary mask image\n", 78 | "mask_path = pooch.retrieve(\n", 79 | " fname=\"TCGA-AN-A0G0-01Z-00-DX1.mask.png\",\n", 80 | " url=\"https://drive.google.com/uc?export=download&id=17GOOHbL8Bo3933rdIui82akr7stbRfta\",\n", 81 | " known_hash=\"bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171\",\n", 82 | " path=str(pooch.os_cache(\"pooch\")) + os.sep + \"wsi\",\n", 83 | ")\n", 84 | "print(f\"Have {mask_path}\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "cb4179b8", 90 | "metadata": {}, 91 | "source": [ 92 | "## Creating a study for use with histomics_stream\n", 93 | "\n", 94 | "We describe the input and desired parameters using standard Python lists and dictionaries. Here we give a high-level configuration; selection of tiles is done subsequently.\n", 95 | "\n", 96 | "N.B.: __*all*__ values that are number of pixels are based upon the `target_magnification` that is supplied to `FindResolutionForSlide`. This includes pixel sizes of a slide, chunk, or tile and it includes the pixel coordinates for a chunk or tile. It applies whether the numbers are supplied to histomics_stream or returned by histomics_stream. However, if the `magnification_source` is not `exact` the `returned_magnification` may not equal the `target_magnification`; to get the number of pixels that is relevant for the `returned_magnification`, typically these numbers of pixels are multiplied by the ratio `returned_magnification / target_magnification`. In particular, the *pixel size of the returned tiles* will be the requested size times this ratio." 
97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "0de1e5a5-58ed-4cc9-9348-9e22e0c9fa23", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "import histomics_stream as hs\n", 107 | "import histomics_stream.tensorflow\n", 108 | "import tensorflow as tf" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "08cfbc01-1b50-426e-ac4e-9c73916329d4", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Create a study and insert study-wide information.\n", 119 | "# Add a slide to the study, including slide-wide information with it.\n", 120 | "my_study0 = dict(\n", 121 | " version=\"version-1\",\n", 122 | " tile_height=256,\n", 123 | " tile_width=256,\n", 124 | " overlap_height=0,\n", 125 | " overlap_width=0,\n", 126 | " slides=dict(\n", 127 | " Slide_0=dict(\n", 128 | " filename=wsi_path,\n", 129 | " slide_name=os.path.splitext(os.path.split(wsi_path)[1])[0],\n", 130 | " slide_group=\"Group 3\",\n", 131 | " chunk_height=2048,\n", 132 | " chunk_width=2048,\n", 133 | " )\n", 134 | " ),\n", 135 | ")\n", 136 | "\n", 137 | "# For each slide, find the appropriate resolution given the target_magnification and\n", 138 | "# magnification_tolerance. In this example, we use the same parameters for each slide,\n", 139 | "# but this is not required generally.\n", 140 | "find_slide_resolution = hs.configure.FindResolutionForSlide(\n", 141 | " my_study0, target_magnification=20, magnification_source=\"native\"\n", 142 | ")\n", 143 | "for slide in my_study0[\"slides\"].values():\n", 144 | " find_slide_resolution(slide)\n", 145 | "print(f\"my_study0 = {my_study0}\")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "dd18bd4e", 151 | "metadata": {}, 152 | "source": [ 153 | "## Tile selection\n", 154 | "\n", 155 | "We are going to demonstrate several approaches to choosing tiles. Each approach will start with its own copy of the `my_study0` that we have built so far." 
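,
"\n",
"\n",
"Each selector below follows the same pattern, sketched here with names assumed from this notebook (the real, runnable calls are in the cells that follow):\n",
"\n",
"```python\n",
"import copy\n",
"\n",
"study_copy = copy.deepcopy(my_study0)  # each approach gets a fresh copy\n",
"selector = hs.configure.TilesRandomly(study_copy, randomly_select=5)\n",
"for slide in study_copy[\"slides\"].values():\n",
"    selector(slide)  # writes the selected tiles into the slide dictionary\n",
"```"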
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "4b4e5990", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "import copy" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "56e2d816", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Demonstrate TilesByGridAndMask without a mask\n", 176 | "my_study_by_grid = copy.deepcopy(my_study0)\n", 177 | "tiles_by_grid = hs.configure.TilesByGridAndMask(\n", 178 | " my_study_by_grid, overlap_height=32, overlap_width=32, randomly_select=5\n", 179 | ")\n", 180 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 181 | "# this example.\n", 182 | "for slide in my_study_by_grid[\"slides\"].values():\n", 183 | " tiles_by_grid(slide)\n", 184 | "# Take a look at what we have made\n", 185 | "print(f\"==== The entire dictionary is now ==== \\nmy_study_by_grid = {my_study_by_grid}\")\n", 186 | "just_tiles = tiles_by_grid.get_tiles(my_study_by_grid)\n", 187 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "id": "018d44a8", 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# Demonstrate TilesByGridAndMask with a mask\n", 198 | "my_study_by_grid_and_mask = copy.deepcopy(my_study0)\n", 199 | "tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(\n", 200 | " my_study_by_grid_and_mask, mask_filename=mask_path, randomly_select=10\n", 201 | ")\n", 202 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 203 | "# this example.\n", 204 | "for slide in my_study_by_grid_and_mask[\"slides\"].values():\n", 205 | " tiles_by_grid_and_mask(slide)\n", 206 | "# Take a look at what we have made\n", 207 | "print(\n", 208 | " f\"==== The entire dictionary is now ==== \\nmy_study_by_grid_and_mask = {my_study_by_grid_and_mask}\"\n", 209 | ")\n", 210 | "just_tiles = tiles_by_grid_and_mask.get_tiles(my_study_by_grid_and_mask)\n", 211 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "id": "91970864", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# Demonstrate TilesByList\n", 222 | "my_study_by_list = copy.deepcopy(my_study0)\n", 223 | "tiles_by_list = hs.configure.TilesByList(\n", 224 | " my_study_by_list,\n", 225 | " randomly_select=5,\n", 226 | " tiles_dictionary=my_study_by_grid[\"slides\"][\"Slide_0\"][\"tiles\"],\n", 227 | ")\n", 228 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 229 | "# this example.\n", 230 | "for slide in my_study_by_list[\"slides\"].values():\n", 231 | " tiles_by_list(slide)\n", 232 | "# Take a look at what we have made\n", 233 | "print(f\"==== The entire dictionary is now ==== \\nmy_study_by_list = {my_study_by_list}\")\n", 234 | "just_tiles = tiles_by_list.get_tiles(my_study_by_list)\n", 235 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "id": "e120014f", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "# Demonstrate TilesRandomly\n", 246 | "my_study_randomly = copy.deepcopy(my_study0)\n", 247 | "tiles_randomly = 
hs.configure.TilesRandomly(my_study_randomly, randomly_select=10)\n", 248 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 249 | "# this example.\n", 250 | "for slide in my_study_randomly[\"slides\"].values():\n", 251 | " tiles_randomly(slide)\n", 252 | "# Take a look at what we have made\n", 253 | "print(\n", 254 | " f\"==== The entire dictionary is now ==== \\nmy_study_randomly = {my_study_randomly}\"\n", 255 | ")\n", 256 | "just_tiles = tiles_randomly.get_tiles(my_study_randomly)\n", 257 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "905bcb07", 263 | "metadata": {}, 264 | "source": [ 265 | "## Creating a TensorFlow Dataset\n", 266 | "\n", 267 | "We request tiles indicated by the mask and create a tensorflow Dataset that has the image data for these tiles as well as associated parameters for each tile, such as its location." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "6618f2e1", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "# Demonstrate TilesByGridAndMask with a mask\n", 278 | "my_study = copy.deepcopy(my_study0)\n", 279 | "tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(\n", 280 | " my_study, mask_filename=mask_path, mask_threshold=0.5, randomly_select=100\n", 281 | ")\n", 282 | "for slide in my_study[\"slides\"].values():\n", 283 | " tiles_by_grid_and_mask(slide)\n", 284 | "print(\"Finished selecting tiles.\")\n", 285 | "\n", 286 | "create_tensorflow_dataset = hs.tensorflow.CreateTensorFlowDataset()\n", 287 | "tiles = create_tensorflow_dataset(my_study)\n", 288 | "print(\"Finished with CreateTensorFlowDataset\")\n", 289 | "print(f\"... with tile shape = {tiles.take(1).get_single_element()[0][0].shape}\")" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "id": "72421b0a", 295 | "metadata": {}, 296 | "source": [ 297 | "## Fetch a model for prediction\n", 298 | "\n", 299 | "We fetch a model (840 MB compressed, 1.3 GB decompressed) that we will use to make predictions.\n", 300 | "\n", 301 | "Because each element of our Dataset is a tuple `(rgb_image_data, dictionary_of_annotation)`, a typical model that accepts only the former as its input needs to be wrapped.\n", 302 | "\n", 303 | "Note that this model assumes that the tiles/images are not batched, with the understanding that if there is enough memory to do batching then one should instead choose a larger tile size. 
" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "id": "dfbf1170", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "# download trained model.\n", 314 | "model_path = pooch.retrieve(\n", 315 | " fname=\"tcga_brca_model\",\n", 316 | " url=\"https://drive.google.com/uc?export=download&id=1KxB6iAn9j2Wp7oyFlV4T1Kli-mR8-35G&confirm=t&uuid=c5df8dfd-ed48-4cef-81a0-19df97677fe5&at=ALgDtswWzs0BEdkVNgFrp83p9NDO:1679111246793\",\n", 317 | " known_hash=\"b5b5444cc8874d17811a89261abeafd9b9603e7891a8b2a98d8f13e2846a6689\",\n", 318 | " path=str(pooch.os_cache(\"pooch\")) + os.sep + \"model\",\n", 319 | " processor=pooch.Unzip(),\n", 320 | ")\n", 321 | "model_path = os.path.split(model_path[0])[0]\n", 322 | "print(f\"Have {model_path}.\")\n", 323 | "\n", 324 | "# restore keras model\n", 325 | "from histomics_detect.models import FasterRCNN\n", 326 | "\n", 327 | "model = tf.keras.models.load_model(\n", 328 | " model_path, custom_objects={\"FasterRCNN\": FasterRCNN}\n", 329 | ")\n", 330 | "\n", 331 | "\n", 332 | "# Each element of the `tiles` tensorflow Dataset is a (rgb_image_data, dictionary_of_annotation) pair.\n", 333 | "# Wrap the unwrapped_model so that it knows to use the image.\n", 334 | "class WrappedModel(tf.keras.Model):\n", 335 | " def __init__(self, model, *args, **kwargs):\n", 336 | " super(WrappedModel, self).__init__(*args, **kwargs)\n", 337 | " self.model = model\n", 338 | "\n", 339 | " def call(self, element):\n", 340 | " return (self.model(element[0]), element[1])\n", 341 | "\n", 342 | "\n", 343 | "unwrapped_model = model\n", 344 | "model = WrappedModel(unwrapped_model)\n", 345 | "print(\"Model built and wrapped.\")" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "id": "4614c2a3", 351 | "metadata": {}, 352 | "source": [ 353 | "## Make predictions" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "id": "b050a930", 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "import time\n", 364 | "\n", 365 | "print(\"Starting predictions\")\n", 366 | "start_time = time.time()\n", 367 | "# This model assumes that the tiles are not batched. 
Do not use, e.g., tiles.batch(32).\n", 368 | "predictions = model.predict(tiles)\n", 369 | "end_time = time.time()\n", 370 | "num_inputs = len([0 for tile in tiles])\n", 371 | "num_predictions = predictions[0].shape[0]\n", 372 | "print(\n", 373 | " f\"Made {num_predictions} predictions for {num_inputs} tiles in {end_time - start_time} s.\"\n", 374 | ")\n", 375 | "print(f\"Average of {(end_time - start_time) / num_inputs} s per tile.\")" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "id": "09fc739b", 381 | "metadata": {}, 382 | "source": [ 383 | "## Look at internals" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "id": "1144f373", 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "my_element = tiles.take(1).get_single_element()\n", 394 | "my_pair = my_element[0]\n", 395 | "my_target = my_element[1]\n", 396 | "my_weight = my_element[2]\n", 397 | "my_image = my_pair[0]\n", 398 | "my_annotation = my_pair[1]\n", 399 | "\n", 400 | "print(f\" type(my_element) = {type(my_element)}\")\n", 401 | "print(f\" len(my_element) = {len(my_element)}\")\n", 402 | "print(f\" type(my_pair) = {type(my_pair)}\")\n", 403 | "print(f\" len(my_pair) = {len(my_pair)}\")\n", 404 | "print(f\" type(my_target) = {type(my_target)}\")\n", 405 | "print(f\" type(my_weight) = {type(my_weight)}\")\n", 406 | "print(f\" type(my_image) = {type(my_image)}\")\n", 407 | "print(f\" my_image.shape = {my_image.shape}\")\n", 408 | "print(f\"type(my_annotation) = {type(my_annotation)}\")" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "id": "d492e513", 414 | "metadata": {}, 415 | "source": [ 416 | "## Display a tile" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "id": "9531e48d", 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "import itk, itkwidgets\n", 427 | "\n", 428 | "itkwidgets.view(itk.image_from_array(my_image.numpy(), is_vector=True))" 429 | ] 430 | } 431 | ], 432 | "metadata": { 433 | "kernelspec": { 434 | "display_name": "Python 3 (ipykernel)", 435 | "language": "python", 436 | "name": "python3" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 3 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython3", 448 | "version": "3.8.10" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 5 453 | } 454 | -------------------------------------------------------------------------------- /example/pytorch_stream.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f87613a9", 6 | "metadata": {}, 7 | "source": [ 8 | "# Demonstration of histomics_stream\n", 9 | "\n", 10 | "Click to open in [[GitHub](https://github.com/DigitalSlideArchive/HistomicsStream/tree/master/example/pytorch_stream.ipynb)] [[Google Colab](https://colab.research.google.com/github/DigitalSlideArchive/HistomicsStream/blob/master/example/pytorch_stream.ipynb)]\n", 11 | "\n", 12 | "The `histomics_stream` Python package sits at the start of any machine learning workflow that is built on the PyTorch machine learning library. The package is responsible for efficient access to the input image data that will be used to fit a new machine learning model or will be used to predict regions of interest in novel inputs using an already learned model."
13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "a8490f25", 18 | "metadata": {}, 19 | "source": [ 20 | "## Installation\n", 21 | "\n", 22 | "If you are running this notebook on Google Colab or another system where `histomics_stream` and its dependencies are not yet installed then they can be installed with the following commands. Note that image readers in addition to openslide are also supported by using, e.g., `large_image[bioformats,ometiff,openjpeg,openslide,tiff]` on the below pip install command line." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "5aa174fa-c59b-42a5-ae59-7b28d3b3c50d", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Get histomics_stream and its dependencies\n", 33 | "!apt update\n", 34 | "!apt install -y python3-openslide openslide-tools\n", 35 | "!pip install 'large_image[openslide,tiff]' --find-links https://girder.github.io/large_image_wheels\n", 36 | "!pip install histomics_stream[torch]\n", 37 | "\n", 38 | "# Get other packages used in this notebook\n", 39 | "# N.B. itkwidgets works with jupyter<=3.0.0\n", 40 | "!apt install libcudnn8 libcudnn8-dev\n", 41 | "!pip install pooch itkwidgets\n", 42 | "!jupyter labextension install @jupyter-widgets/jupyterlab-manager jupyter-matplotlib jupyterlab-datawidgets itkwidgets\n", 43 | "\n", 44 | "print(\n", 45 | " \"\\nNOTE!: On Google Colab you may need to choose 'Runtime->Restart runtime' for these updates to take effect.\"\n", 46 | ")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "ed2efd66", 52 | "metadata": {}, 53 | "source": [ 54 | "## Fetching and creating the test data\n", 55 | "This notebook has demonstrations that use the files `TCGA-AN-A0G0-01Z-00-DX1.svs` (365 MB) and `TCGA-AN-A0G0-01Z-00-DX1.mask.png` (4 kB). The pooch commands will fetch them if they are not already available." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "b2ea3c60", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "import os\n", 66 | "import pooch\n", 67 | "\n", 68 | "# download whole slide image\n", 69 | "wsi_path = pooch.retrieve(\n", 70 | " fname=\"TCGA-AN-A0G0-01Z-00-DX1.svs\",\n", 71 | " url=\"https://drive.google.com/uc?export=download&id=19agE_0cWY582szhOVxp9h3kozRfB4CvV&confirm=t&uuid=6f2d51e7-9366-4e98-abc7-4f77427dd02c&at=ALgDtswlqJJw1KU7P3Z1tZNcE01I:1679111148632\",\n", 72 | " known_hash=\"d046f952759ff6987374786768fc588740eef1e54e4e295a684f3bd356c8528f\",\n", 73 | " path=str(pooch.os_cache(\"pooch\")) + os.sep + \"wsi\",\n", 74 | ")\n", 75 | "print(f\"Have {wsi_path}\")\n", 76 | "\n", 77 | "# download binary mask image\n", 78 | "mask_path = pooch.retrieve(\n", 79 | " fname=\"TCGA-AN-A0G0-01Z-00-DX1.mask.png\",\n", 80 | " url=\"https://drive.google.com/uc?export=download&id=17GOOHbL8Bo3933rdIui82akr7stbRfta\",\n", 81 | " known_hash=\"bb657ead9fd3b8284db6ecc1ca8a1efa57a0e9fd73d2ea63ce6053fbd3d65171\",\n", 82 | " path=str(pooch.os_cache(\"pooch\")) + os.sep + \"wsi\",\n", 83 | ")\n", 84 | "print(f\"Have {mask_path}\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "4274b5d6", 90 | "metadata": {}, 91 | "source": [ 92 | "## Creating a study for use with histomics_stream\n", 93 | "\n", 94 | "We describe the input and desired parameters using standard Python lists and dictionaries. 
Here we give a high-level configuration; selection of tiles is done subsequently.\n", 95 | "\n", 96 | "N.B.: __*all*__ values that are number of pixels are based upon the `target_magnification` that is supplied to `FindResolutionForSlide`. This includes pixel sizes of a slide, chunk, or tile and it includes the pixel coordinates for a chunk or tile. It applies whether the numbers are supplied to histomics_stream or returned by histomics_stream. However, if the `magnification_source` is not `exact` the `returned_magnification` may not equal the `target_magnification`; to get the number of pixels that is relevant for the `returned_magnification`, typically these numbers of pixels are multiplied by the ratio `returned_magnification / target_magnification`. In particular, the *pixel size of the returned tiles* will be the requested size times this ratio." 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "1e17612d-0216-4652-92cd-d8ea5e0ac6d7", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "import histomics_stream as hs\n", 107 | "import histomics_stream.pytorch\n", 108 | "import torch" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "effa3803-fc82-4bd2-93f1-538de00d7607", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Create a study and insert study-wide information.\n", 119 | "# Add a slide to the study, including slide-wide information with it.\n", 120 | "my_study0 = dict(\n", 121 | " version=\"version-1\",\n", 122 | " tile_height=256,\n", 123 | " tile_width=256,\n", 124 | " overlap_height=0,\n", 125 | " overlap_width=0,\n", 126 | " slides=dict(\n", 127 | " Slide_0=dict(\n", 128 | " filename=wsi_path,\n", 129 | " slide_name=os.path.splitext(os.path.split(wsi_path)[1])[0],\n", 130 | " slide_group=\"Group 3\",\n", 131 | " chunk_height=2048,\n", 132 | " chunk_width=2048,\n", 133 | " )\n", 134 | " ),\n", 135 | ")\n", 136 | "\n", 137 | "# For each slide, find the appropriate resolution given the target_magnification and\n", 138 | "# magnification_tolerance. In this example, we use the same parameters for each slide,\n", 139 | "# but this is not required generally.\n", 140 | "find_slide_resolution = hs.configure.FindResolutionForSlide(\n", 141 | " my_study0, target_magnification=20, magnification_source=\"exact\"\n", 142 | ")\n", 143 | "for slide in my_study0[\"slides\"].values():\n", 144 | " find_slide_resolution(slide)\n", 145 | "print(f\"my_study0 = {my_study0}\")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "0fde9d2e", 151 | "metadata": {}, 152 | "source": [ 153 | "## Tile selection\n", 154 | "\n", 155 | "We are going to demonstrate several approaches to choosing tiles. Each approach will start with its own copy of the `my_study0` that we have built so far." 
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "4ca79608", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "import copy" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "cba3ab43", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Demonstrate TilesByGridAndMask without a mask\n", 176 | "my_study_by_grid = copy.deepcopy(my_study0)\n", 177 | "tiles_by_grid = hs.configure.TilesByGridAndMask(\n", 178 | " my_study_by_grid, overlap_height=32, overlap_width=32, randomly_select=5\n", 179 | ")\n", 180 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 181 | "# this example.\n", 182 | "for slide in my_study_by_grid[\"slides\"].values():\n", 183 | " tiles_by_grid(slide)\n", 184 | "# Take a look at what we have made\n", 185 | "print(f\"==== The entire dictionary is now ==== \\nmy_study_by_grid = {my_study_by_grid}\")\n", 186 | "just_tiles = tiles_by_grid.get_tiles(my_study_by_grid)\n", 187 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "id": "953ebb17", 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# Demonstrate TilesByGridAndMask with a mask\n", 198 | "my_study_by_grid_and_mask = copy.deepcopy(my_study0)\n", 199 | "tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(\n", 200 | " my_study_by_grid_and_mask, mask_filename=mask_path, randomly_select=10\n", 201 | ")\n", 202 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 203 | "# this example.\n", 204 | "for slide in my_study_by_grid_and_mask[\"slides\"].values():\n", 205 | " tiles_by_grid_and_mask(slide)\n", 206 | "# Take a look at what we have made\n", 207 | "print(\n", 208 | " f\"==== The entire dictionary is now ==== \\nmy_study_by_grid_and_mask = {my_study_by_grid_and_mask}\"\n", 209 | ")\n", 210 | "just_tiles = tiles_by_grid_and_mask.get_tiles(my_study_by_grid_and_mask)\n", 211 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "id": "f341e882", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# Demonstrate TilesByList\n", 222 | "my_study_by_list = copy.deepcopy(my_study0)\n", 223 | "tiles_by_list = hs.configure.TilesByList(\n", 224 | " my_study_by_list,\n", 225 | " randomly_select=5,\n", 226 | " tiles_dictionary=my_study_by_grid[\"slides\"][\"Slide_0\"][\"tiles\"],\n", 227 | ")\n", 228 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 229 | "# this example.\n", 230 | "for slide in my_study_by_list[\"slides\"].values():\n", 231 | " tiles_by_list(slide)\n", 232 | "# Take a look at what we have made\n", 233 | "print(f\"==== The entire dictionary is now ==== \\nmy_study_by_list = {my_study_by_list}\")\n", 234 | "just_tiles = tiles_by_list.get_tiles(my_study_by_list)\n", 235 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "id": "9bc2770f", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "# Demonstrate TilesRandomly\n", 246 | "my_study_randomly = copy.deepcopy(my_study0)\n", 247 | "tiles_randomly = 
hs.configure.TilesRandomly(my_study_randomly, randomly_select=10)\n", 248 | "# We could apply this to a subset of the slides, but we will apply it to all slides in\n", 249 | "# this example.\n", 250 | "for slide in my_study_randomly[\"slides\"].values():\n", 251 | " tiles_randomly(slide)\n", 252 | "# Take a look at what we have made\n", 253 | "print(\n", 254 | " f\"==== The entire dictionary is now ==== \\nmy_study_randomly = {my_study_randomly}\"\n", 255 | ")\n", 256 | "just_tiles = tiles_randomly.get_tiles(my_study_randomly)\n", 257 | "print(f\"==== A quick look at just the tiles is now ====\\njust_tiles = {just_tiles}\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "e35fe040", 263 | "metadata": {}, 264 | "source": [ 265 | "## Creating a PyTorch DataLoader\n", 266 | "\n", 267 | "We request tiles indicated by the mask and create a PyTorch `DataLoader` that yields the image data for these tiles as well as associated parameters for each tile, such as its location." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "0d272866", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "# Demonstrate TilesByGridAndMask with a mask\n", 278 | "my_study = copy.deepcopy(my_study0)\n", 279 | "tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(\n", 280 | " my_study, mask_filename=mask_path, mask_threshold=0.5, randomly_select=100\n", 281 | ")\n", 282 | "for slide in my_study[\"slides\"].values():\n", 283 | " tiles_by_grid_and_mask(slide)\n", 284 | "print(\"Finished selecting tiles.\")\n", 285 | "\n", 286 | "create_pytorch_dataloader = hs.pytorch.CreateTorchDataloader()\n", 287 | "tiles = create_pytorch_dataloader(my_study)\n", 288 | "print(\"Finished with CreateTorchDataloader\")\n", 289 | "# print(f\"{tile = }\")\n", 290 | "# print(f\"... with tile shape = {tiles.take(1).get_single_element()[0][0].shape}\")" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "id": "800f2502", 296 | "metadata": {}, 297 | "source": [ 298 | "## Build a model for prediction\n", 299 | "\n", 300 | "We build an arbitrary but reasonable model for demonstration purposes.\n", 301 | "\n", 302 | "Because each element of our Dataset is a tuple `(rgb_image_data, dictionary_of_annotation)`, a typical model that accepts only the former as its input needs to be wrapped."
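,
"\n",
"\n",
"The wrapping idea in miniature (a sketch only; the full `WrapModel` used in this notebook follows in the next cell):\n",
"\n",
"```python\n",
"class Wrapped(torch.nn.Module):\n",
"    def __init__(self, inner):\n",
"        super().__init__()\n",
"        self.inner = inner\n",
"\n",
"    def forward(self, element):  # element is (rgb_image_data, annotation)\n",
"        return self.inner(element[0]), element[1]\n",
"```"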
303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "id": "fd890cf5", 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "class MyTorchModel(torch.nn.modules.module.Module):\n", 313 | " def __init__(\n", 314 | " self, in_channels, tile_height, tile_width, num_categories, kernel_size\n", 315 | " ):\n", 316 | " print(f\"{in_channels = }\")\n", 317 | " print(f\"{tile_height = }\")\n", 318 | " print(f\"{tile_width = }\")\n", 319 | " print(f\"{num_categories = }\")\n", 320 | " print(f\"{kernel_size = }\")\n", 321 | " super(MyTorchModel, self).__init__()\n", 322 | " out1_channels = 2 * in_channels\n", 323 | " padding = tuple(int((k - 1) // 2) for k in kernel_size)\n", 324 | " self.conv1 = torch.nn.Conv2d(\n", 325 | " in_channels, out1_channels, kernel_size, padding=padding\n", 326 | " )\n", 327 | " out2_channels = 4 * in_channels\n", 328 | " self.conv2 = torch.nn.Conv2d(\n", 329 | " out1_channels, out2_channels, kernel_size, padding=padding\n", 330 | " )\n", 331 | " self.relu = torch.nn.ReLU()\n", 332 | " self.pool = torch.nn.MaxPool2d(2, 2)\n", 333 | " self.flat_size = int(\n", 334 | " in_channels * tile_height * tile_width / (out2_channels / in_channels)\n", 335 | " )\n", 336 | " self.fc1 = torch.nn.Linear(self.flat_size, 128)\n", 337 | " self.fc2 = torch.nn.Linear(128, num_categories)\n", 338 | "\n", 339 | " def forward(self, x):\n", 340 | " x = self.pool(self.relu(self.conv1(x)))\n", 341 | " x = self.pool(self.relu(self.conv2(x)))\n", 342 | " x = x.view(-1, self.flat_size)\n", 343 | " x = self.relu(self.fc1(x))\n", 344 | " x = self.relu(self.fc2(x))\n", 345 | " return x\n", 346 | "\n", 347 | "\n", 348 | "unwrapped_model = MyTorchModel(\n", 349 | " in_channels=3,\n", 350 | " tile_height=my_study_randomly[\"tile_height\"],\n", 351 | " tile_width=my_study_randomly[\"tile_width\"],\n", 352 | " num_categories=2,\n", 353 | " kernel_size=(5, 5),\n", 354 | ")\n", 355 | "\n", 356 | "# At this point it would be standard to train the model. 
This example is so dumb that\n", 357 | "# we won't do that here.\n", 358 | "\n", 359 | "\n", 360 | "class WrapModel(torch.nn.modules.module.Module):\n", 361 | " def __init__(self, model, *args, **kwargs):\n", 362 | " super(WrapModel, self).__init__(*args, **kwargs)\n", 363 | " self.model = unwrapped_model\n", 364 | "\n", 365 | " def forward(self, x):\n", 366 | " p = self.model(x[0])\n", 367 | " return p, x[1]\n", 368 | "\n", 369 | "\n", 370 | "model = WrapModel(unwrapped_model)\n", 371 | "print(\"Model created\")" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "id": "6e687409", 377 | "metadata": {}, 378 | "source": [ 379 | "## Make predictions" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "id": "e1e890c9-9400-4324-ba6d-22d3aae90669", 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "import time\n", 390 | "\n", 391 | "print(\"Starting predictions\")\n", 392 | "start_time = time.time()\n", 393 | "# Consider adding a batch factor to the data loader\n", 394 | "predictions = [model(tile) for tile in tiles]\n", 395 | "end_time = time.time()\n", 396 | "print(\"Done predicting\")\n", 397 | "num_inputs = len([0 for tile in tiles])\n", 398 | "num_predictions = len(predictions)\n", 399 | "print(\n", 400 | " f\"Made {num_predictions} predictions for {num_inputs} tiles \"\n", 401 | " f\"in {end_time - start_time} s.\"\n", 402 | ")\n", 403 | "print(f\"Average of {(end_time - start_time) / num_inputs} s per tile.\")" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "id": "1f16b044", 409 | "metadata": {}, 410 | "source": [ 411 | "## Look at internals" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "id": "c277a4c8", 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "tile_iter = iter(tiles)\n", 422 | "tile = next(tile_iter)\n", 423 | "print(f\" {type(tiles) = }\")\n", 424 | "print(f\" {type(tiles.dataset) = }\")\n", 425 | "print(f\" {type(iter(tiles.dataset)) = }\")\n", 426 | "print(f\" {type(tile_iter) = }\")\n", 427 | "print(f\" {type(tile) = }\")\n", 428 | "print(f\" {len(tile) = }\")\n", 429 | "print(f\" {type(tile[0]) = }\")\n", 430 | "print(f\" {tile[0].shape = }\")\n", 431 | "print(f\" {type(tile[1]) = }\")\n", 432 | "print(f\"{tile[0][0,0,0,0].to(torch.float32) = }\")\n", 433 | "pred = predictions[0]\n", 434 | "print(f\" {type(predictions) = }\")\n", 435 | "print(f\" {len(predictions) = }\")\n", 436 | "print(f\" {type(pred) = }\")\n", 437 | "print(f\" {len(pred) = }\")\n", 438 | "print(f\" {type(pred[0]) = }\")\n", 439 | "print(f\" {pred[0].shape = }\")\n", 440 | "print(f\" {pred[0] = }\")\n", 441 | "print(f\" {type(pred[1]) = }\")\n", 442 | "print(f\" {pred[1].keys() = }\")" 443 | ] 444 | } 445 | ], 446 | "metadata": { 447 | "kernelspec": { 448 | "display_name": "Python 3 (ipykernel)", 449 | "language": "python", 450 | "name": "python3" 451 | }, 452 | "language_info": { 453 | "codemirror_mode": { 454 | "name": "ipython", 455 | "version": 3 456 | }, 457 | "file_extension": ".py", 458 | "mimetype": "text/x-python", 459 | "name": "python", 460 | "nbconvert_exporter": "python", 461 | "pygments_lexer": "ipython3", 462 | "version": "3.8.10" 463 | } 464 | }, 465 | "nbformat": 4, 466 | "nbformat_minor": 5 467 | } 468 | -------------------------------------------------------------------------------- /histomics_stream/configure.py: -------------------------------------------------------------------------------- 1 | # 
========================================================================= 2 | # 3 | # Copyright NumFOCUS 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0.txt 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # ========================================================================= 18 | 19 | """Whole-slide image streamer for machine learning frameworks.""" 20 | 21 | import copy 22 | import itertools 23 | import math 24 | import os 25 | import random 26 | import re 27 | 28 | import itk 29 | import numpy as np 30 | import scipy.interpolate 31 | 32 | 33 | class _TilesByCommon: 34 | def __init__(self): 35 | self._key_mapping = { 36 | "number_pixel_columns_for_chunk": "chunk_width", 37 | "number_pixel_columns_for_mask": "mask_width", 38 | "number_pixel_columns_for_slide": "slide_width", 39 | "number_pixel_columns_for_tile": "tile_width", 40 | "number_pixel_overlap_columns_for_tile": "tile_overlap_width", 41 | "number_pixel_overlap_rows_for_tile": "tile_overlap_height", 42 | "number_pixel_rows_for_chunk": "chunk_height", 43 | "number_pixel_rows_for_mask": "mask_height", 44 | "number_pixel_rows_for_slide": "slide_height", 45 | "number_pixel_rows_for_tile": "tile_height", 46 | "number_tile_columns_for_slide": "slide_width_tiles", 47 | "number_tile_rows_for_slide": "slide_height_tiles", 48 | "tile_overlap_height": "overlap_height", 49 | "tile_overlap_width": "overlap_width", 50 | } 51 | 52 | self._keys_warned = set() 53 | 54 | # For each filename, select just upper-left corner for each tile. 55 | # Note that each upper-left corner is returned as (top, left), not (left, top). 56 | @staticmethod 57 | def get_tiles(study): 58 | return [ 59 | ( 60 | slide["filename"], 61 | [ 62 | (tile["tile_top"], tile["tile_left"]) 63 | for tile in slide["tiles"].values() 64 | ], 65 | ) 66 | for slide in study["slides"].values() 67 | ] 68 | 69 | # Private function to map old key names to their current equivalent 70 | def _update_dict(self, d): 71 | for old_key in d.keys() & self._key_mapping.keys(): 72 | # An old key is in use in `d`. 73 | new_key = self._key_mapping[old_key] 74 | while new_key in self._key_mapping: 75 | # Multiple, serial name changes 76 | new_key = self._key_mapping[new_key] 77 | if new_key in d: 78 | # Both the old and new key are used. 79 | raise ValueError( 80 | f"Cannot use both {repr(old_key)} key (deprecated) " 81 | f"and its replacement {repr(new_key)}" 82 | ) 83 | if old_key not in self._keys_warned: 84 | print( 85 | f"Warning: updating deprecated key {repr(old_key)} " 86 | f"to new name {repr(new_key)}" 87 | ) 88 | # Comment out the next line so we do have repeated warnings, in case a 89 | # second study comes in with deprecated keys. 90 | # self._keys_warned.add(old_key) 91 | d[new_key] = d[old_key] 92 | del d[old_key] 93 | 94 | 95 | class FindResolutionForSlide(_TilesByCommon): 96 | """ 97 | A class that computes read parameters for slides. 
98 | 99 | An instance of class FindResolutionForSlide is a callable that will add level, 100 | target_magnification, scan_magnification, read_magnification, 101 | returned_magnification, slide_height, and slide_width fields to a slide dictionary. 102 | 103 | Parameters for the constructor 104 | ------------------------------ 105 | 106 | study : dictionary 107 | The study dictionary from which to read parameters about the study. 108 | 109 | target_magnification : float 110 | The desired objective magnification for generated tiles. For example, a value 111 | of 10 corresponds to about 1 micron per pixel and a value of 20 corresponds to 112 | about 0.5 microns per pixel. 113 | 114 | magnification_source : str in ["scan", "native", "exact"] 115 | "scan" will produce tiles from the highest magnification available. This is 116 | typically the slide scanner's objective magnification. 117 | 118 | "native" will produce tiles from the nearest available magnification equal to or 119 | greater than target_magnification (within a 2% tolerance). The "native" option 120 | is useful when you want to handle resizing of tiles to target_magnification on 121 | your own. 122 | 123 | "exact" will produce tiles using the "native" option and then resize these tiles to 124 | match target_magnification. Resizing is handled by PIL using the Lanczos 125 | antialiasing filter since the resizing shrinks the tile by definition. 126 | 127 | For either "scan" or "native", the size of the read and returned tiles will be 128 | (tile_height * returned_magnification / target_magnification, tile_width * 129 | returned_magnification / target_magnification). For "exact" the size of the 130 | returned tiles will be (tile_height, tile_width). 131 | 132 | This procedure sets values in the slide dictionary to capture the scan, read, 133 | and returned magnification of the tiles. This is helpful for example to resize 134 | results to the scan magnification for visualization in HistomicsUI, or to resize 135 | between native and target magnification when using 136 | "native". "scan_magnification" is the highest magnification from the source 137 | file; "read_magnification" is the magnification read from the source file; 138 | "returned_magnification" is the magnification of the returned tiles, which is the 139 | same as "read_magnification" in the case of "scan" or "native" or 140 | "target_magnification" in the case of "exact". 141 | """ 142 | 143 | def __init__(self, study, target_magnification, magnification_source): 144 | """ 145 | Sanity check the supplied parameters and store them for later use. 146 | """ 147 | _TilesByCommon.__init__(self) 148 | # Check values. 149 | if not ("version" in study and study["version"] == "version-1"): 150 | raise ValueError('study["version"] must exist and be equal to "version-1".') 151 | if not ( 152 | isinstance(target_magnification, (int, np.integer, float, np.floating)) 153 | and 0 < target_magnification 154 | ): 155 | raise ValueError( 156 | f"target_magnification ({target_magnification})" 157 | " must be a positive number" 158 | ) 159 | if not ( 160 | isinstance(magnification_source, str) 161 | and magnification_source in ["scan", "native", "exact"] 162 | ): 163 | raise ValueError( 164 | f"magnification_source ({magnification_source})" 165 | " must be one of ['scan', 'native', 'exact']." 166 | ) 167 | 168 | # Save values.
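        # Store target_magnification as a float so that the value later written
        # to slide["target_magnification"] has a consistent type whether the
        # caller passed an int (e.g., 20) or a float (e.g., 20.0).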
169 | self.target_magnification = float(target_magnification) 170 | self.magnification_source = magnification_source 171 | 172 | def __call__(self, slide): 173 | """ 174 | Add level, target_magnification, scan_magnification, read_magnification, 175 | returned_magnification, slide_height, and slide_width fields to a slide 176 | dictionary. 177 | """ 178 | 179 | # Check values. 180 | if "filename" not in slide: 181 | raise ValueError('slide["filename"] must be already set.') 182 | filename = slide["filename"] 183 | 184 | # Do the work. 185 | if not re.compile(r"\.zarr$").search(filename): 186 | # create large_image, prioritizing tiff source over openslide 187 | try: 188 | import large_image_source_tiff 189 | 190 | ts = large_image_source_tiff.open(filename) 191 | except Exception: 192 | import large_image 193 | 194 | ts = large_image.open(filename) 195 | 196 | # scan_magnification = highest available magnification from source 197 | scan_magnification = float(ts.getNativeMagnification()["magnification"]) 198 | 199 | if self.magnification_source == "exact": 200 | # Use the tile-source level that large_image is willing to interpolate 201 | # for us. 202 | preferred_levels = [ 203 | ts.getLevelForMagnification( 204 | self.target_magnification, rounding=False 205 | ) 206 | ] 207 | else: # self.magnification_source in ["scan", "native"] 208 | # Use one of the tile-source levels that is stored in the image file. 209 | preferred_levels = list( 210 | set(ts.getPreferredLevel(level) for level in range(ts.levels)) 211 | ) 212 | preferred_levels.sort(reverse=True) 213 | if self.magnification_source == "scan": 214 | # Keep only the maximum tile-source level 215 | preferred_levels = preferred_levels[0:1] 216 | 217 | estimated_magnifications = np.array( 218 | [ 219 | float(ts.getMagnificationForLevel(level)["magnification"]) 220 | for level in preferred_levels 221 | ] 222 | ) 223 | 224 | # Find best tile-source level to use 225 | (level, returned_magnification) = self._get_level_and_magnifications( 226 | self.target_magnification, estimated_magnifications 227 | ) 228 | # Rather than as the index into preferred_levels, change level to be the 229 | # value that large_image uses 230 | level = preferred_levels[level] 231 | 232 | # If large_image is resampling a native level for us, it starts with the 233 | # least preferred level that is not smaller than the 234 | # resampled level. 235 | read_magnification = float( 236 | ts.getMagnificationForLevel( 237 | min( 238 | [ 239 | ts.getPreferredLevel(i) 240 | for i in range(ts.levels) 241 | if i >= level 242 | ] 243 | ) 244 | )["magnification"] 245 | ) 246 | 247 | slide["target_magnification"] = self.target_magnification 248 | slide["scan_magnification"] = scan_magnification 249 | slide["read_magnification"] = read_magnification 250 | slide["returned_magnification"] = returned_magnification 251 | 252 | # We don't want to walk off the right or bottom of the slide so we are 253 | # conservative as to how many pixels large_image will return for us. 254 | # 1) large_image starts with an image that is of 255 | # read_magnification; we compute the dimensions for read_magnification 256 | # with math.floor from the dimensions of scan_magnification (i.e., 257 | # ts.sizeX and ts.sizeY) to be conservative. 258 | # 2) large_image or external software may resample from the 259 | # read_magnification to the target_magnification; we compute dimensions 260 | # for the target_magnification with math.floor from the 261 | # read_magnification to be conservative.
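            # Illustrative numbers (hypothetical, not from any particular
            # slide): with ts.sizeY == 10001 at scan_magnification 40 and
            # read_magnification 20, step (1) gives floor(10001 * 20 / 40) ==
            # 5000; with target_magnification 10, step (2) then gives
            # floor(5000 * 10 / 20) == 2500 pixels of slide height.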
262 | slide_height = ts.sizeY 263 | slide_width = ts.sizeX 264 | if scan_magnification != read_magnification: 265 | slide_height = math.floor( 266 | slide_height * read_magnification / scan_magnification 267 | ) 268 | slide_width = math.floor( 269 | slide_width * read_magnification / scan_magnification 270 | ) 271 | if read_magnification != self.target_magnification: 272 | slide_height = math.floor( 273 | slide_height * self.target_magnification / read_magnification 274 | ) 275 | slide_width = math.floor( 276 | slide_width * self.target_magnification / read_magnification 277 | ) 278 | 279 | else: 280 | import zarr 281 | import openslide # not aliased, to avoid shadowing the os module 282 | 283 | # read whole-slide image and create zarr objects 284 | store = zarr.DirectoryStore(filename) 285 | source_group = zarr.open(store, mode="r") 286 | 287 | # scan_magnification = highest available magnification from source 288 | scan_magnification = float( 289 | source_group.attrs[openslide.PROPERTY_NAME_OBJECTIVE_POWER] 290 | ) 291 | 292 | preferred_levels = list(range(len(source_group.attrs["level_downsamples"]))) 293 | if self.magnification_source == "scan": 294 | preferred_levels = [np.argmin(source_group.attrs["level_downsamples"])] 295 | 296 | # calculate magnifications of levels 297 | estimated_magnifications = np.array( 298 | [scan_magnification / source_group.attrs["level_downsamples"][level] 299 | for level in preferred_levels] 300 | ) 301 | 302 | # Find best native level to use 303 | (level, returned_magnification) = self._get_level_and_magnifications( 304 | self.target_magnification, estimated_magnifications 305 | ) 306 | # Rather than as the index into preferred_levels, change level to be the 307 | # value that zarr uses 308 | level = preferred_levels[level] 309 | 310 | slide["target_magnification"] = self.target_magnification 311 | slide["scan_magnification"] = scan_magnification 312 | slide["read_magnification"] = returned_magnification 313 | slide["returned_magnification"] = returned_magnification 314 | 315 | # get the slide's slide_height and slide_width at the 316 | # desired magnification. (Note that slide_width comes before 317 | # slide_height.) 318 | slide_width, slide_height = source_group[format(level)].shape[0:2] 319 | 320 | if ( 321 | self.magnification_source == "exact" 322 | and self.target_magnification != returned_magnification 323 | ): 324 | raise ValueError( 325 | f"Couldn't find magnification {self.target_magnification}X " 326 | "in Zarr storage." 327 | ) 328 | 329 | int_level = int(round(level)) 330 | slide["level"] = int_level if abs(level - int_level) < 1e-4 else level 331 | # Note that slide size is defined by the requested magnification, which may not 332 | # be the same as the magnification for the selected level. To get the slide 333 | # size for the magnification that we are using, these values must later be 334 | # multiplied by returned_magnification / target_magnification. 335 | slide["slide_height"] = slide_height 336 | slide["slide_width"] = slide_width 337 | 338 | @staticmethod 339 | def _get_level_and_magnifications(target_magnification, estimated_magnifications): 340 | """ 341 | A private subroutine that computes level and magnifications.
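
        For example (hypothetical inputs): with target_magnification=20 and
        estimated_magnifications=[40.0, 20.1, 10.0], the relative difference
        for 20.1 is 0.5%, within the 2% tolerance used below, so level 1 and
        magnification 20.1 are returned.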
342 | """ 343 | # calculate difference with magnification levels 344 | 345 | magnification_tolerance = 0.02 346 | delta = target_magnification - estimated_magnifications 347 | 348 | # match to existing levels 349 | if ( 350 | np.min(np.abs(np.divide(delta, target_magnification))) 351 | < magnification_tolerance 352 | ): # match 353 | level = np.squeeze(np.argmin(np.abs(delta))) 354 | elif np.any(delta < 0): 355 | value = np.max(delta[delta < 0]) 356 | level = np.squeeze(np.argwhere(delta == value)[0]) 357 | else: # desired magnification above base level - throw error 358 | raise ValueError("Cannot interpolate above scan magnification.") 359 | 360 | returned_magnification = estimated_magnifications[level] 361 | 362 | return level, returned_magnification 363 | 364 | 365 | class TilesByGridAndMask(_TilesByCommon): 366 | """ 367 | Select tiles according to a regular grid. Optionally, restrict the list by a mask 368 | that is read from a file. Optionally, further select a random subset of them. 369 | 370 | An instance of class TilesByGridAndMask is a callable that will select the 371 | coordinates of tiles to be taken from a slide. The selected tiles will be written 372 | to the slide dictionary. 373 | 374 | Parameters for the constructor 375 | ------------------------------ 376 | study : dictionary 377 | The study dictionary from which to read parameters about the study. 378 | randomly_select: int 379 | The number of tiles to be randomly selected from the list that would otherwise 380 | be written to the slide dictionary. A value of -1 is the default and means that 381 | all tiles should be written. 382 | overlap_height 383 | Specifies the desired amount of vertical overlap between adjacent tiles, 384 | measured in pixels using the `target_magnification`. If overlap_height is not 385 | supplied, it is read from the study dictionary, if available, otherwise it is 386 | set to zero. Zero indicates that there is no overlap between adjacent tiles; 387 | they are abutting. 388 | overlap_width 389 | Specifies the desired amount of horizontal overlap between adjacent tiles, 390 | measured in pixels using the `target_magnification`. If overlap_width is not 391 | supplied, it is read from the study dictionary, if available, otherwise it is 392 | set to zero. Zero indicates that there is no overlap between adjacent tiles; 393 | they are abutting. 394 | mask_filename: string 395 | The path of the image file to be read and used as a mask. The aspect ratio of 396 | the mask (in terms of its pixel dimensions) is expected to be about the same as 397 | the aspect ratio of the main image (in terms of its grid of tiles). A non-zero 398 | value in the mask indicates that the tile should be retained. The default is 399 | "", which means that there is no masking. 400 | mask_threshold : float 401 | A value in [0.0, 1.0]. A tile is retained if the fraction of the tile 402 | overlapping non-zero pixels in the mask is at least the mask_threshold. The 403 | fraction must be strictly positive when the threshold is zero; the fraction has 404 | to be greater than or equal to the threshold when the threshold is not zero. 405 | 406 | """ 407 | 408 | def __init__(self, study, **kwargs): 409 | """ 410 | Sanity check the supplied parameters and store them for later use.
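
        A hypothetical invocation (the mask path and counts are illustrative;
        this mirrors the example notebooks):

            selector = TilesByGridAndMask(
                study, mask_filename="mask.png", mask_threshold=0.5,
                randomly_select=100,
            )
            for slide in study["slides"].values():
                selector(slide)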
411 | """ 412 | _TilesByCommon.__init__(self) 413 | # Update keys of the dictionary from deprecated names 414 | self._update_dict(kwargs) 415 | bad_keys = kwargs.keys() - { 416 | "randomly_select", 417 | "overlap_height", 418 | "overlap_width", 419 | "mask_filename", 420 | "mask_threshold", 421 | } 422 | if bad_keys: 423 | raise ValueError( 424 | f"Unrecognized parameters {repr(bad_keys)} in " 425 | "TilesByGridAndMask.__init__" 426 | ) 427 | 428 | # randomly_select defaults to select all 429 | randomly_select = ( 430 | kwargs["randomly_select"] if "randomly_select" in kwargs else -1 431 | ) 432 | # Defaults to no masking 433 | mask_filename = kwargs["mask_filename"] if "mask_filename" in kwargs else "" 434 | # Defaults to any overlap with the mask 435 | mask_threshold = kwargs["mask_threshold"] if "mask_threshold" in kwargs else 0.0 436 | 437 | # Update keys of the dictionary from deprecated names 438 | self._update_dict(study) 439 | 440 | # If overlap is not supplied, it is read from the study dictionary, if 441 | # available, otherwise it is set to zero, which is no overlap. 442 | overlap_height = ( 443 | kwargs["overlap_height"] 444 | if "overlap_height" in kwargs 445 | else study["overlap_height"] if "overlap_height" in study else 0 446 | ) 447 | overlap_width = ( 448 | kwargs["overlap_width"] 449 | if "overlap_width" in kwargs 450 | else study["overlap_width"] if "overlap_width" in study else 0 451 | ) 452 | 453 | # Check values. 454 | if not ("version" in study and study["version"] == "version-1"): 455 | raise ValueError('study["version"] must exist and be equal to "version-1".') 456 | if not ( 457 | "tile_height" in study 458 | and isinstance(study["tile_height"], (int, np.integer)) 459 | and study["tile_height"] > 0 460 | ): 461 | raise ValueError( 462 | 'study["tile_height"]' " must exist and be a positive integer" 463 | ) 464 | if not ( 465 | "tile_width" in study 466 | and isinstance(study["tile_width"], (int, np.integer)) 467 | and study["tile_width"] > 0 468 | ): 469 | raise ValueError( 470 | 'study["tile_width"]' " must exist and be a positive integer" 471 | ) 472 | if not ( 473 | isinstance(randomly_select, (int, np.integer)) and -1 <= randomly_select 474 | ): 475 | raise ValueError( 476 | f"randomly_select ({randomly_select})" 477 | " must be a non-negative integer or -1." 478 | ) 479 | if not ( 480 | isinstance(overlap_height, (int, np.integer)) 481 | and overlap_height < study["tile_height"] 482 | ): 483 | raise ValueError( 484 | f"overlap_height ({overlap_height})" 485 | " must be less than" 486 | f' tile_height ({study["tile_height"]}).' 487 | ) 488 | if not ( 489 | isinstance(overlap_width, (int, np.integer)) 490 | and overlap_width < study["tile_width"] 491 | ): 492 | raise ValueError( 493 | f"overlap_width ({overlap_width})" 494 | " must be less than" 495 | f' tile_width ({study["tile_width"]}).' 496 | ) 497 | if mask_filename != "": 498 | mask_itk = self.check_mask_filename(mask_filename) 499 | if not ( 500 | isinstance(mask_threshold, (float, np.floating)) 501 | and mask_threshold >= 0.0 502 | and mask_threshold <= 1.0 503 | ): 504 | raise ValueError( 505 | f"mask_threshold ({mask_threshold}) must be between 0 and 1 inclusive." 506 | ) 507 | 508 | # Save values. To keep garbage collection efficient don't save all of `study`. 
509 | self.tile_height = study["tile_height"] 510 | self.tile_width = study["tile_width"] 511 | self.randomly_select = randomly_select 512 | self.overlap_height = overlap_height 513 | self.overlap_width = overlap_width 514 | self.mask_filename = mask_filename 515 | if self.mask_filename != "": 516 | self.mask_itk = mask_itk 517 | self.mask_threshold = mask_threshold 518 | # If the user hasn't put the overlap information into the top-level study 519 | # dictionary then place it there. 520 | if "overlap_height" not in study: 521 | study["overlap_height"] = self.overlap_height 522 | if "overlap_width" not in study: 523 | study["overlap_width"] = self.overlap_width 524 | self.studywide_overlap_height = study["overlap_height"] 525 | self.studywide_overlap_width = study["overlap_width"] 526 | 527 | def __call__(self, slide): 528 | """ 529 | Select tiles according to a regular grid. Optionally, restrict the list by a 530 | mask. Optionally, select a random subset of them. 531 | """ 532 | 533 | # Update keys of the dictionary from deprecated names 534 | self._update_dict(slide) 535 | 536 | # Check values. 537 | if "slide_height" not in slide: 538 | raise ValueError('slide["slide_height"] must be already set.') 539 | self.slide_height = slide["slide_height"] 540 | if "slide_width" not in slide: 541 | raise ValueError('slide["slide_width"] must be already set.') 542 | self.slide_width = slide["slide_width"] 543 | 544 | slide["overlap_height"] = self.overlap_height 545 | slide["overlap_width"] = self.overlap_width 546 | # 547 | # Do the work. 548 | # 549 | height_stride = self.tile_height - self.overlap_height 550 | width_stride = self.tile_width - self.overlap_width 551 | 552 | # Return information to the user 553 | slide["slide_height_tiles"] = math.floor( 554 | (self.slide_height - self.overlap_height) / height_stride 555 | ) 556 | slide["slide_width_tiles"] = math.floor( 557 | (self.slide_width - self.overlap_width) / width_stride 558 | ) 559 | 560 | # Find the coordinates of each tile 561 | top_too_large = self.slide_height - self.tile_height + 1 562 | left_too_large = self.slide_width - self.tile_width + 1 563 | top_left = np.array( 564 | [ 565 | pair 566 | for pair in itertools.product( 567 | np.arange(0, top_too_large, height_stride), 568 | np.arange(0, left_too_large, width_stride), 569 | ) 570 | ], 571 | dtype=np.int64, 572 | ) 573 | 574 | if hasattr(self, "mask_itk"): 575 | # There is a mask that we will have to check 576 | (self.mask_height, self.mask_width) = self.mask_itk.shape 577 | # Let the user know 578 | slide["mask_height"] = self.mask_height 579 | slide["mask_width"] = self.mask_width 580 | slide["tiles"] = self.compute_from_mask(top_left) 581 | 582 | else: 583 | # There is no mask to check 584 | slide["tiles"] = { 585 | f"tile_{i}": {"tile_top": int(corner[0]), "tile_left": int(corner[1])} 586 | for i, corner in enumerate(top_left) 587 | } 588 | 589 | if 0 <= self.randomly_select < len(slide["tiles"]): 590 | # Choose a subset of the tiles randomly 591 | slide["tiles"] = dict( 592 | random.sample(sorted(slide["tiles"].items()), self.randomly_select) 593 | ) 594 | 595 | def check_mask_filename(self, mask_filename): 596 | mask_itk = itk.imread(mask_filename) # May throw exception 597 | if mask_itk.GetImageDimension() != 2: 598 | raise ValueError( 599 | f"The mask ({mask_filename}) should be a 2-dimensional image." 
600 | ) 601 | return mask_itk 602 | 603 | def compute_from_mask(self, top_left): 604 | # Check that the input and output aspect ratios are pretty close 605 | if ( 606 | abs( 607 | math.log( 608 | (self.slide_height / self.slide_width) 609 | / (self.mask_height / self.mask_width) 610 | ) 611 | ) 612 | > 0.20 613 | ): 614 | raise ValueError( 615 | "The mask aspect ratio does not match " 616 | "that for the whole slide image." 617 | ) 618 | 619 | # cumulative_mask[row, column] will be the number of mask_itk[r, c] (i.e., 620 | # mask_itk.GetPixel((c,r))) values that are nonzero among all those with 621 | # both r < row and c < column; note the strict inequalities. We have added 622 | # a boundary on all sides of this array -- zeros on the top and left, and a 623 | # duplicate row (column) on the bottom (right) -- so that we do not need to 624 | # do extra testing in our code at the borders. We use int64 in case there 625 | # are 2^31 (~2 billion = ~ 46k by 46k) or more non-zero pixel values in our 626 | # mask. 627 | cumulative_mask = np.zeros( 628 | (self.mask_height + 2, self.mask_width + 2), dtype=np.int64 629 | ) 630 | cumulative_mask[1 : self.mask_height + 1, 1 : self.mask_width + 1] = ( 631 | itk.GetArrayViewFromImage(self.mask_itk).astype(bool).astype(np.int64) 632 | ) 633 | cumulative_mask = np.cumsum(np.cumsum(cumulative_mask, axis=0), axis=1) 634 | 635 | # Define the grid for the cumulative_mask using slide (not mask!) 636 | # coordinates. 637 | grid_points = ( 638 | np.arange(cumulative_mask.shape[0]) 639 | * (self.slide_height / self.mask_height), 640 | np.arange(cumulative_mask.shape[1]) * (self.slide_width / self.mask_width), 641 | ) 642 | 643 | # Tile boundaries may not line up with mask pixels, so we will need a 644 | # bi-linear interpolator. 645 | method = "linear" # bi-linear 646 | interpolator = scipy.interpolate.RegularGridInterpolator( 647 | grid_points, cumulative_mask, method 648 | ) 649 | # Find the coordinates of each tile 650 | top_right = top_left + np.array((0, self.tile_width)) 651 | bottom_left = top_left + np.array((self.tile_height, 0)) 652 | bottom_right = bottom_left + np.array((0, self.tile_width)) 653 | # Compute the total number of mask pixels (both whole and fractional) that 654 | # overlap each tile. 655 | cumulative_by_tile = ( 656 | interpolator(bottom_right) 657 | - interpolator(bottom_left) 658 | - interpolator(top_right) 659 | + interpolator(top_left) 660 | ) 661 | # When the threshold is greater than zero, any `cumulative_by_tile` that is 662 | # greater than or equal to `threshold` is accepted. Because we are worried 663 | # about rounding error, we'll use `epsilon` to let very close cases be 664 | # accepted. When the threshold is exactly zero, any cumulative_by_tile that 665 | # is strictly greater than zero is accepted. As `cumulative_by_tile` is, 666 | # `threshold` is a count of whole and fractional mask pixels. 667 | epsilon = 1e-6 668 | threshold = max( 669 | 0.0, 670 | self.mask_threshold 671 | * (self.tile_height * self.mask_height / self.slide_height) 672 | * (self.tile_width * self.mask_width / self.slide_width) 673 | - epsilon, 674 | ) 675 | return { 676 | f"tile_{i}": {"tile_top": int(corner[0]), "tile_left": int(corner[1])} 677 | for i, corner in enumerate(top_left) 678 | if cumulative_by_tile[i] > threshold 679 | } 680 | 681 | 682 | class TilesByList(_TilesByCommon): 683 | """ 684 | Select the tiles supplied by the user. Optionally, select a random subset of them. 
685 | 
686 |     An instance of class TilesByList is a callable that will select the coordinates of
687 |     tiles to be taken from a slide.  The selected tiles will be written to the slide
688 |     dictionary.
689 | 
690 |     Parameters for the constructor
691 |     ------------------------------
692 |     study : dictionary
693 |         The study dictionary from which to read parameters about the study.
694 |     randomly_select: int
695 |         The number of tiles to be randomly selected from the list that would otherwise
696 |         be written to the slide dictionary.  A value of -1 is the default and means that
697 |         all tiles should be written.
698 |     tiles_dictionary: dictionary
699 |         For example, {'AB234': {'tile_top': top0, 'tile_left': left0}, 'CD43':
700 |         {'tile_top': top1, 'tile_left': left1}, ...}.  Tiles from this list will be
701 |         copied into the slide dictionary if they are randomly selected.
702 | 
703 |     """
704 | 
705 |     def __init__(self, study, randomly_select=-1, tiles_dictionary={}):
706 |         """
707 |         Sanity check the supplied parameters and store them for later use.
708 | 
709 |         randomly_select defaults to "select all".
710 | 
711 |         For example,
712 |         tiles_dictionary = {
713 |             "AB234": {"tile_top": top0, "tile_left": left0},
714 |             "CD43": {"tile_top": top1, "tile_left": left1},
715 |             ...
716 |         }
717 |         """
718 |         _TilesByCommon.__init__(self)
719 | 
720 |         # Update keys of the dictionary from deprecated names
721 |         self._update_dict(study)
722 | 
723 |         # Check values
724 |         if not ("version" in study and study["version"] == "version-1"):
725 |             raise ValueError('study["version"] must exist and be equal to "version-1".')
726 |         if not (
727 |             "tile_height" in study
728 |             and isinstance(study["tile_height"], (int, np.integer))
729 |             and study["tile_height"] > 0
730 |         ):
731 |             raise ValueError(
732 |                 'study["tile_height"] must exist and be a positive integer'
733 |             )
734 |         if not (
735 |             "tile_width" in study
736 |             and isinstance(study["tile_width"], (int, np.integer))
737 |             and study["tile_width"] > 0
738 |         ):
739 |             raise ValueError(
740 |                 'study["tile_width"] must exist and be a positive integer'
741 |             )
742 |         if not (
743 |             isinstance(randomly_select, (int, np.integer)) and -1 <= randomly_select
744 |         ):
745 |             raise ValueError(
746 |                 f"randomly_select ({randomly_select})"
747 |                 " must be a non-negative integer or -1."
748 |             )
749 |         if not isinstance(tiles_dictionary, dict):
750 |             raise ValueError("tiles_dictionary must be a dictionary.")
751 |         for tile_corner in tiles_dictionary.values():
752 |             # Update keys of the dictionary from deprecated names
753 |             self._update_dict(tile_corner)
754 |         if not (
755 |             all(
756 |                 [
757 |                     isinstance(tile_corner, dict)
758 |                     for tile_corner in tiles_dictionary.values()
759 |                 ]
760 |             )
761 |             and all(
762 |                 [
763 |                     key in tile_corner.keys()
764 |                     for tile_corner in tiles_dictionary.values()
765 |                     for key in ("tile_top", "tile_left")
766 |                 ]
767 |             )
768 |             and all(
769 |                 [
770 |                     isinstance(tile_corner[key], (int, np.integer))
771 |                     for tile_corner in tiles_dictionary.values()
772 |                     for key in ("tile_top", "tile_left")
773 |                 ]
774 |             )
775 |             and all(
776 |                 [
777 |                     tile_corner[key] >= 0
778 |                     for tile_corner in tiles_dictionary.values()
779 |                     for key in ("tile_top", "tile_left")
780 |                 ]
781 |             )
782 |         ):
783 |             raise ValueError(
784 |                 "tiles_dictionary must be a dictionary of tiles."
785 |                 ' Each tile is a dictionary, with keys "tile_top" and "tile_left"'
786 |                 " and with values that are non-negative integers."
787 |             )
788 | 
789 |         # Save values.  To keep garbage collection efficient, don't save all of `study`,
790 |         # just the parts that we need.
791 |         self.tile_height = study["tile_height"]
792 |         self.tile_width = study["tile_width"]
793 |         self.randomly_select = randomly_select
794 |         self.tiles_dictionary = copy.deepcopy(
795 |             tiles_dictionary
796 |         )  # in case user changes it later
797 | 
798 |     def __call__(self, slide):
799 |         """
800 |         Select the tiles supplied by the user.  Optionally, select a random subset of
801 |         them.
802 |         """
803 |         slide["tiles"] = copy.deepcopy(
804 |             self.tiles_dictionary
805 |         )  # in case __call__ is called again.
806 |         if 0 <= self.randomly_select < len(slide["tiles"]):
807 |             # Choose a subset of the tiles randomly
808 |             slide["tiles"] = dict(
809 |                 random.sample(sorted(slide["tiles"].items()), self.randomly_select)
810 |             )
811 | 
812 | 
813 | class TilesRandomly(_TilesByCommon):
814 |     """
815 |     Select a random subset of all possible tiles.
816 | 
817 |     An instance of class TilesRandomly is a callable that will select the coordinates of
818 |     tiles to be taken from a slide.  The selected tiles will be written to the slide
819 |     dictionary.
820 | 
821 |     Parameters for the constructor
822 |     ------------------------------
823 |     study : dictionary
824 |         The study dictionary from which to read parameters about the study.
825 |     randomly_select: int
826 |         The number of tiles to be randomly selected from the slide.  The value must be
827 |         a non-negative integer; 0 selects no tiles.  A value of 1 is the default.
828 | 
829 |     """
830 | 
831 |     def __init__(self, study, randomly_select=1):  # Defaults to select one
832 |         """
833 |         Sanity check the supplied parameters and store them for later use.
834 |         """
835 |         _TilesByCommon.__init__(self)
836 | 
837 |         # Update keys of the dictionary from deprecated names
838 |         self._update_dict(study)
839 | 
840 |         # Check values.
841 |         if not ("version" in study and study["version"] == "version-1"):
842 |             raise ValueError('study["version"] must exist and be equal to "version-1".')
843 |         if not (
844 |             "tile_height" in study
845 |             and isinstance(study["tile_height"], (int, np.integer))
846 |             and study["tile_height"] > 0
847 |         ):
848 |             raise ValueError(
849 |                 'study["tile_height"] must exist and be a positive integer'
850 |             )
851 |         if not (
852 |             "tile_width" in study
853 |             and isinstance(study["tile_width"], (int, np.integer))
854 |             and study["tile_width"] > 0
855 |         ):
856 |             raise ValueError(
857 |                 'study["tile_width"] must exist and be a positive integer'
858 |             )
859 |         if not (
860 |             isinstance(randomly_select, (int, np.integer)) and 0 <= randomly_select
861 |         ):
862 |             raise ValueError(
863 |                 f"randomly_select ({randomly_select})"
864 |                 " must be a non-negative integer."
865 |             )
866 | 
867 |         # Save values.  To keep garbage collection efficient, don't save all of `study`.
868 |         self.tile_height = study["tile_height"]
869 |         self.tile_width = study["tile_width"]
870 |         self.randomly_select = randomly_select
871 | 
872 |     def __call__(self, slide):
873 |         """
874 |         Select a random subset of all possible tiles.
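    | 
    |         For example (hypothetical sizes), a slide with slide_height = 1024 and
    |         tile_height = 256 yields tile_top values drawn from range(0, 769); each
    |         tile's position is drawn independently, so repeated positions are
    |         possible.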
875 | """ 876 | 877 | # Update keys of the dictionary from deprecated names 878 | self._update_dict(slide) 879 | 880 | if "slide_height" not in slide: 881 | raise ValueError('slide["slide_height"] must be already set.') 882 | if "slide_width" not in slide: 883 | raise ValueError('slide["slide_width"] must be already set.') 884 | 885 | top_too_large = slide["slide_height"] - self.tile_height + 1 886 | left_too_large = slide["slide_width"] - self.tile_width + 1 887 | slide["tiles"] = { 888 | f"tile_{i}": { 889 | "tile_top": random.randrange(0, top_too_large), 890 | "tile_left": random.randrange(0, left_too_large), 891 | } 892 | for i in range(self.randomly_select) 893 | } 894 | 895 | 896 | class ChunkLocations(_TilesByCommon): 897 | def __init__(self): 898 | _TilesByCommon.__init__(self) 899 | self.no_indices = np.array((), dtype=np.int64) 900 | 901 | def __call__(self, study_description): 902 | """ 903 | Given the list of desired tile locations, computes the locations of chunks to be 904 | read 905 | """ 906 | 907 | # Update keys of the dictionary from deprecated names 908 | self._update_dict(study_description) 909 | 910 | if not ( 911 | "version" in study_description 912 | and study_description["version"] == "version-1" 913 | ): 914 | raise ValueError( 915 | 'study_description["version"] must exist and be equal to "version-1".' 916 | ) 917 | if not ( 918 | "tile_height" in study_description 919 | and isinstance(study_description["tile_height"], (int, np.integer)) 920 | and study_description["tile_height"] > 0 921 | ): 922 | raise ValueError( 923 | 'study_description["tile_height"]' 924 | " must exist and be a positive integer" 925 | ) 926 | if not ( 927 | "tile_width" in study_description 928 | and isinstance(study_description["tile_width"], (int, np.integer)) 929 | and study_description["tile_width"] > 0 930 | ): 931 | raise ValueError( 932 | 'study_description["tile_width"]' 933 | " must exist and be a positive integer" 934 | ) 935 | for slide in study_description["slides"].values(): 936 | # Update keys of the dictionary from deprecated names 937 | self._update_dict(slide) 938 | 939 | if not ( 940 | "returned_magnification" in slide 941 | and isinstance( 942 | slide["returned_magnification"], 943 | (int, np.integer, float, np.floating), 944 | ) 945 | and slide["returned_magnification"] > 0 946 | ): 947 | raise ValueError( 948 | 'slide["returned_magnification"]' 949 | " must exist and be a positive number" 950 | ) 951 | # Check that other necessary keys are also present!!! 952 | 953 | # Partition the set of tiles into chunks. 
954 |         self._designate_chunks_for_tiles(study_description)
955 |         # cProfile.runctx(
956 |         #     "self._designate_chunks_for_tiles(study_description)",
957 |         #     globals=globals(),
958 |         #     locals=locals(),
959 |         #     sort="cumulative",
960 |         # )
961 | 
962 |     def _designate_chunks_for_tiles(self, study_description):
963 |         # Update keys of the dictionary from deprecated names
964 |         self._update_dict(study_description)
965 | 
966 |         tile_height = study_description["tile_height"]
967 |         tile_width = study_description["tile_width"]
968 | 
969 |         for slide in study_description["slides"].values():
970 |             # Update keys of the dictionary from deprecated names
971 |             self._update_dict(slide)
972 | 
973 |             if not (
974 |                 "chunk_height" in slide
975 |                 and isinstance(slide["chunk_height"], (int, np.integer))
976 |                 and slide["chunk_height"] > 0
977 |             ):
978 |                 raise ValueError(
979 |                     'slide["chunk_height"] must exist and be a positive integer'
980 |                 )
981 |             if not (
982 |                 "chunk_width" in slide
983 |                 and isinstance(slide["chunk_width"], (int, np.integer))
984 |                 and slide["chunk_width"] > 0
985 |             ):
986 |                 raise ValueError(
987 |                     'slide["chunk_width"] must exist and be a positive integer'
988 |                 )
989 |             chunk_height = slide["chunk_height"]
990 |             chunk_width = slide["chunk_width"]
991 | 
992 |             tiles_names = list(slide["tiles"].keys())
993 |             tiles_data = np.array(
994 |                 [
995 |                     [
996 |                         slide["tiles"][tile]["tile_top"],
997 |                         slide["tiles"][tile]["tile_left"],
998 |                     ]
999 |                     for tile in tiles_names
1000 |                 ],
1001 |                 dtype=np.int64,
1002 |             )
1003 |             self.build_tree(tiles_data)
1004 |             chunks = slide["chunks"] = {}
1005 |             num_chunks = 0
1006 |             while self.get_tree() is not None:
1007 |                 tile = self.get_topmost()
1008 |                 chunk = chunks[f"chunk_{num_chunks}"] = {
1009 |                     "chunk_top": tile[0],
1010 |                     "chunk_left": tile[1],
1011 |                     "chunk_bottom": tile[0] + chunk_height,
1012 |                     "chunk_right": tile[1] + chunk_width,
1013 |                 }
1014 |                 num_chunks += 1
1015 | 
1016 |                 mins = tile.copy()
1017 |                 maxs = tile.copy()
1018 |                 maxs[0] += chunk_height - tile_height + 1
1019 |                 maxs[1] += chunk_width - tile_width + 1
1020 |                 indices = self.find_in_range_and_delete(mins, maxs)
1021 |                 tiles = chunk["tiles"] = {
1022 |                     tiles_names[i]: {
1023 |                         "tile_top": tiles_data[i][0],
1024 |                         "tile_left": tiles_data[i][1],
1025 |                     }
1026 |                     for i in indices
1027 |                 }
1028 |                 # Make the chunk as small as possible given the tiles that it must
1029 |                 # support.  Note that this also ensures that the pixels that are read do
1030 |                 # not run over the bottom or right border of the slide (assuming that
1031 |                 # the tiles do not go over those borders).
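    |                 # For example (hypothetical numbers), if tile_height is 256 and
    |                 # this chunk's tiles have tile_top values 0 and 128, then
    |                 # chunk_top becomes 0 and chunk_bottom becomes 128 + 256 = 384.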
1032 | chunk["chunk_top"] = min([tile["tile_top"] for tile in tiles.values()]) 1033 | chunk["chunk_left"] = min( 1034 | [tile["tile_left"] for tile in tiles.values()] 1035 | ) 1036 | chunk["chunk_bottom"] = ( 1037 | max([tile["tile_top"] for tile in tiles.values()]) + tile_height 1038 | ) 1039 | chunk["chunk_right"] = ( 1040 | max([tile["tile_left"] for tile in tiles.values()]) + tile_width 1041 | ) 1042 | 1043 | @staticmethod 1044 | def read_large_image( 1045 | filename, 1046 | chunk_top, 1047 | chunk_left, 1048 | chunk_bottom, 1049 | chunk_right, 1050 | returned_magnification, 1051 | ): 1052 | # if "_num_chunks" not in ChunkLocations.read_large_image.__dict__: 1053 | # ChunkLocations.read_large_image._num_chunks = 0 1054 | # chunk_name = ( 1055 | # f"#read_large_image {ChunkLocations.read_large_image._num_chunks:06}" 1056 | # ) 1057 | # ChunkLocations.read_large_image._num_chunks += 1 1058 | 1059 | # print(f"{chunk_name} begin {datetime.datetime.now()}") 1060 | import large_image 1061 | import large_image_source_tiff 1062 | 1063 | ts = ( 1064 | large_image_source_tiff.open(filename) 1065 | if os.path.splitext(filename)[1] in (".tif", ".tiff", ".svs") 1066 | else large_image.open(filename) 1067 | ) 1068 | chunk = ts.getRegion( 1069 | scale=dict(magnification=returned_magnification), 1070 | format=large_image.constants.TILE_FORMAT_NUMPY, 1071 | region=dict( 1072 | left=chunk_left, 1073 | top=chunk_top, 1074 | width=chunk_right - chunk_left, 1075 | height=chunk_bottom - chunk_top, 1076 | units="mag_pixels", 1077 | ), 1078 | )[0] 1079 | # print(f"{chunk_name} end {datetime.datetime.now()}") 1080 | return chunk 1081 | 1082 | @staticmethod 1083 | def scale_it(value, factor): 1084 | return math.floor(value / factor + 0.01) 1085 | 1086 | def build_tree(self, data): 1087 | self.data = data 1088 | self.tree = self._build(np.arange(self.data.shape[0])) 1089 | 1090 | def get_data(self): 1091 | return self.data 1092 | 1093 | def get_tree(self): 1094 | return self.tree 1095 | 1096 | def get_topmost(self): 1097 | return self.tree["topmost"] 1098 | 1099 | def find_in_range_and_delete(self, mins, maxs): 1100 | self.mins = mins 1101 | self.maxs = maxs 1102 | indices, newtree = self._find_in_range_and_delete(subtree=self.tree) 1103 | self.tree = newtree 1104 | return indices 1105 | 1106 | def _build(self, indices): 1107 | # Split this subset of the data based upon its coordinate means 1108 | subset = self.data[indices, :] 1109 | means = np.mean(subset, axis=0) 1110 | # Calculate the quadrant (in range(2**m)) for each point 1111 | rants = (subset[:, 0] >= means[0]) + 0 1112 | for col in range(1, self.data.shape[1]): 1113 | rants = (rants * 2) + (subset[:, col] >= means[col]) 1114 | 1115 | # How to process this depends upon how many quadrants are used 1116 | occur = np.unique(rants) 1117 | if len(occur) == 1: 1118 | return {"means": means, "topmost": means, "indices": indices} 1119 | else: 1120 | recurse = {rant: self._build(indices[rants == rant]) for rant in occur} 1121 | qvalues = list(recurse.values()) 1122 | # Find the the topmost, in dictionary order 1123 | topmost = self._compute_topmost(qvalues) 1124 | 1125 | # Return what we have found 1126 | return {"means": means, "topmost": topmost, "quadrants": recurse} 1127 | 1128 | @staticmethod 1129 | def _compute_topmost(qvalues): 1130 | topmost = qvalues[0]["topmost"] 1131 | for k in range(1, len(qvalues)): 1132 | test_key = qvalues[k]["topmost"] 1133 | for c in range(len(topmost)): 1134 | if test_key[c] != topmost[c]: 1135 | break 1136 | if test_key[c] < 
topmost[c]: 1137 | topmost = test_key 1138 | return topmost 1139 | 1140 | def _find_in_range_and_delete(self, subtree): 1141 | if "indices" in subtree: 1142 | # Process this leaf node 1143 | if all(subtree["means"] >= self.mins) and all(subtree["means"] < self.maxs): 1144 | # Return these indices and remove the subtree 1145 | return subtree["indices"], None 1146 | else: 1147 | # Return no indices and remove nothing from the subtree 1148 | return self.no_indices, subtree 1149 | else: 1150 | # Process this internal node 1151 | means = subtree["means"] 1152 | recurse = dict( 1153 | ( 1154 | (qkey, self._find_in_range_and_delete(qvalue)) 1155 | if all( 1156 | ( 1157 | ( 1158 | self.maxs[col] > means[col] 1159 | if qkey & 2 ** (self.data.shape[1] - 1 - col) 1160 | else self.mins[col] < means[col] 1161 | ) 1162 | for col in range(self.data.shape[1]) 1163 | ) 1164 | ) 1165 | else (qkey, (self.no_indices, qvalue)) 1166 | ) 1167 | for qkey, qvalue in subtree["quadrants"].items() 1168 | ) 1169 | indices = np.array( 1170 | [index for pair in recurse.values() for index in pair[0]], 1171 | dtype=np.int64, 1172 | ) 1173 | quadrants = { 1174 | qkey: pair[1] for qkey, pair in recurse.items() if pair[1] is not None 1175 | } 1176 | if len(quadrants) == 0: 1177 | return indices, None 1178 | topmost = self._compute_topmost(list(quadrants.values())) 1179 | return indices, { 1180 | "means": subtree["means"], 1181 | "topmost": topmost, 1182 | "quadrants": quadrants, 1183 | } 1184 | --------------------------------------------------------------------------------