├── .gitignore ├── LICENSE ├── README.md ├── buildkite.yml ├── datatap ├── __init__.py ├── api │ ├── .gitignore │ ├── __init__.py │ ├── endpoints │ │ ├── __init__.py │ │ ├── database_endpoints.py │ │ ├── dataset_endpoints.py │ │ ├── endpoints.py │ │ ├── repository_endpoints.py │ │ ├── request.py │ │ └── user_endpoints.py │ ├── entities │ │ ├── __init__.py │ │ ├── api.py │ │ ├── database.py │ │ ├── dataset.py │ │ ├── repository.py │ │ └── user.py │ └── types │ │ ├── __init__.py │ │ ├── database.py │ │ ├── dataset.py │ │ ├── repository.py │ │ └── user.py ├── comet │ └── __init__.py ├── droplet │ ├── __init__.py │ ├── _media.py │ ├── attributes.py │ ├── bounding_box.py │ ├── class_annotation.py │ ├── frame_annotation.py │ ├── image.py │ ├── image_annotation.py │ ├── instance.py │ ├── keypoint.py │ ├── multi_instance.py │ ├── segmentation.py │ ├── video.py │ └── video_annotation.py ├── examples │ ├── __init__.py │ └── torch.ipynb ├── geometry │ ├── __init__.py │ ├── mask.py │ ├── point.py │ ├── polygon.py │ └── rectangle.py ├── metrics │ ├── __init__.py │ ├── _types.py │ ├── confusion_matrix.py │ ├── iou.py │ └── precision_recall_curve.py ├── py.typed ├── template │ ├── __init__.py │ ├── class_annotation_template.py │ ├── frame_annotation_template.py │ ├── image_annotation_template.py │ ├── instance_template.py │ ├── multi_instance_template.py │ └── video_annotation_template.py ├── tf │ ├── __init__.py │ └── dataset.py ├── torch │ ├── __init__.py │ ├── _patch_torch.py │ ├── dataloader.py │ ├── dataset.py │ └── utils.py └── utils │ ├── __init__.py │ ├── cache_generator.py │ ├── environment.py │ ├── helpers.py │ ├── or_nullish.py │ └── print_helpers.py ├── dev_requirements.txt ├── examples └── coco_to_droplet.py ├── pyrightconfig.json ├── requirements.txt ├── requirements_image.txt ├── requirements_importers.txt ├── requirements_metrics.txt ├── requirements_tf.txt ├── requirements_torch.txt ├── setup.py ├── tests ├── __init__.py └── metrics │ ├── __init__.py │ ├── test_iou.py │ └── test_precision_recall_curve.py └── typings ├── PIL ├── Image.pyi └── __init__.pyi ├── boto3 └── __init__.pyi ├── comet_ml ├── API.pyi ├── APIExperiment.pyi ├── ExistingExperiment.pyi ├── Experiment.pyi ├── __init__.pyi ├── exceptions.pyi └── query │ └── __init__.pyi ├── dask ├── __init__.pyi ├── bag.pyi └── delayed.pyi ├── fsspec └── __init__.pyi ├── matplotlib ├── __init__.py └── pyplot │ └── __init__.py ├── neo4j ├── __init__.pyi ├── driver.pyi ├── graph_database.pyi ├── record.pyi ├── result.pyi ├── session.pyi └── transaction.pyi ├── pycocotools ├── __init__.pyi └── mask.pyi ├── requests └── __init__.pyi ├── scipy ├── __init__.pyi └── optimize │ └── __init__.pyi ├── shapely ├── __init__.pyi └── geometry │ └── __init__.pyi ├── skimage ├── __init__.pyi └── measure.pyi ├── sortedcontainers └── __init__.pyi ├── tensorflow ├── __init__.pyi ├── data │ ├── __init__.pyi │ └── dataset.pyi ├── distribute │ ├── __init__.pyi │ ├── distributed_dataset.pyi │ ├── experimental │ │ ├── __init__.pyi │ │ └── strategy.pyi │ └── input_context.pyi ├── io │ ├── __init__.py │ └── decode_image.py ├── tensor.pyi └── types.pyi └── torchvision ├── __init__.py └── transforms ├── __init__.py └── functional.pyi /.gitignore: -------------------------------------------------------------------------------- 1 | /out 2 | /test 3 | /dask-* 4 | /*.egg-info 5 | /tmp* 6 | .venv 7 | __pycache__ 8 | build 9 | dist 10 | .vscode 11 | html -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 |

12 | The visual data management platform from Zensors. 13 |

14 | 15 | ---------- 16 | 17 |

18 | Join for free at app.datatap.dev. 19 |

20 | 21 | 22 | The dataTap Python library is the primary interface for using dataTap's rich data management tools. Create datasets, stream annotations, and analyze model performance all with one library. 23 | 24 | ---------- 25 | 26 | ## Documentation 27 | 28 | Full documentation is available at [docs.datatap.dev](https://docs.datatap.dev/). 29 | 30 | ## Features 31 | 32 | - [x] ⚡ Begin training instantly 33 | - [x] 🔥 Works with all major ML frameworks (PyTorch, TensorFlow, etc.) 34 | - [x] 🛰️ Real-time streaming to avoid large dataset downloads 35 | - [x] 🌐 Universal data format for simple data exchange 36 | - [x] 🎨 Combine data from multiple sources into a single dataset easily 37 | - [x] 🧮 Rich ML utilities to compute PR-curves, confusion matrices, and accuracy metrics 38 | - [x] 💽 Free access to a variety of open datasets 39 | 40 | ## Getting Started (Platform) 41 | 42 | To begin, select a dataset from the dataTap repository. 43 | 44 |

47 | 48 | Then copy the starter code based on your library preference. 49 | 50 |

53 | 54 | Paste the starter code and start training. 55 | 56 |
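For reference, the starter code for most image datasets boils down to a few lines like the sketch below (the dataset slug is only an example; use the one you selected, and see the Getting Started (API) section below for installation and API-key setup):

```python
from datatap import Api

api = Api()
dataset = api.get_default_database().get_dataset("_/coco:latest")

for annotation in dataset.stream_split("training"):
    ...  # feed each annotation into your training pipeline
```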

59 | 60 | ## Getting Started (API) 61 | 62 | Install the client library. 63 | 64 | ```bash 65 | pip install datatap 66 | ``` 67 | 68 | Register at [app.datatap.dev](https://app.datatap.dev). Then, go to `Settings > Api Keys` to find your personal API key. 69 | 70 | ```bash 71 | export DATATAP_API_KEY="XXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXX" 72 | ``` 73 | 74 | Start using open datasets instantly. 75 | 76 | ```python 77 | from datatap import Api 78 | 79 | api = Api() 80 | coco = api.get_default_database().get_repository("_/coco") 81 | dataset = coco.get_dataset("latest") 82 | print("COCO: ", dataset) 83 | ``` 84 | 85 | 86 | 87 | ## Data Streaming Example 88 | 89 | ```python 90 | import itertools 91 | from datatap import Api 92 | 93 | api = Api() 94 | dataset = (api 95 | .get_default_database() 96 | .get_repository("_/wider-person") 97 | .get_dataset("latest") 98 | ) 99 | 100 | training_stream = dataset.stream_split("training") 101 | for annotation in itertools.islice(training_stream, 5): 102 | print("Received annotation:", annotation) 103 | ``` 104 | 105 | ## More Examples 106 | - [Documented Sample](https://github.com/Zensors/datatap-python/tree/master/datatap/examples/streaming-sample.md) 107 | - [PyTorch Jupyter Notebook](https://github.com/Zensors/datatap-python/tree/master/datatap/examples/torch.ipynb) 108 | 109 | 110 | ## Support and FAQ 111 | 112 | **Q. How do I resolve a missing API Key?** 113 | 114 | If you see the error `Exception: No API key available. Either provide it or use the [DATATAP_API_KEY] environment variable`, then the dataTap library was not able to find your API key. You can find your API key on [app.datatap.dev](https://app.datatap.dev) under settings. You can either set it as an environment variable or as the first argument to the `Api` constructor. 115 | 116 | **Q. Can dataTap be used offline?** 117 | 118 | Some functionality can be used offline, such as the droplet utilities and metrics. However, repository access and dataset streaming require internet access, even for local databases. 119 | 120 | **Q. Is dataTap accepting contributions?** 121 | 122 | dataTap currently uses a separate code review system for managing contributions. The team is looking into switching that system to GitHub to allow public contributions. Until then, we will actively monitor the GitHub issue tracker to help accommodate the community's needs. 123 | 124 | **Q. How can I get help using dataTap?** 125 | 126 | You can post a question in the [issue tracker](https://github.com/zensors/datatap-python/issues). The dataTap team actively monitors the repository, and will try to get back to you as soon as possible. 127 | -------------------------------------------------------------------------------- /buildkite.yml: -------------------------------------------------------------------------------- 1 | steps: 2 | - label: ":copyright: Typechecking Module" 3 | commands: 4 | - "echo '--- Setting Up'" 5 | - "mkdir -p /build" 6 | - "cp -R . /build" 7 | - "cd /build" 8 | - "echo '--- Installing Packages'" 9 | - "pip install -r dev_requirements.txt" 10 | - "pip install -r requirements_torch.txt" 11 | - "pip install -e '.[metrics,torch]'" 12 | - "yarn global add 'pyright@1.1.264'" 13 | - "echo '+++ Running Pyright'" 14 | - "pyright" 15 | if: build.message !~ /skip tests/ 16 | plugins: 17 | - docker#v3.7.0: 18 | image: "nikolaik/python-nodejs:python3.8-nodejs12" 19 | 20 | - label: ":python: Testing Module" 21 | commands: 22 | - "echo '--- Setting Up'" 23 | - "mkdir -p /test" 24 | - "cp -R . 
/test" 25 | - "cd /test" 26 | - "echo '--- Installing Packages'" 27 | - "pip install -e '.[metrics]'" 28 | - "echo '+++ Running Tests'" 29 | - "python -m unittest discover tests" 30 | if: build.message !~ /skip tests/ 31 | plugins: 32 | - docker#v3.7.0: 33 | image: "python:3.8" 34 | 35 | - wait 36 | 37 | - label: ":package: Building and Pushing Wheel" 38 | commands: 39 | - "python3.7 setup.py bdist_wheel" 40 | - "twine upload -r datatap dist/*" 41 | - "twine upload -r zensors dist/*" 42 | if: build.branch == "master" 43 | 44 | - label: ":package: Building and Pushing Documentation" 45 | commands: 46 | - "echo '--- Setting Up'" 47 | - "mkdir -p /build" 48 | - "cp -R . /build" 49 | - "cd /build" 50 | - "echo '--- Installing Packages'" 51 | - "pip install -r requirements_torch.txt" 52 | - "pip install -e '.[metrics,torch]'" 53 | - "pip install pdoc3 awscli comet_ml" 54 | - "echo '+++ Compiling Docs'" 55 | - "pdoc3 datatap --html" 56 | - "echo '+++ Uploading Docs'" 57 | - "cd html/datatap && aws s3 cp --recursive --acl public-read . s3://docs.datatap.dev/" 58 | plugins: 59 | - docker#v3.7.0: 60 | image: "python:3.8" 61 | volumes: 62 | - "/var/lib/buildkite-agent/.aws/:/root/.aws/" 63 | if: build.branch == "master" 64 | 65 | - label: ":github: Pushing to github" 66 | commands: 67 | - "git remote add gh git@github.com:Zensors/datatap-python.git || true" 68 | - "git push gh HEAD:${BUILDKITE_BRANCH}" 69 | if: "build.branch !~ /^refs/" 70 | -------------------------------------------------------------------------------- /datatap/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides classes and methods for interacting with dataTap. This includes inspecting individual annotations, 3 | creating or importing new annotations, and creating or loading datasets for machine learning. 4 | 5 | .. include:: ../README.md 6 | """ 7 | 8 | import sys as _sys 9 | 10 | if _sys.version_info < (3, 7): 11 | print("\x1b[38;5;1mUsing an unsupported python version. Please install Python 3.7 or greater\x1b[0m") 12 | raise Exception("Invalid python version") 13 | 14 | from .api.entities import Api 15 | 16 | __all__ = [ 17 | "Api", 18 | "api", 19 | "droplet", 20 | "geometry", 21 | "template", 22 | "utils", 23 | ] -------------------------------------------------------------------------------- /datatap/api/.gitignore: -------------------------------------------------------------------------------- 1 | test.py -------------------------------------------------------------------------------- /datatap/api/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The `datatap.api` module provides two different interfaces for the API. 3 | 4 | The simplest of these is found in `endpoints`, and contains classes and methods 5 | for directly interfacing with the API using its HTTP/JSON protocol. 6 | 7 | The more powerful interface is the `entities` interface, which wraps these 8 | endpoints into python objects with convenience methods for accessing other 9 | entities. 10 | """ 11 | 12 | from . import endpoints 13 | from . import entities 14 | from . import types 15 | 16 | __all__ = [ 17 | "endpoints", 18 | "entities", 19 | "types", 20 | ] -------------------------------------------------------------------------------- /datatap/api/endpoints/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Encapsulates all of the raw API requests. 
3 | 4 | In most cases, it is preferable to interact with the api through the 5 | `datatap.api.entities` submodule. However, this module can be used 6 | as well. 7 | 8 | ```py 9 | from datatap.api.endpoints import ApiEndpoints 10 | 11 | api_endpoints = ApiEndpoints() 12 | 13 | print(api_endpoints.user.current()) 14 | print(api_endpoints.database.list()) 15 | ``` 16 | """ 17 | 18 | from .endpoints import ApiEndpoints 19 | 20 | __all__ = [ 21 | "ApiEndpoints" 22 | ] -------------------------------------------------------------------------------- /datatap/api/endpoints/database_endpoints.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .request import ApiNamespace 4 | from ..types import JsonDatabase 5 | 6 | class Database(ApiNamespace): 7 | """ 8 | Raw API for interacting with database endpoints. 9 | """ 10 | def list(self) -> List[JsonDatabase]: 11 | """ 12 | Returns a list of `JsonDatabase`s that the current user has access to. 13 | """ 14 | return self.get[List[JsonDatabase]]("/database") 15 | 16 | def query_by_uid(self, database: str) -> JsonDatabase: 17 | """ 18 | Returns a specific `JsonDatabase`, identified by UID. 19 | """ 20 | return self.get[JsonDatabase](f"/database/{database}") 21 | 22 | def query_by_name(self, database_name: str) -> List[JsonDatabase]: 23 | """ 24 | Returns a list of `JsonDatabase`s with the name `database_name`. 25 | """ 26 | return self.post[List[JsonDatabase]](f"/database/query", { "name": database_name }) -------------------------------------------------------------------------------- /datatap/api/endpoints/dataset_endpoints.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from datatap.api.types.dataset import JsonDataset 3 | 4 | import tempfile 5 | import ctypes 6 | from typing import Generator 7 | from multiprocessing import Array, set_start_method 8 | 9 | from datatap.droplet import ImageAnnotationJson 10 | from datatap.utils import CacheGenerator 11 | 12 | from .request import ApiNamespace 13 | 14 | set_start_method("spawn", force = True) 15 | process_directory_value = Array(ctypes.c_char, tempfile.mkdtemp(prefix="datatap-").encode("ascii")) 16 | process_directory: str = process_directory_value.value.decode("ascii") 17 | 18 | class Dataset(ApiNamespace): 19 | """ 20 | Raw API for interacting with dataset endpoints. 21 | """ 22 | 23 | def query(self, database_uid: str, namespace: str, name: str, tag: str) -> JsonDataset: 24 | """ 25 | Queries the database for a dataset with given `namespace`, `name`, and `tag`. 26 | Returns a `JsonDataset`. 27 | """ 28 | return self.get[JsonDataset](f"/database/{database_uid}/repository/{namespace}/{name}/{tag}") 29 | 30 | def stream_split( 31 | self, 32 | *, 33 | database_uid: str, 34 | namespace: str, 35 | name: str, 36 | uid: str, 37 | split: str, 38 | chunk: int, 39 | nchunks: int 40 | ) -> Generator[ImageAnnotationJson, None, None]: 41 | """ 42 | Streams a split of a dataset. Required to stream are the `database_uid`, the full path of the daataset, and the 43 | `split`. Additionally, since this endpoint automatically shards the split, you must provide a chunk number 44 | (`chunk`) and the total number of chunks in the shard (`nchunks`). 45 | 46 | The result is a generator of `ImageAnnotationJson`s. 47 | """ 48 | if chunk < 0 or chunk >= nchunks: 49 | raise Exception(f"Invalid chunk specification. 
{chunk} must be in the range [0, {nchunks})") 50 | 51 | dir_name = f"{process_directory}/{namespace}-{name}-{uid}-{split}-{nchunks}" 52 | file_name = f"{dir_name}/chunk-{chunk}.jsonl" 53 | 54 | def create_stream(): 55 | return self.stream[ImageAnnotationJson]( 56 | f"/database/{database_uid}/repository/{namespace}/{name}/{uid}/split/{split}/stream", 57 | { "chunk": str(chunk), "nchunks": str(nchunks) } 58 | ) 59 | 60 | return CacheGenerator(file_name, create_stream) -------------------------------------------------------------------------------- /datatap/api/endpoints/endpoints.py: -------------------------------------------------------------------------------- 1 | from datatap.api.endpoints.repository_endpoints import Repository 2 | from typing import Optional 3 | 4 | from .request import Request 5 | from .user_endpoints import User 6 | from .database_endpoints import Database 7 | from .dataset_endpoints import Dataset 8 | 9 | class ApiEndpoints: 10 | """ 11 | Class for performing raw API requests. 12 | """ 13 | 14 | user: User 15 | """ 16 | User endpoints. 17 | """ 18 | 19 | database: Database 20 | """ 21 | Database endpoints. 22 | """ 23 | 24 | repository: Repository 25 | """ 26 | Repository endpoints. 27 | """ 28 | 29 | dataset: Dataset 30 | """ 31 | Dataset endpoints. 32 | """ 33 | 34 | _request: Request 35 | 36 | def __init__(self, api_key: Optional[str] = None, uri: Optional[str] = None): 37 | self._request = Request(api_key, uri) 38 | 39 | self.user = User(self._request) 40 | self.database = Database(self._request) 41 | self.repository = Repository(self._request) 42 | self.dataset = Dataset(self._request) -------------------------------------------------------------------------------- /datatap/api/endpoints/repository_endpoints.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import List 4 | from .request import ApiNamespace 5 | from ..types import JsonRepository 6 | 7 | class Repository(ApiNamespace): 8 | """ 9 | Raw API for interacting with repository endpoints. 10 | """ 11 | def list(self, database_uid: str) -> List[JsonRepository]: 12 | """ 13 | Returns a list of `JsonRepository`s in the database specified by `database_uid`. 14 | """ 15 | return self.get[List[JsonRepository]](f"/database/{database_uid}/repository") 16 | 17 | def query(self, database_uid: str, namespace: str, name: str) -> JsonRepository: 18 | """ 19 | Queries the database for the repository with a given `namespace` and `name`, and 20 | returns the corresponding `JsonRepository` list. 21 | """ 22 | return self.get[JsonRepository](f"/database/{database_uid}/repository/{namespace}/{name}") 23 | -------------------------------------------------------------------------------- /datatap/api/endpoints/request.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from datatap.utils.environment import Environment 3 | 4 | import json 5 | from base64 import b64encode 6 | from urllib.parse import urljoin 7 | from typing import Generator, Optional, Dict, TypeVar, Generic, Type, Any, cast 8 | 9 | import requests 10 | 11 | _T = TypeVar("_T") 12 | _S = TypeVar("_S") 13 | 14 | class GetRequester(Generic[_T]): 15 | """ 16 | A callable-class for performing typed `GET` requests to the API. 
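For illustration, an endpoint implementation selects the response type with the indexing syntax before calling (here `request` is a `Request` instance, as in `Database.list`):

```py
databases = request.get[List[JsonDatabase]]("/database")
```

Indexing only narrows the static return type (it is a `cast`); the response itself is not validated at runtime.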
17 | """ 18 | api_key: str 19 | uri: str 20 | 21 | def __init__(self, api_key: str, base_uri: str): 22 | self.api_key = api_key 23 | self.uri = base_uri 24 | 25 | def __getitem__(self, s: Type[_S]) -> GetRequester[_S]: 26 | return cast(GetRequester[_S], self) 27 | 28 | def __call__(self, endpoint: str, query_params: Optional[Dict[str, str]] = None) -> _T: 29 | qualified_uri = urljoin(self.uri, "/api/" + endpoint) 30 | encoded_api_key = b64encode(bytes(self.api_key, "ascii")).decode("ascii") 31 | 32 | response = requests.get( 33 | qualified_uri, 34 | params=query_params, 35 | headers={ 36 | "Authorization": f"Bearer {encoded_api_key}" 37 | }, 38 | ) 39 | 40 | if not response.ok: 41 | error: str 42 | try: 43 | error = response.json()["error"] 44 | except: 45 | error = response.content.decode("ascii") 46 | raise Exception(error) 47 | 48 | return response.json() 49 | 50 | class PostRequester(Generic[_T]): 51 | """ 52 | A callable-class for performing typed `Post` requests to the API. 53 | """ 54 | api_key: str 55 | uri: str 56 | 57 | def __init__(self, api_key: str, base_uri: str): 58 | self.api_key = api_key 59 | self.uri = base_uri 60 | 61 | def __getitem__(self, s: Type[_S]) -> PostRequester[_S]: 62 | return cast(PostRequester[_S], self) 63 | 64 | def __call__(self, endpoint: str, body: Dict[str, Any], query_params: Optional[Dict[str, str]] = None) -> _T: 65 | qualified_uri = urljoin(self.uri, "/api/" + endpoint) 66 | encoded_api_key = b64encode(bytes(self.api_key, "ascii")).decode("ascii") 67 | 68 | response = requests.post( 69 | qualified_uri, 70 | params=query_params, 71 | headers={ 72 | "Authorization": f"Bearer {encoded_api_key}" 73 | }, 74 | json=body, 75 | ) 76 | 77 | if not response.ok: 78 | error: str 79 | try: 80 | error = response.json()["error"] 81 | except: 82 | error = response.content.decode("ascii") 83 | raise Exception(error) 84 | 85 | return response.json() 86 | 87 | class StreamRequester(Generic[_T]): 88 | """ 89 | A callable-class for performing typed stream requests to the API. 90 | """ 91 | api_key: str 92 | uri: str 93 | 94 | def __init__(self, api_key: str, uri: str): 95 | self.api_key = api_key 96 | self.uri = uri 97 | 98 | def __getitem__(self, s: Type[_S]) -> StreamRequester[_S]: 99 | return cast(StreamRequester[_S], self) 100 | 101 | def __call__(self, endpoint: str, query_params: Optional[Dict[str, str]] = None) -> Generator[_T, None, None]: 102 | qualified_uri = urljoin(self.uri, "/api/" + endpoint) 103 | encoded_api_key = b64encode(bytes(self.api_key, "ascii")).decode("ascii") 104 | 105 | response = requests.get( 106 | qualified_uri, 107 | params=query_params, 108 | headers={ 109 | "Authorization": f"Bearer {encoded_api_key}" 110 | }, 111 | stream=True 112 | ) 113 | 114 | if not response.ok: 115 | error: str 116 | try: 117 | error = response.json()["error"] 118 | except: 119 | error = response.content.decode("ascii") 120 | raise Exception(error) 121 | 122 | for line in response.iter_lines(decode_unicode=True): 123 | yield json.loads(line) 124 | 125 | 126 | class Request: 127 | """ 128 | A helper class that encapsulates the logic for making requests to the 129 | dataTap server. It is passed an optional `api_key`, which defaults to 130 | the `DATATAP_API_KEY` environment variable. It can also be passed a base 131 | `uri` for connecting to a different dataTap server (such as through a 132 | proxy). 133 | """ 134 | 135 | get: GetRequester[Any] 136 | """ 137 | Function for typesafe `GET` requests. 
138 | """ 139 | 140 | post: PostRequester[Any] 141 | """ 142 | Function for typesafe `POST` requests. 143 | """ 144 | 145 | stream: StreamRequester[Any] 146 | """ 147 | Function for typesafe streaming requests. 148 | """ 149 | 150 | def __init__(self, api_key: Optional[str] = None, base_uri: Optional[str] = None): 151 | api_key = api_key or Environment.API_KEY 152 | base_uri = base_uri or Environment.BASE_URI 153 | if api_key is None: 154 | raise Exception("No API key available. Either provide it or use the [DATATAP_API_KEY] environment variable") 155 | 156 | self.get = GetRequester[Any](api_key, base_uri) 157 | self.post = PostRequester[Any](api_key, base_uri) 158 | self.stream = StreamRequester[Any](api_key, base_uri) 159 | 160 | class ApiNamespace: 161 | """ 162 | Base class for API endpoints. 163 | """ 164 | def __init__(self, request: Request): 165 | self.request = request 166 | self.get = request.get 167 | self.post = request.post 168 | self.stream = request.stream 169 | -------------------------------------------------------------------------------- /datatap/api/endpoints/user_endpoints.py: -------------------------------------------------------------------------------- 1 | from .request import ApiNamespace 2 | from ..types import JsonUser 3 | 4 | class User(ApiNamespace): 5 | """ 6 | Raw API for interacting with user endpoints. 7 | """ 8 | def current(self) -> JsonUser: 9 | """ 10 | Returns a `JsonUser` representing the logged in user. 11 | """ 12 | return self.get[JsonUser]("/user") 13 | -------------------------------------------------------------------------------- /datatap/api/entities/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The `datatap.api.entities` submodule contains several enttiies 3 | that provide a user-friendly abstraction for the dataTap API. 4 | """ 5 | 6 | from .api import Api 7 | 8 | from .user import User 9 | from .database import Database 10 | from .dataset import AnyDataset, Dataset 11 | from .repository import Repository, Tag, Split 12 | 13 | __all__ = [ 14 | "Api", 15 | "User", 16 | "Database", 17 | "AnyDataset", 18 | "Dataset", 19 | "Repository", 20 | "Tag", 21 | "Split", 22 | ] -------------------------------------------------------------------------------- /datatap/api/entities/api.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union, overload 2 | 3 | from typing_extensions import Literal 4 | 5 | from datatap.utils.helpers import assert_one 6 | 7 | from .user import User 8 | from .database import Database 9 | from ..endpoints import ApiEndpoints 10 | 11 | class Api: 12 | """ 13 | The `Api` object is the primary method of interacting with the dataTap API. 14 | 15 | The `Api` constructor takes two optional arguments. 16 | 17 | The first, `api_key`, should be the current user's personal API key. In 18 | order to encourage good secret practices, this class will use the value 19 | found in the `DATATAP_API_KEY` if no key is passed in. Consider using 20 | environment variables or another secret manager for your API keys. 21 | 22 | The second argument is `uri`. This should only be used if you would like 23 | to target a different API server than the default. For instance, if you 24 | are using a proxy to reach the API, you can use the `uri` argument to 25 | point toward your proxy. 26 | 27 | This object encapsulates most of the logic for interacting with API. 
28 | For instance, to get a list of all datasets that a user has access to, 29 | you can run 30 | 31 | ```py 32 | from datatap import Api 33 | 34 | api = Api() 35 | print([ 36 | dataset 37 | for database in api.get_database_list() 38 | for dataset in database.get_dataset_list() 39 | ]) 40 | ``` 41 | 42 | For more details on the functionality provided by the Api object, take 43 | a look at its documentation. 44 | """ 45 | def __init__(self, api_key: Optional[str] = None, uri: Optional[str] = None): 46 | self.endpoints = ApiEndpoints(api_key, uri) 47 | 48 | def get_current_user(self) -> User: 49 | """ 50 | Returns the current logged-in user. 51 | """ 52 | return User.from_json(self.endpoints, self.endpoints.user.current()) 53 | 54 | def get_database_list(self) -> List[Database]: 55 | """ 56 | Returns a list of all databases that the current user has access to. 57 | """ 58 | return [ 59 | Database.from_json(self.endpoints, json_db) 60 | for json_db in self.endpoints.database.list() 61 | ] 62 | 63 | def get_default_database(self) -> Database: 64 | """ 65 | Returns the default database for the user (this defaults to the public 66 | database). 67 | """ 68 | 69 | # TODO(zwade): Have a way of specifying a per-user default 70 | current_user = self.get_current_user() 71 | if current_user.default_database is None: 72 | raise Exception("Trying to find the default database, but none is specified") 73 | 74 | return self.get_database_by_uid(current_user.default_database) 75 | 76 | def get_database_by_uid(self, uid: str) -> Database: 77 | """ 78 | Queries a database by its UID and returns it. 79 | """ 80 | return Database.from_json(self.endpoints, self.endpoints.database.query_by_uid(uid)) 81 | 82 | 83 | @overload 84 | def get_database_by_name(self, name: str, allow_multiple: Literal[True]) -> List[Database]: ... 85 | @overload 86 | def get_database_by_name(self, name: str, allow_multiple: Literal[False] = False) -> Database: ... 87 | def get_database_by_name(self, name: str, allow_multiple: bool = False) -> Union[Database, List[Database]]: 88 | """ 89 | Queries a database by its name and returns it. If `allow_multiple` is true, it will return 90 | a list of databases. 91 | """ 92 | database_list = [ 93 | Database.from_json(self.endpoints, database) 94 | for database in self.endpoints.database.query_by_name(name) 95 | ] 96 | 97 | if allow_multiple: 98 | return database_list 99 | else: 100 | return assert_one(database_list) 101 | -------------------------------------------------------------------------------- /datatap/api/entities/database.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from datatap.api.entities.dataset import AnyDataset 3 | from typing import Any, List, overload 4 | 5 | from datatap.utils import basic_repr 6 | 7 | from .repository import Repository 8 | from ..endpoints import ApiEndpoints 9 | from ..types import JsonDatabase, JsonDatabaseOptions 10 | 11 | class Database: 12 | """ 13 | Represents a database. This database could either be the public database, 14 | or a user's private database that they have connected to the dataTap 15 | platform. 16 | 17 | This class provides utilites for viewing and updating the database's 18 | configuration, as well as inspecting its contents. 19 | """ 20 | _endpoints: ApiEndpoints 21 | 22 | uid: str 23 | """ 24 | The UID of this database. 25 | """ 26 | 27 | name: str 28 | """ 29 | The name of this database. 
30 | """ 31 | 32 | connection_options: JsonDatabaseOptions 33 | """ 34 | How this database is configured. Sensitive details, such as database 35 | credentials, are omitted. 36 | """ 37 | 38 | @staticmethod 39 | def from_json(endpoints: ApiEndpoints, json: JsonDatabase) -> Database: 40 | """ 41 | Creates a `Database` from a `JsonDatabase`. 42 | """ 43 | return Database( 44 | endpoints, 45 | uid = json["uid"], 46 | name = json["name"], 47 | connection_options = json["connectionOptions"] 48 | ) 49 | 50 | def __init__(self, endpoints: ApiEndpoints, uid: str, *, name: str, connection_options: JsonDatabaseOptions): 51 | self._endpoints = endpoints 52 | self.uid = uid 53 | self.name = name 54 | self.connection_options = connection_options 55 | 56 | def get_repository_list(self) -> List[Repository]: 57 | """ 58 | Returns a list of all `Repository`s that are stored in this database. 59 | """ 60 | return [ 61 | Repository.from_json(self._endpoints, self.uid, repository_json) 62 | for repository_json in self._endpoints.repository.list(self.uid) 63 | ] 64 | 65 | 66 | @overload 67 | def get_repository(self, slug: str) -> Repository: ... 68 | @overload 69 | def get_repository(self, namespace: str, name: str) -> Repository: ... 70 | def get_repository(self, *args: str, **kwargs: Any) -> Repository: 71 | """ 72 | Queries a `Repository` by its namespace and name, or via its slug (namespace/name). 73 | """ 74 | if len(kwargs) > 0: 75 | raise ValueError("get_repository is positional-only") 76 | elif len(args) == 1: 77 | namespace, name = args[0].split("/") 78 | else: 79 | namespace, name = args 80 | 81 | return Repository.from_json(self._endpoints, self.uid, self._endpoints.repository.query(self.uid, namespace, name)) 82 | 83 | @overload 84 | def get_dataset(self, slug: str) -> AnyDataset: ... 85 | @overload 86 | def get_dataset(self, namespace: str, name: str, tag: str) -> AnyDataset: ... 87 | def get_dataset(self, *args: str, **kwargs: Any) -> AnyDataset: 88 | """ 89 | Queries a `Dataset` by its namespace, name, and tag, or via its slug (namespace/name:tag). 90 | """ 91 | if len(kwargs) > 0: 92 | raise ValueError("get_repository is positional-only") 93 | elif len(args) == 1: 94 | repo_slug, tag = args[0].split(":") 95 | repo = self.get_repository(repo_slug) 96 | else: 97 | namespace, name, tag = args 98 | repo = self.get_repository(namespace, name) 99 | 100 | return repo.get_dataset(tag) 101 | 102 | def __repr__(self): 103 | return basic_repr("Database", self.uid, name = self.name) 104 | -------------------------------------------------------------------------------- /datatap/api/entities/dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from datatap.api.types.dataset import JsonDatasetRepository 3 | 4 | from typing import Generator, Generic, List, TypeVar, Union, overload 5 | 6 | from datatap.droplet import ImageAnnotation, VideoAnnotation 7 | from datatap.template import ImageAnnotationTemplate, VideoAnnotationTemplate 8 | from datatap.utils import basic_repr 9 | 10 | from ..endpoints import ApiEndpoints 11 | from ..types import JsonDataset 12 | 13 | T = TypeVar("T", ImageAnnotationTemplate, VideoAnnotationTemplate) 14 | 15 | class DatasetRepository: 16 | """ 17 | An object representing the repository a dataset came from. 18 | """ 19 | 20 | name: str 21 | """ 22 | The name of the repository. 23 | """ 24 | 25 | namespace: str 26 | """ 27 | The namespace of the repository. 
28 | """ 29 | 30 | @staticmethod 31 | def from_json(json: JsonDatasetRepository) -> DatasetRepository: 32 | """ 33 | Creates a new `DatasetRepository` from a `JsonDatasetRepository`. 34 | """ 35 | return DatasetRepository(name = json["name"], namespace = json["namespace"]) 36 | 37 | def __init__(self, *, name: str, namespace: str): 38 | self.name = name 39 | self.namespace = namespace 40 | 41 | class Dataset(Generic[T]): 42 | """ 43 | Represents a concrete version of a dataset. Critically, `Dataset`s cannot be changed 44 | once they're created. 45 | 46 | For reproducable training, ensure that you store the specific `Dataset` used 47 | during training. 48 | """ 49 | _endpoints: ApiEndpoints 50 | 51 | uid: str 52 | """ 53 | The UID of this `Dataset`. 54 | """ 55 | 56 | database: str 57 | """ 58 | The UID of the database in which this dataset lives. 59 | """ 60 | 61 | repository: DatasetRepository 62 | """ 63 | The repository this dataset belongs to. 64 | """ 65 | 66 | splits: List[str] 67 | """ 68 | A list of all the splits that this dataset has. By default, this will be 69 | `["training", "validation"]`. 70 | """ 71 | 72 | template: T 73 | """ 74 | The template that all annotations in this dataset version adhere to. 75 | """ 76 | 77 | @staticmethod 78 | def from_json(endpoints: ApiEndpoints, json: JsonDataset) -> AnyDataset: 79 | """ 80 | Creates a new `Dataset` from a `JsonDataset`. 81 | """ 82 | template_json = json["template"] 83 | template: Union[ImageAnnotationTemplate, VideoAnnotationTemplate] 84 | 85 | if template_json["kind"] == "ImageAnnotationTemplate": 86 | template = ImageAnnotationTemplate.from_json(template_json) 87 | elif template_json["kind"] == "VideoAnnotationTemplate": 88 | template = VideoAnnotationTemplate.from_json(template_json) 89 | else: 90 | raise ValueError(f"Unknown template kind: {template_json['kind']}") 91 | 92 | return Dataset( 93 | endpoints, 94 | uid = json["uid"], 95 | database = json["database"], 96 | repository = DatasetRepository.from_json(json["repository"]), 97 | splits = json["splits"], 98 | template = template 99 | ) 100 | 101 | def __init__( 102 | self, 103 | endpoints: ApiEndpoints, 104 | uid: str, 105 | *, 106 | database: str, 107 | repository: DatasetRepository, 108 | splits: List[str], 109 | template: Union[ImageAnnotationTemplate, VideoAnnotationTemplate] 110 | ): 111 | self._endpoints = endpoints 112 | self.uid = uid 113 | self.database = database 114 | self.repository = repository 115 | self.splits = splits 116 | self.template = template 117 | 118 | @overload 119 | def stream_split( 120 | self: Dataset[ImageAnnotationTemplate], 121 | split: str 122 | ) -> Generator[ImageAnnotation, None, None]: ... 123 | @overload 124 | def stream_split( 125 | self: Dataset[ImageAnnotationTemplate], 126 | split: str, 127 | chunk: int, 128 | nchunks: int 129 | ) -> Generator[ImageAnnotation, None, None]: ... 130 | @overload 131 | def stream_split( 132 | self: Dataset[VideoAnnotationTemplate], 133 | split: str 134 | ) -> Generator[VideoAnnotation, None, None]: ... 135 | @overload 136 | def stream_split( 137 | self: Dataset[VideoAnnotationTemplate], 138 | split: str, 139 | chunk: int, 140 | nchunks: int 141 | ) -> Generator[VideoAnnotation, None, None]: ... 142 | def stream_split( 143 | self, 144 | split: str, 145 | chunk: int = 0, 146 | nchunks: int = 1 147 | ) -> Generator[Union[ImageAnnotation, VideoAnnotation], None, None]: 148 | """ 149 | Streams a specific split of this dataset from the database. 
All yielded annotations will adhere to this 150 | dataset's annotation template. 151 | 152 | If `chunk` and `nchunks` are omitted, then the full split will be streamed. Otherwise, the split will be 153 | broken into `nchunks` pieces, and only the chunk identified by `chunk` will be streamed. 154 | """ 155 | for droplet in self._endpoints.dataset.stream_split( 156 | database_uid = self.database, 157 | namespace = self.repository.namespace, 158 | name = self.repository.name, 159 | uid = self.uid, 160 | split = split, 161 | chunk = chunk, 162 | nchunks = nchunks, 163 | ): 164 | if isinstance(self.template, ImageAnnotationTemplate): 165 | yield ImageAnnotation.from_json(droplet) 166 | elif isinstance(self.template, VideoAnnotationTemplate): # type: ignore - isinstance is excessive 167 | yield VideoAnnotation.from_json(droplet) 168 | else: 169 | raise ValueError(f"Unknown template kind: {type(self.template)}") 170 | 171 | def get_stable_identifier(self) -> str: 172 | return f"{self.repository.namespace}/{self.repository.name}:{self.uid}" 173 | 174 | def __repr__(self) -> str: 175 | return basic_repr( 176 | "Dataset", 177 | self.get_stable_identifier(), 178 | database = self.database, 179 | splits = self.splits 180 | ) 181 | 182 | AnyDataset = Union[Dataset[ImageAnnotationTemplate], Dataset[VideoAnnotationTemplate]] 183 | -------------------------------------------------------------------------------- /datatap/api/entities/repository.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from datetime import datetime 4 | from typing import Sequence 5 | 6 | from datatap.utils import basic_repr 7 | 8 | from .dataset import AnyDataset, Dataset 9 | from ..types import JsonRepository, JsonSplit, JsonTag 10 | from ..endpoints import ApiEndpoints 11 | 12 | class Split: 13 | """ 14 | Represents the splits available for a given dataset. 15 | """ 16 | 17 | split: str 18 | """ 19 | The kind of the split (e.g, "training" or "validation"). 20 | """ 21 | 22 | annotation_count: int 23 | """ 24 | The number of annotations available in this split. 25 | """ 26 | 27 | @staticmethod 28 | def from_json(json: JsonSplit) -> Split: 29 | """ 30 | Creates a `Split` from a `JsonSplit` 31 | """ 32 | return Split(json["split"], json["annotationCount"]) 33 | 34 | def __init__(self, split: str, annotation_count: int): 35 | self.split = split 36 | self.annotation_count = annotation_count 37 | 38 | def __repr__(self) -> str: 39 | return basic_repr("Split", self.split, annotation_count = self.annotation_count) 40 | 41 | class Tag: 42 | """ 43 | Represents a single tag that may be accessed in this repository. 44 | """ 45 | 46 | tag: str 47 | """ 48 | A slug representing this tag (such as "latest"). 49 | """ 50 | 51 | dataset: str 52 | """ 53 | The uid of the dataset to which this tag points. 54 | """ 55 | 56 | updated_at: datetime 57 | """ 58 | When this tag was most recently updated. 59 | """ 60 | 61 | splits: Sequence[Split] 62 | """ 63 | A list of splits available on this tag. 64 | """ 65 | 66 | @staticmethod 67 | def from_json(json: JsonTag) -> Tag: 68 | """ 69 | Creates a `Tag` from a `JsonTag`. 
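For illustration, a minimal `JsonTag` looks like the following (all values here are made up; note that `updatedAt` is a millisecond timestamp):

```py
tag = Tag.from_json({
	"tag": "latest",
	"dataset": "dataset-uid",
	"updatedAt": 1609459200000,
	"splits": [{ "split": "training", "annotationCount": 1000 }],
})
```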
70 | """ 71 | return Tag( 72 | json["tag"], 73 | json["dataset"], 74 | datetime.fromtimestamp(json["updatedAt"] / 1000), 75 | [Split.from_json(split) for split in json["splits"]] 76 | ) 77 | 78 | def __init__(self, tag: str, dataset: str, updated_at: datetime, splits: Sequence[Split]): 79 | self.tag = tag 80 | self.dataset = dataset 81 | self.updated_at = updated_at 82 | self.splits = splits 83 | 84 | def __repr__(self) -> str: 85 | return basic_repr("Tag", self.tag, dataset = self.dataset, splits = self.splits) 86 | 87 | class Repository: 88 | """ 89 | Represents a repository that contains one or more datasets. 90 | """ 91 | _endpoints: ApiEndpoints 92 | _database: str 93 | 94 | name: str 95 | """ 96 | The name of this repository. 97 | """ 98 | 99 | namespace: str 100 | """ 101 | The namespace of this repository. 102 | """ 103 | 104 | tags: Sequence[Tag] 105 | """ 106 | The tags available for this repository. 107 | """ 108 | 109 | @staticmethod 110 | def from_json(endpoints: ApiEndpoints, database: str, json: JsonRepository) -> Repository: 111 | """ 112 | Creates a `Dataset` from a `JsonDataset`. 113 | """ 114 | return Repository( 115 | endpoints, 116 | database, 117 | name = json["name"], 118 | namespace = json["namespace"], 119 | tags = [Tag.from_json(tag) for tag in json["tags"]], 120 | ) 121 | 122 | def __init__(self, endpoints: ApiEndpoints, database: str, *, name: str, namespace: str, tags: Sequence[Tag]): 123 | self._endpoints = endpoints 124 | self._database = database 125 | self.name = name 126 | self.namespace = namespace 127 | self.tags = tags 128 | 129 | def get_dataset(self, tag: str) -> AnyDataset: 130 | """ 131 | Fetches dataset by its tag (or UID). 132 | """ 133 | return Dataset.from_json( 134 | self._endpoints, 135 | self._endpoints.dataset.query(self._database, self.namespace, self.name, tag) 136 | ) 137 | 138 | def __repr__(self) -> str: 139 | return basic_repr("Repository", name = self.name, namespace = self.namespace, tags = [tag.tag for tag in self.tags]) 140 | -------------------------------------------------------------------------------- /datatap/api/entities/user.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional 3 | 4 | from datatap.utils import basic_repr 5 | 6 | from ..endpoints import ApiEndpoints 7 | from ..types import JsonUser 8 | 9 | class User: 10 | """ 11 | Represents a user account in the dataTap platform. 12 | """ 13 | 14 | _endpoints: ApiEndpoints 15 | 16 | uid: str 17 | """ 18 | The user's UID. 19 | """ 20 | 21 | username: str 22 | """ 23 | The user's username. 24 | """ 25 | 26 | email: str 27 | """ 28 | The user's email address. 29 | """ 30 | 31 | default_database: Optional[str] 32 | """ 33 | The user's default database 34 | """ 35 | 36 | @staticmethod 37 | def from_json(endpoints: ApiEndpoints, json: JsonUser) -> User: 38 | """ 39 | Creates a `User` from a `JsonUser`. 
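For illustration, a `JsonUser` has the following shape (all values are placeholders, and `endpoints` is an `ApiEndpoints` instance):

```py
user = User.from_json(endpoints, {
	"uid": "user-uid",
	"username": "jane",
	"email": "jane@example.com",
	"defaultDatabase": None,
})
```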
40 | """ 41 | return User( 42 | endpoints, 43 | json["uid"], 44 | username = json["username"], 45 | email = json["email"], 46 | default_database = json["defaultDatabase"] 47 | ) 48 | 49 | def __init__(self, endpoints: ApiEndpoints, uid: str, *, username: str, email: str, default_database: Optional[str]): 50 | self._endpoints = endpoints 51 | self.uid = uid 52 | self.username = username 53 | self.email = email 54 | self.default_database = default_database 55 | 56 | def __repr__(self) -> str: 57 | return basic_repr("User", self.uid, username = self.username, email = self.email) -------------------------------------------------------------------------------- /datatap/api/types/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The `datatap.api.types` library contains all of the types returned by the API. 3 | """ 4 | 5 | from .database import JsonDatabaseOptions, JsonDatabase 6 | from .dataset import JsonDataset 7 | from .repository import JsonRepository, JsonTag, JsonSplit 8 | from .user import JsonUser 9 | 10 | __all__ = [ 11 | "JsonDatabaseOptions", 12 | "JsonDatabase", 13 | "JsonDataset", 14 | "JsonRepository", 15 | "JsonTag", 16 | "JsonSplit", 17 | "JsonUser", 18 | ] -------------------------------------------------------------------------------- /datatap/api/types/database.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from typing_extensions import Literal, TypedDict 3 | 4 | class JsonDatabaseOptionsDirect(TypedDict): 5 | """ 6 | Configuration options for a database that the server connects to directly. 7 | """ 8 | kind: Literal["direct"] 9 | protocol: Union[Literal["neo4j"], Literal["neo4j+s"]] 10 | host: str 11 | port: int 12 | 13 | JsonDatabaseOptions = JsonDatabaseOptionsDirect 14 | 15 | class JsonDatabase(TypedDict): 16 | """ 17 | The API type of a database. 18 | """ 19 | uid: str 20 | name: str 21 | connectionOptions: JsonDatabaseOptions 22 | 23 | -------------------------------------------------------------------------------- /datatap/api/types/dataset.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | from datatap.template.image_annotation_template import \ 4 | ImageAnnotationTemplateJson 5 | from datatap.template.video_annotation_template import \ 6 | VideoAnnotationTemplateJson 7 | from typing_extensions import TypedDict 8 | 9 | 10 | class JsonDatasetRepository(TypedDict): 11 | namespace: str 12 | name: str 13 | 14 | class JsonDataset(TypedDict): 15 | """ 16 | The API type of a dataset. 17 | """ 18 | uid: str 19 | database: str 20 | repository: JsonDatasetRepository 21 | template: Union[ImageAnnotationTemplateJson, VideoAnnotationTemplateJson] 22 | splits: List[str] 23 | 24 | -------------------------------------------------------------------------------- /datatap/api/types/repository.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from typing_extensions import TypedDict 3 | 4 | class JsonSplit(TypedDict): 5 | split: str 6 | annotationCount: int 7 | 8 | class JsonTag(TypedDict): 9 | tag: str 10 | dataset: str 11 | updatedAt: int 12 | splits: List[JsonSplit] 13 | 14 | class JsonRepository(TypedDict): 15 | """ 16 | The API type of a repository. 
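A sample value (identifiers made up; each entry of `tags` follows `JsonTag`):

```py
repository_json: JsonRepository = {
	"namespace": "_",
	"name": "coco",
	"tags": [],
}
```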
17 | """ 18 | namespace: str 19 | name: str 20 | tags: List[JsonTag] 21 | 22 | -------------------------------------------------------------------------------- /datatap/api/types/user.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from typing_extensions import TypedDict 3 | 4 | class JsonUser(TypedDict): 5 | """ 6 | The API type of an individual user. 7 | """ 8 | uid: str 9 | username: str 10 | email: str 11 | defaultDatabase: Optional[str] 12 | 13 | -------------------------------------------------------------------------------- /datatap/comet/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional, Sequence 4 | 5 | try: 6 | from comet_ml import APIExperiment, Experiment 7 | from comet_ml.exceptions import NotFound 8 | except ImportError: 9 | from datatap.utils import pprint 10 | pprint("{yellow}Unable to import comet_ml.") 11 | 12 | from datatap.api.entities import AnyDataset 13 | from datatap.droplet.image_annotation import ImageAnnotation 14 | 15 | 16 | def init_experiment(experiment: Experiment, dataset: AnyDataset): 17 | """ 18 | Initializes an experiment by logging the template and the validation set ground truths if they have not already 19 | been logged. 20 | """ 21 | api_experiment = APIExperiment(previous_experiment = experiment.id) 22 | 23 | if get_dataset(experiment) is None: 24 | log_dataset(experiment, dataset) 25 | 26 | try: 27 | api_experiment.get_asset("datatap/template.json") 28 | except NotFound: 29 | experiment.log_asset_data( 30 | [annotation.to_json() for annotation in dataset.stream_split("validation")], 31 | name = "datatap/validation/ground_truth.json" 32 | ) 33 | 34 | experiment.log_asset_data( 35 | dataset.template.to_json(), 36 | name = "datatap/template.json" 37 | ) 38 | 39 | def log_dataset(experiment: Experiment, dataset: AnyDataset): 40 | experiment.log_other("datatap-dataset", dataset.get_stable_identifier()) 41 | 42 | def get_dataset(experiment: Experiment) -> Optional[str]: 43 | api_experiment = APIExperiment(previous_experiment = experiment.id) 44 | others = api_experiment.get_others_summary() 45 | dataset_metrics = [other for other in others if other["name"] == "datatap-dataset"] 46 | 47 | if len(dataset_metrics) == 0: 48 | return None 49 | 50 | return dataset_metrics[0].get("valueCurrent", None) 51 | 52 | def log_validation_proposals(experiment: Experiment, proposals: Sequence[ImageAnnotation]): 53 | experiment.log_asset_data( 54 | [annotation.to_json() for annotation in proposals], 55 | name = "datatap/validation/proposals.json" 56 | ) 57 | -------------------------------------------------------------------------------- /datatap/droplet/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides classes for working with ML data. Specifically, it provides methods for creating new ML data 3 | objects, converting ML data objects to and from the JSON droplet format, and manipulating ML data objects. 
4 | """ 5 | 6 | from .bounding_box import BoundingBox, BoundingBoxJson 7 | from .class_annotation import ClassAnnotation, ClassAnnotationJson 8 | from .frame_annotation import FrameAnnotation, FrameAnnotationJson 9 | from .image import Image, ImageJson 10 | from .image_annotation import ImageAnnotation, ImageAnnotationJson 11 | from .instance import Instance, InstanceJson 12 | from .keypoint import Keypoint, KeypointJson 13 | from .multi_instance import MultiInstance, MultiInstanceJson 14 | from .segmentation import Segmentation, SegmentationJson 15 | from .video import Video, VideoJson 16 | from .video_annotation import VideoAnnotation, VideoAnnotationJson 17 | 18 | __all__ = [ 19 | "BoundingBox", 20 | "BoundingBoxJson", 21 | "ClassAnnotation", 22 | "ClassAnnotationJson", 23 | "FrameAnnotation", 24 | "FrameAnnotationJson", 25 | "Image", 26 | "ImageJson", 27 | "ImageAnnotation", 28 | "ImageAnnotationJson", 29 | "Instance", 30 | "InstanceJson", 31 | "Keypoint", 32 | "KeypointJson", 33 | "MultiInstance", 34 | "MultiInstanceJson", 35 | "Segmentation", 36 | "SegmentationJson", 37 | "Video", 38 | "VideoJson", 39 | "VideoAnnotation", 40 | "VideoAnnotationJson", 41 | ] 42 | -------------------------------------------------------------------------------- /datatap/droplet/_media.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | from io import BytesIO 5 | from typing import Sequence 6 | 7 | try: 8 | import boto3 9 | except ImportError: 10 | boto3 = None 11 | 12 | try: 13 | import requests 14 | except ImportError: 15 | requests = None 16 | 17 | from ..utils import basic_repr 18 | 19 | class Media: 20 | """ 21 | The `Media` class acts as a base class for all loadable media. 22 | """ 23 | 24 | paths: Sequence[str] 25 | """ 26 | A sequence of URIs where the media can be found. The loader 27 | will try them in order until it finds one it can load. 28 | 29 | Supported schemes include `http(s):`, `s3:` 30 | """ 31 | 32 | def __init__(self, *, paths: Sequence[str]): 33 | self.paths = paths 34 | 35 | def __repr__(self) -> str: 36 | return basic_repr("Media", paths = self.paths) 37 | 38 | def __eq__(self, other: object) -> bool: 39 | if not isinstance(other, Media): 40 | return NotImplemented 41 | return self.paths == other.paths 42 | 43 | def load(self, quiet: bool = False, attempts: int = 3, allow_local: bool = False) -> BytesIO: 44 | """ 45 | Attempts to load the Video file specified by this reference. 46 | Resolution happpens in this order: 47 | 48 | 1. Load from an internal cache (either from a previous load, or from `from_pil`) 49 | 2. Try loading every path in order, returning once one loads 50 | 51 | Warning! `load` may attempt to read from the local file system or from private 52 | networks. Please ensure that the annotation you are loading is trusted. 
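A minimal usage sketch (the URL below is hypothetical):

```py
media = Media(paths = ["https://example.com/image.jpg"])
data = media.load(attempts = 1)  # a BytesIO containing the raw bytes
```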
53 | """ 54 | for path in self.paths: 55 | for i in range(attempts): 56 | try: 57 | scheme, file_name, *_ = path.split(":") 58 | if scheme.lower() == "s3" and boto3 is not None: 59 | bucket_name, *path_components = [ 60 | component 61 | for component in file_name.split("/") 62 | if component != "" 63 | ] 64 | path_name = "/".join(path_components) 65 | 66 | s3 = boto3.resource("s3") # type: ignore 67 | file_obj = s3.Object(bucket_name, path_name) # type: ignore 68 | data: bytes = file_obj.get()["Body"].read() # type: ignore 69 | elif scheme.lower() in ["http", "https"] and requests is not None: 70 | response = requests.get(path) 71 | data = response.content 72 | elif scheme.lower() == "file" and allow_local: 73 | with open(file_name, "rb") as file_obj: 74 | data = file_obj.read() 75 | else: 76 | raise NotImplementedError(f"Unsupported scheme: {scheme}") 77 | 78 | return BytesIO(data) 79 | except Exception as e: 80 | if not quiet: 81 | print(f"Cannot load {type(self).__name__} {path}, with error {str(e)}, attempt ({i + 1}/{attempts})", file = sys.stderr) 82 | 83 | raise FileNotFoundError(f"All paths for {type(self).__name__} failed to load", self.paths) 84 | -------------------------------------------------------------------------------- /datatap/droplet/attributes.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional, Sequence, Union 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..utils import basic_repr 8 | 9 | class _AttributeValueOptional(TypedDict, total = False): 10 | confidence: float 11 | 12 | class AttributeValueJson(_AttributeValueOptional, TypedDict): 13 | """ 14 | The serialized JSON representation of an attribute candidate value. 15 | """ 16 | value: str 17 | 18 | AttributeValuesJson = Union[Sequence[AttributeValueJson], str] 19 | 20 | class AttributeValue: 21 | value: str 22 | confidence: Optional[float] 23 | 24 | def __init__(self, value: str, *, confidence: Optional[float] = None) -> None: 25 | self.value = value 26 | self.confidence = confidence 27 | 28 | def to_json(self) -> AttributeValueJson: 29 | json = AttributeValueJson(value=self.value) 30 | if self.confidence is not None: 31 | json["confidence"] = self.confidence 32 | return json 33 | 34 | @staticmethod 35 | def from_json(json: AttributeValueJson) -> AttributeValue: 36 | return AttributeValue(json["value"], confidence=json.get("confidence")) 37 | 38 | class AttributeValues: 39 | content: Sequence[AttributeValue] 40 | 41 | @staticmethod 42 | def from_json(json: AttributeValuesJson) -> AttributeValues: 43 | """ 44 | Constructs a `AttributeValues` from a `AttributeValuesJson`. 
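Both serialized forms are accepted: a bare string, or a list of candidate values (the confidences below are illustrative):

```py
values = AttributeValues.from_json("red")
values = AttributeValues.from_json([
	{ "value": "red", "confidence": 0.9 },
	{ "value": "orange", "confidence": 0.1 },
])
```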
45 | """ 46 | if isinstance(json, str): 47 | return AttributeValues([AttributeValue(json)]) 48 | return AttributeValues([AttributeValue.from_json(c) for c in json]) 49 | 50 | def __init__(self, content: Sequence[AttributeValue]): 51 | self.content = content 52 | 53 | def __repr__(self) -> str: 54 | return basic_repr("AttributeValues", self.content) 55 | 56 | def to_json(self) -> Sequence[AttributeValueJson]: 57 | return [c.to_json() for c in self.content] 58 | 59 | def most_likely(self) -> Optional[AttributeValue]: 60 | """ 61 | Returns the most likely value of this specific attribute 62 | """ 63 | if len(self.content) == 0: 64 | return None 65 | 66 | return max(self.content, key=lambda c: c.confidence or 1.0) 67 | -------------------------------------------------------------------------------- /datatap/droplet/bounding_box.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..geometry import Rectangle, RectangleJson 8 | from ..utils import basic_repr 9 | 10 | class _BoundingBoxJsonOptional(TypedDict, total = False): 11 | confidence: float 12 | 13 | class BoundingBoxJson(_BoundingBoxJsonOptional, TypedDict): 14 | """ 15 | The serialized JSON representation of a bounding box. 16 | """ 17 | rectangle: RectangleJson 18 | 19 | class BoundingBox: 20 | """ 21 | A `BoundingBox` represents the area within an image taken up by a detection, 22 | specified as an axis-aligned rectangle. 23 | """ 24 | 25 | rectangle: Rectangle 26 | """ 27 | The area within the image where the corresponding detection appears. 28 | """ 29 | 30 | confidence: Optional[float] 31 | """ 32 | The confidence associated with this bounding box. 33 | """ 34 | 35 | @staticmethod 36 | def from_json(json: BoundingBoxJson) -> BoundingBox: 37 | """ 38 | Constructs a `BoundingBox` from a `BoundingBoxJson`. 39 | """ 40 | return BoundingBox( 41 | Rectangle.from_json(json["rectangle"]), 42 | confidence = json.get("confidence") 43 | ) 44 | 45 | def __init__(self, rectangle: Rectangle, *, confidence: Optional[float] = None): 46 | self.rectangle = rectangle 47 | self.confidence = confidence 48 | 49 | self.rectangle.assert_valid() 50 | 51 | def __repr__(self) -> str: 52 | return basic_repr("BoundingBox", self.rectangle, confidence = self.confidence) 53 | 54 | def __eq__(self, other: object) -> bool: 55 | if not isinstance(other, BoundingBox): 56 | return NotImplemented 57 | return self.rectangle == other.rectangle and self.confidence == other.confidence 58 | 59 | def to_json(self) -> BoundingBoxJson: 60 | """ 61 | Serializes this `BoundingBox` to a `BoundingBoxJson`. 62 | """ 63 | json: BoundingBoxJson = { 64 | "rectangle": self.rectangle.to_json() 65 | } 66 | 67 | if self.confidence is not None: 68 | json["confidence"] = self.confidence 69 | 70 | return json 71 | 72 | def meets_confidence_threshold(self, threshold: float) -> bool: 73 | """ 74 | Returns `True` if and only if the confidence of this bounding box is 75 | either unset or it is at least the given `threshold`. 
76 | """ 77 | return self.confidence is None or self.confidence >= threshold 78 | -------------------------------------------------------------------------------- /datatap/droplet/class_annotation.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Callable, Sequence 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..utils import basic_repr 8 | from .instance import Instance, InstanceJson 9 | from .multi_instance import MultiInstance, MultiInstanceJson 10 | 11 | __pdoc__ = { "ClassAnnotation.__add__": True } 12 | 13 | class ClassAnnotationJson(TypedDict, total = False): 14 | """ 15 | The serialized JSON representation of a class annotation. 16 | """ 17 | instances: Sequence[InstanceJson] 18 | multiInstances: Sequence[MultiInstanceJson] 19 | 20 | class ClassAnnotation: 21 | """ 22 | A `ClassAnnotation` represents the set of detections for a given 23 | class. These may either be individual instances, or "multi instances" 24 | that describe a visual clustering of the class. 25 | """ 26 | 27 | instances: Sequence[Instance] 28 | """ 29 | A sequence of individual instances of this class. 30 | """ 31 | 32 | multi_instances: Sequence[MultiInstance] 33 | """ 34 | A sequence of multi-instances of this class. An example of a 35 | multi instance would be a crowd of people (labeled as such). 36 | """ 37 | 38 | @staticmethod 39 | def from_json(json: ClassAnnotationJson) -> ClassAnnotation: 40 | """ 41 | Constructs a `ClassAnnotation` from a `ClassAnnotationJson`. 42 | """ 43 | return ClassAnnotation( 44 | instances = [Instance.from_json(instance) for instance in json["instances"]] if "instances" in json else [], 45 | multi_instances = [MultiInstance.from_json(multi_instance) for multi_instance in json["multiInstances"]] if "multiInstances" in json else [] 46 | ) 47 | 48 | def __init__(self, *, instances: Sequence[Instance], multi_instances: Sequence[MultiInstance] = []): 49 | self.instances = instances 50 | self.multi_instances = multi_instances 51 | 52 | def filter_detections( 53 | self, 54 | *, 55 | instance_filter: Callable[[Instance], bool], 56 | multi_instance_filter: Callable[[MultiInstance], bool] 57 | ) -> ClassAnnotation: 58 | """ 59 | Returns a new class annotation consisting only of the instances and 60 | multi-instances that meet the given constraints. 
61 | """ 62 | return ClassAnnotation( 63 | instances = [ 64 | instance 65 | for instance in self.instances 66 | if instance_filter(instance) 67 | ], 68 | multi_instances = [ 69 | multi_instance 70 | for multi_instance in self.multi_instances 71 | if multi_instance_filter(multi_instance) 72 | ] 73 | ) 74 | 75 | def __repr__(self) -> str: 76 | return basic_repr("ClassAnnotation", instances = self.instances, multi_instances = self.multi_instances) 77 | 78 | def __eq__(self, other: object) -> bool: 79 | if not isinstance(other, ClassAnnotation): 80 | return NotImplemented 81 | return self.instances == other.instances and self.multi_instances == other.multi_instances 82 | 83 | def __add__(self, other: ClassAnnotation) -> ClassAnnotation: 84 | if not isinstance(other, ClassAnnotation): # type: ignore - pyright complains about the isinstance check being redundant 85 | return NotImplemented 86 | 87 | instances = list(self.instances) + list(other.instances) 88 | multi_instances = list(self.multi_instances) + list(other.multi_instances) 89 | 90 | return ClassAnnotation( 91 | instances = instances, 92 | multi_instances = multi_instances, 93 | ) 94 | 95 | def to_json(self) -> ClassAnnotationJson: 96 | """ 97 | Serializes this `ClassAnnotation` into a `ClassAnnotationJson`. 98 | """ 99 | 100 | return { 101 | "instances": [instance.to_json() for instance in self.instances], 102 | "multiInstances": [multi_instance.to_json() for multi_instance in self.multi_instances] 103 | } 104 | -------------------------------------------------------------------------------- /datatap/droplet/frame_annotation.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Callable, Dict, Mapping 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..utils import basic_repr 8 | from .class_annotation import ClassAnnotation, ClassAnnotationJson 9 | from .instance import Instance 10 | from .multi_instance import MultiInstance 11 | 12 | class FrameAnnotationJson(TypedDict): 13 | """ 14 | The serialized JSON representation of an image annotation. 15 | """ 16 | 17 | classes: Mapping[str, ClassAnnotationJson] 18 | 19 | class FrameAnnotation: 20 | """ 21 | A collection of class annotations that annotate a given image. 22 | """ 23 | 24 | classes: Mapping[str, ClassAnnotation] 25 | """ 26 | A mapping from class name to the annotations of that class. 27 | """ 28 | 29 | @staticmethod 30 | def from_json(json: Mapping[str, Any]) -> FrameAnnotation: 31 | """ 32 | Constructs an `FrameAnnotation` from an `FrameAnnotationJson`. 33 | """ 34 | return FrameAnnotation( 35 | classes = { 36 | class_name: ClassAnnotation.from_json(json["classes"][class_name]) 37 | for class_name in json["classes"] 38 | } 39 | ) 40 | 41 | def __init__( 42 | self, 43 | *, 44 | classes: Mapping[str, ClassAnnotation], 45 | ): 46 | self.classes = classes 47 | 48 | def filter_detections( 49 | self, 50 | *, 51 | instance_filter: Callable[[Instance], bool], 52 | multi_instance_filter: Callable[[MultiInstance], bool] 53 | ) -> FrameAnnotation: 54 | """ 55 | Returns a new image annotation consisting only of the instances and 56 | multi-instances that meet the given constraints. 
57 | """ 58 | return FrameAnnotation( 59 | classes = { 60 | class_name: class_annotation.filter_detections( 61 | instance_filter = instance_filter, 62 | multi_instance_filter = multi_instance_filter 63 | ) 64 | for class_name, class_annotation in self.classes.items() 65 | } 66 | ) 67 | 68 | def apply_bounding_box_confidence_threshold(self, threshold: float) -> FrameAnnotation: 69 | """ 70 | Returns a new image annotation consisting only of the instances and 71 | multi-instances that have bounding boxes which either do not have a 72 | confidence specified or which have a confience meeting the given 73 | threshold. 74 | """ 75 | return self.filter_detections( 76 | instance_filter = lambda instance: ( 77 | instance.bounding_box is not None 78 | and instance.bounding_box.meets_confidence_threshold(threshold) 79 | ), 80 | multi_instance_filter = lambda multi_instance: ( 81 | multi_instance.bounding_box is not None 82 | and multi_instance.bounding_box.meets_confidence_threshold(threshold) 83 | ) 84 | ) 85 | 86 | def apply_segmentation_confidence_threshold(self, threshold: float) -> FrameAnnotation: 87 | """ 88 | Returns a new image annotation consisting only of the instances and 89 | multi-instances that have segmentations which either do not have a 90 | confidence specified or which have a confience meeting the given 91 | threshold. 92 | """ 93 | return self.filter_detections( 94 | instance_filter = lambda instance: ( 95 | instance.segmentation is not None 96 | and instance.segmentation.meets_confidence_threshold(threshold) 97 | ), 98 | multi_instance_filter = lambda multi_instance: ( 99 | multi_instance.segmentation is not None 100 | and multi_instance.segmentation.meets_confidence_threshold(threshold) 101 | ) 102 | ) 103 | 104 | def __repr__(self) -> str: 105 | return basic_repr( 106 | "FrameAnnotation", 107 | classes = self.classes 108 | ) 109 | 110 | def __eq__(self, other: object) -> bool: 111 | if not isinstance(other, FrameAnnotation): 112 | return NotImplemented 113 | return self.classes == other.classes 114 | 115 | def __add__(self, other: FrameAnnotation) -> FrameAnnotation: 116 | if not isinstance(other, FrameAnnotation): # type: ignore - pyright complains about the isinstance check being redundant 117 | return NotImplemented 118 | 119 | classes: Dict[str, ClassAnnotation] = {} 120 | 121 | for key, value in self.classes.items(): 122 | classes[key] = value 123 | 124 | for key, value in other.classes.items(): 125 | if key in classes: 126 | classes[key] += value 127 | else: 128 | classes[key] = value 129 | 130 | return FrameAnnotation( 131 | classes = classes 132 | ) 133 | 134 | def to_json(self) -> FrameAnnotationJson: 135 | """ 136 | Serializes this image annotation into an `FrameAnnotationJson`. 
137 | """ 138 | json: FrameAnnotationJson = { 139 | "classes": { 140 | name: class_annotation.to_json() 141 | for name, class_annotation in self.classes.items() 142 | } 143 | } 144 | 145 | return json 146 | -------------------------------------------------------------------------------- /datatap/droplet/image.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional, Sequence 4 | 5 | import PIL.Image 6 | from typing_extensions import TypedDict 7 | 8 | from ..utils import basic_repr 9 | from ._media import Media 10 | 11 | 12 | class _ImageJsonOptional(TypedDict, total = False): 13 | uid: str 14 | 15 | class ImageJson(_ImageJsonOptional, TypedDict): 16 | """ 17 | The serialized JSON representation of an `Image`. 18 | """ 19 | paths: Sequence[str] 20 | 21 | class Image(Media): 22 | """ 23 | The `Image` class contains information about what image was 24 | labeled by a given annotation. It also includes utilities 25 | for loading and manipulating images. 26 | """ 27 | 28 | uid: Optional[str] 29 | """ 30 | A unique ID for this image. 31 | """ 32 | 33 | _pil_image: Optional[PIL.Image.Image] 34 | 35 | @staticmethod 36 | def from_json(json: ImageJson) -> Image: 37 | """ 38 | Creates an `Image` from an `ImageJson`. 39 | """ 40 | return Image(uid = json.get("uid", None), paths = json["paths"]) 41 | 42 | @staticmethod 43 | def from_pil(pil_image: PIL.Image.Image) -> Image: 44 | """ 45 | Creates an `Image` from an existing PIL Image. Note that an 46 | image created this way will not have any `paths` set, but will 47 | still be able to load the image via `get_pil_image`. 48 | """ 49 | image = Image( 50 | paths = [], 51 | ) 52 | image._pil_image = pil_image 53 | return image 54 | 55 | def __init__(self, *, uid: Optional[str] = None, paths: Sequence[str]): 56 | super().__init__(paths = paths) 57 | self.uid = uid 58 | self._pil_image = None 59 | 60 | def __repr__(self) -> str: 61 | return basic_repr("Image", uid = self.uid, paths = self.paths) 62 | 63 | def __eq__(self, other: object) -> bool: 64 | if not isinstance(other, Image): 65 | return NotImplemented 66 | return self.paths == other.paths 67 | 68 | # TODO(mdsavage): consider using functools.cache here if we upgrade to Python >= 3.9 69 | def get_pil_image(self, quiet: bool = False, attempts: int = 3, allow_local: bool = False) -> PIL.Image.Image: 70 | """ 71 | Attempts to load the image specified by this reference. Resolution happpens in this order: 72 | 73 | 1. Load from an internal cache (either from a previous load, or from `from_pil`) 74 | 2. Try loading every path in order, returning once one loads 75 | 76 | Warning! `get_pil_image` may attempt to read from the local file system or from private 77 | networks. Please ensure that the annotation you are loading is trusted. 78 | """ 79 | if self._pil_image is not None: 80 | return self._pil_image 81 | 82 | return PIL.Image.open(self.load(quiet, attempts, allow_local)) 83 | 84 | def to_json(self) -> ImageJson: 85 | """ 86 | Serializes this `Image` into an `ImageJson`. 
87 | """ 88 | json: ImageJson = { 89 | "paths": self.paths 90 | } 91 | 92 | if self.uid is not None: 93 | json["uid"] = self.uid 94 | 95 | return json 96 | -------------------------------------------------------------------------------- /datatap/droplet/image_annotation.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from typing import Any, Callable, Dict, Mapping, Optional 5 | from urllib.parse import quote, urlencode 6 | 7 | from datatap.utils import Environment 8 | from typing_extensions import Literal, TypedDict 9 | 10 | from ..geometry import Mask, MaskJson 11 | from ..utils import basic_repr 12 | from .class_annotation import ClassAnnotation, ClassAnnotationJson 13 | from .image import Image, ImageJson 14 | from .instance import Instance 15 | from .multi_instance import MultiInstance 16 | 17 | 18 | class _ImageAnnotationJsonOptional(TypedDict, total = False): 19 | uid: str 20 | mask: MaskJson 21 | metadata: Mapping[str, Any] 22 | 23 | class ImageAnnotationJson(_ImageAnnotationJsonOptional, TypedDict): 24 | """ 25 | The serialized JSON representation of an image annotation. 26 | """ 27 | 28 | kind: Literal["ImageAnnotation"] 29 | image: ImageJson 30 | classes: Mapping[str, ClassAnnotationJson] 31 | 32 | class ImageAnnotation: 33 | """ 34 | A collection of class annotations that annotate a given image. 35 | """ 36 | 37 | image: Image 38 | """ 39 | The image being annotated. 40 | """ 41 | 42 | classes: Mapping[str, ClassAnnotation] 43 | """ 44 | A mapping from class name to the annotations of that class. 45 | """ 46 | 47 | uid: Optional[str] 48 | """ 49 | A unique identifier for this image annotation. 50 | """ 51 | 52 | mask: Optional[Mask] 53 | """ 54 | An optional region-of-interest mask to indicate that only 55 | features within the mask have been annotated. 56 | """ 57 | 58 | metadata: Optional[Mapping[str, Any]] 59 | """ 60 | An optional field for storing metadata on the annotation. 61 | """ 62 | 63 | @staticmethod 64 | def from_json(json: Mapping[str, Any]) -> ImageAnnotation: 65 | """ 66 | Constructs an `ImageAnnotation` from an `ImageAnnotationJson`. 67 | """ 68 | return ImageAnnotation( 69 | image = Image.from_json(json["image"]), 70 | classes = { 71 | class_name: ClassAnnotation.from_json(json["classes"][class_name]) 72 | for class_name in json["classes"] 73 | }, 74 | mask = Mask.from_json(json["mask"]) if "mask" in json else None, 75 | uid = json.get("uid"), 76 | metadata = json.get("metadata") 77 | ) 78 | 79 | def __init__( 80 | self, 81 | *, 82 | image: Image, 83 | classes: Mapping[str, ClassAnnotation], 84 | mask: Optional[Mask] = None, 85 | uid: Optional[str] = None, 86 | metadata: Optional[Mapping[str, Any]] = None 87 | ): 88 | self.image = image 89 | self.classes = classes 90 | self.mask = mask 91 | self.uid = uid 92 | self.metadata = metadata 93 | 94 | def filter_detections( 95 | self, 96 | *, 97 | instance_filter: Callable[[Instance], bool], 98 | multi_instance_filter: Callable[[MultiInstance], bool] 99 | ) -> ImageAnnotation: 100 | """ 101 | Returns a new image annotation consisting only of the instances and 102 | multi-instances that meet the given constraints. 
103 | """ 104 | return ImageAnnotation( 105 | image = self.image, 106 | mask = self.mask, 107 | classes = { 108 | class_name: class_annotation.filter_detections( 109 | instance_filter = instance_filter, 110 | multi_instance_filter = multi_instance_filter 111 | ) 112 | for class_name, class_annotation in self.classes.items() 113 | }, 114 | uid = self.uid, 115 | metadata = self.metadata 116 | ) 117 | 118 | def apply_bounding_box_confidence_threshold(self, threshold: float) -> ImageAnnotation: 119 | """ 120 | Returns a new image annotation consisting only of the instances and 121 | multi-instances that have bounding boxes which either do not have a 122 | confidence specified or which have a confience meeting the given 123 | threshold. 124 | """ 125 | return self.filter_detections( 126 | instance_filter = lambda instance: ( 127 | instance.bounding_box is not None 128 | and instance.bounding_box.meets_confidence_threshold(threshold) 129 | ), 130 | multi_instance_filter = lambda multi_instance: ( 131 | multi_instance.bounding_box is not None 132 | and multi_instance.bounding_box.meets_confidence_threshold(threshold) 133 | ) 134 | ) 135 | 136 | def apply_segmentation_confidence_threshold(self, threshold: float) -> ImageAnnotation: 137 | """ 138 | Returns a new image annotation consisting only of the instances and 139 | multi-instances that have segmentations which either do not have a 140 | confidence specified or which have a confience meeting the given 141 | threshold. 142 | """ 143 | return self.filter_detections( 144 | instance_filter = lambda instance: ( 145 | instance.segmentation is not None 146 | and instance.segmentation.meets_confidence_threshold(threshold) 147 | ), 148 | multi_instance_filter = lambda multi_instance: ( 149 | multi_instance.segmentation is not None 150 | and multi_instance.segmentation.meets_confidence_threshold(threshold) 151 | ) 152 | ) 153 | 154 | def apply_metadata(self, metadata: Mapping[str, Any]) -> ImageAnnotation: 155 | """ 156 | Returns a new image annotation with the supplied metadata. 157 | """ 158 | return ImageAnnotation( 159 | image = self.image, 160 | mask = self.mask, 161 | classes = self.classes, 162 | uid = self.uid, 163 | metadata = metadata 164 | ) 165 | 166 | def __repr__(self) -> str: 167 | return basic_repr( 168 | "ImageAnnotation", 169 | uid = self.uid, 170 | image = self.image, 171 | mask = self.mask, 172 | classes = self.classes, 173 | metadata = self.metadata 174 | ) 175 | 176 | def __eq__(self, other: object) -> bool: 177 | if not isinstance(other, ImageAnnotation): 178 | return NotImplemented 179 | return self.image == other.image and self.classes == other.classes and self.mask == other.mask 180 | 181 | def __add__(self, other: ImageAnnotation) -> ImageAnnotation: 182 | if not isinstance(other, ImageAnnotation): # type: ignore - pyright complains about the isinstance check being redundant 183 | return NotImplemented 184 | 185 | classes: Dict[str, ClassAnnotation] = {} 186 | 187 | for key, value in self.classes.items(): 188 | classes[key] = value 189 | 190 | for key, value in other.classes.items(): 191 | if key in classes: 192 | classes[key] += value 193 | else: 194 | classes[key] = value 195 | 196 | return ImageAnnotation( 197 | image = self.image, 198 | classes = classes, 199 | mask = self.mask, 200 | uid = self.uid if self.uid is not None else other.uid, 201 | metadata = self.metadata 202 | ) 203 | 204 | def to_json(self) -> ImageAnnotationJson: 205 | """ 206 | Serializes this image annotation into an `ImageAnnotationJson`. 
207 | """ 208 | json: ImageAnnotationJson = { 209 | "kind": "ImageAnnotation", 210 | "image": self.image.to_json(), 211 | "classes": { 212 | name: class_annotation.to_json() 213 | for name, class_annotation in self.classes.items() 214 | } 215 | } 216 | 217 | if self.mask is not None: 218 | json["mask"] = self.mask.to_json() 219 | 220 | if self.uid is not None: 221 | json["uid"] = self.uid 222 | 223 | if self.metadata is not None: 224 | json["metadata"] = self.metadata 225 | 226 | return json 227 | 228 | def get_visualization_url(self) -> str: 229 | """ 230 | Generates a URL on the dataTap platform that can be visited to view a 231 | visualization of this `ImageAnnotation`. 232 | """ 233 | params = { 234 | "annotation": json.dumps(self.to_json(), separators = (",", ":")) 235 | } 236 | 237 | return f"{Environment.BASE_URI}/visualizer/single#{urlencode(params, quote_via = quote)}" 238 | 239 | def get_comparison_url(self, other: ImageAnnotation) -> str: 240 | """ 241 | Generates a URL on the dataTap platform that can be visited to view a 242 | visual comparison of this `ImageAnnotation` (which is treated as the 243 | "ground truth") and the `other` argument (which is treated as the 244 | "proposal"). 245 | 246 | This method does not check that the two annotations agree on what image 247 | they are annotating, and will always use this `ImageAnnotation`'s 248 | image. 249 | """ 250 | params = { 251 | "groundTruth": json.dumps(self.to_json(), separators = (",", ":")), 252 | "proposal": json.dumps(other.to_json(), separators = (",", ":")) 253 | } 254 | 255 | return f"{Environment.BASE_URI}/visualizer/compare#{urlencode(params, quote_via = quote)}" 256 | -------------------------------------------------------------------------------- /datatap/droplet/instance.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Dict, Mapping, Optional 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..utils import basic_repr 8 | from .attributes import AttributeValues, AttributeValuesJson 9 | from .bounding_box import BoundingBox, BoundingBoxJson 10 | from .keypoint import Keypoint, KeypointJson 11 | from .segmentation import Segmentation, SegmentationJson 12 | 13 | 14 | class InstanceJson(TypedDict, total = False): 15 | """ 16 | The JSON serialization of an `Instance`. 17 | """ 18 | id: str 19 | boundingBox: BoundingBoxJson 20 | segmentation: SegmentationJson 21 | keypoints: Mapping[str, Optional[KeypointJson]] 22 | attributes: Mapping[str, AttributeValuesJson] 23 | 24 | class Instance: 25 | """ 26 | A single appearance of an object of a particular class within a given image. 27 | """ 28 | 29 | id: Optional[str] 30 | """ 31 | A unique id for this instance (within the context of its containing 32 | annotation). Multiple instances with the same id should be interpreted 33 | to be the same object. 34 | """ 35 | 36 | bounding_box: Optional[BoundingBox] 37 | """ 38 | The bounding box of this instance. 39 | """ 40 | 41 | segmentation: Optional[Segmentation] 42 | """ 43 | The segmentation of this instance. 44 | """ 45 | 46 | keypoints: Optional[Mapping[str, Optional[Keypoint]]] 47 | """ 48 | A mapping from keypoint name to the keypoint within this instance. If a key 49 | maps to `None`, then the annotation is reporting the _absence of_ that 50 | keypoint (i.e., that it is not visible in the image and does not have an 51 | inferrable position in the image). 
52 | """ 53 | 54 | attributes: Optional[Mapping[str, AttributeValues]] 55 | """ 56 | A mapping from attribute name to value. 57 | """ 58 | 59 | @staticmethod 60 | def from_json(json: InstanceJson) -> Instance: 61 | """ 62 | Creates an `Instance` from an `InstanceJson`. 63 | """ 64 | return Instance( 65 | id = json.get("id"), 66 | bounding_box = BoundingBox.from_json(json["boundingBox"]) if "boundingBox" in json else None, 67 | segmentation = Segmentation.from_json(json["segmentation"]) if "segmentation" in json else None, 68 | keypoints = { 69 | name: Keypoint.from_json(keypoint) if keypoint is not None else None 70 | for name, keypoint in json["keypoints"].items() 71 | } if "keypoints" in json else None, 72 | attributes = { 73 | k: AttributeValues.from_json(v) for k, v in json["attributes"].items() 74 | } if "attributes" in json else None 75 | ) 76 | 77 | def __init__( 78 | self, 79 | *, 80 | id: Optional[str] = None, 81 | bounding_box: Optional[BoundingBox] = None, 82 | segmentation: Optional[Segmentation] = None, 83 | keypoints: Optional[Mapping[str, Optional[Keypoint]]] = None, 84 | attributes: Optional[Mapping[str, AttributeValues]] = None 85 | ): 86 | self.id = id 87 | self.bounding_box = bounding_box 88 | self.segmentation = segmentation 89 | self.keypoints = keypoints 90 | self.attributes = attributes 91 | 92 | def __repr__(self) -> str: 93 | return basic_repr( 94 | "Instance", 95 | id = self.id, 96 | bounding_box = self.bounding_box, 97 | segmentation = self.segmentation, 98 | keypoints = self.keypoints, 99 | attributes = self.attributes 100 | ) 101 | 102 | def __eq__(self, other: object) -> bool: 103 | if not isinstance(other, Instance): 104 | return NotImplemented 105 | return ( 106 | self.id == other.id 107 | and self.bounding_box == other.bounding_box 108 | and self.segmentation == other.segmentation 109 | and self.keypoints == other.keypoints 110 | and self.attributes == other.attributes 111 | ) 112 | 113 | def to_json(self) -> InstanceJson: 114 | """ 115 | Serializes an `Instance` into an `InstanceJson`. 116 | """ 117 | json: InstanceJson = {} 118 | 119 | if self.id is not None: 120 | json["id"] = self.id 121 | 122 | if self.bounding_box is not None: 123 | json["boundingBox"] = self.bounding_box.to_json() 124 | 125 | if self.segmentation is not None: 126 | json["segmentation"] = self.segmentation.to_json() 127 | 128 | if self.keypoints is not None: 129 | keypoints: Dict[str, Optional[KeypointJson]] = {} 130 | 131 | for name, keypoint in self.keypoints.items(): 132 | keypoints[name] = keypoint.to_json() if keypoint is not None else None 133 | 134 | json["keypoints"] = keypoints 135 | 136 | if self.attributes is not None: 137 | json["attributes"] = { 138 | k: v.to_json() for k, v in self.attributes.items() 139 | } 140 | 141 | return json 142 | -------------------------------------------------------------------------------- /datatap/droplet/keypoint.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..geometry import Point, PointJson 8 | from ..utils import basic_repr 9 | 10 | class _KeypointJsonOptional(TypedDict, total = False): 11 | occluded: bool 12 | confidence: float 13 | 14 | class KeypointJson(_KeypointJsonOptional, TypedDict): 15 | """ 16 | The JSON serialization of a `Keypoint`. 
17 | """ 18 | point: PointJson 19 | 20 | class Keypoint: 21 | """ 22 | An object representing a specific keypoint in a particular instance. 23 | """ 24 | 25 | point: Point 26 | """ 27 | The point in the image where this keypoint appears. 28 | """ 29 | 30 | occluded: Optional[bool] 31 | """ 32 | Whether this keypoint is occluded. 33 | 34 | If `False`, the keypoint is visible within the image. 35 | If `True`, the keypoint is not visible in the image because it is blocked by some other object, 36 | but has an inferrable position that would lie within the frame of the image. 37 | If `None`, then the data source did not differentiate between occluded and unoccluded keypoints. 38 | """ 39 | 40 | confidence: Optional[float] 41 | """ 42 | The confidence associated with this keypoint. 43 | """ 44 | 45 | @staticmethod 46 | def from_json(json: KeypointJson) -> Keypoint: 47 | """ 48 | Creates a `Keypoint` from a `KeypointJson`. 49 | """ 50 | return Keypoint( 51 | Point.from_json(json["point"]), 52 | occluded = json.get("occluded"), 53 | confidence = json.get("confidence") 54 | ) 55 | 56 | def __init__(self, point: Point, *, occluded: Optional[bool] = None, confidence: Optional[float] = None): 57 | self.point = point 58 | self.occluded = occluded 59 | self.confidence = confidence 60 | 61 | self.point.assert_valid() 62 | 63 | def __repr__(self) -> str: 64 | return basic_repr("Keypoint", self.point, occluded = self.occluded, confidence = self.confidence) 65 | 66 | def __eq__(self, other: object) -> bool: 67 | if not isinstance(other, Keypoint): 68 | return NotImplemented 69 | return self.point == other.point and self.occluded == other.occluded and self.confidence == other.confidence 70 | 71 | def to_json(self) -> KeypointJson: 72 | """ 73 | Serializes this object into a `KeypointJson`. 74 | """ 75 | json: KeypointJson = { 76 | "point": self.point.to_json() 77 | } 78 | 79 | if self.occluded is not None: 80 | json["occluded"] = self.occluded 81 | 82 | if self.confidence is not None: 83 | json["confidence"] = self.confidence 84 | 85 | return json 86 | -------------------------------------------------------------------------------- /datatap/droplet/multi_instance.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..utils import basic_repr 8 | from .bounding_box import BoundingBox, BoundingBoxJson 9 | from .segmentation import Segmentation, SegmentationJson 10 | 11 | 12 | class MultiInstanceJson(TypedDict, total = False): 13 | """ 14 | The JSON serialization of a `MultiInstance`. 15 | """ 16 | boundingBox: BoundingBoxJson 17 | segmentation: SegmentationJson 18 | count: int 19 | 20 | class MultiInstance: 21 | """ 22 | An appearance of a group of objects of a particular class in a particular image. 23 | 24 | There is not a strict definition as to when a group of instances should be categorized as a multi-instance. 25 | As such, when constructing a dataset, it is best to ensure that all of the `DataSource`s agree on what 26 | constitutes a `MultiInstance`. These are most often used in public datasets when the cost of annotating 27 | every instance would be too high. 28 | """ 29 | 30 | bounding_box: Optional[BoundingBox] 31 | """ 32 | The bounding box of this multi-instance. 33 | """ 34 | 35 | segmentation: Optional[Segmentation] 36 | """ 37 | The segmentation of this multi-instance. 
38 | """ 39 | 40 | count: Optional[int] 41 | """ 42 | A count of how many true instances are encapsulated in this multi-instance. 43 | """ 44 | 45 | @staticmethod 46 | def from_json(json: MultiInstanceJson) -> MultiInstance: 47 | """ 48 | Creates a `MultiInstance` from a `MultiInstanceJson`. 49 | """ 50 | return MultiInstance( 51 | bounding_box = BoundingBox.from_json(json["boundingBox"]) if "boundingBox" in json else None, 52 | segmentation = Segmentation.from_json(json["segmentation"]) if "segmentation" in json else None, 53 | count = json.get("count") 54 | ) 55 | 56 | def __init__( 57 | self, 58 | *, 59 | bounding_box: Optional[BoundingBox] = None, 60 | segmentation: Optional[Segmentation] = None, 61 | count: Optional[int] = None 62 | ): 63 | self.bounding_box = bounding_box 64 | self.segmentation = segmentation 65 | self.count = count 66 | 67 | def __repr__(self) -> str: 68 | return basic_repr( 69 | "MultiInstance", 70 | bounding_box = self.bounding_box, 71 | segmentation = self.segmentation, 72 | count = self.count 73 | ) 74 | 75 | def __eq__(self, other: object) -> bool: 76 | if not isinstance(other, MultiInstance): 77 | return NotImplemented 78 | return self.bounding_box == other.bounding_box and self.segmentation == other.segmentation and self.count == other.count 79 | 80 | def to_json(self) -> MultiInstanceJson: 81 | """ 82 | Serializes this object as a `MultiInstanceJson`. 83 | """ 84 | json: MultiInstanceJson = {} 85 | 86 | if self.bounding_box is not None: 87 | json["boundingBox"] = self.bounding_box.to_json() 88 | 89 | if self.segmentation is not None: 90 | json["segmentation"] = self.segmentation.to_json() 91 | 92 | if self.count is not None: 93 | json["count"] = self.count 94 | 95 | return json 96 | -------------------------------------------------------------------------------- /datatap/droplet/segmentation.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..geometry import Mask, MaskJson 8 | from ..utils import basic_repr 9 | 10 | class _SegmentationJsonOptional(TypedDict, total = False): 11 | confidence: float 12 | 13 | class SegmentationJson(_SegmentationJsonOptional, TypedDict): 14 | """ 15 | The serialized JSON representation of a segmentation. 16 | """ 17 | mask: MaskJson 18 | 19 | class Segmentation: 20 | """ 21 | A `Segmentation` represents the area within an image taken up by a 22 | detection, specified as a `Mask`. 23 | """ 24 | 25 | mask: Mask 26 | """ 27 | The area within the image where the corresponding detection appears. 28 | """ 29 | 30 | confidence: Optional[float] 31 | """ 32 | The confidence associated with this segmentation. 33 | """ 34 | 35 | @staticmethod 36 | def from_json(json: SegmentationJson) -> Segmentation: 37 | """ 38 | Constructs a `Segmentation` from a `SegmentationJson`. 
39 | """ 40 | return Segmentation( 41 | Mask.from_json(json["mask"]), 42 | confidence = json.get("confidence") 43 | ) 44 | 45 | def __init__(self, mask: Mask, *, confidence: Optional[float] = None): 46 | self.mask = mask 47 | self.confidence = confidence 48 | 49 | self.mask.assert_valid() 50 | 51 | def __repr__(self) -> str: 52 | return basic_repr("Segmentation", self.mask, confidence = self.confidence) 53 | 54 | def __eq__(self, other: object) -> bool: 55 | if not isinstance(other, Segmentation): 56 | return NotImplemented 57 | return self.mask == other.mask and self.confidence == other.confidence 58 | 59 | def to_json(self) -> SegmentationJson: 60 | """ 61 | Serializes this `Segmentation` to a `SegmentationJson`. 62 | """ 63 | json: SegmentationJson = { 64 | "mask": self.mask.to_json() 65 | } 66 | 67 | if self.confidence is not None: 68 | json["confidence"] = self.confidence 69 | 70 | return json 71 | 72 | def meets_confidence_threshold(self, threshold: float) -> bool: 73 | """ 74 | Returns `True` if and only if the confidence of this segmentation is 75 | either unset or is at least the given `threshold`. 76 | """ 77 | return self.confidence is None or self.confidence >= threshold 78 | -------------------------------------------------------------------------------- /datatap/droplet/video.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional, Sequence 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..utils import basic_repr 8 | from .image import Image, ImageJson 9 | 10 | 11 | class VideoJson(TypedDict, total = False): 12 | """ 13 | The serialized JSON representation of an `Video`. 14 | """ 15 | 16 | uid: str 17 | paths: Sequence[str] 18 | frames: Sequence[ImageJson] 19 | 20 | 21 | class Video: 22 | """ 23 | The `Video` class contains information about what Video was 24 | labeled by a given annotation. It also includes utilities 25 | for loading and manipulating Videos. 26 | """ 27 | 28 | uid: Optional[str] 29 | """ 30 | A unique ID for this Video. 31 | """ 32 | 33 | paths: Optional[Sequence[str]] 34 | """ 35 | A sequence of URIs where the media can be found. The loader 36 | will try them in order until it finds one it can load. 37 | 38 | Supported schemes include `http(s):`, `s3:` 39 | """ 40 | 41 | frames: Optional[Sequence[Image]] 42 | """ 43 | A sequence of images representing the video. 44 | """ 45 | 46 | @staticmethod 47 | def from_json(json: VideoJson) -> Video: 48 | """ 49 | Creates an `Video` from an `VideoJson`. 50 | """ 51 | return Video( 52 | uid = json.get("uid"), 53 | paths = json.get("paths"), 54 | frames = [Image.from_json(frame) for frame in json["frames"]] if "frames" in json else None 55 | ) 56 | 57 | def __init__( 58 | self, 59 | *, 60 | uid: Optional[str] = None, 61 | paths: Optional[Sequence[str]] = None, 62 | frames: Optional[Sequence[Image]] = None 63 | ): 64 | self.uid = uid 65 | self.paths = paths 66 | self.frames = frames 67 | 68 | def __repr__(self) -> str: 69 | return basic_repr("Video", uid = self.uid, paths = self.paths, frames = self.frames) 70 | 71 | def __eq__(self, other: object) -> bool: 72 | if not isinstance(other, Video): 73 | return NotImplemented 74 | return self.paths == other.paths 75 | 76 | def to_json(self) -> VideoJson: 77 | """ 78 | Serializes this `Video` into an `VideoJson`. 
79 | """ 80 | json: VideoJson = {} 81 | 82 | if self.uid is not None: 83 | json["uid"] = self.uid 84 | 85 | if self.paths is not None: 86 | json["paths"] = self.paths 87 | 88 | if self.frames is not None: 89 | json["frames"] = [frame.to_json() for frame in self.frames] 90 | 91 | return json 92 | -------------------------------------------------------------------------------- /datatap/droplet/video_annotation.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Callable, Mapping, Optional, Sequence 4 | from datatap.droplet.video import Video, VideoJson 5 | 6 | from typing_extensions import Literal, TypedDict 7 | 8 | from ..utils import basic_repr 9 | from .instance import Instance 10 | from .multi_instance import MultiInstance 11 | from .frame_annotation import FrameAnnotation, FrameAnnotationJson 12 | 13 | 14 | class _VideoAnnotationJsonOptional(TypedDict, total = False): 15 | uid: str 16 | metadata: Mapping[str, Any] 17 | 18 | class VideoAnnotationJson(_VideoAnnotationJsonOptional, TypedDict): 19 | """ 20 | The serialized JSON representation of an video annotation. 21 | """ 22 | 23 | kind: Literal["VideoAnnotation"] 24 | video: VideoJson 25 | frames: Sequence[FrameAnnotationJson] 26 | 27 | class VideoAnnotation: 28 | """ 29 | A collection of class annotations that annotate a given image. 30 | """ 31 | 32 | video: Video 33 | """ 34 | The video being annotated. 35 | """ 36 | 37 | uid: Optional[str] 38 | """ 39 | A unique identifier for this image annotation. 40 | """ 41 | 42 | metadata: Optional[Mapping[str, Any]] 43 | """ 44 | An optional field for storing metadata on the annotation. 45 | """ 46 | 47 | @staticmethod 48 | def from_json(json: Mapping[str, Any]) -> VideoAnnotation: 49 | """ 50 | Constructs an `VideoAnnotation` from an `VideoAnnotationJson`. 51 | """ 52 | return VideoAnnotation( 53 | video = Video.from_json(json["video"]), 54 | frames = [FrameAnnotation.from_json(frame) for frame in json["frames"]], 55 | uid = json.get("uid"), 56 | metadata = json.get("metadata") 57 | ) 58 | 59 | def __init__( 60 | self, 61 | *, 62 | video: Video, 63 | frames: Sequence[FrameAnnotation], 64 | uid: Optional[str] = None, 65 | metadata: Optional[Mapping[str, Any]] = None 66 | ): 67 | self.video = video 68 | self.frames = frames 69 | self.uid = uid 70 | self.metadata = metadata 71 | 72 | def filter_detections( 73 | self, 74 | *, 75 | instance_filter: Callable[[Instance], bool], 76 | multi_instance_filter: Callable[[MultiInstance], bool] 77 | ) -> VideoAnnotation: 78 | """ 79 | Returns a new image annotation consisting only of the instances and 80 | multi-instances that meet the given constraints. 81 | """ 82 | return VideoAnnotation( 83 | video = self.video, 84 | frames = [ 85 | frame.filter_detections( 86 | instance_filter = instance_filter, 87 | multi_instance_filter = multi_instance_filter 88 | ) 89 | for frame in self.frames 90 | ], 91 | uid = self.uid, 92 | metadata = self.metadata 93 | ) 94 | 95 | def apply_bounding_box_confidence_threshold(self, threshold: float) -> VideoAnnotation: 96 | """ 97 | Returns a new image annotation consisting only of the instances and 98 | multi-instances that have bounding boxes which either do not have a 99 | confidence specified or which have a confience meeting the given 100 | threshold. 
101 | """ 102 | return self.filter_detections( 103 | instance_filter = lambda instance: ( 104 | instance.bounding_box is not None 105 | and instance.bounding_box.meets_confidence_threshold(threshold) 106 | ), 107 | multi_instance_filter = lambda multi_instance: ( 108 | multi_instance.bounding_box is not None 109 | and multi_instance.bounding_box.meets_confidence_threshold(threshold) 110 | ) 111 | ) 112 | 113 | def apply_segmentation_confidence_threshold(self, threshold: float) -> VideoAnnotation: 114 | """ 115 | Returns a new image annotation consisting only of the instances and 116 | multi-instances that have segmentations which either do not have a 117 | confidence specified or which have a confience meeting the given 118 | threshold. 119 | """ 120 | return self.filter_detections( 121 | instance_filter = lambda instance: ( 122 | instance.segmentation is not None 123 | and instance.segmentation.meets_confidence_threshold(threshold) 124 | ), 125 | multi_instance_filter = lambda multi_instance: ( 126 | multi_instance.segmentation is not None 127 | and multi_instance.segmentation.meets_confidence_threshold(threshold) 128 | ) 129 | ) 130 | 131 | def apply_metadata(self, metadata: Mapping[str, Any]) -> VideoAnnotation: 132 | """ 133 | Returns a new image annotation with the supplied metadata. 134 | """ 135 | return VideoAnnotation( 136 | video = self.video, 137 | frames = self.frames, 138 | uid = self.uid, 139 | metadata = metadata 140 | ) 141 | 142 | def __repr__(self) -> str: 143 | return basic_repr( 144 | "VideoAnnotation", 145 | uid = self.uid, 146 | video = self.video, 147 | frames = self.frames, 148 | metadata = self.metadata 149 | ) 150 | 151 | def __eq__(self, other: object) -> bool: 152 | if not isinstance(other, VideoAnnotation): 153 | return NotImplemented 154 | return ( 155 | self.video == other.video 156 | and self.frames == other.frames 157 | and self.uid == other.uid 158 | and self.metadata == other.metadata 159 | ) 160 | 161 | def __add__(self, other: VideoAnnotation) -> VideoAnnotation: 162 | if not isinstance(other, VideoAnnotation): # type: ignore - pyright complains about the isinstance check being redundant 163 | return NotImplemented 164 | 165 | if len(self.frames) != len(other.frames): 166 | raise ValueError("Unable to merge VideoAnnotations with different number of frames") 167 | 168 | return VideoAnnotation( 169 | video = self.video, 170 | frames = [ 171 | frame1 + frame2 172 | for frame1, frame2 in zip(self.frames, other.frames) 173 | ], 174 | uid = self.uid, 175 | metadata = self.metadata 176 | ) 177 | 178 | def to_json(self) -> VideoAnnotationJson: 179 | """ 180 | Serializes this image annotation into an `VideoAnnotationJson`. 
181 | """ 182 | json: VideoAnnotationJson = { 183 | "kind": "VideoAnnotation", 184 | "video": self.video.to_json(), 185 | "frames": [frame.to_json() for frame in self.frames] 186 | } 187 | 188 | if self.uid is not None: 189 | json["uid"] = self.uid 190 | 191 | if self.metadata is not None: 192 | json["metadata"] = self.metadata 193 | 194 | return json 195 | -------------------------------------------------------------------------------- /datatap/examples/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example code 3 | """ -------------------------------------------------------------------------------- /datatap/geometry/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides geometric primitives for storing or manipulating ML annotations. 3 | 4 | Generally speaking, a geometric object is considered "valid" in the droplet format when it lies entirely within the unit 5 | plane. This is because annotations in the droplet format are scaled to 0-to-1 along both axes so that they are 6 | resolution-independent. This can be checked by invoking `assert_valid` on any of the geometric objects (though this is 7 | done automatically when geometric constructs are used to create droplets). 8 | """ 9 | 10 | from .mask import Mask, MaskJson 11 | from .point import Point, PointJson 12 | from .polygon import Polygon, PolygonJson 13 | from .rectangle import Rectangle, RectangleJson 14 | 15 | __all__ = [ 16 | "Mask", 17 | "MaskJson", 18 | "Point", 19 | "PointJson", 20 | "Polygon", 21 | "PolygonJson", 22 | "Rectangle", 23 | "RectangleJson" 24 | ] 25 | -------------------------------------------------------------------------------- /datatap/geometry/mask.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from datatap.geometry.point import Point 3 | 4 | from typing import Generator, Sequence, Tuple, Union 5 | 6 | from .polygon import Polygon, PolygonJson 7 | from ..utils import basic_repr 8 | 9 | MaskJson = Sequence[PolygonJson] 10 | 11 | class Mask: 12 | """ 13 | The shape resulting from XORing a set of polygons in 2D space. 14 | 15 | Generally, the expectation is that the polygons have no edge itersections; specifically, that for any pair of 16 | polygons in the mask, either they have no intersection or one completely contains the other. However, there is no 17 | assertion that this is the case, and generally speaking, the even-odd rule is used to determine if a particular 18 | point is contained by the mask. 19 | """ 20 | 21 | polygons: Sequence[Polygon] 22 | """ 23 | The constituent polygons of this `Mask`. 24 | """ 25 | 26 | @staticmethod 27 | def from_json(json: MaskJson) -> Mask: 28 | """ 29 | Creates a `Mask` from a `MaskJson`. 30 | """ 31 | return Mask([Polygon.from_json(poly) for poly in json]) 32 | 33 | def __init__(self, polygons: Sequence[Polygon]): 34 | self.polygons = polygons 35 | 36 | if len(self.polygons) < 1: 37 | raise ValueError(f"A mask must have at least one polygon; failed on mask {repr(self)}") 38 | 39 | def scale(self, factor: Union[float, int, Tuple[float, float], Point]) -> Mask: 40 | """ 41 | Resizes the mask according to `factor`. 
The scaling factor can either be 42 | a scalar (`int` or `float`), in which case the mask will be scaled by 43 | the same factor on both axes, or a point-like (`Tuple[float, float]` 44 | or `Point`), in which case the mask will be scaled independently on each 45 | axis. 46 | """ 47 | return Mask([p.scale(factor) for p in self.polygons]) 48 | 49 | def to_json(self) -> MaskJson: 50 | """ 51 | Serializes this object as a `MaskJson`. 52 | """ 53 | return [polygon.to_json() for polygon in self.polygons] 54 | 55 | def assert_valid(self) -> None: 56 | """ 57 | Asserts that this mask is valid on the unit plane. 58 | """ 59 | for polygon in self.polygons: 60 | polygon.assert_valid() 61 | # TODO(mdsavage): check for invalid polygon intersections? 62 | 63 | def __repr__(self) -> str: 64 | return basic_repr("Mask", self.polygons) 65 | 66 | def __eq__(self, other: object) -> bool: 67 | # TODO(mdsavage): currently, this requires the polygons to be in the same order, not just represent the same mask 68 | if not isinstance(other, Mask): 69 | return NotImplemented 70 | return self.polygons == other.polygons 71 | 72 | def __iter__(self) -> Generator[Polygon, None, None]: 73 | yield from self.polygons 74 | -------------------------------------------------------------------------------- /datatap/geometry/point.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Tuple, Union 4 | 5 | from ..utils import basic_repr 6 | 7 | PointJson = Tuple[float, float] 8 | 9 | class Point: 10 | """ 11 | A point in 2D space. Also often used to represent a 2D vector. 12 | """ 13 | 14 | x: float 15 | """ 16 | The x-coordinate of the point. 17 | """ 18 | 19 | y: float 20 | """ 21 | The y-coordinate of the point. 22 | """ 23 | 24 | @staticmethod 25 | def from_json(json: PointJson) -> Point: 26 | """ 27 | Creates a `Point` from a `PointJson`. 28 | """ 29 | return Point(json[0], json[1]) 30 | 31 | def __init__(self, x: float, y: float, clip: bool = False): 32 | self.x = min(max(x, 0), 1) if clip else x 33 | self.y = min(max(y, 0), 1) if clip else y 34 | 35 | def to_json(self) -> PointJson: 36 | """ 37 | Serializes this object as a `PointJson`. 38 | """ 39 | return (self.x, self.y) 40 | 41 | def distance(self, other: Point) -> float: 42 | """ 43 | Computes the scalar distance to another point. 44 | """ 45 | return ((self.x - other.x) ** 2 + (self.y - other.y) ** 2) ** 0.5 46 | 47 | def assert_valid(self) -> None: 48 | """ 49 | Asserts that this polygon is valid on the unit plane. 50 | """ 51 | assert 0 <= self.x <= 1 and 0 <= self.y <= 1, f"Point coordinates must be between 0 and 1; failed on point {repr(self)}" 52 | 53 | def clip(self) -> Point: 54 | """ 55 | Clips both coordinates of this point to the range [0, 1]. 56 | """ 57 | return Point(self.x, self.y, clip = True) 58 | 59 | def scale(self, factor: Union[float, int, Tuple[float, float], Point]) -> Point: 60 | """ 61 | Resizes the point according to `factor`. The scaling factor can either 62 | be a scalar (`int` or `float`), in which case the point will be scaled 63 | by the same factor on both axes, or a point-like (`Tuple[float, float]` 64 | or `Point`), in which case the point will be scaled independently on 65 | each axis. 
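
# A minimal sketch of the even-odd rule described above: a polygon nested inside
# another cuts a hole in the mask, producing a ring. Coordinates are illustrative.
from datatap.geometry import Mask, Point, Polygon

outer = Polygon([Point(0.1, 0.1), Point(0.9, 0.1), Point(0.9, 0.9), Point(0.1, 0.9)])
hole = Polygon([Point(0.4, 0.4), Point(0.6, 0.4), Point(0.6, 0.6), Point(0.4, 0.6)])

ring = Mask([outer, hole])
ring.assert_valid()

# Scaling by an (x, y) tuple scales each axis independently.
half_ring = ring.scale((0.5, 0.5))
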
66 | """ 67 | if isinstance(factor, (float, int)): 68 | return self * factor 69 | if isinstance(factor, tuple): 70 | return Point(self.x * factor[0], self.y * factor[1]) 71 | return Point(self.x * factor.x, self.x * factor.y) 72 | 73 | def __add__(self, o: Point) -> Point: 74 | if isinstance(o, Point): # type: ignore - pyright complains about the isinstance check being redundant 75 | return Point(self.x + o.x, self.y + o.y) 76 | return NotImplemented 77 | 78 | def __sub__(self, o: Point) -> Point: 79 | if isinstance(o, Point): # type: ignore - pyright complains about the isinstance check being redundant 80 | return Point(self.x - o.x, self.y - o.y) 81 | return NotImplemented 82 | 83 | def __mul__(self, o: Union[int, float]) -> Point: 84 | if isinstance(o, (int, float)): # type: ignore - pyright complains about the isinstance check being redundant 85 | return Point(self.x * o, self.y * o) 86 | return NotImplemented 87 | 88 | def __truediv__(self, o: Union[int, float]) -> Point: 89 | if isinstance(o, (int, float)): # type: ignore - pyright complains about the isinstance check being redundant 90 | return Point(self.x / o, self.y / o) 91 | return NotImplemented 92 | 93 | def __repr__(self) -> str: 94 | return basic_repr("Point", self.x, self.y) 95 | 96 | def __hash__(self) -> int: 97 | return hash((self.x, self.y)) 98 | 99 | def __eq__(self, other: object) -> bool: 100 | if isinstance(other, Point): 101 | return self.x == other.x and self.y == other.y 102 | return NotImplemented 103 | -------------------------------------------------------------------------------- /datatap/geometry/polygon.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Generator, Sequence, Tuple, Union 4 | 5 | from .point import Point, PointJson 6 | from ..utils import basic_repr 7 | 8 | PolygonJson = Sequence[PointJson] 9 | 10 | class Polygon: 11 | """ 12 | A polygon in 2D space. 13 | """ 14 | 15 | points: Sequence[Point] 16 | """ 17 | The vertices of this polygon. 18 | """ 19 | 20 | @staticmethod 21 | def from_json(json: PolygonJson) -> Polygon: 22 | """ 23 | Creates a `Polygon` from a `PolygonJson`. 24 | """ 25 | return Polygon([Point.from_json(pt) for pt in json]) 26 | 27 | def __init__(self, points: Sequence[Point]): 28 | self.points = points 29 | 30 | if len(self.points) < 3: 31 | raise ValueError(f"A polygon must have at least three points; failed on polygon {repr(self)}") 32 | 33 | def scale(self, factor: Union[float, int, Tuple[float, float], Point]) -> Polygon: 34 | """ 35 | Resizes the polygon according to `factor`. The scaling factor can either 36 | be a scalar (`int` or `float`), in which case the polygon will be scaled 37 | by the same factor on both axes, or a point-like (`Tuple[float, float]` 38 | or `Point`), in which case the polygon will be scaled independently on 39 | each axis. 40 | """ 41 | return Polygon([p.scale(factor) for p in self.points]) 42 | 43 | def to_json(self) -> PolygonJson: 44 | """ 45 | Serializes this object as a `PolygonJson`. 46 | """ 47 | return [point.to_json() for point in self.points] 48 | 49 | def assert_valid(self) -> None: 50 | """ 51 | Ensures that this polygon is valid on the unit plane. 52 | """ 53 | for point in self.points: 54 | point.assert_valid() 55 | # TODO(mdsavage): check for self-intersection? 
56 | 57 | def __repr__(self) -> str: 58 | return basic_repr("Polygon", self.points) 59 | 60 | def __eq__(self, other: object) -> bool: 61 | # TODO(mdsavage): currently, this requires the points to be in the same order, not just represent the same polygon 62 | if not isinstance(other, Polygon): 63 | return NotImplemented 64 | return self.points == other.points 65 | 66 | def __mul__(self, o: Union[int, float]) -> Polygon: 67 | if not isinstance(o, (int, float)): # type: ignore - pyright complains about the isinstance check being redundant 68 | return NotImplemented 69 | return Polygon([p * o for p in self.points]) 70 | 71 | def __iter__(self) -> Generator[Point, None, None]: 72 | yield from self.points -------------------------------------------------------------------------------- /datatap/geometry/rectangle.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from shapely.geometry import box, Polygon as ShapelyPolygon 4 | from typing import Sequence, Tuple, Union 5 | 6 | from .point import Point, PointJson 7 | from ..utils import basic_repr 8 | 9 | RectangleJson = Tuple[PointJson, PointJson] 10 | 11 | class Rectangle: 12 | """ 13 | An axis-aligned rectangle in 2D space. 14 | """ 15 | 16 | p1: Point 17 | """ 18 | The top-left corner of the rectangle. 19 | """ 20 | 21 | p2: Point 22 | """ 23 | The bottom-right corner of the rectangle. 24 | """ 25 | 26 | @staticmethod 27 | def from_json(json: RectangleJson) -> Rectangle: 28 | """ 29 | Creates a `Rectangle` from a `RectangleJson`. 30 | """ 31 | return Rectangle(Point.from_json(json[0]), Point.from_json(json[1])) 32 | 33 | @staticmethod 34 | def from_point_set(points: Sequence[Point]) -> Rectangle: 35 | """ 36 | Creates the bounding rectangle of a set of points. 37 | 38 | Note, it is possible for this to create an invalid rectangle if all points 39 | are colinear and axis-aligned. 40 | """ 41 | return Rectangle( 42 | Point(min(p.x for p in points), min(p.y for p in points)), 43 | Point(max(p.x for p in points), max(p.y for p in points)) 44 | ) 45 | 46 | def __init__(self, p1: Point, p2: Point, normalize: bool = False): 47 | if normalize: 48 | self.p1 = Point(min(p1.x, p2.x), min(p1.y, p2.y)) 49 | self.p2 = Point(max(p1.x, p2.x), max(p1.y, p2.y)) 50 | else: 51 | self.p1 = p1 52 | self.p2 = p2 53 | 54 | def assert_valid(self) -> None: 55 | """ 56 | Ensures that this rectangle is valid on the unit plane. 57 | """ 58 | self.p1.assert_valid() 59 | self.p2.assert_valid() 60 | assert self.p1.x < self.p2.x and self.p1.y < self.p2.y, f"Rectangle has non-positive area; failed on rectangle {repr(self)}" 61 | 62 | def to_json(self) -> RectangleJson: 63 | """ 64 | Serializes this object as a `RectangleJson`. 65 | """ 66 | return (self.p1.to_json(), self.p2.to_json()) 67 | 68 | def to_shapely(self) -> ShapelyPolygon: 69 | """ 70 | Converts this rectangle into a `Shapely.Polygon`. 71 | """ 72 | return box(self.p1.x, self.p1.y, self.p2.x, self.p2.y) 73 | 74 | def to_xywh_tuple(self) -> Tuple[float, float, float, float]: 75 | """ 76 | Converts this rectangle into a tuple of `(x_coordinate, y_coordinate, width, height)`. 77 | """ 78 | w = self.p2.x - self.p1.x 79 | h = self.p2.y - self.p1.y 80 | return (self.p1.x, self.p1.y, w, h) 81 | 82 | def to_xyxy_tuple(self) -> Tuple[float, float, float, float]: 83 | """ 84 | Converts this rectangle into a tuple of `(x_min, y_min, x_max, y_max)`. 
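
# A minimal sketch of deriving a bounding rectangle from scattered points and reading
# it back out in (x, y, w, h) form with the helpers above. Coordinates are chosen to be
# exact in binary floating point so the assertions hold.
from datatap.geometry import Point, Rectangle

points = [Point(0.25, 0.25), Point(0.75, 0.5), Point(0.5, 0.75)]
bounds = Rectangle.from_point_set(points)

assert bounds.to_json() == ((0.25, 0.25), (0.75, 0.75))
assert bounds.to_xywh_tuple() == (0.25, 0.25, 0.5, 0.5)
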
85 | """ 86 | return (self.p1.x, self.p1.y, self.p2.x, self.p2.y) 87 | 88 | def area(self) -> float: 89 | """ 90 | Computes the area of this rectangle. 91 | """ 92 | return abs(self.p1.x - self.p2.x) * abs(self.p1.y - self.p2.y) 93 | 94 | def iou(self, other: Rectangle) -> float: 95 | """ 96 | Computes the iou (intersection-over-union) of this rectangle with another. 97 | """ 98 | x1 = max(self.p1.x, other.p1.x) 99 | y1 = max(self.p1.y, other.p1.y) 100 | x2 = min(self.p2.x, other.p2.x) 101 | y2 = min(self.p2.y, other.p2.y) 102 | intersection_area = max(x2 - x1, 0) * max(y2 - y1, 0) 103 | union_area = self.area() + other.area() - intersection_area 104 | return intersection_area / union_area 105 | 106 | def diagonal(self) -> float: 107 | """ 108 | Computes the diagonal length of this rectangle. 109 | """ 110 | return self.p1.distance(self.p2) 111 | 112 | def scale(self, factor: Union[float, int, Tuple[float, float], Point]): 113 | """ 114 | Resizes the rectangle according to `factor`. The scaling factor can 115 | either be a scalar (`int` or `float`), in which case the rectangle will 116 | be scaled by the same factor on both axes, or a point-like 117 | (`Tuple[float, float]` or `Point`), in which case the rectangle will be 118 | scaled independently on each axis. 119 | """ 120 | return Rectangle(self.p1.scale(factor), self.p2.scale(factor)) 121 | 122 | def center(self) -> Point: 123 | """ 124 | Computes the center of this rectangle. 125 | """ 126 | return Point((self.p1.x + self.p2.x) / 2, (self.p1.y + self.p2.y) / 2) 127 | 128 | def scale_from_center(self, factor: Union[float, int, Tuple[float, float], Point]) -> Rectangle: 129 | """ 130 | Resizes the rectangle according to `factor`, though translates it so 131 | that its center does not move. The scaling factor can either be a scalar 132 | (`int` or `float`), in which case the rectangle will be scaled by the 133 | same factor on both axes, or a point-like (`Tuple[float, float]` or 134 | `Point`), in which case the rectangle will be scaled independently on 135 | each axis. 136 | """ 137 | center = self.center() 138 | return Rectangle( 139 | (self.p1 - center).scale(factor) + center, 140 | (self.p2 - center).scale(factor) + center 141 | ) 142 | 143 | def clip(self) -> Rectangle: 144 | """ 145 | Clips the rectangle the unit-plane. 146 | """ 147 | return Rectangle(self.p1.clip(), self.p2.clip()) 148 | 149 | def normalize(self) -> Rectangle: 150 | """ 151 | Returns a new rectangle that is guaranteed to have `p1` be the top left 152 | corner and `p2` be the bottom right corner. 
153 | """ 154 | return Rectangle(self.p1, self.p2, True) 155 | 156 | def __repr__(self) -> str: 157 | return basic_repr("Rectangle", self.p1, self.p2) 158 | 159 | def __hash__(self) -> int: 160 | return hash((self.p1, self.p2)) 161 | 162 | def __eq__(self, other: object) -> bool: 163 | if not isinstance(other, Rectangle): 164 | return NotImplemented 165 | return self.p1 == other.p1 and self.p2 == other.p2 166 | 167 | def __mul__(self, o: Union[int, float]) -> Rectangle: 168 | if isinstance(o, (int, float)): # type: ignore - pyright complains about the isinstance check being redundant 169 | return Rectangle(self.p1 * o, self.p2 * o) 170 | return NotImplemented 171 | -------------------------------------------------------------------------------- /datatap/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The metrics module provides a number of utilities for analyzing droplets in the context 3 | of a broader training or evaluation job. 4 | 5 | Here are some examples of the metrics module 6 | 7 | ```py 8 | from datatap import Api, metrics 9 | from my_model import model 10 | 11 | api = Api() 12 | dataset = api.get_default_database().get_dataset_list()[0] 13 | latest_version = dataset.latest_version 14 | 15 | confusion_matrix = metrics.ConfusionMatrix(latest_version.template.classes.keys()) 16 | pr_curve = metrics.PrecisionRecallCurve() 17 | 18 | for annotation in latest_version.stream_split("validation"): 19 | prediction = model(annotation) 20 | confusion_matrix.add_annotation(annotation, prediction, 0.5, 0.5) 21 | pr_curve.add_annotation(annotation, prediction, 0.5) 22 | 23 | print(confusion_matrix.matrix) 24 | print(pr_curve.maximize_f1()) 25 | ``` 26 | """ 27 | 28 | from .confusion_matrix import ConfusionMatrix 29 | from .precision_recall_curve import PrecisionRecallCurve, MaximizeF1Result 30 | from .iou import generate_confusion_matrix, generate_pr_curve 31 | 32 | __all__ = [ 33 | "ConfusionMatrix", 34 | "PrecisionRecallCurve", 35 | "MaximizeF1Result", 36 | "generate_confusion_matrix", 37 | "generate_pr_curve", 38 | ] -------------------------------------------------------------------------------- /datatap/metrics/_types.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple 2 | 3 | from datatap.geometry import Rectangle 4 | 5 | class PredictionBox(NamedTuple): 6 | confidence: float 7 | class_name: str 8 | box: Rectangle 9 | 10 | class GroundTruthBox(NamedTuple): 11 | class_name: str 12 | box: Rectangle 13 | 14 | -------------------------------------------------------------------------------- /datatap/metrics/confusion_matrix.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from collections import defaultdict 3 | 4 | from typing import DefaultDict, Iterable, Mapping, Optional, Sequence, cast 5 | 6 | import numpy as np 7 | from scipy.optimize import linear_sum_assignment 8 | 9 | from datatap.droplet import ImageAnnotation 10 | 11 | from ._types import GroundTruthBox, PredictionBox 12 | 13 | class ConfusionMatrix: 14 | """ 15 | Represents a confusion matrix for a collection of annotations. 16 | This class will handle the matching of instances in a ground truth annotations 17 | to instances in a set of matching prediction annotations. 
18 | """ 19 | 20 | # TODO(mdsavage): make this accept matching strategies other than bounding box IOU 21 | 22 | classes: Sequence[str] 23 | """ 24 | A list of the classes that this confusion matrix is tracking. 25 | """ 26 | 27 | matrix: np.ndarray 28 | """ 29 | The current confusion matrix. Entry `(i, j)` represents the number of times that 30 | an instance of `self.classes[i]` was classified as an instance of `self.classes[j]` 31 | """ 32 | 33 | _class_map: Mapping[str, int] 34 | 35 | def __init__(self, classes: Sequence[str], matrix: Optional[np.ndarray] = None): 36 | self.classes = ["__background__"] + list(classes) 37 | self._class_map = dict([(class_name, index) for index, class_name in enumerate(self.classes)]) 38 | dim = len(self.classes) 39 | self.matrix = matrix if matrix is not None else np.zeros((dim, dim)) 40 | 41 | def add_annotation( 42 | self: ConfusionMatrix, 43 | ground_truth: ImageAnnotation, 44 | prediction: ImageAnnotation, 45 | iou_threshold: float, 46 | confidence_threshold: float 47 | ) -> None: 48 | """ 49 | Updates this confusion matrix for the given ground truth and prediction annotations evaluated with the given IOU 50 | threshold, only considering instances meeting the given confidence threshold. 51 | 52 | Note: this handles instances only; multi-instances are ignored. 53 | """ 54 | ground_truth_boxes = [ 55 | GroundTruthBox(class_name, instance.bounding_box.rectangle) 56 | for class_name in ground_truth.classes.keys() 57 | for instance in ground_truth.classes[class_name].instances 58 | if instance.bounding_box is not None 59 | ] 60 | 61 | prediction_boxes = sorted([ 62 | PredictionBox(instance.bounding_box.confidence or 1, class_name, instance.bounding_box.rectangle) 63 | for class_name in prediction.classes.keys() 64 | for instance in prediction.classes[class_name].instances 65 | if instance.bounding_box is not None and instance.bounding_box.meets_confidence_threshold(confidence_threshold) 66 | ], reverse = True, key = lambda p: p.confidence) 67 | 68 | iou_matrix = np.array([ 69 | [ground_truth_box.box.iou(prediction_box.box) for ground_truth_box in ground_truth_boxes] 70 | for prediction_box in prediction_boxes 71 | ], ndmin = 2) 72 | 73 | prediction_indices, ground_truth_indices = linear_sum_assignment(iou_matrix, maximize = True) 74 | 75 | unmatched_ground_truth_box_counts: DefaultDict[str, int] = defaultdict(lambda: 0) 76 | unmatched_prediction_box_counts: DefaultDict[str, int] = defaultdict(lambda: 0) 77 | 78 | for box in ground_truth_boxes: 79 | unmatched_ground_truth_box_counts[box.class_name] += 1 80 | 81 | for box in prediction_boxes: 82 | unmatched_prediction_box_counts[box.class_name] += 1 83 | 84 | for prediction_index, ground_truth_index in zip(cast(Iterable[int], prediction_indices), cast(Iterable[int], ground_truth_indices)): 85 | if iou_matrix[prediction_index, ground_truth_index] >= iou_threshold: 86 | ground_truth_box = ground_truth_boxes[ground_truth_index] 87 | prediction_box = prediction_boxes[prediction_index] 88 | self._add_detection(ground_truth_box.class_name, prediction_box.class_name) 89 | unmatched_ground_truth_box_counts[ground_truth_box.class_name] -= 1 90 | unmatched_prediction_box_counts[prediction_box.class_name] -= 1 91 | 92 | for class_name, count in unmatched_ground_truth_box_counts.items(): 93 | if count > 0: 94 | self._add_false_negative(class_name, count = count) 95 | 96 | for class_name, count in unmatched_prediction_box_counts.items(): 97 | if count > 0: 98 | self._add_false_positive(class_name, count = count) 99 | 100 
| def batch_add_annotation( 101 | self: ConfusionMatrix, 102 | ground_truths: Sequence[ImageAnnotation], 103 | predictions: Sequence[ImageAnnotation], 104 | iou_threshold: float, 105 | confidence_threshold: float 106 | ) -> None: 107 | """ 108 | Updates this confusion matrix with the values from several annotations simultaneously. 109 | """ 110 | for ground_truth, prediction in zip(ground_truths, predictions): 111 | self.add_annotation( 112 | ground_truth, 113 | prediction, 114 | iou_threshold, 115 | confidence_threshold 116 | ) 117 | 118 | def _add_detection(self, ground_truth_class: str, prediction_class: str, count: int = 1) -> None: 119 | r = self._class_map[ground_truth_class] 120 | c = self._class_map[prediction_class] 121 | self.matrix[r, c] += count 122 | 123 | def _add_false_negative(self, ground_truth_class: str, count: int = 1) -> None: 124 | self._add_detection(ground_truth_class, "__background__", count) 125 | 126 | def _add_false_positive(self, ground_truth_class: str, count: int = 1) -> None: 127 | self._add_detection("__background__", ground_truth_class, count) 128 | 129 | 130 | def __add__(self, other: ConfusionMatrix) -> ConfusionMatrix: 131 | if isinstance(other, ConfusionMatrix): # type: ignore - pyright complains about the isinstance check being redundant 132 | return ConfusionMatrix(self.classes, cast(np.ndarray, self.matrix + other.matrix)) 133 | return NotImplemented 134 | -------------------------------------------------------------------------------- /datatap/metrics/iou.py: -------------------------------------------------------------------------------- 1 | from datatap.metrics.confusion_matrix import ConfusionMatrix 2 | from typing import Sequence 3 | 4 | from ..droplet import ImageAnnotation 5 | from ..template import ImageAnnotationTemplate 6 | from .precision_recall_curve import PrecisionRecallCurve 7 | 8 | 9 | def generate_pr_curve(ground_truths: Sequence[ImageAnnotation], predictions: Sequence[ImageAnnotation], iou_threshold: float) -> PrecisionRecallCurve: 10 | """ 11 | Returns a precision-recall curve for the given ground truth and prediction annotation lists evaluated with the given 12 | IOU threshold. 13 | 14 | Note: this handles instances only; multi-instances are ignored. 15 | """ 16 | precision_recall_curve = PrecisionRecallCurve() 17 | precision_recall_curve.batch_add_annotation(ground_truths, predictions, iou_threshold) 18 | return precision_recall_curve 19 | 20 | def generate_confusion_matrix( 21 | template: ImageAnnotationTemplate, 22 | ground_truths: Sequence[ImageAnnotation], 23 | predictions: Sequence[ImageAnnotation], 24 | iou_threshold: float, 25 | confidence_threshold: float 26 | ) -> ConfusionMatrix: 27 | """ 28 | Returns a confusion matrix for the given ground truth and prediction annotation lists evaluated with the given IOU 29 | threshold. 30 | 31 | Note: this handles instances only; multi-instances are ignored. 
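A minimal sketch (illustrative; `template` is assumed to be the dataset's `ImageAnnotationTemplate`, and `ground_truths` / `predictions` are assumed to be parallel lists of `ImageAnnotation`s):

```py
cm = generate_confusion_matrix(template, ground_truths, predictions, 0.5, 0.5)
print(cm.classes)
print(cm.matrix)
```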
32 | """ 33 | confusion_matrix = ConfusionMatrix(sorted(template.classes.keys())) 34 | confusion_matrix.batch_add_annotation(ground_truths, predictions, iou_threshold, confidence_threshold) 35 | return confusion_matrix 36 | -------------------------------------------------------------------------------- /datatap/metrics/precision_recall_curve.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Iterable, Sequence, TYPE_CHECKING, List, NamedTuple, Optional, cast 4 | 5 | import numpy as np 6 | from scipy.optimize import linear_sum_assignment 7 | from sortedcontainers import SortedDict 8 | 9 | from datatap.droplet import ImageAnnotation 10 | 11 | from ._types import GroundTruthBox, PredictionBox 12 | 13 | if TYPE_CHECKING: 14 | import matplotlib.pyplot as plt 15 | 16 | class MaximizeF1Result(NamedTuple): 17 | """ 18 | Represents the precision, recall, and f1 for a given `PrecisionRecallCurve` 19 | at the threshold that maximizes f1. 20 | """ 21 | threshold: float 22 | precision: float 23 | recall: float 24 | f1: float 25 | 26 | class _PrecisionRecallPoint(NamedTuple): 27 | threshold: float 28 | precision: float 29 | recall: float 30 | 31 | class _DetectionEvent(NamedTuple): 32 | true_positive_delta: int 33 | false_positive_delta: int 34 | 35 | def __add__(self, other: _DetectionEvent) -> _DetectionEvent: 36 | if isinstance(other, _DetectionEvent): # type: ignore - pyright complains about the isinstance check being redundant 37 | return _DetectionEvent(self.true_positive_delta + other.true_positive_delta, self.false_positive_delta + other.false_positive_delta) 38 | return NotImplemented 39 | 40 | 41 | class PrecisionRecallCurve: 42 | """ 43 | Represents a curve relating a chosen detection threshold to precision and recall. Internally, this is actually 44 | stored as a sorted list of detection events, which are used to compute metrics on the fly when needed. 
45 | """ 46 | 47 | # TODO(mdsavage): make this accept matching strategies other than bounding box IOU 48 | 49 | events: SortedDict[float, _DetectionEvent] 50 | ground_truth_positives: int 51 | 52 | def __init__(self, events: Optional[SortedDict[float, _DetectionEvent]] = None, ground_truth_positives: int = 0): 53 | self.events = SortedDict() if events is None else events 54 | self.ground_truth_positives = ground_truth_positives 55 | 56 | def clone(self) -> PrecisionRecallCurve: 57 | return PrecisionRecallCurve(self.events.copy(), self.ground_truth_positives) 58 | 59 | def maximize_f1(self) -> MaximizeF1Result: 60 | maximum = MaximizeF1Result(threshold = 1, precision = 0, recall = 0, f1 = 0) 61 | 62 | for threshold, precision, recall in self._compute_curve(): 63 | f1 = 2 / ((1 / precision) + (1 / recall)) if precision > 0 and recall > 0 else 0 64 | if f1 >= maximum.f1: 65 | maximum = MaximizeF1Result(threshold = threshold, precision = precision, recall = recall, f1 = f1) 66 | 67 | return maximum 68 | 69 | def plot(self) -> plt.Figure: 70 | import matplotlib.pyplot as plt 71 | fig = plt.figure() 72 | curve = self._compute_curve() 73 | plt.plot([pt.recall for pt in curve], [pt.precision for pt in curve], "o-") 74 | plt.xlabel("Recall") 75 | plt.ylabel("Precision") 76 | return fig 77 | 78 | def add_annotation( 79 | self: PrecisionRecallCurve, 80 | ground_truth: ImageAnnotation, 81 | prediction: ImageAnnotation, 82 | iou_threshold: float 83 | ) -> None: 84 | """ 85 | Returns a precision-recall curve for the given ground truth and prediction annotations evaluated with the given 86 | IOU threshold. 87 | 88 | Note: this handles instances only; multi-instances are ignored. 89 | """ 90 | ground_truth_boxes = [ 91 | GroundTruthBox(class_name, instance.bounding_box.rectangle) 92 | for class_name in ground_truth.classes.keys() 93 | for instance in ground_truth.classes[class_name].instances 94 | if instance.bounding_box is not None 95 | ] 96 | 97 | prediction_boxes = sorted([ 98 | PredictionBox(instance.bounding_box.confidence or 1, class_name, instance.bounding_box.rectangle) 99 | for class_name in prediction.classes.keys() 100 | for instance in prediction.classes[class_name].instances 101 | if instance.bounding_box is not None 102 | ], reverse = True, key = lambda p: p.confidence) 103 | 104 | iou_matrix = np.array([ 105 | [ground_truth_box.box.iou(prediction_box.box) for ground_truth_box in ground_truth_boxes] 106 | for prediction_box in prediction_boxes 107 | ]) 108 | 109 | self._add_ground_truth_positives(len(ground_truth_boxes)) 110 | 111 | previous_true_positives = 0 112 | previous_false_positives = 0 113 | 114 | for i in range(len(prediction_boxes)): 115 | confidence_threshold = prediction_boxes[i].confidence 116 | 117 | if i < len(prediction_boxes) - 1 and prediction_boxes[i+1].confidence == confidence_threshold: 118 | continue 119 | 120 | prediction_indices, ground_truth_indices = linear_sum_assignment(iou_matrix[:i+1,], maximize = True) 121 | 122 | true_positives = 0 123 | false_positives = max(0, i + 1 - len(ground_truth_boxes)) 124 | 125 | for prediction_index, ground_truth_index in zip(cast(Iterable[int], prediction_indices), cast(Iterable[int], ground_truth_indices)): 126 | if ( 127 | iou_matrix[prediction_index, ground_truth_index] >= iou_threshold 128 | and prediction_boxes[prediction_index].class_name == ground_truth_boxes[ground_truth_index].class_name 129 | ): 130 | true_positives += 1 131 | else: 132 | false_positives += 1 133 | 134 | self._add_event(confidence_threshold, 
_DetectionEvent( 135 | true_positive_delta = true_positives - previous_true_positives, 136 | false_positive_delta = false_positives - previous_false_positives 137 | )) 138 | 139 | previous_true_positives = true_positives 140 | previous_false_positives = false_positives 141 | 142 | def batch_add_annotation( 143 | self: PrecisionRecallCurve, 144 | ground_truths: Sequence[ImageAnnotation], 145 | predictions: Sequence[ImageAnnotation], 146 | iou_threshold: float 147 | ) -> None: 148 | """ 149 | Updates this precision-recall curve with the values from several annotations simultaneously. 150 | """ 151 | for ground_truth, prediction in zip(ground_truths, predictions): 152 | self.add_annotation(ground_truth, prediction, iou_threshold) 153 | 154 | def _compute_curve(self) -> List[_PrecisionRecallPoint]: 155 | assert self.ground_truth_positives > 0 156 | precision_recall_points: List[_PrecisionRecallPoint] = [] 157 | 158 | true_positives = 0 159 | detections = 0 160 | 161 | for threshold in reversed(self.events): 162 | true_positive_delta, false_positive_delta = self.events[threshold] 163 | true_positives += true_positive_delta 164 | detections += true_positive_delta + false_positive_delta 165 | assert detections > 0 166 | 167 | precision_recall_points.append(_PrecisionRecallPoint( 168 | threshold = threshold, 169 | precision = true_positives / detections, 170 | recall = true_positives / self.ground_truth_positives 171 | )) 172 | 173 | return precision_recall_points 174 | 175 | def _add_event(self, threshold: float, event: _DetectionEvent) -> None: 176 | if threshold not in self.events: 177 | self.events[threshold] = _DetectionEvent(0, 0) 178 | self.events[threshold] += event 179 | 180 | def _add_ground_truth_positives(self, count: int) -> None: 181 | self.ground_truth_positives += count 182 | 183 | def __add__(self, other: PrecisionRecallCurve) -> PrecisionRecallCurve: 184 | if isinstance(other, PrecisionRecallCurve): # type: ignore - pyright complains about the isinstance check being redundant 185 | ret = self.clone() 186 | ret._add_ground_truth_positives(other.ground_truth_positives) 187 | 188 | for threshold, event in other.events.items(): 189 | ret._add_event(threshold, event) 190 | 191 | return ret 192 | return NotImplemented 193 | 194 | -------------------------------------------------------------------------------- /datatap/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zensors/datatap-python/1a05b354f20bbbc24b9bad76e919e0cc9542538c/datatap/py.typed -------------------------------------------------------------------------------- /datatap/template/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Templates are used to describe how a given annotation (or set of annotations) is structured. 3 | 4 | All `Dataset`s and `DatasetVersion`s will have templates attached to them. 
If you need to 5 | create your own template (for instance, in order to create a new dataset), you can 6 | instantiate them as such: 7 | 8 | ```py 9 | from datatap.template import ImageAnnotationTemplate, ClassAnnotationTemplate, InstanceTemplate 10 | 11 | ImageAnnotationTemplate(classes = { 12 | "person": ClassAnnotationTemplate( 13 | instances = InstanceTemplate( 14 | bounding_box = True, 15 | segmentation = False, # this could also be omitted, since False is the default 16 | keypoints = { "head", "left shoulder", "right shoulder" }, 17 | attributes = { "face mask": { "present", "absent" } } 18 | ) 19 | ) 20 | }) 21 | ``` 22 | """ 23 | 24 | 25 | from .class_annotation_template import ClassAnnotationTemplate 26 | from .frame_annotation_template import FrameAnnotationTemplate 27 | from .image_annotation_template import ImageAnnotationTemplate 28 | from .instance_template import InstanceTemplate 29 | from .multi_instance_template import MultiInstanceTemplate 30 | from .video_annotation_template import VideoAnnotationTemplate 31 | 32 | __all__ = [ 33 | "ClassAnnotationTemplate", 34 | "FrameAnnotationTemplate", 35 | "ImageAnnotationTemplate", 36 | "InstanceTemplate", 37 | "MultiInstanceTemplate", 38 | "VideoAnnotationTemplate", 39 | ] 40 | -------------------------------------------------------------------------------- /datatap/template/class_annotation_template.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..utils import basic_repr 8 | from .instance_template import InstanceTemplate, InstanceTemplateJson 9 | from .multi_instance_template import MultiInstanceTemplate, MultiInstanceTemplateJson 10 | 11 | class ClassAnnotationTemplateJson(TypedDict, total=False): 12 | """ 13 | The serialized JSON representation of a class annotation template. 14 | """ 15 | 16 | instances: InstanceTemplateJson 17 | multiInstances: MultiInstanceTemplateJson 18 | 19 | class ClassAnnotationTemplate(): 20 | """ 21 | A `ClassAnnotationTemplate` describes what each class should provide. 22 | 23 | In practice, most of the specification is delegated to its constituent templates, 24 | `instances` and `multi_instances`. 25 | """ 26 | 27 | instances: Optional[InstanceTemplate] 28 | """ 29 | An `InstanceTemplate` that describes how instances are structured. 30 | """ 31 | 32 | multi_instances: Optional[MultiInstanceTemplate] 33 | """ 34 | A `MultiInstanceTemplate` that describes how multi-instances are structured. 35 | """ 36 | 37 | def __init__( 38 | self, 39 | *, 40 | instances: Optional[InstanceTemplate] = None, 41 | multi_instances: Optional[MultiInstanceTemplate] = None 42 | ): 43 | self.instances = instances 44 | self.multi_instances = multi_instances 45 | 46 | def to_json(self) -> ClassAnnotationTemplateJson: 47 | """ 48 | Serializes this object into JSON. 49 | """ 50 | json = ClassAnnotationTemplateJson() 51 | if self.instances is not None: json["instances"] = self.instances.to_json() 52 | if self.multi_instances is not None: json["multiInstances"] = self.multi_instances.to_json() 53 | return json 54 | 55 | @staticmethod 56 | def from_json(json: ClassAnnotationTemplateJson) -> ClassAnnotationTemplate: 57 | """ 58 | Deserializes a JSON object into a `ClassAnnotationTemplate`.
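For example, a minimal sketch of the expected JSON shape (keys follow `ClassAnnotationTemplateJson`; the particular values are illustrative):

```py
template = ClassAnnotationTemplate.from_json({
    "instances": { "boundingBox": True, "keypoints": ["head"] }
})
```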
59 | """ 60 | instances = InstanceTemplate.from_json(json["instances"]) if "instances" in json else None 61 | multi_instances = MultiInstanceTemplate.from_json(json["multiInstances"]) if "multiInstances" in json else None 62 | return ClassAnnotationTemplate(instances=instances, multi_instances=multi_instances) 63 | 64 | def __repr__(self) -> str: 65 | return basic_repr( 66 | "ClassAnnotationTemplate", 67 | instances = self.instances, 68 | multi_instances = self.multi_instances 69 | ) 70 | -------------------------------------------------------------------------------- /datatap/template/frame_annotation_template.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Dict, Mapping 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..utils import basic_repr 8 | from .class_annotation_template import ClassAnnotationTemplate, ClassAnnotationTemplateJson 9 | 10 | class FrameAnnotationTemplateJson(TypedDict): 11 | """ 12 | The serialized JSON representation of a frame annotation template. 13 | """ 14 | 15 | classes: Dict[str, ClassAnnotationTemplateJson] 16 | 17 | class FrameAnnotationTemplate(): 18 | """ 19 | Describes how a `FrameAnnotation` is structured. 20 | 21 | For each of its classes, it provides a `ClassAnnotationTemplate`. 22 | """ 23 | 24 | classes: Mapping[str, ClassAnnotationTemplate] 25 | """ 26 | A mapping from class name to `ClassAnnotationTemplate`. 27 | """ 28 | 29 | def __init__(self, *, classes: Mapping[str, ClassAnnotationTemplate]): 30 | self.classes = classes 31 | 32 | def to_json(self) -> FrameAnnotationTemplateJson: 33 | """ 34 | Serializes this object to JSON. 35 | """ 36 | return { 37 | "classes": { 38 | class_name: class_template.to_json() 39 | for class_name, class_template in self.classes.items() 40 | } 41 | } 42 | 43 | @staticmethod 44 | def from_json(json: FrameAnnotationTemplateJson) -> FrameAnnotationTemplate: 45 | """ 46 | Deserializes a JSON object into a `FrameAnnotationTemplate`. 47 | """ 48 | classes = { 49 | key: ClassAnnotationTemplate.from_json(value) 50 | for key, value in json.get("classes", {}).items() 51 | } 52 | 53 | return FrameAnnotationTemplate(classes=classes) 54 | 55 | def __repr__(self) -> str: 56 | return basic_repr( 57 | "FrameAnnotationTemplate", 58 | classes = self.classes 59 | ) 60 | -------------------------------------------------------------------------------- /datatap/template/image_annotation_template.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Dict, Mapping 4 | 5 | from typing_extensions import Literal, TypedDict 6 | 7 | from ..utils import basic_repr 8 | from .class_annotation_template import ClassAnnotationTemplate, ClassAnnotationTemplateJson 9 | 10 | class ImageAnnotationTemplateJson(TypedDict): 11 | """ 12 | The serialized JSON representation of an image annotation template. 13 | """ 14 | 15 | kind: Literal["ImageAnnotationTemplate"] 16 | classes: Dict[str, ClassAnnotationTemplateJson] 17 | 18 | class ImageAnnotationTemplate(): 19 | """ 20 | Describes how an `ImageAnnotation` is structured. 21 | 22 | For each of its classes, it provides a `ClassAnnotationTemplate`. 23 | """ 24 | 25 | classes: Mapping[str, ClassAnnotationTemplate] 26 | """ 27 | A mapping from class name to `ClassAnnotationTemplate`. 
28 | """ 29 | 30 | def __init__(self, *, classes: Mapping[str, ClassAnnotationTemplate]): 31 | self.classes = classes 32 | 33 | def to_json(self) -> ImageAnnotationTemplateJson: 34 | """ 35 | Serializes this object to JSON. 36 | """ 37 | return { 38 | "kind": "ImageAnnotationTemplate", 39 | "classes": { 40 | class_name: class_template.to_json() 41 | for class_name, class_template in self.classes.items() 42 | } 43 | } 44 | 45 | @staticmethod 46 | def from_json(json: ImageAnnotationTemplateJson) -> ImageAnnotationTemplate: 47 | """ 48 | Deserializes a JSON object into an `ImageAnnotationTemplate`. 49 | """ 50 | classes = { 51 | key: ClassAnnotationTemplate.from_json(value) 52 | for key, value in json.get("classes", {}).items() 53 | } 54 | 55 | return ImageAnnotationTemplate(classes=classes) 56 | 57 | def __repr__(self) -> str: 58 | return basic_repr( 59 | "ImageAnnotationTemplate", 60 | classes = self.classes 61 | ) 62 | -------------------------------------------------------------------------------- /datatap/template/instance_template.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import AbstractSet, Dict, List, Mapping 4 | 5 | from typing_extensions import TypedDict 6 | 7 | from ..utils import basic_repr 8 | 9 | 10 | class InstanceTemplateJson(TypedDict, total=False): 11 | """ 12 | The serialized JSON representation of an instance template. 13 | """ 14 | 15 | id: bool 16 | boundingBox: bool 17 | segmentation: bool 18 | keypoints: List[str] 19 | attributes: Dict[str, List[str]] 20 | 21 | class InstanceTemplate(): 22 | """ 23 | Describes how an individual instance is structured. 24 | """ 25 | 26 | id: bool 27 | """ 28 | If `id` is `True`, then all corresponding `Instance`s will have an ID 29 | that uniquely identifies the object represented by the instance in the 30 | context of the containing annotation. 31 | """ 32 | 33 | bounding_box: bool 34 | """ 35 | If `bounding_box` is `True`, then all corresponding `Instance`s will have a 36 | `BoundingBox` representing the bounds of their shape. 37 | """ 38 | 39 | segmentation: bool 40 | """ 41 | If `segmentation` is `True`, then all corresponding `Instance`s will have a 42 | `Segmentation` tightly representing their shape. 43 | """ 44 | 45 | keypoints: AbstractSet[str] 46 | """ 47 | For each keypoint name specified in `keypoints`, all corresponding instances 48 | will have a corresponding key in their `keypoints` field, the value of which 49 | will contain he keypoint if it is present or has an inferrable position in 50 | the image or `None` if it is not in-frame. 51 | """ 52 | 53 | attributes: Mapping[str, AbstractSet[str]] 54 | """ 55 | For each attribute name specified in `attributes`, all corresponding 56 | `Instance`s will provide one of the given values. 57 | """ 58 | 59 | def __init__( 60 | self, 61 | *, 62 | id: bool = False, 63 | bounding_box: bool = False, 64 | segmentation: bool = False, 65 | keypoints: AbstractSet[str] = set(), 66 | attributes: Mapping[str, AbstractSet[str]] = dict(), 67 | ): 68 | self.id = id 69 | self.bounding_box = bounding_box 70 | self.segmentation = segmentation 71 | self.keypoints = keypoints 72 | self.attributes = attributes 73 | 74 | def to_json(self) -> InstanceTemplateJson: 75 | """ 76 | Serializes this object as JSON. 
77 | """ 78 | json = InstanceTemplateJson() 79 | 80 | if self.id: json["id"] = True 81 | if self.bounding_box: json["boundingBox"] = True 82 | if self.segmentation: json["segmentation"] = True 83 | if len(self.keypoints) > 0: json["keypoints"] = list(self.keypoints) 84 | if len(self.attributes) > 0: json["attributes"] = { key: list(values) for key, values in self.attributes.items() } 85 | 86 | return json 87 | 88 | @staticmethod 89 | def from_json(json: InstanceTemplateJson) -> InstanceTemplate: 90 | """ 91 | Deserializes a JSON object as an `InstanceTemplate`. 92 | """ 93 | id = json.get("id", False) 94 | bounding_box = json.get("boundingBox", False) 95 | segmentation = json.get("segmentation", False) 96 | keypoints = set(json.get("keypoints", [])) 97 | attributes = { 98 | key: set(values) 99 | for key, values in json.get("attributes", {}).items() 100 | } 101 | return InstanceTemplate( 102 | id = id, 103 | bounding_box=bounding_box, 104 | segmentation=segmentation, 105 | keypoints=keypoints, 106 | attributes=attributes, 107 | ) 108 | 109 | def __repr__(self) -> str: 110 | return basic_repr( 111 | "InstanceTemplate", 112 | id = self.id, 113 | bounding_box = self.bounding_box, 114 | segmentation = self.segmentation, 115 | keypoints = self.keypoints, 116 | attributes = self.attributes, 117 | ) 118 | -------------------------------------------------------------------------------- /datatap/template/multi_instance_template.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing_extensions import TypedDict 4 | 5 | from ..utils import basic_repr 6 | 7 | class MultiInstanceTemplateJson(TypedDict, total=False): 8 | """ 9 | The serialized JSON representation of a multi instance template. 10 | """ 11 | 12 | boundingBox: bool 13 | segmentation: bool 14 | count: bool 15 | 16 | class MultiInstanceTemplate(): 17 | """ 18 | Describes how an individual multi-instance is structured. 19 | """ 20 | 21 | bounding_box: bool 22 | """ 23 | If `bounding_box` is `True`, then all corresponding `MultiInstance`s will 24 | have a `BoundingBox` representing the bounds of their shape. 25 | """ 26 | 27 | segmentation: bool 28 | """ 29 | If `segmentation` is `True`, then all corresponding `MultiInstance`s will 30 | have a `Segmentation` tightly representing their shape. 31 | """ 32 | 33 | count: bool 34 | """ 35 | If `count` is `True`, then all corresponding `MultiInstance`s will have a 36 | count of how many true instances are present in the multi-instance. 
37 | """ 38 | 39 | def __init__( 40 | self, 41 | *, 42 | bounding_box: bool = False, 43 | segmentation: bool = False, 44 | count: bool = False 45 | ): 46 | self.bounding_box = bounding_box 47 | self.segmentation = segmentation 48 | self.count = count 49 | 50 | def to_json(self) -> MultiInstanceTemplateJson: 51 | json = MultiInstanceTemplateJson() 52 | if self.bounding_box: json["boundingBox"] = True 53 | if self.segmentation: json["segmentation"] = True 54 | if self.count: json["count"] = True 55 | return json 56 | 57 | @staticmethod 58 | def from_json(json: MultiInstanceTemplateJson) -> MultiInstanceTemplate: 59 | bounding_box = json.get("boundingBox", False) 60 | segmentation = json.get("segmentation", False) 61 | count = json.get("count", False) 62 | return MultiInstanceTemplate( 63 | bounding_box = bounding_box, 64 | segmentation = segmentation, 65 | count = count 66 | ) 67 | 68 | def __repr__(self) -> str: 69 | return basic_repr( 70 | "MultiInstanceTemplate", 71 | bounding_box = self.bounding_box, 72 | segmentation = self.segmentation 73 | ) 74 | -------------------------------------------------------------------------------- /datatap/template/video_annotation_template.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing_extensions import Literal, TypedDict 4 | 5 | from ..utils import basic_repr 6 | from .frame_annotation_template import (FrameAnnotationTemplate, 7 | FrameAnnotationTemplateJson) 8 | 9 | 10 | class VideoAnnotationTemplateJson(TypedDict): 11 | """ 12 | The serialized JSON representation of a video annotation template. 13 | """ 14 | 15 | kind: Literal["VideoAnnotationTemplate"] 16 | frames: FrameAnnotationTemplateJson 17 | 18 | class VideoAnnotationTemplate(): 19 | """ 20 | Describes how a `VideoAnnotation` is structured. 21 | 22 | It consists only of a `FrameAnnotationTemplate` that describes its frames. 23 | """ 24 | 25 | frames: FrameAnnotationTemplate 26 | """ 27 | A `FrameAnnotationTemplate` that describes how the frames are structured. 28 | """ 29 | 30 | def __init__(self, *, frames: FrameAnnotationTemplate): 31 | self.frames = frames 32 | 33 | def to_json(self) -> VideoAnnotationTemplateJson: 34 | """ 35 | Serializes this object to JSON. 36 | """ 37 | return { 38 | "kind": "VideoAnnotationTemplate", 39 | "frames": self.frames.to_json() 40 | } 41 | 42 | @staticmethod 43 | def from_json(json: VideoAnnotationTemplateJson) -> VideoAnnotationTemplate: 44 | """ 45 | Deserializes a JSON object into a `VideoAnnotationTemplate`. 46 | """ 47 | return VideoAnnotationTemplate( 48 | frames = FrameAnnotationTemplate.from_json(json["frames"]) 49 | ) 50 | 51 | def __repr__(self) -> str: 52 | return basic_repr( 53 | "VideoAnnotationTemplate", 54 | frames = self.frames 55 | ) 56 | -------------------------------------------------------------------------------- /datatap/tf/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The `tf` module provides utilities for using dataTap with Tensorflow. 3 | 4 | Please note that if you want to be able to use this module, you will 5 | either need to install Tensorflow manually, or install dataTap with the 6 | tensorflow extra: 7 | 8 | ```bash 9 | pip install 'datatap[tf]' 10 | ``` 11 | 12 | This module exports two helper functions for creating tensorflow datasets. 
13 | Here is an example of the single-process one, `create_dataset`: 14 | 15 | ```py 16 | import itertools 17 | from datatap import Api 18 | from datatap.tf import create_dataset 19 | 20 | api = Api() 21 | dataset = api.get_default_database().get_dataset_list()[0] 22 | latest_version = dataset.latest_version 23 | 24 | dataset = create_dataset(latest_version, "training", num_workers = 4) 25 | for (_image, bounding_boxes, labels) in itertools.islice(dataset, 3): 26 | print(bounding_boxes, labels) 27 | ``` 28 | """ 29 | 30 | from .dataset import create_dataset, create_multi_worker_dataset 31 | 32 | __all__ = [ 33 | "create_dataset", 34 | "create_multi_worker_dataset", 35 | ] -------------------------------------------------------------------------------- /datatap/tf/dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from os import cpu_count 3 | 4 | import requests 5 | import functools 6 | from typing import Dict, Optional 7 | 8 | try: 9 | import tensorflow as tf 10 | except ImportError: 11 | tf = {} 12 | 13 | from datatap.api.entities import Dataset 14 | 15 | def _get_class_mapping(dataset: Dataset, class_mapping: Optional[Dict[str, int]] = None): 16 | classes_used = dataset.template.classes.keys() 17 | if class_mapping is not None: 18 | if set(class_mapping.keys()) != set(classes_used): 19 | print( 20 | "[WARNING]: Potentially invalid class mapping. Provided classes ", 21 | set(class_mapping.keys()), 22 | " but needed ", 23 | set(classes_used) 24 | ) 25 | return class_mapping 26 | else: 27 | return { 28 | cls: i 29 | for i, cls in enumerate(sorted(classes_used)) 30 | } 31 | 32 | def create_dataset( 33 | dataset: Dataset, 34 | split: str, 35 | input_class_mapping: Optional[Dict[str, int]] = None, 36 | num_workers: int = cpu_count() or 1, 37 | input_context: Optional[tf.distribute.InputContext] = None 38 | ): 39 | """ 40 | Creates a tensorflow `Dataset` object that will load a specified split of `dataset`. 41 | 42 | This function handles the necessary `Dataset` operations to parallelize the loading 43 | operation. Since image loading can be slow, it is recommended to have `num_workers` 44 | set to a value greater than 1. By default, it will try to load one image per CPU. 45 | 46 | If you intend to use the dataset across multiple processes or computers, consider 47 | using `create_multi_worker_dataset` instead.
48 | """ 49 | class_mapping = _get_class_mapping(dataset, input_class_mapping) 50 | 51 | def gen(): 52 | worker_id = input_context.input_pipeline_id if input_context is not None else 0 53 | num_workers = input_context.num_input_pipelines if input_context is not None else 1 54 | 55 | for droplet in dataset.stream_split(split, worker_id, num_workers): 56 | image_url = tf.constant(droplet.image.paths[0]) 57 | 58 | bounding_boxes = tf.stack([ 59 | tf.constant(i.bounding_box.rectangle.to_xywh_tuple(), shape=(4,), dtype=tf.float64) 60 | for cls in droplet.classes.keys() 61 | for i in droplet.classes[cls].instances 62 | if i.bounding_box is not None 63 | ]) 64 | 65 | labels = tf.stack([ 66 | tf.constant(class_mapping[cls], dtype=tf.int32) 67 | for cls in droplet.classes.keys() 68 | for _ in droplet.classes[cls].instances 69 | ]) 70 | 71 | yield (image_url, bounding_boxes, labels) 72 | 73 | def _load_img_fn(image_url: tf.Tensor): 74 | res = requests.get(image_url.numpy().decode("ascii")) 75 | img = tf.io.decode_jpeg(res.content, channels=3) 76 | return img 77 | 78 | def load_img_fn(image_url: tf.Tensor): 79 | return tf.py_function(_load_img_fn, inp=(image_url,), Tout=(tf.uint8,)) 80 | 81 | def map_fn(image_url: tf.Tensor, boxes: tf.Tensor, labels: tf.Tensor): 82 | return (load_img_fn(image_url), boxes, labels) 83 | 84 | ds = tf.data.Dataset.from_generator( 85 | gen, 86 | (tf.string, tf.float64, tf.int32), 87 | (tf.TensorShape(()), tf.TensorShape((None, 4)), (tf.TensorShape((None)))) 88 | ) 89 | 90 | return ds.map(map_fn, num_parallel_calls=num_workers) 91 | 92 | 93 | def create_multi_worker_dataset( 94 | strategy: tf.distribute.experimental.Strategy, 95 | dataset: Dataset, 96 | split: str, 97 | num_workers: int = cpu_count() or 1, 98 | input_class_mapping: Optional[Dict[str, int]] = None, 99 | ): 100 | """ 101 | Creates a multi-worker sharded dataset. In addition to sharding the contents 102 | of the dataset across multiple machines, this function will also attempt to 103 | load the images across several workers. 104 | 105 | If you are running multiple workers on the same physical machine, consider lowering 106 | the value of `num_workers`, as by default each worker will try to use every CPU 107 | on the machine. 108 | """ 109 | ds = strategy.experimental_distribute_datasets_from_function( 110 | functools.partial(create_dataset, dataset, split, num_workers, input_class_mapping) 111 | ) 112 | return ds 113 | -------------------------------------------------------------------------------- /datatap/torch/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The `torch` module provides utilities for using dataTap with PyTorch. 3 | 4 | Please note that if you want to be able to use this module, you will 5 | either need to install PyTorch manually, or install dataTap with the 6 | PyTorch extra: 7 | 8 | ```bash 9 | pip install 'datatap[torch]' 10 | ``` 11 | 12 | The `torch` module provides both a `torch.IterableDataset` implementation, 13 | and a convenience method to create a `torch.Dataloader` using it. 
Here is 14 | an example of how to use these: 15 | 16 | ```py 17 | import itertools 18 | from datatap import Api 19 | from datatap.torch import create_dataloader 20 | 21 | import torchvision.transforms as T 22 | 23 | api = Api() 24 | dataset = api.get_default_database().get_dataset_list()[0] 25 | latest_version = dataset.latest_version 26 | 27 | transforms = T.Compose([ 28 | T.Resize((128, 128)), 29 | T.ColorJitter(hue=0.2), 30 | T.ToTensor(), 31 | ]) 32 | 33 | dataloader = create_dataloader(latest_version, "training", batch_size = 4, image_transform = transforms) 34 | for batch in itertools.islice(dataloader, 3): 35 | print(batch.boxes, batch.labels) 36 | ``` 37 | 38 | """ 39 | 40 | from ._patch_torch import patch_all as _patch_all 41 | _patch_all() 42 | 43 | from .dataset import DatasetElement, DatasetBatch, IterableDataset 44 | from .dataloader import create_dataloader 45 | from .utils import torch_to_image_annotation 46 | 47 | __all__ = [ 48 | "DatasetElement", 49 | "DatasetBatch", 50 | "IterableDataset", 51 | "create_dataloader", 52 | "torch_to_image_annotation", 53 | ] -------------------------------------------------------------------------------- /datatap/torch/_patch_torch.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file handles monkey patching the PyTorch dataset/dataloder to handle 3 | allowing them to be typed. 4 | """ 5 | 6 | import functools 7 | from typing import Any, Type, TypeVar 8 | 9 | _T = TypeVar("_T") 10 | 11 | def allow_generic(cls: Type[_T], type: Any) -> Type[_T]: 12 | """ 13 | This function is can be monkey patched onto any type to allow it 14 | to support generics (i.e. Cls[T]). 15 | 16 | If you are running into any issues with it, please file a bug 17 | report with dev@zensors.com. 18 | """ 19 | return cls 20 | 21 | def patch_generic_class(cls: Type[Any]): 22 | setattr(cls, "__class_getitem__", functools.partial(allow_generic, cls)) 23 | 24 | 25 | def patch_all(): 26 | from torch.utils.data import IterableDataset, DataLoader 27 | 28 | patch_generic_class(IterableDataset) 29 | patch_generic_class(DataLoader) -------------------------------------------------------------------------------- /datatap/torch/dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from os import cpu_count 4 | from typing import Callable, Dict, Generator, Optional, TypeVar, cast, TYPE_CHECKING 5 | 6 | import torch 7 | import PIL.Image 8 | import torchvision.transforms.functional as TF 9 | from torch.utils.data import DataLoader as TorchDataLoader 10 | 11 | from datatap.api.entities import Dataset 12 | 13 | from .dataset import IterableDataset, DatasetBatch, collate 14 | 15 | _T = TypeVar("_T") 16 | 17 | if TYPE_CHECKING: 18 | class DataLoader(TorchDataLoader[_T]): 19 | """ 20 | This is an ambient redeclaration of the dataloader class that 21 | has properly typed iter methods. 22 | """ 23 | def __iter__(self) -> Generator[_T, None, None]: ... 24 | else: 25 | DataLoader = TorchDataLoader 26 | 27 | def create_dataloader( 28 | dataset: Dataset, 29 | split: str, 30 | batch_size: int = 1, 31 | num_workers: int = cpu_count() or 0, 32 | *, 33 | image_transform: Callable[[PIL.Image.Image], torch.Tensor] = TF.to_tensor, 34 | class_mapping: Optional[Dict[str, int]] = None, 35 | device: torch.device = torch.device("cpu") 36 | ) -> DataLoader[DatasetBatch]: 37 | """ 38 | Creates a PyTorch `Dataloader` that yields batches of annotations. 
39 | 40 | This `Dataloader` is using `datatap.torch.Dataset` under the hood, so 41 | all of the same restrictions apply, most notably that the `image_transform` 42 | function must ultimately return a `torch.Tensor` of dimensionality 43 | `(..., H, W)`. 44 | """ 45 | if torch.multiprocessing.get_start_method(allow_none = True) is None: 46 | torch.multiprocessing.set_start_method("spawn") 47 | 48 | torch_dataset = IterableDataset(dataset, split, image_transform = image_transform, class_mapping = class_mapping, device = device) 49 | dataloader = cast( 50 | DataLoader[DatasetBatch], 51 | DataLoader( 52 | torch_dataset, 53 | batch_size, 54 | collate_fn = collate, # type: ignore (Torch's types are off) 55 | num_workers = num_workers, 56 | ) 57 | ) 58 | 59 | return dataloader 60 | -------------------------------------------------------------------------------- /datatap/torch/dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Callable, Dict, Iterator, List, Optional, Union, overload 4 | 5 | import torch 6 | import PIL.Image 7 | import torchvision.transforms.functional as TF 8 | from torch.utils.data import IterableDataset as TorchIterableDataset, get_worker_info # type: ignore 9 | 10 | from datatap.droplet import ImageAnnotation 11 | from datatap.api.entities import Dataset 12 | 13 | class DatasetElement(): 14 | """ 15 | Represents a single element from the dataset. 16 | """ 17 | 18 | original_annotation: ImageAnnotation 19 | """ 20 | The original, untransformed annotation. 21 | """ 22 | 23 | image: torch.Tensor 24 | """ 25 | The image as transformed by the dataset. 26 | """ 27 | 28 | boxes: torch.Tensor 29 | """ 30 | The bounding boxes. They are specified in xyxy format `(min-x, min-y, max-x, max-y)`. 31 | """ 32 | 33 | labels: torch.Tensor 34 | """ 35 | The labels. They are a tensor of unsigned integers. 36 | """ 37 | 38 | def __init__(self, original_annotation: ImageAnnotation, image: torch.Tensor, boxes: torch.Tensor, labels: torch.Tensor): 39 | self.original_annotation = original_annotation 40 | self.image = image 41 | self.boxes = boxes 42 | self.labels = labels 43 | 44 | class DatasetBatch(): 45 | """ 46 | Represents a batch of images as produced by a `DataLoader`. 47 | """ 48 | 49 | original_annotations: List[ImageAnnotation] 50 | """ 51 | The original annotations from this batch. 52 | """ 53 | 54 | images: List[torch.Tensor] 55 | """ 56 | A list of the images in this batch. 57 | """ 58 | 59 | boxes: List[torch.Tensor] 60 | """ 61 | A list of all the per-image bounding boxes in this batch. 62 | """ 63 | 64 | labels: List[torch.Tensor] 65 | """ 66 | A list of all the per-image labels in this batch. 67 | """ 68 | 69 | def __init__(self, original_annotations: List[ImageAnnotation], images: List[torch.Tensor], boxes: List[torch.Tensor], labels: List[torch.Tensor]): 70 | self.original_annotations = original_annotations 71 | self.images = images 72 | self.boxes = boxes 73 | self.labels = labels 74 | 75 | @overload 76 | def collate(elt: DatasetElement) -> DatasetBatch: ... 77 | @overload 78 | def collate(elt: List[DatasetElement]) -> DatasetBatch: ... 79 | def collate(elt: Union[DatasetElement, List[DatasetElement]]) -> DatasetBatch: 80 | """ 81 | A utility function that collates several `DatasetElement`s into one `DatasetBatch`. 
82 | """ 83 | if not isinstance(elt, List): 84 | elt = [elt] 85 | 86 | return DatasetBatch( 87 | [d.original_annotation for d in elt], 88 | [d.image for d in elt], 89 | [d.boxes for d in elt], 90 | [d.labels for d in elt], 91 | ) 92 | 93 | class IterableDataset(TorchIterableDataset[DatasetElement]): 94 | """ 95 | A PyTorch `IterableDataset` that yields all of the annotations from a 96 | given `DatasetVersion`. Provides functionality for automatically applying 97 | transforms to images, and then scaling the annotations to the new dimensions. 98 | 99 | Note, it is required that the transformation produce a image tensor of 100 | dimensionality `[..., H, W]`. One way of doing this is using 101 | `torchvision.transforms.functional.to_tensor` as the final step of the transform. 102 | """ 103 | 104 | _dataset: Dataset 105 | _split: str 106 | _class_mapping: Dict[str, int] 107 | _class_names: Dict[int, str] 108 | _device: torch.device 109 | 110 | def __init__( 111 | self, 112 | dataset: Dataset, 113 | split: str, 114 | class_mapping: Optional[Dict[str, int]] = None, 115 | image_transform: Callable[[PIL.Image.Image], torch.Tensor] = TF.to_tensor, 116 | device: torch.device = torch.device("cpu") 117 | ): 118 | self._dataset = dataset 119 | self._split = split 120 | self._image_transform = image_transform 121 | self._device = device 122 | 123 | template_classes = dataset.template.classes.keys() 124 | if class_mapping is not None: 125 | if set(class_mapping.keys()) != set(template_classes): 126 | print( 127 | "[WARNING]: Potentially invalid class mapping. Provided classes ", 128 | set(class_mapping.keys()), 129 | " but needed ", 130 | set(template_classes) 131 | ) 132 | self._class_mapping = class_mapping 133 | else: 134 | self._class_mapping = { 135 | cls: i 136 | for i, cls in enumerate(sorted(template_classes)) 137 | } 138 | 139 | self._class_names = { 140 | i: cls 141 | for cls, i in self._class_mapping.items() 142 | } 143 | 144 | def _get_generator(self): 145 | worker_info: Optional[Any] = get_worker_info() 146 | 147 | if worker_info is None: 148 | return self._dataset.stream_split(self._split, 0, 1) 149 | else: 150 | num_workers: int = worker_info.num_workers 151 | worker_id: int = worker_info.id 152 | 153 | return self._dataset.stream_split(self._split, worker_id, num_workers) 154 | 155 | def __iter__(self) -> Iterator[DatasetElement]: 156 | for annotation in self._get_generator(): 157 | img = annotation.image.get_pil_image(True).convert("RGB") 158 | transformed_img = self._image_transform(img).to(self._device) 159 | h, w = transformed_img.shape[-2:] 160 | 161 | instance_boxes = [ 162 | ( 163 | instance.bounding_box.rectangle.p1.x * w, 164 | instance.bounding_box.rectangle.p1.y * h, 165 | instance.bounding_box.rectangle.p2.x * w, 166 | instance.bounding_box.rectangle.p2.y * h, 167 | ) 168 | for class_name in annotation.classes.keys() 169 | for instance in annotation.classes[class_name].instances 170 | if instance.bounding_box is not None 171 | ] 172 | 173 | instance_labels = [ 174 | self._class_mapping[class_name] 175 | for class_name in annotation.classes.keys() 176 | for _ in annotation.classes[class_name].instances 177 | if class_name in self._class_mapping 178 | ] 179 | 180 | target = torch.tensor(instance_boxes).reshape((-1, 4)).to(self._device) 181 | labels = torch.tensor(instance_labels, dtype = torch.int64).to(self._device) 182 | 183 | element = DatasetElement(annotation, transformed_img, target, labels) 184 | 185 | yield element 186 | 
-------------------------------------------------------------------------------- /datatap/torch/utils.py: -------------------------------------------------------------------------------- 1 | from datatap.droplet.bounding_box import BoundingBox 2 | from typing import Dict, List, Optional 3 | 4 | import torch 5 | import torchvision.transforms.functional as TF 6 | 7 | from datatap.geometry import Point, Rectangle 8 | from datatap.droplet import Instance, ClassAnnotation, ImageAnnotation, Image 9 | 10 | def tensor_to_rectangle(tensor: torch.Tensor) -> Rectangle: 11 | """ 12 | Expects a tensor of dimensionality `torch.Size([4])` in `xyxy` format 13 | """ 14 | return Rectangle( 15 | Point(float(tensor[0]), float(tensor[1]), clip = True), 16 | Point(float(tensor[2]), float(tensor[3]), clip = True), 17 | ) 18 | 19 | def torch_to_image_annotation( 20 | image: torch.Tensor, 21 | class_map: Dict[str, int], 22 | *, 23 | labels: torch.Tensor, 24 | boxes: torch.Tensor, 25 | scores: torch.Tensor, 26 | serialize_image: bool = False, 27 | uid: Optional[str] = None, 28 | ) -> ImageAnnotation: 29 | """ 30 | Creates an `ImageAnnotation` from a canonical tensor representation. 31 | 32 | This function assumes the following, 33 | 34 | 1. Image is of dimensionality `(..., height, width)` 35 | 2. Labels are an `int`/`uint` tensor of size `[n]` 36 | 3. Scores are a `float` tensor of size `[n]` 37 | 3. Boxes are a `float` tensor of size `[n, 4]` 38 | """ 39 | inverted_class_map = { 40 | i: cls 41 | for cls, i in class_map.items() 42 | } 43 | 44 | height, width = image.shape[-2:] 45 | 46 | # First construct the image. If we are asked to serialize it, then 47 | # use the tensor to construct a cached PIL image 48 | if serialize_image: 49 | pil_image = TF.to_pil_image(image, "RGB") 50 | droplet_image = Image.from_pil(pil_image) 51 | else: 52 | droplet_image = Image(paths = []) 53 | 54 | # Then, compute each of the class annotations 55 | class_annotations: Dict[str, List[Instance]] = {} 56 | 57 | boxes = boxes.cpu() / torch.tensor([width, height, width, height]) 58 | 59 | for i, label in enumerate(labels.cpu()): 60 | class_name = inverted_class_map.get(int(label)) 61 | if class_name is None: 62 | continue 63 | 64 | if class_name not in class_annotations: 65 | class_annotations[class_name] = [] 66 | 67 | class_annotations[class_name].append( 68 | Instance( 69 | bounding_box = BoundingBox( 70 | tensor_to_rectangle(boxes[i]), 71 | confidence = float(scores[i]), 72 | ) 73 | ) 74 | ) 75 | 76 | # Finally, construct the image annotation 77 | 78 | return ImageAnnotation( 79 | uid = uid, 80 | image = droplet_image, 81 | classes = { 82 | cls: ClassAnnotation(instances = instances, multi_instances = []) 83 | for cls, instances in class_annotations.items() 84 | } 85 | ) -------------------------------------------------------------------------------- /datatap/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of primarily internal-use utilities. 
3 | """ 4 | 5 | from .environment import Environment 6 | from .helpers import assert_one, timer, DeletableGenerator 7 | from .cache_generator import CacheGenerator 8 | from .or_nullish import OrNullish 9 | from .print_helpers import basic_repr, color_repr, force_pretty_print, pprint, pprints 10 | 11 | __all__ = [ 12 | "Environment", 13 | "assert_one", 14 | "timer", 15 | "DeletableGenerator", 16 | "CacheGenerator", 17 | "OrNullish", 18 | "basic_repr", 19 | "color_repr", 20 | "force_pretty_print", 21 | "pprint", 22 | "pprints" 23 | ] 24 | -------------------------------------------------------------------------------- /datatap/utils/cache_generator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import json 5 | import time 6 | from threading import Semaphore, Thread 7 | from queue import Queue 8 | from os import path 9 | from typing import Any, Callable, Generator, TypeVar, Optional 10 | 11 | from .helpers import DeletableGenerator 12 | 13 | _T = TypeVar("_T") 14 | 15 | def CacheGenerator(file_name: str, create_stream: Callable[[], Generator[_T, Any, Any]]) -> Generator[_T, None, None]: 16 | # We can't just naively stream from the server, unfortunately. Due to the sheer 17 | # volume of data, and the fact that training can be such a slow process, if we 18 | # try to stream the data directly from server to training process, we will end 19 | # up filling the OS' buffer, causing significant backpressure for the server. 20 | # 21 | # Likewise, we cannot necessarily stream from the server into a local buffer, 22 | # as a large dataset could be greater than our available RAM. 23 | # 24 | # As a result, this method streams data from the server to a temp file in a 25 | # subprocess. The main process then streams from that tempfile to the consumer 26 | # of the stream. Finally, once all data has been read, the main process stores 27 | # the stream file as an authoritative cache file for this particular stream. 28 | # Subsequent calls to this function with the same arguments will then pull from 29 | # that file. 30 | # 31 | # Please note that as a result of file-system accesses, streaming in this manner 32 | # incurs a non-trivial performance cost. For production training jobs, it is 33 | # recommended that this function be used with a data-loader capable of running 34 | # on multiple threads. 35 | 36 | # TODO(zwade): change this to UID once we have an endpoint for fetching it 37 | dir_name = path.dirname(file_name) 38 | tmp_file_name = f"{file_name}.stream" 39 | os.makedirs(dir_name, exist_ok=True) 40 | 41 | EOF = "EOF" 42 | 43 | # Checks for an authoritative cache, using it if it exists. 44 | if path.exists(file_name): 45 | def cache_generator(): 46 | with open(file_name, "r") as f: 47 | for line in f.readlines(): 48 | line = line.strip() 49 | if line == "" or line == EOF: 50 | continue 51 | yield json.loads(line) 52 | return 53 | return cache_generator() 54 | 55 | 56 | # `sync_queue` is used to synchronize startup and termination of the 57 | # subprocess, optionally propagating any errors that arise. 58 | sync_queue: Queue[Optional[Exception]] = Queue() 59 | 60 | # `available_annotations` counts how many lines have been written to 61 | # the stream file that have not yet been consumed. 
62 | available_annotations = Semaphore() 63 | 64 | # `dead` is a flag that allows us to terminate our stream early 65 | dead = False 66 | 67 | def stream_target(): 68 | stream = create_stream() 69 | 70 | with open(tmp_file_name, "a+") as f: 71 | sync_queue.put(None) 72 | try: 73 | for element in stream: 74 | if dead: 75 | raise Exception("Premature termination") 76 | 77 | # We want to prioritize reading quickly, so after we write, we 78 | # flush to the disk. 79 | # 80 | # (Note that we do not synchronize, as `fsync` incurs a 10x 81 | # slowdown) 82 | f.write(json.dumps(element) + "\n") 83 | f.flush() 84 | # We then "release" our semaphore to indicate that we've made a 85 | # new asset available to the consumer 86 | available_annotations.release() 87 | 88 | sync_queue.put(None) 89 | except Exception as e: 90 | sync_queue.put(e) 91 | finally: 92 | # We explicitly write "EOF" at the end of the stream, since we 93 | # otherwise would not be able to distinguish between the actual 94 | # EOF and an incomplete write. 95 | f.write(EOF + "\n") 96 | f.flush() 97 | available_annotations.release() 98 | 99 | thread = Thread(target = stream_target) 100 | thread.start() 101 | 102 | def generator(): 103 | sync_queue.get() 104 | with open(tmp_file_name, "r") as f: 105 | while True: 106 | available_annotations.acquire() 107 | 108 | line = "" 109 | c = 0 110 | while line == "" or line[-1] != "\n": 111 | # Busy loop to wait for the file write. 112 | # 113 | # If we're eagerly fetching a large portion of the stream 114 | # we may become bottlenecked by file synchronization. In 115 | # this case, we implement a simple backoff to avoid 116 | # unnecessarily hammering the file system. 117 | line += f.readline() 118 | c += 1 119 | if c > 10: 120 | time.sleep(0.005) 121 | 122 | data = line.strip() 123 | if data == EOF: 124 | break 125 | 126 | yield json.loads(data) 127 | 128 | thread.join() 129 | 130 | error = sync_queue.get() 131 | if error is not None: 132 | # This error came from the data loading subprocess 133 | raise error 134 | 135 | def stop_processing(): 136 | # This is a rather gross way of killing it, but unlike `Process`, `Thread` 137 | # has no `terminate` method. 138 | nonlocal dead 139 | dead = True 140 | 141 | return DeletableGenerator(generator(), stop_processing) -------------------------------------------------------------------------------- /datatap/utils/environment.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class Environment: 4 | """ 5 | A class providing static access to parameters related to the execution 6 | environment of the module. 7 | """ 8 | 9 | API_KEY = os.getenv("DATATAP_API_KEY") 10 | """ 11 | The default API key used for API calls. 12 | """ 13 | 14 | BASE_URI = os.getenv("DATATAP_BASE_URI", "https://app.datatap.dev") 15 | """ 16 | The base URI used for referencing the dataTap application, e.g. for API 17 | calls. One might change this to use an HTTP proxy, for example. 
18 | """ 19 | -------------------------------------------------------------------------------- /datatap/utils/helpers.py: -------------------------------------------------------------------------------- 1 | import time 2 | from types import TracebackType 3 | from typing import Dict, List, Callable, Generator, Optional, Tuple, TypeVar 4 | from contextlib import contextmanager 5 | 6 | from .print_helpers import pprint 7 | 8 | _T = TypeVar("_T") 9 | _U = TypeVar("_U") 10 | _V = TypeVar("_V") 11 | 12 | class DeletableGenerator(Generator[_T, _U, _V]): 13 | """ 14 | A deletable generator wraps an existing generator with a deletion 15 | function to allow cleanup. 16 | """ 17 | 18 | _gen: Generator[_T, _U, _V] 19 | _delete: Callable[[], None] 20 | 21 | def __init__(self, gen: Generator[_T, _U, _V], delete_thunk: Callable[[], None]): 22 | self._gen = gen 23 | self._delete = delete_thunk 24 | 25 | def __next__(self): 26 | return next(self._gen) 27 | 28 | def send(self, value: _U): 29 | return self._gen.send(value) 30 | 31 | def throw(self, excn: BaseException, val: None, tb: Optional[TracebackType]): 32 | return self._gen.throw(excn, val, tb) 33 | 34 | def __del__(self): 35 | self._delete() 36 | pass 37 | 38 | 39 | def assert_one(item_list: List[_T]) -> _T: 40 | """ 41 | Given a list of items, asserts that the list is a singleton, 42 | and returns its value. 43 | """ 44 | if len(item_list) != 1: 45 | raise AssertionError(f"Expected one item in list, but found {len(item_list)}", item_list) 46 | 47 | return item_list[0] 48 | 49 | 50 | _timer_state: Dict[str, Tuple[float, int]] = {} 51 | @contextmanager 52 | def timer(name: str): 53 | start = time.time() 54 | yield None 55 | end = time.time() 56 | 57 | value = end - start 58 | avg, count = _timer_state.get(name, (0.0, 0)) 59 | count += 1 60 | avg += (value - avg) / count 61 | _timer_state[name] = (avg, count) 62 | 63 | pprint( 64 | "{blue}{name} took {yellow}{value:1.3f}s{blue} for an average of {yellow}{avg:1.3f}s", 65 | name = name, 66 | value = value, 67 | avg = avg, 68 | ) -------------------------------------------------------------------------------- /datatap/utils/or_nullish.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, TypeVar, Callable 2 | 3 | _T = TypeVar("_T") 4 | _S = TypeVar("_S") 5 | 6 | class OrNullish: 7 | """ 8 | A helper class to represent the monad `α OrNullish = α | None`. 9 | """ 10 | 11 | @staticmethod 12 | def bind(val: Optional[_T], fn: Callable[[_T], Optional[_S]]) -> Optional[_S]: 13 | """ 14 | Monadically binds `fn` to the value of `val`. 
15 | """ 16 | if val is None: 17 | return None 18 | else: 19 | return fn(val) -------------------------------------------------------------------------------- /datatap/utils/print_helpers.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | from typing import Any, Dict, List, Tuple, Union, cast 5 | 6 | _ansi = { 7 | "gray": "\033[30m", 8 | "red": "\033[31m", 9 | "green": "\033[32m", 10 | "yellow": "\033[33m", 11 | "blue": "\033[34m", 12 | "purple": "\033[35m", 13 | "cyan": "\033[36m", 14 | "white": "\033[37m", 15 | "black": "\033[38m", 16 | 17 | "orange": "\033[38;5;209m", # TODO(zwade): This is a bit closer to "salmon" 18 | 19 | "clear": "\033[0m", 20 | 21 | "prev": "\033[F", 22 | "start": "\033[G", 23 | } 24 | 25 | IS_INTERACTIVE = sys.stdout.isatty() 26 | pretty_print: bool = False 27 | 28 | def force_pretty_print(): 29 | """ 30 | By default, this library only uses pretty-printing when it's in an 31 | interactive environment (terminal, python shell, etc.). However, there are a 32 | few cases when pretty-printing is desired in a non-interactive environment, 33 | such as when running under Jupyter. Calling this function once will ensure 34 | all future prints will be pretty. 35 | """ 36 | global pretty_print 37 | pretty_print = True 38 | 39 | def pprint(fmt: str, *args: Any, print_args: Dict[str, Any] = {}, **kwargs: Any) -> None: 40 | """ 41 | Pretty printer. The first argument is a format string, and the remaining 42 | arguments are the values for the string. Additionally, the format string 43 | can access a number of ansi escape codes such as colors, `clear`, `prev`, 44 | and `start`. 45 | 46 | ```py 47 | pprint("{prev}Progress: {orange}{i}{clear}/{total}, i=i, total=total) 48 | ``` 49 | """ 50 | print((fmt + "{clear}").format(*args, **{**kwargs, **_ansi}), **print_args) 51 | sys.stdout.flush() 52 | 53 | def pprints(fmt: str, *args: Any, **kwargs: Any) -> str: 54 | """ 55 | Pretty prints to a string. 56 | 57 | See `datatap.utils.pprint`. 58 | """ 59 | return (fmt + "{clear}").format(*args, **{**kwargs, **_ansi}) 60 | 61 | def color_repr(entity: Any) -> str: 62 | """ 63 | A dynamic pretty-printer that will syntax highlight different python 64 | entities. 65 | 66 | Rarely used on its own, see `datatap.utils.basic_repr`. 67 | """ 68 | if entity is None: 69 | return f"{_ansi['orange']}None{_ansi['clear']}" 70 | if isinstance(entity, str): 71 | return f"{_ansi['cyan']}\"{_ansi['green']}{entity}{_ansi['clear']}{_ansi['cyan']}\"{_ansi['clear']}" 72 | if isinstance(entity, (int, float)): 73 | return f"{_ansi['orange']}{entity}{_ansi['clear']}" 74 | if isinstance(entity, (list, tuple)): 75 | entity_list = cast(Union[List[Any], Tuple[Any]], entity) 76 | return ( 77 | f"{_ansi['cyan']}{'[' if type(entity_list) == list else '('}" + 78 | f"{_ansi['cyan']},{_ansi['clear']} ".join([color_repr(e) for e in entity_list]) + 79 | f"{_ansi['cyan']}{']' if type(entity_list) == list else ')'}" 80 | ) 81 | if isinstance(entity, dict): 82 | entity_dict = cast(Dict[Any, Any], entity) 83 | return ( 84 | f"{_ansi['cyan']}{{" + 85 | f"{_ansi['cyan']},{_ansi['clear']} ".join([ 86 | f"{color_repr(key)}{_ansi['cyan']}: {color_repr(value)}" 87 | for key, value in entity_dict.items() 88 | ]) + 89 | f"{_ansi['cyan']}}}" 90 | ) 91 | return repr(entity) 92 | 93 | def basic_repr(class_name: str, *args: Any, **kwargs: Any) -> str: 94 | """ 95 | A function to be used for defining a class's `__repr__` method. 
96 | When possible, will pretty-print the object in a way that is both easy 97 | to read, and useful for testing. 98 | 99 | ```py 100 | from datatap.utils import basic_repr 101 | 102 | class Person: 103 | name: string 104 | age: int 105 | height: int 106 | 107 | def __repr__(self): 108 | return basic_repr("Person", name, age = age, height = height) 109 | ``` 110 | """ 111 | if not IS_INTERACTIVE and not pretty_print: 112 | positional_properties = [repr(value) for value in args] 113 | named_properties = [f"{key} = {repr(value)}" for key, value in kwargs.items() if value is not None] 114 | properties = ", ".join(positional_properties + named_properties) 115 | return f"{class_name}({properties})" 116 | else: 117 | positional_properties = [ 118 | f"{_ansi['green']}{color_repr(value)}{_ansi['clear']}" 119 | for value in args 120 | ] 121 | named_properties = [ 122 | f"{_ansi['red']}{key} {_ansi['purple']}= {color_repr(value)}" 123 | for key, value in kwargs.items() 124 | if value is not None 125 | ] 126 | properties = f"{_ansi['cyan']},{_ansi['clear']} ".join(positional_properties + named_properties) 127 | return f"{_ansi['yellow']}{class_name}{_ansi['cyan']}({_ansi['clear']}{properties}{_ansi['cyan']}){_ansi['clear']}" 128 | -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | boto3-stubs -------------------------------------------------------------------------------- /pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "typeCheckingMode": "strict", 3 | "reportMissingTypeStubs": "information", 4 | "reportUnusedImport": "warning", 5 | "reportUnusedVariable": "information", 6 | "stubPath": "./typings", 7 | "include": [ 8 | "datatap/api", 9 | "datatap/comet", 10 | "datatap/dataset", 11 | "datatap/droplet", 12 | "datatap/geometry", 13 | // "datatap/metrics", 14 | "datatap/template", 15 | "datatap/utils", 16 | // "examples", 17 | // "tests" 18 | ] 19 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.15.14 2 | fastjsonschema==2.14.2 3 | toolz>=0.10.0 4 | simplejson>=3.17.0 5 | jsonmerge==1.7.0 6 | neo4j==4.0.1 7 | Shapely 8 | requests>=2.23.0 9 | typing-extensions 10 | -------------------------------------------------------------------------------- /requirements_image.txt: -------------------------------------------------------------------------------- 1 | palettable>=3.3.0 2 | Flask>=1.1.1 3 | Pillow>=7.1.0 4 | Shapely>=1.7.0 5 | scipy>=1.4.1 6 | importlib-resources>=1.4.0 7 | boto3>=1.15.14 -------------------------------------------------------------------------------- /requirements_importers.txt: -------------------------------------------------------------------------------- 1 | Shapely==1.7.0 2 | scikit-image==0.17.2 3 | Unidecode==1.1.1 4 | -------------------------------------------------------------------------------- /requirements_metrics.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.19.2 2 | sortedcontainers>=0.8.1 3 | matplotlib>=3.3.2 4 | scipy>=1.5.2 5 | -------------------------------------------------------------------------------- /requirements_tf.txt: -------------------------------------------------------------------------------- 1 | tensorflow>=2.0.0 
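Taken together, the helpers defined in `datatap/utils` above compose roughly as follows. This is an illustrative sketch only: the `Person` class, the cache path, and the `fetch_annotations` generator are hypothetical stand-ins rather than part of the library.

```py
from datatap.utils import (
    Environment, CacheGenerator, OrNullish, assert_one, basic_repr, pprint, timer
)

# Environment exposes configuration read from environment variables.
api_key = Environment.API_KEY    # None unless DATATAP_API_KEY is set
base_uri = Environment.BASE_URI  # defaults to https://app.datatap.dev

# OrNullish.bind applies a function only when the value is not None.
key_length = OrNullish.bind(api_key, lambda key: len(key))

# assert_one unwraps a list that must contain exactly one element.
only_uri = assert_one([base_uri])

# basic_repr builds a (possibly colorized) __repr__ string.
class Person:
    def __init__(self, name: str, age: int):
        self.name = name
        self.age = age

    def __repr__(self) -> str:
        return basic_repr("Person", self.name, age = self.age)

# timer measures a block and reports a running average via pprint.
with timer("build-person"):
    person = Person("Ada", 36)

pprint("Built {green}{person}", person = person)

# CacheGenerator streams annotations through an on-disk cache file; the path
# and the fetch_annotations generator here are hypothetical.
# stream = CacheGenerator("/tmp/my-dataset.jsonl", fetch_annotations)
# for annotation in stream:
#     ...
```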
-------------------------------------------------------------------------------- /requirements_torch.txt: -------------------------------------------------------------------------------- 1 | torch>=1.8.0+cpu 2 | torchvision>=0.9.0+cpu 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import setuptools 3 | 4 | with open("README.md", "r") as f: 5 | long_description = f.read() 6 | 7 | with open("requirements.txt", "r") as f: 8 | requirements = f.read().strip().split("\n") 9 | 10 | extras_require = {} 11 | for path in glob.glob("requirements_*.txt"): 12 | extra = path.split("_")[-1].split(".")[0] 13 | with open(path, "r") as f: 14 | extras_require[extra] = [ 15 | dep 16 | for dep in map(str.strip, f.readlines()) 17 | if dep != "" and not dep.startswith("-") 18 | ] 19 | 20 | 21 | setuptools.setup( 22 | name = "datatap", 23 | version = "0.3.0", 24 | author = "Zensors' Dev Team", 25 | author_email = "dev-team@zensors.com", 26 | description = "Client library for dataTap", 27 | long_description = long_description, 28 | long_description_content_type = "text/markdown", 29 | url = "https://github.com/zensors/datatap-python", # pypi will add extra information if the url is the repo 30 | packages = setuptools.find_packages(), 31 | package_data = { "": ["image/assets/*"], "datatap": ["py.typed"] }, 32 | classifiers = [ 33 | "Programming Language :: Python :: 3", 34 | "Operating System :: OS Independent", 35 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 36 | ], 37 | python_requires = ">=3.7", 38 | install_requires = requirements, 39 | extras_require = extras_require, 40 | dependency_links = [ 41 | "https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html" 42 | ], 43 | ) 44 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zensors/datatap-python/1a05b354f20bbbc24b9bad76e919e0cc9542538c/tests/__init__.py -------------------------------------------------------------------------------- /tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zensors/datatap-python/1a05b354f20bbbc24b9bad76e919e0cc9542538c/tests/metrics/__init__.py -------------------------------------------------------------------------------- /tests/metrics/test_iou.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from datatap.droplet import (BoundingBox, ClassAnnotation, Image, 5 | ImageAnnotation, Instance) 6 | from datatap.geometry import Point, Rectangle 7 | from datatap.metrics.confusion_matrix import ConfusionMatrix 8 | from datatap.metrics.iou import (generate_confusion_matrix, 9 | generate_pr_curve) 10 | from datatap.metrics.precision_recall_curve import (_DetectionEvent as DetectionEvent, 11 | PrecisionRecallCurve) 12 | from datatap.template import (ClassAnnotationTemplate, 13 | ImageAnnotationTemplate, InstanceTemplate) 14 | 15 | tpl = ImageAnnotationTemplate( 16 | classes = { 17 | "a": ClassAnnotationTemplate( 18 | instances = InstanceTemplate(bounding_box = True) 19 | ), 20 | "b": ClassAnnotationTemplate( 21 | instances = InstanceTemplate(bounding_box = True) 22 | ) 23 | } 24 | ) 25 | 26 | im = Image(paths = []) 27 | 28 | 
gt1 = ImageAnnotation( 29 | image = im, 30 | classes = { 31 | "a": ClassAnnotation( 32 | instances = [ 33 | Instance(bounding_box = BoundingBox(Rectangle(Point(0.5, 0.5), Point(0.7, 0.7)))), 34 | Instance(bounding_box = BoundingBox(Rectangle(Point(0.1, 0.1), Point(0.2, 0.2)))) 35 | ] 36 | ), 37 | "b": ClassAnnotation( 38 | instances = [ 39 | Instance(bounding_box = BoundingBox(Rectangle(Point(0.6, 0.5), Point(0.7, 0.7)))) 40 | ] 41 | ) 42 | } 43 | ) 44 | 45 | pred1 = ImageAnnotation( 46 | image = im, 47 | classes = { 48 | "a": ClassAnnotation( 49 | instances = [ 50 | Instance(bounding_box = BoundingBox(Rectangle(Point(0.58, 0.5), Point(0.7, 0.7)), confidence = 0.7)), 51 | Instance(bounding_box = BoundingBox(Rectangle(Point(0.1, 0.1), Point(0.18, 0.19)), confidence = 0.9)) 52 | ] 53 | ), 54 | "b": ClassAnnotation( 55 | instances = [ 56 | Instance(bounding_box = BoundingBox(Rectangle(Point(0.6, 0.5), Point(0.7, 0.7)), confidence = 0.6)), 57 | Instance(bounding_box = BoundingBox(Rectangle(Point(0.1, 0.8), Point(0.2, 0.9)), confidence = 0.2)) 58 | ] 59 | ) 60 | } 61 | ) 62 | 63 | gt2 = ImageAnnotation( 64 | image = im, 65 | classes = { 66 | "a": ClassAnnotation( 67 | instances = [ 68 | Instance(bounding_box = BoundingBox(Rectangle(Point(0.1, 0.1), Point(0.4, 0.4)))), 69 | ] 70 | ), 71 | "b": ClassAnnotation( 72 | instances = [] 73 | ) 74 | } 75 | ) 76 | 77 | pred2 = ImageAnnotation( 78 | image = im, 79 | classes = { 80 | "a": ClassAnnotation( 81 | instances = [ 82 | Instance(bounding_box = BoundingBox(Rectangle(Point(0.1, 0.12), Point(0.37, 0.4)), confidence = 0.8)), 83 | Instance(bounding_box = BoundingBox(Rectangle(Point(0.09, 0.08), Point(0.39, 0.4)), confidence = 0.6)) 84 | ] 85 | ), 86 | "b": ClassAnnotation( 87 | instances = [ 88 | Instance(bounding_box = BoundingBox(Rectangle(Point(0.6, 0), Point(0.8, 0.2)), confidence = 0.3)) 89 | ] 90 | ) 91 | } 92 | ) 93 | 94 | class TestIou(unittest.TestCase): 95 | def test_add_annotation_to_pr_curve_1(self): 96 | pr = PrecisionRecallCurve() 97 | pr.add_annotation(ground_truth = gt1, prediction = pred1, iou_threshold = 0.3) 98 | self.assertEqual(pr.events, { 99 | 0.2: DetectionEvent(0, 1), 100 | 0.6: DetectionEvent(2, -1), 101 | 0.7: DetectionEvent(0, 1), 102 | 0.9: DetectionEvent(1, 0) 103 | }) 104 | 105 | def test_add_annotation_to_pr_curve_2(self): 106 | pr = PrecisionRecallCurve() 107 | pr.add_annotation(ground_truth = gt2, prediction = pred2, iou_threshold = 0.3) 108 | self.assertEqual(pr.events, { 109 | 0.3: DetectionEvent(0, 1), 110 | 0.6: DetectionEvent(0, 1), 111 | 0.8: DetectionEvent(1, 0) 112 | }) 113 | 114 | def test_generate_pr_curve(self): 115 | pr = generate_pr_curve(ground_truths = [gt1, gt2], predictions = [pred1, pred2], iou_threshold = 0.3) 116 | self.assertEqual(pr.events, { 117 | 0.2: DetectionEvent(0, 1), 118 | 0.3: DetectionEvent(0, 1), 119 | 0.6: DetectionEvent(2, 0), 120 | 0.7: DetectionEvent(0, 1), 121 | 0.8: DetectionEvent(1, 0), 122 | 0.9: DetectionEvent(1, 0) 123 | }) 124 | 125 | def test_add_annotation_to_confusion_matrix_1a(self): 126 | cm = ConfusionMatrix(sorted(tpl.classes.keys())) 127 | cm.add_annotation( 128 | ground_truth = gt1, 129 | prediction = pred1, 130 | iou_threshold = 0.3, 131 | confidence_threshold = 0.1 132 | ) 133 | self.assertTrue( 134 | np.array_equal( 135 | cm.matrix, 136 | np.array([ 137 | [0, 0, 1], 138 | [0, 2, 0], 139 | [0, 0, 1] 140 | ]) 141 | ) 142 | ) 143 | 144 | def test_add_annotation_to_confusion_matrix_1b(self): 145 | cm = ConfusionMatrix(sorted(tpl.classes.keys())) 146 | 
cm.add_annotation( 147 | ground_truth = gt1, 148 | prediction = pred1, 149 | iou_threshold = 0.3, 150 | confidence_threshold = 0.6 151 | ) 152 | self.assertTrue( 153 | np.array_equal( 154 | cm.matrix, 155 | np.array([ 156 | [0, 0, 0], 157 | [0, 2, 0], 158 | [0, 0, 1] 159 | ]) 160 | ) 161 | ) 162 | 163 | def test_add_annotation_to_confusion_matrix_1c(self): 164 | cm = ConfusionMatrix(sorted(tpl.classes.keys())) 165 | cm.add_annotation( 166 | ground_truth = gt1, 167 | prediction = pred1, 168 | iou_threshold = 0.3, 169 | confidence_threshold = 0.61 170 | ) 171 | self.assertTrue( 172 | np.array_equal( 173 | cm.matrix, 174 | np.array([ 175 | [0, 0, 0], 176 | [1, 1, 0], 177 | [0, 1, 0] 178 | ]) 179 | ) 180 | ) 181 | 182 | def test_add_annotation_to_confusion_matrix_2(self): 183 | cm = ConfusionMatrix(sorted(tpl.classes.keys())) 184 | cm.add_annotation( 185 | ground_truth = gt2, 186 | prediction = pred2, 187 | iou_threshold = 0.3, 188 | confidence_threshold = 0.1 189 | ) 190 | self.assertTrue( 191 | np.array_equal( 192 | cm.matrix, 193 | np.array([ 194 | [0, 1, 1], 195 | [0, 1, 0], 196 | [0, 0, 0] 197 | ]) 198 | ) 199 | ) 200 | 201 | def test_generate_confusion_matrix(self): 202 | cm = generate_confusion_matrix( 203 | template = tpl, 204 | ground_truths = [gt1, gt2], 205 | predictions = [pred1, pred2], 206 | iou_threshold = 0.3, 207 | confidence_threshold = 0.1 208 | ) 209 | self.assertTrue( 210 | np.array_equal( 211 | cm.matrix, 212 | np.array([ 213 | [0, 1, 2], 214 | [0, 3, 0], 215 | [0, 0, 1] 216 | ]) 217 | ) 218 | ) 219 | 220 | if __name__ == "__main__": 221 | unittest.main() 222 | -------------------------------------------------------------------------------- /tests/metrics/test_precision_recall_curve.py: -------------------------------------------------------------------------------- 1 | # pyright: reportPrivateUsage=false 2 | 3 | from datatap.metrics.precision_recall_curve import _DetectionEvent as DetectionEvent, MaximizeF1Result, PrecisionRecallCurve 4 | 5 | import unittest 6 | 7 | class TestPrecisionRecallCurve(unittest.TestCase): 8 | def test_add(self): 9 | a = PrecisionRecallCurve() 10 | a._add_event(0.1, DetectionEvent(0, 1)) 11 | a._add_event(0.8, DetectionEvent(1, -1)) 12 | a._add_ground_truth_positives(3) 13 | 14 | b = PrecisionRecallCurve() 15 | b._add_event(0.25, DetectionEvent(1, 0)) 16 | b._add_event(0.6, DetectionEvent(0, -1)) 17 | b._add_ground_truth_positives(2) 18 | 19 | c = a + b 20 | self.assertEqual(c.ground_truth_positives, 5) 21 | self.assertEqual(c.events, { 22 | 0.1: DetectionEvent(0, 1), 23 | 0.25: DetectionEvent(1, 0), 24 | 0.6: DetectionEvent(0, -1), 25 | 0.8: DetectionEvent(1, -1) 26 | }) 27 | 28 | def test_maximize_f1(self): 29 | pr = PrecisionRecallCurve() 30 | pr._add_ground_truth_positives(5) 31 | pr._add_event(0.1, DetectionEvent(0, 1)) # p = 4/6, r = 4/5, f1 = 8/11 32 | pr._add_event(0.25, DetectionEvent(1, 0)) # p = 4/5, r = 4/5, f1 = 4/5 33 | pr._add_event(0.6, DetectionEvent(2, -1)) # p = 3/4, r = 3/5, f1 = 2/3 34 | pr._add_event(0.72, DetectionEvent(0, 1)) # p = 1/3, r = 1/5, f1 = 1/4 35 | pr._add_event(0.8, DetectionEvent(1, 0)) # p = 1/2, r = 1/5, f1 = 2/7 36 | pr._add_event(0.9, DetectionEvent(0, 1)) # p = 0/1, r = 0/5, f1 = 0 37 | self.assertEqual(pr.maximize_f1(), MaximizeF1Result(threshold = 0.25, precision = 0.8, recall = 0.8, f1 = 0.8)) 38 | 39 | if __name__ == "__main__": 40 | unittest.main() 41 | -------------------------------------------------------------------------------- /typings/PIL/Image.pyi: 
-------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from typing import Optional, Sequence, SupportsBytes, Tuple 3 | 4 | 5 | class Image: 6 | size: Tuple[int, int] 7 | 8 | def convert(self, mode: str) -> Image: ... 9 | def resize(self, size: Tuple[int, int], resample: Optional[int]) -> Image: ... 10 | def getdata(self) -> Sequence[int]: ... 11 | 12 | def open(data: BytesIO) -> Image: ... 13 | def fromarray(buffer: SupportsBytes, mode: Optional[str]) -> Image: ... 14 | 15 | BOX: int 16 | -------------------------------------------------------------------------------- /typings/PIL/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zensors/datatap-python/1a05b354f20bbbc24b9bad76e919e0cc9542538c/typings/PIL/__init__.pyi -------------------------------------------------------------------------------- /typings/boto3/__init__.pyi: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from typing import Literal, TypedDict 3 | 4 | 5 | class S3Resource: 6 | def Object(self, bucket_name: str, path_name: str) -> S3Object: ... 7 | 8 | class S3Object: 9 | def get(self) -> S3ObjectDict: ... 10 | 11 | class S3ObjectDict(TypedDict): 12 | Body: BytesIO 13 | 14 | def resource(type: Literal["s3"]) -> S3Resource: ... 15 | -------------------------------------------------------------------------------- /typings/comet_ml/API.pyi: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, overload 2 | 3 | from typing_extensions import TypedDict 4 | 5 | from .APIExperiment import APIAsset, APIExperiment 6 | from .query import QueryExpression 7 | 8 | class APIProject(TypedDict): 9 | projectId: str 10 | projectName: str 11 | ownerUserName: str 12 | projectDescription: str 13 | workspaceName: str 14 | numberOfExperiments: int 15 | lastUpdated: int 16 | public: bool 17 | 18 | class APIRegistryExperimentModel(TypedDict): 19 | experimentModelId: str 20 | experimentModelName: str 21 | experimentKey: str 22 | 23 | class APIRegistryVersion(TypedDict): 24 | registryModelItemId: str 25 | experimentModel: APIRegistryExperimentModel 26 | version: str 27 | comment: str 28 | stages: List[str] 29 | userName: str 30 | createdAt: int 31 | lastUpdated: int 32 | assets: List[APIAsset] 33 | restApiUrl: str 34 | 35 | class APIRegistryModel(TypedDict): 36 | registryModelId: str 37 | modelName: str 38 | description: str 39 | isPublic: bool 40 | createdAt: int 41 | lastUpdate: int 42 | userName: str 43 | versions: List[APIRegistryVersion] 44 | 45 | class API: 46 | def __init__(self, api_key: Optional[str] = ...) -> API: ... 47 | 48 | def query( 49 | self, 50 | workspace: str, 51 | project_name: str, 52 | query: QueryExpression, 53 | archived: bool = ... 54 | ) -> List[APIExperiment]: ... 55 | 56 | # From a type perspective, these could be collapsed into two overloads 57 | # but each one is doing something different, so i preferred to make it 58 | # explicit 59 | @overload 60 | def get(self) -> List[str]: ... 61 | @overload 62 | def get(self, workspace: str = ...) -> List[str]: ... 63 | @overload 64 | def get(self, workspace: str = ..., project_name: str = ...) -> List[str]: ... 65 | @overload 66 | def get(self, workspace: str = ..., project_name: str = ..., experiment: str = ...) -> APIExperiment: ... 
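    # Illustration (hypothetical workspace, project, and experiment names):
    #   api.get()                                    -> list of workspace names
    #   api.get("my-workspace")                      -> list of project names
    #   api.get("my-workspace", "my-project")        -> list of experiment keys
    #   api.get("my-workspace", "my-project", "key") -> a single APIExperiment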
67 | 68 | def get_experiment(self, workspace: str = ..., project_name: str = ..., experiment: str = ...) -> APIExperiment: ... 69 | def get_project(self, workspace: str, project_name: str) -> APIProject: ... 70 | def get_projects(self, workspace: str) -> List[APIProject]: ... 71 | 72 | def update_registry_model_version( 73 | self, 74 | workspace: str, 75 | registry_name: str, 76 | version: str, 77 | comment: Optional[str] = ..., 78 | stages: Optional[List[str]] = ... 79 | ) -> None: ... 80 | def get_registry_model_details( 81 | self, 82 | workspace: str, 83 | registry_name: str, 84 | version: Optional[str] = ... 85 | ) -> APIRegistryModel: ... 86 | def get_registry_model_names(self, workspace: str) -> List[str]: ... -------------------------------------------------------------------------------- /typings/comet_ml/APIExperiment.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional, overload 2 | 3 | from typing_extensions import Literal, TypedDict 4 | 5 | class APIAsset(TypedDict): 6 | fileName: str 7 | fileSize: int 8 | runContext: Optional[str] 9 | step: Optional[int] 10 | link: str 11 | createdAt: int 12 | dir: str 13 | canView: bool 14 | audio: bool 15 | video: bool 16 | histogram: bool 17 | image: bool 18 | type: str 19 | metadata: Any 20 | assetId: str 21 | 22 | class APIMetrics(TypedDict): 23 | metricName: str 24 | metricValue: str 25 | timestamp: int 26 | step: int 27 | epoch: Optional[int] 28 | runContext: Optional[str] 29 | 30 | class APIMetricsSummary(TypedDict): 31 | name: str 32 | valueMax: str 33 | valueMin: str 34 | timestampMax: int 35 | timestampMin: int 36 | timestampCurrent: int 37 | stepMax: int 38 | stepMin: int 39 | stepCurrent: int 40 | valueCurrent: str 41 | 42 | class APIMetadata(TypedDict): 43 | archived: bool 44 | durationMillis: int 45 | endTimeMillis: int 46 | experimentKey: str 47 | experimentName: Optional[str] 48 | fileName: Optional[str] 49 | filePatah: Optional[str] 50 | projectId: str 51 | projectName: str 52 | running: bool 53 | startTimeMillis: int 54 | throttle: bool 55 | workspaceName: str 56 | 57 | 58 | class APIExperiment: 59 | id: str 60 | url: str 61 | name: str 62 | start_server_timestamp: int 63 | 64 | def __init__(self, *args: Any, **kwargs: Any) -> APIExperiment: ... 65 | 66 | def add_tags(self, tags: List[str]) -> None: ... 67 | @overload 68 | def get_asset(self, asset_id: str, return_type: Literal["binary"] = ...) -> bytes: ... 69 | @overload 70 | def get_asset(self, asset_id: str, return_type: Literal["text"]) -> str: ... 71 | @overload 72 | def get_asset(self, asset_id: str, return_type: str) -> str | bytes: ... 73 | def get_asset_list(self, asset_type: str = ...) -> List[APIAsset]: ... 74 | def get_model_asset_list(self, model_name: str) -> List[APIAsset]: ... 75 | def download_model(self, name: str, output_path: str = ..., expand: bool = ...) -> None: ... 76 | def register_model( 77 | self, 78 | model_name: str, 79 | version: str = ..., 80 | workspace: Optional[str] = ..., 81 | registry_name: Optional[str] = ..., 82 | description: Optional[str] = ..., 83 | comment: Optional[str] = ..., 84 | stages: Optional[List[str]] = ... 85 | ) -> None: ... 86 | def get_metrics(self, metric: Optional[str] = ...) -> List[APIMetrics]: ... 87 | def log_other(self, key: str, value: Any, timestamp: Optional[int] = ...) -> None: ... 88 | def get_tags(self) -> List[str]: ... 89 | def get_metadata(self) -> APIMetadata: ... 
90 | @overload 91 | def get_metrics_summary(self) -> List[APIMetricsSummary]: ... 92 | @overload 93 | def get_metrics_summary(self, metric: str) -> APIMetricsSummary: ... 94 | @overload 95 | def get_others_summary(self) -> List[APIMetricsSummary]: ... 96 | @overload 97 | def get_others_summary(self, other: str) -> List[str]: ... 98 | -------------------------------------------------------------------------------- /typings/comet_ml/ExistingExperiment.pyi: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from .Experiment import Experiment 4 | 5 | 6 | class ExistingExperiment(Experiment): 7 | 8 | def __init__( 9 | self, 10 | api_key: Optional[str] = ..., 11 | previous_experiment: Optional[str] = ..., 12 | project_name: Optional[str] = ..., 13 | workspace: Optional[str] = ..., 14 | log_code: bool = ..., 15 | log_graph: bool = ..., 16 | auto_param_logging: bool = ..., 17 | auto_metric_logging: bool = ..., 18 | auto_weight_logging: bool = ..., 19 | auto_output_logging: bool = ..., 20 | auto_log_co2: bool = ..., 21 | parse_args: bool = ..., 22 | log_env_details: bool = ..., 23 | log_env_gpu: bool = ..., 24 | log_env_cpu: bool = ..., 25 | log_env_host: bool = ..., 26 | log_git_metadata: bool = ..., 27 | log_git_patch: bool = ..., 28 | display_summary_level: int = ..., 29 | disabled: bool = ... 30 | ) -> ExistingExperiment: ... 31 | -------------------------------------------------------------------------------- /typings/comet_ml/Experiment.pyi: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Union, Mapping 3 | 4 | from io import BufferedReader 5 | 6 | from typing_extensions import Literal 7 | 8 | class Experiment: 9 | id: str 10 | context: str 11 | 12 | def __init__( 13 | self, 14 | api_key: Optional[str] = ..., 15 | project_name: Optional[str] = ..., 16 | workspace: Optional[str] = ..., 17 | log_code: bool = ..., 18 | log_graph: bool = ..., 19 | auto_param_logging: bool = ..., 20 | auto_metric_logging: bool = ..., 21 | auto_weight_logging: bool = ..., 22 | auto_output_logging: bool = ..., 23 | auto_log_co2: bool = ..., 24 | parse_args: bool = ..., 25 | log_env_details: bool = ..., 26 | log_env_gpu: bool = ..., 27 | log_env_cpu: bool = ..., 28 | log_env_host: bool = ..., 29 | log_git_metadata: bool = ..., 30 | log_git_patch: bool = ..., 31 | display_summary_level: int = ..., 32 | disabled: bool = ... 33 | ) -> Experiment: ... 34 | 35 | def set_step(self, step: int) -> None: ... 36 | def log_asset( 37 | self, 38 | file_data: str | BufferedReader, 39 | file_name: Optional[str] = ..., 40 | overwrite: bool = ..., 41 | copy_to_tmp: bool = ..., 42 | step: Optional[int] = ... 43 | ) -> None: ... 44 | def log_asset_data( 45 | self, 46 | data: Any, 47 | name: Optional[str] = ..., 48 | overwrite: bool = ..., 49 | step: Optional[int] = ..., 50 | metadata: Optional[Any] = ..., 51 | file_name: Optional[str] = ... 52 | ) -> None: ... 53 | def log_metric( 54 | self, 55 | name: str, 56 | value: str, 57 | step: Optional[int] = ..., 58 | epoch: Optional[int] = ..., 59 | include_context: bool = ... 60 | ) -> None: ... 61 | def log_metrics( 62 | self, 63 | dict: Dict[str, str], 64 | prefix: Optional[str] = ..., 65 | step: Optional[int] = ..., 66 | epoch: Optional[int] = ... 67 | ) -> None: ... 
68 | def log_table( 69 | self, 70 | filename: str, 71 | tabular_data: Optional[Sequence[Sequence[Any]]] = ..., 72 | headers: Literal[False] | Sequence[str] = ... 73 | ) -> None: ... 74 | def log_model( 75 | self, 76 | name: str, 77 | file_or_folder: str, 78 | file_name: Optional[str] = ..., 79 | overwrite: bool = ..., 80 | metadata: Any = ..., 81 | copy_to_tmp: bool = ... 82 | ) -> None: ... 83 | def log_confusion_matrix( 84 | self, 85 | y_true: Optional[List[int]] = ..., 86 | y_predicted: Optional[Sequence[int]] = ..., 87 | matrix: Optional[List[List[Any]]] = ..., 88 | labels: Optional[List[str]] = ..., 89 | title: str = ..., 90 | row_label: str = ..., 91 | column_label: str = ..., 92 | max_examples_per_cell: int = ..., 93 | max_categories: int = ..., 94 | winner_function: Optional[Callable[[List[List[int]]], int]] = ..., 95 | index_to_example_function: Optional[Callable[[int], str | int | Dict[str, str]]] = ..., 96 | cache: bool = ..., 97 | file_name: str = ..., 98 | overwrite: bool = ..., 99 | step: Optional[int] = ... 100 | ) -> None: ... 101 | def log_parameter( 102 | self, 103 | name: str, 104 | value: Union[float, int, bool, str, List[Any]], 105 | step: Optional[int] = ..., 106 | ) -> None: ... 107 | def log_parameters( 108 | self, 109 | parameters: Mapping[str, Any], 110 | prefix: Optional[str] = ..., 111 | step: Optional[int] = ..., 112 | ) -> None: ... 113 | def log_other( 114 | self, 115 | key: str, 116 | value: str, 117 | ) -> None: ... 118 | def get_other( 119 | self, 120 | key: str 121 | ) -> str: ... 122 | 123 | @contextmanager 124 | def context_manager(self, name: str) -> Iterator[Experiment]: ... -------------------------------------------------------------------------------- /typings/comet_ml/__init__.pyi: -------------------------------------------------------------------------------- 1 | from .API import API, APIProject, APIRegistryModel, APIRegistryVersion 2 | from .APIExperiment import APIExperiment 3 | from .ExistingExperiment import ExistingExperiment 4 | from .Experiment import Experiment 5 | 6 | __all__ = [ 7 | "API", 8 | "APIExperiment", 9 | "ExistingExperiment", 10 | "Experiment", 11 | ] -------------------------------------------------------------------------------- /typings/comet_ml/exceptions.pyi: -------------------------------------------------------------------------------- 1 | class NotFound(Exception): ... -------------------------------------------------------------------------------- /typings/comet_ml/query/__init__.pyi: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from comet_ml.query import QueryExpression 4 | 5 | 6 | class QueryExpression: 7 | def startswith(self, prefix: str) -> QueryExpression: ... 8 | 9 | class Tag(QueryExpression): 10 | def __init__(self, tag: str) -> Tag: ... 11 | 12 | class Metadata(QueryExpression): 13 | def __init__(self, metadata: str) -> Metadata: ... -------------------------------------------------------------------------------- /typings/dask/__init__.pyi: -------------------------------------------------------------------------------- 1 | from typing import Callable, TypeVar, overload 2 | from .delayed import Delayed 3 | 4 | A = TypeVar("A") 5 | B = TypeVar("B") 6 | C = TypeVar("C") 7 | D = TypeVar("D") 8 | E = TypeVar("E") 9 | 10 | T = TypeVar("T") 11 | 12 | @overload 13 | def delayed(fn: Callable[[A], T]) -> Callable[[A], Delayed[T]]: ... 14 | @overload 15 | def delayed(fn: Callable[[A, B], T]) -> Callable[[A, B], Delayed[T]]: ... 
16 | @overload 17 | def delayed(fn: Callable[[A, B, C], T]) -> Callable[[A, B, C], Delayed[T]]: ... 18 | @overload 19 | def delayed(fn: Callable[[A, B, C, D], T]) -> Callable[[A, B, C, D], Delayed[T]]: ... 20 | @overload 21 | def delayed(fn: Callable[[A, B, C, D, E], T]) -> Callable[[A, B, C, D, E], Delayed[T]]: ... 22 | def delayed(fn: Callable[..., T]) -> Callable[..., Delayed[T]]: ... 23 | -------------------------------------------------------------------------------- /typings/dask/bag.pyi: -------------------------------------------------------------------------------- 1 | from typing import Generic, Iterable, TypeVar, List 2 | 3 | from .delayed import Delayed 4 | 5 | T = TypeVar("T") 6 | 7 | class Bag(Generic[T]): 8 | def to_delayed(self) -> List[Delayed[List[T]]]: ... 9 | def take(self, count: int) -> List[T]: ... 10 | 11 | def from_delayed(delayed: Iterable[Delayed[Iterable[T]]]) -> Bag[T]: ... -------------------------------------------------------------------------------- /typings/dask/delayed.pyi: -------------------------------------------------------------------------------- 1 | from typing import Generic, TypeVar 2 | 3 | T = TypeVar("T") 4 | 5 | class Delayed(Generic[T]): 6 | def compute(self) -> T: ... 7 | -------------------------------------------------------------------------------- /typings/fsspec/__init__.pyi: -------------------------------------------------------------------------------- 1 | from io import BufferedReader 2 | 3 | 4 | class OpenFile(BufferedReader): # this isn't true but it's close enough for how we're using fsspec 5 | pass 6 | 7 | def open(uri: str) -> OpenFile: ... 8 | 9 | -------------------------------------------------------------------------------- /typings/matplotlib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zensors/datatap-python/1a05b354f20bbbc24b9bad76e919e0cc9542538c/typings/matplotlib/__init__.py -------------------------------------------------------------------------------- /typings/matplotlib/pyplot/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | class Figure: 4 | pass 5 | 6 | def figure() -> Figure: ... 7 | def plot(*args: Any) -> None: ... 8 | def xlabel(label: str) -> None: ... 9 | def ylabel(label: str) -> None: ... 10 | -------------------------------------------------------------------------------- /typings/neo4j/__init__.pyi: -------------------------------------------------------------------------------- 1 | from .graph_database import GraphDatabase 2 | from .transaction import Transaction 3 | 4 | __all__ = [ 5 | "GraphDatabase", 6 | "Transaction", 7 | ] 8 | -------------------------------------------------------------------------------- /typings/neo4j/driver.pyi: -------------------------------------------------------------------------------- 1 | from typing import ContextManager 2 | 3 | from .session import Session 4 | 5 | class Driver: 6 | def session(self) -> ContextManager[Session]: ... 7 | -------------------------------------------------------------------------------- /typings/neo4j/graph_database.pyi: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | from .driver import Driver 3 | 4 | class GraphDatabase: 5 | @staticmethod 6 | def driver(url: str, *, auth: Tuple[str, str]) -> Driver: ... 
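# Illustration only, not part of the stub: together with session.pyi and
# result.pyi this is intended to type-check driver usage of roughly the
# following shape (the URI, credentials, and query are hypothetical):
#
#   driver = GraphDatabase.driver("bolt://localhost:7687", auth = ("neo4j", "secret"))
#   with driver.session() as session:
#       result = session.run("MATCH (n) RETURN count(n) AS count", {})
#       record = result.single()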
7 | -------------------------------------------------------------------------------- /typings/neo4j/record.pyi: -------------------------------------------------------------------------------- 1 | from typing import Generic, Iterable, TypeVar 2 | 3 | T = TypeVar("T") 4 | V = TypeVar("V") 5 | K = TypeVar("K", str, int) 6 | R = TypeVar("R") 7 | 8 | class Record(Generic[T], Iterable[V]): 9 | def __getitem__(self, key: K) -> R: ... 10 | 11 | -------------------------------------------------------------------------------- /typings/neo4j/result.pyi: -------------------------------------------------------------------------------- 1 | from typing import Generic, Iterator, TypeVar 2 | 3 | from .record import Record 4 | 5 | T = TypeVar("T") 6 | 7 | class Result(Generic[T]): 8 | def __iter__(self) -> Iterator[Record[T]]: ... 9 | def single(self) -> Record[T]: ... 10 | -------------------------------------------------------------------------------- /typings/neo4j/session.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, ContextManager, Dict, TypeVar, Callable, overload 2 | 3 | from neo4j.result import Result 4 | 5 | from .transaction import Transaction 6 | 7 | 8 | _F = TypeVar("_F") 9 | 10 | class Session: 11 | def begin_transaction(self) -> ContextManager[Transaction]: ... 12 | 13 | def write_transaction(self, fn: Callable[[Transaction], _F]) -> _F: ... 14 | 15 | @overload 16 | def run(self, query: str, args: Dict[str, Any]) -> Result[_F]: ... 17 | @overload 18 | def run(self, query: str, **kwargs: Any) -> Result[_F]: ... 19 | -------------------------------------------------------------------------------- /typings/neo4j/transaction.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, TypeVar, overload 2 | from .result import Result 3 | 4 | T = TypeVar("T") 5 | 6 | class Transaction: 7 | @overload 8 | def run(self, query: str, args: Dict[str, Any]) -> Result[T]: ... 9 | @overload 10 | def run(self, query: str, **kwargs: Any) -> Result[T]: ... 11 | -------------------------------------------------------------------------------- /typings/pycocotools/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zensors/datatap-python/1a05b354f20bbbc24b9bad76e919e0cc9542538c/typings/pycocotools/__init__.pyi -------------------------------------------------------------------------------- /typings/pycocotools/mask.pyi: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Sequence, Tuple 4 | 5 | import numpy as np 6 | from typing_extensions import TypedDict 7 | 8 | 9 | class CocoRleJson(TypedDict): 10 | counts: Sequence[int] 11 | size: Tuple[int, int] 12 | 13 | class CocoRle: 14 | pass 15 | 16 | def frPyObjects(json: CocoRleJson, height: int, width: int) -> CocoRle: ... 17 | def decode(rle: CocoRle) -> np.ndarray: ... 18 | -------------------------------------------------------------------------------- /typings/requests/__init__.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Generator, Optional, overload 2 | 3 | from typing_extensions import Literal 4 | 5 | class Response: 6 | content: bytes 7 | ok: bool 8 | 9 | def json(self) -> Any: ... 10 | @overload 11 | def iter_lines(self, *, decode_unicode: Literal[True], chunk_size: int = ...) 
-> Generator[str, None, None]: ... 12 | @overload 13 | def iter_lines(self, *, decode_unicode: Optional[Literal[False]] = ..., chunk_size: int = ...) -> Generator[bytes, None, None]: ... 14 | 15 | def get( 16 | url: str, 17 | params: Dict[str, str] | None = ..., 18 | headers: Dict[str, str | None] | None = ..., 19 | stream: bool = ..., 20 | ) -> Response: ... 21 | 22 | def post( 23 | url: str, 24 | params: Dict[str, str] | None = ..., 25 | headers: Dict[str, str | None] | None = ..., 26 | stream: bool = ..., 27 | json: Any = ... 28 | ) -> Response: ... -------------------------------------------------------------------------------- /typings/scipy/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zensors/datatap-python/1a05b354f20bbbc24b9bad76e919e0cc9542538c/typings/scipy/__init__.pyi -------------------------------------------------------------------------------- /typings/scipy/optimize/__init__.pyi: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | from numpy import ndarray 3 | 4 | def linear_sum_assignment(cost_matrix: ndarray, maximize: bool = ...) -> Tuple[ndarray, ndarray]: ... 5 | -------------------------------------------------------------------------------- /typings/shapely/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zensors/datatap-python/1a05b354f20bbbc24b9bad76e919e0cc9542538c/typings/shapely/__init__.pyi -------------------------------------------------------------------------------- /typings/shapely/geometry/__init__.pyi: -------------------------------------------------------------------------------- 1 | class Polygon: 2 | pass 3 | 4 | def box(minx: float, miny: float, maxx: float, maxy: float, ccw: bool = ...) -> Polygon: ... 5 | -------------------------------------------------------------------------------- /typings/skimage/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zensors/datatap-python/1a05b354f20bbbc24b9bad76e919e0cc9542538c/typings/skimage/__init__.pyi -------------------------------------------------------------------------------- /typings/skimage/measure.pyi: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence 2 | 3 | import numpy as np 4 | 5 | 6 | Polygon = np.ndarray 7 | 8 | def approximate_polygon(polygon: Polygon, tolerance: float) -> Polygon: ... 9 | def find_contours(image: np.ndarray, level: Optional[float]) -> Sequence[Polygon]: ... 10 | -------------------------------------------------------------------------------- /typings/sortedcontainers/__init__.pyi: -------------------------------------------------------------------------------- 1 | from typing import Generic, Iterator, MutableMapping, TypeVar 2 | 3 | 4 | K = TypeVar("K") 5 | V = TypeVar("V") 6 | 7 | class SortedDict(Generic[K, V], MutableMapping[K, V]): 8 | def __reversed__(self) -> Iterator[K]: ... 9 | def copy(self) -> SortedDict[K, V]: ... 10 | -------------------------------------------------------------------------------- /typings/tensorflow/__init__.pyi: -------------------------------------------------------------------------------- 1 | from . import distribute 2 | from . import data 3 | from . 
import io 4 | from .types import DType, int8, uint8, int32, int64, float32, float64, string, TensorShape 5 | from .tensor import Tensor, constant, stack, py_function 6 | 7 | __all__ = [ 8 | "distribute", 9 | "data", 10 | "io", 11 | "DType", 12 | "int8", 13 | "uint8", 14 | "int32", 15 | "int64", 16 | "float32", 17 | "float64", 18 | "string", 19 | "TensorShape", 20 | "Tensor", 21 | "constant", 22 | "stack", 23 | "py_function", 24 | ] -------------------------------------------------------------------------------- /typings/tensorflow/data/__init__.pyi: -------------------------------------------------------------------------------- 1 | from .dataset import Dataset 2 | 3 | __all__ = [ 4 | "Dataset" 5 | ] -------------------------------------------------------------------------------- /typings/tensorflow/data/dataset.pyi: -------------------------------------------------------------------------------- 1 | from typing import Callable, Generator, Iterable, List, TypeVar, NewType 2 | 3 | import tensorflow as tf 4 | 5 | _T = TypeVar("_T") 6 | 7 | DTypeDeepIterable = tf.DType | Iterable["DTypeDeepIterable"] 8 | TensorShapeDeepIterable = tf.TensorShape | Iterable["TensorShapeDeepIterable"] 9 | 10 | _V = TypeVar("_V") 11 | 12 | class Dataset: 13 | @staticmethod 14 | def from_generator( 15 | gen: Callable[[_T], Generator[Iterable[tf.Tensor], None, None]] | Callable[[], Generator[Iterable[tf.Tensor], None, None]], 16 | output_type: DTypeDeepIterable = ..., 17 | output_shapes: TensorShapeDeepIterable = ..., 18 | args: _T = ... 19 | ) -> Dataset: ... 20 | 21 | def __iter__(self) -> Generator[Iterable[tf.Tensor], None, None]: ... 22 | 23 | def map(self, map_fn: Callable[..., Iterable[tf.Tensor]], num_parallel_calls: int = ..., deterministic: bool = ...) -> Dataset: ... 24 | def prefetch(self, buffer_size: int) -> Dataset: ... -------------------------------------------------------------------------------- /typings/tensorflow/distribute/__init__.pyi: -------------------------------------------------------------------------------- 1 | from . import experimental 2 | from .input_context import InputContext 3 | from .distributed_dataset import DistributedDataset 4 | 5 | __all__ = [ 6 | "experimental", 7 | "InputContext", 8 | "DistributedDataset", 9 | ] -------------------------------------------------------------------------------- /typings/tensorflow/distribute/distributed_dataset.pyi: -------------------------------------------------------------------------------- 1 | from typing import Generator, Iterable 2 | 3 | import tensorflow as tf 4 | 5 | class DistributedDataset: 6 | 7 | def __iter__(self) -> Generator[Iterable[tf.Tensor], None, None]: ... -------------------------------------------------------------------------------- /typings/tensorflow/distribute/experimental/__init__.pyi: -------------------------------------------------------------------------------- 1 | from .strategy import Strategy, MultiWorkerMirroredStrategy 2 | 3 | __all__ = [ 4 | "Strategy", 5 | "MultiWorkerMirroredStrategy", 6 | ] -------------------------------------------------------------------------------- /typings/tensorflow/distribute/experimental/strategy.pyi: -------------------------------------------------------------------------------- 1 | from typing import Callable, Generator 2 | import tensorflow as tf 3 | 4 | class Strategy: 5 | def experimental_distribute_datasets_from_function( 6 | self, 7 | fn: Callable[[tf.distribute.InputContext], tf.data.Dataset] 8 | ) -> tf.distribute.DistributedDataset: ... 
9 | 10 | class MultiWorkerMirroredStrategy(Strategy): 11 | pass -------------------------------------------------------------------------------- /typings/tensorflow/distribute/input_context.pyi: -------------------------------------------------------------------------------- 1 | class InputContext: 2 | input_pipeline_id: int 3 | num_input_pipelines: int 4 | num_input_pipelines: int -------------------------------------------------------------------------------- /typings/tensorflow/io/__init__.py: -------------------------------------------------------------------------------- 1 | from .decode_image import decode_jpeg 2 | 3 | __all__ = [ 4 | "decode_jpeg", 5 | ] -------------------------------------------------------------------------------- /typings/tensorflow/io/decode_image.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import tensorflow as tf 3 | 4 | def decode_jpeg(img: bytes, channels: Optional[int] = ..., dtype: tf.DType = ..., name: str = ..., expand_animations: bool = ...) -> tf.Tensor: ... -------------------------------------------------------------------------------- /typings/tensorflow/tensor.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Iterable 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class Tensor: 7 | def numpy(self) -> Any: ... 8 | 9 | 10 | def constant(arraylike: Any, dtype: tf.DType = ..., shape: Iterable[int] | tf.TensorShape = ...) -> Tensor: ... 11 | def stack(tensors: Iterable[Tensor]) -> Tensor: ... 12 | def py_function(func: Callable[[Tensor], Tensor], inp: Any, Tout: tf.DType | Iterable[tf.DType], name: str = ...) -> Tensor: ... -------------------------------------------------------------------------------- /typings/tensorflow/types.pyi: -------------------------------------------------------------------------------- 1 | from typing import Iterable, NewType, Optional 2 | 3 | DType = NewType("DType", str) 4 | 5 | uint8: DType 6 | int8: DType 7 | int32: DType 8 | int64: DType 9 | float32: DType 10 | float64: DType 11 | string: DType 12 | 13 | class TensorShape: 14 | dims: Optional[Iterable[int]] 15 | ndims: Optional[int] 16 | rank: Optional[int] 17 | 18 | def __init__(self, dims: Optional[Iterable[Optional[int]]]) -> TensorShape: ... -------------------------------------------------------------------------------- /typings/torchvision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zensors/datatap-python/1a05b354f20bbbc24b9bad76e919e0cc9542538c/typings/torchvision/__init__.py -------------------------------------------------------------------------------- /typings/torchvision/transforms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zensors/datatap-python/1a05b354f20bbbc24b9bad76e919e0cc9542538c/typings/torchvision/transforms/__init__.py -------------------------------------------------------------------------------- /typings/torchvision/transforms/functional.pyi: -------------------------------------------------------------------------------- 1 | import PIL.Image 2 | import torch 3 | 4 | def to_tensor(img: PIL.Image.Image) -> torch.Tensor: ... 5 | def to_pil_image(tensor: torch.Tensor, mode: str = ...) -> PIL.Image.Image: ... --------------------------------------------------------------------------------
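The stub files above exist so that pyright, configured in `pyrightconfig.json` with `"typeCheckingMode": "strict"` and `"stubPath": "./typings"`, can type-check the optional integrations without shipping full type information for those packages. A minimal sketch of code that these particular stubs would cover follows; the image file name is hypothetical.

```py
from io import BytesIO

import PIL.Image
from torchvision.transforms.functional import to_pil_image, to_tensor

# Checked against typings/PIL/Image.pyi and
# typings/torchvision/transforms/functional.pyi.
with open("example.jpg", "rb") as f:  # hypothetical image file
    image = PIL.Image.open(BytesIO(f.read()))

tensor = to_tensor(image)                           # torch.Tensor per the stub
round_tripped = to_pil_image(tensor, mode = "RGB")  # back to a PIL image
```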