├── pipe-plugins ├── reverse.py ├── reverse.bat └── reverse ├── LDB-schema.png ├── images ├── warn.png ├── ldb-intro.png ├── ldb-struct.png ├── workspace.png ├── ldb-principle.png ├── numerals-bescond.png └── numerals-deeplearningAI.png ├── apply-plugins ├── random_predictions.py └── textocr_crops.py ├── transforms ├── vertical_flip.py ├── horizontal_flip.py ├── rotate.py ├── random_rotate.py └── textocr_crops_single.py ├── documentation ├── Plugins.md ├── Datasets.md ├── alternatives-to-LDB.md ├── Label-studio.md ├── Quick-start-teams.md ├── formats.md ├── LDB-queries.md ├── tutorial.md └── Command-summary.md ├── .gitignore ├── LICENSE ├── _README_draft.md └── README.md /pipe-plugins/reverse.py: -------------------------------------------------------------------------------- 1 | reverse -------------------------------------------------------------------------------- /pipe-plugins/reverse.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | python3 "%~dp0\reverse.py" 3 | -------------------------------------------------------------------------------- /LDB-schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/ldb-resources/HEAD/LDB-schema.png -------------------------------------------------------------------------------- /images/warn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/ldb-resources/HEAD/images/warn.png -------------------------------------------------------------------------------- /images/ldb-intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/ldb-resources/HEAD/images/ldb-intro.png -------------------------------------------------------------------------------- /images/ldb-struct.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/ldb-resources/HEAD/images/ldb-struct.png -------------------------------------------------------------------------------- /images/workspace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/ldb-resources/HEAD/images/workspace.png -------------------------------------------------------------------------------- /images/ldb-principle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/ldb-resources/HEAD/images/ldb-principle.png -------------------------------------------------------------------------------- /images/numerals-bescond.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/ldb-resources/HEAD/images/numerals-bescond.png -------------------------------------------------------------------------------- /images/numerals-deeplearningAI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/ldb-resources/HEAD/images/numerals-deeplearningAI.png -------------------------------------------------------------------------------- /pipe-plugins/reverse: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | for data_object_hash, *_ in reversed(json.loads(sys.stdin.read())): 7 | print(data_object_hash, flush=True) 8 | -------------------------------------------------------------------------------- /apply-plugins/random_predictions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import random 5 | import shutil 6 | import sys 7 | 
from typing import Sequence 8 | 9 | 10 | def main(argv: Sequence[str] = ()) -> None: 11 | source_dir = argv[0] 12 | dest_dir = argv[1] 13 | for entry in os.listdir(source_dir): 14 | source_file_path = os.path.join(source_dir, entry) 15 | dest_file_path = os.path.join(dest_dir, entry) 16 | if entry.endswith(".json"): 17 | pred = random.random() 18 | write_prediction(source_file_path, dest_file_path, pred) 19 | else: 20 | shutil.move(source_file_path, dest_file_path) 21 | 22 | 23 | def write_prediction( 24 | source_file_path: str, 25 | dest_file_path: str, 26 | prediction: float, 27 | ) -> None: 28 | with open(source_file_path, encoding="utf-8") as source_file: 29 | raw_annot = source_file.read() 30 | annot = json.loads(raw_annot) 31 | annot["prediction"] = prediction 32 | new_raw_annot = json.dumps(annot, indent=2) 33 | with open(dest_file_path, "x", encoding="utf-8") as dest_file: 34 | dest_file.write(new_raw_annot) 35 | 36 | 37 | if __name__ == "__main__": 38 | main(json.loads(sys.stdin.read())) 39 | -------------------------------------------------------------------------------- /transforms/vertical_flip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import shutil 5 | import sys 6 | from typing import Dict, Sequence 7 | 8 | from PIL import Image 9 | 10 | 11 | def main(inp: Dict[str, str], argv: Sequence[str] = ()) -> None: 12 | data_object_path = inp["data_object"] 13 | annotation_path = inp["annotation"] 14 | output_dir = inp["output_dir"] 15 | transform_name = inp["transform_name"] 16 | 17 | if len(argv) > 1: 18 | print( 19 | "ERROR: No arguments expected", 20 | file=sys.stderr, 21 | ) 22 | sys.exit(1) 23 | orig_image = Image.open(data_object_path) 24 | file_name_base, ext = os.path.splitext(os.path.basename(data_object_path)) 25 | if ext.lstrip("."): 26 | fmt = None 27 | else: 28 | fmt = orig_image.format or "PNG" 29 | ext = "" 30 | 31 | new_image = 
orig_image.transpose(method=Image.Transpose.FLIP_TOP_BOTTOM) 32 | file_name_base = f"{file_name_base}--{transform_name}".replace(".", "-") 33 | obj_file_path = os.path.join(output_dir, f"{file_name_base}{ext}") 34 | annot_file_path = os.path.join(output_dir, f"{file_name_base}.json") 35 | rgb_im = new_image.convert('RGB') 36 | rgb_im.save(obj_file_path, format=fmt) 37 | shutil.copy2(annotation_path, annot_file_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | main(json.loads(sys.stdin.read()), sys.argv) 42 | -------------------------------------------------------------------------------- /transforms/horizontal_flip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import shutil 5 | import sys 6 | from typing import Dict, Sequence 7 | 8 | from PIL import Image 9 | 10 | 11 | def main(inp: Dict[str, str], argv: Sequence[str] = ()) -> None: 12 | data_object_path = inp["data_object"] 13 | annotation_path = inp["annotation"] 14 | output_dir = inp["output_dir"] 15 | transform_name = inp["transform_name"] 16 | 17 | if len(argv) > 1: 18 | print( 19 | "ERROR: No arguments expected", 20 | file=sys.stderr, 21 | ) 22 | sys.exit(1) 23 | orig_image = Image.open(data_object_path) 24 | file_name_base, ext = os.path.splitext(os.path.basename(data_object_path)) 25 | if ext.lstrip("."): 26 | fmt = None 27 | else: 28 | fmt = orig_image.format or "PNG" 29 | ext = "" 30 | 31 | new_image = orig_image.transpose(method=Image.Transpose.FLIP_LEFT_RIGHT) 32 | file_name_base = f"{file_name_base}--{transform_name}".replace(".", "-") 33 | obj_file_path = os.path.join(output_dir, f"{file_name_base}{ext}") 34 | annot_file_path = os.path.join(output_dir, f"{file_name_base}.json") 35 | rgb_im = new_image.convert('RGB') 36 | rgb_im.save(obj_file_path, format=fmt) 37 | shutil.copy2(annotation_path, annot_file_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | main(json.loads(sys.stdin.read()), sys.argv) 
42 | -------------------------------------------------------------------------------- /transforms/rotate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import shutil 5 | import sys 6 | from math import cos, pi, sin 7 | from typing import Dict, Sequence 8 | 9 | from PIL import Image 10 | 11 | 12 | def main(inp: Dict[str, str], argv: Sequence[str] = ()) -> None: 13 | data_object_path = inp["data_object"] 14 | annotation_path = inp["annotation"] 15 | output_dir = inp["output_dir"] 16 | transform_name = inp["transform_name"] 17 | 18 | transform_args = argv[1:] or ["90"] 19 | 20 | orig_image = Image.open(data_object_path) 21 | prefix, ext = os.path.splitext(os.path.basename(data_object_path)) 22 | if ext.lstrip("."): 23 | fmt = None 24 | else: 25 | fmt = orig_image.format or "PNG" 26 | ext = "" 27 | for n in transform_args: 28 | new_image = rotate_and_crop(orig_image, int(n)) 29 | file_name_base = f"{prefix}--{transform_name}--{n}".replace( 30 | ".", 31 | "-", 32 | ) 33 | obj_file_path = os.path.join(output_dir, f"{file_name_base}{ext}") 34 | annot_file_path = os.path.join(output_dir, f"{file_name_base}.json") 35 | rgb_im = new_image.convert('RGB') 36 | rgb_im.save(obj_file_path, format=fmt) 37 | shutil.copy2(annotation_path, annot_file_path) 38 | 39 | 40 | def rotate_and_crop(image, degrees): 41 | """ 42 | Rotate the given image by `degrees`, and crop the resulting image 43 | to the largest rectangle with the original aspect ratio. 
44 | """ 45 | rotated_image = image.rotate(degrees, expand=True) 46 | w1, h1 = image.size 47 | w2, h2 = rotated_image.size 48 | r1 = w1 / h1 49 | r2 = w2 / h2 50 | angle = abs(degrees) * pi / 180 51 | if w1 < h1: 52 | total_height = w1 / r2 53 | else: 54 | total_height = h1 55 | h = abs(total_height / (r1 * abs(sin(angle)) + abs(cos(angle)))) 56 | w = h * r1 57 | 58 | x1 = (w2 - w) / 2 59 | x2 = w2 - x1 60 | y1 = (h2 - h) / 2 61 | y2 = h2 - y1 62 | return rotated_image.crop([x1, y1, x2, y2]) 63 | 64 | 65 | if __name__ == "__main__": 66 | main(json.loads(sys.stdin.read()), sys.argv) 67 | -------------------------------------------------------------------------------- /documentation/Plugins.md: -------------------------------------------------------------------------------- 1 | LDB supports pluggable executables that can be used to select data objects based on some operations over their annotations or binary content. 2 | 3 | - Bundled plugin: [CLIP](#clip) 4 | - Bundled plugin: [ResNet](#resnet) 5 | - [Custom plugins](#custom-plugins) 6 | 7 | ## CLIP 8 | 9 | OpenAI's [CLIP](https://github.com/openai/CLIP) is a popular semantic embedding model. Input images are embedded into semantic space, and compared by cosine similarity to an exemplar image or text string. 10 | 11 | To install: 12 | ``` 13 | cd ldb 14 | pip install '.[clip-plugin]' 15 | ``` 16 | LDB clip plugin supports two ways to call: 17 | 18 | 1. Using exemplar image. 19 | 20 | ``` 21 | clip-image [] 22 | ``` 23 | 24 | Where `model_name` is one of `RN50, RN101, RN50x4, RN50x16, RN50x64, ViT-B/32, ViT-B/16, ViT-L/14` 25 | 26 | Examples: 27 | 28 | ``` 29 | ldb list ds:root --pipe clip-image ~/dogs-and-cats/cat.1000.jpg --limit 10 30 | ldb list ds:root --pipe clip-image ~/dogs-and-cats/cat.1000.jpg RN50 --limit 10 31 | ``` 32 | 2. 
Using an exemplar text string. 33 | 34 | ``` 35 | clip-text <text> [<model_name>] 36 | ``` 37 | 38 | Examples: 39 | 40 | ``` 41 | ldb list ds:root --pipe clip-text 'an orange cat' --limit 10 42 | ldb list ds:root --pipe clip-text 'an orange cat' RN50 --limit 10 43 | 44 | ``` 45 | 46 | 47 | ## ResNet 48 | 49 | The ResNet plugin sorts incoming image objects by cosine similarity on features from ["Deep Residual Learning for Image Recognition"](https://arxiv.org/abs/1512.03385). 50 | 51 | To install: 52 | ``` 53 | cd ldb 54 | pip install '.[resnet-plugin]' 55 | ``` 56 | 57 | To call: 58 | ``` 59 | resnet-image <image> [<model_num> [<layer>]] 60 | ``` 61 | where `model_num` is one of `18, 34, 101, 152`, and `<layer>` is a number between 1 and 4. Not specifying a layer means comparing on the final output. 62 | 63 | Examples: 64 | 65 | ``` 66 | ldb list ds:root --pipe resnet-image ~/dogs-and-cats/cat.1000.jpg --limit 10 # final output of default (resnet18) 67 | ldb list ds:root --pipe resnet-image ~/dogs-and-cats/cat.1000.jpg 50 --limit 10 # final output of resnet50 68 | ldb list ds:root --pipe resnet-image ~/dogs-and-cats/cat.1000.jpg 50 2 --limit 10 # layer 2 output of resnet50 69 | 70 | ``` 71 | 72 | ## Custom Plugins 73 | 74 | LDB supports custom plugins. For information on how to write a plugin, please refer to the [command summary](Command-summary.md#pipe-plugins). 75 | -------------------------------------------------------------------------------- /documentation/Datasets.md: -------------------------------------------------------------------------------- 1 | # LDB Datasets 2 | 3 | LDB comes pre-configured with access to several public datasets: 4 | 5 | * [Dogs and Cats](#dogs-and-cats) 6 | * [TextOCR](#textocr) 7 | 8 | ### Dogs and Cats 9 | 10 | A dataset containing 200 annotated images of cats and dogs in ['strict-pairs'](Command-summary.md#index) format.
11 | The annotation schema looks as follows: 12 | 13 | ``` 14 | { 15 | "class": "dog", 16 | "id": "1020", 17 | "inference": { 18 | "class": "dog", 19 | "confidence": 0.3 20 | }, 21 | "num_annotators": 4 22 | } 23 | ``` 24 | **S3 bucket URL:** 25 | * s3://ldb-public/remote/data-lakes/dogs-and-cats/ 26 | 27 | **Downloading as archive:** 28 | * https://remote.ldb.ai/datasets/dogs-and-cats/dogs-and-cats.zip 29 | * https://remote.ldb.ai/datasets/dogs-and-cats/dogs-and-cats.tar.gz 30 | 31 | **Indexing:** 32 | ``` 33 | ldb index s3://ldb-public/remote/data-lakes/dogs-and-cats/ 34 | ``` 35 | 36 | **Example of use:** 37 | ``` 38 | ldb index s3://ldb-public/remote/data-lakes/dogs-and-cats/ 39 | ldb stage ds:my-animals 40 | ldb add s3://ldb-public/remote/data-lakes/dogs-and-cats/ 41 | ldb eval --limit 3 --query '[class, inference.class]' 42 | ldb get ws:./ --pipe clip-text 'orange cats' --limit 10 -t orange-cats/ 43 | ``` 44 | 45 | TODO: rework into http/https 46 | 47 | ### TextOCR 48 | 49 | A sample of the [TextOCR](https://textvqa.org/textocr/) dataset containing about 1800 images. 50 | 51 | **Downloads:** 52 | * https://remote.ldb.ai/datasets/textocr/textocr.zip 53 | * https://remote.ldb.ai/datasets/textocr/textocr.tar.gz 54 | 55 | **Example of use:** 56 | ``` 57 | ldb add-storage textocr 58 | ldb index --format bare textocr 59 | ldb list ds:root --summary 60 | ldb list ds:root --query 'length(anns) >= `20`' --summary 61 | ldb eval ds:root --limit 1 --query 'anns[*].utf8_string' 62 | ldb list ds:root --query 'length(anns[?regex(utf8_string, `\\d`)]) >= `1`' --summary 63 | ``` 64 | 65 | ## Dataset download tips 66 | 67 | Zip files and gzipped tarballs are available for some public datasets to play with LDB locally.
These can be downloaded from the URLs provided under [Datasets](#datasets) with a browser or with a CLI tool such as [DVC](https://dvc.org/doc/install): 68 | ``` 69 | dvc get-url https://remote.ldb.ai/datasets/DATASET_NAME/DATASET_NAME.zip 70 | unzip DATASET_NAME.zip 71 | ``` 72 | 73 | Or download and unpack with `curl` and `tar`: 74 | ``` 75 | curl -L https://remote.ldb.ai/datasets/DATASET_NAME/DATASET_NAME.tar.gz | tar xz 76 | ``` 77 | -------------------------------------------------------------------------------- /transforms/random_rotate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import random 5 | import shutil 6 | import sys 7 | from math import cos, pi, sin 8 | from typing import Dict, Sequence 9 | 10 | from PIL import Image 11 | 12 | 13 | def main(inp: Dict[str, str], argv: Sequence[str] = ()) -> None: 14 | data_object_path = inp["data_object"] 15 | annotation_path = inp["annotation"] 16 | output_dir = inp["output_dir"] 17 | transform_name = inp["transform_name"] 18 | 19 | transform_args = [int(x) for x in argv[1:]] 20 | if len(transform_args) > 3: 21 | print( 22 | "ERROR: Too many args\n" 23 | "usage: random_rotate.py [ [start] stop [step] ]", 24 | file=sys.stderr, 25 | ) 26 | sys.exit(1) 27 | if not transform_args: 28 | transform_args = [360] 29 | degrees = random.randrange(*transform_args) 30 | orig_image = Image.open(data_object_path) 31 | prefix, ext = os.path.splitext(os.path.basename(data_object_path)) 32 | if ext.lstrip("."): 33 | fmt = None 34 | else: 35 | fmt = orig_image.format or "PNG" 36 | ext = "" 37 | 38 | str_args = "-".join(map(str, transform_args)) 39 | new_image = rotate_and_crop(orig_image, degrees) 40 | file_name_base = f"{prefix}--{transform_name}--{str_args}".replace( 41 | ".", 42 | "-", 43 | ) 44 | obj_file_path = os.path.join(output_dir, f"{file_name_base}{ext}") 45 | annot_file_path = os.path.join(output_dir, 
f"{file_name_base}.json") 46 | rgb_im = new_image.convert('RGB') 47 | rgb_im.save(obj_file_path, format=fmt) 48 | shutil.copy2(annotation_path, annot_file_path) 49 | 50 | 51 | def rotate_and_crop(image, degrees): 52 | """ 53 | Rotate the given image by `degrees`, and crop the resulting image 54 | to the largest rectangle with the original aspect ratio. 55 | """ 56 | rotated_image = image.rotate(degrees, expand=True) 57 | w1, h1 = image.size 58 | w2, h2 = rotated_image.size 59 | r1 = w1 / h1 60 | r2 = w2 / h2 61 | angle = abs(degrees) * pi / 180 62 | if w1 < h1: 63 | total_height = w1 / r2 64 | else: 65 | total_height = h1 66 | h = abs(total_height / (r1 * abs(sin(angle)) + abs(cos(angle)))) 67 | w = h * r1 68 | 69 | x1 = (w2 - w) / 2 70 | x2 = w2 - x1 71 | y1 = (h2 - h) / 2 72 | y2 = h2 - y1 73 | return rotated_image.crop([x1, y1, x2, y2]) 74 | 75 | 76 | if __name__ == "__main__": 77 | main(json.loads(sys.stdin.read()), sys.argv) 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | /data 140 | 141 | # PyAnnotate 142 | type_info.json 143 | -------------------------------------------------------------------------------- /transforms/textocr_crops_single.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | A script for use with the `ldb instantiate` command's `--apply` option. It 4 | expects data from the TextOCR dataset, and demonstrates how to find 5 | segmentations with a regex and generate an image from each segmentation's 6 | bounding box. 
7 | 8 | For example: 9 | 10 | ldb instantiate --apply python3 path/to/textocr_crops.py '(?i)^(V?I{1,3}|I?[VX])$' 11 | 12 | For information on TextOCR, see https://textvqa.org/textocr/dataset/ 13 | """ 14 | import json 15 | import os 16 | import re 17 | import sys 18 | from typing import Dict, Sequence 19 | 20 | from PIL import Image 21 | 22 | 23 | def main(inp: Dict[str, str], argv: Sequence[str] = ()) -> None: 24 | if len(argv) != 1: 25 | raise ValueError("expected exactly one argument, a regex pattern") 26 | pattern = argv[0] 27 | data_object_path = inp["data_object"] 28 | annotation_path = inp["annotation"] 29 | output_dir = inp["output_dir"] 30 | transform_name = inp["transform_name"] 31 | create_roman_numeral_crops( 32 | data_object_path, 33 | annotation_path, 34 | output_dir, 35 | pattern, 36 | transform_name, 37 | ) 38 | 39 | 40 | def create_roman_numeral_crops( 41 | data_obj_path: str, 42 | annot_path: str, 43 | dest_dir: str, 44 | pattern: str, 45 | transform_name: str = "", 46 | ) -> None: 47 | with open(annot_path, encoding="utf-8") as source_file: 48 | raw_annot = source_file.read() 49 | annot = json.loads(raw_annot) 50 | sub_annots = [ 51 | a for a in annot["anns"] if re.search(pattern, a["utf8_string"]) 52 | ] 53 | if sub_annots: 54 | data_obj_name = os.path.basename(data_obj_path) 55 | dest_base, ext = os.path.splitext( 56 | os.path.join(dest_dir, data_obj_name), 57 | ) 58 | if transform_name: 59 | dest_base = f"{dest_base}--{transform_name}" 60 | source_img = annot["img"] 61 | img = Image.open(data_obj_path) 62 | for i, sub_annot in enumerate(sub_annots, 1): 63 | new_path_base = f"{dest_base}--{i:03}" 64 | new_data_obj_path = f"{new_path_base}{ext}" 65 | new_annot_path = f"{new_path_base}.json" 66 | new_img = crop_bbox(img, sub_annot["bbox"]) 67 | new_annot = { 68 | "source_img": source_img, 69 | "annotation": sub_annot, 70 | } 71 | new_raw_annot = json.dumps(new_annot, indent=2) 72 | with open(new_annot_path, "x", encoding="utf-8") as dest_file: 73 
| dest_file.write(new_raw_annot) 74 | new_img.save(new_data_obj_path) 75 | 76 | 77 | def crop_bbox(image: Image.Image, bbox: Sequence[float]) -> Image.Image: 78 | x, y, width, height = bbox 79 | return image.crop([x, y, x + width, y + height]) # type: ignore[arg-type] 80 | 81 | 82 | if __name__ == "__main__": 83 | main(json.loads(sys.stdin.read()), sys.argv[1:]) 84 | -------------------------------------------------------------------------------- /apply-plugins/textocr_crops.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | A script for use with the `ldb instantiate` command's `--apply` option. It 4 | expects data from the TextOCR dataset, and demonstrates how to find 5 | segmentations with a regex and generate an image from each segmentation's 6 | bounding box. 7 | 8 | For example: 9 | 10 | ldb instantiate --apply python3 path/to/textocr_crops.py '(?i)^(V?I{1,3}|I?[VX])$' 11 | 12 | For information on TextOCR, see https://textvqa.org/textocr/dataset/ 13 | """ 14 | import json 15 | import os 16 | import re 17 | import sys 18 | from typing import Sequence 19 | 20 | from PIL import Image 21 | 22 | 23 | def main(argv: Sequence[str], stdin_text: str) -> None: 24 | if len(argv) != 1: 25 | raise ValueError("expected exactly one argument, a regex pattern") 26 | source_dir, dest_dir = json.loads(stdin_text) 27 | generate_crops(source_dir, dest_dir, argv[0]) 28 | 29 | 30 | def generate_crops(source_dir: str, dest_dir: str, pattern: str) -> None: 31 | file_names = set(os.listdir(source_dir)) 32 | pairs = {} 33 | for entry in file_names: 34 | if not entry.endswith(".json"): 35 | annot_path = os.path.splitext(entry)[0] + ".json" 36 | if annot_path in file_names: 37 | pairs[annot_path] = entry 38 | 39 | for annot_name, data_obj_name in pairs.items(): 40 | create_roman_numeral_crops( 41 | os.path.join(source_dir, data_obj_name), 42 | os.path.join(source_dir, annot_name), 43 | dest_dir, 44 | pattern, 45 | ) 
46 | 47 | 48 | def create_roman_numeral_crops( 49 | data_obj_path: str, 50 | annot_path: str, 51 | dest_dir: str, 52 | pattern: str, 53 | transform_name: str = "", 54 | ) -> None: 55 | with open(annot_path, encoding="utf-8") as source_file: 56 | raw_annot = source_file.read() 57 | annot = json.loads(raw_annot) 58 | sub_annots = [ 59 | a for a in annot["anns"] if re.search(pattern, a["utf8_string"]) 60 | ] 61 | if sub_annots: 62 | data_obj_name = os.path.basename(data_obj_path) 63 | dest_base, ext = os.path.splitext( 64 | os.path.join(dest_dir, data_obj_name), 65 | ) 66 | if transform_name: 67 | dest_base = f"{dest_base}--{transform_name}" 68 | source_img = annot["img"] 69 | img = Image.open(data_obj_path) 70 | for i, sub_annot in enumerate(sub_annots, 1): 71 | new_path_base = f"{dest_base}--{i:03}" 72 | new_data_obj_path = f"{new_path_base}{ext}" 73 | new_annot_path = f"{new_path_base}.json" 74 | new_img = crop_bbox(img, sub_annot["bbox"]) 75 | new_annot = { 76 | "source_img": source_img, 77 | "annotation": sub_annot, 78 | } 79 | new_raw_annot = json.dumps(new_annot, indent=2) 80 | with open(new_annot_path, "x", encoding="utf-8") as dest_file: 81 | dest_file.write(new_raw_annot) 82 | new_img.save(new_data_obj_path) 83 | 84 | 85 | def crop_bbox(image: Image.Image, bbox: Sequence[float]) -> Image.Image: 86 | x, y, width, height = bbox 87 | return image.crop([x, y, x + width, y + height]) # type: ignore[arg-type] 88 | 89 | 90 | if __name__ == "__main__": 91 | main(sys.argv[1:], sys.stdin.read()) 92 | -------------------------------------------------------------------------------- /documentation/alternatives-to-LDB.md: -------------------------------------------------------------------------------- 1 | ## Alternatives in dataset organization: ## 2 | 3 | Here we discuss some popular dataset organization methods. 4 | 5 | ### Folder-level dataset organization. ### 6 | 7 | The simplest method to manage datasets is by grouping samples into named file folders. 
Many popular datasets (COCO, ImageNet, etc.) are shipped as folders, and most Kaggle datasets will look like folders after setup. 8 | 9 | This method is great when work focuses on the AI models, yet it leads to serious limitations with Data-Centric AI: 10 | 11 | * Experimenting on a dataset (adding or removing data objects) results in multiple copies of the *same* dataset with minimal changes. 12 | 13 | * Folders are not easy to slice and dice, and retain no metadata to keep track of changes and object provenance. 14 | 15 | * Attempts to add samples may result in repetitions (identical objects under multiple names) or data loss (samples overwritten with name collisions). 16 | 17 | * Annotation updates are not versioned, and may cause annotations to go stale. 18 | 19 | * Folder-level access granularity is difficult to integrate with privacy policies and regulation directives. 20 | 21 | ### Spreadsheets (or other database-powered designs). ### 22 | 23 | A reasonable step up from managing datasets in folders is to organize datasets as tables filled with pointers (URIs of data objects). 24 | 25 | This method permits sparse datasets where individual objects are no longer required to reside in one folder or one cloud bucket. Since spreadsheet-based datasets decouple storage from membership, they no longer require objects to be copied (or moved) to form new datasets, and allow storing any meta-information as column attributes. In addition, versioning for datasets and annotations can be provided by means of multiple tables corresponding to different versions. 26 | 27 | Spreadsheets, however, still carry significant limitations: 28 | 29 | * They do not solve the problem of repetitions (same data objects appearing under different URIs), and cannot prevent annotations from going stale. Both of these problems require tracking objects by content – which spreadsheets cannot do. 30 | 31 | * Spreadsheets do not provide native means to assemble datasets from queries.
This means an ML engineer needs to compose object lists manually, or use ad-hoc software to query annotations and export matching objects into the tables. 32 | 33 | * Use of spreadsheets and databases to store datasets forces ML engineers to use unfamiliar tools that are hard to integrate with MLOps. Forming a dataset and registering it in a database is a process with many touchpoints. 34 | 35 | ### Heavyweight ML frameworks ### 36 | 37 | Finally, it is fairly common to find dataset management functions in large, heavyweight ML frameworks. For example, any data labeling software likely has some ability to track annotation versions and search annotations by fields. Likewise, every end-to-end ML platform facilitates some organization of input data into the datasets, at least at a folder level. 38 | 39 | While end-to-end ML platforms can be extremely successful in vertical-specific applications, they are difficult to recommend in a general case. 40 | 41 | Unlike these platforms, LDB follows Unix toolchain philosophy and solves exactly one problem – it sits between the (immutable) data storage and the mutable model training workspace, and allows for reproducible and fast data-driven iterations. This enables an easy integration with any labeling software upstream, or any experiment automation downstream. 42 | -------------------------------------------------------------------------------- /documentation/Label-studio.md: -------------------------------------------------------------------------------- 1 | # LDB and Label Studio 2 | 3 | ## Roman Numeral Bounding Boxes 4 | 5 | This example shows how to use LDB alongside Label Studio. The basic steps are: 6 | 7 | 1. Use LDB to download some data from a remote data lake. 8 | 2. Use Label Studio to annotate this data. 9 | 3. Import your Label Studio annotations back into LDB. 10 | 11 | This assumes you have both `ldb` and `label-studio` installed. 
Separate virtual envs for each are recommended: 12 | ```bash 13 | pip install ldb-alpha 14 | pip install label-studio 15 | ``` 16 | 17 | Create a new LDB instance and index some data: 18 | ```bash 19 | mkdir -p ~/projects/roman-numerals 20 | cd ~/projects/roman-numerals 21 | 22 | export LDB_DIR="$PWD/instance" 23 | ldb init "$LDB_DIR" 24 | ldb add-storage s3://ldb-public/remote -o anon true 25 | ldb index --format infer s3://ldb-public/remote/data-lakes/roman-numerals/val/ 26 | ldb instantiate ds:root --limit 3 -t data 27 | ``` 28 | 29 | In another terminal, navigate to this same directory and use Label Studio's script to host these files locally: 30 | ```bash 31 | cd ~/projects/roman-numerals 32 | wget https://raw.githubusercontent.com/heartexlabs/label-studio/3b394c3bf997abe28100e4ef4143f347f6083a69/scripts/serve_local_files.sh 33 | ./serve_local_files.sh data '*.png' 34 | ``` 35 | 36 | This will create a `files.txt` file in the current directory with a list of localhost URLs. 37 | 38 | Leave this process running while you import data to Label Studio, annotate it, export the annotations, and index the new annotations with LDB. If you stop it at any point, simply run the last command again to start it. 39 | 40 | Now you can start a Label Studio instance: 41 | ```bash 42 | label-studio start --username user@example.com --password your-password --user-token abc123 43 | ``` 44 | 45 | This should open a browser window with a Label Studio UI. You can log in with the username and password from the previous command. Now you may do the following steps from the UI or command line: 46 | 1. Create a new project 47 | 2. Import `files.txt`. In the UI, if prompted with an option "Treat CSV/TSV as [ ] List of tasks [ ] Time Series", select "List of tasks". 48 | 3. Add your labeling configuration. In the UI, navigate within your project to Settings > Labeling Interface and paste the config into the Code box. Then click Save.
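Step 3 mentions a labeling configuration but the walkthrough never shows one. The following is a minimal sketch of a bounding-box config that could be saved as the `config.xml` referenced by the curl commands: the `Image`/`RectangleLabels` layout is standard Label Studio, but the label values (`I`, `V`, `X`) and the `$image` data key are assumptions for this roman-numeral task and may need adjusting to match how your imported tasks are keyed.

```xml
<View>
  <!-- Shows the task's image; $image is assumed to be the data key of the imported URL list -->
  <Image name="image" value="$image"/>
  <!-- Draw rectangles tagged with one of the labels below -->
  <RectangleLabels name="label" toName="image">
    <Label value="I"/>
    <Label value="V"/>
    <Label value="X"/>
  </RectangleLabels>
</View>
```

If your tasks use a different data key, change `value="$image"` accordingly in the Labeling Interface code box.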
49 | 50 | To create a new project and import the `files.txt` file from the command line, replace `abc123` with your token and run: 51 | ```bash 52 | curl -H Content-Type:application/json -H 'Authorization: Token abc123' -X POST 'http://localhost:8080/api/projects' --data "$(cat config.xml | jq -R --slurp '{label_config: .}')" 53 | curl -H 'Authorization: Token abc123' -X POST 'http://localhost:8080/api/projects/1/import' -F file=@"$HOME/projects/roman-numerals/files.txt" 54 | ``` 55 | 56 | Now you can annotate your files in the Label Studio UI by clicking a task, selecting a label, and marking a bounding box. For example, you may want to create new samples by taking an image of `IX` and marking the `I` and the `X` separately. 57 | 58 | Finally, export your newly annotated data in JSON format using the "Export" button in the UI, or by running: 59 | ```bash 60 | curl -X GET http://localhost:8080/api/projects/1/export?exportType=JSON -H 'Authorization: Token abc123' --output 'annotations.json' 61 | ``` 62 | 63 | Now you may index this file to import the annotations into LDB: 64 | ```bash 65 | ldb index --ephemeral-remote --format label-studio annotations.json 66 | ``` 67 | 68 | Then, to see the bounding boxes of the updated annotations, run: 69 | ```bash 70 | ldb eval ds:root --query annotations --query 'annotations[].result[].value' 71 | ``` 72 | -------------------------------------------------------------------------------- /documentation/Quick-start-teams.md: -------------------------------------------------------------------------------- 1 | LDB works by building an index over storage locations. It therefore needs space to store data (an LDB instance), and relies on data engineering discipline to ensure that indexed storage locations are persistent and pointers into this storage remain valid.
2 | 3 | To simplify things for individual users, by default LDB will automatically create an instance in the home directory at `~/.ldb` the first time the commands `STAGE` or `GET` are called. This personal LDB instance comes with a storage configuration treating all remote locations (s3, gcp, azure, http) as valid persistent storage. 4 | 5 | Team configuration of LDB requires a slightly different setup. First, the LDB instance must reside on a shared volume, and second, care must be taken to configure the storage locations that are persistent and safe to index. 6 | 7 | # LDB team setup # 8 | 9 | When starting a shared LDB instance, the first decision is where to house it. 10 | 11 | A shared LDB instance must reside in a shared filesystem folder that is available to all team members, and fast enough for queries to operate efficiently. In the example below, a drive mounted at location `/data` is shared across the ML team, and the LDB instance is placed into the folder `corporate-LDB`. 12 | 13 | ### Setting up a new LDB instance 14 | 15 | | Step | Command | 16 | | --- | --- | 17 | | Create a new LDB instance | `$ ldb init /data/corporate-LDB` | 18 | 19 | 20 | When running LDB commands, they need to be pointed towards an active LDB instance. This is done with one of two methods: an environment variable or a configuration file. By default, an LDB configuration file is expected to reside in the user home directory, folder `.ldb`, but if both config methods are present, the environment variable takes precedence: 22 | 23 | | Step | Command | 24 | | --- | --- | 25 | | Save LDB location into environment | `$ export LDB_DIR=/data/corporate-LDB` | 26 | | Save LDB location into a configuration file | `$ mkdir ~/.ldb; echo "LDB_DIR=/data/corporate-LDB" > ~/.ldb/.config` | 27 | 28 | ### Registering new LDB storage locations 29 | 30 | LDB assumes data objects are immutable and live in pre-defined storage locations (cloud buckets or data lake folders).
30 | 31 | A good engineering discipline is to gate access to these locations to ensure the data objects are not accidentally moved or deleted. LDB supports local storage, AWS, Google Cloud, and Azure as storage targets. LDB configuration for accessing these cloud providers is discussed in [Setting Access to Cloud Locations](/TODO). 32 | 33 | You can add new storage locations to LDB at any time, but you cannot remove storage locations that are already referenced in existing datasets. 34 | 35 | | Step | Command | 36 | | --- | --- | 37 | | Add a new storage location | ` $ ldb add-storage gs://my-awesome-bucket/` | 38 | 39 | ### Registering a "read-add" storage location 40 | 41 | By default, LDB only indexes objects from immutable storage locations. LDB only needs read and stat permissions for those locations and relies on external data engineering processes to get new samples there. However, in some cases, it is convenient to add new samples to the LDB index right from your data workspace. For example, an engineer may choose to quickly modify an image, audio recording, or text while browsing data in an LDB workspace. 42 | 43 | In that case, LDB can be configured to support a 'read-add' storage location where new data objects are copied when indexing from ephemeral locations (such as a workspace). This configuration must be explicitly specified, and requires "append" privileges at this location: 44 | 45 | | Step | Command | 46 | | --- | --- | 47 | | Add a new 'read-add' storage location | ` $ ldb add-storage -a gs://our-append-bucket/` | 48 | 49 | 50 | -------------------------------------------------------------------------------- /documentation/formats.md: -------------------------------------------------------------------------------- 1 | ## Metadata and annotation formats 2 | 3 | LDB stores, searches and versions metadata and annotations in JSON. LDB does not dictate any specific schema, and just needs to understand how specific data objects (e.g.
`cat1.png`) are related to the information that describes them (e.g. `cat1.json`, or some JSON array that references `cat1.png`). Different *brand-name* dataset formats may employ incompatible ways to encode this relation, so LDB commands like INDEX and INSTANTIATE might need an explicit argument to specify a format under the `--format` flag. 4 | 5 | Here is what is supported so far: 6 | 7 | * `auto | auto-detect` – auto-detected data format. Supports detection of: `strict-pairs`, `annotation-only`, `tensorflow-inferred` 8 | 9 | * `strict | strict-pairs` – "native" LDB format that assumes data and metadata come in pairs of files (object + json). The annotation file in each pair must have a name ending with `.json`. The data object files are matched to their annotations by sharing filenames (e.g. `cat1.json` + `cat1.jpg`), and pairs must reside in the same directory. LDB does not impose restrictions on the actual JSON schema and will accept any valid JSON content. 10 | 11 | * `bare | bare-pairs` – complete pairs are detected as with `strict-pairs`, but bare data objects (data without annotation files) are also indexed (any file whose name does not end with `.json` will be considered a data object). This format is the primary way to index unannotated data. 12 | 13 | * `infer | tensorflow-inferred` – label-only format based on the `labels="inferred"` option in TensorFlow's [tf.keras.utils.image_dataset_from_directory](https://www.tensorflow.org/api_docs/python/tf/keras/utils/image_dataset_from_directory) method. Files supplied in this format must have names that do not end with `.json`. The name of the directory passed to INDEX is used as the label for all data objects inside, and objects within subdirectories will have nested labels.
14 | 15 | For example, the call `ldb index --format tensorflow-inferred ~/data/animals/cat/` would result in the object at path `~/data/animals/cat/0001.png` having the annotation `{"label": "cat"}`, the object at path `~/data/animals/cat/tabby/0001.png` having the nested annotation `{"label": {"cat": "tabby"}}`, and so on. Note that for successful conversion of a label from another format into tensorflow-inferred, the annotation must have the "label" JSON key; conversion will fail otherwise. 16 | 17 | * `annot | annotation-only` - this format targets labeling teams and allows passing modified annotations back and forth without the attached data objects. 18 | 19 | It therefore assumes that data objects were already indexed by LDB, and a folder in `annotation-only` format only has new annotations in .json files. To match these annotations to entries in the LDB index, they must contain a JSON object with the key `ldb_meta.data_object_id` pointing to a hashsum. This hash must match some data object already known to the LDB index. The actual annotation is stored under the "annotation" key, for example: 20 | 21 | ``` 22 | { 23 | "annotation": { 24 | "label": 1 25 | }, 26 | "ldb_meta": { 27 | "data_object_id": "2c4a9d28cc2ce780d17bea08d45d33b3" 28 | } 29 | } 30 | ``` 31 | This results in LDB indexing the following annotation for data object `id:2c4a9d28cc2ce780d17bea08d45d33b3`: 32 | 33 | ``` 34 | { 35 | "label": 1 36 | } 37 | ``` 38 | * `label-studio` - This handles exports from a Label Studio instance in the [JSON format](https://labelstud.io/guide/export.html#JSON). Each export should be a single JSON file containing a top-level array of JSON objects following Label Studio's [raw JSON format](https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks). LDB will treat each JSON object in this top-level array as an annotation.
39 | 40 | Under Label Studio's JSON format, [certain keys](https://labelstud.io/guide/export.html#Relevant-JSON-property-descriptions) are expected to be present, including a `data` key with information about the labeling task. LDB will populate some data under `data.data-object-info`, if it doesn't already exist. In particular, LDB will make sure the following fields exist: 41 | * `data.data-object-info.path_key` - The key of the data object's URI. Usually a sub-key of `data`, such as `data.image`. This can be inferred by LDB if there is only one key under `data` aside from `data-object-info`. If present, LDB will use the existing value. LDB needs the URI of the data object in order to index it if it hasn't already been indexed by LDB. 42 | * `data.data-object-info.md5` - The MD5 hash of the data object. If this key is already present, and the hash matches a data object LDB has indexed previously, then LDB does not need to index this annotation's data object. 43 | 44 | These fields allow Label Studio tasks to be passed between LDB and Label Studio instances repeatedly while maintaining consistent data object identifiers and avoiding repeated indexing of the same data objects. In order to export data from LDB that was indexed using the `label-studio` format, stage it as a working dataset and run: 45 | ``` 46 | ldb instantiate --format label-studio 47 | ``` 48 | This will generate a single `annotations.json` file which you can then [import into a Label Studio instance](https://labelstud.io/guide/tasks.html#How-to-import-your-data).
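Producing files in the `annotation-only` format described earlier is easy to script. The sketch below is a hypothetical helper, not part of LDB: the function name `wrap_annotation` is our own, and it assumes the `data_object_id` hashsum is the MD5 of the data object's bytes (the example hashes above are 32 hex digits, i.e. MD5-sized).

```python
# Hypothetical helper (not part of LDB): wrap a plain annotation into
# the annotation-only format. Assumes data_object_id is the MD5 hashsum
# of the data object's bytes.
import hashlib
import json
from pathlib import Path

def wrap_annotation(data_path, annotation):
    """Return an annotation-only record for the given data object."""
    md5 = hashlib.md5(Path(data_path).read_bytes()).hexdigest()
    return {
        "annotation": annotation,
        "ldb_meta": {"data_object_id": md5},
    }

# Usage sketch (writes cat1.json for `ldb index --format annotation-only`):
# Path("cat1.json").write_text(
#     json.dumps(wrap_annotation("cat1.jpg", {"label": 1}), indent=2))
```

Writing the resulting JSON objects into `.json` files in an otherwise empty folder yields input suitable for indexing with `--format annotation-only`.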
49 | 50 | 51 | TODO 52 | 53 | * `annotation-only` format extension that serves http/https 54 | * "path" key in "ldb_meta" object of 'annotation-only' to specify an object location 55 | * top-level array in 'annotation-only' to describe multiple files 56 | * COCO 57 | * Google ImageNet 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2021 Iterative, Inc. 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /_README_draft.md: -------------------------------------------------------------------------------- 1 | # β README 2 | 3 | Label Database (**LDB**) is an **open-source** tool for **data-centric** AI and machine learning projects. It works **upstream from model training** and indexes data in *cloud storage* and *data lakes*, organizing pointers to data samples into datasets. 4 | 5 | **LDB** aims to displace ad-hoc dataset management and de-duplication tools – such as file folders, spreadsheets and SQL databases. In the upstream direction, LDB can interface with labeling software, and in the downstream direction LDB integrates with ML pipelines. 6 | 7 | **Key LDB features**: 8 | 9 | * Unix-like **command line** tool 10 | * no changes to existing data storage. Data objects can be stored anywhere in local storage, the web, S3, Google Cloud, or Azure. There is **no need** to **move or duplicate** data objects in order to create, share or modify an LDB dataset. 11 | * advanced manipulation and versioning for datasets.
Collections can be cloned, queried, merged, and sampled. **Every change in a dataset is tracked.** 12 | * label-aware operations. Objects can be selected based on **annotation metadata, file attributes, or custom ML model queries**, and changes to ingested object metadata are versioned. 13 | * **reproducible,** **shareable, and fast to materialize**. A particular dataset version will always point to the same set of data objects and annotations. Data samples can be placed in a shared cache during instantiation, so transfers from remote locations are accelerated. 14 | 15 | Full LDB command summary [here](documentation/Command-summary.md) 16 | 17 | ### Contents 18 | 19 | - [How LDB works](#how-ldb-works) 20 | - [Quick start](#quick-start) 21 | - [Comparison to related technologies](#comparison-to-related-technologies) 22 | - [Contributing to LDB](#contributing) 23 | 24 | 25 | ### How LDB works 26 | 27 | Data objects in ML normally start life from data mining, data labeling, data cleansing, or data synthesis and accumulate at storage locations. LDB can index these locations and note unique data objects along with their annotations (if present). Queries into the LDB index can then be used to construct datasets that work like collections of pointers into storage. Since LDB does not save data objects internally, it relies on persistent storage locations to access samples in the future. This means read-only access to protected storage is sufficient for LDB, and LDB dataset operations can never endanger original data objects. 28 | 29 | 30 | 31 | The main use case for LDB is to organize objects into collections (datasets) for training or testing. Datasets can then be shared and versioned within LDB, which makes collaboration on dataset membership (cloning, merging, splitting, adding, and removing objects) manageable and reproducible. 32 | 33 | Since LDB datasets are logical, they must be materialized (instantiated) prior to use.
Whenever a dataset needs to be materialized (for instance, to run a model experiment), LDB copies all relevant objects from storage and compiles the linked annotations. 34 | 35 | For as long as object storage remains intact and the logical dataset state is saved within LDB, a physical dataset instance created by LDB can always be safely erased after the experiment is complete. 36 | 37 | ## Quick Start 38 | Please refer to the [sample LDB workflow](documentation/Getting-started-with-LDB.md) for a more detailed example of data-driven AI methodology and to the [LDB command summary](documentation/Command-summary.md) for additional information on command options. 39 | 40 | 41 | 42 | An **LDB instance** is a persistent structure where all information about known objects, labels and datasets is stored. To set up a shared LDB instance for a team or organization, please follow [LDB team setup](documentation/Quick-start-teams.md). If no LDB instance is found, a private one will be created automatically in the `~/.ldb` directory the first time an LDB dataset is staged. 43 | 44 | | Step | Command | 45 | | --- | --- | 46 | | Install LDB | ```$ pip install 'ldb-alpha[clip-plugin,resnet-plugin]'``` | 47 | 48 | ### Forming datasets by querying annotations 49 | 50 | The ability to issue complex queries is key to dataset formation in LDB.
For demo purposes, we will use a web-hosted image dataset with annotations in the following JSON format, denoting animal class, size, and eye positions: 51 | 52 | ```json 53 | { 54 | "class": "cat", 55 | "features": { 56 | "left-eye": { 57 | "x": 318, 58 | "y": 222 59 | }, 60 | "right-eye": { 61 | "x": 340, 62 | "y": 224 63 | } 64 | }, 65 | "size": "large" 66 | } 67 | ``` 68 | 69 | | Step | Command | 70 | | --- | --- | 71 | | Cats size L | ```$ ldb get s3://ldb-public/remote/ds/cats/ --query 'size == `large`' large-cats``` | 72 | | Small heads | ```$ ldb get --query 'sub(features."right-eye".x, features."left-eye".x) < `30`' small-head``` | 73 | 74 | Now we should have a folder `large-cats` with instantiated data samples annotated as `"size": "large"`, and a folder `small-head` with samples whose annotated horizontal distance between the animal's eyes is less than 30 pixels. LDB can support very complex JSON queries that would normally require custom programming by making good use of the extended JMESPath query language (see [LDB queries](documentation/LDB-queries.md) for details). 75 | 76 | 77 | The LDB command `GET` in the examples above does four distinct things: it creates a target folder, stages a namesake dataset there, performs a logical addition of the objects matched by the query, and instantiates the result. 78 | 79 | 80 | * Note that objects in folders `large-cats` and `small-head` can overlap – for example, the same animal can be labeled `"size": "large"` but not occupy much real estate in the image. In that case, the same object will be present in both folders, yet LDB is smart enough to avoid double transfer and storage by using a local cache. 81 | 82 | * Also note that the first query explicitly referenced cloud storage, while the second did not. LDB indexes unknown data objects at first encounter, so subsequent queries can run from the internal LDB index addressable under the reserved name "root".
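The two filters above are plain predicates over the annotation JSON, so they can be sanity-checked in ordinary Python before being written as JMESPath expressions (a sketch over the sample annotation; LDB itself evaluates the real queries with its extended JMESPath engine):

```python
# Sketch: mirror the two example filters in plain Python as a sanity
# check. This is not LDB code; LDB evaluates the equivalent JMESPath
# expressions internally.
annotation = {
    "class": "cat",
    "features": {
        "left-eye": {"x": 318, "y": 222},
        "right-eye": {"x": 340, "y": 224},
    },
    "size": "large",
}

# 'size == `large`' -- large cats
is_large = annotation["size"] == "large"

# 'sub(features."right-eye".x, features."left-eye".x) < `30`' -- small heads
eye_span = (annotation["features"]["right-eye"]["x"]
            - annotation["features"]["left-eye"]["x"])
is_small_head = eye_span < 30

print(is_large, eye_span, is_small_head)  # True 22 True
```

With this sample annotation both filters pass, which is why the same object can land in both `large-cats` and `small-head`.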
83 | 84 | ### Forming datasets by querying object file attributes 85 | 86 | At index time, LDB also stores object attributes that can be queried in the same way a conventional file search tool would work over storage. For example, LDB can filter objects by pattern matches in the storage path and by creation timestamp range: 87 | 88 | | Step | Command | 89 | | --- | --- | 90 | | Regex to object path | ```$ ldb get --path 'cat[0-9][0-3].*' misc-cats``` | 91 | | Range of ctimes | ```$ ldb get --file 'fs.ctime < `"2022-03-28"`' --file 'fs.ctime > `"2022-03-25"`' misc-cats ``` | 92 | 93 | * The LDB index (ds:root) is the default source for objects, so LDB commands query all known objects unless instructed otherwise 94 | * The first `GET` command stages a new dataset 'misc-cats' in a namesake folder, and the second command adds to it. 95 | * The time-based query uses two `--file` filters, intersecting their results to form a time interval 96 | 97 | ### Query debugging 98 | 99 | JMESPath queries can quickly become complicated, so it is useful to understand how LDB constructs and evaluates them. 100 | 101 | LDB treats any expression that results in null, boolean false, or an empty object as 'falsy', failing the filter, and treats every other output (including 0) as 'truthy', passing the filter. Any reference to a non-existing key immediately fails the filter. 102 | 103 | To understand exactly what LDB does in each case, use the `EVAL` command and observe the result of JSON query reduction.
`EVAL` without --query simply returns the entire annotation: 104 | 105 | ``` 106 | $ ldb eval 0xffa97795d32350dc450f41c4ce725886 107 | 108 | 0xffa97795d32350dc450f41c4ce725886 109 | { 110 | "class": "cat", 111 | "features": { 112 | "left-eye": { 113 | "x": 318, 114 | "y": 222 115 | }, 116 | "right-eye": { 117 | "x": 340, 118 | "y": 224 119 | } 120 | }, 121 | "size": "small" 122 | } 123 | ``` 124 | 125 | Any missing JSON key in a query produces `null` – which means this query would immediately fail: 126 | 127 | ``` 128 | $ ldb eval 0xffa97795d32350dc450f41c4ce725886 --query 'inference.time' 129 | 130 | RuntimeWarning: MissingIdentifierException: inference 131 | 0xffa97795d32350dc450f41c4ce725886 132 | null 133 | ``` 134 | A valid JMESPath expression will always emit a JSON value: 135 | ``` 136 | $ ldb eval 0xffa97795d32350dc450f41c4ce725886 --query 'class' 137 | 138 | 0xffa97795d32350dc450f41c4ce725886 139 | "cat" 140 | ``` 141 | 142 | ### Custom code for queries 143 | 144 | If none of the existing methods to query annotations or data objects work well, LDB supports custom query code that collects all objects passed through the filters so far (see the [command summary](documentation/Command-summary.md#pipe-plugins) for an API reference). Here is an example of a "useless" filter that sorts objects by their hashsum identifiers: 145 | 146 | ```python 147 | # id_sorted.py 148 | 149 | import json 150 | import sys 151 | 152 | if __name__ == "__main__": 153 | for data_object_hash, *_ in sorted(json.loads(sys.stdin.read())): 154 | print(data_object_hash, flush=True) 155 | ``` 156 | 157 | | Step | Command | 158 | | --- | --- | 159 | | Use of custom query filter | `$ ldb get --pipe python3 ./id_sorted.py --limit 3 misc-cats` | 160 | 161 | ### ML plugins for queries 162 | 163 | One application of custom code is ML plugins that run supplementary ML models to identify objects of interest.
LDB ships with CLIP and ResNet plugins for image filtering, and [other ML plugins](documentation/Plugins.md) can easily be added. This helps, for example, to find objects with features not present in annotations.

Here is an example of using CLIP semantic embeddings to determine which 10 images have content closest to "orange cat":

| Step | Command |
| --- | --- |
| Change into workspace "misc-cats" | ```$ cd misc-cats``` |
| Add the ten images most resembling orange cats | ```$ ldb add ds:root --pipe clip-text 'orange cat' --limit 10``` |

* Note we used the ADD command within the workspace that contains dataset `misc-cats`. ADD results in a logical membership change, so no actual files are copied into the workspace. This is convenient when the dataset is large and does not need immediate instantiation.


### Instantiation

At this point, folder `misc-cats` holds a logical dataset `misc-cats` that is only partially instantiated. In particular, the "orange cat" images were not copied from storage. We can materialize this dataset entirely with the `INSTANTIATE` command, which renders a physical instance of a staged logical dataset:

| Step | Command |
| --- | --- |
| Materialize the entire dataset "misc-cats" | ```$ ldb instantiate``` |

* LDB uses caching to avoid re-downloading objects that were already instantiated by GET before


### Saving and versioning datasets

As we saw with `INSTANTIATE` and `ADD`, many LDB commands are designed to run within a workspace that holds a staged dataset. We can verify whether the current folder is indeed a valid LDB workspace with the `STATUS` command:

| Step | Command |
| --- | --- |
| Check the state of the workspace 'misc-cats' | `$ ldb status` |

As we see from the output, we are indeed in a workspace where a dataset 'misc-cats' was staged.
However, this dataset has changes that are not yet saved into LDB. Pending changes must be communicated to LDB with the `COMMIT` command, which pushes a new dataset version into the LDB instance:

| Step | Command |
| --- | --- |
| Save dataset 'misc-cats' into LDB | `$ ldb commit` |

Let us add more objects from another workspace that we created earlier:

| Step | Command |
| --- | --- |
| Add more objects | `$ ldb add ../large-cats/*png` |
| Check status again | `$ ldb status` |

* ADD can identify objects by reference to a dataset, a workspace, hash ids, or a list of files
* If we save the current state of the workspace, it will create a new version of dataset 'misc-cats':

| Step | Command |
| --- | --- |
| Save 'misc-cats' version 2 | `$ ldb commit` |
| Compare with previous | `$ ldb diff ds:misc-cats.v1` |

* LDB uses the postfix notation `.vNN` to refer to a specific version of a dataset

### Dataset mixing and matching

The combination of queries and the commands `ADD` and `DEL` allows for arbitrary organization of data objects.
Some examples:

| Step | Command |
| --- | --- |
| Stage a new dataset in a workspace | ```$ ldb stage --force new-cats``` |
| Add all objects from a named dataset into the workspace | `$ ldb add ds:misc-cats.v2` |
| Subtract all objects of a named dataset from the workspace | `$ ldb del ds:misc-cats.v1` |
| Fill shuffled objects from a source folder | ```$ ldb add ../small-head --shuffle --limit 10``` |

* STAGE with the flag --force clobbers the contents of the target folder
* To retain objects that are in one dataset but not another, it is sufficient to ADD the first and DEL the second

## Comparison to related technologies

One good question when considering a new ML tool is whether it is worth the investment of time to adopt.

Without a tool like LDB, a team iterating on data typically follows one of the common recipes: (1) datasets as file folders, (2) datasets as spreadsheets, or (3) datasets under the control of an ML framework. All these solutions have their limits, which we discuss in greater detail [here](/documentation/alternatives-to-LDB.md).

A second good question is why one should choose LDB over a general data versioning tool (like [DVC](https://dvc.org/) or [Pachyderm](https://www.pachyderm.com/)). The answer is that the capabilities of LDB and general versioning systems do not overlap.

For example, DVC actively manages the model repository and interprets datasets as cached files under full version control. On the other hand, LDB is an indexing service over immutable storage and treats datasets as collections of pointers. This lightweight approach relies on storage immutability to guarantee access, but offers higher speed and better flexibility. In addition, LDB understands annotations and can group sparse objects into datasets by queries.
If your data is indexed by LDB while your models are run by DVC, the two tools will happily work together, [see more recipes here](documentation/Getting-started-with-LDB.md).


## Contributing

```
TODO
```
--------------------------------------------------------------------------------
/documentation/LDB-queries.md:
--------------------------------------------------------------------------------
# Query examples

LDB uses [JMESPATH](https://jmespath.org/specification.html) as a query language over JSON annotations. This allows for complicated data selections and statistics that would normally require custom programming.

LDB treats JMESPATH expressions as follows:

- All query-enabled commands expand the provided list of objects and evaluate JMESPATH over annotations one by one.
- **ADD**, **DEL**, and **LIST** evaluate the result of JMESPATH as true/false – which determines whether an object is selected for processing. The "falsy" values are: `boolean false, null, empty object, empty list, empty string` (i.e. `[]`, `{}`, `""`, `false`, `null`).
[Everything else, including `0` and `0.0`, evaluates to `true`.](https://jmespath.org/specification.html#or-expressions)


`Tip`: one common mistake to watch for: ``` `null` == `null` ``` -> True

If you evaluate one "falsy" key against another, the result is a match. See "Get objects where certain key is not null or false" below.

- Use the **EVAL** command to print raw JMESPATH query output over annotations.
- LDB uses the `--query` flag to scan object annotations, and the `--file` flag to scan file attributes formatted as JSON. These flags can be pipelined.
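The truthiness rules above can be summarized in a short sketch (plain Python mirroring the listed rules; this is an illustration, not LDB's actual implementation):

```python
def is_match(result):
    """Decide whether a JMESPATH result selects an object under LDB rules.

    Falsy: null (None), false, empty object/list/string.
    Everything else, including 0 and 0.0, is truthy.
    """
    if result is None or result is False:
        return False
    if isinstance(result, (dict, list, str)) and len(result) == 0:
        return False
    return True
```

Note that `0` passing the filter is a deliberate departure from Python's own truthiness, matching the JMESPATH or-expression semantics linked above.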
18 | 19 | Here are some query examples, from simple to more advanced: 20 | 21 | - **Get objects of certain class** 22 | 23 | Input: dataset where image objects have annotations with "class" JSON field: 24 | 25 | ```json 26 | { 27 | "class": "cat", 28 | "breed": { 29 | "type": "main-coon", 30 | "size": "large" 31 | } 32 | } 33 | ``` 34 | 35 | Goal: select objects of certain class 36 | 37 | ```bash 38 | # note backquotes around literal 39 | ldb add ds:root --query 'class == `cat`' 40 | ``` 41 | 42 | - **Get objects where attribute compares against numeric constant** 43 | 44 | Input: dataset where image objects have annotations with "confidence" JSON field: 45 | 46 | ```json 47 | { 48 | "class": "airpods", 49 | "inference": { 50 | "class": "beats", 51 | "confidence": 0.7 52 | } 53 | } 54 | ``` 55 | 56 | Goal: select objects with confidence above certain threshold 57 | 58 | ```bash 59 | # note backquotes on literal 60 | ldb add ds:root --query 'inference.confidence >= `0.8`' 61 | 62 | ``` 63 | 64 | - **Get objects where inference differs from ground truth** 65 | 66 | Input: dataset where objects have annotations with ground truth ("class") and "inference" JSON fields: 67 | 68 | ```json 69 | { 70 | "class": "airpods", 71 | "inference": { 72 | "class": "beats" 73 | } 74 | } 75 | ``` 76 | 77 | Goal: print objects where two keys pass a comparison test 78 | 79 | ```bash 80 | 81 | ldb add ds:root --query 'inference.class != class' 82 | ``` 83 | 84 | - **Compare attribute against JSON object literals** 85 | 86 | Input: Dataset consisting of random objects. 
87 | 88 | ```json 89 | { 90 | "class": "airpods", 91 | "inference": { 92 | "class": "pro", 93 | "confidence": 0.7 94 | } 95 | } 96 | ``` 97 | 98 | Goal: compare a key against a JSON object literal 99 | 100 | ```bash 101 | # note use of backquotes and swapped keys in the object 102 | ldb list --query 'inference == `{"confidence": 0.7, "class": "pro"}`' 103 | 104 | ``` 105 | 106 | - **Dealing with objects where certain key can be missing or null** 107 | 108 | The original JMESPATH specification assumes a missing JSON key to return `null`, which may lead to unintended consequences – e.g. when comparing one missing key to another missing key (this would somewhat unexpectedly return `true`). LDB query implementation therefore differs from JMESPATH standard by means of immediately skipping an object where annotation has referenced missing key. A separate `--jquery` flag exists for full compatibility with JMESPATH standard with respect to missing keys. 109 | 110 | Input: dataset where objects have annotations with "class" JSON field: 111 | 112 | ```json 113 | { 114 | "class": "cat", 115 | "breed": { 116 | "type": "main-coon", 117 | "size": "large" 118 | } 119 | } 120 | ``` 121 | 122 | Goal: print all objects where `breed.type` key is not missing, empty or null: 123 | 124 | ```bash 125 | # non-"falsy" key is resolved to "true" 126 | ldb list --query breed.type 127 | ``` 128 | 129 | Goal: print all objects where `breed.type` is not missing (but can be empty or null): 130 | 131 | ```bash 132 | # this query fails only if the key does not exist 133 | ldb list --query 'breed.type || !breed.type' 134 | ``` 135 | 136 | Goal: include only annotations with "truthy" value under `breed.type`: 137 | ``` 138 | ldb add ds:root --query 'breed.type && class == `cat`' 139 | ``` 140 | 141 | - **Combine query terms** 142 | 143 | Input: dataset where image objects have annotations with "class" JSON field: 144 | 145 | ```json 146 | { 147 | "class": "cat", 148 | "breed": { 149 | "type": 
"main-coon", 150 | "size": "large" 151 | } 152 | } 153 | ``` 154 | 155 | JMESPATH allows for logical grouping of boolean terms with &&, || , ! 156 | 157 | ```bash 158 | # quotes required here to shield && 159 | ldb list --query 'class==`cat` && breed.size==`large`' 160 | 161 | ``` 162 | 163 | Another way to achieve the AND operation is to pipeline multiple `--query` or `--file` flags: 164 | 165 | ```bash 166 | # quotes required here to shield && 167 | ldb list --query 'class==`cat`' --query 'breed.size==`large`' 168 | 169 | ``` 170 | 171 | 172 | - **Simple check for class balance** 173 | 174 | Input: JSON annotation with object classes: 175 | 176 | ```json 177 | { 178 | "class": "cat", 179 | } 180 | ``` 181 | 182 | Display cumulative class counts: 183 | 184 | ```bash 185 | # note backslashes around literals 186 | ldb list -s --query 'class==`cat`' 187 | ldb list -s --query 'class==`dog`' 188 | ``` 189 | 190 | - TODO beta: **Examine objects where inference results differ between model runs** 191 | 192 | Normally, JMESPATH query operates within a particular annotation JSON – namely, the one attached to the object referenced. If an object is a part of some dataset, it will have the same annotation version it was assigned when last added. If an object is referenced from the index, by default it will have the latest annotation version indexed. 193 | 194 | However, sometimes we want to run a JMESPATH query *between* annotation versions of one object. This happens, for example, if we want to compare how some field (e.g. model prediction) has evolved between several annotations. LDB achieves this via `--vquery` flag that modifies JSON tree, prepending an annotation version as root. 
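Conceptually, `--vquery` can be pictured as evaluating the query over a synthetic JSON document whose top-level keys are version labels. The plain-Python sketch below is an assumption-level illustration of that tree shape, not LDB internals:

```python
def vquery_tree(annotation_versions):
    """Prepend version labels (v1, v2, ...) as the root of a synthetic JSON tree."""
    return {f"v{i}": ann for i, ann in enumerate(annotation_versions, start=1)}

tree = vquery_tree([
    {"class": "ii", "inference": {"class": "i"}},
    {"class": "ii", "inference": {"class": "ii"}},
])

# The query 'v1.inference.class != v2.inference.class' then reduces to:
changed = tree["v1"]["inference"]["class"] != tree["v2"]["inference"]["class"]
```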
195 | 196 | Input: dataset where two annotation versions were produced after running the model twice: 197 | 198 | ```json 199 | Annotation example version 1: 200 | 201 | { 202 | "class": "ii", 203 | "inference": { 204 | "class": "i", 205 | } 206 | } 207 | 208 | Annotation example version 2: 209 | 210 | { 211 | "class": "ii", 212 | "inference": { 213 | "class": "ii", 214 | } 215 | } 216 | ``` 217 | 218 | Goal: compare inference results between the two versions: 219 | 220 | ```bash 221 | ldb stage ds:model-run-difference 222 | ldb add ds:roman-numerals --vquery 'v1.inference.class != v2.inference.class' 223 | ``` 224 | 225 | 226 | - **Get objects with a given number of array members (JMESPATH function `length`)** 227 | 228 | Input: JSON annotation that may have several object instances: 229 | 230 | ```json 231 | { 232 | "instances" : [ 233 | { 234 | "label": "cat" 235 | }, 236 | { 237 | "label": "dog" 238 | }, 239 | { 240 | "label": "dog" 241 | } 242 | ] 243 | } 244 | ``` 245 | 246 | Query: add objects with a given number of instances 247 | 248 | ```bash 249 | # note backquotes on literal 250 | ldb add --query 'length(instances) == `3`' 251 | ``` 252 | 253 | - **Class balance statistics** 254 | 255 | Input: JSON annotation with a class name field 256 | 257 | ```json 258 | { 259 | "class": "cat", 260 | "breed": { 261 | "type": "main-coon", 262 | "size": "large" 263 | } 264 | } 265 | ``` 266 | Desired: object count per class 267 | 268 | ```bash 269 | ldb eval -j --query 'class' ds:pets | sort | uniq -c 270 | 100 "cat" 271 | 100 "dog" 272 | 273 | ``` 274 | 275 | - **Histogram printing for numeric parameter** 276 | 277 | Input: JSON annotation that may have several object instances in a dataset 278 | 279 | ```json 280 | { 281 | "instances" : [ 282 | { 283 | "label": "cat" 284 | }, 285 | { 286 | "label": "dog" 287 | }, 288 | { 289 | "label": "dog" 290 | } 291 | ] 292 | } 293 | ``` 294 | 295 | Desired: histogram for distribution of some numeric key across samples in 
dataset (uses external program **hist** from [bashplotlib](https://github.com/glamp/bashplotlib )) 296 | 297 | ```bash 298 | 299 | $ ldb eval -j ds:pets --query 'length(instances)' | hist -n 300 | 301 | 6| o 302 | 5| o 303 | 4| o o o 304 | 3| o o o 305 | 2| o o o 306 | 1| o o o o 307 | ----------- 308 | ``` 309 | - **Statistics evaluation for numeric parameter** 310 | 311 | Input: JSON annotation that has some numeric field 312 | 313 | ```json 314 | { 315 | "img": { 316 | "height": 768, 317 | "width": 1024 318 | } 319 | } 320 | ``` 321 | 322 | Desired: histogram for distribution of numeric key across samples in dataset (uses external program **hist** from [num](https://github.com/numcommand/num)) (requires GNU gawk, e.g. `brew install --build-from-source gawk`) 323 | 324 | ```bash 325 | 326 | ldb eval -j ds:images --query 'img.height' | grep -v null | num stddev median 327 | 328 | 185.858 329 | ``` 330 | 331 | - **Query** **objects when keys can be null** 332 | 333 | Input: JSON annotation that may have several object instances, but existence of array is not guaranteed: 334 | 335 | ```json 336 | { 337 | "instances" : [ 338 | { 339 | "label": "cat" 340 | }, 341 | { 342 | "label": "dog" 343 | }, 344 | { 345 | "label": "mouse" 346 | } 347 | ] 348 | } 349 | ``` 350 | 351 | Let us say we are interested in counting number of instances: 352 | 353 | ```bash 354 | # note backquotes for literal 355 | ldb add --query 'length(instances) == `3`' 356 | ``` 357 | 358 | If we run the above query over annotations where key "instances" is null, JMESPATH function `length()` will fail: 359 | 360 | ```bash 361 | 362 | $ ldb eval 'length(instances) == `3`' 18de96e5871380ce1594b55d906ca816 363 | 364 | ERROR: In function length(), invalid type for value: None, expected one of: ['string', 'array', 'object'], received: "null" 365 | ``` 366 | 367 | To prevent this failure, we can add a check if key is not null: 368 | 369 | ```bash 370 | 371 | ldb add --query 'not_null(instances) && 
length(instances) == `3`' 372 | ``` 373 | 374 | - **Isolate objects with a helper ML model:** 375 | 376 | Input: Dataset consisting of cat images. 377 | 378 | Output: 20 images most close in semantic meaning to "sitting cat". 379 | 380 | ```bash 381 | 382 | ldb add --pipe clip-text 'sitting cat' --limit 20 383 | 384 | ``` 385 | 386 | 387 | Advanced examples 388 | 389 | - **Array operations: indexing**, **slicing, flattening, projections, filters** 390 | 391 | Input: Annotation for object `id:0dc11270eb2c136b454859df4b472aed`: 392 | 393 | ```json 394 | { 395 | "instances" : [ 396 | { 397 | "label": [ 398 | "cat", 399 | "dog" 400 | ] 401 | }, 402 | { 403 | "label": "dog" 404 | }, 405 | { 406 | "label": "mouse" 407 | } 408 | ] 409 | } 410 | ``` 411 | 412 | Indexing: 413 | 414 | ```bash 415 | 416 | ldb eval 'instances[0]' id:0dc11270eb2c136b454859df4b472aed 417 | 418 | { 419 | "label": [ 420 | "cat", 421 | "dog" 422 | ] 423 | } 424 | ``` 425 | 426 | Slicing (python syntax start:step:stop): 427 | 428 | ```bash 429 | 430 | ldb eval 'instances[:2:]' id:0dc11270eb2c136b454859df4b472aed 431 | 432 | [ 433 | { 434 | "label": [ 435 | "cat", 436 | "dog" 437 | ] 438 | }, 439 | { 440 | "label": "mouse" 441 | } 442 | ] 443 | ``` 444 | 445 | Projection: 446 | 447 | ```bash 448 | 449 | ldb eval 'instances[1:].label' id:0dc11270eb2c136b454859df4b472aed 450 | 451 | [ 452 | "dog", 453 | "mouse" 454 | ] 455 | ``` 456 | 457 | Flattening + projection: 458 | 459 | ```bash 460 | 461 | ldb eval 'instances[*].label[]' id:0dc11270eb2c136b454859df4b472aed 462 | 463 | [ 464 | "cat", 465 | "dog", 466 | "dog", 467 | "mouse" 468 | ] 469 | ``` 470 | 471 | Filters: 472 | 473 | ```bash 474 | 475 | ldb eval 'instances[?contains(label,`cat`)]' id:0dc11270eb2c136b454859df4b472aed 476 | 477 | [ 478 | { 479 | "label": [ 480 | "cat", 481 | "dog" 482 | ] 483 | } 484 | ] 485 | ``` 486 | 487 | Pipes (flattening and sequential filters): 488 | 489 | ```bash 490 | 491 | $ ldb eval 
'instances[?contains(label,`dog`)] | [0] | label[?@==`cat`]' id:0dc11270eb2c136b454859df4b472aed 492 | 493 | [ 494 | "cat" 495 | ] 496 | ``` 497 | 498 | - **Object transforms: list, hash** 499 | 500 | Input: Annotation for object `id:18de96e5871380ce1594b55d906ca816`: 501 | 502 | ```json 503 | { 504 | "class": "cat", 505 | "breed": { 506 | "type": "main-coon", 507 | "size": "large" 508 | } 509 | } 510 | ``` 511 | 512 | Multiselect hash: 513 | 514 | ```bash 515 | 516 | ldb eval '{class:class}' id:18de96e5871380ce1594b55d906ca816 517 | 518 | { 519 | "class": "cat" 520 | } 521 | ``` 522 | 523 | Multiselect subexpression and value swap: 524 | 525 | ```bash 526 | 527 | ldb eval '{breed:breed.{type:size}}' id:18de96e5871380ce1594b55d906ca816 528 | 529 | { 530 | "breed": { 531 | "type": "large" 532 | } 533 | } 534 | ``` 535 | 536 | Multiselect list: 537 | 538 | ```bash 539 | 540 | ldb eval '[class, breed]' id:18de96e5871380ce1594b55d906ca816 541 | 542 | [ 543 | "cat", 544 | { 545 | "type": "main-coon", 546 | "size": "large" 547 | } 548 | ] 549 | ``` 550 | 551 | - **Comparing string against regular expression (LDB JMESPATH extension `regex`)** 552 | 553 | Input: Dataset consisting of random object images. 

```json
{
  "class": "airpods",
  "inference": {
      "class": "beats",
      "confidence": 0.7
  }
}
```

Query:

```bash
# note use of backquotes
ldb list --query 'regex(inference.class, `"^b.+s$"`)'
```

LDB JMESPATH func: `regex`

- **Simple math operation (LDB JMESPATH extension `sub`)**

Input: annotation for an audio record

```json
{
  "original_length": 3.774375,
  "segments": [
    {
      "start": 0.2267952733485195,
      "end": 0.390150911161731,
      "labels": [
        "Positive"
      ]
    },
    {
      "start": 1.2267952733485195,
      "end": 1.390150911161731,
      "labels": [
        "Negative"
      ]
    },
    {
      "start": 2.2267952733485195,
      "end": 2.390150911161731,
      "labels": [
        "Positive"
      ]
    }
  ]
}
```

Goal: calculate the cumulative duration of "Positive" sections

```shell
$ ldb eval --query 'sum(segments[?contains(labels, `Positive`)].sub(end, start))'
$ ldb list --query 'sum(segments[?contains(labels, `Positive`)].sub(end, start)) > `0.3`'
```
- Built-in JMESPATH functions

The JMESPATH specification comes with a [vast array of built-in functions](https://jmespath.org/specification.html#built-in-functions) like abs, avg, ceil, contains, not_null, max, sort, and so forth.
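As a rough illustration, the semantics of a few of these built-ins map onto plain Python like this (illustrative analogues only, not LDB's or JMESPATH's implementation):

```python
def length(subject):
    # length(@) works on strings, arrays, and objects
    return len(subject)

def contains(subject, search):
    # contains(@, x): substring test for strings, membership test for arrays
    return search in subject

def not_null(*args):
    # not_null(a, b, ...): first non-null argument, else null
    return next((a for a in args if a is not None), None)

def avg(numbers):
    # avg(@): arithmetic mean of a numeric array
    return sum(numbers) / len(numbers)
```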
616 | 617 | - **Full list of** **LDB-specific functions** 618 | 619 | LDB is bundled with several functions that extend JMESPATH built-ins: 620 | 621 | - `sub(array, scalar)` → array 622 | - `sub(scalar, scalar)` → scalar 623 | - `sub(array, array)` → array # arrays must match in dimensions 624 | - `prod(array, scalar)` → array 625 | - `prod(array, array)` → array 626 | - `div(array, scalar)` → array 627 | - `div(array, array)` → array # arrays must match in dimensions 628 | - `dotproduct(array, array)` → scalar # arrays must match in dimensions 629 | - `unique(array)` → vector # input array is flattened 630 | - `regex(object, regex_query)` → boolean # rules of regex 631 | 632 | - TODO: **Custom (user-defined) functions** 633 | 634 | Users can define custom functions for complex queries. 635 | 636 | For example, a custom function can be coded that accepts coordinates of two bounding boxes and outputs the area of overlapping surface (in pixels). 637 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # α README 2 | 3 | Label Database (**LDB**) is a standalone **open-source** indexing tool for **data-centric** AI and machine learning projects. It works **upstream from model training** and organizes data in *cloud storages*, *filesystems* and *data lakes* into reproducible logical datasets. 4 | 5 | **LDB** aims to displace ad-hoc dataset management, data search and de-duplication devices – such as file folders, spreadsheets, and custom code for data selection/augmentation. In the upstream direction, LDB can interface with labeling software, and in the downstream direction LDB provides data files for clean training and evaluation pipelines – including modern registry-based model cards. 6 | 7 | **Key LDB features**: 8 | 9 | * **command line interface** (MLOps oriented). 
10 | * LDB manages datasets as versioned collections of pointers into storage with automatic de-duplication 11 | * Since LDB datasets use pointers, there is **no need** to **move or copy** data objects to create datasets. 12 | * LDB datasets are purely logical, so they are easily cloned, merged, sliced, and sampled. 13 | * LDB does not require write privileges to storage, so losing an LDB instance does not affect the actual data objects. 14 | * **Search in the cloud:** data objects can be selected based on JSON annotations, file attributes, or helper ML model queries 15 | * **Annotation tracking:** JSON annotation and metadata are tracked and versioned during indexing 16 | * **Reproducibility and sharing:** every LDB dataset version always points to the same collection of data samples and can be easily shared 17 | * **Cloud optimization:** LDB caches objects during instantiation, increasing speed and reducing cloud egress costs 18 | 19 | ### Contents 20 | 21 | - [Installation](#installation) 22 | - [How LDB works](#how-ldb-works) 23 | - [What LDB can do](#what-ldb-can-do) 24 | - [LDB tutorial](#ldb-tutorial) 25 | - [LDB commands](#ldb-commands) 26 | - [LDB versus other versioning tools](#ldb-versus-other-versioning-tools) 27 | - [Contributing to LDB](#contributing) 28 | 29 | ## Installation 30 | 31 | ### pip **(PyPI core)** 32 | 33 | ```sh 34 | pip install ldb-alpha 35 | ``` 36 | 37 | ### installation with AWS and ML plugin support **(optional)** 38 | 39 | ```sh 40 | pip install 'ldb-alpha [s3,clip-plugin,resnet-plugin]' 41 | ``` 42 | 43 | ### add anonymous access to public s3 datasets **(optional)** 44 | ``` 45 | ldb add-storage s3://ldb-public/remote/ -o anon true 46 | ``` 47 | * Sample dataset descriptions [are here](documentation/Datasets.md) 48 | 49 | * Supported metadata and dataset formats [here](documentation/formats.md) 50 | 51 | * Full LDB command summary [is here](documentation/Command-summary.md) 52 | 53 | 54 | ### How LDB works 55 | 56 | LDB indexes 
immutable storage, recording unique data objects along with their associated annotations (if present). This index can then be queried to construct datasets that operate like collections of sparse pointers into the storage. LDB does not save data objects internally; it depends on their persistent storage locations to materialize (instantiate) datasets on demand.




LDB datasets can then be shared and versioned, which makes any membership changes (cloning, merging, splitting, adding, and removing objects) manageable and reproducible.

Whenever a dataset needs to be instantiated (for instance, to run a model experiment), LDB copies all relevant objects from cloud storage into the local workspace and recreates all linked annotations. Since storage is immutable and dataset state is kept within LDB, the local workspace can be safely erased after the experiment is complete.

TODO: LDB supports local caching of instantiated data, so successive object materializations do not need to repeat cloud transfers.

## What LDB can do

70 | Cloud data retrieval, de-duplication and caching 71 | 72 | ᐃ 73 | 74 | The simplest way to form a dataset from cloud and materialize it in LDB is by using the [GET](documentation/Command-summary.md#get) command, which can point to cloud location, index it, add data objects into a specified dataset and instantiate it in one shot: 75 | 76 | ``` 77 | ldb get s3://ldb-public/remote/data-lakes/dogs-and-cats/ --target animals/ 78 | ``` 79 | 80 | 81 |
82 | S3 path indexed, and 200 objects copied into a temporary dataset in folder animals 83 | 84 | 85 | ``` 86 | Staged ds:.temp.2022-06-07T00:46:33.865467+00:00 at 'animals' 87 | Adding to working dataset... 88 | Added 200 data objects to ds:.temp.2022-06-07T00:46:33.865467+00:00 89 | Instantiating data... 90 | 91 | Copied data to workspace. 92 | Data objects: 200 93 | Annotations: 200 94 | ``` 95 | 96 |

Let's try to get the same objects again to see how automatic de-duplication works:

```
ldb get s3://ldb-public/remote/data-lakes/dogs-and-cats/ --target animals/
```
LDB reads the contents of the path but adds nothing, because it recognizes all input objects as duplicates.

TODO BETA: Another benefit of using LDB to serve data objects from cloud locations is caching. When data engineers work on overlapping datasets, this normally requires duplicate file transfers from the cloud, incurring time and cost penalties. LDB solves this problem with an instantiation cache, which is enabled by default.

ᐃ
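De-duplication hinges on identifying each object by a content digest of its payload. The object ids shown elsewhere in this README look like MD5 hex digests, so the sketch below assumes a hash-based index; treat it as an illustration of the mechanism, not LDB's actual code:

```python
import hashlib

index = {}

def object_id(payload: bytes) -> str:
    """Content-derived identifier: identical payloads always collide."""
    return hashlib.md5(payload).hexdigest()

def index_object(payload: bytes) -> bool:
    """Return True if the object was new, False if it was a duplicate."""
    oid = object_id(payload)
    if oid in index:
        return False
    index[oid] = payload
    return True
```

Re-indexing the same bytes a second time is recognized as a duplicate regardless of where the file lives, which is why the repeated `ldb get` above adds nothing.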
109 | 110 |
Find and retrieve data samples by file attributes

ᐃ

Searching data by name patterns and file attributes is easy in filesystems with `find(1)` and similar tools, but is not readily available in the cloud. LDB fills this gap by storing file attributes in JSON format at indexing time and allowing them to be queried with JMESPATH expressions.

As one example, a time-based file search of objects in the LDB index may look like this:

```
ldb list s3://ldb-public/remote/data-lakes/dogs-and-cats/ --file 'fs.mtime > `2022-03-02T05:43:45`'
```

As another example, retrieval based on a regular expression match in the path can be done as follows:


```
ldb get s3://ldb-public/remote/data-lakes/dogs-and-cats/ --path 'dog\.102[0-2]+' --target some-dogs
```

LDB stores file attributes collected during indexing in a JSON schema, so in the example above, the flag `--path` is actually a shortcut for the JMESPATH regex function applied to the JSON `fs.path` attribute and is equivalent to ```--file 'regex(fs.path, `EXPR`)'```.

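The `--path` filter behaves like a standard regex search over the stored path string. In plain Python terms (the second and third paths below are hypothetical, added for illustration):

```python
import re

paths = [
    "ldb-public/remote/data-lakes/dogs-and-cats/dog.1020.jpg",
    "ldb-public/remote/data-lakes/dogs-and-cats/dog.1023.jpg",
    "ldb-public/remote/data-lakes/dogs-and-cats/cat.1021.jpg",
]

# Same pattern as the `--path 'dog\.102[0-2]+'` example above
pattern = re.compile(r"dog\.102[0-2]+")
matches = [p for p in paths if pattern.search(p)]
# dog.1020 matches; dog.1023 does not ('3' falls outside [0-2])
```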
133 | Sample LDB-indexed file attributes 134 | 135 | ᐃᐃ 136 | 137 | ``` 138 | ldb eval id:98603fb145b88c265fb4a745e6aaf806 --file '@' 139 | 140 | id:98603fb145b88c265fb4a745e6aaf806 141 | { 142 | "alternate_paths": [ 143 | { 144 | "fs_id": "", 145 | "path": "ldb-public/remote/data-lakes/dogs-and-cats/dog.1020.jpg", 146 | "protocol": [ 147 | "s3", 148 | "s3a" 149 | ] 150 | } 151 | ], 152 | "first_indexed": "2022-06-07T03:00:54.270212+00:00", 153 | "fs": { 154 | "atime": null, 155 | "ctime": null, 156 | "fs_id": "", 157 | "gid": null, 158 | "mode": 0, 159 | "mtime": null, 160 | "path": "ldb-public/remote/data-lakes/dogs-and-cats/dog.1020.jpg", 161 | "protocol": [ 162 | "s3", 163 | "s3a" 164 | ], 165 | "size": 26084, 166 | "uid": null 167 | }, 168 | "last_indexed": "2022-06-07T03:00:54.270212+00:00", 169 | "last_indexed_by": "dkh", 170 | "tags": [], 171 | "type": "jpg" 172 | } 173 | ``` 174 | 175 | ᐃᐃ 176 |

The file attributes schema works just like any other JSON; for example, JMESPATH `--file` filters can be pipelined and use comparators and functions:

```
ldb list s3://ldb-public/remote/data-lakes/dogs-and-cats/ --file 'type == `jpg`' --file 'fs.size < `10000`'
```

ᐃ
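Pipelining `--file` flags amounts to an AND across the JSON attribute records. A sketch over made-up records that mirror the schema shown above:

```python
records = [
    {"type": "jpg", "fs": {"size": 26084}},
    {"type": "jpg", "fs": {"size": 8192}},
    {"type": "png", "fs": {"size": 4096}},
]

# Each --file flag becomes one predicate; an object must pass all of them.
filters = [
    lambda a: a["type"] == "jpg",
    lambda a: a["fs"]["size"] < 10000,
]

selected = [a for a in records if all(f(a) for f in filters)]
```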
186 | 187 |
Retrieve data samples by querying JSON annotations

ᐃ

LDB relies on the AWS JMESPATH language to query JSON annotations. JMESPATH is not a Turing-complete language, but it is sufficiently expressive to provide complex search capabilities without writing code.

Most everyday data selection tasks look simple and elegant in JMESPATH. For example, to choose objects with confidence below a threshold:

```
ldb list --query 'inference.confidence < `0.3`'
```
JMESPATH is a powerful JSON expression reducer, and can be extended with [custom functions](documentation/Command-summary.md#user-defined-custom-query-functions). LDB also provides some handy functions out of the box. For example, to compute the total area of (possibly overlapping) bounding boxes for all images in the workspace, one can project the dimension metrics into arrays and use dotproduct(array, array) to compute the final result:

```
ldb eval --query 'dotproduct(b_boxes[*].width, b_boxes[*].height)'
```
Please refer to the [queries](documentation/LDB-queries.md) document for more examples of JMESPATH expressions.

ᐃ
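The `dotproduct` reduction boils down to ordinary arithmetic. Assuming hypothetical bounding-box annotations, the equivalent computation in plain Python is:

```python
def dotproduct(xs, ys):
    # Pairwise products, then sum; arrays must match in length
    return sum(x * y for x, y in zip(xs, ys))

# Hypothetical annotation fragment for one image
b_boxes = [
    {"width": 10, "height": 5},   # area 50
    {"width": 4, "height": 2},    # area 8
]

total_area = dotproduct(
    [b["width"] for b in b_boxes],   # projection b_boxes[*].width
    [b["height"] for b in b_boxes],  # projection b_boxes[*].height
)
# total_area == 58
```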
208 | 209 |
Search data samples with ML helpers

ᐃ

Sometimes there is a need to locate data samples that lack annotations, or that must be identified by criteria other than labels. In that case, LDB provides a convenient harness for programs that can filter or sort samples by looking deeper.

For example, the following line uses an ML helper to detect cat colors (which are not present in the annotations):

```
ldb list s3://ldb-public/remote/data-lakes/dogs-and-cats/ --pipe clip-text 'black cat' --limit 10
```

Since helpers can be computationally expensive to run, it also makes sense to reduce their scope of operation. LDB queries are pipelined, which means flags are executed in the order of appearance and can be used to limit the scope for downstream ML filters:

```
ldb list s3://ldb-public/remote/data-lakes/dogs-and-cats/ --path 'dog\.10[0-2]+' --pipe clip-text 'black dog' --limit 3

```

ᐃ
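Custom `--pipe` helpers follow the same stdin/stdout protocol as the bundled `reverse` plugin in this repository: read a JSON array of object records (hash first) from stdin and print the hashes of the selected objects. A minimal "take the first N" helper could be sketched like this (the record contents besides the hash are illustrative):

```python
import json

def select(rows, limit):
    """Keep the first `limit` object hashes from the piped records.

    Each record is a list whose first element is the data object hash,
    matching the JSON array LDB pipes to plugins on stdin.
    """
    return [data_object_hash for data_object_hash, *_ in rows][:limit]

# In a real plugin, rows would come from sys.stdin:
#   rows = json.loads(sys.stdin.read())
#   for h in select(rows, 3):
#       print(h, flush=True)
demo = select(json.loads('[["h1", 1], ["h2", 2], ["h3", 3], ["h4", 4]]'), 3)
```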
231 | 232 |
Slice/dice and merge datasets

ᐃ

LDB supports many ways to manipulate dataset membership. For example, the following expression merges two datasets, shuffles the result, and dispenses the first 100 members into the workspace:

```
ldb add ds:dogs ds:cats --shuffle --limit 100
```

Similarly, here is a way to add the objects present in dataset A but not present in dataset B:

```
ldb add ds:A
ldb del ds:B
```

Membership operations are not limited to named datasets already saved into LDB. For example, one can stage an unnamed (temporary) dataset with `GET` and sample the result using the workspace notation `ws:`:

```
ldb stage ds:animals
ldb get s3://ldb-public/remote/data-lakes/dogs-and-cats/ --path 'dog\.10[0-2]+' --target more-animals/
ldb add ws:./more-animals --sample 0.5
```

ᐃ
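The ADD-then-DEL recipe is simply a set difference over object ids. A sketch with hypothetical hashes:

```python
dataset_a = {"hash1", "hash2", "hash3"}
dataset_b = {"hash2"}

# ldb add ds:A  followed by  ldb del ds:B
workspace = set()
workspace |= dataset_a   # ADD merges members into the workspace
workspace -= dataset_b   # DEL removes members from the workspace

assert workspace == {"hash1", "hash3"}
```

Because datasets are logical collections of pointers, these operations never touch the underlying files.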
260 | 261 |
262 | Version-control datasets 263 | 264 | ᐃ 265 | 266 | Version control is key to engineering discipline and result reproducibility. Most work in LDB happens in incremental changes that modify a temporary (workspace) dataset, but once this dataset is ready and is committed to LDB, it receives a linear version number (1, 2, 3, etc.): 267 | 268 | ``` 269 | ldb get s3://ldb-public/remote/data-lakes/dogs-and-cats/ --path 'dog\.10[0-2]+' -t more-animals/ 270 | cd more-animals/ 271 | ldb commit ds:more-animals 272 | ``` 273 | 274 | In the example above, the folder `more-animals` is first staged with an unnamed dataset, and then populated with data from the cloud. The result is committed back into LDB and assigned a version number. A dataset reference without a version number means the latest version, but a specific version can also be addressed: 275 | 276 | ``` 277 | ldb stage ds:more-animals.v1 -t temporary/ 278 | ``` 279 | 280 | ᐃ 281 |
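The reference scheme is easy to model: a dataset name maps to a list of numbered revisions, and a reference without a `.vN` suffix resolves to the latest one. A sketch of this resolution rule (the parsing logic is an assumption for illustration, not LDB internals):

```python
def resolve(ref, versions):
    """Resolve 'ds:name' or 'ds:name.vN' against known version numbers."""
    name = ref[len("ds:"):]
    if ".v" in name:
        name, _, num = name.rpartition(".v")
        return name, int(num)        # explicit version requested
    return name, max(versions)       # no suffix -> latest version

known_versions = [1, 2, 3]
assert resolve("ds:more-animals", known_versions) == ("more-animals", 3)
assert resolve("ds:more-animals.v1", known_versions) == ("more-animals", 1)
```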
282 | 283 | 284 | 285 |
286 | Version-control annotations 287 | 288 | ᐃ 289 | 290 | Annotation updates are quite common, which is why LDB datasets consist of tuples (data object, annotation version). A new annotation version is created in LDB every time a sample is re-indexed. Note that a new annotation version in the LDB index is not automatically propagated to datasets pointing at the older version. 291 | 292 | ``` 293 | ldb get s3://ldb-public/remote/data-lakes/dogs-and-cats/ -t my-animals 294 | cd my-animals 295 | sed -i '' 's/dog/cat/g' dog-1009-7918d986e52f3b939ef49020307837b2.json 296 | ldb index dog-1009-7918d986e52f3b939ef49020307837b2.json 297 | ``` 298 | 299 | This physically changed one annotation and updated it in the LDB index, but the dataset staged in the workspace still references the older annotation version. To update it, one needs to use the PULL command: 300 | 301 | ``` 302 | ldb pull dog-1009-7918d986e52f3b939ef49020307837b2.json 303 | ``` 304 | ᐃ 305 |
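The mechanics above can be modeled with two plain dictionaries: the index keeps every annotation version per data object hash, while a dataset pins each member to one version number; re-indexing appends a version, and PULL re-pins to the latest. A sketch (hashes and labels are made up, and this models the behavior, not LDB's storage format):

```python
# Index: data object hash -> list of annotation versions (made-up data).
index = {"7918d986": [{"label": "dog"}]}

# Dataset member pinned to an (object hash, annotation version) tuple.
dataset = {"7918d986": 1}

def reindex(obj_hash, annotation):
    """ldb index: a changed annotation becomes a new version in the index."""
    index[obj_hash].append(annotation)

def pull(obj_hash):
    """ldb pull: re-pin the dataset member to the latest annotation version."""
    dataset[obj_hash] = len(index[obj_hash])

reindex("7918d986", {"label": "cat"})   # edit the file + ldb index
assert dataset["7918d986"] == 1         # dataset still points at the old version
pull("7918d986")                        # ldb pull
assert dataset["7918d986"] == 2         # now pinned to the new version
```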
306 | 307 |
308 | Dataset-level transform configurations 309 | 310 | ᐃ 311 | 312 | Feeding data into the model often requires pre-processing. For example, a model might use just the crops inside image bounding boxes, or still frames extracted from a video object. Similarly, a pre-processor might take the original data and augment it to increase the number of samples. 313 | 314 | Traditionally, this functionality is written as code that runs ahead of the training phase, which obscures important dataset parameters and makes it harder to work with modular, card-level models that lack pre-processors. 315 | 316 | LDB addresses this problem by allowing a transformation configuration to be baked into the dataset, where some (or all) objects are passed through transformation plugins during instantiation. This keeps data-modification parameters bound to the dataset itself, and leaves no hidden state for data entering the model. 317 | 318 | By default, every LDB object in any dataset has one (self-identity) transform in its transformation set: 319 | 320 | ``` 321 | ldb list 322 | ``` 323 |
324 | Output 325 | 326 | ``` 327 | Data Object Hash Annot Data Object Path Transforms 328 | id:011caf8c8bc2a2d715ff1262a80dccdb 2 ...and-cats/cat.1011.jpg self 329 | ``` 330 |
331 | 332 | However, the transformation set can have any number of actions configured: 333 | 334 | ``` 335 | ldb transform -s rotate-90,rotate-180 336 | ``` 337 | 338 | See the [command summary](documentation/Command-summary.md#transform) for more information on how to configure plugins. 339 | 340 | ᐃ 341 |
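Conceptually, instantiation maps each configured transform name to a function and emits one output per transform. A pure-Python sketch of this idea on a toy 2x2 "image" (the real plugins, like those in `transforms/`, operate on image files; the matrix here only illustrates the dispatch):

```python
# Toy "image" as a row-major list of rows.
image = [[1, 2],
         [3, 4]]

def rotate_90(img):
    """Rotate a row-major matrix 90 degrees clockwise."""
    return [list(row) for row in zip(*img[::-1])]

TRANSFORMS = {
    "self": lambda img: img,                          # identity, the default
    "rotate-90": rotate_90,
    "rotate-180": lambda img: rotate_90(rotate_90(img)),
}

# Transformation set as configured by `ldb transform -s rotate-90,rotate-180`:
transform_set = ["rotate-90", "rotate-180"]

# Instantiation emits one output per configured transform.
outputs = [TRANSFORMS[name](image) for name in transform_set]
```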
342 | 343 | ## LDB tutorial 344 | 345 | Here is an [end-to-end workflow](documentation/tutorial.md) for a Data-centric AI task with LDB 346 | 347 | ## LDB commands 348 | 349 |
350 | LDB command cheat sheet 351 | 352 | ᐃ 353 | 354 | > **LDB instance** is a persistent structure where all information about known objects, labels and datasets is being stored. If no LDB instance is found, a private one will be created automatically in the `~/.ldb` directory the first time an LDB dataset is staged. To set up a shared LDB instance for a team or an instance in a different location, please follow [LDB team setup](documentation/Quick-start-teams.md). 355 | 356 | >**LDB dataset** is a collection of pointers into storage. 357 | 358 | ### Staging a new dataset 359 | 360 | Whenever a new dataset is required, or an existing dataset needs an update, it must first be staged in an empty folder (data workspace). Staging does not automatically instantiate the dataset, but creates a draft state of dataset membership info and metadata. LDB prefixes dataset names with `ds:` 361 | 362 | | Step | Command | 363 | | --- | --- | 364 | | Create a workspace folder | `$ mkdir working-dataset; cd working-dataset` | 365 | | Create a new dataset in the workspace | `$ ldb stage ds:my-cats ./` | 366 | | Check the status of staged data | `$ ldb status ` | 367 | 368 | While working in this workspace, all subsequent dataset manipulations will apply to the staged dataset. 369 | 370 | Logical modifications to the dataset staged in the workspace are usually made with ADD and DEL commands that may reference individual objects, other datasets, and employ annotation queries (see [LDB queries](documentation/LDB-queries.md) for details). 371 | 372 | **Configuring immutable storage locations (optional)** 373 | 374 | LDB assumes data samples live in immutable locations from which they are indexed. By default, a private instance will treat any cloud location as immutable, and any local filesystem path as ephemeral. LDB automatically attempts to copy data samples from ephemeral locations into internal storage (defaults to `~/.ldb/read_add_storage`) during indexing.
To prevent this behavior while indexing local storage locations, register them with the `ADD-STORAGE` command: 375 | 376 | 377 | | Step | Command | 378 | | --- | --- | 379 | | Register some immutable storage location | `$ ldb add-storage ~/dogs-and-cats` | 380 | 381 | Please remember that LDB is an indexing service. If you move or erase indexed data samples from storage, the LDB index may break. 382 | 383 | ### Indexing storage folder 384 | 385 | Once the storage location is registered, it can be indexed. During indexing, LDB recognizes all unique objects and associates them with annotations (if present). Whenever new samples are added, their location must be reindexed for LDB to pick up the changes. Annotations updated for previously indexed data objects will be registered as new versions. 386 | 387 | | Step | Command | 388 | | --- | --- | 389 | | Index images from storage | `$ ldb index ~/dogs-and-cats` | 390 | 391 | ### Modifying a dataset 392 | 393 | | Step | Command | 394 | | --- | --- | 395 | | Add cat objects from index by annotation | ```$ ldb add ds:root --query 'class == `cat`'``` | 396 | | Check the status of a staged dataset | `$ ldb list`| 397 | 398 | Note the use of single quotes to shield the query from shell expansion, and the use of backticks to denote the literal value ("cat"). Also note that a special name `ds:root` designates the entire LDB index which references all known objects. 399 | 400 | LDB is also not limited to querying the existing annotations. If installed, [custom ML plugins](documentation/Plugins.md) can be employed for queries beyond JSON: 401 | 402 | | Step | Command | 403 | | --- | --- | 404 | | Add objects by ML query: | `$ ldb add ds:root --pipe clip-text 'orange dog' --limit 10` | 405 | | Check the status of a staged dataset | `$ ldb list`| 406 | 407 | At this point, our workspace holds membership info for all cat images from the sample dataset, and ten images that best resemble an orange dog.
It is okay to add the same objects to a dataset multiple times, as LDB automatically deduplicates. Once we are happy with the results, this dataset can be instantiated (materialized) in the desired output format to examine the samples or train the model. 408 | 409 | ### Instantiation 410 | 411 | | Step | Command | 412 | | --- | --- | 413 | | Instantiate all objects into the workspace | `$ ldb instantiate `| 414 | | See the resulting physical dataset | `$ ls`| 415 | 416 | After examining the actual data objects, one might decide to add or remove data samples, or to edit their annotations. 417 | LDB can pick up the resulting changes right from the workspace: 418 | 419 | ### Notifying LDB on workspace modifications 420 | 421 | | Step | Command | 422 | | --- | --- | 423 | | Edit some annotation | `$ sed -i 's/dog/cat/g' dog-1088.json` | 424 | | Inject a new annotated sample directly into workspace | `$ cp ~/tmp/dog-1090.* ./` 425 | | Pick up object and annotation changes from workspace | `$ ldb add ./`| 426 | 427 | To save the staged dataset into LDB (with all the cumulative changes made so far), one needs to use the *commit* command. 428 | 429 | ### Dataset saving and versioning 430 | 431 | | Step | Command | 432 | | --- | --- | 433 | | Push a new version of staged dataset to LDB | `$ ldb commit` | 434 | 435 | Every new commit creates a new dataset version in LDB. By default, a reference to an LDB dataset assumes the latest version. Other dataset versions can be explicitly accessed with a version suffix: 436 | 437 | | Step | Command | 438 | | --- | --- | 439 | | Stage a particular version of a dataset | `$ ldb stage ds:my-cats.v3` | 440 | | Compare current workspace to a previous dataset version | `$ ldb diff ds:my-cats.v2`| 441 | 442 | If newer annotations become available for a data object, it can be re-added to the dataset by name. If all labels need to be updated, this can be done with the *pull* command.
443 | 444 | TODO BETA: **Granular annotation versioning** 445 | 446 | | Step | Command | 447 | | --- | --- | 448 | | Add an object with a particular label version | `$ ldb add --label-version 2 s3://my-awesome-bucket/1.jpg ` | 449 | | Bump the label version for an object to latest | `$ ldb add s3://my-awesome-bucket/1.jpg` | 450 | | Bump all labels in a dataset to latest | `$ ldb pull`| 451 | 452 | ᐃ 453 |
454 | 455 | Full LDB command summary [is here](documentation/Command-summary.md) 456 | ## LDB versus other versioning tools 457 | 458 | Without a program like LDB, engineers iterating on data commonly accept one of the following data management recipes: (1) datasets as file folders, (2) datasets as pointers stored in spreadsheets (or database records), or (3) datasets under control of ML frameworks. All these solutions have their limits, which we discuss in greater detail [here](/documentation/alternatives-to-LDB.md). 459 | 460 | Datasets can also exist under general versioning tools (like [DVC](https://dvc.org/) or [Pachyderm](https://www.pachyderm.com)). The disadvantage of general versioning is the destruction of the original storage layout. For example, when DVC manages the model repository, it takes full ownership of data and caches the actual data samples. This can be problematic from a data engineering perspective because every model has its own data cache, and global data management and search become challenging. 461 | 462 | By contrast, LDB is an indexing service, and treats datasets as collections of pointers. This lightweight approach requires storage immutability, but can accept any data in its original form (together with locations, folders, names, etc). Indexing allows LDB to operate without write access to storage and leaves storage management to data engineers. In addition, LDB understands annotations and can group sparse objects into datasets by annotation queries. These abilities form a natural boundary between dataset utilities like LDB and model/experiment management tools like DVC: LDB organizes data, while DVC manages the rest of the ML pipeline. 463 | 464 | ## Contributing 465 | 466 | Contributions are welcome! Pre-beta testers, please contact us for access.
467 | 468 | -------------------------------------------------------------------------------- /documentation/tutorial.md: -------------------------------------------------------------------------------- 1 | 2 | `section under construction` 3 | 4 | # Intro 5 | 6 | LDB is an MLOps tool that indexes the existing immutable storage for data files and annotations for easy construction of datasets based on metadata queries. These datasets are meant to be used for model training, and are periodically updated based on the model performance (Data-driven AI loop). 7 | 8 | ### LDB workflow cycle: 9 | 10 | 0a. [Start ldb instance](Command-summary.md#init) on a shared disk that everyone in the team has access to (step not needed for private instance) 11 | 12 | 0b. [Configure immutable storage locations](Command-summary.md#add-storage) and access privileges (optional for private instance) 13 | 14 | 1. Put new data (samples or annotations) in the immutable storage (e.g. NFS disk share, web folder, or S3 bucket). 15 | 2. [Index](Command-summary.md#index) this new data location in LDB. 16 | 3. [Stage](Command-summary.md#stage) a dataset in the workspace. 17 | 4. [Add](Command-summary.md#add) data to this workspace based on location path, JSON query, file properties, etc. 18 | 5. [Commit](Command-summary.md#commit) this dataset so it can be accessed later. 19 | 6. [Instantiate](Command-summary.md#instantiate) this dataset to download the data from storage. 20 | 21 | 6a. Train or validate the model on data, find things to improve in the dataset. 22 | 23 | 7. Begin new incremental iteration starting from pp. 2, 3, or 4. 
24 | 25 | 26 | # Tutorial 27 | 28 | 29 | One good example of a data-centric AI task can be derived from the [2021 competition](https://https-deeplearning-ai.github.io/data-centric-comp/) 30 | by [DeepLearning.AI](http://deeplearning.AI) to train a [ResNet50 model](https://arxiv.org/abs/1512.03385v1) to recognize hand-written roman numerals: 31 | 32 | ![roman numerals dataset, courtesy DeepLearning.AI](/images/numerals-deeplearningAI.png) 33 | 34 | A [starter set of ~3,000 labeled training images](https://worksheets.codalab.org/rest/bundles/0xcea1d733e1f144d9aba83929af51f191/contents/blob/) is provided in the competition, and contestants are free to add more (up to 10,000 images) to score on the leaderboard. The task aims at demonstrating how performance improvements can be achieved by managing data. In this tutorial, we are going to walk through this task using LDB. 35 | 36 | To get the optimal performance from an expressive ML model, one needs to solve the following data-specific challenges: 37 | 38 | * Cleanse input data. Remove duplicate, irrelevant, or excessively noisy samples. 39 | * Clean input annotations. Make sure annotations match the content of data samples. 40 | * Enrich the data. Find ways to obtain more samples from the real world. 41 | * Introduce augmentations to teach the network about data variations and imperfections. 42 | * Add synthetic data (derived from teacher-student setups, generative networks, etc.) to cover any remaining gaps. 43 | * Do performance analysis to understand what data causes difficulties or drift, and add more samples of this type. 44 | 45 | At the level of organization, all these tasks can be reduced to manipulating data collections – such as the original dataset, auxiliary samples, synthesized samples, augmented samples, and so on.
If you have not installed LDB yet, let us install it now: 46 | 47 | ``` 48 | pip install 'ldb-alpha[s3,clip-plugin,resnet-plugin]' 49 | ``` 50 | 51 | We begin with organizing the starter data for the DeepLearningAI challenge into an LDB dataset. The starter data is provided in Tensorflow-inferred format (labels derived from folder names), and there is an initial split into train and validation sets that we can mark by setting tags: 52 | 53 | ``` 54 | mkdir Datacentric-competition ; cd Datacentric-competition 55 | ldb stage ds:roman-numerals --target roman-numerals/ 56 | ldb index --format infer s3://ldb-public/remote/data-lakes/roman-numerals/val/ --add-tags val 57 | ldb index --format infer s3://ldb-public/remote/data-lakes/roman-numerals/train/ --add-tags train 58 | cd roman-numerals/ 59 | ldb add s3://ldb-public/remote/data-lakes/roman-numerals/ 60 | ldb commit 61 | ``` 62 |
63 | Output 64 | 65 | ``` 66 | Initialized LDB instance at '/Users/dkh/.ldb/private_instance' 67 | Added storage location '/Users/dkh/.ldb/read_add_storage' 68 | Added storage location 'ldb-public/remote' 69 | Staged new dataset ds:roman-numerals at 'roman-numerals' 70 | 71 | Data format: tensorflow-inferred 72 | 73 | Indexing paths... 74 | 75 | Finished indexing: 76 | Found data objects: 813 77 | Found annotations: 813 78 | New data objects: 813 79 | New annotations: 813 80 | 81 | Data format: tensorflow-inferred 82 | 83 | Indexing paths... 84 | 85 | Finished indexing: 86 | Found data objects: 2067 87 | Found annotations: 2067 88 | New data objects: 2018 89 | New annotations: 2018 90 | 91 | 92 | Adding to working dataset... 93 | Added 2831 data objects to ds:roman-numerals 94 | 95 | Committed ds:roman-numerals.v1 96 | ``` 97 |
98 | 99 | ## First look at the data 100 | 101 | If you examine the indexing output carefully, you can already spot one problem with the starter set: the train and validation splits do not add up to the total number of new objects. LDB indexes objects by content (hash), so if the same object was provided in both the train and validation splits, it should have both `train` and `val` tags in our workspace. We can check for the existence of such objects with the LIST command: 102 | 103 | ``` 104 | ldb list --tag train --tag val 105 | ``` 106 |
107 | Output 108 | 109 | ``` 110 | Data Object Hash Annot Data Object Path Transforms 111 | id:02d4f6af6de0e622bd67637d1d3620a7 1 ...b317-38f9d35ea60f.png self 112 | id:02eb050cd69598c3b0d6cc93611c92a2 1 ...b317-38f9d35ea60f.png self 113 | id:0635bd89465729cf84f8598229b0665b 1 ...b317-38f9d35ea60f.png self 114 | id:083439bdb2c0591e102addc01b1eb4b3 1 ...b317-38f9d35ea60f.png self 115 | id:15bf110e1ca8a4684dfcf3178456b633 1 ...b317-38f9d35ea60f.png self 116 | id:1ff7ced1800484161f34715c2172f535 1 ...b317-38f9d35ea60f.png self 117 | id:2124746c8162c112926050f7a33c5879 1 ...b317-38f9d35ea60f.png self 118 | id:2382f776473ee00daa94676b70ccae75 1 ...b317-38f9d35ea60f.png self 119 | id:26297aa20c509bdd08d67a487f6db5a8 1 ...b317-38f9d35ea60f.png self 120 | id:2aa3ae4cb092973ccbb288cb3ca03249 1 ...b317-38f9d35ea60f.png self 121 | id:2b7bbc6d97cd20fe224b084920c48de0 1 ...b317-38f9d35ea60f.png self 122 | id:2d8ce75c8587e262873ffbbe960a941f 1 ...b317-38f9d35ea60f.png self 123 | id:2f4d516b268fd579ddfabbcf36068339 1 ...b317-38f9d35ea60f.png self 124 | id:37516543f0866bf9253d49589fbd821a 1 ...b317-38f9d35ea60f.png self 125 | id:3cfb5d8293557021aa8d32af9aa7c1ee 1 ...b317-38f9d35ea60f.png self 126 | id:43235118e60d871949fc5a0ac571f1fd 1 ...b317-38f9d35ea60f.png self 127 | id:57c6eac9bb0e7cfd2009c3dce2d98d70 1 ...b317-38f9d35ea60f.png self 128 | id:5b3750bf92f0b85c1ec8a5f3a0f380ff 1 ...b317-38f9d35ea60f.png self 129 | id:5d83c044920ef7808ddd1cb17ef6899c 1 ...b317-38f9d35ea60f.png self 130 | id:5e3bf11e9b39ee7dbaa11aafb519bcc7 1 ...b317-38f9d35ea60f.png self 131 | id:66b5767dd4ca79063026defab5719d1e 1 ...b317-38f9d35ea60f.png self 132 | id:69c9c2e15e38d32074b5d1c21323bde2 1 ...b317-38f9d35ea60f.png self 133 | id:6be2772a1688a897f0710addee0221c5 1 ...b317-38f9d35ea60f.png self 134 | id:726ccf326a1cb4ba0e8e614a5d393449 1 ...b317-38f9d35ea60f.png self 135 | id:727d95b1ecc3b80f1e17cb40e1495cc3 1 ...b317-38f9d35ea60f.png self 136 | id:73d682eabd8fc2f6107bc85c2392ddf7 1 
...b317-38f9d35ea60f.png self 137 | id:7bce7e64fabf477c8e6380030c30f1ea 1 ...b317-38f9d35ea60f.png self 138 | id:7d1cc809c0ab2ae3fe1ce90517a34d9c 1 ...b317-38f9d35ea60f.png self 139 | id:80204a23356d089b1c3a4edea5bebc0a 1 ...b317-38f9d35ea60f.png self 140 | id:82a7c1753b187d235f521c6dd92a59d8 1 ...b317-38f9d35ea60f.png self 141 | id:8b2d92d718ee30b372af735ecefd9d7a 1 ...b317-38f9d35ea60f.png self 142 | id:9334c45729f357c7cd3bad4120455831 1 ...b317-38f9d35ea60f.png self 143 | id:a01aeaed7c19d9859072553d73743288 1 ...b317-38f9d35ea60f.png self 144 | id:a08a04977bbe82411752ed4b6e6fd506 1 ...b317-38f9d35ea60f.png self 145 | id:a5c566a35567bee4b3d6db89b497cb8b 1 ...b317-38f9d35ea60f.png self 146 | id:a5d5f8695908c027e9308be9cb783ceb 1 ...b317-38f9d35ea60f.png self 147 | id:a69d9477813a8ebe3a49635a3b6c43ef 1 ...b317-38f9d35ea60f.png self 148 | id:a7ff90ac601e6fdfeb5b38a553a4d458 1 ...b317-38f9d35ea60f.png self 149 | id:aec7d3aaf85e4f6753f37681a202156e 1 ...b317-38f9d35ea60f.png self 150 | id:b3e72f4413d1dfa9eef47a0717f4ca90 1 ...b317-38f9d35ea60f.png self 151 | id:c9a89776993c2f39896c091548f91708 1 ...b317-38f9d35ea60f.png self 152 | id:ce8f893cabf9865166f1d8493be9a6f5 1 ...b317-38f9d35ea60f.png self 153 | id:d1c1f943855515001c6fab7e7b07b7ba 1 ...b317-38f9d35ea60f.png self 154 | id:da599da573ce096f012a95534caab5fd 1 ...b317-38f9d35ea60f.png self 155 | id:ddd1931485d742e46866350ff16b5fc5 1 ...b317-38f9d35ea60f.png self 156 | id:eaf7fdda144f71f3770d451972a3e377 1 ...b317-38f9d35ea60f.png self 157 | id:ee465a602e63d7c5dd4b3d1f7f9530dd 1 ...b317-38f9d35ea60f.png self 158 | id:ee86c80c7fd5e020f0c52bf5f621dba7 1 ...b317-38f9d35ea60f.png self 159 | id:f1f7f68daa670efb578839b8fd0dd713 1 ...b317-38f9d35ea60f.png self 160 | ``` 161 |
162 | 163 | Next let's do another sanity check and see the balance of classes in the splits. For that, we can peek at the JSON layout for the first duplicate object: 164 | 165 | ``` 166 | ldb eval id:02d4f6af6de0e622bd67637d1d3620a7 167 | ``` 168 |
169 | Output 170 | 171 | ``` 172 | id:02d4f6af6de0e622bd67637d1d3620a7 173 | { 174 | "label": "i" 175 | } 176 | ``` 177 |
As we see from the annotation, the Tensorflow-inferred format was translated by LDB into a JSON annotation with the key `label` describing the object class. We can use this key to tally classes with a JMESPath query: 179 | 180 | ``` 181 | ldb eval -j --tag val --query 'label' | sort | uniq -c 182 | ldb eval -j --tag train --query 'label' | sort | uniq -c 183 | ``` 184 | 185 |
186 | Output 187 | 188 | ``` 189 | 82 "i" 190 | 81 "ii" 191 | 79 "iii" 192 | 84 "iv" 193 | 81 "ix" 194 | 83 "v" 195 | 82 "vi" 196 | 77 "vii" 197 | 83 "viii" 198 | 81 "x" 199 | 200 | 261 "i" 201 | 157 "ii" 202 | 186 "iii" 203 | 281 "iv" 204 | 234 "ix" 205 | 196 "v" 206 | 181 "vi" 207 | 193 "vii" 208 | 199 "viii" 209 | 179 "x" 210 | 211 | ``` 212 |
213 | 214 | There is a clear class imbalance in the training set, especially for the `ii` and `iv` labels. 215 | 216 | 217 | ## Model evaluation harness 218 | 219 | The ability to train the model, evaluate it, and quickly return to editing the dataset is key to data-centric AI. Let us organize our workflow around this idea. 220 | 221 | We left the previous section in the folder named "./roman-numerals" which was staged as the namesake LDB dataset. This will be our workspace for iterating on numeral images. For training the ResNet50 model, we will need to split the numerals dataset into training and validation. We will also need a folder to store model predictions, so let us create those: 222 | 223 | ``` 224 | cd .. ; mkdir train; mkdir val; mkdir predictions 225 | ``` 226 | 227 | In addition, we will need the ResNet model itself, and a test dataset (labelbook) to evaluate the final score after training. In the data-centric AI competition, the test set that drives the leaderboard was hidden, so hand-picking convincing labelbook samples remained the responsibility of the participants. 228 | 229 | In this tutorial we will use [labelbook by Kenneth Leung](https://github.com/kennethleungty/Data-Centric-AI-Competition) and ResNet code modified for our folder layout: 230 | 231 | ``` 232 | curl -L https://remote.ldb.ai/datasets/ResNet50/ResNet50.tar.gz | tar xz 233 | ``` 234 | 235 | At this point, our project top directory should look like this: 236 | 237 | ``` 238 | Datacentric-competition 239 | .
240 | ├── inference.py 241 | ├── train.py 242 | ├── roman-numerals/ 243 | ├── train/ 244 | ├── val/ 245 | └── predictions/ 246 | ``` 247 | 248 | The last thing we need to do before training the model on the stock dataset is to instantiate our default splits: 249 | 250 | ``` 251 | cd roman-numerals/ 252 | ldb instantiate --tag train --target ../train --format tensorflow-inferred 253 | ldb instantiate --tag val --target ../val --format tensorflow-inferred 254 | ``` 255 | 256 | This should be all we need to train our baseline model (if you don't have Tensorflow and Keras, look for help [here](https://www.tensorflow.org/install)). 257 | 258 | ``` 259 | python train.py 260 | ``` 261 |
262 | Output 263 | 264 | ``` 265 | Model: "model_1" 266 | _________________________________________________________________ 267 | Layer (type) Output Shape Param # 268 | ================================================================= 269 | input_2 (InputLayer) [(None, 32, 32, 3)] 0 270 | 271 | tf.__operators__.getitem (S (None, 32, 32, 3) 0 272 | licingOpLambda) 273 | 274 | tf.nn.bias_add (TFOpLambda) (None, 32, 32, 3) 0 275 | 276 | model (Functional) (None, 8, 8, 256) 229760 277 | 278 | global_average_pooling2d (G (None, 256) 0 279 | lobalAveragePooling2D) 280 | 281 | dense (Dense) (None, 10) 2570 282 | 283 | ================================================================= 284 | Total params: 232,330 285 | Trainable params: 229,386 286 | Non-trainable params: 2,944 287 | 288 | test loss 2.2185871601104736, test acc 0.5961538553237915 289 | ``` 290 | 291 |
292 | 293 | ## Data cleansing 294 | 295 | Once we have the model, running inference over all available data is one easy way to find issues. Let us instantiate the dataset 'roman-numerals' in a clean directory (so Tensorflow will not have issues inferring labels from hidden folders), and run the inference script: 296 | 297 | ``` 298 | # starting from roman-numerals/ 299 | 300 | ldb instantiate --target output --format tensorflow-inferred 301 | cd .. 302 | python inference.py roman-numerals/output 303 | ``` 304 | 305 | This should fill the folder `predictions` with .json files of the following format: 306 | 307 |
308 | sample json 309 | 310 | ``` 311 | { 312 | "annotation": { 313 | "inference": { 314 | "label": "vi", 315 | "confidence": 0.9992263317108154 316 | } 317 | }, 318 | "data-object-info": { 319 | "md5": "447a1471b96fad28678cc2bbd678d303" 320 | } 321 | } 322 | ``` 323 |
324 | 325 | Note the sections `annotation` and `data-object-info`. The presence of these two sections signifies the `annotation-only` format, which LDB can use without attached data files by linking the content via the hash id. This format is convenient because we can easily merge it with annotations that already exist in the LDB index: 326 | 327 | ``` 328 | ldb index --annotation-update merge predictions/ 329 | ``` 330 | 331 |
332 | Output
333 | 334 | ``` 335 | Data format: auto-detect 336 | Auto-detected data format: annotation-only 337 | 338 | Indexing paths... 339 | 340 | Finished indexing: 341 | Found data objects: 0 342 | Found annotations: 2831 343 | New data objects: 0 344 | New annotations: 2831 345 | ``` 346 | 347 | 348 | If we examine the annotation for object id:447a1471b96fad28678cc2bbd678d303 in the index, we will see the following: 349 | 350 | ``` 351 | ldb eval id:447a1471b96fad28678cc2bbd678d303 352 | ``` 353 |
354 | Output 355 | 356 | ``` 357 | id:447a1471b96fad28678cc2bbd678d303 358 | { 359 | "inference": { 360 | "confidence": 0.9992263317108154, 361 | "label": "vi" 362 | }, 363 | "label": "vi" 364 | } 365 | 366 | ``` 367 |
368 | 369 | The inference results were merged with the original labels in the index. But the dataset 'roman-numerals' that we staged still uses the old annotation versions. We can update it to the latest revision of the annotations and save it into LDB: 370 | 371 | ``` 372 | cd roman-numerals/ 373 | ldb pull 374 | ldb commit 375 | ``` 376 | 377 | Now we can refer to this dataset by name to extract mislabeled images and predictions with low confidence: 378 | 379 | ``` 380 | ldb get ds:roman-numerals --query 'label != inference.label || inference.confidence < 0.5' --target problem-images/ 381 | ``` 382 | 383 |
384 | Output 385 | 386 | ``` 387 | Adding to working dataset... 388 | Added 285 data objects to ds:.temp.2022-06-17T03:29:24.038751+00:00 389 | Instantiating data... 390 | 391 | Copied data to workspace. 392 | Data objects: 285 393 | Annotations: 285 394 | ``` 395 |
396 | 397 | Note there are 285 such images that warrant a closer inspection. This is a large reduction from the original set (2800+ images) that we can now focus on. 398 | 399 | ----------TODO--------- 400 | 401 | 402 | For now let us delete the tag `train` from all duplicates: 403 | 404 | ``` 405 | ldb tag --tag val --tag train --remove train 406 | ``` 407 |
408 | Output 409 | 410 | ``` 411 | Tagging data objects 412 | Data objects: 49 413 | Num updated: 49 414 | ``` 415 |
416 | 417 | Now we have created a dataset `"numerals"` in our workspace and filled it with input references. LDB datasets are logical entities, so no data objects were copied or moved. Instead, LDB has read the files in the provided location, found all unique data samples (ignoring any duplicates), parsed their annotations, and stored data pointers in the workspace. 418 | 419 | To use the `"numerals"` dataset in subsequent steps of the workflow, let us save it to LDB: 420 | 421 | | Step | Command | 422 | | --- | --- | 423 | | Save dataset "numerals" v.1 to LDB | `$ ldb commit` | 424 | 425 | This action stores `"numerals"` into the LDB repository, and assigns a version number to it. 426 | 427 | The DeepLearningAI competition comes with a ResNet50 docker image for evaluation. One "quick and dirty" way to check the sanity of training data is to see whether the network can generalize over the training set. To simulate the competition leaderboard, we provide a version of ResNet50 here: [instructions for running](TODO). 428 | 429 | Now let us assume ResNet50 was trained on the starter data, and for every training sample produced the following output in JSON format, where "class" is the input label, and "inference" is the output label: 430 | 431 | ```json 432 | { 433 | "path": "./i/125d.jpg", 434 | "class": "i", 435 | "inference": { 436 | "class": "ii", 437 | "confidence": 0.2 438 | } 439 | } 440 | ``` 441 | 442 | These output annotations are available in a bucket `gs://iterative/inference/`. 443 | 444 | As is usual for inference, we can observe that some training inputs were not generalized properly, or their confidence remained low. 445 | Some of these errors highlight problems with the data: the underlying objects could be noisy, incorrect, or paired with a wrong label. 446 | 447 | To investigate further, let us isolate these errors.
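The selection logic for isolating such errors is easy to mirror in plain Python as a sanity check. Given inference records of the format shown above (the sample values below are made up), misclassified or low-confidence samples can be picked out like this:

```python
# Made-up inference records following the JSON format shown above.
records = [
    {"path": "./i/125d.jpg", "class": "i",
     "inference": {"class": "ii", "confidence": 0.2}},
    {"path": "./v/003a.jpg", "class": "v",
     "inference": {"class": "v", "confidence": 0.97}},
    {"path": "./x/77b1.jpg", "class": "x",
     "inference": {"class": "x", "confidence": 0.41}},
]

def needs_review(rec, threshold=0.55):
    """Mirror of: class != inference.class || inference.confidence < threshold."""
    return (rec["class"] != rec["inference"]["class"]
            or rec["inference"]["confidence"] < threshold)

# Misclassified or low-confidence samples warrant a closer look.
to_examine = [rec["path"] for rec in records if needs_review(rec)]
```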
We can stage a new dataset and query annotations from inference to fill it with objects the network failed to train on: 449 | 450 | 451 | | Step | Command | 452 | | --- | --- | 453 | | Start a new dataset | `$ ldb stage ds:to-examine` | 454 | | Add misclassified objects | `$ ldb add gs://iterative/inference/ --query 'class != inference.class'` | 455 | 456 | 457 | Now we have created a new dataset `"to-examine"` that holds references to data objects that we want to inspect. 458 | However, there are no files to examine in our workspace yet. This is because LDB datasets are logical entities that hold pointers, not actual data files. 459 | To instantiate this dataset (transfer all relevant objects from storage into the workspace), we will use the INSTANTIATE command: 460 | 461 | | Step | Command | 462 | | --- | --- | 463 | | Instantiate dataset in a workspace | `$ ldb instantiate` | 464 | 465 | For the sake of example, let's assume the dataset `"to-examine"` now holds ten annotated images, which may look somewhat like this: 466 | 467 | ![Courtesy: DeepLearning.ai, subset of images compiled by Pierre-Louis Bescond.](/images/numerals-bescond.png) 468 | 469 | Upon closer examination, we note that the second image in the top row is too noisy, and the third image in the bottom row does not belong to the set. On the other hand, the very first image should have been easy to recognize (so it may carry a wrong annotation), and so on. For now, let us try to delete these images from the training set and see if the network performs better.
To accomplish this task, we can save dataset `"to-examine"`, stage `"numerals"`, and subtract the former from the latter:

| Step | Command |
| --- | --- |
| Save "to-examine" dataset | `$ ldb commit` |
| Stage "numerals" dataset | `$ ldb stage ds:numerals` |
| Subtract contents of a dataset | `$ ldb del ds:to-examine` |
| Save dataset "numerals" v.2 | `$ ldb commit` |

Now that we have successfully modified the working dataset, we can instantiate it to re-train the model and check the performance.

If we don't like the result and want to roll back the changes, LDB's versioning system makes it easy. All we need to do to roll back to the previous dataset version is to stage it and commit it as a new revision:

| Step | Command |
| --- | --- |
| Stage a specific dataset version | `$ ldb stage ds:numerals.v1` |
| Save it as the "current" version | `$ ldb commit` |

At this point, LDB holds two revisions of the dataset "numerals", v.1 and v.2, and the former is the version that will now be checked out by default.

Inference is not the only metric you can use to close the data-train loop. Obviously, a network may simply memorize wrong labels and offer little guidance on training data via inferences. To collect more signals from the training loop, we can use metrics like learning gradients per sample, or result confidence:

| Step | Command |
| --- | --- |
| Check objects with low confidence | ``$ ldb list gs://iterative/inference/ --query 'inference.confidence < `0.55`'`` |

### Dataset merging and class balancing

Another key operation in data-driven AI is dataset merging. LDB allows for dataset slicing, dicing, and merging via a sequence of ADD and DEL commands paired with query filters. However, when merging two or more datasets, it is also important to keep track of class balance.
To that end, LDB supports `--limit` and `--sample-ratio` arguments that define, respectively, absolute and relative limits on the number of data objects being merged. For example, let us assume our roman numerals dataset underperforms for numeral `'i'`. To address this deficiency, we might want to boost it with more samples that we generated and stored in a dataset `generated-numerals`, up to a limit that would not upset our class balance:

| Step | Command |
| --- | --- |
| Fill workspace with desired class samples | ``ldb add ds:generated-numerals --query 'class == `i`' --limit 100`` |

### Isolating objects with helper ML models

So far, we have seen how a dataset can be staged, instantiated, filled, evaluated, and modified.
Another key operation for model performance analysis is discovering training samples that would best represent an underperforming subclass.

For example, you may find that a particular class (say, numerals `'iii'`) is under-represented, and you want more of them. Let us also assume that you have a set of handwritten numerals to choose from, but they are not annotated.

A classical solution to this problem is to run a helper model that produces pre-annotations, and do the rest of the annotation work manually. LDB simplifies this task by allowing a helper to be called within a query. LDB ships with several helper models (like CLIP embeddings and visual similarity), and more can be added:

| Step | Command |
| --- | --- |
| Add visually similar images to a working dataset | `$ ldb add gs://iterative/handwritten --pipe clip-text "iii" --limit 100` |

### Indexing storage locations

So far we have assumed that LDB parses data objects and annotations on the fly whenever a storage location is queried.
There are several topics here that we need to cover to make our workflow more efficient.
As your data storage grows, parsing it repeatedly for every request becomes suboptimal.

Repeated queries waste time, and coupling queries with storage locations is cumbersome. To solve this problem, LDB saves every data object and annotation it comes across into an internal database (the index). The index plays the role of a "root dataset" to which all data objects are assigned by default, and it can be referenced as `ds:root`. Therefore, looking for a specific data object based on a previously indexed annotation (or a user-defined tag) can be as simple as:

| Step | Command |
| --- | --- |
| List all objects matching annotation field in the index | ``$ ldb list ds:root --query 'class == `i`'`` |
| List all objects matching a tag in the index | `$ ldb list ds:root --tag "training"` |

Also note that LDB addresses data objects by hashsum, and therefore only keeps track of unique data samples. However, data objects are often coupled with annotations that may change over time and are not unique. This presents two additional problems: first, how to update an annotation, and second – how to ensure reproducibility in a dataset when annotations are a moving target?

To answer these challenges, LDB supports re-indexing and annotation versioning.

Re-indexing asynchronously queries a specified storage path for changed data objects and annotations, and adds them to the index. When adding a new annotation, LDB also retains the previous version (if any). This ensures that datasets referencing previous annotations remain reproducible:

| Step | Command |
| --- | --- |
| Reindex objects at a given location | `$ ldb index /storage/` |

### Modifying annotations in existing datasets

We have seen how annotations can be updated with re-indexing.
Practically, this means that we can send our dataset annotations to a third-party labeling service, receive the corrected records, and re-index them in LDB. To continue our example, let us assume we have pre-annotated part of the `handwritten` dataset, and sent the rest to an annotation service. Upon receiving the results, we can simply re-index the target location to pick up the updates:

| Step | Command |
| --- | --- |
| Re-index objects at the updated location | `$ ldb index gs://iterative/handwritten-reannotated/` |

However, as we mentioned, this change does not affect existing datasets that reference the older annotations. To upgrade all annotations in a dataset to the latest revision found in the index, one can use the PULL command:

| Step | Command |
| --- | --- |
| Bump annotations to the latest version | `$ ldb pull` |

The PULL command also works for individual data objects (referenced by hashsum or object path). The reverse operation to pull would be to set a specific object to a specific version of an annotation:

| Step | Command |
| --- | --- |
| Set an object to a specific annotation version | `$ ldb add 0xFD45DE --label-version 2` |

Finally, it might be convenient to correct minor errors in annotations right from the workspace. This can be done by staging the dataset, editing the instantiated annotations, and adding the workspace (as a whole, or by individual files) back into the dataset:

| Step | Command |
| --- | --- |
| Stage a dataset | `$ ldb stage ds:numerals` |
| Instantiate annotations | `$ ldb instantiate --annotations-only` |
| Correct an annotation | `$ sed -i 's/"class": "i"/"class": "ii"/g' 125d.json` |
| Register the change | `$ ldb add .` |
| Push the change into LDB | `$ ldb commit` |

### Indexing data in various formats

The DeepLearningAI competition permits up to 10,000 training images, but only ships with a starter set of 3,000.
A natural question for a data scientist working on this challenge is where to find more data. The good news is that nowadays just about every data class in the world exists in the public domain. The bad news is that public datasets come in different (and often incompatible) formats.

The primary annotation scheme supported by LDB pairs each data object with a matching-name JSON annotation within the same folder, for example:

- 154F.m4a, 154F.json
- 23DE.m4a, 23DE.json
- ...

Alternatively, LDB also understands the scheme where multiple data objects are described in a single JSON file residing in the root folder:

- 154F.m4a
- 23DE.m4a
- annotations.json

Both schemes permit arbitrary storage configurations, and double as the default scheme for many labeling tools (such as [Label Studio](https://labelstud.io)).

However, a lot of "branded" datasets follow unique and proprietary conventions for annotations. For example, the DeepLearningAI competition on roman numerals encodes classes as folder names. Alternatively, COCO uses multiple shared JSON files to annotate objects in a dataset, while ImageNet combines a single key annotation file with class-specific folders.

LDB ships with pre-processors for the COCO, ImageNet, Google OpenImage, and general folder-class schemes. These can be selected during indexing with the `--format` argument:

| Step | Command |
| --- | --- |
| Index objects in ImageNet folder | `$ ldb index --format ImageNet /storage/ImageNet500K/` |

Additional formats can be supported by writing custom pre-processors ([more on this here](TODO)).
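The default matching-name scheme can be sketched in a few lines: pair every data file with the JSON annotation sharing its name stem, and keep track of bare objects. This is an illustrative sketch (the file names are made up), not LDB's indexing code:

```python
import os

# Sketch of the default pairing scheme: a data object and its annotation
# share a name stem within the same folder. The file list is illustrative.
files = ["154F.m4a", "154F.json", "23DE.m4a", "23DE.json", "99AA.m4a"]

stems_with_json = {os.path.splitext(f)[0] for f in files if f.endswith(".json")}
pairs = {f: os.path.splitext(f)[0] + ".json"
         for f in files
         if not f.endswith(".json") and os.path.splitext(f)[0] in stems_with_json}
bare = [f for f in files
        if not f.endswith(".json") and os.path.splitext(f)[0] not in stems_with_json]
```

A strict scheme would index only `pairs`; a bare scheme would index `bare` objects too.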


--------------------------------------------------------------------------------
/documentation/Command-summary.md:
--------------------------------------------------------------------------------

## LDB datasets

LDB defines datasets as collections of pointers to immutable data objects paired with optional annotations and metadata.

Since datasets are just collections, LDB can modify them without moving the underlying data objects. To examine the physical data objects or annotations in a dataset, it must be partially or fully instantiated (see `INSTANTIATE` below).

LDB datasets support names with `[A-Za-z0-9_-]` ANSI characters. LDB commands require dataset names to carry a mandatory `ds:` prefix, and allow an optional `.v[0-9]*` postfix that denotes a dataset version number.

## LDB object identifiers

LDB identifies data objects by hashsum, which is the primary data object identifier. LDB treats copies of data samples in immutable storage as different paths to the same data object, and permits any such path to be used as a secondary object identifier.

LDB identifies annotations for data objects based on the rules of the ingress format, and saves them internally. Annotations in LDB are paired with objects and are not directly addressable. It is, however, possible to specify an annotation version for a data object, or to instantiate annotations without the related data samples.

## LDB workspaces

To work on a dataset, LDB stages it in a workspace (see `STAGE` below). A workspace holds all the information for a dataset that is being modified. One user might have several workspaces in different directories.
Any changes to a dataset (adding & removing objects, changing tags, etc.) remain local to the workspace until `COMMIT` is run.

Here is the internal structure of a workspace folder:
```
.ldb_workspace/
├── collection/
└── workspace_dataset
```

Most LDB commands – `ADD`, `DEL`, `SYNC`, `COMMIT`, `PULL` – require a staged dataset, and hence must run from a valid workspace. Other commands – like `LIST`, `STATUS`, `DIFF` – will also target a staged dataset by default, but do not require one if they are passed other dataset identifiers.

If a dataset is created with `STAGE`, it already has a name. Datasets staged with `GET` are unnamed, and will get a name at the first commit.

## Locating an LDB instance

Every LDB command is linked to an instance where datasets and annotations are stored. There are two ways to locate an instance:

1. Set `core.ldb_dir` in the global configuration file `~/.ldb/config` to an absolute path.
```
[core]
ldb_dir = '/some/absolute/path'
```
2. Set the `LDB_DIR` environment variable to any absolute or relative path.

If both ways of configuration are present, the environment variable takes precedence.
If no method of configuration succeeds, all LDB commands will fail, except for `INIT`, which does not require an existing installation, and `STAGE` when used in QuickStart (see below).

## QuickStart

QuickStart allows the individual user to begin working with LDB without explicit configuration. To that end, QuickStart makes strong configuration assumptions, and in return can jumpstart the LDB workflow with as little as 3-4 commands.

`STAGE` and `GET` (by means of calling `STAGE`) are the only two LDB commands that can trigger QuickStart. To do so, `STAGE` confirms the absence of an active LDB instance, and calls `INIT` to start a new LDB repository before proceeding with staging a dataset.
Under the hood, QuickStart consists of the following three steps:

* A new LDB instance is created in the user's home directory: `~/.ldb/private_instance`
* Storage configuration defaults to wide-open settings:
  * All cloud locations are permitted to host data objects.
  * A `read-add` folder is created in the user's home directory (see `ADD-STORAGE`).
* An `auto-index` option is set in the LDB config, permitting `ADD` to process previously unindexed storage (see `ADD`).

Below is an example of QuickStart, where the user queries a remote storage location in two commands right after the LDB installation:

```
$ ldb stage ds:my-numerals
$ ldb add gs://iterative/roman-numerals --query 'class == `i`'
```

### LDB command list

- [INIT](#init)
- [ADD-STORAGE](#add-storage)
- [STAGE](#stage)
- [INDEX](#index)
- [ADD](#add)
- [DEL](#del)
- [TAG](#tag)
- [SYNC](#sync)
- [TRANSFORM](#transform)
- [INSTANTIATE](#instantiate)
- [GET](#get)
- [COMMIT](#commit)
- [DIFF](#diff)
- [LIST](#list)
- [STATUS](#status)
- [PULL](#pull)
- [DS](#ds)
- [EVAL](#eval)
- [UNINDEX](#unindex)
- [COMPLETION](#completion)

# INIT

```
ldb init <path> [-f]
```

Initialize a new LDB instance at the given `<path>`.

`INIT` creates a new LDB instance (index) in the given directory. For most enterprise installations, the LDB repository folder would be a shared directory on a fast disk. `INIT` ignores any existing LDB instances, and permits a new LDB repository to reside anywhere in the filesystem.

In addition to creating an LDB instance, `INIT` makes a global configuration file at `~/.ldb/config` and sets the `core.ldb_dir` key to point to the new LDB location. If configuration files already exist, `INIT` does not change them.
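The pointer that `INIT` writes interacts with the lookup order described earlier (the `LDB_DIR` environment variable takes precedence over `core.ldb_dir` from the config file). A minimal sketch of that resolution order; the function name is illustrative, not part of LDB:

```python
def resolve_ldb_dir(environ, config):
    # Sketch of the lookup order described under "Locating an LDB instance":
    # the LDB_DIR environment variable wins over the core.ldb_dir key that
    # INIT writes to ~/.ldb/config. Returns None if neither is set.
    if environ.get("LDB_DIR"):
        return environ["LDB_DIR"]
    return config.get("core", {}).get("ldb_dir")

resolved = resolve_ldb_dir({"LDB_DIR": "/tmp/ldb"},
                           {"core": {"ldb_dir": "/shared/ldb"}})
```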
Running `ldb init <path>` creates the following directory structure:
```
path/
├── config
├── custom_code/
│   ├── ldb_user_filters/
│   └── ldb_user_functions/
├── data_object_info/
├── datasets/
├── objects/
│   ├── annotations/
│   ├── collections/
│   ├── dataset_versions/
│   └── transforms/
└── storage
```
After finishing, `INIT` prints a summary of the work and a reminder on how to change the active LDB instance pointer.

## flags

`-f` or `--force`

If a target directory already contains an existing LDB instance, `INIT` fails and prints a reminder to use `--force`. Using `-f` or `--force` erases the existing LDB installation.

If the target directory contains data (but not an LDB instance), `INIT` fails without an option to override. The user must provide an empty directory.


# ADD-STORAGE

```
ldb add-storage <path> [-a {true,false}] [-f] [-o <key> <value>]
```

`ADD-STORAGE` registers a disk (or cloud) data storage location with LDB and verifies the requisite permissions. `<path>` should be a URI or a prefix for URIs.

LDB keeps track of storage locations for several reasons, the primary ones being engineering discipline (preventing objects from being added from arbitrary places) and authentication (see `access configuration` below).

LDB supports the following storage URI types: fs, Google Cloud, AWS, and Azure.

The minimum and sufficient set of permissions for LDB is to **list, stat and read** any objects at `<path>`. `ADD-STORAGE` fails if permissions are not sufficient, and succeeds with a warning if permissions are too wide. `ADD-STORAGE` also checks if `<path>` falls within an already registered URI, and prints an error if this is the case. Permissions are re-checked if an existing storage location is re-added.
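The overlap check above (rejecting a `<path>` that falls within an already registered URI) can be sketched as a prefix test. This is an illustrative sketch, not LDB's actual validation code, and it assumes overlap in either direction is rejected:

```python
def overlaps_registered(uri, registered):
    # Sketch: a new storage URI conflicts when it falls under, or contains,
    # an already registered prefix. Trailing slashes are normalized so that
    # "gs://bucket/cat" does not falsely match "gs://bucket/cats".
    norm = uri.rstrip("/") + "/"
    for reg in registered:
        reg_norm = reg.rstrip("/") + "/"
        if norm.startswith(reg_norm) or reg_norm.startswith(norm):
            return True
    return False

registered = ["gs://my-datasets/cats"]
conflict = overlaps_registered("gs://my-datasets/cats/black-cats", registered)
ok = overlaps_registered("gs://other-bucket/dogs", registered)
```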
Since LDB assumes that storage objects are immutable, it never attempts to alter or move them. However, LDB may be required to push *new* files to storage if the user chooses to source objects from ephemeral _fs_ paths (for example, from a personal workspace). A destination configured as `read-add` will permit LDB to automatically save such objects into immutable storage.

## flags

`-o <key> <value>` / `--option <key> <value>`

Specify a key/value pair to pass to the fsspec filesystem instance created when accessing this storage location. May be used multiple times. Note that `value` is expected to be a JSON value in order to distinguish between different data types, but non-JSON values will be interpreted as a raw string. For example, to use a specific AWS profile with an s3 storage location, these are equivalent:

```
ldb add-storage s3://bucket/some/prefix -o profile '"profile-name"'
ldb add-storage s3://bucket/some/prefix -o profile \"profile-name\"
ldb add-storage s3://bucket/some/prefix -o profile profile-name
```

Or to access a public bucket anonymously (`true` will be converted to an actual bool value):
```
ldb add-storage s3://bucket/some/prefix -o anon true
```

`-a {true,false}`, `--read-add {true,false}`

A storage location registered with `--read-add=true` must have write access to allow new files to be added.

LDB supports at most one `read-add` location, and uses it to save _previously unseen_ local data files that the `ADD` command may encounter outside the registered storage. Users can change or remove the `read-add` attribute by re-adding a location with or without this flag. An attempt to add a second `read-add` location to LDB will fail, prompting the user to remove the attribute from the existing location first.

A `read-add` location should never be used to store any data objects that originate at cloud locations.
An attempt to reference an unregistered cloud location in an `ADD` command will fail immediately.


*Use case:*

```
$ ldb add-storage gs://add-storage --read-add true
new storage location gs://add-storage successfully registered.

$ ldb add ./cat1.jpg
warning: object id:564d copied to gs://add-storage/auto-import220211-11/cat1.jpg
```

Here, a location `gs://add-storage` is registered with the `read-add` attribute, and the user tries to add a file from a workspace into a dataset. If LDB does not have an object with an identical hashsum already indexed, the `ADD` command copies `cat1.jpg` into `gs://add-storage` under a unique folder name, indexes it, and adds this object to the dataset.

*Use case:*

```
$ ldb add ./cat1.jpg
error: object id:564d is not in LDB and no read-add location configured
```
Here, no `read-add` storage locations are registered, and the user tries to add a file from their workspace to a dataset. If LDB does not have an object with an identical hashsum already indexed somewhere in storage, the `ADD` command fails.

## lambda configuration

TODO BETA
document object lambda access configuration here


# STAGE
```
ldb stage <dataset> [-f] [-t <path>]
```

The `STAGE` command creates an LDB workspace at the given `<path>` for dataset `<dataset>`. The destination folder is expected to be empty. If the LDB repository has no dataset `<dataset>`, a new dataset is created. If `<dataset>` references an existing dataset, it is staged out (but not automatically instantiated).

If the workspace is not empty, `STAGE` checks whether it holds a clean dataset, and clobbers it if so. If `<path>` holds a dirty dataset, a warning and the status of this dataset are printed before failure. If `<path>` is not empty but does not hold an LDB dataset, a reminder to use `--force` is printed.
*Use case:*

```
$ ldb stage ds:cats
$ ldb status
Dataset ds:cats, 0 objects, not saved in LDB.
```

If `STAGE` cannot locate an active LDB instance, it assumes a QuickStart, and proceeds with setting up a new LDB instance (see the QuickStart discussion).

## flags

`-f` or `--force`

Allows `STAGE` to clobber the workspace regardless of its contents.


# INDEX
```
ldb index [-m <format>] [--add-tags <tags>]
          [--annotation-update <strategy>] [-p <key>=<value>]
          <path> [<path> ...]
```

`INDEX` updates the LDB repository with the data objects and annotations given as arguments. If the LDB instance was created via QuickStart (see `STAGE`), then any cloud location may be indexed by default. If the LDB instance was created with the `INIT` command, then LDB assumes indexed URIs to reside within the configured storage locations (see `ADD-STORAGE`) and will fail otherwise. If a folder is supplied to `INDEX` with no format flag, this folder is traversed recursively to recover objects and annotations in the default format (one `.json` file per data object, sharing the object's name). All hidden paths are excluded during indexing, which means any path where a directory or file name begins with a dot (`.`) will not be indexed.

LDB maintains a "current" annotation version for every data object with at least one indexed annotation. LDB will update the "current" annotation version for a data object when both of the following conditions hold:

* The object is re-indexed (explicitly or implicitly), and an associated annotation for this data object was successfully recovered.
* The annotation was not seen before (re-indexing older annotation versions has no effect on LDB).
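The two conditions above can be sketched together: every annotation version is retained per object, and the "current" version is bumped only when the incoming annotation is new. The structure and names below are illustrative, not LDB internals:

```python
def reindex_annotation(index, object_id, annotation):
    # Sketch: keep all annotation versions per object; bump the "current"
    # version only when the recovered annotation was not seen before.
    versions = index.setdefault(object_id, [])
    if annotation in versions:
        return len(versions), False   # old version re-seen: no effect
    versions.append(annotation)
    return len(versions), True        # new version becomes "current"

index = {}
v1, changed1 = reindex_annotation(index, "2c4a9d", {"class": "i"})
v2, changed2 = reindex_annotation(index, "2c4a9d", {"class": "ii"})
v3, changed3 = reindex_annotation(index, "2c4a9d", {"class": "i"})  # re-seen
```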
_Use case:_
```
$ ldb index gs://my-storage/new-folder # traverse this folder to find data objects in default format
```

_Use case:_
```
$ ldb index gs://my-storage/cat1.json # index (or reindex) a specific annotation URI
```

## flags

`--format <format>` - options: `{auto,auto-detect,strict,strict-pairs,bare,bare-pairs,infer,tensorflow-inferred,annot,annotation-only}` - sets the schema for data objects and annotations. `INDEX` fails if a URI does not conform to the schema. `auto-detect` is the default. Some of these are simply short aliases for longer format names.

`--add-tags <tags>` - comma-separated list of tags to add to indexed data objects.

`--annotation-update <strategy>` - merge strategy for combining a data object's current annotation with the one discovered during indexing. Choices: `{merge,replace}` (default: `replace`)
* `replace` will simply replace the old annotation with the new one.
* `merge` will merge top-level JSON keys if both the old and new annotations contain top-level JSON objects. Otherwise the new annotation will be used, as with the `replace` option.

`-p <key>=<value>`, `--param <key>=<value>` - format-specific option. May be used multiple times. Different formats support different options. The following are supported:
* `infer`
  * `base-label=<label>` - the label to be used for data objects that are inside the base directory
  * `label-key=<key>` - a JMESPath key expression indicating which key the inferred label should be stored under. The default when this option is not used is `label`.
* `annot`
  * `single-file={true,false}` - if `true`, instantiate all annotation JSON objects in a top-level array, creating a single JSON file called `dataset.json`. If `false`, instantiate each annotation in a separate JSON file. The default behavior if this param is not used is the same as setting it to `false`.
`--ephemeral-remote` - allow non-storage cloud files to be indexed; they will be copied to `read-add` storage. Normally, files outside of the local filesystem must be inside of a storage location unless the config value `core.read_any_cloud_location = true` is set, but local non-storage (ephemeral) files may be indexed by copying them to a configured `read-add` location. This flag allows remote files to be treated the same way as local ephemeral files, and copied to `read-add` storage if necessary.

## formats

Brief format descriptions (see the longer discussion of formats [here](formats.md)):
* `auto`, `auto-detect` - Auto-detect the data format. Supports detection of: `strict-pairs`, `annotation-only`, `tensorflow-inferred`
* `strict`, `strict-pairs` - Only complete pairs of files will be indexed. The annotation file in each pair must have a name ending with `.json` and contain valid JSON. The data object file must have the same file name but with a different extension, and it must be in the same directory as the annotation file.
* `bare`, `bare-pairs` - File pairs are detected as with `strict-pairs`, but bare data objects (without corresponding annotation files) are indexed too. Any file whose name does not end with `.json` will be considered a data object file.
* `infer`, `tensorflow-inferred` - Annotation files will be generated containing labels that are inferred from each data object file's directory. This is based on the `labels="inferred"` option in TensorFlow's [`tf.keras.utils.image_dataset_from_directory`](https://www.tensorflow.org/api_docs/python/tf/keras/utils/image_dataset_from_directory). All files should be data object files that do **not** end with `.json`. The name of the directory passed to `ldb index` will be used as the label for data objects contained directly inside of it. Data objects within subdirectories will have nested labels.
  For example, if you called `ldb index --format tensorflow-inferred ~/data/animals/cat`, then `~/data/animals/cat/0001.png` would have the annotation `{"label": "cat"}` and `~/data/animals/cat/tabby/0001.png` would have the annotation `{"label": {"cat": "tabby"}}`. This allows for queries such as `ldb list --query label.cat.tabby`. Note that for successful conversion of a label from another format into `tensorflow-inferred`, the annotation must have the "label" JSON key; conversion will fail otherwise.
* `annot`, `annotation-only` - Only annotation files ending with `.json` may exist under the given location. Each annotation file must contain a JSON object with the key `ldb_meta.data_object_id` pointing to a data object hash. This hash must specify a data object that already exists in LDB. This JSON object must also contain an `annotation` key whose value will be used as the annotation for the specified data object. For example, some `.json` file may contain:
```
{
  "annotation": {
    "label": 1
  },
  "ldb_meta": {
    "data_object_id": "2c4a9d28cc2ce780d17bea08d45d33b3"
  }
}
```
This results in LDB using the following as the annotation for data object `id:2c4a9d28cc2ce780d17bea08d45d33b3`:
```
{
  "label": 1
}
```

# ADD
```
ldb add <object-list> [<options>]
```

Where:
* `<object-list>` can be one of the following object identifier types: `id:<hash>` | `object_path` | `ds:<name>[.v<num>]` | `ws:<workspace_folder>`

`ADD` is the main workhorse of LDB, as it allows data sample(s) to be added to the dataset staged in the workspace.

`ADD` builds a list of objects referenced by their hashsum, storage location, or source dataset, and applies optional filters to this list. Objects passing the filters are merged into the currently staged dataset. When a data object is added to the workspace, an associated annotation may go with it.
The particular annotation version will be determined by the source identifier. **In case of version collisions** (same object referenced multiple times with divergent annotation versions), **the latest annotation will be kept**. For instance, if `ds:cats` and `ds:small-cats` contain some of the same data objects but with different annotation versions, then `ldb add ds:cats ds:small-cats` would need to choose an annotation version for each common data object.

`ADD` allows multiple objects (or object sets) of one type to be specified in one command. If no explicit object sources are provided and a filter option such as `--query` or `--file` is used, `ADD` assumes the source to be `ds:root` – which is all objects indexed by LDB, with the most recently indexed annotation for each.

While `ADD` normally references sources already known to LDB (pre-indexed objects with valid identifiers), it can also target a storage folder directly. If all data objects in the folder have been indexed by LDB before, then no new indexing occurs. If some data objects have not been indexed, then LDB will index them if the config option `core.auto_index = true` is set. Otherwise an error will occur.

A special scenario for `ADD` arises when it targets ephemeral filesystem paths (anything outside the configured storage locations). Most commonly, such targets would be in the current workspace (where new objects were added directly, or where annotations were edited in place). `ADD` understands such changes and will save new data objects into permanent storage (see the `--read-add` option in `ADD-STORAGE`).

## object identifiers supported by `ADD`

1. `id:<hash>` - the full hashsum of an object. Currently LDB uses MD5, so any MD5 hashsum that corresponds to an object indexed by LDB is a valid identifier.
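Because the identifier is a content hash, identical files at different paths resolve to the same `id:`. A sketch of that property using MD5 (as the text notes LDB currently does); the function name and byte strings are illustrative:

```python
import hashlib

def object_id(content: bytes) -> str:
    # Sketch: a data object's id is the MD5 hashsum of its content, so a
    # copy at another path yields the same identifier.
    return hashlib.md5(content).hexdigest()

id_a = object_id(b"same cat picture bytes")
id_b = object_id(b"same cat picture bytes")   # a copy at another path
id_c = object_id(b"a different cat")
```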
*Use case:*
```
$ ldb stage ds:cats
$ ldb add id:456FED id:5656FED # result: objects with hashsum ids 456FED and 5656FED added to dataset
```

2. `object_path` - any fully qualified path (down to an object) or folder path within the registered storage locations. Shell GLOB patterns are supported.

By default, `ADD` checks whether the specified objects (or the list of objects recursively found in a folder) were previously indexed by LDB, and uses the indexed annotations if this is the case. If some objects in the list are not found in the index, the `ADD` command fails with a reminder to run `INDEX` first.

*Use case:*
```
$ ldb stage ds:cats
$ ldb add gs://my-datasets/cats/black-cats/ # previously indexed location
12 objects added to workspace (ds:cats)
```


*Use case:*
```
$ ldb stage ds:cats
$ ldb add gs://my-datasets/cats/white-cats/ # location is registered but folder contains new data
error: 23 objects detected that are not found in ds:root
Please run INDEX command first to process this storage location
```

This behavior can be altered with the `auto-index` option in the LDB configuration file. If set, this option allows `ADD` to implicitly call `INDEX` to process the objects in the given folders or paths. This implicit call to `INDEX` assumes the objects are in the default format (one annotation file per data object), and will fail otherwise.

*Use case:*
```
$ ldb stage ds:cats
$ ldb add gs://my-datasets/cats/white-cats/ # location is registered but folder contains new data
indexing gs://my-datasets/cats/white-cats/
23 objects found, 20 new objects added to index, 3 annotations updated
23 objects added to workspace (ds:cats)
```

3. `object_path` - any valid *fs* path NOT registered with `ADD-STORAGE`. The path can be fully qualified (down to objects), or reference a folder.
When `ADD` is called on an unregistered fs path, it expects annotations in the default format and works in the following way:

* If `object_path` is the workspace:
  - `ADD` will process updated annotations even in the absence of paired data objects (see `INSTANTIATE --annotations-only`)
  - `ADD` will ignore data object transforms (see `TRANSFORM`)

* In all other cases:
  - If previously indexed data objects are found, they are added to the staged dataset, along with their annotations
  - If new objects (unknown to LDB) are found and a `read-add` storage location is configured, those objects are copied to `read-add` storage, indexed, and then added.
  - If new objects (unknown to LDB) are found but no `read-add` storage is configured, the `ADD` command fails.

*Use case:*
```
$ ldb stage ds:cats
$ ldb instantiate --annotations-only
$ sed -i 's/class=cat/class=dog/g' cat1.json
$ ldb add . # result: annotation for cat1.jpg got a new version in LDB, and in ds:cats
```

*Use case:*
```
$ ldb stage ds:cats
$ cp ~/storage/cat1.jpg ./ # bring some object already in LDB but not in this dataset
$ ldb add ./cat1.jpg # result: staged dataset ds:cats now includes cat1.jpg
```

*Use case:*
```
$ ldb stage ds:cats
$ cp ~/Downloads/cat1.jpg ./ # this object is not in LDB and a read-add storage location is configured
$ ldb add . # result: cat1.jpg copied to read-add storage, and then added
```

4. `ds:<name>[.v<num>]` - dataset name with an optional version number. Any valid LDB dataset can serve as a source of objects. Note that every dataset pairs objects with a particular annotation version, so it is possible to build a list where the same object is referenced several times with different annotations. If this is the case, the collision is resolved by using the latest annotation version among the references.
*Use case:*
```
$ ldb stage ds:cats
$ ldb add ds:black_cats ds:white_cats.v2 # merged with latest ds:black_cats and v.2 of ds:white_cats
```

5. `ws:workspace_folder` - `ADD` can take a workspace folder name as an argument. This helps to avoid saving temporary datasets to LDB.

*Use case:*
```
$ mkdir red_cats; cd red_cats
$ ldb stage ds:red_cats # create some temporary dataset
$ ldb add ds:cats --query 'cat_color == `red`' # fill it from some source
$ cd .. ; mkdir green_cats; cd green_cats # create another dataset
$ ldb stage ds:green_cats # create another temporary dataset
$ ldb add ds:cats --query 'cat_color == `green`' # fill it from another source
$ cd ..
$ ldb stage ds:red_and_green_cats # make a permanent dataset
$ ldb add ws:./red_cats ws:./green_cats # merge two temporary datasets into it
$ ldb commit # save a permanent dataset
$ rm -rf green_cats/ red_cats/ # delete temporary datasets

```

`ADD` with a `workspace_folder` argument can also be used to share datasets between different LDB instances. In this case, the only requirement is that the destination LDB instance has access to all file paths of the source workspace.

## filters and modifiers supported by `ADD`

`ADD` can be called with several filter and modifier flags. If multiple flags are specified, the filters are pipelined, so their order may matter. Multiple instances of one flag are not permitted in one `ADD` command.


`--file <query>`

Builds a query (see [LDB Query Syntax](./LDB-queries.md)) using fixed JSON fields specific to the LDB index. The full list of fields can be seen with `ldb eval --file '@'`. A partial list follows:

* fs.mtime - data object modification time.
* last.indexed - annotation last indexing time. Affected by re-indexing (annotations are mutable).
* fs.size - data object file size in bytes.
* fs.path - data object path. If the same data object was indexed under multiple paths, this will match the last one.

Regular JMESPath functions can be used to enrich the query.

*Use case:*
```
$ ldb add --file 'regex(fs.path, `gs:datasets/cat-bucket/.*`)' # Object source is implicitly ds:root, path filtered by regex
```

`--path <regex>`

A convenience alias for ``` --file 'regex(fs.path, `regex expression`)' ```

`--query <query>`

Permits a query (see [LDB Query Syntax](./LDB-queries.md)) that references arbitrary JSON fields present in the object annotation.

*Use case:*
```
$ ldb add --query 'class == `cats`'
```

`--pipe <exec> [<args> ...]`

Pipes the list of objects through an external executable that filters or sorts them (see the Pipe Plugins section below).

`--limit <n>`

Cuts the input list of objects at \<n\> samples.

`--sample <probability>`

Passes every object in the input list with a given Bernoulli probability.

```
$ ldb add ds:cats --sample 0.9
```

`--tag <tags>`

Comma-separated list of tags. Select only data objects that contain at least one.

For example, the following are all equivalent:
```
ldb list --tag a,b --tag c
ldb list --file "contains(tags, 'a') || contains(tags, 'b')" --file "contains(tags, 'c')"
ldb list --file "contains_any(tags, ['a', 'b']) && contains(tags, 'c')"
```

`--no-tag <tags>`

Comma-separated list of tags. Select only data objects where at least one of these tags is missing.

For example, the following are equivalent:
```
ldb list --no-tag a,b --no-tag c
ldb list --file "! contains(tags, 'a') || ! contains(tags, 'b')" --file "! contains(tags, 'c')"
ldb list --file "! contains_all(tags, ['a', 'b']) && ! contains(tags, 'c')"
```

## Pipe Plugins

The `--pipe` option for the LDB dataset commands `list`, `eval`, `add`, and `del` takes one or more arguments which will be called as a subprocess. The first argument should be the name of a script or executable which filters or sorts the dataset members passed to it. If this is only a name rather than a path, the first place LDB looks is the `custom_code/ldb_user_filters/` directory within an ldb instance. By default this would be:
```
~/.ldb/private_instance/custom_code/ldb_user_filters/
```
Any executable available by name or path may be used. This internal directory is simply a place to isolate scripts from the rest of your environment if you wish.

If multiple arguments are given to `--pipe`, they are all called together as a single command. Flags or options (arguments beginning with `-`) should be avoided as they will collide with LDB's own options. Complex commands may be wrapped in a script, so that only positional arguments are needed.

Because datasets are unordered collections, an ordering or sorting operation is most useful when combined with a following filter operation such as `--limit`.

A script intended for use by `--pipe` should expect a JSON array via stdin where each item is a three-element array in the form `[data_object_hash, data_object_path, annotation_path]`. LDB will instantiate the dataset in a temporary location, so the data object and annotation paths will point to files in this location. The script should then provide its filtered results as a series of data object hashes separated by newlines. This could be any type of sort or filter operation.
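The protocol above can be illustrated with a hypothetical filter script (not part of LDB; the 1 KiB threshold and the reliance on `os.path.getsize` are illustration choices) that keeps only objects whose instantiated data file exceeds a size threshold:

```python
#!/usr/bin/env python3
"""Hypothetical --pipe filter: keep objects with data files over 1 KiB."""
import json
import os
import sys

THRESHOLD = 1024  # bytes; arbitrary example value


def keep(item):
    # item is [data_object_hash, data_object_path, annotation_path]
    return os.path.getsize(item[1]) > THRESHOLD


if __name__ == "__main__":
    for item in json.loads(sys.stdin.read()):
        if keep(item):
            print(item[0], flush=True)
```

As with any pipe plugin, the hashes it prints determine which objects the calling command operates on.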
Here is a simple example of a Python script that reverses the order of items in a dataset:

```python3
import json
import sys

if __name__ == "__main__":
    for data_object_hash, *_ in reversed(json.loads(sys.stdin.read())):
        print(data_object_hash, flush=True)
```

Scripts should be created and called the same way they would be called on the command line. For example, a Python script like the one above could be run in the following ways:

#### Platform independent

Place the Python code in `reverse.py` and call it with `python3 path/to/reverse.py`. The disadvantage of this method is that you have to specify the path to `reverse.py`.

Example usage:
```
ldb add --pipe python3 path/to/reverse.py --limit 10
```

#### Unix

On Linux, macOS, or other Unix-like systems, simply put the Python code in `reverse` with a shebang at the top:
```
#!/usr/bin/env python3
```
Make sure `reverse` is executable (`chmod +x reverse` or `chmod u+x reverse`). Then use the path to `reverse`, make sure `reverse` is on your `$PATH`, or place `reverse` in the ldb instance's plugin directory:
```
mv reverse ~/.ldb/private_instance/custom_code/ldb_user_filters/
```
Example usage:
```
ldb add --pipe reverse --limit 10
```

#### Windows

To run the Python script with a single command, put the code in `reverse.py`, and create a batch file, `reverse.bat`, in the same directory with:
```
@echo off
python3 "%~dp0\reverse.py"
```

Then use the full path of `reverse.bat`, make sure `reverse.bat` is in a location where it can be called, or place both `reverse.py` and `reverse.bat` in the ldb instance's plugin directory.
Example usage:
```
ldb add --pipe reverse --limit 10
```

#### Plugin script examples

For `--pipe` plugin examples, see [pipe-plugins](../pipe-plugins). Copy the files in that directory to the ldb instance's plugin directory to make `reverse` available on Unix-like or Windows environments.

For `--apply` plugin examples, see [apply-plugins](../apply-plugins).


## LDB Query Language

The ability to construct complex queries is one of the key features of LDB, permitting it to extract the data objects best suited for training. LDB uses [JMESPath](https://jmespath.org) and supports JSON slices, projections, and reductions. This means, for example, that an ML engineer can request only images with a given number of objects of a particular class detected.

Examples of LDB queries:

```
classes[0:1] == `["cats", "dogs"]`
```

```
! regex(classes[0], `cat.*`) && length(classes) < `5`
```
More query examples are given [here](LDB-queries.md)

### LDB-provided Custom Query Functions

LDB provides a number of custom JMESPath functions. These are specified below with a signature in the format used by the [JMESPath spec documentation](https://jmespath.org/specification.html#built-in-functions):
```
return_type function_name(type $argname)
```

Regex functions:

These use the Python standard library's `re` module internally.

**regex**
```
bool regex(string $input_str, string $pattern)
```
Returns a boolean indicating whether or not `$input_str` matches `$pattern`.

**regex_match**
```
string|null regex_match(string $input_str, string $pattern)
```
Returns a string containing the matched group if `$input_str` matches `$pattern` and `null` otherwise.
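In plain Python, these semantics can be sketched with the standard `re` module (an illustration of the documented behavior, assuming search-anywhere matching and whole-match return; LDB's actual implementation may differ):

```python
import re


def regex(input_str, pattern):
    # True if the pattern matches anywhere in the input string.
    return re.search(pattern, input_str) is not None


def regex_match(input_str, pattern):
    # The matched text if found, otherwise None (JSON null).
    m = re.search(pattern, input_str)
    return m.group(0) if m else None
```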
Math functions:

Each of the following takes two arguments, which may each be either a one-dimensional array of numbers (a vector) or a single number, and applies a binary operator. If both arguments are arrays, the operation is applied element-wise. If at least one argument is an array, an array is returned.

**add**, **sub**, **mul**, **div**
```
array|number add(array|number $x1, array|number $x2)
array|number sub(array|number $x1, array|number $x2)
array|number mul(array|number $x1, array|number $x2)
array|number div(array|number $x1, array|number $x2)
```

**neg**
```
array|number neg(array|number $x)
```
Returns the negation of the given number or of each number in the given array.


### User-defined Custom Query Functions

Additional custom JMESPath functions may be added by placing a Python file in the `custom_code/ldb_user_functions/` directory within an ldb instance. By default, this would be:
```
~/.ldb/private_instance/custom_code/ldb_user_functions/
```
Any dependencies that this file relies on should be supplied by the user in this directory as well, so it is generally best to use only the Python standard library. In order to register your custom functions, at least one of the Python files in this directory must contain a `CUSTOM_FUNCTIONS` variable with a mapping (`dict`) of function names to two-element tuples. The first tuple element should be the function, and the second should be a list of acceptable JSON types for each argument. If an argument accepts more than one type, the types should be separated by a vertical bar (`|`).
For example, if you didn't want to use the `add` function provided by LDB, you could create functions `add_nums` and `add_arrays` by creating a file `~/.ldb/private_instance/custom_code/ldb_user_functions/math_funcs.py` with the following:
```
def add_nums(x1, x2):
    return x1 + x2


def add_arrays(a1, a2):
    return [x1 + x2 for x1, x2 in zip(a1, a2)]


CUSTOM_FUNCTIONS = {
    "add_nums": (add_nums, ["number", "number"]),
    "add_arrays": (add_arrays, ["array", "array"]),
}
```
For an argument that could be a number or an array, you would use `"array|number"` instead of just `"number"`.

# DEL

```
ldb del [<object-identifiers>] [<filters>]
```

`DEL` takes the same arguments and filters as `ADD`, but instead of adding the filtered objects, it subtracts them from the dataset staged in the workspace. If objects provided to `DEL` are not in the dataset, `DEL` does nothing.

# TAG

```
ldb tag [<object-identifiers>] [<filters>]
```

A tag is a text string over the ASCII character set `[0-9A-Za-z_-]`. Multiple tags can be attached to data objects. Tags attached to objects are global – which means they apply to all instances of an object in all datasets, irrespective of their annotations.

`TAG` takes the same arguments and filters as the `ADD` command to identify the datasets or individual objects to which tags are applied.

## flags

`-a <tags>`, `--add <tags>`

Comma-separated list of tags to add to data objects

`-r <tags>`, `--remove <tags>`

Comma-separated list of tags to remove from data objects

# SYNC
```
ldb sync [<source>] [<filters>]
```

`SYNC` synchronizes the workspace state with the given `<source>`. If no arguments are given, the current working directory is used. It acts as a combination of the `ADD` and `DEL` commands.
_Use case:_
```
$ ldb instantiate # instantiate the workspace dataset
$ rm cats1.jpg # delete one object file
$ ldb sync # pick up changes in workspace
```

# TRANSFORM
```
ldb transform [<object-identifiers>]
              [-a <transforms>] [-r <transforms>] [-s <transforms>]
```

Add, remove, or set transforms for data objects within a dataset. Transforms are commands that will be run for each data object they are assigned to during instantiation, as the final step when using the `bare-pairs` (the default) or `strict-pairs` formats. Each transform will be given a temporary path for the data object and annotation, as well as an output directory, and the transform is responsible for writing its output to the output directory. This may be used to generate any number of augmented data objects or modified annotations during instantiation.

This command takes the same query arguments as the `ADD` command.

## flags

```
-a <transforms>, --add <transforms>
                      Comma-separated set of transform names to add
-r <transforms>, --remove <transforms>
                      Comma-separated set of transform names to remove if
                      present
-s <transforms>, --set <transforms>
                      Comma-separated set of transform names to assign. A
                      matching data object's entire transform set will be
                      replaced with this one
```

## configuration

In order to assign transforms to a dataset you must configure a name for each transform. This may be done by creating a `transform.<name>` section with a `run` key in your ldb instance's `config` (`~/.ldb/private_instance/config` for the default instance). The `run` key should be set to the command (possibly with arguments) in the form of an array. The command should be available as an executable on your system's `$PATH`, or it should be an absolute path. For example, you may have an executable `rotate-image` available which takes a number of degrees as an argument and writes a rotated version of the input image.
You could configure different rotation transforms by adding the following to your `config`:
```toml
[transform.rotate-90]
run = ["rotate-image", "90"]

[transform.rotate-180]
run = ["rotate-image", "180"]
```
Then you can assign these transforms to data objects in a working dataset by running:
```
ldb transform -a rotate-90,rotate-180
```
Note that the default set of transforms contains a single item, `self`, which refers to a builtin transform that executes raw instantiation. This means that the original data object and annotation are simply copied to the target directory during instantiation. If the original data object and annotation are not wanted, then the `self` transform may be removed with:
```
ldb transform -r self
```
Instead of adding to and removing from existing sets of transforms, the exact set of transforms to use may be specified with the `-s` or `--set` option:
```
ldb transform -s rotate-90,rotate-180
```

`-a` and `-r` may be used together in the same transform command, but `-s` may not be used with either `-a` or `-r`.

Query options or data object identifiers may be used to filter down the list of data objects updated:
```
ldb transform ds:other-dataset --query 'label == `cats`' --limit 10
```
If none are given, then the update applies to every data object in the dataset.

The `LIST` command will show a `Transforms` column with the comma-separated names of transforms assigned to each data object in the dataset.

For a simple example, see [transforms/rotate.py](../transforms/rotate.py).
An example config for this script:
```toml
[transform.rotate-90]
run = ["python3", "path/to/transforms/rotate.py", "90"]

[transform.rotate-180]
run = ["python3", "path/to/transforms/rotate.py", "180"]

[transform.rotate-45-n45]
run = ["python3", "path/to/transforms/rotate.py", "45", "-45"]
```

## Replacing transforms

In LDB, a transform assigned to a data object is an actual command (e.g. `["rotate-image", "90"]`) saved as an immutable object. The configured name for a transform (e.g. `rotate-90`) is simply an identifier, providing an easy way to refer to a particular command. This means that renaming transforms in your config file is perfectly fine. However, new commands should receive new config entries.

For example, changing `[transform.rotate-90]` to `[transform.rotate-image-90]` without changing the array under `run` allows you to refer to the same transform with a new name, `rotate-image-90`. The `LIST` command's output will automatically reflect this change. However, if you want to modify the command assigned to some data objects, you should add a new config entry for the new command. Then use the `TRANSFORM` command's `-a` and `-r` options to add/remove transforms.

Note that this also means that transform config entries should generally only be removed if they are not assigned to any data objects in any dataset. If an assigned transform is deleted (or the array under `run` is modified), then `LIST` will refer to the unnamed transform with a hash identifier until you add a name for the original command. You may also use this hash in place of the transform's name with the `TRANSFORM` command's `-a`, `-r`, and `-s` options.

# INSTANTIATE
```
ldb instantiate [<identifiers>] [<path>]
                [--pipe <exec> [<args> ...]] [-m <format>]
                [-f] [-t <target-dir>] [--apply <exec> [<args> ...]]
                [-p <key>=<value>]
```

`INSTANTIATE` partially or fully re-creates a dataset in a workspace.
This command works whether the dataset in the workspace is committed (clean) or not (dirty). To partially reconstruct the dataset, `INSTANTIATE` can take any valid object ids – hashsums or full object paths (only those objects are instantiated). If a sub-folder is provided, instantiation happens in this sub-folder, which is created if needed.

## flags

`-p <key>=<value>`, `--param <key>=<value>` Format-specific option. May be used multiple times. Different formats support different options. The following are supported:
* `infer`
  * `base-label=<label>` - a single label to be instantiated in the base directory instead of a subdirectory. Provides a way to mirror the same option on the `index` command.
  * `label-key=<key>` - a JMESPath key expression indicating which key the inferred label should be stored under. The default when this option is not used is `label`.

`--apply <exec> [<args> ...]`

An executable, along with any arguments it should take, which should apply the final instantiation step. This is useful for making inferences or applying other transformations during instantiation.

LDB will change the working directory to the executable's parent directory before calling it as a subprocess, in order to make it easy for the executable to reference any relevant artifacts (i.e. ML models or data) with relative paths.

LDB will first instantiate data objects and annotations normally in a temporary directory. A two-member JSON array will be passed to this executable, containing first the temporary directory and second the final directory the executable should write to. For example, the executable would receive something like this:
```json
["/home/user/workspace-dir/.ldb_workspace/tmp/tmplole6mzj", "/home/user/workspace-dir"]
```
Then the executable should read files from the first directory, and write results to the second directory.
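As a minimal sketch of such an executable (assuming the two-member array arrives on stdin; the names and the plain file copy are illustration choices), a no-op `--apply` step that copies every instantiated file into the final directory might look like:

```python
#!/usr/bin/env python3
"""Hypothetical --apply step: copy instantiated files to the output directory."""
import json
import shutil
import sys
from pathlib import Path


def apply_step(tmp_dir, out_dir):
    # Read the files LDB instantiated in the temporary directory and
    # write (possibly transformed) results into the final directory.
    for path in Path(tmp_dir).iterdir():
        if path.is_file():
            shutil.copy(path, Path(out_dir) / path.name)


if __name__ == "__main__":
    tmp_dir, out_dir = json.loads(sys.stdin.read())
    apply_step(tmp_dir, out_dir)
```

A real script would transform the data objects or annotations between reading and writing instead of copying them verbatim.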
This allows the executable to transform data objects or annotations in any way. After the executable's process finishes, LDB will erase the temporary directory and any files remaining in it.

For a simple example, see [apply-plugins/random_predictions.py](../apply-plugins/random_predictions.py). This script simply makes random predictions and adds them under a `"prediction"` key for each existing annotation.

For a more practical example, you can index a sample of TextOCR annotations, then use [apply-plugins/textocr_crops.py](../apply-plugins/textocr_crops.py) to find text annotations matching a regex and generate cropped images based on their bounding boxes. In this example we look for annotations matching roman numerals in the range 1-10 (i.e. I-X):
```
ldb stage ds:roman-numerals
ldb index s3://ldb-public/remote/data-lakes/textocr/small/
ldb add --path '^ldb-public/remote/data-lakes/textocr/small/.*'
ldb instantiate --apply python3 path/to/apply-plugins/textocr_crops.py '(?i)^(V?I{1,3}|I?[VX])$'
```

Note: Because `--apply` can take any number of arguments, the positional path argument that `instantiate` can take should come before `--apply`:
```
ldb instantiate ./some/path --apply script arg1 arg2
```
Alternatively, it may go after `--`, which indicates that all following arguments are positional:
```
ldb instantiate --apply script arg1 arg2 -- ./some/path
```

`--annotations`, `--annotations-only`

Only instantiates annotations (no data objects). Can be combined with `--format`.

`--format <format>` - choices: `{strict,strict-pairs,bare,bare-pairs,infer,tensorflow-inferred,annot,annotation-only}` - sets the schema for instantiated data objects and annotations. The list of formats mirrors those accepted by the `INDEX` command's `--format` flag; see `INDEX` for details about each format.
`--preserve-paths`

Instantiate objects preserving their full storage paths. Only supported for the default LDB format (an annotation file per every object).

# GET

```
ldb get [--pipe <exec> [<args> ...]] [-m <format>]
        [-t <target-dir>] [--apply <exec> [<args> ...]] [-p <key>=<value>]
        [<identifier> [<identifier> ...]]
```

Add the specified data objects to a working dataset and instantiate them.
This command works as a combination of several LDB operations and carries out the following steps:

- `stage` an unnamed dataset in the target directory (if it is not already a valid workspace)
- `add` the specified data objects
- `instantiate` these data objects

The differences from running the `stage`, `add`, and `instantiate` commands separately are:
- If the directory is not already a workspace, then the newly staged working dataset will be given a temporary name. To save such a dataset you must provide a name when calling `commit`.
- The data object identifiers and query filters are run once and resolved to a logical dataset internally. Then `add` and `instantiate` are both run on this dataset. This avoids the overhead of resolving these specifiers multiple times, and ensures that non-deterministic options such as `--sample` do not result in different datasets for the `add` and `instantiate` operations.
- The `instantiate` command normally results in a full instantiation of all items in the working dataset, and ensures that no additional files exist in the workspace. Under the `get` command, the instantiation only applies to the data objects specified by the command and does not remove additional files. This allows for partial and additive instantiations.
One use case is quickly obtaining a dataset for training:
```
ldb get ds:cats
```
Or the union of multiple datasets:
```
ldb get ds:dogs ds:cats
```

Since any dataset identifiers are allowed, this could also be a storage location:
```
ldb get s3://ldb-public/remote/data-lakes/dogs-and-cats/
```
Or a filtered result:
```
ldb get s3://ldb-public/remote/data-lakes/dogs-and-cats/ --query 'inference.class == class' --limit 10
```

You may also wish to specify the target directory instead of using the current directory:
```
ldb get ds:cats -t cats
ldb get ds:dogs ds:cats -t dogs-and-cats
```

# COMMIT
```
ldb commit [-m <message>] [--auto-pull [{True,False}]] [<dataset>]
```

`COMMIT` takes the currently staged dataset and saves it to LDB. This action renders the workspace "clean" – meaning that all changes are saved, and the workspace can be erased if needed. The result of the `COMMIT` command on a "dirty" workspace is always a new version of the dataset.

The optional `message` flag will be added as the commit message and shown by `ldb status` when called with a dataset as an argument.

# DIFF
```
ldb diff [-s] [<dataset>] [<dataset>]
```

`DIFF` prints a list of differences between two datasets. `DIFF` with one argument can only run from a workspace and uses the currently staged dataset as the first comparand.

## flags
```
-s, --summary         Show only the number of additions, deletions and
                      modifications.
```

# LIST
```
ldb list [<object-identifiers>] [<filters>]
```

`LIST` can take exactly the same arguments as `ADD` but only prints the matching objects instead of actually adding them.
Unlike `ADD`, `LIST` without arguments targets objects in the staged dataset. To target objects in the LDB index, use `ds:root` as the object source.
## flags

`-s` or `--summary`

just prints object counts

`-v` or `--verbose`

detailed object information

# STATUS
```
ldb status [<dataset>]
```

When run without arguments from a workspace, `STATUS` summarizes the state of the staged dataset. This includes any uncommitted changes and current object counts. If called with an argument, `STATUS` prints a summary for the given dataset.

# PULL
```
ldb pull [<object-ids>]
```

`PULL` changes the annotation versions for the indicated object(s) in the workspace to the latest known to LDB. If no `object-id(s)` are specified, the command applies to all objects in the workspace. Pull actions applied to objects not in the current workspace are ignored.

# DS
```
ldb ds list
ldb ds del ds:<name> [ds:<name> ...]
```

## DS LIST
```
ldb ds list
```

Lists the latest versions of all datasets in the LDB repository.

## DS DEL
```
ldb ds del ds:<name> [ds:<name> ...]
```

Deletes the given dataset entries. This command deletes all of the versions under a given dataset name, so arguments should not specify a version number. For example, use `ds:my-dataset` rather than `ds:my-dataset.v2`.

# EVAL
```
ldb eval [<object-identifiers>] [<filters>]
```
`EVAL` works the same way as `LIST`, except that it prints out JSON results. Any `--query` or `--file` option that comes before other filter options (such as `--limit`, `--pipe`, or multiple `--query` options) will be used to filter items; but if the command ends with a `--query`, a `--file`, or both, then the JSON values resulting from applying these final queries will be displayed rather than used to filter out items. This is useful for debugging queries for other commands such as `ADD` and `LIST`.
The `query` argument must be a valid JMESPath query to be run over annotations (if used with the `--query` flag) or over data object file attributes (if used with `--file`). The `path` arguments may be any data object identifiers that the `ADD` command can take.

The `-j` or `--json-only` option will print only the JSON query results. Without it, each JSON object is preceded by the corresponding data object hash.

# UNINDEX

```
ldb unindex [<object-identifiers>] [<filters>]
```

`UNINDEX` takes the same arguments and filters as `DEL` and permanently removes the given data object entries from the index. This requires that the given data objects not be members of any saved datasets; otherwise an error will be thrown. To delete datasets that are no longer useful, see the `ldb ds del` command.

# COMPLETION
```
ldb completion [-h] [-q | -v] [-s {bash,zsh,tcsh}]
```
To add tab-completion for a particular shell, save the output of this command into a file in your shell's completion directory. Use the `-s` option to specify your shell. For example, on a Linux machine, adding bash completion might be:
```
ldb completion -s bash | sudo tee /usr/share/bash-completion/completions/ldb
```
And adding zsh completion might be:
```
ldb completion -s zsh | sudo tee /usr/local/share/zsh/site-functions/_ldb
```

The exact location of each shell's completion directory varies from system to system.