├── .gitattributes
├── .gitignore
├── README.md
├── apputils.py
├── build_docker_image.sh
├── build_metadata_file.py
├── requirements.txt
└── tagging_app.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bin.* filter=lfs diff=lfs merge=lfs -text
5 | *.bz2 filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.model filter=lfs diff=lfs merge=lfs -text
12 | *.msgpack filter=lfs diff=lfs merge=lfs -text
13 | *.onnx filter=lfs diff=lfs merge=lfs -text
14 | *.ot filter=lfs diff=lfs merge=lfs -text
15 | *.parquet filter=lfs diff=lfs merge=lfs -text
16 | *.pb filter=lfs diff=lfs merge=lfs -text
17 | *.pt filter=lfs diff=lfs merge=lfs -text
18 | *.pth filter=lfs diff=lfs merge=lfs -text
19 | *.rar filter=lfs diff=lfs merge=lfs -text
20 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21 | *.tar.* filter=lfs diff=lfs merge=lfs -text
22 | *.tflite filter=lfs diff=lfs merge=lfs -text
23 | *.tgz filter=lfs diff=lfs merge=lfs -text
24 | *.xz filter=lfs diff=lfs merge=lfs -text
25 | *.zip filter=lfs diff=lfs merge=lfs -text
26 | *.zstandard filter=lfs diff=lfs merge=lfs -text
27 | *tfevents* filter=lfs diff=lfs merge=lfs -text
28 | *.json filter=lfs diff=lfs merge=lfs -text
29 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
140 | .idea
141 | metadata_*.json
142 | datasets/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Datasets Tagging
3 | emoji: 🤗
4 | colorFrom: pink
5 | colorTo: blue
6 | sdk: streamlit
7 | app_file: tagging_app.py
8 | pinned: false
9 | ---
10 |
11 | ## ⚠️ This repo is now directly maintained in the Space repo at https://huggingface.co/spaces/huggingface/datasets-tagging ⚠️
12 |
13 | You can clone it from there with `git clone https://huggingface.co/spaces/huggingface/datasets-tagging`.
14 |
15 | You can also open pull requests and discussions there: https://huggingface.co/spaces/huggingface/datasets-tagging/discussions.
16 |
17 |
18 | # 🤗 Datasets Tagging
19 | A Streamlit app to add structured tags to a dataset card.
20 | Available online [here!](https://huggingface.co/spaces/huggingface/datasets-tagging)
21 |
22 |
23 | 1. `pip install -r requirements.txt`
24 | 2. `./build_metadata_file.py` builds an up-to-date metadata file from the `datasets` repository (cloning it locally, or pulling if it is already there)
25 | 3. `streamlit run tagging_app.py`
26 |
27 | This will give you a `localhost` link you can click to open in your browser.
28 |
29 | App initialization takes a few minutes on the first run; subsequent runs are faster.
30 |
31 | Make sure to hit the `Done? Save to File!` button in the right column when you're done tagging a config!
32 |
--------------------------------------------------------------------------------
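Note on the metadata export: step 2 of the README above writes a `metadata_<datasets-commit-sha>.json` file next to the app, and `tagging_app.py` loads the most recent such export at startup. Below is a minimal sketch of reading one back outside the app; the field layout follows `build_metadata_file.py`, and the dataset key `"ag_news"` is only an illustrative example.

```python
import json
from pathlib import Path

# Pick the newest metadata export written by ./build_metadata_file.py,
# mirroring what tagging_app.py does at startup.
exports = sorted(Path.cwd().glob("metadata_*.json"), key=lambda f: f.stat().st_mtime, reverse=True)
if not exports:
    raise SystemExit("run ./build_metadata_file.py first")

with exports[0].open() as fi:
    datasets_md = json.load(fi)

# Each entry keeps the README YAML header under "metadata" and dataset_infos.json under "infos".
example = datasets_md.get("ag_news", {})  # "ag_news" is just an illustrative key
print(example.get("metadata", {}).get("task_categories"))
```
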
/apputils.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 |
3 |
4 | def new_state() -> Dict[str, List]:
5 | return {
6 | "task_categories": [],
7 | "task_ids": [],
8 | "multilinguality": [],
9 | "languages": [],
10 | "language_creators": [],
11 | "annotations_creators": [],
12 | "source_datasets": [],
13 | "size_categories": [],
14 | "licenses": [],
15 | "pretty_name": None,
16 | }
17 |
--------------------------------------------------------------------------------
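`new_state()` above defines the empty tag set the app fills in; the same keys end up as the YAML header of a dataset card. A small sketch of how a filled-in state serializes to the YAML block that `tagging_app.py` displays; the tag values here are purely illustrative.

```python
import yaml

from apputils import new_state

# Fill a few of the fields the way the app would; these values are illustrative only.
state = new_state()
state["task_categories"] = ["text-classification"]
state["languages"] = ["en"]
state["licenses"] = ["mit"]
state["size_categories"] = ["10K<n<100K"]
state["pretty_name"] = "My Dataset"

# tagging_app.py shows the finalized tag set as a yaml.dump of this dict.
print(yaml.dump(state))
```
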
/build_docker_image.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | cleanup() {
4 | rm -f Dockerfile .dockerignore
5 | }
6 |
7 | trap cleanup ERR EXIT
8 |
9 | ./build_metadata_file.py
10 |
11 | cat > .dockerignore << EOF
12 | .git
13 | datasets
14 | EOF
15 |
16 | cat > Dockerfile << EOF
17 | FROM python
18 | COPY requirements.txt tagging_app.py task_set.json language_set.json license_set.json metadata_927d44346b12fac66e97176608c5aa81843a9b9a.json ./
19 | RUN pip install -r requirements.txt
20 | RUN pip freeze
21 | CMD ["streamlit", "run", "tagging_app.py"]
22 | EOF
23 |
24 | set -eEx
25 |
26 | docker build -t dataset-tagger .
27 |
--------------------------------------------------------------------------------
/build_metadata_file.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """ This script clones the `datasets` repository into your current directory (or updates it if already present) and
4 | parses all currently available metadata from the `README.md` YAML headers and the automatically generated JSON files.
5 | It dumps the results into a `metadata_{current-commit-of-datasets}.json` file.
6 | """
7 |
8 | import json
9 | from pathlib import Path
10 | from subprocess import check_call, check_output
11 | from typing import Dict
12 |
13 | import yaml
14 |
15 | from apputils import new_state
16 |
17 |
18 | def metadata_from_readme(f: Path) -> Dict:
19 | with f.open() as fi:
20 | content = [line.rstrip() for line in fi]
21 |
22 | if content[0] == "---" and "---" in content[1:]:
23 | yamlblock = "\n".join(content[1 : content[1:].index("---") + 1])
24 | return yaml.safe_load(yamlblock) or dict()
25 |
26 |
27 | def load_ds_datas():
28 | drepo = Path("datasets")
29 | if drepo.exists() and drepo.is_dir():
30 | check_call(["git", "pull"], cwd=drepo)
31 | else:
32 | check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
33 | head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)
34 |
35 | datasets_md = dict()
36 |
37 | for ddir in sorted((drepo / "datasets").iterdir(), key=lambda d: d.name):
38 |
39 | try:
40 | metadata = metadata_from_readme(ddir / "README.md")
41 | except:
42 | metadata = None
43 | if metadata is None or len(metadata) == 0:
44 | metadata = new_state()
45 |
46 | try:
47 | with (ddir / "dataset_infos.json").open() as fi:
48 | infos = json.load(fi)
49 | except:
50 | infos = None
51 |
52 | datasets_md[ddir.name] = dict(metadata=metadata, infos=infos)
53 | return head_sha.decode().strip(), datasets_md
54 |
55 |
56 | if __name__ == "__main__":
57 | head_sha, datas = load_ds_datas()
58 | fn = f"metadata_{head_sha}.json"
59 | print(f"writing to '{fn}'")
60 | with open(fn, "w") as fi:
61 | fi.write(json.dumps(datas))
62 |
--------------------------------------------------------------------------------
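`metadata_from_readme()` above pulls the YAML front matter out of a dataset card: everything between the opening `---` line and the next `---` line is fed to `yaml.safe_load`. A quick sketch of that slicing on an inline example; the header content is invented for illustration.

```python
import yaml

# A dataset card starts with a YAML front-matter block fenced by "---" lines;
# this example header is invented purely for illustration.
readme_text = """---
languages:
- en
licenses:
- mit
---
# Some dataset
Free-form description below the header.
"""

content = [line.rstrip() for line in readme_text.splitlines()]
if content[0] == "---" and "---" in content[1:]:
    # Same slicing as metadata_from_readme(): keep the lines between the two "---" markers.
    yamlblock = "\n".join(content[1 : content[1:].index("---") + 1])
    print(yaml.safe_load(yamlblock))  # -> {'languages': ['en'], 'licenses': ['mit']}
```
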
/requirements.txt:
--------------------------------------------------------------------------------
1 | pyyaml
2 | datasets==1.9.0
3 | streamlit>=0.88.0
4 | langcodes[data]
5 |
--------------------------------------------------------------------------------
/tagging_app.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | from pathlib import Path
4 | from typing import Callable, Dict, List, Tuple
5 |
6 | import langcodes as lc
7 | import streamlit as st
8 | import yaml
9 | from datasets.utils.metadata import (
10 | DatasetMetadata,
11 | known_creators,
12 | known_licenses,
13 | known_multilingualities,
14 | known_size_categories,
15 | known_task_ids,
16 | )
17 |
18 | from apputils import new_state
19 |
20 | st.set_page_config(
21 | page_title="HF Dataset Tagging App",
22 | page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
23 | layout="wide",
24 | initial_sidebar_state="auto",
25 | )
26 |
27 | # XXX: restyle error boxes: streamlit does not respect whitespace in `st.error` and doesn't scroll horizontally,
28 | # both of which make error reports harder to read
29 | st.markdown(
30 | """
31 | <style>
32 | /* keep whitespace intact in error boxes and let them scroll horizontally */
33 | div[role=alert] { white-space: pre; overflow-x: auto; }
34 | </style>
35 | """,
36 | unsafe_allow_html=True,
37 | )
38 |
39 | ########################
40 | ## Helper functions
41 | ########################
42 |
43 |
44 | def load_ds_datas() -> Dict[str, Dict[str, Dict]]:
45 |     metadata_exports = sorted(
46 |         [f for f in Path.cwd().iterdir() if f.name.startswith("metadata_")],
47 |         key=lambda f: f.lstat().st_mtime,
48 |         reverse=True,
49 |     )
50 |     if len(metadata_exports) == 0:
51 |         raise ValueError("need to run ./build_metadata_file.py at least once")
52 |     with metadata_exports[0].open() as fi:
53 |         logging.info(f"loaded {metadata_exports[0]}")
54 |         return json.load(fi)
55 |
56 |
57 | def split_known(vals: List[str], okset: List[str]) -> Tuple[List[str], List[str]]:
58 | if vals is None:
59 | return [], []
60 | return [v for v in vals if v in okset], [v for v in vals if v not in okset]
61 |
62 |
63 | def multiselect(
64 | w: st.delta_generator.DeltaGenerator,
65 | title: str,
66 | markdown: str,
67 | values: List[str],
68 | valid_set: List[str],
69 | format_func: Callable = str,
70 | ):
71 | valid_values, invalid_values = split_known(values, valid_set)
72 | w.markdown(f"#### {title}")
73 | if len(invalid_values) > 0:
74 | w.markdown("Found the following invalid values:")
75 | w.error(invalid_values)
76 | return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
77 |
78 |
79 | def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
80 | try:
81 | DatasetMetadata(**state_dict)
82 | w.markdown("✅ This is a valid tagset! 🤗")
83 | except Exception as e:
84 | w.markdown("❌ This is an invalid tagset, here are the errors in it:")
85 | w.error(e)
86 |
87 |
88 | def map_num_examples_to_size_categories(n: int) -> str:
89 | if n <= 0:
90 | size_cat = "unknown"
91 | elif n < 1000:
92 | size_cat = "n<1K"
93 | elif n < 10000:
94 |         size_cat = "1K<n<10K"
95 |     elif n < 100000:
96 |         size_cat = "10K<n<100K"
97 |     elif n < 1000000:
98 |         size_cat = "100K<n<1M"
99 |     elif n < 10000000:
100 |         size_cat = "1M<n<10M"
101 |     elif n < 100000000:
102 |         size_cat = "10M<n<100M"
103 |     elif n < 1000000000:
104 |         size_cat = "100M<n<1B"
105 |     elif n < 10000000000:
106 |         size_cat = "1B<n<10B"
107 |     elif n < 100000000000:
108 |         size_cat = "10B<n<100B"
109 |     elif n < 1000000000000:
110 |         size_cat = "100B<n<1T"
111 |     else:
112 |         size_cat = "n>1T"
113 |     return size_cat
114 |
115 |
116 | def is_state_empty(state: Dict) -> bool:
117 | return sum(len(v) if v is not None else 0 for v in state.values()) == 0
118 |
119 |
120 | state = new_state()
121 | datasets_md = load_ds_datas()
122 | dataset_ids = list(datasets_md.keys())
123 | dataset_id_to_metadata = {name: mds["metadata"] for name, mds in datasets_md.items()}
124 | dataset_id_to_infos = {name: mds["infos"] for name, mds in datasets_md.items()}
125 |
126 |
127 | ########################
128 | ## Dataset selection
129 | ########################
130 |
131 |
132 | st.sidebar.markdown(
133 | """
134 | # HuggingFace Dataset Tagger
135 |
136 | This app aims to make it easier to add structured tags to the datasets present in the library.
137 |
138 | """
139 | )
140 |
141 |
142 | queryparams = st.experimental_get_query_params()
143 | preload = queryparams.get("preload_dataset", list())
144 | preloaded_id = None
145 | initial_state = None
146 | initial_infos, initial_info_cfg = None, None
147 | dataset_selector_index = 0
148 |
149 | if len(preload) == 1 and preload[0] in dataset_ids:
150 | preloaded_id, *_ = preload
151 | initial_state = dataset_id_to_metadata.get(preloaded_id)
152 | initial_infos = dataset_id_to_infos.get(preloaded_id)
153 | initial_info_cfg = next(iter(initial_infos)) if initial_infos is not None else None # pick first available config
154 | state = initial_state or new_state()
155 | dataset_selector_index = dataset_ids.index(preloaded_id)
156 |
157 | preloaded_id = st.sidebar.selectbox(
158 | label="Choose dataset to load tag set from", options=dataset_ids, index=dataset_selector_index
159 | )
160 |
161 | leftbtn, rightbtn = st.sidebar.columns(2)
162 | if leftbtn.button("pre-load"):
163 | initial_state = dataset_id_to_metadata[preloaded_id]
164 | initial_infos = dataset_id_to_infos[preloaded_id]
165 | initial_info_cfg = next(iter(initial_infos)) # pick first available config
166 | state = initial_state or new_state()
167 | st.experimental_set_query_params(preload_dataset=preloaded_id)
168 | if not is_state_empty(state):
169 | if rightbtn.button("flush state"):
170 | state = new_state()
171 | initial_state = None
172 | preloaded_id = None
173 | st.experimental_set_query_params()
174 |
175 | if preloaded_id is not None and initial_state is not None:
176 | st.sidebar.markdown(
177 | f"""
178 | ---
179 | The current base tagset is [`{preloaded_id}`](https://huggingface.co/datasets/{preloaded_id})
180 | """
181 | )
182 | validate_dict(st.sidebar, initial_state)
183 | st.sidebar.markdown(
184 | f"""
185 | Here is the matching yaml block:
186 |
187 | ```yaml
188 | {yaml.dump(initial_state)}
189 | ```
190 | """
191 | )
192 |
193 |
194 | leftcol, _, rightcol = st.columns([12, 1, 12])
195 |
196 | #
197 | # DATASET NAME
198 | #
199 | leftcol.markdown("### Dataset name")
200 | state["pretty_name"] = leftcol.text_area(
201 | "Pick a nice descriptive name for the dataset",
202 | )
203 |
204 |
205 |
206 | #
207 | # TASKS
208 | #
209 | leftcol.markdown("### Supported tasks")
210 | state["task_categories"] = multiselect(
211 | leftcol,
212 | "Task category",
213 | "What categories of task does the dataset support?",
214 | values=state["task_categories"],
215 | valid_set=list(known_task_ids.keys()),
216 | format_func=lambda tg: f"{tg}: {known_task_ids[tg]['description']}",
217 | )
218 | task_specifics = []
219 | for task_category in state["task_categories"]:
220 | specs = multiselect(
221 | leftcol,
222 | f"Specific _{task_category}_ tasks",
223 | f"What specific tasks does the dataset support?",
224 | values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[task_category]["options"]],
225 | valid_set=known_task_ids[task_category]["options"],
226 | )
227 | if "other" in specs:
228 | other_task = leftcol.text_input(
229 | "You selected 'other' task. Please enter a short hyphen-separated description for the task:",
230 | value="my-task-description",
231 | )
232 | leftcol.write(f"Registering {task_category}-other-{other_task} task")
233 | specs[specs.index("other")] = f"{task_category}-other-{other_task}"
234 | task_specifics += specs
235 | state["task_ids"] = task_specifics
236 |
237 |
238 | #
239 | # LANGUAGES
240 | #
241 | leftcol.markdown("### Languages")
242 | state["multilinguality"] = multiselect(
243 | leftcol,
244 | "Monolingual?",
245 | "Does the dataset contain more than one language?",
246 | values=state["multilinguality"],
247 | valid_set=list(known_multilingualities.keys()),
248 | format_func=lambda m: f"{m} : {known_multilingualities[m]}",
249 | )
250 |
251 | if "other" in state["multilinguality"]:
252 | other_multilinguality = leftcol.text_input(
253 | "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
254 | value="my-multilinguality",
255 | )
256 | leftcol.write(f"Registering other-{other_multilinguality} multilinguality")
257 | state["multilinguality"][state["multilinguality"].index("other")] = f"other-{other_multilinguality}"
258 |
259 | valid_values, invalid_values = list(), list()
260 | for langtag in state["languages"]:
261 | try:
262 | lc.get(langtag)
263 | valid_values.append(langtag)
264 | except:
265 | invalid_values.append(langtag)
266 | leftcol.markdown("#### Languages")
267 | if len(invalid_values) > 0:
268 | leftcol.markdown("Found the following invalid values:")
269 | leftcol.error(invalid_values)
270 |
271 | langtags = leftcol.text_area(
272 |     "What languages are represented in the dataset? Expected format is BCP-47 tags separated by ';', e.g. 'en-US;fr-FR'",
273 | value=";".join(valid_values),
274 | )
275 | state["languages"] = langtags.strip().split(";") if langtags.strip() != "" else []
276 |
277 |
278 | #
279 | # DATASET CREATORS & ORIGINS
280 | #
281 | leftcol.markdown("### Dataset creators")
282 | state["language_creators"] = multiselect(
283 | leftcol,
284 | "Data origin",
285 | "Where does the text in the dataset come from?",
286 | values=state["language_creators"],
287 | valid_set=known_creators["language"],
288 | )
289 | state["annotations_creators"] = multiselect(
290 | leftcol,
291 | "Annotations origin",
292 | "Where do the annotations in the dataset come from?",
293 | values=state["annotations_creators"],
294 | valid_set=known_creators["annotations"],
295 | )
296 |
297 |
298 | #
299 | # LICENSES
300 | #
301 | state["licenses"] = multiselect(
302 | leftcol,
303 | "Licenses",
304 | "What licenses is the dataset under?",
305 | valid_set=list(known_licenses.keys()),
306 | values=state["licenses"],
307 | format_func=lambda l: f"{l} : {known_licenses[l]}",
308 | )
309 | if "other" in state["licenses"]:
310 | other_license = st.text_input(
311 | "You selected 'other' type of license. Please enter a short hyphen-separated description:",
312 | value="my-license",
313 | )
314 | st.write(f"Registering other-{other_license} license")
315 | state["licenses"][state["licenses"].index("other")] = f"other-{other_license}"
316 |
317 |
318 | #
319 | # LINK TO SUPPORTED DATASETS
320 | #
321 | pre_select_ext_a = []
322 | if "original" in state["source_datasets"]:
323 | pre_select_ext_a += ["original"]
324 | if any([p.startswith("extended") for p in state["source_datasets"]]):
325 | pre_select_ext_a += ["extended"]
326 | state["source_datasets"] = multiselect(
327 | leftcol,
328 | "Relations to existing work",
329 | "Does the dataset contain original data and/or was it extended from other datasets?",
330 | values=pre_select_ext_a,
331 | valid_set=["original", "extended"],
332 | )
333 |
334 | if "extended" in state["source_datasets"]:
335 | pre_select_ext_b = [p.split("|")[1] for p in state["source_datasets"] if p.startswith("extended|")]
336 | extended_sources = multiselect(
337 | leftcol,
338 | "Linked datasets",
339 | "Which other datasets does this one use data from?",
340 | values=pre_select_ext_b,
341 | valid_set=dataset_ids + ["other"],
342 | )
343 | # flush placeholder
344 | state["source_datasets"].remove("extended")
345 | state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
346 |
347 |
348 | #
349 | # SIZE CATEGORY
350 | #
351 | leftcol.markdown("### Size category")
352 | logging.info(initial_infos[initial_info_cfg]["splits"] if initial_infos is not None else 0)
353 | initial_num_examples = (
354 | sum([dct.get("num_examples", 0) for _split, dct in initial_infos[initial_info_cfg].get("splits", dict()).items()])
355 | if initial_infos is not None
356 | else -1
357 | )
358 | initial_size_cats = map_num_examples_to_size_categories(initial_num_examples)
359 | leftcol.markdown(f"Size category computed from the automatically generated dataset info: `{initial_size_cats}`")
360 | current_size_cats = state.get("size_categories") or ["unknown"]
361 | ok, nonok = split_known(current_size_cats, known_size_categories)
362 | if len(nonok) > 0:
363 | leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
364 | else:
365 | state["size_categories"] = [initial_size_cats]
366 |
367 |
368 | ########################
369 | ## Show results
370 | ########################
371 |
372 | rightcol.markdown(
373 | f"""
374 | ### Finalized tag set
375 |
376 | """
377 | )
378 | if is_state_empty(state):
379 | rightcol.markdown("❌ This is an invalid tagset: it's empty!")
380 | else:
381 | validate_dict(rightcol, state)
382 |
383 |
384 | rightcol.markdown(
385 | f"""
386 |
387 | ```yaml
388 | {yaml.dump(state)}
389 | ```
390 | ---
391 | #### Arbitrary yaml validator
392 |
393 | This is a standalone tool: use it to check an existing tagset for errors, or to edit the YAML directly rather than going through the UI on the left.
394 | """,
395 | )
396 |
397 | yamlblock = rightcol.text_area("Input your yaml here")
398 | if yamlblock.strip() != "":
399 | inputdict = yaml.safe_load(yamlblock)
400 | validate_dict(rightcol, inputdict)
401 |
--------------------------------------------------------------------------------
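For reference, the validation used throughout the app is `DatasetMetadata` from `datasets.utils.metadata` (pinned to `datasets==1.9.0` in `requirements.txt`). Here is a standalone sketch mirroring the check that the "Arbitrary yaml validator" box performs; the tag values below are examples, not a real dataset's tags.

```python
import yaml
from datasets.utils.metadata import DatasetMetadata

# An example tag set; values are illustrative, not taken from a real dataset card.
yamlblock = """
annotations_creators:
- crowdsourced
language_creators:
- found
languages:
- en
licenses:
- mit
multilinguality:
- monolingual
size_categories:
- 10K<n<100K
source_datasets:
- original
task_categories:
- text-classification
task_ids:
- sentiment-classification
"""

tags = yaml.safe_load(yamlblock)
try:
    DatasetMetadata(**tags)  # the same call validate_dict() makes
    print("✅ valid tag set")
except Exception as e:  # DatasetMetadata raises on unknown or malformed tags
    print("❌ invalid tag set:", e)
```
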