├── catalog ├── .gitkeep ├── nubis-ocr │ └── nubis-ocr.yml ├── e-editiones │ └── ocr17plus.yml ├── episearch │ └── episeach-htr.yml ├── fondue │ ├── FONDUE-FR-PRINT-16.yml │ ├── fondue-kunsthistorisches-uzh-archivdatenbank.yml │ ├── FONDUE-MLT-CAT.yml │ ├── fondue-gasparosarditoponomasia-dataset.yml │ ├── fondue-spanish-chapbooks-19th-c-dataset.yml │ ├── FONDUE-IT-PRINT-20.yml │ ├── FONDUE-ES-PRINT-19.yml │ ├── FONDUE-FR-PRINT-20.yml │ ├── FONDUE-EN-PRINT-20.yml │ └── FONDUE-FR-MSS-18.yml ├── genauto │ └── genauto-td-htr.yml ├── koenigsfelden │ └── kf-htr.yml ├── araucania │ └── araucania.yml ├── shakespeare-scott-translations │ └── ocr-data.yml ├── greek-data │ ├── d-scribe-zenon.yml │ ├── stavronikita-114.yml │ ├── stavronikita-53.yml │ ├── stavronikita-79.yml │ ├── hpgtr.yml │ └── eparchos.yml ├── from-manuscript-to-print-a-matter-of-bankability │ └── antoine-verard-extracts.yml ├── editer-la-correspondance-de-constance-de-salm-1767-1845 │ └── editer-la-correspondance-de-constance-de-salm-1767-1845.yml ├── ciham-htr │ ├── dataset-for-late-medieval-castilian-text-recognition.yml │ └── fabliaux.yml ├── enc-cours-git │ ├── hn-kovalewsky.yml │ ├── tnah-expouniv.yml │ ├── tnah-decameronfr.yml │ ├── tnah-notredame.yml │ ├── hn-chavigny.yml │ ├── hn-poesie-corse.yml │ └── hn-boccace.yml ├── transcriboquest2024-literary-medieval │ └── transcriboquest-2024-medieval-literary.yml ├── tubingen-library │ └── southasia-malayalam.yml ├── ocr-d │ └── ocr-d_gt_structure_text.yml ├── eutyches-grammaticus-glossed │ └── eutyches.yml ├── incunables-sevillans-1494-1500 │ └── incunables-sevillans-1494-1500.yml ├── distinguo │ └── distinguo-GT-metadata.yml ├── inha │ ├── LesPapiersBarye.yml │ └── LettresDeJacquesDoucetAReneJean1908-1929.yml ├── ajmc │ └── ajmc-layout.yml ├── naval-kishore │ └── naval-kishore.yml ├── burchards-dekret-digital │ └── bdd-segmentation-data.yml ├── tarima │ └── tarima.yml ├── bullinger │ └── gwalther-htr.yml ├── transcriboquest-2025 │ └── 
transcriboquest-2025-medieval-latin.yml ├── banq │ └── copiste-d-un-jour.yml ├── stabs-urfehdebuch │ └── urfehdebuch-htr.yml ├── impresso │ └── nzz-ocr.yml ├── alix-tz │ ├── peraire-ground-truth.yml │ └── moonshines.yml ├── TranscriboQuest_Arabic │ └── htr-united.yml ├── ifloral │ └── ifloral-dataset.yml ├── cremma │ ├── mss-20.yml │ ├── mss-16.yml │ ├── mss-19.yml │ ├── mss-18.yml │ └── mss-17.yml ├── teklia │ └── belfort.yml ├── incunabula-reichenau │ └── incunabula-reichenau.yml ├── htr-school-vienna │ ├── wien-onb-cod-2160-f-164-184-ground-truth-from-htr-winter-school-2022.yml │ ├── htr-winter-school-2024-medieval-czech-prague-bible-1488.yml │ └── paderov-bible-handwriting-ground-truth.yml ├── almanach │ ├── dahn.yml │ ├── lectaurep-bronod.yml │ ├── tapuscorpus.yml │ └── lectaurep-notaires.yml ├── hismodoc-htr │ └── titres-nobiliaires-17-18-siecles-dataset.yml ├── antwerp_bias-in-history │ └── arletta.yml ├── joseph-hooker-correspondance-project │ └── joseph-hooker-htr.yml ├── scripta-psl │ └── biblia.yml ├── gallicorpora │ ├── gothic-16.yml │ ├── incunable-15.yml │ ├── mss-15.yml │ └── print-16.yml ├── LiDi │ └── LiDi1-0-project.yml ├── popp │ └── the-popp-datasets.yml ├── meleagre │ └── meleagre.yml ├── rasam-2 │ └── rasam.yml ├── front-justice │ └── front-justice-htr.yml ├── bsc-cssh │ └── AMSMB-HTR.yml ├── rasam-1 │ └── rasam.yml ├── sloane_lab │ └── sloane_lab_htr_model.yml ├── slub-dresden │ ├── mscr-dresd-k-117.yml │ └── mscr-dresd-k-113.yml ├── bullinger-htr-dataset │ └── bullinger-htr-dataset.yml ├── chi-know-po │ └── chi-know-po.yml ├── rescribe │ └── caroline-minuscule.yml ├── pbp │ └── pbp.yml ├── parisTimeMachine │ └── addresses-et-annuaires.yml ├── ground-truth-set-for-handwritten-text-recognition-htr-ocr-dresdner-hofdiarium-1665-mscrdresdk80-17th-century-kurrent-manuscript │ └── ground-truth-set-for-handwritten-text-recognition-htr-ocr-dresdner-hofdiarium-1665-mscrdresdk80-17th-century-kurrent-manuscript.yml └── htromance │ └── ita.yml ├── 
id-db.json ├── graph.png ├── CITATION.CFF ├── .github ├── workflows │ ├── HTRUC.yaml │ └── Catalog.yaml └── ISSUE_TEMPLATE │ ├── add-a-new-dataset-description.md │ └── ajouter-la-description-d-un-nouveau-jeu-de-donn-es.md ├── spid.py └── catalog-ids.json /catalog/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /id-db.json: -------------------------------------------------------------------------------- 1 | {"values":{},"ids":{}} -------------------------------------------------------------------------------- /graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HTR-United/htr-united/HEAD/graph.png -------------------------------------------------------------------------------- /CITATION.CFF: -------------------------------------------------------------------------------- 1 | cff-version: 1.1.0 2 | message: "If you use this software, please cite it as below." 
# Runs `htruc test` against the dataset description files stored under
# ./catalog on every push and pull request.
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: HTRUC

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
    - name: Set up Python 3.8
      uses: actions/setup-python@v5
      with:
        # Quoted so YAML keeps the version as a string (an unquoted 3.10
        # would be parsed as the number 3.1).
        python-version: "3.8"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install htruc
    - name: Run HTRUC
      run: |
        # NOTE(review): bash does not enable `globstar` by default, so `**`
        # expands like `*` here and only matches files exactly one directory
        # below ./catalog, which is the current catalog layout.
        htruc test ./catalog/**/*.y*ml
# This script maintains a list of (semi-)persistent identifiers (PIDs) derived
# from the URLs of the datasets listed in the catalog.
#
# It reads `catalog.json` and `id-db.json` from the current working directory,
# gives every catalog record a `_pid`, then rewrites both files: `id-db.json`
# with the updated URL <-> PID mappings and `catalog.json` re-keyed by PID.

import json
import hashlib

CATALOG_PATH = "catalog.json"
ID_DB_PATH = "id-db.json"


def get_hash(string):
    """Return the first 9 hexadecimal characters of the SHA-256 digest of `string`.

    :param string: Text to hash (encoded with the default UTF-8 codec).
    :returns: A 9-character lowercase hexadecimal string.
    """
    sha = hashlib.sha256(string.encode())
    return sha.hexdigest()[:9]


def by_url(item):
    """Sort key for ``(key, record)`` pairs: the record's ``url`` value."""
    return item[1]["url"]


def assign_pids(data, db):
    """Give every record in `data` a ``_pid`` and register new URLs in `db`.

    Both arguments are modified in place.

    :param data: Mapping of catalog key -> record; each record must have a
        ``url`` entry.
    :param db: Mapping with two sub-mappings, ``values`` (URL -> PID) and
        ``ids`` (PID -> URL).
    :returns: `data`, for convenience.
    :raises ValueError: If a newly computed PID is already registered for a
        different URL. The PID is a truncated hash, so registering it anyway
        would overwrite the existing ``ids`` entry and later make two records
        share one key in the re-keyed catalog.
    """
    # Records are processed in URL order so that runs are deterministic.
    for _, record in sorted(data.items(), key=by_url):
        url = record["url"]
        if url not in db["values"]:
            current_id = get_hash(url)
            registered_url = db["ids"].get(current_id)
            if registered_url is not None and registered_url != url:
                raise ValueError(
                    "PID collision: {!r} is already assigned to {!r}, "
                    "cannot assign it to {!r}".format(current_id, registered_url, url)
                )
            db["values"][url] = current_id
            db["ids"][current_id] = url
            # A URL unknown to the database always gets the freshly computed
            # PID, even if the record already carried one.
            record["_pid"] = current_id
        if "_pid" not in record:
            # Known URL but the record has no PID yet: reuse the stored one.
            record["_pid"] = db["values"][url]
    return data


def rekey_by_pid(data):
    """Return a new mapping of ``_pid`` -> record, built in URL order.

    Records that share a ``_pid`` collapse into a single entry; the record
    that sorts last wins.
    """
    return {
        record["_pid"]: record
        for _, record in sorted(data.items(), key=by_url)
    }


def main():
    """Load the catalog and the ID database, assign PIDs, write both back."""
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(CATALOG_PATH, encoding="utf-8") as f:
        data = json.load(f)

    with open(ID_DB_PATH, encoding="utf-8") as f:
        db = json.load(f)

    assign_pids(data, db)

    with open(ID_DB_PATH, "w", encoding="utf-8") as f:
        json.dump(db, f, indent=2)

    with open(CATALOG_PATH, "w", encoding="utf-8") as f:
        json.dump(rekey_by_pid(data), f, indent=2)


if __name__ == "__main__":
    main()
'1709' 33 | hands: 34 | count: '1' 35 | precision: exact 36 | license: 37 | - name: CC-BY-SA 4.0 38 | url: https://creativecommons.org/licenses/by-sa/4.0/ 39 | format: Alto-XML 40 | volume: 41 | - metric: files 42 | count: 34 43 | -------------------------------------------------------------------------------- /catalog/fondue/FONDUE-FR-PRINT-16.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Gabay 3 | orcid: 0000-0001-9094-4475 4 | roles: 5 | - transcriber 6 | - project-manager 7 | - quality-control 8 | - support 9 | surname: Simon 10 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-FR-PRINT-16/blob/master/CITATION.cff 11 | description: ' Transcriptions of French 16th c. prints ' 12 | format: Alto-XML 13 | hands: 14 | count: unknown 15 | precision: exact 16 | language: 17 | - fra 18 | license: 19 | name: CC-BY 4.0 20 | url: https://creativecommons.org/licenses/by/4.0/ 21 | production-software: eScriptorium + Kraken 22 | project-name: FoNDUE 23 | project-website: https://github.com/FoNDUE-HTR/ 24 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 25 | script: 26 | - iso: Latn 27 | script-type: only-typed 28 | time: 29 | notAfter: '1600' 30 | notBefore: '1500' 31 | title: FONDUE-FR-PRINT-16 32 | transcription-guidelines: SegmOnto 33 | url: https://github.com/FoNDUE-HTR/FONDUE-FR-PRINT-16 34 | volume: 35 | - count: 504656 36 | metric: characters 37 | - count: 930 38 | metric: files 39 | - count: 17817 40 | metric: lines 41 | - count: 2829 42 | metric: regions 43 | -------------------------------------------------------------------------------- /catalog/genauto/genauto-td-htr.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: GenAuto TD Corpus 3 | url: https://github.com/jpmjpmjpm/genauto-td-htr.git 4 | project-name: GenAuto 5 | project-website: '' 6 | authors: 7 | 
- name: Boutet 8 | surname: "Jean-Fran\xE7ois" 9 | roles: 10 | - transcriber 11 | - aligner 12 | - name: Merx 13 | surname: Jean-Pierre 14 | roles: 15 | - transcriber 16 | - aligner 17 | - project-manager 18 | description: "150 transcribed images from \"Tables D\xE9cennales\" French Civil Registry.\ 19 | \ Those come from Sermaises and Romilly-sur-Seine municipalities.\n" 20 | language: 21 | - fra 22 | script: 23 | - iso: Latn 24 | script-type: only-manuscript 25 | time: 26 | notBefore: '1792' 27 | notAfter: '1902' 28 | hands: 29 | count: less-than-11 30 | precision: estimated 31 | license: 32 | - name: CC-BY 4.0 33 | url: https://creativecommons.org/licenses/by/4.0/ 34 | format: Alto-XML 35 | volume: 36 | - count: 300 37 | metric: pages 38 | - count: 150 39 | metric: images 40 | - count: 150 41 | metric: files 42 | - count: 186366 43 | metric: characters 44 | - count: 21557 45 | metric: lines 46 | - count: 608 47 | metric: regions 48 | production-software: "eScriptorium + Kraken" 49 | -------------------------------------------------------------------------------- /catalog/koenigsfelden/kf-htr.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: "Charters and Records of K\xF6nigsfelden Abbey and Bailiwick (1308-1662)" 3 | url: https://doi.org/10.5281/zenodo.5179361 4 | authors: 5 | - name: Hodel 6 | surname: Tobias 7 | roles: 8 | - transcriber 9 | - project-manager 10 | - support 11 | - name: Halter-Pernet 12 | surname: Colette 13 | roles: 14 | - transcriber 15 | - aligner 16 | - project-manager 17 | - quality-control 18 | - digitization 19 | - support 20 | - name: Teuscher 21 | surname: Simon 22 | roles: 23 | - project-manager 24 | description: "The data set is the publication of the data of the scholarly edition\ 25 | \ \"Urkunden und Akten des Klosters und der Hofmeisterei K\xF6nigsfelden\"." 
26 | project-website: https://www.koenigsfelden.uzh.ch/ 27 | language: 28 | - lat 29 | - deu 30 | script: 31 | - iso: Latn 32 | script-type: only-manuscript 33 | time: 34 | notBefore: '1292' 35 | notAfter: '1570' 36 | hands: 37 | count: more-than-10 38 | precision: estimated 39 | license: 40 | - name: CC-BY 4.0 41 | url: https://creativecommons.org/licenses/by/4.0/ 42 | format: Page-XML 43 | volume: 44 | - metric: lines 45 | count: 60000 46 | transcription-guidelines: 'See: https://www.koenigsfelden.uzh.ch/exist/apps/ssrq/intro.html#richtlinien' 47 | production-software: "Transkribus" 48 | -------------------------------------------------------------------------------- /catalog/araucania/araucania.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: HTR - Araucania manuscript XIX 3 | url: https://github.com/Proyecto-Ocupacion-Araucania-UChile/HTR_Araucania_XIX 4 | authors: 5 | - name: Humeau 6 | surname: Maxime 7 | - name: Chiaretti 8 | surname: Alessandro 9 | institutions: 10 | - name: Archivo Central Andres Bello 11 | description: >- 12 | Ground Truth dataset for Spanish 19th typewritten OCR. 13 | 14 | The archives come from the events of the Occupation of Araucania (1850-1881) 15 | in Chile. They are archived in the ’Colección manuscritos' of the Archivo 16 | Central Andres Bello - Universidad de Chile. 
17 | language: 18 | - spa 19 | production-software: eScriptorium + Kraken 20 | script: 21 | - iso: Latn 22 | script-type: mainly-manuscript 23 | time: 24 | notBefore: '1859' 25 | notAfter: '1877' 26 | hands: 27 | count: more-than-10 28 | precision: estimated 29 | license: 30 | - name: CC-BY-SA 4.0 31 | url: https://creativecommons.org/licenses/by-sa/4.0/ 32 | format: Alto-XML 33 | volume: 34 | - metric: files 35 | count: 180 36 | - metric: lines 37 | count: 3932 38 | - metric: regions 39 | count: 981 40 | - metric: characters 41 | count: 117155 42 | transcription-guidelines: | 43 | - xxx for erased or unreadable characters 44 | - ^+letters for superscript letters 45 | - ⁋ for new paragraph 46 | -------------------------------------------------------------------------------- /catalog/shakespeare-scott-translations/ocr-data.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Shakespeare-Scott translations 3 | url: https://github.com/millawell/ocr-data 4 | project-name: 'Publishing an OCR ground truth data set for reuse in an unclear copyright 5 | setting'' 6 | 7 | ' 8 | project-website: https://github.com/millawell/ocr-data 9 | authors: 10 | - name: Lassner 11 | surname: David 12 | - name: Coburger 13 | surname: Julius 14 | - name: Neudecker 15 | surname: Clemens 16 | - name: Baillot 17 | surname: Anne 18 | description: "Ground truth data in German and English of Shakespeare and Scott prints\ 19 | \ in original and different translations. 
\n" 20 | language: 21 | - eng 22 | - deu 23 | script: 24 | - iso: Latn 25 | - iso: Latf 26 | script-type: only-typed 27 | time: 28 | notBefore: '1815' 29 | notAfter: '1852' 30 | hands: 31 | count: unknown 32 | precision: exact 33 | license: 34 | - name: CC-BY 4.0 35 | url: https://creativecommons.org/licenses/by/4.0/ 36 | format: Alto-XML 37 | volume: 38 | - metric: lines 39 | count: 5354 40 | - metric: files 41 | count: 131 42 | - metric: regions 43 | count: 131 44 | - metric: characters 45 | count: 192264 46 | sources: 47 | - reference: '' 48 | link: https://zfdg.de/sb005_006 49 | citation-file-link: https://github.com/millawell/ocr-data/blob/master/citation.cff 50 | production-software: "eScriptorium + Kraken" 51 | -------------------------------------------------------------------------------- /catalog/greek-data/d-scribe-zenon.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: Ground-Truthed Data Set of Zenon Papyri for Handwritten Text Recognition 3 | url: https://zenodo.org/records/6565706 4 | authors: 5 | - name: Isabelle 6 | surname: Marthot-Santaniello 7 | orcid: 0000-0003-0407-8748 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - name: Hodel 12 | surname: Tobias 13 | orcid: 0000-0002-2071-6407 14 | roles: 15 | - transcriber 16 | - project-manager 17 | institutions: [] 18 | description: >- 19 | Diplomatic transcription of papyri found in the Zenon archive [see 20 | en.wikipedia.org/wiki/Zenon_of_Kaunos] 21 | 22 | 23 | Manually prepared as PageXML with Transkribus within D-Scribes project. 
24 | project-name: D-Scribes 25 | project-website: https://d-scribes.philhist.unibas.ch/en/ 26 | language: 27 | - grc 28 | production-software: Transkribus 29 | automatically-aligned: false 30 | characters: 31 | mode: NFD 32 | script: 33 | - iso: Grek 34 | script-type: only-manuscript 35 | time: 36 | notBefore: '-250' 37 | notAfter: '-230' 38 | hands: 39 | count: unknown 40 | precision: estimated 41 | license: 42 | name: CC-BY 4.0 43 | url: https://creativecommons.org/licenses/by/4.0/ 44 | format: Page-XML 45 | volume: 46 | - metric: lines 47 | count: 321 48 | - metric: characters 49 | count: 5850 50 | - metric: files 51 | count: 27 -------------------------------------------------------------------------------- /catalog/from-manuscript-to-print-a-matter-of-bankability/antoine-verard-extracts.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: Antoine Verard extracts 3 | url: https://github.com/LaurieHoeben/Verard-corpus 4 | authors: 5 | - name: Laurie 6 | surname: Hoeben 7 | roles: 8 | - transcriber 9 | - aligner 10 | institutions: [] 11 | description: >- 12 | Parts of Antoine Vérard’s editions princeps of "Tristan", "Merlin" and "Gyron 13 | le Courtoys". 14 | project-name: 'From Manuscript to Print: a Matter of Bankability?' 
15 | project-website: https://www.universityofgalway.ie/rebpaf/ 16 | language: 17 | - frm 18 | production-software: 'eScriptorium ' 19 | automatically-aligned: false 20 | script: 21 | - iso: Latn 22 | script-type: mainly-typed 23 | time: 24 | notBefore: '1489' 25 | notAfter: '1503' 26 | hands: 27 | count: 1-per-folder 28 | precision: exact 29 | license: 30 | name: Etalab OL 2.0 31 | url: https://spdx.org/licenses/etalab-2.0.html 32 | format: Page-XML 33 | sources: 34 | - reference: '' 35 | link: https://catalogue.bnf.fr/ark:/12148/cb33631875s 36 | - reference: '' 37 | link: https://catalogue.bnf.fr/ark:/12148/cb39334880d 38 | - reference: '' 39 | link: https://catalogue.bnf.fr/ark:/12148/cb334128727 40 | volume: 41 | - metric: lines 42 | count: 4710 43 | transcription-guidelines: >- 44 | Ariane Pinche. Guide de transcription pour les manuscrits du Xe au XVe siècle. 45 | 2022. hal-03697382f 46 | -------------------------------------------------------------------------------- /catalog/editer-la-correspondance-de-constance-de-salm-1767-1845/editer-la-correspondance-de-constance-de-salm-1767-1845.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Éditer la correspondance de Constance de Salm (1767-1845) 3 | url: https://github.com/sbiay/CdS-edition/tree/main/htr/verite-terrain 4 | authors: 5 | - name: Biay 6 | surname: Sébastien 7 | roles: 8 | - transcriber 9 | institutions: [] 10 | description: >- 11 | La correspondance de Constance de Salm (femme de lettres française) comprend 12 | différents spécimens d’écriture du début du XIXe siècle. Le jeu de données 13 | atteste les mains de quatre copistes différents. 
14 | project-website: https://dhiha.hypotheses.org/2945 15 | language: 16 | - fra 17 | production-software: eScriptorium + Kraken 18 | script: 19 | - iso: Latn 20 | script-type: only-manuscript 21 | time: 22 | notBefore: '1800' 23 | notAfter: '1825' 24 | hands: 25 | count: less-than-11 26 | precision: estimated 27 | license: 28 | - name: CC-BY 4.0 29 | url: https://creativecommons.org/licenses/by/4.0/ 30 | format: Alto-XML 31 | sources: 32 | - reference: >- 33 | Salm, C. de (1767-1845). Correspondance. Société des Amis du Vieux Toulon 34 | et de sa Région, Fonds Salm. Archiv Schloss Dyck, fonds Constance de Salm. 35 | link: '' 36 | volume: 37 | - metric: lines 38 | count: 1754 39 | transcription-guidelines: >- 40 | Usages scribaux respectés : abréviations, fautes, accentuation respectés. 41 | Allographes normalisés (s long). 42 | -------------------------------------------------------------------------------- /catalog/ciham-htr/dataset-for-late-medieval-castilian-text-recognition.yml: -------------------------------------------------------------------------------- 1 | 2 | 3 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 4 | title: 'Dataset for late medieval Castilian text recognition ' 5 | url: https://doi.org/10.5281/zenodo.7386489 6 | authors: 7 | - name: Gille Levenson 8 | surname: Matthias 9 | orcid: 0000-0001-9488-5986 10 | roles: 11 | - transcriber 12 | - quality-control 13 | institutions: [] 14 | description: >- 15 | HTR/OCR open access gold corpus for spanish late medieval sources, based 16 | 17 | on the allographetic transcription of more than 300 pages of several 18 | manuscripts of the Regimiento de los 19 | Prínçipes, as well as a first set of general transcription models trained with 20 | kraken and out-of-domain test data. See https://doi.org/10.5281/zenodo.7387376 for full description of the dataset. 
21 | language: 22 | - spa 23 | production-software: eScriptorium + Kraken 24 | script: 25 | - iso: Latn 26 | script-type: mainly-manuscript 27 | time: 28 | notBefore: '1300' 29 | notAfter: '1500' 30 | hands: 31 | count: more-than-10 32 | precision: estimated 33 | license: 34 | - name: CC-BY-SA 4.0 35 | url: https://creativecommons.org/licenses/by-sa/4.0/ 36 | format: Alto-XML 37 | volume: 38 | - metric: lines 39 | count: 28000 40 | transcription-guidelines: >- 41 | Allographetic transcription. See the article 42 | (https://doi.org/10.5281/zenodo.7387376) for full transcription guidelines. 43 | 44 | 320 pages in-domain; 40 pages out-of-domain 45 | 46 | -------------------------------------------------------------------------------- /catalog/enc-cours-git/hn-kovalewsky.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: "Maxime Kovalewsky - Coutume contemporaine et loi ancienne: droit coutumier\ 3 | \ oss\xE9tien" 4 | url: https://github.com/PSL-Chartes-HTR-Students/HN2021-Kovalewsky-1893 5 | project-name: 'ENC - Bonnes pratiques du developpement collaboratif 6 | 7 | ' 8 | authors: 9 | - name: "L\u2019Eveque" 10 | surname: "Zo\xE9" 11 | roles: 12 | - transcriber 13 | - name: Ekaterina 14 | surname: Kate 15 | roles: 16 | - transcriber 17 | - name: Kasparian 18 | surname: Anahide 19 | roles: 20 | - transcriber 21 | description: "Nous avons choisi de transcrire le deuxi\xE8me chapitre de l\u2019ouvrage\ 22 | \ de Maxime Kovalewsky : Coutume contemporaine et loi ancienne : droit coutumier\ 23 | \ oss\xE9tien, \xE9clair\xE9 par l\u2019histoire compar\xE9e. Paris, L. Larose,\ 24 | \ 1893. 
\n" 25 | language: 26 | - fra 27 | script: 28 | - iso: Latn 29 | script-type: only-typed 30 | time: 31 | notBefore: '1893' 32 | notAfter: '1893' 33 | hands: 34 | count: '1' 35 | precision: exact 36 | license: 37 | - name: CC-BY 4.0 38 | url: https://creativecommons.org/licenses/by/4.0/ 39 | format: Alto-XML 40 | citation-file-link: https://github.com/PSL-Chartes-HTR-Students/HN2021-Kovalewsky-1893/main/CITATION.CFF 41 | volume: 42 | - metric: characters 43 | count: 45626 44 | - metric: files 45 | count: 28 46 | - metric: lines 47 | count: 983 48 | - metric: regions 49 | count: 72 50 | production-software: "eScriptorium + Kraken" 51 | -------------------------------------------------------------------------------- /catalog/transcriboquest2024-literary-medieval/transcriboquest-2024-medieval-literary.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: TranscriboQuest 2024 Medieval Literary 3 | url: 10.5281/zenodo.13757440 4 | authors: 5 | - name: Jessie 6 | surname: Dummer 7 | - name: Emmanuelle 8 | surname: Kuhry 9 | - name: Zdzislaw 10 | surname: Koczarski 11 | - name: Sylvain 12 | surname: Besson 13 | - name: Caroline 14 | surname: Chevalier-Royet 15 | orcid: 0000-0002-7574-6742 16 | - name: Caroline 17 | surname: Vandyck 18 | roles: 19 | - project-manager 20 | institutions: [] 21 | description: >- 22 | This dataset was created in the context of TranscriboQuest 2024 (Medieval 23 | Literary Team) held in Lyon (11/09/2024-13/09/2024). We opted to focus on 24 | medieval scientific documents that are damaged, in several different 25 | languages. The result is 808 lines transcribed by experts in the field. The 26 | dataset contains the images of the manuscripts and ALTO-XMLs. 
27 | language: 28 | - lat 29 | - dum 30 | - fro 31 | - gmh 32 | production-software: eScriptorium + Kraken 33 | automatically-aligned: false 34 | script: 35 | - iso: Latn 36 | script-type: only-manuscript 37 | time: 38 | notBefore: '800' 39 | notAfter: '1500' 40 | hands: 41 | count: 1-per-folder 42 | precision: exact 43 | license: 44 | name: CC-BY 4.0 45 | url: https://creativecommons.org/licenses/by/4.0/ 46 | format: Alto-XML 47 | volume: 48 | - metric: lines 49 | count: 800 50 | transcription-guidelines: CATMuS Guidelines (https://catmus-guidelines.github.io) 51 | -------------------------------------------------------------------------------- /catalog/tubingen-library/southasia-malayalam.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: Ground Truth data for printed Malayalam 3 | url: https://doi.org/10.11588/data/L2KRZO 4 | authors: [] 5 | institutions: 6 | - name: Tübingen University Library 7 | roles: 8 | - project-manager 9 | description: >- 10 | Ground Truth (GT) data (JPG and ALTO XML files) which can be used to train OCR 11 | models that recognize printed text in Malayalam script. The training material 12 | is gathered from 19th and 20th centuries prints. 13 | 14 | 15 | The GT data was trained in Transkribus with the HTR+ and the PyLaia engine 16 | with a resulting CER of 2.29% on validation set with HTR+ and 3,20% with 17 | PyLaia. The training was performed on 43 pages with appr. 9,000 words. The 18 | validation set consisted of 5 pages (ca. 1,000 words). 19 | 20 | 21 | Transcription was performed by Tübingen University Library, the Ground Truth 22 | data was created by Elena Mucciarelli (University of Groningen) with support 23 | and model training by Dorothee Huff (Tübingen University Library). 
# Rebuilds the catalog outputs with `htruc make`, runs spid.py to assign
# identifiers, commits the generated files and then runs the release action.
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: HTRUC Catalog
on:
  push:
    branches:
      - master
  workflow_dispatch: # Allows for manual triggering
  schedule:
    - cron: "0 23 * * 0"
jobs:
  catalog:
    runs-on: ubuntu-latest
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
    - uses: actions/checkout@v4
    # The step name now matches the interpreter that is actually installed.
    - name: Set up Python 3.9
      uses: actions/setup-python@v5
      with:
        python-version: "3.9"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install htruc
    - name: Run HTRUC
      run: |
        htruc make ./catalog --access_token ${{ secrets.GITHUB_TOKEN }} --graph-csv data.csv --statistics statistics.csv --output htr-united.yml --graph graph.png --json catalog.json --ids catalog-ids.json --check-link --no-remote
    - name: Commit files
      run: |
        git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
        git config --local user.name "github-actions[bot]"
        python3 spid.py
        # spid.py rewrites id-db.json as well as catalog.json; stage it too so
        # the identifier database persists between runs.
        git add htr-united.yml graph.png statistics.csv catalog.json id-db.json
        git commit -m "[Automatic] Update of the Catalog" || echo "Nothing to commit"
        git push || echo "Nothing to push"
    - uses: rymndhng/release-on-push-action@master
      with:
        bump_version_scheme: patch
        use_github_release_notes: true
27 | project-name: FoNDUE 28 | project-website: https://www.unige.ch/lettres/humanites-numeriques/recherche/projets-de-la-chaire/fondue 29 | language: 30 | - deu 31 | - fra 32 | - ita 33 | production-software: eScriptorium + Kraken 34 | script: 35 | - iso: Latn 36 | script-type: evenly-mixed 37 | time: 38 | notBefore: '1900' 39 | notAfter: '1999' 40 | hands: 41 | count: more-than-10 42 | precision: estimated 43 | license: 44 | - name: CC-BY 4.0 45 | url: https://creativecommons.org/licenses/by/4.0/ 46 | format: Alto-XML 47 | volume: 48 | - metric: pages 49 | count: 1100 50 | citation-file-link: >- 51 | https://github.com/FoNDUE-HTR/FoNDUE_Kunsthistorisches-UZH_Archivdatenbank/blob/main/CITATION.cff 52 | transcription-guidelines: "The transcription is strictly diplomatic: no abbreviations are resolved. \LItems that are crossed out or struck through will be transcribed with a \"€\"." 53 | -------------------------------------------------------------------------------- /catalog/fondue/FONDUE-MLT-CAT.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Pradier 3 | orcid: 0000-0002-3476-7248 4 | roles: 5 | - transcriber 6 | surname: Frédérine 7 | - name: Gabay 8 | orcid: 0000-0001-9094-4475 9 | roles: 10 | - transcriber 11 | - project-manager 12 | - quality-control 13 | - support 14 | surname: Simon 15 | - name: Kervegan 16 | orcid: 0000-0003-2821-8821 17 | roles: 18 | - transcriber 19 | surname: Paul 20 | - name: Janès 21 | orcid: 0000-0002-8971-6173 22 | roles: 23 | - transcriber 24 | surname: Juliette 25 | - name: Sánchez Oeconomo 26 | orcid: 0000-0002-8591-5394 27 | roles: 28 | - transcriber 29 | surname: Esteban 30 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-MLT-CAT/blob/main/CITATION.cff 31 | description: 'Groundtruth for 19th/20th sale/exhibition catalogues, mainly printed 32 | in France but not only.' 
33 | transcription-guidelines: 'Segmentation includes an extra zone `CustomeZone: entry`' 34 | format: Alto-XML 35 | hands: 36 | count: unknown 37 | precision: exact 38 | institutions: [] 39 | language: 40 | - por 41 | - fra 42 | - ita 43 | license: 44 | - name: CC-BY 4.0 45 | url: https://creativecommons.org/licenses/by/4.0/ 46 | production-software: eScriptorium + Kraken 47 | project-name: FoNDUE 48 | project-website: https://github.com/FoNDUE-HTR 49 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 50 | script: 51 | - iso: Latn 52 | script-type: only-typed 53 | time: 54 | notAfter: '1972' 55 | notBefore: '1818' 56 | title: FONDUE-MLT-CAT 57 | url: https://github.com/FoNDUE-HTR/FONDUE-MLT-CAT 58 | volume: 59 | - count: 1285120 60 | metric: characters 61 | - count: 1381 62 | metric: files 63 | - count: 43114 64 | metric: lines 65 | - count: 10713 66 | metric: regions 67 | -------------------------------------------------------------------------------- /catalog/ocr-d/ocr-d_gt_structure_text.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: gt_structure_text 3 | url: https://github.com/OCR-D/gt_structure_text 4 | authors: 5 | - name: Matthias 6 | surname: Boenig 7 | orcid: 0000-0003-4615-4753 8 | roles: 9 | - transcriber 10 | - aligner 11 | - project-manager 12 | - quality-control 13 | - digitization 14 | - support 15 | institutions: [] 16 | description: >- 17 | The OCR-D Ground Truth text and structure corpus was created between 18 | 2015 and 2017. In the years since 2017, this corpus has been further curated and 19 | supplemented with metadata where appropriate. The corpus includes PAGE XML 20 | files with annotations of the text and structure. The data is based 21 | on transcription data stored in the German Text Archive (DTA) 22 | (https://www.deutschestextarchiv.de/). 
23 | project-name: OCR-D 24 | project-website: https://ocr-d.de/ 25 | language: 26 | - eng 27 | - fra 28 | - deu 29 | - heb 30 | - lat 31 | production-software: Aletheia 32 | automatically-aligned: false 33 | script: 34 | - iso: Latn 35 | - iso: Latf 36 | script-type: only-typed 37 | time: 38 | notAfter: '1900' 39 | notBefore: '1500' 40 | hands: 41 | count: less-than-11 42 | precision: exact 43 | license: 44 | name: CC-BY-SA 4.0 45 | url: https://creativecommons.org/licenses/by-sa/4.0/ 46 | format: Page-XML 47 | volume: 48 | - count: 640976 49 | metric: characters 50 | - count: 217 51 | metric: files 52 | - count: 6608 53 | metric: lines 54 | - count: 1647 55 | metric: regions 56 | citation-file-link: https://raw.githubusercontent.com/OCR-D/gt_structure_text/main/CITATION.cff 57 | transcription-guidelines: OCR-D Ground Truth Guidelines https://ocr-d.de/en/gt-guidelines/trans/ 58 | -------------------------------------------------------------------------------- /catalog/eutyches-grammaticus-glossed/eutyches.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Eutyches 3 | url: https://github.com/malamatenia/Eutyches 4 | authors: 5 | - name: Vlachou Efstathiou 6 | surname: Malamatenia 7 | roles: 8 | - transcriber 9 | - aligner 10 | - project-manager 11 | institutions: [] 12 | description: >- 13 | Ground truth for minuscule caroline of the late 9th century from the 14 | grammatical work "de uerbo" of Eutychès. 
15 | project-name: Eutyches grammaticus glossed 16 | language: 17 | - lat 18 | - grc 19 | production-software: eScriptorium + Kraken 20 | script: 21 | - iso: Latn 22 | qualify: Minuscule Caroline 23 | script-type: only-manuscript 24 | time: 25 | notBefore: '850' 26 | notAfter: '900' 27 | hands: 28 | count: less-than-11 29 | precision: estimated 30 | license: 31 | - name: CC-BY 4.0 32 | url: https://creativecommons.org/licenses/by/4.0/ 33 | format: Alto-XML 34 | sources: 35 | - reference: Codices Vossiani Latini, Brill, VLO41 36 | link: >- 37 | https://primarysources.brillonline.com/browse/vossiani-latini/vlo-041-eutyches-grammaticalia-isidorus-alphabeta 38 | volume: 39 | - metric: pages 40 | count: 65 41 | citation-file-link: https://github.com/malamatenia/Eutyches/blob/main/CITATION.cff 42 | transcription-guidelines: >- 43 | Graphematic transcription, following the guidelines of CREMMA-medieval. 44 | Spacing has been reestablished when dealing with semicontinua, s for long s, 45 | loyal to the manuscript for capital letters, abbreviations preserved, 46 | punctuation reduced to ";" and ".". The few Greek passages have also been 47 | preserved, and some of the essais de plume as well (when forming full 48 | words). Annotation of the layout made with the SegmOnto controlled vocabulary. 
49 | -------------------------------------------------------------------------------- /catalog/incunables-sevillans-1494-1500/incunables-sevillans-1494-1500.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Jeu de données OCR - Incunables sévillans 1494-1500 3 | url: https://doi.org/10.5281/zenodo.3643393 4 | authors: 5 | - name: Gille Levenson 6 | surname: Matthias 7 | orcid: 0000-0001-9488-5986 8 | roles: 9 | - transcriber 10 | - aligner 11 | - project-manager 12 | institutions: [] 13 | description: >- 14 | The data set corresponds to 60 pages printed in 1494 by Estanislao Polono and Meinardo Ungut in Seville. These pages are taken from the Regimiento de los Prínçipes (also known as 'Glosa castellana al Regimiento de prínçipes'), and the exemplar used is the 15 | INC/901 of the Biblioteca Nacional de España. The type used for this incunabulum is 97G (Martín Abad and Moyano Andrés, Estanislao Polono, 2002, p. 61). This type was used between 1494 and 1500. For other incunabula produced in this period, see op. cit, p.112-121. 16 | language: 17 | - spa 18 | production-software: eScriptorium + Kraken 19 | script: 20 | - iso: Latn 21 | script-type: only-typed 22 | time: 23 | notBefore: '1494' 24 | notAfter: '1500' 25 | hands: 26 | count: '1' 27 | precision: exact 28 | license: 29 | - name: CC-BY 4.0 30 | url: https://creativecommons.org/licenses/by/4.0/ 31 | format: Alto-XML 32 | sources: 33 | - reference: >- 34 | Matthias Gille Levenson. (2022). Jeu de données de segmentation et de reconnaissance optique de caractères - Kraken - Incunables sévillans 1494-1500 (Version v5) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.7006981 35 | link: '' 36 | volume: 37 | - metric: lines 38 | count: 4836 39 | transcription-guidelines: >- 40 | Transcription diplomatique, sans normalisation, sans résolution d'abréviations 41 | ni corrections. 
42 | -------------------------------------------------------------------------------- /catalog/enc-cours-git/tnah-expouniv.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Projet Exposition universelle de 1878 3 | url: https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-Expositions_Universelles 4 | project-name: 'ENC - Bonnes pratiques du developpement collaboratif 5 | 6 | ' 7 | authors: 8 | - name: Christensen 9 | surname: Kelly 10 | roles: 11 | - transcriber 12 | - name: Davoury 13 | surname: Baudoin 14 | roles: 15 | - transcriber 16 | - name: Anahi 17 | surname: Haedo 18 | roles: 19 | - transcriber 20 | - name: Kervegan 21 | surname: Paul 22 | roles: 23 | - transcriber 24 | - name: Sanchez-Oeconomo 25 | surname: Esteban 26 | roles: 27 | - transcriber 28 | description: "Le Congr\xE8s international des sciences ethnographiques de 1878 a eu\ 29 | \ lieu \xE0 l\u2019occasion de l'Exposition universelle de 1878, \xE0 Paris. \xC9\ 30 | dit\xE9 en 1881 par l'Imprimerie nationale, le compte rendu de ce congr\xE8s a \xE9\ 31 | t\xE9 mis \xE0 disposition par le Conservatoire num\xE9rique des Arts et M\xE9tiers.\n" 32 | language: 33 | - fra 34 | script: 35 | - iso: Latn 36 | - iso: Grek 37 | - iso: Deva 38 | - iso: Arab 39 | script-type: only-typed 40 | time: 41 | notBefore: '1881' 42 | notAfter: '1881' 43 | hands: 44 | count: '1' 45 | precision: exact 46 | license: 47 | - name: CC-BY 4.0 48 | url: https://creativecommons.org/licenses/by/4.0/ 49 | format: Alto-XML 50 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/TNAH-2021-Expositions_Universelles/main/CITATION.cff 51 | transcription-guidelines: "Diplomatique, mais pas allograph\xE9tique." 
52 | volume: 53 | - metric: characters 54 | count: 155022 55 | - metric: files 56 | count: 56 57 | - metric: lines 58 | count: 2620 59 | - metric: regions 60 | count: 158 61 | production-software: "eScriptorium + Kraken" 62 | -------------------------------------------------------------------------------- /catalog/distinguo/distinguo-GT-metadata.yml: -------------------------------------------------------------------------------- 1 | 2 | 3 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 4 | title: >- 5 | DISTINGUO : Ground truth for Handwritten Text Recognition (HTR) on Collections 6 | of Distinctions (late 13th to late 15th century) 7 | url: https://nakala.fr/10.34847/nkl.48ad8b8d 8 | authors: 9 | - name: Svetlana 10 | surname: Yatsyk 11 | orcid: 0000-0001-5356-7746 12 | roles: 13 | - transcriber 14 | - aligner 15 | institutions: [] 16 | description: >- 17 | This dataset contains normalized transcriptions of collections of 18 | distinctions, specifically "Summa de abstinentia" by Nicolas of Biard and 19 | "Dictionarium bovis" by Thomas of Pavia. They were prepared as part of the 20 | DISTINGUO project, dedicated to the study of distinctiones in medieval Latin 21 | preaching and led by Marjorie Burghart in 2019-2024. 22 | project-website: https://distinguo.huma-num.fr/ 23 | language: 24 | - lat 25 | production-software: eScriptorium + Kraken 26 | automatically-aligned: false 27 | script: 28 | - iso: Latn 29 | script-type: only-manuscript 30 | time: 31 | notBefore: '1250' 32 | notAfter: '1499' 33 | hands: 34 | count: 1-per-folder 35 | precision: estimated 36 | license: 37 | name: CC-BY 4.0 38 | url: https://creativecommons.org/licenses/by/4.0/ 39 | format: Page-XML 40 | sources: 41 | - reference: >- 42 | Yatsyk, S. (2024). DISTINGUO : Ground truth for Handwritten Text 43 | Recognition (HTR) on Collections of Distinctions (late 13th to late 15th 44 | century) (Version 1) [Data set]. NAKALA - https://nakala.fr (Huma-Num - 45 | CNRS). 
46 | link: https://doi.org/10.34847/NKL.48AD8B8D 47 | volume: 48 | - metric: lines 49 | count: 15190 50 | - metric: characters 51 | count: 682486 52 | - metric: regions 53 | count: 1076 54 | - metric: pages 55 | count: 318 56 | 57 | -------------------------------------------------------------------------------- /catalog/inha/LesPapiersBarye.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Les Papiers Barye 3 | url: https://gitlab.inha.fr/snr/LesPapiersBarye 4 | authors: 5 | - name: Claass 6 | surname: Victor 7 | roles: 8 | - transcriber 9 | - project-manager 10 | - quality-control 11 | - name: Gain 12 | surname: Justine 13 | roles: 14 | - transcriber 15 | - quality-control 16 | - name: Martin-Vigier 17 | surname: Suzanne 18 | roles: 19 | - transcriber 20 | - quality-control 21 | institutions: 22 | - name: Institut National de l'histoire de l'art (INHA) 23 | roles: 24 | - transcriber 25 | - aligner 26 | - project-manager 27 | - quality-control 28 | - digitization 29 | description: >- 30 | Ensemble de documents autour du sculpteur Antoine-Louis Barye. Paris, 31 | Bibliothèque de l’Institut national d’histoire de l’art, collections Jacques 32 | Doucet, Archives 166. Institut National de l’Histoire de l’art (INHA) / 33 | Set of documents about the sculptor Antoine-Louis Barye. Paris, 34 | Library of the Institut national d'histoire de l'art, Jacques 35 | Doucet, Archives 166. 
National Institute of Art History (INHA) 36 | project-name: PENSE@INHA 37 | project-website: https://skylab.inha.fr/PENSE/LesPapiersBarye/ 38 | language: 39 | - fra 40 | production-software: Transkribus 41 | script: 42 | - iso: Latn 43 | script-type: mainly-manuscript 44 | time: 45 | notBefore: '1819' 46 | notAfter: '1914' 47 | hands: 48 | count: more-than-10 49 | precision: exact 50 | license: 51 | - name: Etalab OL 2.0 52 | url: https://spdx.org/licenses/etalab-2.0.html 53 | format: Alto-XML 54 | volume: 55 | - metric: characters 56 | count: 362629 57 | - metric: lines 58 | count: 17880 59 | - metric: pages 60 | count: 918 61 | - metric: files 62 | count: 918 63 | -------------------------------------------------------------------------------- /catalog/ajmc/ajmc-layout.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: 'GT4HistCommentLayout: Layout Ground Truth for Historical Commentaries' 3 | url: https://github.com/AjaxMultiCommentary/GT-commentaries-OLR 4 | authors: 5 | - name: Matteo 6 | surname: Romanello 7 | orcid: 0000-0002-7406-6286 8 | roles: 9 | - project-manager 10 | - name: Sven 11 | surname: Najem-Meyer 12 | orcid: 0000-0002-3661-4579 13 | roles: 14 | - transcriber 15 | - quality-control 16 | - name: Carla 17 | surname: Amaya 18 | roles: 19 | - transcriber 20 | description: 'This dataset contains layout annotations for ca. 370 pages sampled from 21 | 8 public domain classical commentaries, published in the 19th century in English, 22 | German and Latin. The commentaries concern Ancient Greek and Latin works from prose 23 | and poetry (caveat: AGreek poetry is slightly over-represented). Pages were annotated 24 | according to a taxonomy mapped to the SegmOnto controlled vocabulary.' 
25 | project-name: Ajax Multi-Commentary 26 | project-website: https://mromanello.github.io/ajax-multi-commentary/ 27 | language: 28 | - eng 29 | - deu 30 | - lat 31 | - grc 32 | production-software: Kraken + VGG Image Annotator (VIA) 33 | script: 34 | - iso: Latn 35 | - iso: Grek 36 | script-type: only-typed 37 | time: 38 | notBefore: '1835' 39 | notAfter: '1903' 40 | hands: 41 | count: '1' 42 | precision: exact 43 | license: 44 | - name: CC-BY 4.0 45 | url: https://creativecommons.org/licenses/by/4.0/ 46 | format: Alto-XML 47 | volume: 48 | - metric: characters 49 | count: 0 50 | - metric: files 51 | count: 371 52 | - metric: lines 53 | count: 0 54 | - metric: regions 55 | count: 2386 56 | transcription-guidelines: SegmOnto guidelines (v. 0.9) 57 | citation-file-link: https://github.com/AjaxMultiCommentary/GT-commentaries-layout/blob/master/CITATION.cff 58 | characters: 59 | mode: NFD 60 | members: [] 61 | -------------------------------------------------------------------------------- /catalog/enc-cours-git/tnah-decameronfr.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: DecameronFR 3 | url: https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-DecameronFR 4 | project-name: 'ENC - Bonnes pratiques du developpement collaboratif 5 | 6 | ' 7 | authors: 8 | - name: Biay 9 | surname: "S\xE9bastien" 10 | roles: 11 | - transcriber 12 | - name: Cappe 13 | surname: "Zo\xE9" 14 | roles: 15 | - transcriber 16 | - name: Konstantinova 17 | surname: Kristina 18 | roles: 19 | - transcriber 20 | - name: Boby 21 | surname: Victor 22 | roles: 23 | - transcriber 24 | - aligner 25 | description: "Le projet vise \xE0 la constitution de v\xE9rit\xE9s de terrain pour\ 26 | \ l\u2019entra\xEEnement de mod\xE8les HTR \xE0 partir d'un manuscrit fran\xE7ais\ 27 | \ des ann\xE9es 1430-1455 : le manuscrit 5070 de la Biblioth\xE8que de l'Arsenal\ 28 | \ (reproduit sur 
Gallica). Ce manuscrit contient la traduction fran\xE7aise du Decameron\ 29 | \ de Boccace par Laurent de Premierfait. Nos v\xE9rit\xE9s de terrain recouvrent\ 30 | \ la description de la peste \xE0 Florence situ\xE9e dans le prologue de l'ouvrage.\n" 31 | language: 32 | - frm 33 | script: 34 | - iso: Latn 35 | script-type: only-manuscript 36 | time: 37 | notBefore: '1430' 38 | notAfter: '1455' 39 | hands: 40 | count: '1' 41 | precision: exact 42 | license: 43 | - name: CC-BY 4.0 44 | url: https://creativecommons.org/licenses/by/4.0/ 45 | format: Alto-XML 46 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/TNAH-2021-DecameronFR/main/CITATION.cff 47 | transcription-guidelines: Cf. https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-DecameronFR/blob/main/normesTranscription.md 48 | volume: 49 | - metric: characters 50 | count: 19821 51 | - metric: files 52 | count: 9 53 | - metric: lines 54 | count: 751 55 | - metric: regions 56 | count: 41 57 | production-software: "eScriptorium + Kraken" 58 | -------------------------------------------------------------------------------- /catalog/naval-kishore/naval-kishore.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Ground truth data for printed Devanagari 3 | url: https://doi.org/10.11588/data/EGOKEI 4 | authors: 5 | - name: Nicole 6 | surname: Merkel-Hilf 7 | orcid: 0000-0002-0344-6169 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - name: Daria 12 | surname: Peshcherova 13 | roles: 14 | - support 15 | institutions: 16 | - name: Heidelberg University Library 17 | description: >- 18 | Ground truth (GT) data (jpg and alto xml files) for an OCR model that 19 | recognizes printed text in Devanagari script. 20 | 21 | 22 | The GT data was trained on Transkribus with the HTR+ engine. The training was 23 | performed on appr. 220 pages with appr. 27,000 words. 
The validation set was 24 | 10% of the training set. 25 | 26 | 27 | The training material is comprised of letterpress printings from the Naval 28 | Kishore Press (Lakhnau, North India) from the late 19th and early 20th century 29 | in the Hindi, Sanskrit, Braj Bhasha and Awadhi languages. 30 | 31 | 32 | Transcription was performed by Nicole Merkel-Hilf (CATS Library / Heidelberg 33 | University Library) with support by Daria Peshcherova (CATS Library / 34 | Heidelberg University Library). 35 | project-name: Naval Kishore Press - digital 36 | project-website: https://digi.ub.uni-heidelberg.de/en/sammlungen/suedasien/navalkishore.html 37 | language: 38 | - hin 39 | - san 40 | - bra 41 | production-software: Transkribus 42 | script: 43 | - iso: Deva 44 | script-type: only-typed 45 | time: 46 | notBefore: '1880' 47 | notAfter: '1953' 48 | hands: 49 | count: less-than-11 50 | precision: exact 51 | license: 52 | - name: CC-BY 4.0 53 | url: https://creativecommons.org/licenses/by/4.0/ 54 | format: Alto-XML 55 | volume: 56 | - metric: lines 57 | count: 4333 58 | transcription-guidelines: Diplomatic transcription, no correction of misspellings 59 | -------------------------------------------------------------------------------- /catalog/burchards-dekret-digital/bdd-segmentation-data.yml: -------------------------------------------------------------------------------- 1 | 2 | 3 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 4 | title: Burchards Dekret Digital (BDD) Segmentation Data 5 | url: https://github.com/michaelscho/bdd-segmentation-data 6 | authors: 7 | - name: Michael 8 | surname: Schonhardt 9 | orcid: 0000-0002-2750-1900 10 | roles: 11 | - aligner 12 | - project-manager 13 | - quality-control 14 | - name: Leo 15 | surname: Felder 16 | orcid: 0009-0008-7230-4229 17 | roles: 18 | - support 19 | - name: Torben 20 | surname: Jordan 21 | orcid: 0009-0002-2143-0520 22 | roles: 23 | - support 24 | - name: Christopher 25 | surname: Oed 26 | orcid: 
0009-0001-3910-1832 27 | roles: 28 | - support 29 | institutions: [] 30 | description: >- 31 | This dataset comprises PageXML for training segmentation models in Transkribus 32 | and Kraken. It is designed to capture the specific layout of medieval canon 33 | law collections. Compiled from several 11th-century manuscripts of the 34 | Decretum Burchardi, it supports the ongoing edition project Burchards Dekret 35 | Digital. Annotations are tailored to project-specific needs but can be adapted 36 | for other use cases. The data was first prepared using Transkribus and then 37 | remasked in eScriptorium for usage in Kraken. 38 | project-name: Burchards Dekret Digital 39 | project-website: https://www.adwmainz.de/projekte/burchards-dekret-digital/informationen.html 40 | language: 41 | - lat 42 | production-software: eScriptorium + Kraken + Transkribus 43 | automatically-aligned: false 44 | script: 45 | - iso: Latn 46 | script-type: only-manuscript 47 | time: 48 | notBefore: '1000' 49 | notAfter: '1199' 50 | hands: 51 | count: unknown 52 | precision: exact 53 | license: 54 | name: CC-BY 4.0 55 | url: https://creativecommons.org/licenses/by/4.0/ 56 | format: Page-XML 57 | volume: 58 | - metric: pages 59 | count: 3000 60 | 61 | -------------------------------------------------------------------------------- /catalog/tarima/tarima.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: TariMa 3 | url: https://github.com/calfa-co/tarima 4 | authors: 5 | - name: Antoine 6 | surname: Perrier 7 | orcid: 0000-0002-5035-4283 8 | roles: 9 | - project-manager 10 | - name: Chahan 11 | surname: Vidal-Gorène 12 | orcid: 0000-0003-1567-6508 13 | roles: 14 | - project-manager 15 | institutions: 16 | - name: BULAC 17 | roles: 18 | - project-manager 19 | - name: Calfa 20 | roles: 21 | - project-manager 22 | - transcriber 23 | description: >- 24 | The dataset has been collated 
within the frame of the TariMa project (Tarih 25 | al-Maghrib. Writing History in the Maghreb in the modern and contemporary 26 | era), sponsored by the French agency Collex-Persee and led by Antoine 27 | Perrier (CNRS). It comprises different image resolution and size (width from 28 | 982px to 8049px), different layouts (double page, multiple columns), and state 29 | of conservation. It also mixes microfilms, scans and lithographies. It 30 | presents a very wide variety representative of the Maghrebi Arabic production. 31 | project-website: https://www.collexpersee.eu/projet/tarima/ 32 | language: 33 | - ara 34 | production-software: Calfa Vision 35 | script: 36 | - iso: Arab 37 | qualify: Maghrebi 38 | script-type: mainly-manuscript 39 | time: 40 | notBefore: '1500' 41 | notAfter: '1899' 42 | hands: 43 | count: more-than-10 44 | precision: estimated 45 | license: 46 | - name: CC-BY 4.0 47 | url: https://creativecommons.org/licenses/by/4.0/ 48 | format: Page-XML 49 | sources: 50 | - reference: '' 51 | link: https://github.com/calfa-co/tarima 52 | volume: 53 | - metric: files 54 | count: 120 55 | - metric: lines 56 | count: 2673 57 | - metric: characters 58 | count: 146667 59 | transcription-guidelines: >- 60 | We follow the RASAM guidelines for the transcription of Arabic Maghrebi 61 | manuscripts. 
62 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/add-a-new-dataset-description.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Add a new dataset description 3 | about: Template to add the description of a new dataset 4 | title: "[catalog] New repo {project-name/dataset-name}" 5 | labels: project 6 | assignees: '' 7 | 8 | --- 9 | ## Autonomy 10 | 11 | Check applicable situation: 12 | 13 | - [ ] I know how to make a Pull Request and will create the corresponding directory and files under "[htr-united/catalog/](https://github.com/HTR-United/htr-united/tree/master/catalog)" 14 | - [ ] I don't know how to do a Pull Request, I need assistance to add the description of my dataset under "[htr-united/catalog/](https://github.com/HTR-United/htr-united/tree/master/catalog)" 15 | 16 | 17 | ## Description of the dataset 18 | 19 | ### Checklist 20 | - [ ] name of the corpus is explicitly stated 21 | - [ ] name of the project is explicitly stated 22 | - [ ] authors and roles are explicitly stated 23 | - [ ] a license is associated with the dataset 24 | - [ ] the dataset is described in a clear and explicit way enabling other users to understand its content and context of creation 25 | - [ ] the dataset uses standard formats such as PAGE XML or ALTO XML and is aligned with images 26 | 27 | ### Relevant information 28 | 29 | - name of the corpus[1](#fn1): 30 | - name of the project[2](#fn2): 31 | - description generated with [our form](https://htr-united.github.io/document-your-data-en.html): 32 | ``` 33 | [paste description here] 34 | ``` 35 | 36 | --- 37 | 38 | 1: This name will be used to create a YAML file dedicated to this dataset. 
*For example: if your dataset is called "My Awesome Dataset", its description will be saved under "my-awesome-dataset.yml"* 39 | 40 | 2: This name will be used to create a folder under "catalog/" containing all the datasets related to your project. *For example: if your project is called "My Awesome Project", the YAML file(s) describing your datasets will be saved under "catalog/my-awesome-project/"* 41 | -------------------------------------------------------------------------------- /catalog/bullinger/gwalther-htr.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Gwalther Handwriting Ground Truth 3 | url: https://zenodo.org/record/4780947#.YhN5pVvMLUQ 4 | project-name: 'Bullinger digital 5 | 6 | ' 7 | project-website: https://www.bullinger-digital.ch/ 8 | authors: 9 | - name: "Str\xF6bel" 10 | surname: Phillip Benjamin 11 | roles: 12 | - aligner 13 | - quality-control 14 | - support 15 | - name: Stotz 16 | surname: Peter 17 | roles: 18 | - transcriber 19 | description: "This is ground truth for Rudolph Gwalther\u2019s (1519-1586) handwriting\ 20 | \ taken from his book \"Lateinische Gedichte\", where he accumulated writings\ 21 | \ between 1540 and 1580. Data collection and ground truth creation: At the time\ 22 | \ we collected the data, we found 150 images with corresponding transcriptions by\ 23 | \ Peter Stotz on e-manuscripta (reference: Gwalther, Rudolf: Lateinische Gedichte.\ 24 | \ Z\xFCrich, 1540-1580. Zentralbibliothek Z\xFCrich, Ms D 152, https://doi.org/10.7891/e-manuscripta-26750\ 25 | \ / Public Domain Mark). We removed 8 images with too many corrections or vertical\ 26 | \ texts. Next, we uploaded the images into the Transkribus platform, applied the\ 27 | \ line recognition tool and manually copied the transcribed text lines into the\ 28 | \ recognised line boxes. 
During this process, we made some corrections, which were\ 29 | \ mainly due to inconsistencies in punctuation and capitalised letters.\n" 30 | language: 31 | - lat 32 | script: 33 | - iso: Latn 34 | script-type: only-manuscript 35 | time: 36 | notBefore: '1540' 37 | notAfter: '1580' 38 | hands: 39 | count: '1' 40 | precision: exact 41 | license: 42 | name: CC-BY 4.0 43 | url: https://creativecommons.org/licenses/by/4.0/ 44 | format: Alto-XML 45 | volume: 46 | - count: 4040 47 | metric: lines 48 | - count: 142 49 | metric: files 50 | - count: 155 51 | metric: regions 52 | - count: 144301 53 | metric: characters 54 | production-software: Transkribus 55 | -------------------------------------------------------------------------------- /catalog/fondue/fondue-gasparosarditoponomasia-dataset.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: FoNDUE-GasparoSardiToponomasia-Dataset 3 | url: https://github.com/PaulineJac/GasparoSardiToponomasia/tree/main/HTR 4 | authors: 5 | - name: Jacsont 6 | surname: Pauline 7 | roles: 8 | - transcriber 9 | - quality-control 10 | - digitization 11 | - name: Mittenhuber 12 | surname: Florian 13 | institutions: [] 14 | description: >- 15 | Dataset produced for the project to edit Gasparo Sardi’s Toponomasia from 16 | codex 174 of the Burgerbibliothek of Bern. Images are available on request by writing to: pauline.jacsont [ at ] unige.ch. 
17 | project-name: FoNDUE 18 | language: 19 | - lat 20 | production-software: eScriptorium + Kraken 21 | script: 22 | - iso: Latn 23 | - iso: Grek 24 | script-type: only-manuscript 25 | time: 26 | notBefore: '1561' 27 | notAfter: '1570' 28 | hands: 29 | count: '1' 30 | precision: exact 31 | license: 32 | - name: CC-BY 4.0 33 | url: https://creativecommons.org/licenses/by/4.0/ 34 | format: Alto-XML 35 | sources: 36 | - reference: '' 37 | link: http://katalog.burgerbib.ch/detail.aspx?ID=340662 38 | volume: 39 | - metric: pages 40 | count: 49 41 | citation-file-link: >- 42 | https://github.com/PaulineJac/GasparoSardiToponomasia/blob/main/HTR/CITATION.cff 43 | transcription-guidelines: 'The transcriptions were made following the rules of the GitHub cremma-medieval repository - https://github.com/HTR-United/cremma-medieval. The transcription is strictly diplomatic and graphematic. No abbreviations are resolved, no standardization of ''i'' and ''v'' with Ramist letters, and accents, punctuation, spaces, and line breaks are strictly adhered to. Following Leiden conventions, crossed-out or struck-through elements are transcribed with double brackets ⟦⟧, and elements that are illegible in the picture will not be restored but indicated by this type of bracket ⟨ ⟩. Special characters are encoded according to the MUFI fonts.' 
44 | -------------------------------------------------------------------------------- /catalog/transcriboquest-2025/transcriboquest-2025-medieval-latin.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: 'TranscriboQuest 2025: Medieval Latin' 3 | url: https://www.doi.org/10.5281/zenodo.17062009 4 | authors: 5 | - name: Boutreux 6 | surname: Agnès 7 | roles: 8 | - transcriber 9 | - name: Chevalier 10 | surname: Romain 11 | roles: 12 | - transcriber 13 | - name: Corongiu 14 | surname: Chiara 15 | roles: 16 | - transcriber 17 | - name: Gaucher 18 | surname: Sarah 19 | orcid: 0000-0002-1605-3583 20 | roles: 21 | - transcriber 22 | - name: Guéville 23 | surname: Estelle 24 | orcid: 0000-0003-2603-1051 25 | roles: 26 | - transcriber 27 | - name: Kienzl 28 | surname: Annabelle 29 | roles: 30 | - transcriber 31 | - name: Maliszewski 32 | surname: Jan 33 | roles: 34 | - transcriber 35 | - name: Gille Levenson 36 | surname: Matthias 37 | orcid: 0000-0001-9488-5986 38 | roles: 39 | - project-manager 40 | - support 41 | - quality-control 42 | description: Dataset from TranscriboQuest 2025, Medieval Latin group. This dataset focuses on layout. All manuscripts are glossed latin manuscripts with complex layouts. The dataset contains 5000 typed lines, 700 of which have been transcribed. 
43 | language: 44 | - lat 45 | production-software: eScriptorium + Kraken 46 | automatically-aligned: false 47 | script: 48 | - iso: Latn 49 | script-type: only-manuscript 50 | time: 51 | notBefore: '800' 52 | notAfter: '1499' 53 | hands: 54 | count: 'more-than-10' 55 | precision: estimated 56 | license: 57 | name: CC-BY-NC-SA 4.0 58 | url: https://creativecommons.org/licenses/by-sa/4.0/ 59 | format: Alto-XML 60 | volume: 61 | - metric: files 62 | count: 37 63 | - metric: lines 64 | count: 5060 65 | - metric: regions 66 | count: 358 67 | transcription-guidelines: |- 68 | transcription — https://catmus-guidelines.github.io/ 69 | segmentation — https://segmonto.github.io/ 70 | -------------------------------------------------------------------------------- /catalog/banq/copiste-d-un-jour.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: Copiste d’un jour 3 | url: https://github.com/banq-dcn/Copiste-d-un-jour 4 | authors: 5 | - name: Adèle 6 | surname: Aubin 7 | orcid: 0009-0009-3756-1606 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - name: Pascale 12 | surname: Montmartin 13 | orcid: 0009-0002-5683-2423 14 | roles: 15 | - project-manager 16 | institutions: 17 | - name: BAnQ 18 | description: >- 19 | This project draws inspiration from the CREMMA WIKIPEDIA data set, with the 20 | objective to create a ground truth repository of contemporary Québécois 21 | handwriting to train HTR models. It is based on a collection of randomly 22 | selected Wikipedia summaries. Each text comprises between 125 and 175 words 23 | and was copied by hand by volunteers. The texts were ordered in a way to 24 | prioritize texts that presented rare character 1- and 2-grams. Non-French 25 | characters were replaced with "-". In general, the copy of one text took 26 | between 1 and 2 pages. 
In total, 267 volunteers copied 265 texts (2 texts were 27 | unfortunately copied twice by two different volunteers). We took care of the 28 | alignment between the handwritten portion and the original text. 29 | project-name: Copiste d'un jour 30 | language: 31 | - fra 32 | production-software: eScriptorium + Kraken 33 | automatically-aligned: false 34 | script: 35 | - iso: Latn 36 | script-type: only-manuscript 37 | time: 38 | notBefore: '2024' 39 | notAfter: '2024' 40 | hands: 41 | count: 1-per-file 42 | precision: estimated 43 | license: 44 | name: CC-BY 4.0 45 | url: https://creativecommons.org/licenses/by/4.0/ 46 | format: Alto-XML 47 | volume: 48 | - metric: files 49 | count: 333 50 | - metric: pages 51 | count: 333 52 | - metric: characters 53 | count: 316715 54 | - metric: lines 55 | count: 6989 56 | transcription-guidelines: https://gist.github.com/alix-tz/6f89444521bf1cab0522da520f7e4ff4 57 | -------------------------------------------------------------------------------- /catalog/stabs-urfehdebuch/urfehdebuch-htr.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: "Handwritten Text Recognition Ground Truth Set: StABS Ratsb\xFCcher O10, Urfehdenbuch\ 3 | \ X" 4 | url: https://doi.org/10.5281/zenodo.5153263 5 | authors: 6 | - name: Susanna 7 | surname: Burghartz 8 | roles: 9 | - project-manager 10 | - name: Calvi 11 | surname: Sonia 12 | roles: 13 | - project-manager 14 | - quality-control 15 | - name: Vogeler 16 | surname: Georg 17 | roles: 18 | - project-manager 19 | - name: Baur 20 | surname: Laila 21 | roles: 22 | - transcriber 23 | - name: Egli 24 | surname: Benedikt 25 | roles: 26 | - transcriber 27 | - name: Gehrig 28 | surname: Gabriela 29 | roles: 30 | - transcriber 31 | - name: Heini 32 | surname: Alexandra Isabelle 33 | roles: 34 | - transcriber 35 | - name: Rossi 36 | surname: Rosanna 37 | roles: 38 | - transcriber 39 | - name: 
Siegrist 40 | surname: Benjamin 41 | roles: 42 | - transcriber 43 | - name: Wasmer 44 | surname: Remo 45 | roles: 46 | - transcriber 47 | - name: Zimmermann 48 | surname: Lynn 49 | roles: 50 | - transcriber 51 | - name: Schoch 52 | surname: David 53 | roles: 54 | - aligner 55 | - name: "D\xE4ngeli" 56 | surname: Peter 57 | roles: 58 | - digitization 59 | - name: Hodel 60 | surname: Tobias 61 | roles: 62 | - project-manager 63 | - aligner 64 | description: Ground Truth for "Urfehdenbuch X der Stadt Basel (1563-1569)" at Staatsarchiv 65 | Basel-Stadt (StABS). 66 | project-website: hdl:11471/1010.2.1 67 | language: 68 | - deu 69 | script: 70 | - iso: Latn 71 | script-type: only-manuscript 72 | time: 73 | notBefore: '1563' 74 | notAfter: '1569' 75 | hands: 76 | count: unknown 77 | precision: estimated 78 | license: 79 | - name: CC-BY-SA 4.0 80 | url: https://creativecommons.org/licenses/by-sa/4.0/ 81 | format: Page-XML 82 | volume: 83 | - metric: lines 84 | count: 8000 85 | transcription-guidelines: 'See: http://gams.uni-graz.at/o:ufbas.1563' 86 | production-software: Transkribus 87 | -------------------------------------------------------------------------------- /catalog/impresso/nzz-ocr.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: "Ground truth for Neue Z\xFCrcher Zeitung black letter period" 3 | url: https://zenodo.org/record/3333627#.YhN1G1vMLUQ 4 | project-name: 'impresso'' 5 | 6 | ' 7 | project-website: https://impresso-project.ch/ 8 | authors: 9 | - name: "Str\xF6bel" 10 | surname: Phillip Benjamin 11 | roles: 12 | - transcriber 13 | - aligner 14 | - project-manager 15 | - quality-control 16 | - support 17 | - name: Clematide 18 | surname: Simon 19 | roles: 20 | - transcriber 21 | - quality-control 22 | - name: Watter 23 | surname: Camille 24 | roles: 25 | - transcriber 26 | - name: Meraner 27 | surname: Isabell 28 | roles: 29 | - 
transcriber 30 | description: "The Neue Z\xFCrcher Zeitung (NZZ) has been publishing in black letter\ 31 | \ from its very first issue in 1780 until 1947. From this time period, we randomly\ 32 | \ sampled one frontpage per year, resulting in a total of 167 pages. We chose frontpages\ 33 | \ because they typically contain highly relevant material and because we want to\ 34 | \ make sure not to sample pages containing exclusively advertisements or stock information.\ 35 | \ During certain periods, the NZZ was published several times a day, and there were\ 36 | \ supplements, too. Due to incomplete metadata, the sampling included frontpages\ 37 | \ from supplements. We then manually corrected the pages, so it can be used as a\ 38 | \ ground truth to improve the OCR of black letter in historical newspapers.\n" 39 | language: 40 | - deu 41 | script: 42 | - iso: Latn 43 | script-type: only-typed 44 | time: 45 | notBefore: '1780' 46 | notAfter: '1946' 47 | hands: 48 | count: less-than-11 49 | precision: estimated 50 | license: 51 | - name: CC-BY 4.0 52 | url: https://creativecommons.org/licenses/by/4.0/ 53 | format: Alto-XML 54 | volume: 55 | - count: 43173 56 | metric: lines 57 | - count: 167 58 | metric: files 59 | - count: 6318 60 | metric: regions 61 | - count: 1768146 62 | metric: characters 63 | production-software: Transkribus 64 | -------------------------------------------------------------------------------- /catalog/alix-tz/peraire-ground-truth.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Peraire Ground Truth 3 | url: https://github.com/alix-tz/peraire-ground-truth 4 | authors: 5 | - name: Alix 6 | surname: Chagué 7 | orcid: 0000-0002-0136-4434 8 | roles: 9 | - transcriber 10 | - quality-control 11 | institutions: 12 | - name: Bibliothèque Sébert, Espéranto-France, Paris 13 | roles: 14 | - digitization 15 | description: >- 16 | This 
dataset was created in order to produce an HTR model for the Digital 17 | Peraire project. The documents are handwritten, dating from the second half of 18 | the 20th century, written by Lucien Péraire in French with a blue ink pen or, 19 | more frequently, with a blue pencil. 20 | project-name: Digital Peraire 21 | language: 22 | - fra 23 | production-software: eScriptorium + Kraken 24 | script: 25 | - iso: Latn 26 | script-type: only-manuscript 27 | time: 28 | notBefore: '1928' 29 | notAfter: '1971' 30 | hands: 31 | count: '1' 32 | precision: exact 33 | license: 34 | - name: CC-BY 4.0 35 | url: https://creativecommons.org/licenses/by/4.0/ 36 | format: Alto-XML 37 | volume: 38 | - metric: characters 39 | count: 38793 40 | - metric: files 41 | count: 33 42 | - metric: lines 43 | count: 1059 44 | - metric: regions 45 | count: 80 46 | citation-file-link: https://github.com/alix-tz/peraire-ground-truth/blob/master/CITATION.cff 47 | transcription-guidelines: >- 48 | The transcription respects what is written on the document, including 49 | punctuation and spelling errors. The case is respected: capital letters are 50 | transcribed with capital letters. Crossed out words are signaled by # which 51 | isn't used to transcribe anything else. The SegmOnto ontology was used for the 52 | segmentation of this dataset. For regions, MainZone and MarginTextZone were 53 | used. For lines, DefaultLine and InterlinearLine were used. The original 54 | documents are held at the Bibliothèque Sébert, Espéranto-France, Paris. They 55 | should be mentioned every time the images are used. 
56 | -------------------------------------------------------------------------------- /catalog/TranscriboQuest_Arabic/htr-united.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: TranscriboQuest_Arabic_team 3 | url: https://doi.org/10.5281/zenodo.13757236 4 | authors: 5 | - name: Ephrem Aboud 6 | surname: Ishac 7 | orcid: 0000-0003-2943-6556 8 | roles: 9 | - transcriber 10 | - aligner 11 | - quality-control 12 | - name: Enki 13 | surname: Baptiste 14 | orcid: 0009-0004-3456-9796 15 | roles: 16 | - transcriber 17 | - aligner 18 | - quality-control 19 | institutions: [] 20 | description: 'Dataset on an Arabic corpus of Christian-Islamic theology. ' 21 | project-name: TranscriboQuest 2024 22 | language: 23 | - ara 24 | production-software: eScriptorium + Kraken 25 | automatically-aligned: false 26 | script: 27 | - iso: Arab 28 | script-type: only-manuscript 29 | time: 30 | notBefore: '1200' 31 | notAfter: '1600' 32 | hands: 33 | count: 1-per-folder 34 | precision: estimated 35 | license: 36 | name: CC-BY-SA 4.0 37 | url: https://creativecommons.org/licenses/by-sa/4.0/ 38 | format: Alto-XML 39 | volume: 40 | - metric: lines 41 | count: 153 42 | transcription-guidelines: >- 43 | ▶ Data format: XML ALTO 44 | 45 | ▶ Number of transcribed lines: 153 46 | 47 | ▶ author/creator/curator of the dataset: Enki Baptiste and Ephrem Aboud Ishac 48 | 49 | ▶ Segmentation tools, HTR engine and interface: OpenITI model 50 | (https://github.com/OpenITI/acdc_results/blob/main/models/gen2-print-n7m5-union-ft_best.mlmodel); 51 | eScriptorium; Kraken 52 | 53 | ▶ Language of the corpus, Date: Arabic, end of the 16th century 54 | 55 | ▶ Type, support of documents, script: paper; mashriqi naskh 56 | 57 | ▶ Transcription method: diplomatic transcription respecting the tanwin, the 58 | shadda and the diacritic marks. 
59 | 60 | ▶ Theme, collection, object of the dataset: theology; Maktabat al-Sālimī, 61 | Bidiyya, Oman, ms. AS 250 4v-5f 62 | (https://elibrary.mara.gov.om/en/omani-library/imam-nour-al-din-al-salmi-s-library/book/?id=324#book/7); 63 | St Mark Monastery, Jerusalem, SMMJ 00264 2v-5r 64 | -------------------------------------------------------------------------------- /catalog/ifloral/ifloral-dataset.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Alexandre 3 | orcid: 0009-0007-4781-3294 4 | roles: 5 | - aligner 6 | - quality-control 7 | surname: Matos 8 | - name: Rui 9 | orcid: 0000-0001-5767-1583 10 | roles: 11 | - transcriber 12 | surname: Neves 13 | - name: Gonçalo 14 | roles: 15 | - transcriber 16 | surname: Monteiro 17 | - name: Catarina 18 | roles: 19 | - transcriber 20 | surname: Coelho 21 | - name: Pedro 22 | orcid: 0009-0004-9005-6688 23 | roles: 24 | - aligner 25 | surname: Bastos 26 | automatically-aligned: false 27 | description: >- 28 | This dataset was designed for training machine learning models in the context 29 | of the [iForal project](https://iforal.hypotheses.org/), which focuses on 30 | transcribing medieval Portuguese texts, specifically forais (charters). It 31 | includes images of medieval manuscripts, along with corresponding line-level 32 | transcription labels, to facilitate the development of models capable of 33 | recognizing and transcribing historical handwriting. 34 | 35 | The dataset is ideal for OCR/HTR tasks and segmentation tasks within the 36 | domain of medieval document transcription. It serves as a critical resource 37 | for advancing automated transcription tools for medieval texts, making 38 | historical archives more accessible. 
39 | format: Page-XML 40 | hands: 41 | count: unknown 42 | precision: exact 43 | institutions: [] 44 | language: 45 | - lat 46 | - por 47 | license: 48 | name: CC-BY 4.0 49 | url: https://creativecommons.org/licenses/by/4.0/ 50 | production-software: eScriptorium + Kraken 51 | project-name: iForal 52 | project-website: https://iforal.hypotheses.org/ 53 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 54 | script: 55 | - iso: Latn 56 | script-type: only-manuscript 57 | time: 58 | notAfter: '1491' 59 | notBefore: '1217' 60 | title: iForal-Dataset 61 | url: https://github.com/Arch-W/iForal-Dataset 62 | volume: 63 | - count: 776873 64 | metric: characters 65 | - count: 180 66 | metric: files 67 | - count: 8009 68 | metric: lines 69 | - count: 183 70 | metric: regions 71 | -------------------------------------------------------------------------------- /catalog/cremma/mss-20.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: CREMMA Manuscrits du 20e 3 | url: https://github.com/HTR-United/CREMMA-MSS-20 4 | project-name: CREMMA 5 | authors: 6 | - name: "Cl\xE9rice" 7 | surname: Thibault 8 | roles: 9 | - project-manager 10 | - quality-control 11 | - name: "Chagu\xE9" 12 | surname: Alix 13 | roles: 14 | - project-manager 15 | - quality-control 16 | description: 'Manuscripts of the 20th century 17 | 18 | ' 19 | language: 20 | - fra 21 | script: 22 | - iso: Latn 23 | script-type: only-manuscript 24 | time: 25 | notBefore: '1900' 26 | notAfter: '1999' 27 | hands: 28 | count: 1-per-folder 29 | precision: exact 30 | license: 31 | - name: CC-BY 4.0 32 | url: https://creativecommons.org/licenses/by/4.0/ 33 | format: Alto-XML 34 | volume: 35 | - metric: characters 36 | count: 5764 37 | - metric: files 38 | count: 13 39 | - metric: lines 40 | count: 224 41 | - metric: regions 42 | count: 25 43 | transcription-guidelines: "Abr\xE9viations 
conserv\xE9es." 44 | production-software: eScriptorium + Kraken 45 | characters: 46 | mode: NFKD 47 | members: 48 | - e 49 | - a 50 | - s 51 | - n 52 | - t 53 | - r 54 | - i 55 | - u 56 | - l 57 | - o 58 | - d 59 | - c 60 | - m 61 | - p 62 | - "\u0301" 63 | - < 64 | - '>' 65 | - '''' 66 | - v 67 | - q 68 | - ',' 69 | - . 70 | - "\u0300" 71 | - b 72 | - g 73 | - h 74 | - j 75 | - f 76 | - F 77 | - J 78 | - '1' 79 | - '-' 80 | - "\u0302" 81 | - M 82 | - A 83 | - E 84 | - x 85 | - T 86 | - y 87 | - C 88 | - D 89 | - ^ 90 | - O 91 | - '8' 92 | - N 93 | - '7' 94 | - B 95 | - S 96 | - '0' 97 | - "\u0327" 98 | - P 99 | - G 100 | - R 101 | - H 102 | - L 103 | - '9' 104 | - z 105 | - I 106 | - '2' 107 | - ':' 108 | - U 109 | - '&' 110 | - k 111 | - + 112 | - ; 113 | - $ 114 | - V 115 | - "\u0153" 116 | - '[' 117 | - '?' 118 | - ']' 119 | - '4' 120 | - '3' 121 | - ( 122 | - ) 123 | - '6' 124 | -------------------------------------------------------------------------------- /catalog/teklia/belfort.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: Belfort 3 | url: https://zenodo.org/record/8041668 4 | authors: 5 | - name: Solène 6 | surname: Tarride 7 | orcid: 0000-0001-6174-9865 8 | - name: Tristan 9 | surname: Faine 10 | - name: Mélodie 11 | surname: Boillet 12 | orcid: 0000-0002-0618-7852 13 | - name: Harold 14 | surname: Mouchère 15 | orcid: 0000-0001-6220-7216 16 | - name: Christopher 17 | surname: Kermorvant 18 | orcid: 0000-0002-7508-4080 19 | institutions: [] 20 | description: > 21 | This dataset includes minutes of Belfort municipal council drawn up between 22 | 1790 and 1946. Documents include deliberations, lists of councillors, 23 | convocations, and agendas. The dataset includes 24,105 text-line images that 24 | were automatically detected from pages. 
25 | 26 | Up to four transcriptions are available for each line image: 27 | 28 | * two from human annotators (in `Transcriptions/callico_1/` and 29 | `Transcriptions/callico_2/`) 30 | 31 | * two from automatic models (in `Transcriptions/dan/` and 32 | `Transcriptions/pylaia/`) 33 | project-name: Handwritten Text Recognition from Crowdsourced Annotations 34 | project-website: https://arxiv.org/abs/2306.10878 35 | language: 36 | - fra 37 | production-software: Callico 38 | script: 39 | - iso: Latn 40 | script-type: only-manuscript 41 | time: 42 | notBefore: '1790' 43 | notAfter: '1946' 44 | hands: 45 | count: more-than-10 46 | precision: estimated 47 | license: 48 | name: CC-BY 4.0 49 | url: https://creativecommons.org/licenses/by/4.0/ 50 | format: Image-Text-Pairs 51 | sources: 52 | - reference: >- 53 | Solène Tarride, Tristan Faine, Mélodie Boillet, Harold Mouchère, & 54 | Christopher Kermorvant. (2023). The Belfort dataset: Handwritten Text 55 | Recognition from Crowdsourced Annotations [Data set]. 7th International 56 | Workshop on Historical Document Imaging and Processing (HIP'23), San 57 | José, California, USA. Zenodo. 
https://doi.org/10.5281/zenodo.8041668 58 | link: https://arxiv.org/abs/2306.10878 59 | volume: 60 | - metric: lines 61 | count: 24105 62 | -------------------------------------------------------------------------------- /catalog/incunabula-reichenau/incunabula-reichenau.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: Incunabula Reichenau 3 | url: https://doi.org/10.5281/zenodo.11046061 4 | authors: 5 | - name: Annika 6 | surname: Stello 7 | orcid: 0000-0002-6305-4810 8 | roles: 9 | - project-manager 10 | - name: Gerit 11 | surname: Heim 12 | orcid: 0000-0002-5820-7771 13 | roles: 14 | - project-manager 15 | - name: Katharina 16 | surname: Ost 17 | orcid: 0000-0002-6234-9721 18 | roles: 19 | - transcriber 20 | institutions: [] 21 | description: >- 22 | This data set contains the training data for the following three published 23 | Transkribus models\: 24 | 25 | German Incunabula (Reichenau) 26 | Latin Incunabula (Reichenau) 27 | Latin/German Bilingual Incunabula (Reichenau) 28 | 29 | This data set represents an excerpt of a collection of incunabula and post-incunabula 30 | of the former Reichenau monastery, now held at the Badische Landesbibliothek in 31 | Karlsruhe (see https://digital.blb-karlsruhe.de/topic/view/7530707). As, typically, 32 | 1-20 pages were drawn from single prints, it reflects a wide range of typefaces used 33 | by early printers from the German language area and Northern Italy. 34 | 35 | The data was created as part of the project Digitalisierung und Volltexterkennung 36 | der ehemals Reichenauer Inkunabeln at the Badische Landesbibliothek, which was 37 | funded by the Stiftung Kulturgut Baden-Württemberg. 
38 | project-name: Digitalisierung und Volltexterkennung der ehemals Reichenauer Inkunabeln 39 | language: 40 | - lat 41 | - deu 42 | production-software: Transkribus 43 | automatically-aligned: false 44 | script: 45 | - iso: Latn 46 | - iso: Latf 47 | script-type: only-typed 48 | time: 49 | notBefore: '1470' 50 | notAfter: '1510' 51 | hands: 52 | count: more-than-10 53 | precision: exact 54 | license: 55 | name: CC-BY-SA 4.0 56 | url: https://creativecommons.org/licenses/by-sa/4.0/ 57 | format: Page-XML 58 | volume: 59 | - metric: pages 60 | count: 2200 61 | transcription-guidelines: Abbreviations are represented through special characters, please see the project repository for a full documentation. 62 | -------------------------------------------------------------------------------- /catalog/htr-school-vienna/wien-onb-cod-2160-f-164-184-ground-truth-from-htr-winter-school-2022.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Wien ÖNB Cod. 2160 f. 
164-184 Ground Truth from HTR Winter School 2022 3 | url: https://zenodo.org/record/7467027#.Y6LRj3bMK3B 4 | authors: 5 | - name: Geelhaar 6 | surname: Tim 7 | orcid: 0000-0002-7653-5859 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - name: D'Amico 12 | surname: Sara 13 | orcid: 0000-0002-8937-2040 14 | roles: 15 | - transcriber 16 | - name: Hofmann 17 | surname: Lara 18 | orcid: 0000-0003-4698-3906 19 | roles: 20 | - transcriber 21 | - name: Gnasso 22 | surname: Alessandro 23 | orcid: 0000-0001-5964-2989 24 | roles: 25 | - transcriber 26 | - name: Audebrand 27 | surname: Justine 28 | roles: 29 | - transcriber 30 | - name: Stitts 31 | surname: Jeremy 32 | orcid: 0000-0001-6988-1836 33 | roles: 34 | - transcriber 35 | - name: Sweeney 36 | surname: Mary 37 | orcid: 0000-0001-7028-2072 38 | roles: 39 | - transcriber 40 | - name: Atwood 41 | surname: Grace 42 | orcid: 0000-0002-1546-6546 43 | roles: 44 | - transcriber 45 | institutions: [] 46 | description: >- 47 | This is Ground Truth data created during the HTR Winter School 2022 for the 48 | Cod. 2160 ÖNB that contains one version of the so called Lex Dei. 49 | project-name: HTR Winter School 2022, Vienna 50 | language: 51 | - lat 52 | production-software: Transkribus 53 | script: 54 | - iso: Latn 55 | qualify: Carolingian Minuscule 56 | script-type: only-manuscript 57 | time: 58 | notBefore: '850' 59 | notAfter: '900' 60 | hands: 61 | count: '1' 62 | precision: exact 63 | license: 64 | - name: CC-BY 4.0 65 | url: https://creativecommons.org/licenses/by/4.0/ 66 | format: Alto-XML 67 | sources: 68 | - reference: '' 69 | link: http://data.onb.ac.at/rec/AC13956457 70 | volume: 71 | - metric: pages 72 | count: 40 73 | transcription-guidelines: >- 74 | Abbreviations resolved, but no normalization and no correcting of misspelling. 75 | No transcription of initials and interlinear script. 
76 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/ajouter-la-description-d-un-nouveau-jeu-de-donn-es.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Ajouter la description d'un nouveau jeu de données 3 | about: Template pour ajouter la description d'un nouveau dataset 4 | title: "[catalog] Nouveau repo {project-name/dataset-name}" 5 | labels: project 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Description du jeu de données 11 | 12 | ### Checklist 13 | - [ ] le nom du corpus est exprimé explicitement 14 | - [ ] le nom du projet est exprimé explicitement 15 | - [ ] les auteur-rices et les rôles sont exprimés explicitement 16 | - [ ] une licence est associée au jeu de données 17 | - [ ] le jeu de données est clairement et explicitement décrit, de manière à permettre aux autres utilisateurs de comprendre son contenu et le contexte de sa création 18 | - [ ] le jeu de données utilise des formats standards comme PAGE XML ou ALTO XML et les transcriptions sont alignées avec des images 19 | 20 | ### Informations importantes 21 | 22 | - nom du corpus[1](#fn1): 23 | - nom du projet[2](#fn2): 24 | - description générée à l'aide de [notre formulaire](https://htr-united.github.io/document-your-data.html): 25 | ``` 26 | [Copier la description ici] 27 | ``` 28 | 29 | ### Autonomie 30 | 31 | Cocher la situation applicable : 32 | 33 | - [ ] Je sais comment faire une Pull Request et je m'occupe de créer un dossier + fichier correspondant à mon dépôt dans "[htr-united/catalog/](https://github.com/HTR-United/htr-united/tree/master/catalog)" 34 | - [ ] Je ne sais pas comment faire une Pull Request, j'ai besoin d'aide pour ajouter une description de mon jeu de données sous "[htr-united/catalog/](https://github.com/HTR-United/htr-united/tree/master/catalog)" 35 | 36 | --- 37 | 38 | 1: Ce nom sera utilisé pour créer le fichier YAML dédié au jeu de données. 
*Par exemple : si votre jeu de données s'appelle "Mon Super Dataset", sa description sera enregistrée sous "mon-super-dataset.yml"* 39 | 40 | 2: Ce nom sera utlisé pour créer un dossier dans "catalog/", il contiendra toutes les descriptions des jeux de données liés à ce projet. *Par exemple : si vous projet s'appelle "Mon Super Projet", le(s) fichier(s) YAML sera(ont) enregistrés sous "catalog/mon-super-projet/"* 41 | -------------------------------------------------------------------------------- /catalog/almanach/dahn.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Chiffoleau 3 | roles: 4 | - project-manager 5 | - aligner 6 | surname: Floriane 7 | characters: 8 | members: 9 | - e 10 | - s 11 | - a 12 | - n 13 | - r 14 | - i 15 | - t 16 | - u 17 | - o 18 | - l 19 | - d 20 | - c 21 | - m 22 | - p 23 | - ́ 24 | - ',' 25 | - v 26 | - . 27 | - f 28 | - q 29 | - g 30 | - ̀ 31 | - '-' 32 | - E 33 | - b 34 | - ’ 35 | - "'" 36 | - h 37 | - A 38 | - L 39 | - N 40 | - x 41 | - j 42 | - S 43 | - R 44 | - I 45 | - T 46 | - M 47 | - ̂ 48 | - C 49 | - P 50 | - y 51 | - O 52 | - ; 53 | - '1' 54 | - £ 55 | - U 56 | - D 57 | - B 58 | - F 59 | - J 60 | - G 61 | - '"' 62 | - '0' 63 | - z 64 | - V 65 | - '9' 66 | - '2' 67 | - ':' 68 | - X 69 | - 70 | - € 71 | - H 72 | - '5' 73 | - '!' 74 | - '3' 75 | - '4' 76 | - ̧ 77 | - ° 78 | - W 79 | - Y 80 | - '6' 81 | - '8' 82 | - '?' 
83 | - '7' 84 | - K 85 | - Q 86 | - / 87 | - ( 88 | - ) 89 | - k 90 | - œ 91 | - w 92 | - ̈ 93 | - … 94 | - Z 95 | - – 96 | - '&' 97 | - '%' 98 | - '=' 99 | - $ 100 | - _ 101 | mode: NFD 102 | description: OCR ground Truth dataset based on French 20th typewritten letters 103 | format: Alto-XML 104 | hands: 105 | count: less-than-11 106 | precision: exact 107 | language: 108 | - fra 109 | license: 110 | - name: CC-BY 4.0 111 | url: https://creativecommons.org/licenses/by/4.0/ 112 | production-software: eScriptorium + Kraken 113 | project-name: DAHN 114 | project-website: https://digitalintellectuals.hypotheses.org/category/dahn 115 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 116 | script: 117 | - iso: Latn 118 | script-type: only-typed 119 | time: 120 | notAfter: '1924' 121 | notBefore: '1914' 122 | title: DAHN Corpus 123 | url: https://github.com/HTR-United/dahncorpus 124 | volume: 125 | - count: 475849 126 | metric: characters 127 | - count: 547 128 | metric: files 129 | - count: 12539 130 | metric: lines 131 | - count: 527 132 | metric: pages 133 | - count: 547 134 | metric: regions 135 | -------------------------------------------------------------------------------- /catalog/hismodoc-htr/titres-nobiliaires-17-18-siecles-dataset.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: Jeu de données HTR « Titres nobiliaires 17e-18e siècles » 3 | url: https://github.com/HisMoDoc-HTR/TitresNobiliaires_17_18/tree/main 4 | authors: 5 | - name: Jean-François 6 | surname: Moufflet 7 | roles: 8 | - transcriber 9 | - project-manager 10 | - quality-control 11 | - digitization 12 | - name: Chloé 13 | surname: Fize 14 | roles: 15 | - transcriber 16 | - aligner 17 | - name: Lucas 18 | surname: Terriel 19 | orcid: 0000-0002-9189-258X 20 | roles: 21 | - transcriber 22 | - aligner 23 | - quality-control 24 | - support 25 | institutions: [] 26 
| description: >- 27 | Ce dataset pour la reconnaissance automatique des écritures est composé d’un 28 | mélange de transcriptions de documents du 17e-18e siècle (actes de mariage, 29 | preuves de noblesse etc.), essentiellement en français, et provenant de la 30 | série M, titre III "Titres nobiliaires" des Archives nationales de France. 31 | language: 32 | - fra 33 | production-software: eScriptorium + Kraken 34 | automatically-aligned: false 35 | script: 36 | - iso: Latn 37 | script-type: only-manuscript 38 | time: 39 | notBefore: '1600' 40 | notAfter: '1799' 41 | hands: 42 | count: less-than-11 43 | precision: estimated 44 | license: 45 | name: Etalab OL 2.0 46 | url: https://spdx.org/licenses/etalab-2.0.html 47 | format: Alto-XML 48 | volume: 49 | - metric: lines 50 | count: 726 51 | - metric: pages 52 | count: 44 53 | - metric: regions 54 | count: 242 55 | - metric: characters 56 | count: 25458 57 | citation-file-link: https://github.com/HisMoDoc-HTR/TitresNobiliaires_17_18/blob/main/CITATION.cff 58 | transcription-guidelines: >- 59 | Les transcriptions suivent les conventions éditoriales définies par : 60 | 61 | Bernard Barbiche, Conseils pour l’édition des textes de l’époque moderne 62 | (XVIe-XVIIIe siècle), École nationale des chartes, publié en ligne sur Thélème 63 | (consulté le 01/03/2025). 64 | 65 | Autres précisions : 66 | 67 | - Les abréviations ont été résolues. 68 | 69 | - L'orthographe d'origine a été conservée, y compris les fautes éventuelles. 
70 | -------------------------------------------------------------------------------- /catalog/cremma/mss-16.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Thibault 3 | orcid: 0000-0003-1852-9204 4 | roles: 5 | - project-manager 6 | - quality-control 7 | - support 8 | surname: Clérice 9 | - name: Alix 10 | orcid: 0000-0002-0136-4434 11 | roles: 12 | - project-manager 13 | - quality-control 14 | - support 15 | surname: Chagué 16 | - name: Anaïs 17 | roles: 18 | - transcriber 19 | surname: Mazoue 20 | automatically-aligned: false 21 | characters: 22 | members: 23 | - e 24 | - r 25 | - n 26 | - a 27 | - u 28 | - o 29 | - t 30 | - i 31 | - l 32 | - ſ 33 | - d 34 | - s 35 | - c 36 | - m 37 | - p 38 | - v 39 | - y 40 | - q 41 | - g 42 | - f 43 | - b 44 | - z 45 | - h 46 | - J 47 | - / 48 | - x 49 | - R 50 | - ^ 51 | - L 52 | - I 53 | - . 54 | - E 55 | - ẜ 56 | - ⁊ 57 | - M 58 | - '1' 59 | - ꝑ 60 | - A 61 | - ́ 62 | - ̾ 63 | - < 64 | - '>' 65 | - j 66 | - C 67 | - D 68 | - '3' 69 | - ꝙ 70 | - '9' 71 | - V 72 | - '7' 73 | - '6' 74 | - ’ 75 | - P 76 | - '8' 77 | - Ꝑ 78 | - ̃ 79 | - T 80 | - ( 81 | - S 82 | - N 83 | - ; 84 | - Q 85 | - ̀ 86 | - '5' 87 | - '0' 88 | - U 89 | mode: NFD 90 | citation-file-link: https://github.com/HTR-United/CREMMA-MSS-16/CITATION.cff 91 | description: Manuscripts of the 16th century 92 | format: Alto-XML 93 | hands: 94 | count: 1-per-folder 95 | precision: exact 96 | institutions: [] 97 | language: 98 | - fra 99 | license: 100 | name: CC-BY 4.0 101 | url: https://creativecommons.org/licenses/by/4.0/ 102 | production-software: eScriptorium + Kraken 103 | project-name: CREMMA 104 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 105 | script: 106 | - iso: Latn 107 | script-type: only-manuscript 108 | time: 109 | notAfter: '1599' 110 | notBefore: '1500' 111 | title: CREMMA MSS 16 112 | transcription-guidelines: Abréviations conservées. 
113 | url: https://github.com/HTR-United/CREMMA-MSS-16 114 | volume: 115 | - count: 10911 116 | metric: characters 117 | - count: 9 118 | metric: files 119 | - count: 244 120 | metric: lines 121 | - count: 18 122 | metric: regions 123 | -------------------------------------------------------------------------------- /catalog/inha/LettresDeJacquesDoucetAReneJean1908-1929.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: La Correspondances Jacques Doucet - René Jean 3 | url: https://gitlab.inha.fr/snr/LaCorrespondanceDoucetReneJean 4 | authors: 5 | - name: Cugy 6 | surname: Pascale 7 | roles: 8 | - transcriber 9 | - project-manager 10 | - quality-control 11 | - name: Fieschi 12 | surname: Caroline 13 | roles: 14 | - project-manager 15 | - quality-control 16 | - name: Peyrard 17 | surname: Alix 18 | roles: 19 | - transcriber 20 | - quality-control 21 | - name: Prohin 22 | surname: Lucie 23 | roles: 24 | - transcriber 25 | - quality-control 26 | - name: Sarda 27 | surname: Marie-Anne 28 | roles: 29 | - support 30 | institutions: 31 | - name: Institut National de l'histoire de l'art (INHA) 32 | roles: 33 | - transcriber 34 | - project-manager 35 | - quality-control 36 | - name: Bibliothèque nationale de France 37 | roles: 38 | - digitization 39 | description: >- 40 | Projet entrepris dans le cadre du programme La Bibliothèque d’art et 41 | d’archéologie de Jacques Doucet : corpus, savoirs et réseaux de l’Institut 42 | national d’histoire de l’art à partir d’un corpus de lettres et documents 43 | conservés au Département des manuscrits de la Bibliothèque nationale de France 44 | sous la cote NAF 13124, une des principales sources sur la relation entre 45 | Doucet et René Jean qu’il engagea comme bibliothécaire le 2 juin 1908. 
46 | project-name: PENSE@INHA 47 | project-website: https://skylab.inha.fr/PENSE/LettresDeJacquesDoucetAReneJean1908-1929/ 48 | language: 49 | - fra 50 | production-software: Transkribus 51 | script: 52 | - iso: Latn 53 | script-type: mainly-manuscript 54 | time: 55 | notBefore: '1908' 56 | notAfter: '1929' 57 | hands: 58 | count: less-than-11 59 | precision: exact 60 | license: 61 | - name: Etalab OL 2.0 62 | url: https://spdx.org/licenses/etalab-2.0.html 63 | format: Alto-XML 64 | volume: 65 | - metric: characters 66 | count: 83312 67 | - metric: lines 68 | count: 2987 69 | - metric: pages 70 | count: 200 71 | - metric: files 72 | count: 200 73 | -------------------------------------------------------------------------------- /catalog/fondue/fondue-spanish-chapbooks-19th-c-dataset.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: FoNDUE Spanish chapbooks 19th c. Dataset 3 | url: https://github.com/DesenrollandoElCordel/FoNDUE-Spanish-chapbooks-Dataset 4 | authors: 5 | - name: Carta 6 | surname: Constance 7 | roles: 8 | - transcriber 9 | - project-manager 10 | - name: Leblanc 11 | surname: "\xC9lina" 12 | roles: 13 | - digitization 14 | - name: Jacsont 15 | surname: Pauline 16 | roles: 17 | - digitization 18 | - name: Palacios 19 | surname: Belinda 20 | roles: 21 | - transcriber 22 | - quality-control 23 | - name: Bermudez 24 | surname: Luana 25 | roles: 26 | - transcriber 27 | - quality-control 28 | description: Digital editions of the second part of the Genevan Spanish chapbooks 29 | collection (19th c.). 
30 | project-name: Desenrollando El Cordel 31 | project-website: https://github.com/DesenrollandoElCordel 32 | language: 33 | - cat 34 | - spa 35 | - lat 36 | script: 37 | - iso: Latn 38 | script-type: only-typed 39 | time: 40 | notBefore: '1770' 41 | notAfter: '1920' 42 | hands: 43 | count: more-than-10 44 | precision: exact 45 | license: 46 | - name: CC-BY-SA 4.0 47 | url: https://creativecommons.org/licenses/by-sa/4.0/ 48 | format: Alto-XML 49 | sources: 50 | - reference: '' 51 | link: https://unige.swisscovery.slsp.ch/permalink/41SLSP_UGE/btt5ev/alma991008229029705502 52 | - reference: '' 53 | link: https://unige.swisscovery.slsp.ch/permalink/41SLSP_UGE/kjkm12/alma991002834309705502 54 | volume: 55 | - metric: characters 56 | count: 270718 57 | - metric: lines 58 | count: 12526 59 | - metric: pages 60 | count: 198 61 | citation-file-link: https://github.com/DesenrollandoElCordel/FoNDUE-Spanish-chapbooks-Dataset/blob/main/Grountruth/CITATION.cff 62 | transcription-guidelines: "Les r\xE8gles de transcription suivante ont \xE9t\xE9 adopt\xE9\ 63 | es :\n- Respecter les accents ;\n- Respecter la casse ;\n- Respecter la ponctuation\ 64 | \ ;\n- Respecter les espaces ;\n- Respecter les retours \xE0 la ligne ;\n- Respecter\ 65 | \ la graphie des mots (ne pas corriger les erreurs s\u2019il y en a) ;\n- Supprimer\ 66 | \ le bruit (t\xE2ches qui ont \xE9t\xE9 prises pour du texte par l\u2019OCR)." 
67 | production-software: "eScriptorium + Kraken" 68 | -------------------------------------------------------------------------------- /catalog/antwerp_bias-in-history/arletta.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: ARletta 3 | url: https://zenodo.org/records/11191457 4 | authors: 5 | - name: Lith 6 | surname: Lefranc 7 | - name: Ilja 8 | surname: Van Damme 9 | - name: Thibault 10 | surname: Clérice 11 | - name: Mike 12 | surname: Kestemont 13 | institutions: 14 | - name: University of Antwerp 15 | - name: National Institute for Research in Digital Science and Technology, Paris 16 | description: Open-source handwritten text recognition models for historic Dutch 17 | project-name: Bias in History 18 | project-website: https://www.bias-in-history.eu/ 19 | language: 20 | - nld 21 | - fra 22 | production-software: eScriptorium + Kraken 23 | automatically-aligned: false 24 | script: 25 | - iso: Latn 26 | script-type: only-manuscript 27 | time: 28 | notBefore: '1600' 29 | notAfter: '1940' 30 | hands: 31 | count: more-than-10 32 | precision: estimated 33 | license: 34 | name: CC-BY-SA 4.0 35 | url: https://creativecommons.org/licenses/by-sa/4.0/ 36 | format: Page-XML 37 | volume: 38 | - metric: lines 39 | count: 431359 40 | - metric: regions 41 | count: 44536 42 | - metric: pages 43 | count: 10267 44 | - metric: characters 45 | count: 14253206 46 | transcription-guidelines: >- 47 | **Diplomatic transcription.** All of the text was transcribed verbatim, preserving all of its original features: 48 | 49 | - orthography: preserve original spelling 50 | 51 | - abbreviations: do not expand abbreviations 52 | 53 | - capitalization: retain original use of uppercase and lowercase letters 54 | 55 | - punctuation: transcribe punctuation marks exactly as they appear, even if they are unconventional by modern standards 56 | 57 | - special characters: include
any special characters or symbols as they appear 58 | 59 | - formatting: maintain original formatting such as underlining or strikethrough 60 | 61 | - errors and corrections: include all errors and corrections found in the text 62 | 63 | - non-interpretative: avoid interpreting or modernizing the text 64 | 65 | - use the '@' symbol for characters you cannot read and tag them as 'unclear' on baseline level 66 | 67 | - tag marginal text as 'marginalia' and main body text as 'paragraph' on region level 68 | -------------------------------------------------------------------------------- /catalog/greek-data/stavronikita-114.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: Stavronikita Monastery Collection No. 114 3 | url: https://zenodo.org/records/5578251 4 | authors: 5 | - name: Ioannis 6 | surname: Pratikakis 7 | orcid: 0000-0002-4124-3688 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - name: Aleksandros 12 | surname: Papazoglou 13 | roles: 14 | - transcriber 15 | - project-manager 16 | - name: Symeon 17 | surname: Symeonidis 18 | orcid: 0000-0002-3259-614X 19 | roles: 20 | - transcriber 21 | - project-manager 22 | - name: Lazaros 23 | surname: Tsochatzidis 24 | orcid: 0000-0002-4634-7419 25 | roles: 26 | - transcriber 27 | - project-manager 28 | institutions: [] 29 | description: >- 30 | It comprises manuscripts made of paper, written at the end of the 15th century 31 | and its dimensions are 218X150 mm. In various pages, we find red initials and 32 | epititles which enrich the manuscript’s decoration. 33 | 34 | The dataset of ΧΦ114 consists of 1051 lines of text containing 5467 words (2877 35 | unique words) that are distributed over 44 scanned handwritten text pages. 36 | 37 | For each page, a PageXML is provided containing the following ground-truth: 38 | 39 | 1. Text region polygon coordinates 40 | 2.
Text line polygon coordinates with the corresponding transcription text 41 | 3. Word polygon coordinates with the corresponding transcription text 42 | language: 43 | - grc 44 | transcription-guidelines: | 45 | - Abbreviation and ligatures were resolved 46 | - Minuscule in the beginning of sentences were kept as such. 47 | - Polytonic spelling and diaeresis are kept 48 | production-software: Unknown 49 | automatically-aligned: false 50 | characters: 51 | mode: NFD 52 | script: 53 | - iso: Grek 54 | script-type: only-manuscript 55 | time: 56 | notBefore: '1401' 57 | notAfter: '1500' 58 | hands: 59 | count: less-than-11 60 | precision: exact 61 | license: 62 | name: CC-BY 4.0 63 | url: https://creativecommons.org/licenses/by/4.0/ 64 | format: Page-XML 65 | volume: 66 | - {count: 1006, metric: "lines"} 67 | - {count: 44, metric: "files"} 68 | - {count: 44, metric: "regions"} 69 | - {count: 36898, metric: "characters"} -------------------------------------------------------------------------------- /catalog/greek-data/stavronikita-53.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: Stavronikita Monastery Collection No. 53 3 | url: https://zenodo.org/records/5595669 4 | authors: 5 | - name: Ioannis 6 | surname: Pratikakis 7 | orcid: 0000-0002-4124-3688 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - name: Aleksandros 12 | surname: Papazoglou 13 | roles: 14 | - transcriber 15 | - project-manager 16 | - name: Symeon 17 | surname: Symeonidis 18 | orcid: 0000-0002-3259-614X 19 | roles: 20 | - transcriber 21 | - project-manager 22 | - name: Lazaros 23 | surname: Tsochatzidis 24 | orcid: 0000-0002-4634-7419 25 | roles: 26 | - transcriber 27 | - project-manager 28 | institutions: [] 29 | description: >- 30 | The collection is one of the oldest of the Stavronikita Monastery on Mount Athos.
31 | It is a parchment, four-gospel manuscript which has been written between 32 | 1301 and 1350. It comprises 54 pages with dimensions that are approximately 33 | 250x185 mm. The script is elegant minuscule and the use of majuscule letters 34 | is rare. Tachygraphical symbols and abbreviations are encountered in the 35 | manuscript as well. Furthermore, the manuscript is enriched with 36 | chrysography, elegant epititles and initials. 37 | 38 | The dataset of ΧΦ53 consists of 1038 lines of text containing 5592 words 39 | (2374 unique words) that are distributed over 54 scanned handwritten text pages. 40 | language: 41 | - grc 42 | transcription-guidelines: | 43 | - Abbreviation and ligatures were resolved 44 | - Minuscule in the beginning of sentences were kept as such. 45 | - Polytonic spelling and diaeresis are kept 46 | production-software: Unknown 47 | automatically-aligned: false 48 | characters: 49 | mode: NFD 50 | script: 51 | - iso: Grek 52 | script-type: only-manuscript 53 | time: 54 | notBefore: '1301' 55 | notAfter: '1350' 56 | hands: 57 | count: less-than-11 58 | precision: exact 59 | license: 60 | name: CC-BY 4.0 61 | url: https://creativecommons.org/licenses/by/4.0/ 62 | format: Page-XML 63 | volume: 64 | - {count: 1038, metric: "lines"} 65 | - {count: 54, metric: "files"} 66 | - {count: 54, metric: "regions"} 67 | - {count: 37070, metric: "characters"} 68 | -------------------------------------------------------------------------------- /catalog/greek-data/stavronikita-79.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: Stavronikita Monastery Collection No. 
79 3 | url: https://zenodo.org/records/5578136 4 | authors: 5 | - name: Ioannis 6 | surname: Pratikakis 7 | orcid: 0000-0002-4124-3688 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - name: Aleksandros 12 | surname: Papazoglou 13 | roles: 14 | - transcriber 15 | - project-manager 16 | - name: Symeon 17 | surname: Symeonidis 18 | orcid: 0000-0002-3259-614X 19 | roles: 20 | - transcriber 21 | - project-manager 22 | - name: Lazaros 23 | surname: Tsochatzidis 24 | orcid: 0000-0002-4634-7419 25 | roles: 26 | - transcriber 27 | - project-manager 28 | institutions: [] 29 | description: >- 30 | It comprises manuscripts made of paper, written in the 16th century and its 31 | dimensions are 220X165 mm. The manuscript is embellished with epititles and 32 | red initials. Tachygraphical symbols and abbreviations are encountered in 33 | the manuscript as well. The dataset of ΧΦ79 consists of 803 lines of text 34 | containing 4389 words (2069 unique words) that are distributed over 35 | 40 scanned handwritten text pages. 36 | For each page, a PageXML is provided containing the following ground-truth: 37 | 1. Text region polygon coordinates 38 | 2. Text line polygon coordinates with the corresponding transcription text 39 | 3. Word polygon coordinates with the corresponding transcription text 40 | language: 41 | - grc 42 | transcription-guidelines: | 43 | - Abbreviation and ligatures were resolved 44 | - Minuscule in the beginning of sentences were kept as such.
45 | - Polytonic spelling and diaeresis are kept 46 | production-software: Unknown 47 | automatically-aligned: false 48 | characters: 49 | mode: NFD 50 | script: 51 | - iso: Grek 52 | script-type: only-manuscript 53 | time: 54 | notBefore: '1501' 55 | notAfter: '1600' 56 | hands: 57 | count: less-than-11 58 | precision: exact 59 | license: 60 | name: CC-BY 4.0 61 | url: https://creativecommons.org/licenses/by/4.0/ 62 | format: Page-XML 63 | volume: 64 | - {count: 803, metric: "lines"} 65 | - {count: 40, metric: "files"} 66 | - {count: 40, metric: "regions"} 67 | - {count: 29112, metric: "characters"} 68 | -------------------------------------------------------------------------------- /catalog/fondue/FONDUE-IT-PRINT-20.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Simon 3 | orcid: 0000-0001-9094-4475 4 | roles: 5 | - project-manager 6 | - quality-control 7 | - support 8 | surname: Gabay 9 | - name: Maddalena 10 | roles: 11 | - transcriber 12 | surname: Zaglio 13 | automatically-aligned: false 14 | characters: 15 | members: 16 | - e 17 | - a 18 | - i 19 | - o 20 | - r 21 | - n 22 | - t 23 | - l 24 | - s 25 | - c 26 | - d 27 | - u 28 | - p 29 | - m 30 | - v 31 | - ',' 32 | - g 33 | - h 34 | - f 35 | - b 36 | - z 37 | - . 38 | - ̀ 39 | - ¬ 40 | - q 41 | - I 42 | - '-' 43 | - C 44 | - A 45 | - "'" 46 | - P 47 | - '"' 48 | - S 49 | - M 50 | - E 51 | - ’ 52 | - L 53 | - '=' 54 | - ; 55 | - T 56 | - R 57 | - D 58 | - V 59 | - O 60 | - G 61 | - N 62 | - ':' 63 | - '1' 64 | - B 65 | - '4' 66 | - ) 67 | - '!' 68 | - ( 69 | - '[' 70 | - ']' 71 | - F 72 | - Q 73 | - '2' 74 | - '0' 75 | - '3' 76 | - '9' 77 | - '5' 78 | - U 79 | - '?' 
80 | - ° 81 | - ⬪ 82 | - '6' 83 | - y 84 | - Z 85 | - k 86 | - ᗅ 87 | - K 88 | - x 89 | - H 90 | - '8' 91 | - X 92 | - W 93 | - — 94 | - '7' 95 | - “ 96 | - ᑕ 97 | - ᗞ 98 | - w 99 | mode: NFD 100 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-IT-PRINT-20/blob/master/CITATION.cff 101 | description: Archives 102 | format: Alto-XML 103 | hands: 104 | count: unknown 105 | precision: exact 106 | institutions: [] 107 | language: 108 | - ita 109 | license: 110 | name: CC-BY 4.0 111 | url: https://creativecommons.org/licenses/by/4.0/ 112 | production-software: eScriptorium + Kraken 113 | project-name: FoNDUE 114 | project-website: https://github.com/FoNDUE-HTR 115 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 116 | script: 117 | - iso: Latn 118 | script-type: only-typed 119 | time: 120 | notAfter: '1999' 121 | notBefore: '1900' 122 | title: FONDUE-IT-PRINT-20 123 | transcription-guidelines: SegmOnto 124 | url: https://github.com/FoNDUE-HTR/FONDUE-IT-PRINT-20 125 | volume: 126 | - count: 49432 127 | metric: characters 128 | - count: 23 129 | metric: files 130 | - count: 1008 131 | metric: lines 132 | - count: 48 133 | metric: regions 134 | -------------------------------------------------------------------------------- /catalog/joseph-hooker-correspondance-project/joseph-hooker-htr.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Joseph Hooker HTR 3 | url: https://github.com/jschaefer738b/JosephHookerHTR.git 4 | authors: 5 | - name: John 6 | surname: Schaefer 7 | orcid: 0009-0006-5751-9323 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - quality-control 12 | - support 13 | - name: Kiri 14 | surname: Ross-Jones 15 | roles: 16 | - support 17 | - name: Alexis 18 | surname: Litvine 19 | roles: 20 | - support 21 | institutions: 22 | - name: Royal Botanic Gardens, Kew 23 | - name: University of Cambridge 24 |
description: >- 25 | XML transcriptions and JPEG images exported from Transkribus as ground truth 26 | for an eScriptorium-Kraken HTR model (CER 11-12%) trained on the correspondence of Joseph 27 | Dalton Hooker (1817-1911), primarily letters to William Turner Thiselton-Dyer 28 | (1843-1928) during the late-19th/early-20th century. Many transcriptions in 29 | this dataset were generated by a small team of anonymous volunteers as part of 30 | the Joseph Hooker Correspondence Project based at Kew Gardens. All images in 31 | this dataset are reproduced with the kind permission of the Board of Trustees 32 | of the Royal Botanic Gardens Kew (© RBG, Kew). Contact archives@kew.org for 33 | more information. 34 | 35 | 36 | HTR Model: Schaefer, John, & Litvine, Alexis. (2023). Joseph Hooker HTR Model. 37 | Zenodo. https://doi.org/10.5281/zenodo.8038689 38 | project-name: Joseph Hooker Correspondence Project 39 | project-website: >- 40 | https://www.kew.org/science/our-science/projects/joseph-hooker-correspondence-project 41 | language: 42 | - eng 43 | production-software: Transkribus 44 | script: 45 | - iso: Latn 46 | script-type: only-manuscript 47 | time: 48 | notBefore: '1850' 49 | notAfter: '1911' 50 | hands: 51 | count: '1' 52 | precision: estimated 53 | license: 54 | - name: CC-BY-SA 4.0 55 | url: https://creativecommons.org/licenses/by-sa/4.0/ 56 | format: Page-XML 57 | volume: 58 | - metric: lines 59 | count: 7100 60 | - metric: files 61 | count: 337 62 | - metric: pages 63 | count: 337 64 | transcription-guidelines: >- 65 | All horizontal lines in Hooker's hand were transcribed as originally written. 66 | Most typescript and vertical lines in the margins were not included. 
67 | -------------------------------------------------------------------------------- /catalog/htr-school-vienna/htr-winter-school-2024-medieval-czech-prague-bible-1488.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: HTR Winter School 2024 - Medieval Czech - Prague Bible (1488) 3 | url: https://github.com/HTR-School-Vienna/2024--medieval-czech 4 | authors: 5 | - name: Martin 6 | surname: Plechatý 7 | orcid: 0009-0000-3305-2075 8 | roles: 9 | - transcriber 10 | - name: Daniel 11 | surname: Katscher 12 | orcid: 0009-0008-3475-2522 13 | roles: 14 | - transcriber 15 | - name: Václav 16 | surname: Steiner 17 | orcid: 0009-0004-8336-9846 18 | roles: 19 | - transcriber 20 | - name: Jan 21 | surname: Švarc 22 | orcid: 0009-0005-1274-0545 23 | roles: 24 | - transcriber 25 | - name: 'Martina ' 26 | surname: Spěváčková 27 | orcid: 0000-0002-9357-4614 28 | roles: 29 | - transcriber 30 | - name: Jan 31 | surname: Škvrňák 32 | orcid: 0000-0003-0985-4144 33 | - name: Marie 34 | surname: Hedvíková 35 | orcid: 0009-0008-3693-6288 36 | roles: 37 | - transcriber 38 | - name: Anna 39 | surname: Michalcová 40 | orcid: 0000-0003-4760-6950 41 | roles: 42 | - project-manager 43 | - quality-control 44 | institutions: [] 45 | description: >- 46 | The Prague Bible (1488, Vienna, Österreichische Nationalbibliothek, shelfmark 47 | Ink 13.C.5, available from: http://data.onb.ac.at/rec/AC07537625, Old Czech) 48 | 49 | Print: Old Czech, Bastarda, end of the 15th C. 
50 | language: 51 | - ces 52 | production-software: Transkribus 53 | automatically-aligned: false 54 | script: 55 | - iso: Latn 56 | script-type: only-typed 57 | time: 58 | notBefore: '1488' 59 | notAfter: '1488' 60 | hands: 61 | count: '1' 62 | precision: exact 63 | license: 64 | name: CC-BY 4.0 65 | url: https://creativecommons.org/licenses/by/4.0/ 66 | format: Page-XML 67 | volume: 68 | - metric: files 69 | count: 30 70 | citation-file-link: >- 71 | https://github.com/HTR-School-Vienna/2024--medieval-czech/blob/38a20c857757150d8e2da0e8c865fbf7d026cdee/CITATION.cff 72 | transcription-guidelines: >- 73 | The transcription rules were based on semi-diplomatic transcription rules set 74 | by Pero OCR and Směrnice pro vydávání starších českých textů by Jiří Daňhelka 75 | (https://vokabular.ujc.cas.cz/moduly/edicnipoznamka.aspx?id=DanhelkaSmernice). 76 | -------------------------------------------------------------------------------- /catalog/scripta-psl/biblia.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: BiblIA 3 | url: https://zenodo.org/record/5167263 4 | project-name: 'Scripta PSL 5 | 6 | ' 7 | project-website: https://escripta.hypotheses.org/ 8 | authors: 9 | - name: "St\xF6kl Ben Ezra" 10 | surname: Daniel 11 | roles: 12 | - transcriber 13 | - project-manager 14 | - name: Brown-DeVost 15 | surname: Bronson 16 | - name: Jablonski 17 | surname: Pawel 18 | - name: Kiessling 19 | surname: Benjamin 20 | - name: Lolli 21 | surname: Elena 22 | - name: Lapin 23 | surname: Hayim 24 | description: "This dataset for Handwritten Text Recognition includes layout segmentation\ 25 | \ (regions, toplines and linepolygons) and unicode-transcriptions in alto 4.2 XML\ 26 | \ for 202 images of Medieval Hebrew manuscripts from the Biblioth\xE8que nationale\ 27 | \ de France (BnF, National Library of France) and the Biblioteca Apostolica Vaticana\ 28 | \ (BAV, 
Vatican Library) corresponding to the article \"BiblIA - a General Model\ 29 | \ for Medieval Hebrew Manuscripts and an Open Annotated Dataset\" by Daniel St\xF6\ 30 | kl Ben Ezra, Bronson Brown-DeVost, Pawel Jablonski, Benjamin Kiessling, Elena Lolli,\ 31 | \ and Hayim Lapin, published in HIP@ICDAR 2021 held in Lausanne, September 2021.\n" 32 | language: 33 | - heb 34 | script: 35 | - iso: Hebr 36 | script-type: only-manuscript 37 | time: 38 | notBefore: '1000' 39 | notAfter: '1499' 40 | hands: 41 | count: more-than-10 42 | precision: exact 43 | license: 44 | - name: CC-BY-SA 4.0 45 | url: https://creativecommons.org/licenses/by-sa/4.0/ 46 | format: Alto-XML 47 | volume: 48 | - metric: files 49 | count: 202 50 | - metric: pages 51 | count: 202 52 | - metric: lines 53 | count: 12461 54 | - metric: regions 55 | count: 509 56 | - metric: characters 57 | count: 278641 58 | transcription-guidelines: "See the guidelines detailed in Stoekl Ben Ezra Daniel,\ 59 | \ Brown-DeVost Bronson, Jablonski Pawel, Lapin Hayim, Kiessling Benjamin, and Lolli\ 60 | \ Elena. 2021. BiblIA - a General Model for Medieval Hebrew Manuscripts and an Open\ 61 | \ Annotated Dataset. In The 6th International Workshop on Historical Document Imaging\ 62 | \ and Processing (HIP '21). Association for Computing Machinery, New York, NY, USA,\ 63 | \ 61\u201366. 
DOI:https://doi.org/10.1145/3476887.3476896'\n" 64 | production-software: "eScriptorium + Kraken" 65 | -------------------------------------------------------------------------------- /catalog/fondue/FONDUE-ES-PRINT-19.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Simon 3 | orcid: 0000-0001-9094-4475 4 | roles: 5 | - project-manager 6 | - quality-control 7 | - support 8 | surname: Gabay 9 | - name: Carmen 10 | roles: 11 | - transcriber 12 | surname: Carrasco Luján 13 | automatically-aligned: false 14 | characters: 15 | members: 16 | - e 17 | - a 18 | - o 19 | - s 20 | - n 21 | - r 22 | - i 23 | - l 24 | - d 25 | - u 26 | - t 27 | - c 28 | - m 29 | - . 30 | - p 31 | - ́ 32 | - ',' 33 | - b 34 | - g 35 | - y 36 | - q 37 | - h 38 | - v 39 | - ¬ 40 | - f 41 | - j 42 | - z 43 | - – 44 | - A 45 | - ; 46 | - E 47 | - '!' 48 | - x 49 | - S 50 | - ̃ 51 | - I 52 | - P 53 | - B 54 | - U 55 | - C 56 | - D 57 | - L 58 | - T 59 | - '?' 60 | - ':' 61 | - '0' 62 | - O 63 | - R 64 | - N 65 | - H 66 | - Y 67 | - ¿ 68 | - V 69 | - J 70 | - M 71 | - '1' 72 | - ¡ 73 | - '2' 74 | - — 75 | - '"' 76 | - k 77 | - F 78 | - '8' 79 | - '7' 80 | - '4' 81 | - '5' 82 | - G 83 | - '-' 84 | - '3' 85 | - '6' 86 | - K 87 | - ( 88 | - ) 89 | - '9' 90 | - Q 91 | - ̀ 92 | - ̈ 93 | - X 94 | - W 95 | - '[' 96 | - ']' 97 | - '&' 98 | - w 99 | - '*' 100 | - § 101 | - ° 102 | - ǝ 103 | mode: NFD 104 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-ES-PRINT-19/blob/master/CITATION.cff 105 | description: Novels written in Spanish 106 | format: Alto-XML 107 | hands: 108 | count: unknown 109 | precision: exact 110 | institutions: [] 111 | language: 112 | - spa 113 | license: 114 | name: CC-BY 4.0 115 | url: https://creativecommons.org/licenses/by/4.0/ 116 | production-software: eScriptorium + Kraken 117 | project-name: FoNDUE 118 | project-website: https://github.com/FoNDUE-HTR 119 | schema: 
https://htr-united.github.io/schema/2023-06-27/schema.json 120 | script: 121 | - iso: Latn 122 | script-type: only-typed 123 | time: 124 | notAfter: '1899' 125 | notBefore: '1800' 126 | title: FONDUE-ES-PRINT-19 127 | transcription-guidelines: SegmOnto 128 | url: https://github.com/FoNDUE-HTR/FONDUE-ES-PRINT-19 129 | volume: 130 | - count: 53687 131 | metric: characters 132 | - count: 38 133 | metric: files 134 | - count: 1375 135 | metric: lines 136 | - count: 103 137 | metric: regions 138 | -------------------------------------------------------------------------------- /catalog/fondue/FONDUE-FR-PRINT-20.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Simon 3 | orcid: 0000-0001-9094-4475 4 | roles: 5 | - project-manager 6 | - quality-control 7 | - support 8 | surname: Gabay 9 | - name: Sophie 10 | orcid: 0009-0005-6841-0158 11 | roles: 12 | - transcriber 13 | surname: Dolto 14 | automatically-aligned: false 15 | characters: 16 | members: 17 | - e 18 | - a 19 | - s 20 | - i 21 | - t 22 | - r 23 | - n 24 | - u 25 | - l 26 | - o 27 | - d 28 | - c 29 | - p 30 | - m 31 | - ́ 32 | - ',' 33 | - . 34 | - v 35 | - ’ 36 | - g 37 | - f 38 | - b 39 | - q 40 | - h 41 | - ̀ 42 | - ̂ 43 | - x 44 | - j 45 | - L 46 | - y 47 | - '-' 48 | - I 49 | - "'" 50 | - — 51 | - A 52 | - G 53 | - E 54 | - M 55 | - P 56 | - C 57 | - B 58 | - J 59 | - D 60 | - z 61 | - ̧ 62 | - S 63 | - '!' 64 | - T 65 | - '?'
66 | - ¬ 67 | - V 68 | - ; 69 | - U 70 | - O 71 | - R 72 | - Q 73 | - ':' 74 | - '1' 75 | - k 76 | - F 77 | - H 78 | - œ 79 | - '0' 80 | - ( 81 | - ) 82 | - “ 83 | - '2' 84 | - N 85 | - '6' 86 | - '9' 87 | - '8' 88 | - '5' 89 | - ̈ 90 | - '3' 91 | - w 92 | - W 93 | - '4' 94 | - Y 95 | - ” 96 | -   97 | - '7' 98 | - Z 99 | - '*' 100 | - / 101 | - K 102 | - '"' 103 | - « 104 | - » 105 | mode: NFD 106 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-FR-PRINT-20/blob/master/CITATION.cff 107 | description: French novels 108 | format: Alto-XML 109 | hands: 110 | count: unknown 111 | precision: exact 112 | institutions: [] 113 | language: 114 | - eng 115 | license: 116 | name: CC-BY 4.0 117 | url: https://creativecommons.org/licenses/by/4.0/ 118 | production-software: eScriptorium + Kraken 119 | project-name: FoNDUE 120 | project-website: https://github.com/FoNDUE-HTR 121 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 122 | script: 123 | - iso: Latn 124 | script-type: only-typed 125 | time: 126 | notAfter: '1900' 127 | notBefore: '1999' 128 | title: FONDUE-FR-PRINT-20 129 | transcription-guidelines: SegmOnto 130 | url: https://github.com/FoNDUE-HTR/FONDUE-FR-PRINT-20 131 | volume: 132 | - count: 81599 133 | metric: characters 134 | - count: 55 135 | metric: files 136 | - count: 1604 137 | metric: lines 138 | - count: 64 139 | metric: regions 140 | -------------------------------------------------------------------------------- /catalog-ids.json: -------------------------------------------------------------------------------- 1 | {"https://doi.org/10.5281/zenodo.5153263": "repo-00000", "https://zenodo.org/record/4780947#.YhN5pVvMLUQ": "repo-00001", "https://github.com/calfa-co/rasam-dataset": "repo-00002", "https://github.com/DesenrollandoElCordel/FoNDUE-Spanish-chapbooks-Dataset": "repo-00003", "https://zenodo.org/record/3333627#.YhN1G1vMLUQ": "repo-00004", "https://github.com/rescribe/carolineminuscule-groundtruth": "repo-00005", 
"http://dx.doi.org/10.34847/nkl.acb724xs": "repo-00006", "https://github.com/e-ditiones/OCR17plus": "repo-00007", "https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-Projet-Notre-Dame": "repo-00008", "https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-ArgusDesBrevets": "repo-00009", "https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-DecameronFR": "repo-00010", "https://github.com/PSL-Chartes-HTR-Students/HN2021-Kovalewsky-1893": "repo-00011", "https://github.com/PSL-Chartes-HTR-Students/HN2021-ChateauChavigny": "repo-00012", "https://github.com/PSL-Chartes-HTR-Students/HN2021-Boccace": "repo-00013", "https://github.com/PSL-Chartes-HTR-Students/HN2021-Memorials_Jane_Lathrop_Stanford": "repo-00014", "https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-Expositions_Universelles": "repo-00015", "https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-Projet-Correspondance-Berlioz": "repo-00016", "https://github.com/jpmjpmjpm/genauto-td-htr.git": "repo-00017", "https://doi.org/10.5281/zenodo.5179361": "repo-00018", "HTR-United/tapuscorpus": "repo-00019", "HTR-United/timeuscorpus": "repo-00020", "HTR-United/dahncorpus": "repo-00021", "HTR-United/cremma-medieval": "repo-00022", "HTR-United/cremma-16-17-print": "repo-00023", "HTR-United/CREMMA-Medieval-LAT": "repo-00024", "HTR-United/CREMMA-MSS-17": "repo-00025", "HTR-United/CREMMA-MSS-18": "repo-00026", "HTR-United/CREMMA-MSS-19": "repo-00027", "HTR-United/CREMMA-MSS-20": "repo-00028", "HTR-United/lectaurep-bronod": "repo-00029", "HTR-United/lectaurep-mariages-et-divorces": "repo-00030", "HTR-United/lectaurep-repertoires": "repo-00031", "HTR-United/CREMMA-AN-TestamentDePoilus": "repo-00032", "HTR-United/cremma-wikipedia": "repo-00033", "Gallicorpora/HTR-MSS-15e-Siecle": "repo-00034", "Gallicorpora/HTR-incunable-15e-siecle": "repo-00035", "Gallicorpora/HTR-imprime-16e-siecle": "repo-00036", "Gallicorpora/HTR-imprime-17e-siecle": "repo-00037", "Gallicorpora/HTR-imprime-gothique-16e-siecle": "repo-00038", 
"Gallicorpora/HTR-imprime-18e-siecle": "repo-00039", "FoNDUE-HTR/FONDUE-FR-PRINT-17": "repo-00040", "FoNDUE-HTR/FONDUE-FR-PRINT-16": "repo-00041"} -------------------------------------------------------------------------------- /catalog/gallicorpora/gothic-16.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Pinche 3 | roles: 4 | - project-manager 5 | surname: Ariane 6 | - name: Gabay 7 | roles: 8 | - project-manager 9 | surname: Simon 10 | - name: Vlachou-Efstathiou 11 | roles: 12 | - transcriber 13 | surname: malamatenia 14 | - name: Christensen 15 | roles: 16 | - support 17 | surname: Kelly 18 | characters: 19 | members: 20 | - e 21 | - u 22 | - a 23 | - i 24 | - t 25 | - r 26 | - n 27 | - o 28 | - s 29 | - l 30 | - d 31 | - c 32 | - m 33 | - p 34 | - ſ 35 | - q 36 | - y 37 | - ̃ 38 | - f 39 | - g 40 | - b 41 | - . 42 | - h 43 | - ',' 44 | - z 45 | - ⁊ 46 | - x 47 | - E 48 | - ¬ 49 | - ¶ 50 | - C 51 | - S 52 | - L 53 | - D 54 | - P 55 | - A 56 | - I 57 | - ͥ 58 | - M 59 | - v 60 | - Q 61 | - ꝰ 62 | - O 63 | - T 64 | - ':' 65 | - V 66 | - B 67 | - '?' 
68 | - ꝑ 69 | - H 70 | - N 71 | - ͬ 72 | - R 73 | - ; 74 | - G 75 | - F 76 | - ̌ 77 | - ꝓ 78 | - J 79 | - '-' 80 | - ꝯ 81 | - ( 82 | - ) 83 | - '1' 84 | - U 85 | - '9' 86 | - ̾ 87 | - æ 88 | - X 89 | - '4' 90 | - ꝙ 91 | - ̧ 92 | - ͤ 93 | - '2' 94 | - '*' 95 | - '6' 96 | - "'" 97 | - Ι 98 | - '7' 99 | - ⟦ 100 | - ⟧ 101 | - '8' 102 | - Y 103 | - '5' 104 | - '0' 105 | mode: NFD 106 | description: Corpus d'entrainement pour l'HTR constitué d'imprimés du 16e siècle 107 | format: Alto-XML 108 | hands: 109 | count: 1-per-folder 110 | precision: estimated 111 | language: 112 | - fra 113 | license: 114 | - name: CC-BY 4.0 115 | url: https://creativecommons.org/licenses/by/4.0/ 116 | production-software: eScriptorium + Kraken 117 | project-name: Gallicorpora 118 | project-website: https://github.com/Gallicorpora 119 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 120 | script: 121 | - iso: Latn 122 | script-type: evenly-mixed 123 | time: 124 | notAfter: '1599' 125 | notBefore: '1500' 126 | title: Données imprimés gothiques du 16e siècle 127 | transcription-guidelines: Les transcriptions suivent les normes de transcription du 128 | projet Gallicorpora 129 | url: https://github.com/Gallicorpora/HTR-imprime-16e-siecle 130 | volume: 131 | - count: 90731 132 | metric: characters 133 | - count: 80 134 | metric: files 135 | - count: 2971 136 | metric: lines 137 | - count: 233 138 | metric: regions 139 | -------------------------------------------------------------------------------- /catalog/greek-data/hpgtr.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: HPGTR Dataset 3 | url: https://github.com/vivianpl/hpgtr 4 | authors: 5 | - name: Paraskevi 6 | surname: Platanou 7 | roles: 8 | - transcriber 9 | - project-manager 10 | - name: John 11 | surname: Pavlopoulos 12 | orcid: 0000-0001-9188-7425 13 | roles: 14 | - transcriber 15 | - 
project-manager 16 | - name: Georgios 17 | surname: Papaioannou 18 | orcid: 0000-0003-4774-0746 19 | roles: 20 | - transcriber 21 | - project-manager 22 | institutions: [] 23 | description: >- 24 | The HPGT dataset consists of images of Handwritten Paleographic 25 | Greek Text, derived from the Bodleian Libraries' Greek manuscript 26 | collection, specifically the Barocci collection, which dates from 27 | the 8th to the 17th centuries. This dataset is divided into two 28 | editions: HPGTR.N, which contains 77 unsegmented images categorized 29 | by century from the 10th to the 16th, and HPGTR.S, which features 30 | carefully segmented lines from selected images to facilitate machine 31 | learning tasks. The dataset captures a range of characteristics, 32 | including variations in writing style, page conditions, and 33 | manuscript production details. 34 | 35 | This dataset is part of the following work: Paraskevi Platanou, 36 | John Pavlopoulos, and Georgios Papaioannou. 2022. Handwritten 37 | Paleographic Greek Text Recognition: A Century-Based Approach. 38 | In *Proceedings of the "Thirteenth Language Resources and Evaluation Conference"*, 39 | pages 6585–6589, Marseille, France. European Language Resources Association. 40 | language: 41 | - grc 42 | transcription-guidelines: | 43 | - Abbreviations and ligatures were resolved 44 | - Minuscules at the beginning of sentences were kept as such. 
45 | - Polytonic spelling and diaeresis are kept 46 | production-software: Unknown 47 | automatically-aligned: false 48 | characters: 49 | mode: NFD 50 | script: 51 | - iso: Grek 52 | script-type: only-manuscript 53 | time: 54 | notBefore: '0901' 55 | notAfter: '1600' 56 | hands: 57 | count: less-than-11 58 | precision: exact 59 | license: 60 | name: CC-BY-NC-SA 3.0 61 | url: https://creativecommons.org/licenses/by/4.0/ 62 | format: Page-XML 63 | volume: 64 | - {count: 1698, metric: "lines"} 65 | - {count: 70, metric: "files"} 66 | - {count: 178, metric: "regions"} 67 | - {count: 64952, metric: "characters"} -------------------------------------------------------------------------------- /catalog/LiDi/LiDi1-0-project.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: LiDi1.0-project 3 | url: https://github.com/Giorgiaagostini/LiDi1.0-project 4 | authors: 5 | - name: Giorgia 6 | surname: Agostini 7 | orcid: 0009-0007-9887-5129 8 | roles: 9 | - transcriber 10 | - aligner 11 | - project-manager 12 | - quality-control 13 | institutions: [] 14 | description: >- 15 | This repository contains all data relating to the LiDi 1.0 project. In 16 | particular HTR GT of the 16th-century antiquarian Pirro Ligorio, used to create 17 | Transkribus public model Ligorio 0.3 PyL. 
18 | project-name: LiDi 1.0 19 | project-website: https://lidiws-limes.cfs.unipi.it 20 | language: 21 | - ita 22 | production-software: Transkribus 23 | automatically-aligned: false 24 | script: 25 | - iso: Latn 26 | - iso: Grek 27 | script-type: only-manuscript 28 | time: 29 | notBefore: '1568' 30 | notAfter: '1580' 31 | hands: 32 | count: '1' 33 | precision: estimated 34 | license: 35 | name: CC-BY-SA 4.0 36 | url: https://creativecommons.org/licenses/by-sa/4.0/ 37 | format: Alto-XML 38 | sources: 39 | - reference: '' 40 | link: >- 41 | https://archiviodistatotorino.beniculturali.it/dbadd/visvol_bibl.php?uid=300146 42 | volume: 43 | - metric: files 44 | count: 195 45 | citation-file-link: >- 46 | https://github.com/Giorgiaagostini/LiDi1.0-project/blob/main/Data/Ground%20Truth/CITATION.cff 47 | transcription-guidelines: >- 48 | - Normalisation of «V» to «U» except in Latin inscriptions; 49 | 50 | - Preservation of the diacritical marks and punctuation as used by the Author 51 | except for the part in Greek; 52 | 53 | - Where the use of capital and small caps is not distinguished, it is 54 | transcribed according to the grammatical rules of the Italian language; 55 | 56 | - Tagging of uncertain words with the «unclear» tag; 57 | 58 | - Tagging of illegible words with three dots (...) and the «unclear» tag; 59 | 60 | - Use of the angle dash, instead of the hyphen, to divide words into syllables 61 | at the end of a line. 62 | 63 | Moreover due to some issues in the visualization of ancient symbols unicode, 64 | the Roman Denarius (U+10196) and the Roman Sestersius (U+10198) signs were 65 | transcribed using other symbols not used by the author from the Astronomical 66 | chart: 67 | 68 | Roman denarius sign ➛♀(U+2640 Female sign) 69 | 70 | Roman sestertius sign➛☿ (U+263F Mercury) 71 | 72 | In order to change them to the correct one during post-processing. 
73 | -------------------------------------------------------------------------------- /catalog/fondue/FONDUE-EN-PRINT-20.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Simon 3 | orcid: 0000-0001-9094-4475 4 | roles: 5 | - transcriber 6 | - project-manager 7 | - quality-control 8 | - support 9 | surname: Gabay 10 | - name: Jessica 11 | roles: 12 | - transcriber 13 | surname: Da Silva Fernandes 14 | - name: Myriam 15 | roles: 16 | - transcriber 17 | surname: Perregaux 18 | automatically-aligned: false 19 | characters: 20 | members: 21 | - e 22 | - t 23 | - o 24 | - n 25 | - a 26 | - i 27 | - r 28 | - s 29 | - h 30 | - d 31 | - l 32 | - c 33 | - u 34 | - m 35 | - f 36 | - g 37 | - p 38 | - ',' 39 | - y 40 | - w 41 | - b 42 | - v 43 | - . 44 | - k 45 | - '1' 46 | - I 47 | - ¬ 48 | - C 49 | - S 50 | - T 51 | - '-' 52 | - '9' 53 | - A 54 | - ; 55 | - '8' 56 | - M 57 | - x 58 | - '4' 59 | - '2' 60 | - / 61 | - '6' 62 | - N 63 | - G 64 | - R 65 | - D 66 | - q 67 | - '0' 68 | - '"' 69 | - H 70 | - E 71 | - '5' 72 | - z 73 | - P 74 | - W 75 | - U 76 | - '7' 77 | - ( 78 | - j 79 | - ) 80 | - '3' 81 | - B 82 | - "'" 83 | - ’ 84 | - L 85 | - ':' 86 | - Y 87 | - O 88 | - V 89 | - Q 90 | - – 91 | - '?' 92 | - F 93 | - J 94 | - '!' 
95 | - K 96 | - “ 97 | - '[' 98 | - ']' 99 | - X 100 | - Z 101 | - ́ 102 | - ” 103 | - — 104 | mode: NFD 105 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-EN-PRINT-20/blob/master/CITATION.cff 106 | description: Various prints (academic, archives, novels…) 107 | format: Alto-XML 108 | hands: 109 | count: unknown 110 | precision: exact 111 | institutions: [] 112 | language: 113 | - eng 114 | license: 115 | name: CC-BY 4.0 116 | url: https://creativecommons.org/licenses/by/4.0/ 117 | production-software: eScriptorium + Kraken 118 | project-name: FoNDUE 119 | project-website: https://github.com/FoNDUE-HTR 120 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 121 | script: 122 | - iso: Latn 123 | script-type: only-typed 124 | time: 125 | notAfter: '1999' 126 | notBefore: '1900' 127 | title: FONDUE-EN-PRINT-20 128 | transcription-guidelines: SegmOnto 129 | url: https://github.com/FoNDUE-HTR/FONDUE-EN-PRINT-20 130 | volume: 131 | - count: 82834 132 | metric: characters 133 | - count: 30 134 | metric: files 135 | - count: 1728 136 | metric: lines 137 | - count: 72 138 | metric: regions 139 | -------------------------------------------------------------------------------- /catalog/almanach/lectaurep-bronod.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Limon-Bonnet 3 | roles: 4 | - transcriber 5 | - aligner 6 | - quality-control 7 | surname: Françoise 8 | - name: Chagué 9 | roles: 10 | - support 11 | - project-manager 12 | - quality-control 13 | surname: Alix 14 | - name: Rostaing 15 | roles: 16 | - project-manager 17 | surname: Aurélia 18 | characters: 19 | members: 20 | - e 21 | - t 22 | - a 23 | - / 24 | - '0' 25 | - c 26 | - n 27 | - r 28 | - m 29 | - h 30 | - p 31 | - s 32 | - o 33 | - g 34 | - '5' 35 | - '7' 36 | - '1' 37 | - E 38 | - . 
39 | - i 40 | - '-' 41 | - '3' 42 | - '9' 43 | - '2' 44 | - f 45 | - d 46 | - '8' 47 | - < 48 | - l 49 | - '{' 50 | - ':' 51 | - P 52 | - A 53 | - G 54 | - '}' 55 | - U 56 | - x 57 | - '>' 58 | - b 59 | - '4' 60 | - '6' 61 | mode: NFD 62 | citation-file-link: https://raw.githubusercontent.com/HTR-United/lectaurep-bronod/master/CITATION.cff 63 | description: "Ground truth for Maître Bronod’s registers, notary in Paris during the\ 64 | \ 18th century.\n" 65 | format: Page-XML 66 | hands: 67 | count: '1' 68 | precision: exact 69 | language: 70 | - fra 71 | license: 72 | - name: CC-BY 4.0 73 | url: https://creativecommons.org/licenses/by/4.0/ 74 | production-software: eScriptorium + Kraken 75 | project-name: "LECTAUREP\n" 76 | project-website: https://lectaurep.hypotheses.org/ 77 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 78 | script: 79 | - iso: Latn 80 | script-type: only-manuscript 81 | sources: 82 | - link: '' 83 | reference: Limon-Bonnet, M. (2021). Lectaurep-Bronod, ground truth for Maitre Bronod's 84 | documents (French XVIIIth century) (Version 1.0) [Computer software]. https://doi.org/10.5072/zenodo.977735 85 | time: 86 | notAfter: '1745' 87 | notBefore: '1742' 88 | title: Notaires de Paris - Bronod 89 | transcription-guidelines: "Transcription fidèle aux manuscrits : la casse et les abréviations\ 90 | \ sont respectées. 
Les portions de texte suscrites sont précédées d'un symbole `^`.\ 91 | \ Pas de traitement particulier des éventuels s longs.'\n" 92 | url: https://github.com/HTR-United/lectaurep-bronod 93 | volume: 94 | - count: 359094 95 | metric: characters 96 | - count: 100 97 | metric: files 98 | - count: 3702 99 | metric: lines 100 | - count: 200 101 | metric: pages 102 | - count: 296 103 | metric: regions 104 | -------------------------------------------------------------------------------- /catalog/ciham-htr/fabliaux.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Fabliaux 3 | url: https://github.com/CIHAM-HTR/Fabliaux 4 | authors: 5 | - name: Corinne 6 | surname: Pierreville 7 | orcid: 0009-0003-3074-3841 8 | roles: 9 | - project-manager 10 | - name: Ariane 11 | surname: Pinche 12 | orcid: 0000-0002-7843-5050 13 | roles: 14 | - transcriber 15 | - aligner 16 | - quality-control 17 | institutions: [] 18 | description: HTR data sets from medieval manuscripts (13th-14th c.) collecting "fabliaux" 19 | project-website: https://projet.biblissima.fr/fr/appels-projets/projets-retenus/fabliaux 20 | language: 21 | - fro 22 | production-software: eScriptorium + Kraken 23 | script: 24 | - iso: Latn 25 | script-type: only-manuscript 26 | time: 27 | notBefore: '1200' 28 | notAfter: '1402' 29 | hands: 30 | count: 1-per-folder 31 | precision: exact 32 | license: 33 | - name: CC-BY 4.0 34 | url: https://creativecommons.org/licenses/by/4.0/ 35 | format: Alto-XML 36 | citation-file-link: https://github.com/CIHAM-HTR/Fabliaux/blob/master/CITATION.cff 37 | transcription-guidelines: "The data follow the standards recommended by the CREMMALAB\ 38 | \ project, see Ariane Pinche. Transcription Guide for 10th to 15th Century Manuscripts.\ 39 | \ 2022. 
\u27E8hal-03697382\u27E9" 40 | volume: 41 | - metric: characters 42 | count: 19600 43 | - metric: files 44 | count: 10 45 | - metric: lines 46 | count: 904 47 | - metric: regions 48 | count: 40 49 | characters: 50 | mode: NFD 51 | members: 52 | - e 53 | - i 54 | - s 55 | - t 56 | - a 57 | - o 58 | - u 59 | - n 60 | - r 61 | - l 62 | - d 63 | - m 64 | - c 65 | - p 66 | - "\u0303" 67 | - f 68 | - q 69 | - b 70 | - . 71 | - h 72 | - z 73 | - g 74 | - "\u204A" 75 | - "\u033E" 76 | - "\uA751" 77 | - Q 78 | - "\u0365" 79 | - I 80 | - x 81 | - "\uA770" 82 | - S 83 | - C 84 | - E 85 | - "\uA76F" 86 | - T 87 | - L 88 | - N 89 | - O 90 | - y 91 | - M 92 | - D 93 | - "\u0363" 94 | - F 95 | - A 96 | - U 97 | - "\u0142" 98 | - "\u1E9C" 99 | - P 100 | - B 101 | - ':' 102 | - '9' 103 | - "\uF1AC" 104 | - '1' 105 | - '6' 106 | - '4' 107 | - "\u0366" 108 | - "\u27E6" 109 | - "\u27E7" 110 | - "\u205C" 111 | - '''' 112 | - G 113 | - "\u1DE4" 114 | - "\u036B" 115 | - '7' 116 | - '5' 117 | - '0' 118 | - "\uA753" 119 | - '8' 120 | -------------------------------------------------------------------------------- /catalog/fondue/FONDUE-FR-MSS-18.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Peter 3 | roles: 4 | - transcriber 5 | surname: Nahon 6 | - name: Simon 7 | orcid: 0000-0001-9094-4475 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - quality-control 12 | - support 13 | surname: Gabay 14 | automatically-aligned: false 15 | characters: 16 | members: 17 | - e 18 | - s 19 | - a 20 | - t 21 | - u 22 | - i 23 | - r 24 | - n 25 | - o 26 | - l 27 | - d 28 | - c 29 | - m 30 | - p 31 | - ',' 32 | - v 33 | - q 34 | - . 
35 | - ́ 36 | - f 37 | - g 38 | - b 39 | - h 40 | - "'" 41 | - ’ 42 | - I 43 | - + 44 | - y 45 | - ¬ 46 | - '1' 47 | - ̀ 48 | - ̂ 49 | - x 50 | - V 51 | - j 52 | - S 53 | - '2' 54 | - ':' 55 | - E 56 | - X 57 | - C 58 | - L 59 | - J 60 | - '3' 61 | - D 62 | - '4' 63 | - M 64 | - ; 65 | - ̈ 66 | - A 67 | - '5' 68 | - '6' 69 | - '8' 70 | - '9' 71 | - '7' 72 | - '0' 73 | - P 74 | - O 75 | - ̧ 76 | - R 77 | - '-' 78 | - N 79 | - G 80 | - T 81 | - '?' 82 | - B 83 | - œ 84 | - H 85 | -   86 | - Q 87 | - α 88 | - F 89 | - z 90 | - Z 91 | - U 92 | - ̓ 93 | - ο 94 | - ν 95 | - μ 96 | - ω 97 | - τ 98 | - δ 99 | - ε 100 | - ρ 101 | - φ 102 | - ( 103 | - ) 104 | - '{' 105 | - k 106 | - Ψ 107 | - ι 108 | - υ 109 | - π 110 | - λ 111 | - Y 112 | - K 113 | mode: NFD 114 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-FR-MSS-18/blob/master/CITATION.cff 115 | description: French Manuscripts of the 18th 116 | format: Alto-XML 117 | hands: 118 | count: unknown 119 | precision: exact 120 | institutions: [] 121 | language: 122 | - fra 123 | license: 124 | name: CC-BY 4.0 125 | url: https://creativecommons.org/licenses/by/4.0/ 126 | production-software: eScriptorium + Kraken 127 | project-name: FoNDUE 128 | project-website: https://github.com/FoNDUE-HTR 129 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 130 | script: 131 | - iso: Latn 132 | script-type: only-manuscript 133 | time: 134 | notAfter: '1799' 135 | notBefore: '1700' 136 | title: FONDUE-FR-MSS-18 137 | transcription-guidelines: SegmOnto 138 | url: https://github.com/FoNDUE-HTR/FONDUE-FR-MSS-18 139 | volume: 140 | - count: 108705 141 | metric: characters 142 | - count: 82 143 | metric: files 144 | - count: 2933 145 | metric: lines 146 | - count: 203 147 | metric: regions 148 | -------------------------------------------------------------------------------- /catalog/enc-cours-git/tnah-notredame.yml: -------------------------------------------------------------------------------- 1 | schema: 
https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Projet Notre-Dame 3 | url: https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-Projet-Notre-Dame 4 | project-name: 'ENC - Bonnes pratiques du developpement collaboratif 5 | 6 | ' 7 | authors: 8 | - name: Doat 9 | surname: Soline 10 | roles: 11 | - transcriber 12 | - name: Menu 13 | surname: Ariane 14 | roles: 15 | - transcriber 16 | - name: Falcoz 17 | surname: Elsa 18 | roles: 19 | - transcriber 20 | - name: Faure 21 | surname: Margaux 22 | roles: 23 | - transcriber 24 | - name: "Mazou\xE9" 25 | surname: "Ana\xEFs" 26 | roles: 27 | - transcriber 28 | description: "Le Projet Notre-Dame consiste en une transcription des journaux quotidiens\ 29 | \ de l\u2019ann\xE9e 1860 (https://mediatheque-patrimoine.culture.gouv.fr/sites/mediatheque/files/jnd_1860.pdf)\ 30 | \ des travaux de restauration effectu\xE9s de 1844 \xE0 1865 \xE0 la cath\xE9drale\ 31 | \ Notre-Dame de Paris sous la direction d'Eug\xE8ne Viollet-le-Duc et Jean-Baptiste\ 32 | \ Lassus. Celle-ci a \xE9t\xE9 effectu\xE9e sur eScriptorium \xE0 partir de la num\xE9\ 33 | risation des journaux des travaux (https://mediatheque-patrimoine.culture.gouv.fr/travaux-de-notre-dame-de-paris-1844-1865)\ 34 | \ r\xE9alis\xE9e par la M\xE9diath\xE8que de l'architecture et du patrimoine. 
\n" 35 | language: 36 | - fra 37 | script: 38 | - iso: Latn 39 | script-type: only-manuscript 40 | time: 41 | notBefore: '1860' 42 | notAfter: '1860' 43 | hands: 44 | count: '1' 45 | precision: exact 46 | license: 47 | - name: CC-BY 4.0 48 | url: https://creativecommons.org/licenses/by/4.0/ 49 | format: Alto-XML 50 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/TNAH-2021-Projet-Notre-Dame/main/CITATION.cff 51 | transcription-guidelines: "- respect des majuscules et minuscules - respect des ligatures\ 52 | \ (par exemple, transcrire \"ch\u0153ur\") - mot qui est barr\xE9 : \u96BE (une\ 53 | \ seule fois par mot) mais seulement s'ils sont totalement/\xE0 moiti\xE9 illisibles.\ 54 | \ Les retranscrire entre accolades {} s'ils sont lisibles. - Pour mettre en exergue\ 55 | \ les doutes de transcription : \n - mot incertain: [incertain]\n - mot que\ 56 | \ l'on ne parvient pas \xE0 transcrire : [??]\n" 57 | volume: 58 | - metric: characters 59 | count: 29286 60 | - metric: files 61 | count: 12 62 | - metric: lines 63 | count: 735 64 | - metric: regions 65 | count: 86 66 | production-software: "eScriptorium + Kraken" 67 | -------------------------------------------------------------------------------- /catalog/popp/the-popp-datasets.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: The POPP datasets 3 | url: https://zenodo.org/record/6581158 4 | authors: 5 | - name: Thomas 6 | surname: Constum 7 | roles: 8 | - aligner 9 | - quality-control 10 | - support 11 | - name: Nicolas 12 | surname: Kempf 13 | - name: Pierrick 14 | surname: Tranouez 15 | - name: Thierry 16 | surname: Paquet 17 | roles: 18 | - project-manager 19 | - name: Sandra 20 | surname: Brée 21 | orcid: 0000-0002-2802-5563 22 | roles: 23 | - transcriber 24 | - project-manager 25 | - name: François 26 | surname: Merveille 27 | roles: 28 | - transcriber 29 | 
institutions: [] 30 | description: >- 31 | The POPP datasets are a set of 3 datasets created within the POPP project 32 | (Project for the Oceration of the Paris Population Census) for the task of 33 | handwritten text recognition. These datasets have been published in 34 | "Recognition and information extraction in historical handwritten tables: 35 | toward understanding early 20th century Paris census" at DAS 2022. 36 | 37 | 38 | The 3 datasets are called “Generic dataset”, “Belleville”, and “Chaussée 39 | d’Antin” and contain lines made from the extracted rows of census tables from 40 | 1926. Each table in the Paris census contains 30 rows, thus each page in these 41 | datasets corresponds to 30 lines. 42 | project-name: Project for the Oceration of the Paris Population Census 43 | project-website: https://popp.hypotheses.org 44 | language: 45 | - fra 46 | production-software: Pivan 47 | script: 48 | - iso: Latn 49 | script-type: only-manuscript 50 | time: 51 | notBefore: '1926' 52 | notAfter: '1926' 53 | hands: 54 | count: more-than-10 55 | precision: estimated 56 | license: 57 | - name: CC-BY 4.0 58 | url: https://creativecommons.org/licenses/by/4.0/ 59 | format: Alto-XML 60 | volume: 61 | - metric: lines 62 | count: 7050 63 | transcription-guidelines: > 64 | The text is transcribed as in the image (no correction of misspellings, no 65 | resolution of abbreviations). 66 | 67 | Since the lines are extracted from table rows, we defined 4 special characters 68 | to describe the structure of the text: 69 | ¤ : indicates an empty cell 70 | / : indicates the separation into columns 71 | ? : indicates that the content of the cell following this symbol is written above the regular baseline 72 | ! 
: indicates that the content of the cell following this symbol is written below the regular baseline 73 | -------------------------------------------------------------------------------- /catalog/almanach/tapuscorpus.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Chagué 3 | roles: 4 | - transcriber 5 | - project-manager 6 | surname: Alix 7 | characters: 8 | members: 9 | - e 10 | - a 11 | - s 12 | - n 13 | - t 14 | - r 15 | - i 16 | - u 17 | - o 18 | - l 19 | - d 20 | - c 21 | - m 22 | - p 23 | - ́ 24 | - . 25 | - '~' 26 | - v 27 | - ',' 28 | - "'" 29 | - '-' 30 | - f 31 | - g 32 | - h 33 | - q 34 | - b 35 | - ̀ 36 | - _ 37 | - E 38 | - L 39 | - A 40 | - I 41 | - C 42 | - x 43 | - S 44 | - M 45 | - j 46 | - T 47 | - ̂ 48 | - R 49 | - N 50 | - '1' 51 | - O 52 | - P 53 | - y 54 | - '"' 55 | - U 56 | - J 57 | - D 58 | - '2' 59 | - ':' 60 | - ) 61 | - ( 62 | - B 63 | - '0' 64 | - '5' 65 | - '3' 66 | - '4' 67 | - z 68 | - '6' 69 | - F 70 | - H 71 | - Q 72 | - '!' 73 | - '9' 74 | - G 75 | - '7' 76 | - V 77 | - '8' 78 | - '?' 79 | - ⟦ 80 | - ⟧ 81 | - ̧ 82 | - Y 83 | - ; 84 | - ’ 85 | - ° 86 | - k 87 | - X 88 | - ̈ 89 | - + 90 | - '=' 91 | - W 92 | - / 93 | - K 94 | - ^ 95 | - w 96 | - Z 97 | - '%' 98 | - '*' 99 | mode: NFD 100 | citation-file-link: https://github.com/HTR-United/tapuscorpus/raw/main/citation.cff 101 | description: Ground truth based on a variety of French typewritten documents from 102 | the 20th century. Contains excerpts of plays, poems, letters and administrative reports. 
103 | format: Page-XML 104 | hands: 105 | count: 1-per-folder 106 | precision: exact 107 | language: 108 | - fra 109 | license: 110 | - name: CC-BY 4.0 111 | url: https://creativecommons.org/licenses/by/4.0/ 112 | production-software: eScriptorium + Kraken 113 | project-name: "HTR-United\n" 114 | project-website: https://htr-united.github.io/ 115 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 116 | script: 117 | - iso: Latn 118 | script-type: only-typed 119 | sources: 120 | - link: '' 121 | reference: Chagué, A. (2021). Tapuscorpus (Version 1.0) [Computer software]. https://doi.org/10.5072/zenodo.977649 122 | time: 123 | notAfter: '1999' 124 | notBefore: '1900' 125 | title: Tapus Corpus 126 | transcription-guidelines: See README in repository. 127 | url: https://github.com/HTR-United/tapuscorpus 128 | volume: 129 | - count: 131511 130 | metric: characters 131 | - count: 151 132 | metric: files 133 | - count: 4376 134 | metric: lines 135 | - count: 150 136 | metric: pages 137 | - count: 375 138 | metric: regions 139 | -------------------------------------------------------------------------------- /catalog/enc-cours-git/hn-chavigny.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Chateau de Chavigny 3 | url: https://github.com/PSL-Chartes-HTR-Students/HN2021-ChateauChavigny 4 | project-name: ENC - Bonnes pratiques du developpement collaboratif 5 | authors: 6 | - name: Pascual 7 | surname: Margot 8 | roles: 9 | - transcriber 10 | - name: "Franchet d'Esp\xE8rey" 11 | surname: Louis-Fiacre 12 | roles: 13 | - transcriber 14 | - digitization 15 | - name: Gabay 16 | surname: Simon 17 | roles: 18 | - quality-control 19 | description: "Le document sur lequel nous travaillons porte sur le Ch\xE2teau de Chavigny\ 20 | \ \xE0 Lern\xE9 en Touraine. 
Au XVI\xE8me si\xE8cle, c\u2019est la famille des seigneurs\ 21 | \ Leroy qui poss\xE8de ce ch\xE2teau. Avant 1568, en pleine guerre de religion,\ 22 | \ Fran\xE7ois Leroy, du parti du roi et des catholiques, participe \xE0 la capture\ 23 | \ et la ran\xE7on du prince de Cond\xE9, du parti protestant. En 1568, Fran\xE7\ 24 | ois Leroy, en tant que capitaine de 50 lances au service du roi, part en campagne\ 25 | \ avec lui. L'objectif est de transcrire cinq feuillets d'un manuscrit \xE0 l'aide\ 26 | \ d'eScriptorium. Le but \xE9tant d'apprendre \xE0 utiliser git et github pour mener\ 27 | \ \xE0 bien notre premier projet collaboratif.\n" 28 | language: 29 | - frm 30 | script: 31 | - iso: Latn 32 | script-type: only-manuscript 33 | time: 34 | notBefore: '1568' 35 | notAfter: '1599' 36 | hands: 37 | count: '1' 38 | precision: exact 39 | license: 40 | - name: CC-BY 4.0 41 | url: https://creativecommons.org/licenses/by/4.0/ 42 | format: Alto-XML 43 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/HN-2021-ChateauChavigny/main/CITATION.cff 44 | transcription-guidelines: "- Gestion des abr\xE9viations: \n - Si d\xE9veloppement\ 45 | \ (pas toujours), les d\xE9velopper entre crochets.\n - L'orthographe originale\ 46 | \ et les abr\xE9viations doivent \xEAtre conserv\xE9es.\n- Gestion des \xE9checs\ 47 | \ de transcription de caract\xE8re : lorsqu'un caract\xE8re nous para\xEE\ 48 | t non s\xFBr, nous pr\xE9f\xE9rons mettre un [?] pour indiquer qu'il y a un caract\xE8\ 49 | re non transcrit dans un mot. Pour plusieurs caract\xE8res, faire autant de ? que\ 50 | \ de caract\xE8re non reconnu : tel [???] 
pour 3 caract\xE8res.\n" 51 | volume: 52 | - metric: characters 53 | count: 9126 54 | - metric: files 55 | count: 6 56 | - metric: lines 57 | count: 253 58 | - metric: regions 59 | count: 22 60 | production-software: "eScriptorium + Kraken" 61 | -------------------------------------------------------------------------------- /catalog/gallicorpora/incunable-15.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Gabay 3 | roles: 4 | - project-manager 5 | surname: Simon 6 | - name: Pinche 7 | roles: 8 | - project-manager 9 | surname: Ariane 10 | - name: Leroy 11 | roles: 12 | - transcriber 13 | surname: Noé 14 | - name: Christensen 15 | roles: 16 | - support 17 | surname: Kelly 18 | characters: 19 | members: 20 | - e 21 | - s 22 | - u 23 | - t 24 | - a 25 | - i 26 | - r 27 | - o 28 | - n 29 | - l 30 | - d 31 | - c 32 | - m 33 | - p 34 | - ̃ 35 | - f 36 | - q 37 | - g 38 | - y 39 | - h 40 | - b 41 | - . 42 | - z 43 | - ⁊ 44 | - x 45 | - E 46 | - '-' 47 | - ',' 48 | - ¶ 49 | - L 50 | - ͥ 51 | - D 52 | - C 53 | - ; 54 | - ᷤ 55 | - I 56 | - ꝰ 57 | - Q 58 | - A 59 | - S 60 | - ꝑ 61 | - P 62 | - M 63 | - O 64 | - T 65 | - U 66 | - N 67 | - F 68 | - R 69 | - ꝓ 70 | - B 71 | - G 72 | - ꝯ 73 | - ̾ 74 | - H 75 | - ᷑ 76 | - ͬ 77 | - ̌ 78 | - ':' 79 | - ( 80 | - '[' 81 | - ']' 82 | - v 83 | - J 84 | - Ꝙ 85 | - ) 86 | - k 87 | - ꝙ 88 | - ͣ 89 | - V 90 | - '4' 91 | - ͦ 92 | - w 93 | - ͨ 94 | - ͤ 95 | - Ι 96 | - ̧ 97 | - '1' 98 | - '9' 99 | - '7' 100 | - ̶ 101 | - "'" 102 | - ́ 103 | - '|' 104 | mode: NFD 105 | citation-file-link: https://github.com/Gallicorpora/HTR-incunable-15e-siecle/CITATION.cff 106 | description: Corpus d'entrainement pour l'HTR composé d'incunable français du 15e 107 | s. 
108 | format: Alto-XML 109 | hands: 110 | count: 1-per-folder 111 | precision: estimated 112 | language: 113 | - frm 114 | - fra 115 | license: 116 | - name: CC-BY 4.0 117 | url: https://creativecommons.org/licenses/by/4.0/ 118 | production-software: eScriptorium + Kraken 119 | project-name: Gallicorpora 120 | project-website: https://github.com/Gallicorpora 121 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 122 | script: 123 | - iso: Latn 124 | script-type: only-typed 125 | time: 126 | notAfter: '1500' 127 | notBefore: '1400' 128 | title: Données HTR incunables du 15e siècle 129 | transcription-guidelines: 'Les normes de transcription suivent les préconisations 130 | du projet CREMMALAB : https://cremmalab.hypotheses.org' 131 | url: https://github.com/Gallicorpora/HTR-incunable-15e-siecle 132 | volume: 133 | - count: 245094 134 | metric: characters 135 | - count: 149 136 | metric: files 137 | - count: 7608 138 | metric: lines 139 | - count: 535 140 | metric: regions 141 | -------------------------------------------------------------------------------- /catalog/meleagre/meleagre.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Maxime 3 | orcid: 0009-0006-2076-1220 4 | roles: 5 | - transcriber 6 | - aligner 7 | - quality-control 8 | surname: Guénette 9 | - name: Mathilde 10 | orcid: 0000-0003-1642-8610 11 | roles: 12 | - transcriber 13 | - aligner 14 | - quality-control 15 | surname: Verstraete 16 | - name: Alix 17 | orcid: 0000-0002-0136-4434 18 | roles: 19 | - quality-control 20 | - support 21 | surname: Chagué 22 | - name: Marcello 23 | orcid: 0000-0001-6424-3229 24 | roles: 25 | - project-manager 26 | surname: Vitali-Rosati 27 | automatically-aligned: false 28 | characters: 29 | members: 30 | - α 31 | - ι 32 | - ́ 33 | - ο 34 | - ε 35 | - ν 36 | - σ 37 | - τ 38 | - ̓ 39 | - υ 40 | - ρ 41 | - · 42 | - κ 43 | - λ 44 | - η 45 | - ̀ 46 | - π 47 | - μ 48 | - δ 49 | - ω 50 | - ͂ 51 | - 
θ 52 | - γ 53 | - ̔ 54 | - χ 55 | - φ 56 | - ':' 57 | - β 58 | - ᾽ 59 | - ⋇ 60 | - ⁛ 61 | - ξ 62 | - ̈ 63 | - '~' 64 | - ζ 65 | - ψ 66 | - ※ 67 | - ∻ 68 | - ͳ 69 | mode: NFD 70 | description: >- 71 | Ground Truth dataset for the Codex palatinus graecus 23 (Palatine Anthology), 72 | byzantine writing from the X^th^ century. 73 | format: Alto-XML 74 | hands: 75 | count: less-than-11 76 | precision: estimated 77 | institutions: [] 78 | language: 79 | - grc 80 | license: 81 | name: CC-BY 4.0 82 | url: https://creativecommons.org/licenses/by/4.0/ 83 | production-software: eScriptorium + Kraken 84 | project-website: https://anthologiagraeca.org/ 85 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 86 | script: 87 | - iso: Grek 88 | qualify: byzantine 89 | script-type: only-manuscript 90 | sources: 91 | - link: https://doi.org/10.11588/diglit.3449 92 | reference: >- 93 | Cod. Pal. graec. 23 (10e s. av., Constantinople). Universitätsbibliothek 94 | Heidelberg, Germany. 95 | time: 96 | notAfter: '1000' 97 | notBefore: '900' 98 | title: Ground truth for the Palatine Anthology (HTR_CPgr23) 99 | transcription-guidelines: we do not resolve the abbreviation, except when they are 100 | non ambiguous. 
Full guidelines available here https://gitlab.huma-num.fr/ecrinum/anthologia/htr_cpgr23 101 | url: https://gitlab.huma-num.fr/ecrinum/anthologia/htr_cpgr23 102 | volume: 103 | - count: 114273 104 | metric: characters 105 | - count: 70 106 | metric: files 107 | - count: 3374 108 | metric: lines 109 | - count: 50 110 | metric: pages 111 | - count: 574 112 | metric: regions 113 | -------------------------------------------------------------------------------- /catalog/rasam-2/rasam.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: RASAM 2 3 | url: https://github.com/calfa-co/rasam-dataset 4 | authors: 5 | - name: "Vidal-Gorène" 6 | surname: Chahan 7 | orcid: 0000-0003-1567-6508 8 | roles: 9 | - project-manager 10 | - name: Salah 11 | surname: "Clément" 12 | orcid: 0000-0002-7846-4054 13 | roles: 14 | - transcriber 15 | - quality-control 16 | - name: Lucas 17 | surname: "Noémie" 18 | orcid: 0000-0003-2236-6778 19 | roles: 20 | - project-manager 21 | - quality-control 22 | - name: Decours-Perez 23 | surname: "Aliénor" 24 | roles: 25 | - support 26 | - name: Antoine 27 | surname: Perrier 28 | orcid: 0000-0002-5035-4283 29 | roles: 30 | - project-manager 31 | - quality-control 32 | - transcriber 33 | institutions: 34 | - name: BULAC 35 | - name: Calfa 36 | - name: DISTAM 37 | - name: GIS MOMM 38 | description: 'The Dataset is made up of 250 images, with their related ground truth 39 | stored in an XML file (pageXML format). Images come from fifteen manuscripts selected 40 | among the collections of the BULAC Library (Paris), in Maghribi Arabic. It extends RASAM 1 by covering a very wide variety of hands, text density, and cursiveness. This dataset is the result of a collaborative transcription. All the 41 | participants are credited on the official deposit. 
With the support of the French 42 | Ministry of Higher Education, Research and Innovation, the Research Consortium Middle-East 43 | and Muslim Worlds (GIS MOMM), Calfa and the BULAC library.' 44 | language: 45 | - ara 46 | script: 47 | - iso: Arab 48 | script-type: only-manuscript 49 | time: 50 | notBefore: '1700' 51 | notAfter: '1899' 52 | hands: 53 | count: more-than-10 54 | precision: exact 55 | license: 56 | - name: Apache-2.0 License 57 | url: https://www.apache.org/licenses/LICENSE-2.0 58 | format: Page-XML 59 | volume: 60 | - metric: lines 61 | count: 3750 62 | - metric: files 63 | count: 250 64 | - metric: regions 65 | count: 839 66 | - metric: characters 67 | count: 522371 68 | sources: 69 | - reference: "Chahan Vidal-Gorène, Clément Salah, Noëmie Lucas, Aliénor Decours-Perez, Antoine Perrier. Enhancing Arabic Maghribi Handwritten Text Recognition with RASAM 2: A Comprehensive Dataset and Benchmarking. Computational Humanities Research (CHR), Dec 2024, Aarhus, Denmark. pp.200-216." 70 | link: https://ceur-ws.org/Vol-3834/paper35.pdf 71 | transcription-guidelines: 'Full description of specifications for transcription available on Github and in the paper. Following RASAM 1 specifications.' 
72 | production-software: "Calfa Vision" 73 | -------------------------------------------------------------------------------- /catalog/alix-tz/moonshines.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Moonshines 3 | url: https://github.com/alix-tz/moonshines 4 | authors: 5 | - name: Alix 6 | surname: "Chagu\xE9" 7 | orcid: 0000-0002-0136-4434 8 | roles: 9 | - transcriber 10 | - aligner 11 | - project-manager 12 | - digitization 13 | institutions: [] 14 | description: This dataset is composed of pages of text written in 2023 by a single 15 | person, copying texts taken from Guillaume Apollinaire's poems published in Alcools, 16 | and taken from Guillaume Apollinaire's Wikipedia page. 17 | language: 18 | - fra 19 | production-software: eScriptorium + Kraken 20 | script: 21 | - iso: Latn 22 | script-type: only-manuscript 23 | time: 24 | notBefore: '2023' 25 | notAfter: '2023' 26 | hands: 27 | count: '1' 28 | precision: exact 29 | license: 30 | - name: CC-BY 4.0 31 | url: https://creativecommons.org/licenses/by/4.0/ 32 | format: Alto-XML 33 | volume: 34 | - metric: characters 35 | count: 27734 36 | - metric: files 37 | count: 45 38 | - metric: lines 39 | count: 1016 40 | - metric: regions 41 | count: 45 42 | citation-file-link: https://github.com/alix-tz/moonshines/blob/master/CITATION.cff 43 | transcription-guidelines: The transcription strictly follows what is written on the 44 | images, including accentuation or capitalization errors. The segmentation follows 45 | the SegmOnto ontology and mostly relies on MainZone and DefaultLine. Beware that 46 | this dataset barely contains any punctuation and that most lines begin with a capital 47 | letter. 
48 | characters: 49 | mode: NFD 50 | members: 51 | - e 52 | - s 53 | - a 54 | - n 55 | - r 56 | - i 57 | - t 58 | - u 59 | - o 60 | - l 61 | - d 62 | - m 63 | - c 64 | - p 65 | - "\u0301" 66 | - '''' 67 | - v 68 | - g 69 | - b 70 | - h 71 | - "\u0300" 72 | - f 73 | - L 74 | - q 75 | - E 76 | - '1' 77 | - A 78 | - C 79 | - x 80 | - y 81 | - "\u0302" 82 | - S 83 | - '9' 84 | - P 85 | - M 86 | - j 87 | - T 88 | - D 89 | - '-' 90 | - N 91 | - J 92 | - R 93 | - '0' 94 | - z 95 | - O 96 | - I 97 | - '2' 98 | - '8' 99 | - V 100 | - F 101 | - G 102 | - U 103 | - '5' 104 | - B 105 | - Q 106 | - ) 107 | - H 108 | - '3' 109 | - ( 110 | - '7' 111 | - '6' 112 | - w 113 | - k 114 | - '4' 115 | - "\u0327" 116 | - K 117 | - Z 118 | - "\u0308" 119 | - Y 120 | - '{' 121 | - '}' 122 | - W 123 | - . 124 | - X 125 | - ',' 126 | -------------------------------------------------------------------------------- /catalog/front-justice/front-justice-htr.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: HTR Front Justice 3 | url: https://github.com/Front-Justice/htr-front-justice.git 4 | authors: 5 | - name: Théo 6 | orcid: 0000-0003-2235-0565 7 | roles: 8 | - transcriber 9 | - digitization 10 | surname: Burnel 11 | - name: Giovanni Pietro 12 | orcid: 0000-0003-2722-6766 13 | roles: 14 | - project-manager 15 | surname: Vitali 16 | institutions: [] 17 | description: >- 18 | Some transcriptions of minute books from military court councils during the 19 | First World War 20 | language: 21 | - fra 22 | production-software: eScriptorium + Kraken 23 | automatically-aligned: false 24 | script: 25 | - iso: Latn 26 | script-type: evenly-mixed 27 | time: 28 | notAfter: '1919' 29 | notBefore: '1914' 30 | hands: 31 | count: more-than-10 32 | precision: estimated 33 | license: 34 | name: CC-BY 4.0 35 | url: https://creativecommons.org/licenses/by/4.0/ 36 | format: Alto-XML 37 | 
citation-file-link: >- 38 | https://github.com/Front-Justice/htr-front-justice/blob/31bd9342dc774b5c0c4b6fd9a704bb186430c6e3/CITATION.cff 39 | volume: 40 | - count: 795781 41 | metric: characters 42 | - count: 250 43 | metric: files 44 | - count: 13044 45 | metric: lines 46 | - count: 1333 47 | metric: regions 48 | transcription-guidelines: >- 49 | See README (Annotation and Transcription Guidelines section) 50 | characters: 51 | members: 52 | - e 53 | - . 54 | - i 55 | - r 56 | - s 57 | - n 58 | - a 59 | - t 60 | - u 61 | - l 62 | - o 63 | - d 64 | - c 65 | - ́ 66 | - m 67 | - p 68 | - ',' 69 | - f 70 | - "'" 71 | - v 72 | - '1' 73 | - q 74 | - g 75 | - C 76 | - ̀ 77 | - b 78 | - E 79 | - j 80 | - ^ 81 | - x 82 | - L 83 | - '2' 84 | - A 85 | - P 86 | - ( 87 | - ) 88 | - h 89 | - '-' 90 | - '3' 91 | - N 92 | - R 93 | - M 94 | - G 95 | - '9' 96 | - D 97 | - y 98 | - I 99 | - U 100 | - '4' 101 | - ̂ 102 | - T 103 | - '0' 104 | - F 105 | - '6' 106 | - '8' 107 | - ; 108 | - O 109 | - S 110 | - J 111 | - '7' 112 | - ':' 113 | - '5' 114 | - + 115 | - B 116 | - Q 117 | - V 118 | - z 119 | - ̧ 120 | - H 121 | - « 122 | - X 123 | - '?' 
124 | - ̈ 125 | - – 126 | - '[' 127 | - ']' 128 | - » 129 | - _ 130 | - '"' 131 | - / 132 | - k 133 | - '&' 134 | - Z 135 | - Y 136 | - K 137 | - W 138 | - '>' 139 | - < 140 | - '%' 141 | - '=' 142 | - '|' 143 | mode: NFD 144 | -------------------------------------------------------------------------------- /catalog/greek-data/eparchos.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: EPARCHOS 3 | url: https://zenodo.org/records/4095301 4 | authors: 5 | - name: Aleksandros 6 | surname: Papazoglou 7 | roles: 8 | - transcriber 9 | - project-manager 10 | - name: Ioannis 11 | surname: Pratikakis 12 | orcid: 0000-0002-4124-3688 13 | roles: 14 | - transcriber 15 | - project-manager 16 | - name: Kleopatra 17 | surname: Markou 18 | roles: 19 | - transcriber 20 | - project-manager 21 | - name: Lazaros 22 | surname: Tsochatzidis 23 | orcid: 0000-0002-4634-7419 24 | roles: 25 | - transcriber 26 | - project-manager 27 | institutions: [] 28 | description: >- 29 | The dataset originates from a Greek handwritten codex that dates from around 30 | 1500-1530. This is the subset of the codex British Museum Addit. 6791, written 31 | by two hands, one by Antonius Eparchos and the other by Camillos Zanettus (ff. 32 | 104r-174v) and delivers texts by Hierocles (In Aureum carmen), Matthaeus 33 | Blastares (Collectio alphabetica) and, notably, texts by Michael Psellos (De 34 | omnifaria doctrina). The writing delivers the most important abbreviations, 35 | logograms and conjunctions, which are cited in virtually every Greek minuscule 36 | handwritten codex from the years of the manuscript transliteration and the 37 | prevalence of the minuscule script (9th century) to the post-Byzantine years. 38 | This dataset consists of 120 scanned handwritten text pages, containing 9285 39 | lines of text, 18809 words (6787 unique words). 
For each page, a PageXML is 40 | provided containing the following ground truth: 41 | 1. Text region polygon coordinates 42 | 2. Text line polygon coordinates with the corresponding transcription text 43 | 3. Word polygon coordinates with the corresponding transcription text 44 | language: 45 | - grc 46 | transcription-guidelines: | 47 | - Abbreviations and ligatures were resolved 48 | - Minuscules at the beginning of sentences were kept as such. 49 | - Polytonic spelling and diaeresis are kept 50 | production-software: Unknown 51 | automatically-aligned: false 52 | characters: 53 | mode: NFD 54 | script: 55 | - iso: Grek 56 | script-type: only-manuscript 57 | time: 58 | notBefore: '1500' 59 | notAfter: '1530' 60 | hands: 61 | count: less-than-11 62 | precision: exact 63 | license: 64 | name: CC-BY 4.0 65 | url: https://creativecommons.org/licenses/by/4.0/ 66 | format: Page-XML 67 | volume: 68 | - metric: lines 69 | count: 2272 70 | - metric: characters 71 | count: 116894 72 | - metric: files 73 | count: 120 -------------------------------------------------------------------------------- /catalog/bsc-cssh/AMSMB-HTR.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: AMSMB HTR 3 | url: https://dataverse.bsc.es/citation?persistentId=perma:BSC/0VB0MC 4 | authors: 5 | - name: Mariona 6 | surname: Coll Ardanuy 7 | orcid: 0000-0001-8455-7196 8 | roles: 9 | - project-manager 10 | - quality-control 11 | - support 12 | - name: Ramon 13 | surname: Sarobe 14 | orcid: 0000-0003-2099-3567 15 | roles: 16 | - transcriber 17 | - aligner 18 | - name: Coral 19 | surname: Cuadrada 20 | orcid: 0000-0003-4577-2381 21 | roles: 22 | - project-manager 23 | - digitization 24 | institutions: 25 | - name: Barcelona Supercomputing Center 26 | - name: Arxiu dels Marquesos de Santa Maria de Barberà 27 | - name: Arxiu Municipal de Vilassar de Dalt 28 | description: >- 29 | Dataset for 
handwritten text recognition on medieval notarial charters written 30 | on parchment (1208-1499). The dataset is comprised of 100 digitized 31 | manuscripts (3,369 lines), carefully selected to represent the large variation 32 | that is present in the sources, encompassing at least 80 distinct hands and 33 | various document types (from sales and inventories to last wills and marriage 34 | contracts). Written primarily in Medieval Latin with fragments in Medieval 35 | Catalan, these manuscripts exhibit varying stages of preservation and degrees 36 | of deterioration. 37 | project-website: https://www.bsc.es/discover-bsc/organisation/scientific-structure/cssh 38 | language: 39 | - lat 40 | - cat 41 | production-software: eScriptorium + Kraken 42 | automatically-aligned: false 43 | script: 44 | - iso: Latn 45 | script-type: only-manuscript 46 | time: 47 | notBefore: '1208' 48 | notAfter: '1499' 49 | hands: 50 | count: more-than-10 51 | precision: exact 52 | license: 53 | name: CC-BY-SA 4.0 54 | url: https://creativecommons.org/licenses/by-sa/4.0/ 55 | format: Page-XML 56 | sources: 57 | - reference: >- 58 | Coll Ardanuy, M., Cuadrada, C., & Sarobe, R. (2025). AMSMB HTR: A Dataset 59 | for Handwritten Text Recognition in Medieval Notarial Charters Written on 60 | Parchment (1208-1499) [Dataset]. BSC Dataverse. 61 | link: https://dataverse.bsc.es/dataset.xhtml?persistentId=perma:BSC/0VB0MC 62 | volume: 63 | - metric: lines 64 | count: 3369 65 | - metric: files 66 | count: 100 67 | transcription-guidelines: >- 68 | The transcription follows a semi-diplomatic approach, in which abbreviations 69 | and symbols are expanded. Annotation and transcription decisions are 70 | documented in the datasheet accompanying the original dataset at: 71 | https://dataverse.bsc.es/citation?persistentId=perma:BSC/0VB0MC. 
72 | -------------------------------------------------------------------------------- /catalog/cremma/mss-19.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: CREMMA Manuscrits du 19e 3 | url: https://github.com/HTR-United/CREMMA-MSS-19 4 | project-name: CREMMA 5 | authors: 6 | - name: "Cl\xE9rice" 7 | surname: Thibault 8 | roles: 9 | - project-manager 10 | - quality-control 11 | - name: "Chagu\xE9" 12 | surname: Alix 13 | roles: 14 | - project-manager 15 | - quality-control 16 | - name: Davoury 17 | surname: Baudouin 18 | roles: 19 | - transcriber 20 | - aligner 21 | - name: Doat 22 | surname: Soline 23 | roles: 24 | - transcriber 25 | - aligner 26 | - name: Faure 27 | surname: Margaux 28 | roles: 29 | - transcriber 30 | - aligner 31 | - name: Humeau 32 | surname: Maxime 33 | roles: 34 | - transcriber 35 | - aligner 36 | description: Manuscripts of the 19th century 37 | language: 38 | - fra 39 | script: 40 | - iso: Latn 41 | script-type: only-manuscript 42 | time: 43 | notBefore: '1800' 44 | notAfter: '1899' 45 | hands: 46 | count: 1-per-folder 47 | precision: exact 48 | license: 49 | - name: CC-BY 4.0 50 | url: https://creativecommons.org/licenses/by/4.0/ 51 | format: Alto-XML 52 | volume: 53 | - metric: characters 54 | count: 55581 55 | - metric: files 56 | count: 69 57 | - metric: lines 58 | count: 1807 59 | - metric: regions 60 | count: 167 61 | transcription-guidelines: "Abr\xE9viations conserv\xE9es." 62 | production-software: eScriptorium + Kraken 63 | characters: 64 | mode: NFD 65 | members: 66 | - e 67 | - s 68 | - a 69 | - i 70 | - u 71 | - n 72 | - r 73 | - t 74 | - o 75 | - l 76 | - d 77 | - m 78 | - c 79 | - p 80 | - v 81 | - ',' 82 | - "\u0301" 83 | - '''' 84 | - q 85 | - f 86 | - . 
87 | - g 88 | - b 89 | - h 90 | - "\u0300" 91 | - j 92 | - x 93 | - '-' 94 | - "\u0302" 95 | - L 96 | - C 97 | - M 98 | - y 99 | - J 100 | - z 101 | - A 102 | - D 103 | - P 104 | - '"' 105 | - '>' 106 | - < 107 | - E 108 | - '!' 109 | - N 110 | - S 111 | - Q 112 | - '1' 113 | - ; 114 | - '?' 115 | - ':' 116 | - R 117 | - I 118 | - T 119 | - B 120 | - V 121 | - "\u0153" 122 | - '6' 123 | - O 124 | - ( 125 | - _ 126 | - ) 127 | - '2' 128 | - '3' 129 | - H 130 | - '4' 131 | - ^ 132 | - '9' 133 | - '8' 134 | - '7' 135 | - F 136 | - '0' 137 | - G 138 | - '5' 139 | - "\u0327" 140 | - U 141 | - '&' 142 | - '[' 143 | - ']' 144 | - "\xB0" 145 | - "\u0308" 146 | - k 147 | - $ 148 | - w 149 | - X 150 | - W 151 | - Y 152 | - + 153 | - Z 154 | -------------------------------------------------------------------------------- /catalog/enc-cours-git/hn-poesie-corse.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: OCR Corse 3 | url: https://github.com/PSL-Chartes-HTR-Students/HN2021-OCR-Poesie-Corse 4 | project-name: ENC - Bonnes pratiques du developpement collaboratif 5 | authors: 6 | - name: Sarbach-Pulicani 7 | surname: Vincent 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - name: "Sa\xEFag" 12 | surname: Violette 13 | - name: Escoda 14 | surname: Adrien 15 | roles: 16 | - transcriber 17 | - name: Miaille 18 | surname: "Th\xE9ophile" 19 | roles: 20 | - transcriber 21 | - project-manager 22 | description: "Le premier ouvrage s\u2019intitule *Ponten\xF4vu* a \xE9t\xE9 \xE9crit\ 23 | \ par Petru Rocca et publi\xE9 par la \"Stamparia di a Muvra\" en 1927. Il s'agit\ 24 | \ d'un recueil de po\xE8mes en corse et en fran\xE7ais dont les th\xE8mes varient.\ 25 | \ *A Muvra* est un journal autonomiste corse d'influence maurassienne qui a exist\xE9\ 26 | \ pendant toute la p\xE9riode de l'entre-deux-guerres. 
Se revendiquant comme \xE9\ 27 | tant une revue culturelle, la dimension politique de la revue (incarn\xE9e par le\ 28 | \ PCA, ou Partitu corsu d'azione), en a fait un mouvement controvers\xE9. C'est\ 29 | \ dans ce contexte de lutte politique et d'\xE9veil culturel corse que s'inscrit\ 30 | \ ce recueil.\nLe second ouvrage s'intitule *A nostra Santa Fede - Catechismu Corsu*,\ 31 | \ \xE9crit par Ageniu Grimaldi en 1926 sous le pseudonyme de Saveriu Malaspina.\ 32 | \ Proche de Petru Rocca, ce dernier est l'un des th\xE9oriciens de l'autonomisme\ 33 | \ corse de l'entre-deux-guerres et fid\xE8le muvriste. Dans l'ouvrage, il est fait\ 34 | \ mention notamment de la fa\xE7on dont un vrai corse doit se comporter vis-\xE0\ 35 | -vis de sa foi envers Dieu et son \xEEle. Bien qu'il ne s'agisse pas r\xE9ellement\ 36 | \ d'un recueil de po\xE8mes, le style d'\xE9criture de cet ouvrage est particuli\xE8\ 37 | rement int\xE9ressant. Il reprend un style qui se rapproche des \xE9crits bibliques.\n" 38 | language: 39 | - cos 40 | - fra 41 | script: 42 | - iso: Latn 43 | script-type: only-typed 44 | time: 45 | notBefore: '1926' 46 | notAfter: '1927' 47 | hands: 48 | count: 1-per-folder 49 | precision: exact 50 | license: 51 | - name: CC-BY 4.0 52 | url: https://creativecommons.org/licenses/by/4.0/ 53 | format: Alto-XML 54 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/HN2021-OCR-Poesie-Corse/main/CITATION.CFF 55 | transcription-guidelines: '' 56 | volume: 57 | - metric: characters 58 | count: 40957 59 | - metric: files 60 | count: 47 61 | - metric: lines 62 | count: 1664 63 | - metric: regions 64 | count: 146 65 | production-software: "eScriptorium + Kraken" 66 | -------------------------------------------------------------------------------- /catalog/cremma/mss-18.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Chagué 3 | roles: 4 | - project-manager 5 | - quality-control 6 | 
surname: Alix 7 | - name: Clérice 8 | roles: 9 | - project-manager 10 | - quality-control 11 | surname: Thibault 12 | - name: Norindr 13 | roles: 14 | - transcriber 15 | surname: Jade 16 | - name: Norindr 17 | roles: 18 | - transcriber 19 | surname: Jade 20 | - name: Van Kote 21 | roles: 22 | - transcriber 23 | - aligner 24 | surname: Elsa 25 | - name: Faure 26 | roles: 27 | - transcriber 28 | - aligner 29 | surname: Margaux 30 | characters: 31 | members: 32 | - e 33 | - s 34 | - a 35 | - r 36 | - t 37 | - n 38 | - u 39 | - i 40 | - o 41 | - l 42 | - d 43 | - p 44 | - c 45 | - m 46 | - v 47 | - . 48 | - q 49 | - f 50 | - ́ 51 | - "'" 52 | - ',' 53 | - g 54 | - b 55 | - h 56 | - y 57 | - x 58 | - j 59 | - L 60 | - C 61 | - ̀ 62 | - ^ 63 | - '1' 64 | - M 65 | - S 66 | - ̂ 67 | - z 68 | - E 69 | - R 70 | - ; 71 | - '2' 72 | - I 73 | - '6' 74 | - '0' 75 | - '>' 76 | - < 77 | - D 78 | - V 79 | - J 80 | - '4' 81 | - '3' 82 | - ( 83 | - ) 84 | - P 85 | - ̈ 86 | - '5' 87 | - ̃ 88 | - '-' 89 | - '7' 90 | - B 91 | - '8' 92 | - A 93 | - '[' 94 | - ']' 95 | - '9' 96 | - N 97 | - F 98 | - G 99 | - T 100 | - '?' 101 | - X 102 | - ̧ 103 | - / 104 | - ':' 105 | - O 106 | - H 107 | - ’ 108 | - ¬ 109 | - + 110 | - 111 | - œ 112 | - U 113 | - '&' 114 | - « 115 | - Q 116 | - '=' 117 | - K 118 | - '!' 
119 | - k 120 | - W 121 | - Z 122 | - w 123 | - ° 124 | - ⁊ 125 | - ꝑ 126 | - ſ 127 | - ‸ 128 | - '#' 129 | - ̶ 130 | - _ 131 | - Y 132 | - ̄ 133 | - » 134 | - ͦ 135 | mode: NFD 136 | description: Manuscripts of the 18th century 137 | format: Alto-XML 138 | hands: 139 | count: 1-per-folder 140 | precision: exact 141 | language: 142 | - fra 143 | license: 144 | - name: CC-BY 4.0 145 | url: https://creativecommons.org/licenses/by/4.0/ 146 | production-software: eScriptorium + Kraken 147 | project-name: CREMMA 148 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 149 | script: 150 | - iso: Latn 151 | script-type: only-manuscript 152 | time: 153 | notAfter: '1799' 154 | notBefore: '1700' 155 | title: CREMMA Manuscrits du 18e 156 | transcription-guidelines: Abréviations conservées. 157 | url: https://github.com/HTR-United/CREMMA-MSS-18 158 | volume: 159 | - count: 141690 160 | metric: characters 161 | - count: 125 162 | metric: files 163 | - count: 4019 164 | metric: lines 165 | - count: 329 166 | metric: regions 167 | -------------------------------------------------------------------------------- /catalog/rasam-1/rasam.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: RASAM 1 3 | url: https://github.com/calfa-co/rasam-dataset 4 | project-website: https://calfa.fr/blog/26 5 | authors: 6 | - name: "Vidal-Gorène" 7 | surname: Chahan 8 | orcid: 0000-0003-1567-6508 9 | roles: 10 | - project-manager 11 | - name: Lucas 12 | surname: "Noémie" 13 | orcid: 0000-0003-2236-6778 14 | roles: 15 | - project-manager 16 | - quality-control 17 | - name: Salah 18 | surname: "Clément" 19 | orcid: 0000-0002-7846-4054 20 | roles: 21 | - transcriber 22 | - quality-control 23 | - name: Decours-Perez 24 | surname: "Aliénor" 25 | roles: 26 | - support 27 | - name: Dupin 28 | surname: Boris 29 | roles: 30 | - support 31 | institutions: 32 | - name: BULAC 33 | 
- name: Calfa 34 | - name: DISTAM 35 | - name: GIS MOMM 36 | description: 'The Dataset is made up of 300 images, with their related ground truth 37 | stored in a XML file (pageXML format). Images come from three manuscripts selected 38 | among the collections of the BULAC Library (Paris). It covers a representative part 39 | of the handwritten production in Arabic Maghrebi scripts and includes an annotation 40 | of the layout (TextRegions, baselines and polygons) and the transcription of the 41 | main text. This dataset is the result of a collaborative transcription. All the 42 | participants are credited on the official deposit. With the support of the French 43 | Ministry of Higher Education, Research and Innovation, the Research Consortium Middle-East 44 | and Muslim Worlds (GIS MOMM), Calfa and the BULAC library.' 45 | language: 46 | - ara 47 | script: 48 | - iso: Arab 49 | script-type: only-manuscript 50 | time: 51 | notBefore: '1700' 52 | notAfter: '1899' 53 | hands: 54 | count: less-than-11 55 | precision: exact 56 | license: 57 | - name: Apache-2.0 License 58 | url: https://www.apache.org/licenses/LICENSE-2.0 59 | format: Page-XML 60 | volume: 61 | - metric: pages 62 | count: 300 63 | - count: 7540 64 | metric: lines 65 | - count: 300 66 | metric: files 67 | - count: 676 68 | metric: regions 69 | - count: 403034 70 | metric: characters 71 | sources: 72 | - reference: "Vidal-Gor\xE8ne, C., Lucas, N., Salah, C., Decours-Perez, A., & Dupin,\ 73 | \ B. (2021, September). RASAM\u2013A Dataset for the Recognition and Analysis\ 74 | \ of Scripts in Arabic Maghrebi. In International Conference on Document Analysis\ 75 | \ and Recognition (pp. 265-281). 
Springer, Cham" 76 | link: https://link.springer.com/chapter/10.1007/978-3-030-86198-8_19 77 | transcription-guidelines: 'Full description of specifications for transcription available 78 | on Github and in the paper. 79 | 80 | ' 81 | production-software: "Calfa Vision" 82 | -------------------------------------------------------------------------------- /catalog/sloane_lab/sloane_lab_htr_model.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: The Sloane Lab HTR Model 3 | url: https://github.com/sloanelab-org/HTR-Model 4 | authors: 5 | - name: Marco 6 | surname: Humbel 7 | orcid: 0000-0003-1861-162X 8 | roles: 9 | - aligner 10 | - name: 'Andreas ' 11 | surname: Vlachidis 12 | roles: 13 | - project-manager 14 | - name: 'Julianne ' 15 | surname: Nyhan 16 | roles: 17 | - project-manager 18 | - name: 'The British Museum ' 19 | surname: '' 20 | roles: 21 | - digitization 22 | institutions: 23 | - name: AEL Data Service 24 | roles: 25 | - transcriber 26 | description: > 27 | This repository contains Handwritten Text Recognition training data (layout 28 | segmentation and transcriptions) for the Sloane Lab HTR model. The HTR model 29 | is trained on the handwriting of Hans Sloane (1660-1753). 
30 | 31 | 32 | Funding: 33 | 34 | Enlightenment Architectures: Leverhulme Trust Project Grant 2016-21 35 | 36 | The Sloane Lab: Towards a National Collection – AHRC AH/W003457/1 37 | project-name: 'The Sloane Lab: Looking back to build future shared collections' 38 | project-website: https://sloanelab.org/ 39 | language: 40 | - eng 41 | production-software: Transkribus 42 | automatically-aligned: false 43 | script: 44 | - iso: Latn 45 | script-type: only-manuscript 46 | time: 47 | notBefore: '1680' 48 | notAfter: '1750' 49 | hands: 50 | count: less-than-11 51 | precision: estimated 52 | license: 53 | name: CC BY-NC-SA 4.0 54 | url: https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en 55 | format: Alto-XML 56 | sources: 57 | - reference: >- 58 | Sloan, K., Ortolja-Baird, A., Nyhan, J., Pickering, V., & Fleming, M. 59 | (Eds.). (2019). Sir Hans Sloane’s Miscellanea which comprises his 60 | catalogues of Miscellanies, Antiquities, Seals, Pictures, Mathematical 61 | Instruments, Agate Handles and Agate Cups, Bottles, Spoons (Digital 62 | Edition). 63 | link: >- 64 | https://enlightenmentarchitectures.reconstructingsloane.org/cataloguemiscellanies/index.html 65 | volume: 66 | - metric: pages 67 | count: 196 68 | citation-file-link: https://github.com/sloanelab-org/HTR-Model/blob/main/Citation_SL_HTR_Model.cff 69 | transcription-guidelines: >- 70 | Transcription rules can be found alongside the dataset. 
They include the 71 | following rules: 72 | 73 | - Exclusion of overwritten text from training data 74 | 75 | - Exclusion of text not identified by the automated layout recognition 76 | 77 | - Exclusion of faded text 78 | 79 | - Inserted words are treated as separate text lines 80 | 81 | - Exclusion of textual features such as dotted lines 82 | 83 | - Base line separation for text written apart 84 | -------------------------------------------------------------------------------- /catalog/slub-dresden/mscr-dresd-k-117.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: >- 3 | Ground Truth Set for Handwritten Text Recognition (HTR/OCR): Dresdner 4 | Hofdiarium 1673 (Mscr.Dresd.K.117) - 17th century Kurrent manuscript 5 | url: https://doi.org/10.5281/zenodo.15303243 6 | authors: 7 | - name: Stefan 8 | surname: Beckert 9 | orcid: 0009-0005-2394-0075 10 | roles: 11 | - transcriber 12 | - aligner 13 | - project-manager 14 | - quality-control 15 | institutions: [] 16 | description: >+ 17 | Twenty pages of Ground Truth from the "Hofdiarium des Kurfürsten Johann Georgs 18 | II. 1673" (SLUB Mscr.Dresd.K.117; https://www.wikidata.org/wiki/Q134220291). 19 | The handwriting is a typical late 17th century Saxon kurrent 20 | ("Kanzleikurrent"), with occasional words written in bastarda or fraktur-like 21 | script. 22 | 23 | 24 | This transcription is part of a larger project regarding the Dresden court 25 | diaries. Check https://slub-dresden.academia.edu/StefanBeckert for further 26 | updates. 
27 | language: 28 | - deu 29 | production-software: eScriptorium + Kraken 30 | automatically-aligned: false 31 | script: 32 | - iso: Latn 33 | script-type: only-manuscript 34 | time: 35 | notBefore: '1673' 36 | notAfter: '1673' 37 | hands: 38 | count: '1' 39 | precision: exact 40 | license: 41 | name: CC-BY 4.0 42 | url: https://creativecommons.org/licenses/by/4.0/ 43 | format: Alto-XML 44 | volume: 45 | - metric: pages 46 | count: 20 47 | transcription-guidelines: >- 48 | Transcription guidelines are oriented on the DTABF-M schema 49 | (https://www.deutschestextarchiv.de/doku/basisformat/manuskript.html), but 50 | have been adapted as follows: 51 | 52 | 53 | - I and J majuscules are not distinguished 54 | 55 | - u and v are reproduced true to the original (e.g. vnd) 56 | 57 | - Long-s (ſ) and round-s (s) are distinguished 58 | 59 | - sz ligature is rendered as ß in Kurrent scripts and as sz (e.g. "Libusza") 60 | in Antiqua scripts 61 | 62 | - ij ligature is rendered as y 63 | 64 | - other ligatures, if they occur at all, are dissolved 65 | 66 | - r graphemes are rendered as r in their modern day form 67 | 68 | - an m with a nasal stroke was rendered as a simple mm 69 | 70 | - Where possible, abbreviation signs (Abbrechungszeichen) for the contemporary 71 | identification of abbreviations have been included as single letters and not 72 | marked separately. The subsequent punctuation mark (“.” or “:”) for further 73 | identification of the abbreviation has also been included (cf. 
also Cappelli, 74 | 1928, Lexicon abbreviaturarum I, p.X) 75 | 76 | - Diacritics in u were not marked 77 | 78 | - In the case of uncertain capitalization, an approximation is sought via the 79 | letter size 80 | -------------------------------------------------------------------------------- /catalog/bullinger-htr-dataset/bullinger-htr-dataset.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: Bullinger HTR Dataset 3 | url: https://github.com/pstroe/bullinger-htr 4 | authors: 5 | - name: Phillip Benjamin 6 | surname: Ströbel 7 | orcid: 0000-0003-2063-5495 8 | roles: 9 | - aligner 10 | - support 11 | - name: Tobias 12 | surname: Hodel 13 | orcid: 0000-0002-2071-6407 14 | roles: 15 | - aligner 16 | - project-manager 17 | - name: Christian 18 | surname: Sieber 19 | orcid: 0000-0002-9364-6921 20 | roles: 21 | - digitization 22 | - name: Patricia 23 | surname: Scheurer 24 | roles: 25 | - quality-control 26 | - support 27 | - name: David Selim 28 | surname: Schoch 29 | orcid: 0000-0002-9936-8459 30 | roles: 31 | - aligner 32 | - name: Anna 33 | surname: Janka 34 | roles: 35 | - aligner 36 | - name: Raphael 37 | surname: Schwitter 38 | roles: 39 | - aligner 40 | - name: Beat 41 | surname: Wolf 42 | roles: 43 | - aligner 44 | - name: Jonas 45 | surname: Widmer 46 | roles: 47 | - aligner 48 | - name: Peter 49 | surname: Rechsteiner 50 | roles: 51 | - quality-control 52 | - support 53 | - name: Raphael 54 | surname: Müller 55 | roles: 56 | - quality-control 57 | - digitization 58 | - support 59 | institutions: [] 60 | description: >- 61 | This dataset contains 165,673 image and corresponding text line files (.png 62 | for images and .txt for the texts) in a random 80/10/10 training, validation 63 | and test set split. The source is the extensive correspondence of Swiss 64 | reformer Heinrich Bullinger (1504-1575) and his over 800 different 65 | correspondents. 
It therefore contains great variety in handwriting styles. 66 | Furthermore, it is multilingual since there are Latin and Early New High 67 | German (and sometimes mixed) letters. The data is split into Latin and Early 68 | New High German (determined with langid) and put into separate folders (de for 69 | Early New High German and la for Latin). 70 | project-website: https://www.bullinger-digital.ch/ 71 | language: 72 | - lat 73 | - deu 74 | production-software: Transkribus, own 75 | script: 76 | - iso: Latn 77 | script-type: only-manuscript 78 | time: 79 | notBefore: '1523' 80 | notAfter: '1575' 81 | hands: 82 | count: more-than-10 83 | precision: estimated 84 | license: 85 | name: CC-BY-SA 4.0 86 | url: https://creativecommons.org/licenses/by-sa/4.0/ 87 | format: Image-Text-Pairs 88 | volume: 89 | - metric: lines 90 | count: 165673 91 | automatically-aligned: true 92 | transcription-guidelines: Automated transcript alignment with Transkribus 93 | -------------------------------------------------------------------------------- /catalog/slub-dresden/mscr-dresd-k-113.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: >- 3 | Ground Truth Set for Handwritten Text Recognition (HTR/OCR): Dresdner 4 | Hofdiarium 1653-56 (Mscr.Dresd.K.113) - 17th century Kurrent manuscript 5 | url: https://doi.org/10.5281/zenodo.15303398 6 | authors: 7 | - name: Stefan 8 | surname: Beckert 9 | orcid: 0009-0005-2394-0075 10 | roles: 11 | - transcriber 12 | - aligner 13 | - project-manager 14 | - quality-control 15 | institutions: [] 16 | description: >- 17 | Twelve pages of Ground Truth from the "Hofdiarium des Kurfürsten Johann Georgs 18 | II. 1653-1656" (SLUB Mscr.Dresd.K113; 19 | https://www.wikidata.org/wiki/Q133883726). 
The handwriting is a typical late 20 | 17th century Saxon kurrent ("Kanzleikurrent"), with occasional words written 21 | in bastarda or fraktur-like script. 22 | 23 | 24 | This transcription is part of a larger project regarding the Dresden court 25 | diaries. Check https://slub-dresden.academia.edu/StefanBeckert for further 26 | updates. 27 | language: 28 | - deu 29 | production-software: eScriptorium + Kraken 30 | automatically-aligned: false 31 | script: 32 | - iso: Latn 33 | script-type: only-manuscript 34 | time: 35 | notBefore: '1653' 36 | notAfter: '1656' 37 | hands: 38 | count: '1' 39 | precision: exact 40 | license: 41 | name: CC-BY 4.0 42 | url: https://creativecommons.org/licenses/by/4.0/ 43 | format: Alto-XML 44 | volume: 45 | - metric: pages 46 | count: 12 47 | transcription-guidelines: >- 48 | Transcription guidelines are oriented on the DTABF-M schema 49 | (https://www.deutschestextarchiv.de/doku/basisformat/manuskript.html), but 50 | have been adapted as follows: 51 | 52 | 53 | - I and J majuscules are not distinguished 54 | 55 | - u and v are reproduced true to the original (e.g. vnd) 56 | 57 | - Long-s (ſ) and round-s (s) are distinguished 58 | 59 | - sz ligature is rendered as ß in Kurrent scripts and as sz (e.g. "Libusza") 60 | in Antiqua scripts 61 | 62 | - ij ligature is rendered as y 63 | 64 | - other ligatures, if they occur at all, are dissolved 65 | 66 | - r graphemes are rendered as r in their modern day form 67 | 68 | - an m with a nasal stroke was rendered as a simple mm 69 | 70 | - Where possible, abbreviation signs (Abbrechungszeichen) for the contemporary 71 | identification of abbreviations have been included as single letters and not 72 | marked separately. The subsequent punctuation mark (“.” or “:”) for further 73 | identification of the abbreviation has also been included (cf. 
also Capelli, 74 | 1928, Lexicon abbreviaturarum I, p.X) 75 | 76 | - Diacritics in u were not marked 77 | 78 | - In the case of uncertain capitalization, an approximation is sought via the 79 | letter size 80 | -------------------------------------------------------------------------------- /catalog/gallicorpora/mss-15.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Gabay 3 | roles: 4 | - project-manager 5 | surname: Simon 6 | - name: Pinche 7 | roles: 8 | - project-manager 9 | surname: Ariane 10 | - name: Leroy 11 | roles: 12 | - transcriber 13 | surname: Noé 14 | - name: Christensen 15 | roles: 16 | - support 17 | surname: Kelly 18 | characters: 19 | members: 20 | - e 21 | - i 22 | - s 23 | - t 24 | - u 25 | - n 26 | - a 27 | - r 28 | - o 29 | - l 30 | - d 31 | - c 32 | - m 33 | - p 34 | - q 35 | - f 36 | - g 37 | - . 38 | - ̃ 39 | - h 40 | - b 41 | - z 42 | - y 43 | - I 44 | - x 45 | - ⁊ 46 | - ',' 47 | - R 48 | - E 49 | - C 50 | - ̾ 51 | - Q 52 | - L 53 | - S 54 | - A 55 | - D 56 | - M 57 | - ͣ 58 | - ꝑ 59 | - ͥ 60 | - P 61 | - ꝯ 62 | - T 63 | - N 64 | - ¶ 65 | - O 66 | - B 67 | - ͤ 68 | - U 69 | - '-' 70 | - '1' 71 | - ꝰ 72 | - ᷑ 73 | - ̽ 74 | - '2' 75 | - '3' 76 | - ẜ 77 | - F 78 | - ⟦ 79 | - ⟧ 80 | - '6' 81 | - ħ 82 | - ꝓ 83 | - '7' 84 | - '4' 85 | - ͨ 86 | - '9' 87 | - '8' 88 | - ; 89 | - G 90 | - '0' 91 | - ͦ 92 | - '5' 93 | - H 94 | - "'" 95 | - ̀ 96 | - ł 97 | - đ 98 | - ́ 99 | - ͫ 100 | - ‸ 101 | - '&' 102 | - k 103 | - ° 104 | - ẞ 105 | - ͬ 106 | - ᷤ 107 | - K 108 | - '[' 109 | - ']' 110 | - ͯ 111 | - ̧ 112 | - ( 113 | - ) 114 | - Y 115 | - Z 116 | - ':' 117 | - ͧ 118 | - ᷠ 119 | - X 120 | mode: NFD 121 | citation-file-link: https://github.com/Gallicorpora/HTR-MSS-15e-Siecle/CITATION. 122 | description: Corpus d'entrainement pour l'HTR composé de manuscrits français du 15e 123 | s. 
124 | format: Alto-XML 125 | hands: 126 | count: 1-per-folder 127 | precision: estimated 128 | language: 129 | - frm 130 | - fra 131 | license: 132 | - name: CC-BY 4.0 133 | url: https://creativecommons.org/licenses/by/4.0/ 134 | production-software: eScriptorium + Kraken 135 | project-name: Gallicorpora 136 | project-website: https://github.com/Gallicorpora 137 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 138 | script: 139 | - iso: Latn 140 | script-type: only-manuscript 141 | time: 142 | notAfter: '1500' 143 | notBefore: '1400' 144 | title: Données HTR manuscrits du 15e siècle 145 | transcription-guidelines: 'Les normes de transcription suivent les préconisations 146 | du projet CREMMALAB : https://cremmalab.hypotheses.org' 147 | url: https://github.com/Gallicorpora/HTR-MSS-15e-Siecle 148 | volume: 149 | - count: 169207 150 | metric: characters 151 | - count: 85 152 | metric: files 153 | - count: 5937 154 | metric: lines 155 | - count: 458 156 | metric: regions 157 | -------------------------------------------------------------------------------- /catalog/chi-know-po/chi-know-po.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: CHI-KNOW-PO CORPUS 3 | url: https://github.com/calfa-co/chi-know-po 4 | authors: 5 | - name: Marie 6 | surname: Bizais-Lillig 7 | orcid: 0000-0002-2426-2641 8 | roles: 9 | - project-manager 10 | - quality-control 11 | - name: Hu 12 | surname: Xinmin 13 | roles: 14 | - transcriber 15 | - name: LIAO 16 | surname: Shueh-Ying 17 | roles: 18 | - transcriber 19 | - name: Cuillé 20 | surname: Elsa 21 | orcid: 0000-0002-6060-0724 22 | roles: 23 | - transcriber 24 | - name: Tanelian 25 | surname: ani 26 | roles: 27 | - quality-control 28 | - support 29 | - name: Kasparian 30 | surname: Anahide 31 | roles: 32 | - quality-control 33 | - support 34 | - name: Vidal-Gorène 35 | surname: Chahan 36 | orcid: 
0000-0003-1567-6508 37 | roles: 38 | - quality-control 39 | - support 40 | - name: Dupin 41 | surname: Boris 42 | roles: 43 | - support 44 | institutions: 45 | - name: Université de Strasbourg, GÉO (UR1340) 46 | - name: CNRS, UAR2999, Distam 47 | - name: Calfa 48 | description: >- 49 | HTR ground-truth of the CHI-KNOW-PO project (Collex-Persée), that aimed to digitize a corpus of belletristic anthologies, scholarly collections, dictionaries and encyclopedias from the Chinese medieval period (ca. 200-1000) and to process them using HTR. 50 | project-website: https://chi-know-po.gitpages.huma-num.fr 51 | language: 52 | - lzh 53 | automatically-aligned: false 54 | script: 55 | - iso: Hant 56 | script-type: only-manuscript 57 | time: 58 | notBefore: '1604' 59 | notAfter: '1921' 60 | hands: 61 | count: 1-per-folder 62 | precision: exact 63 | license: 64 | name: CC-BY 4.0 65 | url: https://creativecommons.org/licenses/by/4.0/ 66 | format: Page-XML 67 | volume: 68 | - metric: lines 69 | count: 1248 70 | - metric: characters 71 | count: 104536 72 | - metric: files 73 | count: 327 74 | sources: 75 | - reference: "Bizais-Lillig, M., Vidal-Gorène, C., & Dupin, B. (2024, August). 76 | Optimizing HTR and Reading Order Strategies for Chinese Imperial Editions with 77 | Few-Shot Learning. In International Conference on Document Analysis and 78 | Recognition (pp. 37-56). Cham: Springer Nature Switzerland." 79 | link: https://link.springer.com/chapter/10.1007/978-3-031-70642-4_3 80 | transcription-guidelines: 'Regions and baselines are distinguished by types, mainly to differentiate between main text from commentaries. Diplomatic transcription with the following exceptions: characters are transcribed in their standard form based on the online dictionary of variants (https://dict.variants.moe.edu.tw/).' 
81 | production-software: "Calfa Vision" -------------------------------------------------------------------------------- /catalog/cremma/mss-17.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: CREMMA Manuscrits du 17e 3 | url: https://github.com/HTR-United/CREMMA-MSS-17 4 | project-name: CREMMA 5 | authors: 6 | - name: "Cl\xE9rice" 7 | surname: Thibault 8 | roles: 9 | - project-manager 10 | - quality-control 11 | - name: "Chagu\xE9" 12 | surname: Alix 13 | roles: 14 | - project-manager 15 | - quality-control 16 | - name: Faure 17 | surname: Margaux 18 | roles: 19 | - transcriber 20 | - name: Norindr 21 | surname: Jade 22 | roles: 23 | - transcriber 24 | - name: Mazoue 25 | surname: Anais 26 | roles: 27 | - transcriber 28 | - name: Davoury 29 | surname: Baudoin 30 | roles: 31 | - transcriber 32 | description: Various Manuscripts of the 17th century 33 | language: 34 | - fra 35 | script: 36 | - iso: Latn 37 | script-type: only-manuscript 38 | time: 39 | notBefore: '1600' 40 | notAfter: '1699' 41 | hands: 42 | count: 1-per-folder 43 | precision: exact 44 | license: 45 | - name: CC-BY 4.0 46 | url: https://creativecommons.org/licenses/by/4.0/ 47 | format: Alto-XML 48 | volume: 49 | - metric: characters 50 | count: 81909 51 | - metric: files 52 | count: 111 53 | - metric: lines 54 | count: 2245 55 | - metric: regions 56 | count: 264 57 | transcription-guidelines: "Abr\xE9viations conserv\xE9es." 58 | production-software: eScriptorium + Kraken 59 | characters: 60 | mode: NFD 61 | members: 62 | - e 63 | - s 64 | - r 65 | - a 66 | - n 67 | - u 68 | - i 69 | - o 70 | - t 71 | - l 72 | - d 73 | - c 74 | - m 75 | - p 76 | - v 77 | - q 78 | - . 
79 | - ',' 80 | - y 81 | - '''' 82 | - f 83 | - b 84 | - g 85 | - "\u0301" 86 | - h 87 | - j 88 | - "\u0303" 89 | - M 90 | - x 91 | - R 92 | - z 93 | - C 94 | - '1' 95 | - J 96 | - ^ 97 | - "\u0300" 98 | - P 99 | - L 100 | - S 101 | - V 102 | - '&' 103 | - A 104 | - E 105 | - '>' 106 | - I 107 | - < 108 | - '2' 109 | - X 110 | - '3' 111 | - T 112 | - '7' 113 | - D 114 | - '6' 115 | - ']' 116 | - B 117 | - '4' 118 | - '[' 119 | - '0' 120 | - '?' 121 | - '-' 122 | - "\u0302" 123 | - "\u0308" 124 | - '9' 125 | - '5' 126 | - ; 127 | - G 128 | - N 129 | - '8' 130 | - ':' 131 | - F 132 | - "\u0327" 133 | - ) 134 | - ( 135 | - Q 136 | - O 137 | - H 138 | - W 139 | - "\u0153" 140 | - "\u2038" 141 | - "\u204A" 142 | - U 143 | - "\u0304" 144 | - / 145 | - "\uA757" 146 | - + 147 | - k 148 | - "\xB0" 149 | - "\_" 150 | - w 151 | - "\u05DD" 152 | - Z 153 | - "\u03C2" 154 | - '#' 155 | - "\xE6" 156 | - "\uA759" 157 | - "\u0363" 158 | - "\u03B5" 159 | - "\u03D5" 160 | -------------------------------------------------------------------------------- /catalog/almanach/lectaurep-notaires.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Durand 3 | roles: 4 | - transcriber 5 | - aligner 6 | surname: Marc 7 | - name: Rostaing 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - quality-control 12 | surname: Aurélia 13 | - name: Chagué 14 | roles: 15 | - project-manager 16 | - quality-control 17 | - support 18 | surname: Alix 19 | characters: 20 | members: 21 | - e 22 | - r 23 | - a 24 | - i 25 | - n 26 | - t 27 | - o 28 | - u 29 | - s 30 | - d 31 | - l 32 | - c 33 | - p 34 | - '1' 35 | - m 36 | - S 37 | - ̀ 38 | - ',' 39 | - E 40 | - ́ 41 | - '2' 42 | - P 43 | - . 
44 | - M 45 | - '0' 46 | - A 47 | - C 48 | - '5' 49 | - '3' 50 | - h 51 | - T 52 | - v 53 | - g 54 | - D 55 | - '7' 56 | - ) 57 | - ( 58 | - R 59 | - N 60 | - f 61 | - I 62 | - b 63 | - L 64 | - '8' 65 | - '9' 66 | - ^ 67 | - '4' 68 | - '6' 69 | - B 70 | - O 71 | - J 72 | - V 73 | - y 74 | - "'" 75 | - G 76 | - F 77 | - '-' 78 | - x 79 | - q 80 | - ° 81 | - H 82 | - ̂ 83 | - U 84 | - '"' 85 | - X 86 | - '&' 87 | - z 88 | - ; 89 | - ̧ 90 | - ':' 91 | - j 92 | - + 93 | - Q 94 | - '|' 95 | - ̈ 96 | - / 97 | - k 98 | - '=' 99 | - '%' 100 | - W 101 | - K 102 | - Y 103 | - Z 104 | - w 105 | - '~' 106 | - ¥ 107 | - ȼ 108 | - _ 109 | - € 110 | - '`' 111 | - '[' 112 | - ']' 113 | - œ 114 | - '?' 115 | - '*' 116 | - ̃ 117 | - '>' 118 | - ½ 119 | mode: NFD 120 | citation-file-link: https://github.com/HTR-United/lectaurep-repertoires/raw/main/CITATION.cff 121 | description: Ground truth for various Parisian registries of notary deeds written 122 | in French during the 19th century. The information is organized following pre-printed 123 | tables (with printed headers) and contain many names, addresses, numbers and abbreviations. 
124 | format: Alto-XML 125 | hands: 126 | count: more-than-10 127 | precision: estimated 128 | language: 129 | - fra 130 | license: 131 | - name: CC-BY 4.0 132 | url: https://creativecommons.org/licenses/by/4.0/ 133 | production-software: eScriptorium + Kraken 134 | project-name: LECTAUREP 135 | project-website: https://lectaurep.hypotheses.org/ 136 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 137 | script: 138 | - iso: Latn 139 | script-type: mainly-manuscript 140 | time: 141 | notAfter: '1939' 142 | notBefore: '1830' 143 | title: Notaires de Paris - Répertoires 144 | url: https://github.com/HTR-United/lectaurep-repertoires 145 | volume: 146 | - count: 525786 147 | metric: characters 148 | - count: 218 149 | metric: files 150 | - count: 29410 151 | metric: lines 152 | - count: 1181 153 | metric: regions 154 | -------------------------------------------------------------------------------- /catalog/htr-school-vienna/paderov-bible-handwriting-ground-truth.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Padeřov-Bible-handwriting-ground-truth 3 | url: https://zenodo.org/record/7467034#.Y6LQZBWZM2w 4 | authors: 5 | - name: Anna 6 | surname: Michalcová 7 | orcid: 0000-0003-4760-6950 8 | roles: 9 | - transcriber 10 | - aligner 11 | - project-manager 12 | - quality-control 13 | - support 14 | - name: Jan 15 | surname: Odstrčilík 16 | orcid: 0000-0001-9104-9827 17 | roles: 18 | - project-manager 19 | - support 20 | - name: Laura 21 | surname: Maniaková 22 | roles: 23 | - transcriber 24 | - name: Eliška 25 | surname: Pěnkavová 26 | orcid: 0000-0002-5494-8847 27 | - name: Kamil 28 | surname: Bazelides 29 | orcid: 0000-0002-5199-8726 30 | - name: Jan 31 | surname: Hajič 32 | orcid: 0000-0002-9207-567X 33 | - name: Hana 34 | surname: Kreisingerová 35 | orcid: 0000-0002-2924-598X 36 | - name: Jitka 37 | surname: Filipová 38 | orcid: 
0000-0002-3570-4038 39 | - name: Chi-hung 40 | surname: Liu 41 | - name: Martina 42 | surname: Dvořáková 43 | institutions: 44 | - name: Institute of the Czech Language 45 | - name: Masaryk Institute and Archives 46 | description: >- 47 | This is ground truth based on the Padeřov Bible (Vienna, Austrian National 48 | Library, shelfmark Cod. 1175, 1432–1435), the bible of the third redaction of 49 | the Old Czech Bible translation. The transcription rules were based on 50 | semi-diplomatic transcription rules set by PERO OCR and Směrnice pro vydávání 51 | starších českých textů set by Jiří Daňhelka 52 | (https://vokabular.ujc.cas.cz/moduly/edicnipoznamka.aspx?id=DanhelkaSmernice). 53 | Abbreviations were tagged and expanded. 54 | project-name: HTR Winter School 2022, Vienna 55 | project-website: >- 56 | https://www.oeaw.ac.at/imafo/veranstaltungen/detail/introduction-into-handwritten-text-recognition-1 57 | language: 58 | - ces 59 | production-software: Transkribus 60 | script: 61 | - iso: Latn 62 | script-type: only-manuscript 63 | time: 64 | notBefore: '1432' 65 | notAfter: '1435' 66 | hands: 67 | count: '1' 68 | precision: exact 69 | license: 70 | - name: CC-BY 4.0 71 | url: https://creativecommons.org/licenses/by/4.0/ 72 | format: Page-XML 73 | sources: 74 | - reference: '' 75 | link: >- 76 | https://search.onb.ac.at/primo-explore/fulldisplay?docid=ONB_alma21302405460003338&context=L&adaptor=Local%20Search%20Engine&vid=ONB&lang=de_DE&search_scope=ONB_gesamtbestand&tab=default_tab&query=addsrcrid,exact,AC13954505 77 | volume: 78 | - metric: pages 79 | count: 63 80 | transcription-guidelines: >- 81 | Transliteration. Differentiates long and short "s". Abbreviations tagged and 82 | expanded. No misspelling corrections. 
83 | -------------------------------------------------------------------------------- /catalog/rescribe/caroline-minuscule.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Caroline Minuscule by Rescribe 3 | url: https://github.com/rescribe/carolineminuscule-groundtruth 4 | project-name: Rescribe 5 | project-website: https://rescribe.xyz/ 6 | authors: 7 | - name: White 8 | surname: Nick 9 | roles: 10 | - transcriber 11 | - project-manager 12 | - name: Karaisl 13 | surname: Antonia 14 | roles: 15 | - transcriber 16 | - project-manager 17 | - name: "Cl\xE9rice" 18 | surname: Thibault 19 | roles: 20 | - aligner 21 | description: 'This ground truth repository is a work in progress; it currently accounts 22 | for a part of our complete Caroline Minuscule training pool of around 70 manuscripts 23 | used for our OCRopus Caroline Minuscule model (see ocropus-models repository). 24 | 25 | ' 26 | language: 27 | - lat 28 | script: 29 | - iso: Latn 30 | script-type: only-manuscript 31 | time: 32 | notBefore: '800' 33 | notAfter: '1199' 34 | hands: 35 | count: 1-per-file 36 | precision: exact 37 | license: 38 | - name: CC-BY 4.0 39 | url: https://creativecommons.org/licenses/by/4.0/ 40 | format: Alto-XML 41 | volume: 42 | - count: 457 43 | metric: lines 44 | - count: 17 45 | metric: files 46 | - count: 45 47 | metric: regions 48 | - count: 16909 49 | metric: characters 50 | transcription-guidelines: "In general this meant deciding between diplomatic transcription\ 51 | \ (i.e. sticking to what it says on the page) and gently modernized features (i.e.\ 52 | \ reinterpreting medieval signs into modern equivalents) with a view to specific\ 53 | \ categories. 
Read on for a summary of the rules and the respective rationale behind\ 54 | \ them.\nSUMMARY\nPUNCTUATION\n\n Modern: medieval punctuation is transcribed\ 55 | \ with modern equivalents; punctus elevatus transcribed as semicolon\n\nCAPITALIZATION\n\ 56 | \n Diplomatic: Original capitalization retained\n\nABBREVIATIONS\n\n Diplomatic\ 57 | \ where possible: Retain abbreviations and render glyphs as opposed to expanded\ 58 | \ versions where possible\n \"*\" where original character isn't served: OCRopus\ 59 | \ (at the point in time of transcription) could not handle some of the medieval\ 60 | \ glyphs, even where a Unicode version was present. Abbreviations not in OCRopus\ 61 | \ are uniformly transcribed as \"*\", in the case of a combined character (such\ 62 | \ as a consonant with a macron) as the base character followed by \"*\" (e.g. \"\ 63 | t*\"). The list of accepted characters in OCRopus can be found in this repository,\ 64 | \ and downloaded and used as codec in the OCRopus training process.\n\nSPACING\n\ 65 | \n Diplomatic: Preserve manuscript spacing, i.e. 
give diplomatic transcription\n\ 66 | \nNUMBERS\n\n Diplomatic: retain original version of both Roman and Arabic numerals\n" 67 | production-software: "eScriptorium + Kraken" 68 | -------------------------------------------------------------------------------- /catalog/pbp/pbp.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: Paris Bible Project (PBP) 3 | url: https://github.com/parisbible/ground_truth 4 | authors: 5 | - name: Estelle 6 | surname: Guéville 7 | orcid: 0000-0003-2603-1051 8 | roles: 9 | - transcriber 10 | - aligner 11 | - project-manager 12 | - quality-control 13 | - name: David 14 | surname: Wrisley 15 | orcid: 0000-0002-0355-1487 16 | roles: 17 | - transcriber 18 | - aligner 19 | - project-manager 20 | - quality-control 21 | - name: Niccolò Acram 22 | surname: Cappelletto 23 | roles: 24 | - transcriber 25 | - aligner 26 | - quality-control 27 | institutions: [] 28 | description: >- 29 | The Paris Bible Project aims to understand the production and diffusion of 30 | medieval Latin Bibles in Europe. The dataset includes ground truth from Paris 31 | Bibles produced in the 13th and 14th centuries. We also provide the most 32 | recent version of our list of Paris Bible manuscripts found in the world along 33 | with information about them. 
34 | project-website: https://parisbible.github.io/ 35 | language: 36 | - lat 37 | production-software: Transkribus 38 | script: 39 | - iso: Latn 40 | script-type: only-manuscript 41 | time: 42 | notBefore: '1200' 43 | notAfter: '1399' 44 | hands: 45 | count: more-than-10 46 | precision: estimated 47 | license: 48 | - name: CC-BY 4.0 49 | url: https://creativecommons.org/licenses/by/4.0/ 50 | format: Alto-XML 51 | volume: 52 | - metric: lines 53 | count: 1700 54 | - metric: files 55 | count: 19 56 | - metric: regions 57 | count: 40 58 | - metric: characters 59 | count: 55970 60 | characters: 61 | mode: NFKD 62 | members: 63 | - i 64 | - e 65 | - t 66 | - u 67 | - a 68 | - s 69 | - o 70 | - n 71 | - ̄ 72 | - c 73 | - m 74 | - r 75 | - l 76 | - ꝺ 77 | - "." 78 | - p 79 | - b 80 | - q 81 | - "⁊" 82 | - g 83 | - f 84 | - ́ 85 | - ꝛ 86 | - h 87 | - "-" 88 | - d 89 | - ꝫ 90 | - ";" 91 | - x 92 | - ꝯ 93 | - ̾ 94 | - ꝑ 95 | - ͥ 96 | - E 97 | - ̕ 98 | - ꝝ 99 | - ̃ 100 | - ꝓ 101 | - y 102 | - ̈ 103 | - N 104 | - ̇ 105 | - Q 106 | - "·" 107 | - D 108 | - S 109 | - I 110 | - A 111 | - ͦ 112 | - C 113 | - T 114 | - ᔆ 115 | - ꝙ 116 | - H 117 | - F 118 | - P 119 | - ͣ 120 | - '2' 121 | - V 122 | - M 123 | - ":" 124 | - R 125 | - z 126 | - L 127 | - O 128 | - U 129 | - v 130 | - "℟" 131 | - G 132 | - ͨ 133 | - ͧ 134 | - "&" 135 | - ẜ 136 | - ᷤ 137 | - ͤ 138 | - ʀ 139 | - B 140 | - X 141 | - Ꝙ 142 | - "?" 
143 | - k 144 | - ᣳ 145 | - j 146 | - ͬ 147 | transcription-guidelines: 'See: https://parisbible.github.io/guidelines/' 148 | -------------------------------------------------------------------------------- /catalog/enc-cours-git/hn-boccace.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: "De la g\xE9n\xE9alogie des dieux" 3 | url: https://github.com/PSL-Chartes-HTR-Students/HN2021-Boccace 4 | project-name: ENC - Bonnes pratiques du developpement collaboratif 5 | authors: 6 | - name: Vlachou Efstathiou 7 | surname: Malamatenia 8 | roles: 9 | - transcriber 10 | - project-manager 11 | - name: Leroy 12 | surname: "No\xE9" 13 | roles: 14 | - transcriber 15 | - project-manager 16 | - name: Maulu 17 | surname: Marco 18 | roles: 19 | - project-manager 20 | - quality-control 21 | description: "This repository hosts all the documents, including transcriptions, bibliographical\ 22 | \ references and introduction that serve the team Boccace for the validation of\ 23 | \ the course \"Bonnes pratiques du developpement collaboratif : initiation \xE0\ 24 | \ Git\" (prof. Thibault Cl\xE9rice), of the first semester - Master Humanit\xE9\ 25 | s Num\xE9riques ENC-PSL 2021-2022. At the same time it constitutes part of\ 26 | \ the biannual project \"Per un\u2019edizione digitale della Genealogia deorum gentilium\ 27 | \ di Boccaccio\" (dir. F. Duval, M. Maulu). 
Financed in 2021, this project foresees\ 28 | \ to put on line in XML format the unpublished translation in Middle French entitled\ 29 | \ \"De la genealogie des dieux\".\n" 30 | language: 31 | - frm 32 | - lat 33 | script: 34 | - iso: Latn 35 | script-type: only-typed 36 | time: 37 | notBefore: '1472' 38 | notAfter: '1498' 39 | hands: 40 | count: 1-per-folder 41 | precision: exact 42 | license: 43 | - name: CC-BY 4.0 44 | url: https://creativecommons.org/licenses/by/4.0/ 45 | format: Alto-XML 46 | volume: 47 | - metric: characters 48 | count: 109409 49 | - metric: files 50 | count: 47 51 | - metric: lines 52 | count: 3656 53 | - metric: regions 54 | count: 292 55 | sources: 56 | - reference: "Laurent Premierfait, Boccace (1498), \"De la genealogie des dieux\"\ 57 | , Paris, A. V\xE9rard." 58 | link: 'https://gallica.bnf.fr/ark:/12148/bpt6k105063r?rk=21459;2 ' 59 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/HN2021-Boccace/main/CITATION.cff 60 | transcription-guidelines: 'No development of abbreviations. Special characters are 61 | used for the graphemic transcription, compatible with the Unicode mufi and the special 62 | character table of cremma-medieval. No correction of orthography errors, BUT proper 63 | transcription of inversed letters (for Inc59) such as character "n" printed as "u" 64 | in several cases. Spaces were added freely for word separation according to dictionaries 65 | of middle French and Latin (latin forms verified on Collatinus). 
For more documentation 66 | regarding the transcription norms and guidelines head to the repository and the 67 | report file.'''' 68 | 69 | ' 70 | production-software: "eScriptorium + Kraken" 71 | -------------------------------------------------------------------------------- /catalog/parisTimeMachine/addresses-et-annuaires.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: "Donn\xE9es v\xE9rit\xE9 de terrain HTR+ Annuaire des propri\xE9taires et des\ 3 | \ propri\xE9t\xE9s de Paris et du d\xE9partement de la Seine (1898-1923)" 4 | url: http://dx.doi.org/10.34847/nkl.acb724xs 5 | project-name: 'Groupe annuaires et adresses - Consortium Huma-num Paris Time Machine 6 | 7 | ' 8 | project-website: https://paris-timemachine.huma-num.fr/groupe-adresses-et-annuaires/ 9 | authors: 10 | - name: Elgarrista 11 | surname: Gabriela 12 | roles: 13 | - transcriber 14 | - quality-control 15 | - name: "M\xE9lanie-Becquet" 16 | surname: "Fr\xE9d\xE9rique" 17 | roles: 18 | - project-manager 19 | - quality-control 20 | - name: Brando 21 | surname: Carmen 22 | roles: 23 | - project-manager 24 | - quality-control 25 | description: "Annuaire des propri\xE9taires et des propri\xE9t\xE9s de Paris et du\ 26 | \ d\xE9partement de la Seine. Lien dans le catalogue de la BNF : https://catalogue.bnf.fr/ark:/12148/cb32697229h.\ 27 | \ Cr\xE9dits : Biblioth\xE8que nationale de France. Donn\xE9es v\xE9rit\xE9 de terrain\ 28 | \ r\xE9sultant de la transcription et la segmentation manuelle d\u2019un \xE9chantillon\ 29 | \ de 169 pages des annuaires appartenant aux volumes 1898 et 1923. 
Un mod\xE8le\ 30 | \ de transcription HTR+ a \xE9t\xE9 entrain\xE9 \xE0 partir de cet \xE9chantillon\ 31 | \ gr\xE2ce \xE0 Transkribus et est disponible sur cette plateforme en mode public.\ 32 | \ Ce mod\xE8le est valable pour transcrire automatiquement les volumes de 1903 et\ 33 | \ 1913 et tout autre document imprim\xE9 \xE0 deux colonnes et en utilisant l'alphabet\ 34 | \ latin et particuli\xE8rement en fran\xE7ais. Le choix de l'\xE9chantillon est\ 35 | \ fait par crit\xE8re alphab\xE9tique car c'est le mode d'organisation de l'information\ 36 | \ dans ce document. Les accolades pr\xE9sentes dans le document n'ont pas \xE9t\xE9\ 37 | \ segment\xE9es. 118 pages pour entrainer et 51 pages pour validation.\nContexte\ 38 | \ et financement : Subvention DAHN (Dispositif de soutien \xE0 l'archivistique et\ 39 | \ aux humanit\xE9s num\xE9riques) par le MESRI. Equipes : Consortium Paris Time\ 40 | \ Machine - TGIR Humanum EHESS / CNRS / LATTICE / INRIA Contact si besoin d'anonymiser\ 41 | \ les noms de personnes : carmen.brando@ehess.fr.\n" 42 | language: 43 | - fra 44 | script: 45 | - iso: Latn 46 | script-type: only-typed 47 | time: 48 | notBefore: '1898' 49 | notAfter: '1923' 50 | hands: 51 | count: less-than-11 52 | precision: estimated 53 | license: 54 | - name: CC-BY-SA 4.0 55 | url: https://creativecommons.org/licenses/by-sa/4.0/ 56 | format: Alto-XML 57 | volume: 58 | - count: 169 59 | metric: pages 60 | - count: 19022 61 | metric: lines 62 | - count: 641401 63 | metric: characters 64 | transcription-guidelines: "Transcription diplomatique. 
Les accolades n'ont pas \xE9\ 65 | t\xE9 segment\xE9es.\n" 66 | production-software: Transkribus 67 | -------------------------------------------------------------------------------- /catalog/gallicorpora/print-16.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json 2 | title: "Donn\xE9es imprim\xE9s du 16e si\xE8cle" 3 | description: "Corpus d'entrainement pour l'HTR constitu\xE9 d'imprim\xE9s du 16e si\xE8\ 4 | cle" 5 | url: https://github.com/Gallicorpora/HTR-imprime-16e-siecle 6 | authors: 7 | - name: Gabay 8 | surname: Simon 9 | roles: 10 | - project-manager 11 | - name: Pinche 12 | roles: 13 | - project-manager 14 | surname: Ariane 15 | - name: Vlachou-Efstathiou 16 | surname: malamatenia 17 | roles: 18 | - transcriber 19 | - name: Christensen 20 | surname: Kelly 21 | roles: 22 | - support 23 | format: Alto-XML 24 | hands: 25 | count: 1-per-folder 26 | precision: estimated 27 | language: 28 | - frm 29 | - fra 30 | license: 31 | - name: CC-BY 4.0 32 | url: https://creativecommons.org/licenses/by/4.0/ 33 | project-name: Gallicorpora 34 | project-website: https://github.com/Gallicorpora 35 | script: 36 | - iso: Latn 37 | script-type: only-typed 38 | time: 39 | notAfter: '1599' 40 | notBefore: '1500' 41 | transcription-guidelines: "Les normes de transcription suivent les pr\xE9conisations\ 42 | \ du projet Gallicorpora" 43 | volume: 44 | - metric: characters 45 | count: 186202 46 | - metric: files 47 | count: 180 48 | - metric: lines 49 | count: 4918 50 | - metric: regions 51 | count: 591 52 | citation-file-link: https://github.com/Gallicorpora/HTR-imprime-16e-siecle/CITATION.cff 53 | production-software: eScriptorium + Kraken 54 | characters: 55 | mode: NFD 56 | members: 57 | - e 58 | - u 59 | - r 60 | - a 61 | - n 62 | - i 63 | - t 64 | - o 65 | - l 66 | - s 67 | - "\u017F" 68 | - d 69 | - c 70 | - m 71 | - p 72 | - ',' 73 | - q 74 | - y 75 | - v 76 | - f 77 
| - g 78 | - b 79 | - h 80 | - . 81 | - "\u2019" 82 | - '&' 83 | - E 84 | - x 85 | - '''' 86 | - z 87 | - "\u0301" 88 | - "\u0300" 89 | - A 90 | - "\xAC" 91 | - "\u0303" 92 | - D 93 | - C 94 | - R 95 | - ':' 96 | - L 97 | - I 98 | - S 99 | - P 100 | - N 101 | - M 102 | - O 103 | - Q 104 | - T 105 | - V 106 | - G 107 | - H 108 | - B 109 | - F 110 | - '-' 111 | - "\u0327" 112 | - j 113 | - '?' 114 | - ( 115 | - "\u0308" 116 | - ) 117 | - "\xBB" 118 | - '1' 119 | - "\u0153" 120 | - "\xB6" 121 | - '!' 122 | - U 123 | - '2' 124 | - X 125 | - ; 126 | - '9' 127 | - Y 128 | - '4' 129 | - '3' 130 | - "\xDF" 131 | - '5' 132 | - '"' 133 | - '7' 134 | - J 135 | - '8' 136 | - "\xE6" 137 | - "\uA770" 138 | - '6' 139 | - '0' 140 | - "\u0302" 141 | - "\u02B3" 142 | - "\u204A" 143 | - Z 144 | - "\xAB" 145 | - '*' 146 | - "\uA757" 147 | - "\uA753" 148 | - "\_" 149 | - "\u204B" 150 | - "\u0399" 151 | - "\uA751" 152 | - ']' 153 | - "\u0365" 154 | - "\u1D49" 155 | - "\u0395" 156 | - '[' 157 | - "\u03A4" 158 | - / 159 | -------------------------------------------------------------------------------- /catalog/ground-truth-set-for-handwritten-text-recognition-htr-ocr-dresdner-hofdiarium-1665-mscrdresdk80-17th-century-kurrent-manuscript/ground-truth-set-for-handwritten-text-recognition-htr-ocr-dresdner-hofdiarium-1665-mscrdresdk80-17th-century-kurrent-manuscript.yml: -------------------------------------------------------------------------------- 1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 2 | title: >- 3 | Ground Truth Set for Handwritten Text Recognition (HTR/OCR): Dresdner 4 | Hofdiarium 1665 (Mscr.Dresd.K.80) - 17th century Kurrent manuscript 5 | url: https://doi.org/10.5281/zenodo.14356190 6 | authors: 7 | - name: Stefan 8 | surname: Beckert 9 | orcid: 0009-0005-2394-0075 10 | roles: 11 | - transcriber 12 | - aligner 13 | - project-manager 14 | - quality-control 15 | institutions: [] 16 | description: >- 17 | This dataset contains ten pages of Ground Truth 
from the Dresden Court Diaries 18 | of elector Johann Georg II. as Page XML, Alto XML and jpg. 19 | language: 20 | - deu 21 | production-software: eScriptorium + Kraken 22 | automatically-aligned: false 23 | script: 24 | - iso: Latn 25 | qualify: Kurrent 26 | script-type: only-manuscript 27 | time: 28 | notBefore: '1665' 29 | notAfter: '1665' 30 | hands: 31 | count: '1' 32 | precision: exact 33 | license: 34 | name: CC-BY-NC-SA 4.0 35 | url: https://creativecommons.org/licenses/by/4.0/ 36 | format: Alto-XML 37 | sources: 38 | - reference: >- 39 | Beckert, S. (2024). Ground Truth Set for Handwritten Text Recognition 40 | (HTR/OCR): Dresdner Hofdiarium 1665 (Mscr.Dresd.K.80) - 17th century 41 | Kurrent manuscript [Data set]. Zenodo. 42 | https://doi.org/10.5281/zenodo.14356190 43 | link: '' 44 | volume: 45 | - metric: pages 46 | count: 10 47 | transcription-guidelines: >- 48 | Transcription guidelines are oriented on the DTABF-M schema 49 | (https://www.deutschestextarchiv.de/doku/basisformat/manuskript.html), but 50 | have been adapted as follows: 51 | 52 | 53 | - I and J majuscules are not distinguished 54 | 55 | - u and v are reproduced true to the original (e.g. vnd) 56 | 57 | - Long-s (ſ) and round-s (s) are distinguished 58 | 59 | - sz ligature is rendered as ß in Kurrent scripts and as sz (e.g. "Libusza") 60 | in Antiqua scripts 61 | 62 | - ij ligature is rendered as y 63 | 64 | - other ligatures, if they occur at all, are dissolved 65 | 66 | - r graphemes are rendered as r in their modern day form 67 | 68 | - an m with a nasal stroke was rendered as a simple m 69 | 70 | - Where possible, abbreviation signs (Abbrechungszeichen) for the contemporary 71 | identification of abbreviations have been included as single letters and not 72 | marked separately. The subsequent punctuation mark (“.” or “:”) for further 73 | identification of the abbreviation has also been included (cf. 
also Capelli, 74 | 1928, Lexicon abbreviaturarum I, p.X) 75 | 76 | - Diacritics in u were not marked 77 | 78 | - In the case of uncertain capitalization, an approximation is sought via the 79 | letter size 80 | 81 | -------------------------------------------------------------------------------- /catalog/htromance/ita.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | - name: Rachele 3 | roles: 4 | - transcriber 5 | surname: Alba 6 | - name: Giorgia 7 | roles: 8 | - transcriber 9 | surname: Rubin 10 | - name: Federico 11 | orcid: 0000-0002-7810-7735 12 | roles: 13 | - project-manager 14 | - quality-control 15 | surname: Boschetti 16 | - name: Franz 17 | roles: 18 | - project-manager 19 | surname: Fischer 20 | - name: Alix 21 | orcid: 0000-0002-0136-4434 22 | roles: 23 | - project-manager 24 | surname: Chagué 25 | - name: Thibault 26 | orcid: 0000-0003-1852-9204 27 | roles: 28 | - project-manager 29 | surname: Clérice 30 | automatically-aligned: false 31 | characters: 32 | members: 33 | - e 34 | - a 35 | - o 36 | - i 37 | - l 38 | - n 39 | - r 40 | - t 41 | - u 42 | - s 43 | - c 44 | - d 45 | - m 46 | - p 47 | - g 48 | - h 49 | - f 50 | - . 
51 | - ̃ 52 | - q 53 | - b 54 | - ⁊ 55 | - ',' 56 | - ꝑ 57 | - E 58 | - C 59 | - z 60 | - x 61 | - ̾ 62 | - A 63 | - I 64 | - ̧ 65 | - D 66 | - L 67 | - M 68 | - ͤ 69 | - O 70 | - S 71 | - R 72 | - ͧ 73 | - y 74 | - ꝙ 75 | - ͬ 76 | - ł 77 | - F 78 | - N 79 | - U 80 | - T 81 | - Q 82 | - ͦ 83 | - P 84 | - B 85 | - ́ 86 | - ͥ 87 | - '=' 88 | - ':' 89 | - ꝯ 90 | - X 91 | - ẜ 92 | - G 93 | - ͣ 94 | - H 95 | - '2' 96 | - '9' 97 | - '1' 98 | - ¶ 99 | - '4' 100 | - ꝓ 101 | - '3' 102 | - '5' 103 | - k 104 | - ͭ 105 | - '7' 106 | - '8' 107 | - / 108 | - "'" 109 | - ε 110 | - ɨ 111 | - đ 112 | - '6' 113 | - ι 114 | - ο 115 | - '0' 116 | - ̓ 117 | - ν 118 | - ꝗ 119 | - ̈ 120 | - μ 121 | - λ 122 | - ꝰ 123 | - α 124 | - ω 125 | - π 126 | - σ 127 | - ͫ 128 | - Y 129 | - '-' 130 | - θ 131 | - γ 132 | - η 133 | - Ο 134 | - υ 135 | - ρ 136 | - ̔ 137 | - ͂ 138 | - β 139 | - + 140 | - Z 141 | mode: NFD 142 | description: Transcription of samples of Medieval Italian manuscripts 143 | format: Alto-XML 144 | hands: 145 | count: 1-per-folder 146 | precision: estimated 147 | language: 148 | - ita 149 | - vec 150 | license: 151 | name: CC-BY 4.0 152 | url: https://creativecommons.org/licenses/by/4.0/ 153 | production-software: eScriptorium + Kraken 154 | project-name: HTRomance 155 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json 156 | script: 157 | - iso: Latn 158 | script-type: only-manuscript 159 | time: 160 | notAfter: '1499' 161 | notBefore: '1100' 162 | title: HTRomance, Medieval Italian corpus of ground-truth for Handwritten Text Recognition 163 | and Layout Segmentation 164 | url: https://github.com/HTRomance-Project/medieval-italian 165 | volume: 166 | - count: 84366 167 | metric: characters 168 | - count: 60 169 | metric: files 170 | - count: 3086 171 | metric: lines 172 | - count: 60 173 | metric: pages 174 | - count: 353 175 | metric: regions 176 | --------------------------------------------------------------------------------