├── catalog
├── .gitkeep
├── nubis-ocr
│ └── nubis-ocr.yml
├── e-editiones
│ └── ocr17plus.yml
├── episearch
│ └── episeach-htr.yml
├── fondue
│ ├── FONDUE-FR-PRINT-16.yml
│ ├── fondue-kunsthistorisches-uzh-archivdatenbank.yml
│ ├── FONDUE-MLT-CAT.yml
│ ├── fondue-gasparosarditoponomasia-dataset.yml
│ ├── fondue-spanish-chapbooks-19th-c-dataset.yml
│ ├── FONDUE-IT-PRINT-20.yml
│ ├── FONDUE-ES-PRINT-19.yml
│ ├── FONDUE-FR-PRINT-20.yml
│ ├── FONDUE-EN-PRINT-20.yml
│ └── FONDUE-FR-MSS-18.yml
├── genauto
│ └── genauto-td-htr.yml
├── koenigsfelden
│ └── kf-htr.yml
├── araucania
│ └── araucania.yml
├── shakespeare-scott-translations
│ └── ocr-data.yml
├── greek-data
│ ├── d-scribe-zenon.yml
│ ├── stavronikita-114.yml
│ ├── stavronikita-53.yml
│ ├── stavronikita-79.yml
│ ├── hpgtr.yml
│ └── eparchos.yml
├── from-manuscript-to-print-a-matter-of-bankability
│ └── antoine-verard-extracts.yml
├── editer-la-correspondance-de-constance-de-salm-1767-1845
│ └── editer-la-correspondance-de-constance-de-salm-1767-1845.yml
├── ciham-htr
│ ├── dataset-for-late-medieval-castilian-text-recognition.yml
│ └── fabliaux.yml
├── enc-cours-git
│ ├── hn-kovalewsky.yml
│ ├── tnah-expouniv.yml
│ ├── tnah-decameronfr.yml
│ ├── tnah-notredame.yml
│ ├── hn-chavigny.yml
│ ├── hn-poesie-corse.yml
│ └── hn-boccace.yml
├── transcriboquest2024-literary-medieval
│ └── transcriboquest-2024-medieval-literary.yml
├── tubingen-library
│ └── southasia-malayalam.yml
├── ocr-d
│ └── ocr-d_gt_structure_text.yml
├── eutyches-grammaticus-glossed
│ └── eutyches.yml
├── incunables-sevillans-1494-1500
│ └── incunables-sevillans-1494-1500.yml
├── distinguo
│ └── distinguo-GT-metadata.yml
├── inha
│ ├── LesPapiersBarye.yml
│ └── LettresDeJacquesDoucetAReneJean1908-1929.yml
├── ajmc
│ └── ajmc-layout.yml
├── naval-kishore
│ └── naval-kishore.yml
├── burchards-dekret-digital
│ └── bdd-segmentation-data.yml
├── tarima
│ └── tarima.yml
├── bullinger
│ └── gwalther-htr.yml
├── transcriboquest-2025
│ └── transcriboquest-2025-medieval-latin.yml
├── banq
│ └── copiste-d-un-jour.yml
├── stabs-urfehdebuch
│ └── urfehdebuch-htr.yml
├── impresso
│ └── nzz-ocr.yml
├── alix-tz
│ ├── peraire-ground-truth.yml
│ └── moonshines.yml
├── TranscriboQuest_Arabic
│ └── htr-united.yml
├── ifloral
│ └── ifloral-dataset.yml
├── cremma
│ ├── mss-20.yml
│ ├── mss-16.yml
│ ├── mss-19.yml
│ ├── mss-18.yml
│ └── mss-17.yml
├── teklia
│ └── belfort.yml
├── incunabula-reichenau
│ └── incunabula-reichenau.yml
├── htr-school-vienna
│ ├── wien-onb-cod-2160-f-164-184-ground-truth-from-htr-winter-school-2022.yml
│ ├── htr-winter-school-2024-medieval-czech-prague-bible-1488.yml
│ └── paderov-bible-handwriting-ground-truth.yml
├── almanach
│ ├── dahn.yml
│ ├── lectaurep-bronod.yml
│ ├── tapuscorpus.yml
│ └── lectaurep-notaires.yml
├── hismodoc-htr
│ └── titres-nobiliaires-17-18-siecles-dataset.yml
├── antwerp_bias-in-history
│ └── arletta.yml
├── joseph-hooker-correspondance-project
│ └── joseph-hooker-htr.yml
├── scripta-psl
│ └── biblia.yml
├── gallicorpora
│ ├── gothic-16.yml
│ ├── incunable-15.yml
│ ├── mss-15.yml
│ └── print-16.yml
├── LiDi
│ └── LiDi1-0-project.yml
├── popp
│ └── the-popp-datasets.yml
├── meleagre
│ └── meleagre.yml
├── rasam-2
│ └── rasam.yml
├── front-justice
│ └── front-justice-htr.yml
├── bsc-cssh
│ └── AMSMB-HTR.yml
├── rasam-1
│ └── rasam.yml
├── sloane_lab
│ └── sloane_lab_htr_model.yml
├── slub-dresden
│ ├── mscr-dresd-k-117.yml
│ └── mscr-dresd-k-113.yml
├── bullinger-htr-dataset
│ └── bullinger-htr-dataset.yml
├── chi-know-po
│ └── chi-know-po.yml
├── rescribe
│ └── caroline-minuscule.yml
├── pbp
│ └── pbp.yml
├── parisTimeMachine
│ └── addresses-et-annuaires.yml
├── ground-truth-set-for-handwritten-text-recognition-htr-ocr-dresdner-hofdiarium-1665-mscrdresdk80-17th-century-kurrent-manuscript
│ └── ground-truth-set-for-handwritten-text-recognition-htr-ocr-dresdner-hofdiarium-1665-mscrdresdk80-17th-century-kurrent-manuscript.yml
└── htromance
│ └── ita.yml
├── id-db.json
├── graph.png
├── CITATION.CFF
├── .github
├── workflows
│ ├── HTRUC.yaml
│ └── Catalog.yaml
└── ISSUE_TEMPLATE
│ ├── add-a-new-dataset-description.md
│ └── ajouter-la-description-d-un-nouveau-jeu-de-donn-es.md
├── spid.py
└── catalog-ids.json
/catalog/.gitkeep:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/id-db.json:
--------------------------------------------------------------------------------
1 | {"values":{},"ids":{}}
--------------------------------------------------------------------------------
/graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HTR-United/htr-united/HEAD/graph.png
--------------------------------------------------------------------------------
/CITATION.CFF:
--------------------------------------------------------------------------------
1 | cff-version: 1.1.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 | - family-names: Chagué
5 | given-names: Alix
6 | orcid: https://orcid.org/0000-0002-0136-4434
7 | - family-names: Clérice
8 | given-names: Thibault
9 | orcid: https://orcid.org/0000-0003-1852-9204
10 | title: "HTR-United: Ground Truth Resources for the HTR and OCR of patrimonial documents"
11 |
--------------------------------------------------------------------------------
/.github/workflows/HTRUC.yaml:
--------------------------------------------------------------------------------
1 | # This workflow installs htruc and runs `htruc test` on the catalog YAML files with a single version of Python
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: HTRUC
5 |
6 | on: [push, pull_request]
7 |
8 | jobs:
9 | test:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v2
13 | - name: Set up Python 3.8
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: 3.8
17 | - name: Install dependencies
18 | run: |
19 | python -m pip install --upgrade pip
20 | pip install htruc
21 | - name: Run HTRUC
22 | run: |
23 | htruc test ./catalog/**/*.y*ml
24 |
--------------------------------------------------------------------------------
/catalog/nubis-ocr/nubis-ocr.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: NuBIS-OCR
3 | url: https://github.com/ksefil/NuBIS-OCR
4 | authors:
5 | - name: Kutay
6 | surname: Sefil
7 | roles:
8 | - transcriber
9 | institutions: []
10 | description: >-
11 | Ground truth dataset for a selection of printed books from NuBIS, the digital
12 | library of the Bibliothèque Interuniversitaire de la Sorbonne.
13 | language:
14 | - fra
15 | - lat
16 | production-software: eScriptorium + Kraken
17 | automatically-aligned: false
18 | script:
19 | - iso: Latn
20 | script-type: only-typed
21 | time:
22 | notBefore: '1602'
23 | notAfter: '1989'
24 | hands:
25 | count: unknown
26 | precision: exact
27 | license:
28 | name: CC-BY 4.0
29 | url: https://creativecommons.org/licenses/by/4.0/
30 | format: Alto-XML
31 | sources:
32 | - reference: ''
33 | link: https://nubis.bis-sorbonne.fr/
34 | volume:
35 | - metric: pages
36 | count: 57
37 |
--------------------------------------------------------------------------------
/catalog/e-editiones/ocr17plus.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: OCR17plus
3 | url: https://github.com/e-ditiones/OCR17plus
4 | project-name: E-ditiones
5 | project-website: https://e-ditiones.huma-num.fr/
6 | authors:
7 | - name: Gabay
8 | surname: Simon
9 | roles:
10 | - transcriber
11 | - project-manager
12 | - support
13 | - name: Jahan
14 | surname: Claire
15 | roles:
16 | - transcriber
17 | - aligner
18 | description: "Imprim\xE9s classiques"
19 | language:
20 | - frm
21 | script:
22 | - iso: Latn
23 | script-type: only-typed
24 | time:
25 | notBefore: '1600'
26 | notAfter: '1700'
27 | hands:
28 | count: 1-per-folder
29 | precision: exact
30 | license:
31 | - name: CC-BY 4.0
32 | url: https://creativecommons.org/licenses/by/4.0/
33 | format: Alto-XML
34 | volume:
35 | - count: 25628
36 | metric: lines
37 | - count: 965
38 | metric: files
39 | - count: 3923
40 | metric: regions
41 | - count: 686335
42 | metric: characters
43 | production-software: Transkribus
44 |
--------------------------------------------------------------------------------
/spid.py:
--------------------------------------------------------------------------------
1 | # This script maintains a list of (semi-)persistent identifiers (PIDs) derived from the URLs of the datasets
2 |
3 | import json
4 | import hashlib
5 |
def get_hash(string):
    """Return a short identifier for ``string``.

    The text is encoded to bytes, hashed with SHA-256, and the first nine
    characters of the hexadecimal digest are returned.
    """
    digest = hashlib.sha256(string.encode()).hexdigest()
    return digest[:9]
9 |
# Load the catalog and the persistent URL <-> identifier database.
# JSON text is UTF-8, so the encoding is stated explicitly instead of
# relying on the locale-dependent default of open().
with open("catalog.json", encoding="utf-8") as f:
    data = json.load(f)

with open("id-db.json", encoding="utf-8") as f:
    db = json.load(f)

# Process records in URL order; the same ordering is reused when the catalog
# is written back, so it is computed only once.
ordered = sorted(data.items(), key=lambda item: item[1]["url"])

for _, record in ordered:
    url = record["url"]
    if url not in db["values"]:
        # Unknown URL: derive a new identifier and register it both ways.
        current_id = get_hash(url)
        # The identifier is a truncated hash, so two URLs can collide.
        # Overwriting the existing entry would silently drop a record when
        # the catalog is re-keyed below, so stop instead.
        if db["ids"].get(current_id, url) != url:
            raise ValueError(
                "Identifier collision: {} is already assigned to {}, cannot assign it to {}".format(
                    current_id, db["ids"][current_id], url
                )
            )
        db["values"][url] = current_id
        db["ids"][current_id] = url
        record["_pid"] = current_id
    elif "_pid" not in record:
        # Known URL: reuse the identifier stored in the database.
        record["_pid"] = db["values"][url]

with open("id-db.json", "w", encoding="utf-8") as f:
    json.dump(db, f, indent=2)

# Re-key the catalog by identifier, keeping the URL ordering.
with open("catalog.json", "w", encoding="utf-8") as f:
    json.dump(
        {record["_pid"]: record for _, record in ordered},
        f,
        indent=2
    )
37 |
--------------------------------------------------------------------------------
/catalog/episearch/episeach-htr.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: EpiSearch HTR
3 | url: https://github.com/vedph/episearch-htr
4 | authors:
5 | - name: Lorenzo
6 | surname: Calvelli
7 | orcid: 0000-0002-0920-9156
8 | roles:
9 | - project-manager
10 | - name: Tatiana
11 | surname: Tommasi
12 | orcid: 0009-0000-2815-0113
13 | roles:
14 | - transcriber
15 | - name: Federico
16 | surname: Boschetti
17 | orcid: 0000-0002-7810-7735
18 | roles:
19 | - support
20 | institutions: []
21 | description: Ground Truth for Astori’s letters (see the README.md file for details)
22 | project-name: EpiSearch
23 | project-website: https://github.com/vedph/episearch-htr
24 | language:
25 | - ita
26 | production-software: eScriptorium + Kraken
27 | script:
28 | - iso: Latn
29 | script-type: only-manuscript
30 | time:
31 | notBefore: '1705'
32 | notAfter: '1709'
33 | hands:
34 | count: '1'
35 | precision: exact
36 | license:
37 | - name: CC-BY-SA 4.0
38 | url: https://creativecommons.org/licenses/by-sa/4.0/
39 | format: Alto-XML
40 | volume:
41 | - metric: files
42 | count: 34
43 |
--------------------------------------------------------------------------------
/catalog/fondue/FONDUE-FR-PRINT-16.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Gabay
3 | orcid: 0000-0001-9094-4475
4 | roles:
5 | - transcriber
6 | - project-manager
7 | - quality-control
8 | - support
9 | surname: Simon
10 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-FR-PRINT-16/blob/master/CITATION.cff
11 | description: ' Transcriptions of French 16th c. prints '
12 | format: Alto-XML
13 | hands:
14 | count: unknown
15 | precision: exact
16 | language:
17 | - fra
18 | license:
19 | name: CC-BY 4.0
20 | url: https://creativecommons.org/licenses/by/4.0/
21 | production-software: eScriptorium + Kraken
22 | project-name: FoNDUE
23 | project-website: https://github.com/FoNDUE-HTR/
24 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
25 | script:
26 | - iso: Latn
27 | script-type: only-typed
28 | time:
29 | notAfter: '1600'
30 | notBefore: '1500'
31 | title: FONDUE-FR-PRINT-16
32 | transcription-guidelines: SegmOnto
33 | url: https://github.com/FoNDUE-HTR/FONDUE-FR-PRINT-16
34 | volume:
35 | - count: 504656
36 | metric: characters
37 | - count: 930
38 | metric: files
39 | - count: 17817
40 | metric: lines
41 | - count: 2829
42 | metric: regions
43 |
--------------------------------------------------------------------------------
/catalog/genauto/genauto-td-htr.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: GenAuto TD Corpus
3 | url: https://github.com/jpmjpmjpm/genauto-td-htr.git
4 | project-name: GenAuto
5 | project-website: ''
6 | authors:
7 | - name: Boutet
8 | surname: "Jean-Fran\xE7ois"
9 | roles:
10 | - transcriber
11 | - aligner
12 | - name: Merx
13 | surname: Jean-Pierre
14 | roles:
15 | - transcriber
16 | - aligner
17 | - project-manager
18 | description: "150 transcribed images from \"Tables D\xE9cennales\" French Civil Registry.\
19 | \ Those come from Sermaises and Romilly-sur-Seine municipalities.\n"
20 | language:
21 | - fra
22 | script:
23 | - iso: Latn
24 | script-type: only-manuscript
25 | time:
26 | notBefore: '1792'
27 | notAfter: '1902'
28 | hands:
29 | count: less-than-11
30 | precision: estimated
31 | license:
32 | - name: CC-BY 4.0
33 | url: https://creativecommons.org/licenses/by/4.0/
34 | format: Alto-XML
35 | volume:
36 | - count: 300
37 | metric: pages
38 | - count: 150
39 | metric: images
40 | - count: 150
41 | metric: files
42 | - count: 186366
43 | metric: characters
44 | - count: 21557
45 | metric: lines
46 | - count: 608
47 | metric: regions
48 | production-software: "eScriptorium + Kraken"
49 |
--------------------------------------------------------------------------------
/catalog/koenigsfelden/kf-htr.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: "Charters and Records of K\xF6nigsfelden Abbey and Bailiwick (1308-1662)"
3 | url: https://doi.org/10.5281/zenodo.5179361
4 | authors:
5 | - name: Hodel
6 | surname: Tobias
7 | roles:
8 | - transcriber
9 | - project-manager
10 | - support
11 | - name: Halter-Pernet
12 | surname: Colette
13 | roles:
14 | - transcriber
15 | - aligner
16 | - project-manager
17 | - quality-control
18 | - digitization
19 | - support
20 | - name: Teuscher
21 | surname: Simon
22 | roles:
23 | - project-manager
24 | description: "The data set is the publication of the data of the scholarly edition\
25 | \ \"Urkunden und Akten des Klosters und der Hofmeisterei K\xF6nigsfelden\"."
26 | project-website: https://www.koenigsfelden.uzh.ch/
27 | language:
28 | - lat
29 | - deu
30 | script:
31 | - iso: Latn
32 | script-type: only-manuscript
33 | time:
34 | notBefore: '1292'
35 | notAfter: '1570'
36 | hands:
37 | count: more-than-10
38 | precision: estimated
39 | license:
40 | - name: CC-BY 4.0
41 | url: https://creativecommons.org/licenses/by/4.0/
42 | format: Page-XML
43 | volume:
44 | - metric: lines
45 | count: 60000
46 | transcription-guidelines: 'See: https://www.koenigsfelden.uzh.ch/exist/apps/ssrq/intro.html#richtlinien'
47 | production-software: "Transkribus"
48 |
--------------------------------------------------------------------------------
/catalog/araucania/araucania.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: HTR - Araucania manuscript XIX
3 | url: https://github.com/Proyecto-Ocupacion-Araucania-UChile/HTR_Araucania_XIX
4 | authors:
5 | - name: Humeau
6 | surname: Maxime
7 | - name: Chiaretti
8 | surname: Alessandro
9 | institutions:
10 | - name: Archivo Central Andres Bello
11 | description: >-
12 | Ground Truth dataset for Spanish 19th typewritten OCR.
13 |
14 | The archives come from the events of the Occupation of Araucania (1850-1881)
15 | in Chile. They are archived in the 'Colección manuscritos' of the Archivo
16 | Central Andres Bello - Universidad de Chile.
17 | language:
18 | - spa
19 | production-software: eScriptorium + Kraken
20 | script:
21 | - iso: Latn
22 | script-type: mainly-manuscript
23 | time:
24 | notBefore: '1859'
25 | notAfter: '1877'
26 | hands:
27 | count: more-than-10
28 | precision: estimated
29 | license:
30 | - name: CC-BY-SA 4.0
31 | url: https://creativecommons.org/licenses/by-sa/4.0/
32 | format: Alto-XML
33 | volume:
34 | - metric: files
35 | count: 180
36 | - metric: lines
37 | count: 3932
38 | - metric: regions
39 | count: 981
40 | - metric: characters
41 | count: 117155
42 | transcription-guidelines: |
43 | - xxx for erased or unreadable characters
44 | - ^+letters for superscript letters
45 | - ⁋ for new paragraph
46 |
--------------------------------------------------------------------------------
/catalog/shakespeare-scott-translations/ocr-data.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Shakespeare-Scott translations
3 | url: https://github.com/millawell/ocr-data
4 | project-name: 'Publishing an OCR ground truth data set for reuse in an unclear copyright
5 | setting''
6 |
7 | '
8 | project-website: https://github.com/millawell/ocr-data
9 | authors:
10 | - name: Lassner
11 | surname: David
12 | - name: Coburger
13 | surname: Julius
14 | - name: Neudecker
15 | surname: Clemens
16 | - name: Baillot
17 | surname: Anne
18 | description: "Ground truth data in German and English of Shakespeare and Scott prints\
19 | \ in original and different translations. \n"
20 | language:
21 | - eng
22 | - deu
23 | script:
24 | - iso: Latn
25 | - iso: Latf
26 | script-type: only-typed
27 | time:
28 | notBefore: '1815'
29 | notAfter: '1852'
30 | hands:
31 | count: unknown
32 | precision: exact
33 | license:
34 | - name: CC-BY 4.0
35 | url: https://creativecommons.org/licenses/by/4.0/
36 | format: Alto-XML
37 | volume:
38 | - metric: lines
39 | count: 5354
40 | - metric: files
41 | count: 131
42 | - metric: regions
43 | count: 131
44 | - metric: characters
45 | count: 192264
46 | sources:
47 | - reference: ''
48 | link: https://zfdg.de/sb005_006
49 | citation-file-link: https://github.com/millawell/ocr-data/blob/master/citation.cff
50 | production-software: "eScriptorium + Kraken"
51 |
--------------------------------------------------------------------------------
/catalog/greek-data/d-scribe-zenon.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: Ground-Truthed Data Set of Zenon Papyri for Handwritten Text Recognition
3 | url: https://zenodo.org/records/6565706
4 | authors:
5 | - name: Isabelle
6 | surname: Marthot-Santaniello
7 | orcid: 0000-0003-0407-8748
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - name: Hodel
12 | surname: Tobias
13 | orcid: 0000-0002-2071-6407
14 | roles:
15 | - transcriber
16 | - project-manager
17 | institutions: []
18 | description: >-
19 | Diplomatic transcription of papyri found in the Zenon archive [see
20 | en.wikipedia.org/wiki/Zenon_of_Kaunos]
21 |
22 |
23 | Manually prepared as PageXML with Transkribus within D-Scribes project.
24 | project-name: D-Scribes
25 | project-website: https://d-scribes.philhist.unibas.ch/en/
26 | language:
27 | - grc
28 | production-software: Transkribus
29 | automatically-aligned: false
30 | characters:
31 | mode: NFD
32 | script:
33 | - iso: Grek
34 | script-type: only-manuscript
35 | time:
36 | notBefore: '-250'
37 | notAfter: '-230'
38 | hands:
39 | count: unknown
40 | precision: estimated
41 | license:
42 | name: CC-BY 4.0
43 | url: https://creativecommons.org/licenses/by/4.0/
44 | format: Page-XML
45 | volume:
46 | - metric: lines
47 | count: 321
48 | - metric: characters
49 | count: 5850
50 | - metric: files
51 | count: 27
--------------------------------------------------------------------------------
/catalog/from-manuscript-to-print-a-matter-of-bankability/antoine-verard-extracts.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: Antoine Verard extracts
3 | url: https://github.com/LaurieHoeben/Verard-corpus
4 | authors:
5 | - name: Laurie
6 | surname: Hoeben
7 | roles:
8 | - transcriber
9 | - aligner
10 | institutions: []
11 | description: >-
12 | Parts of Antoine Vérard’s editions princeps of "Tristan", "Merlin" and "Gyron
13 | le Courtoys".
14 | project-name: 'From Manuscript to Print: a Matter of Bankability?'
15 | project-website: https://www.universityofgalway.ie/rebpaf/
16 | language:
17 | - frm
18 | production-software: 'eScriptorium '
19 | automatically-aligned: false
20 | script:
21 | - iso: Latn
22 | script-type: mainly-typed
23 | time:
24 | notBefore: '1489'
25 | notAfter: '1503'
26 | hands:
27 | count: 1-per-folder
28 | precision: exact
29 | license:
30 | name: Etalab OL 2.0
31 | url: https://spdx.org/licenses/etalab-2.0.html
32 | format: Page-XML
33 | sources:
34 | - reference: ''
35 | link: https://catalogue.bnf.fr/ark:/12148/cb33631875s
36 | - reference: ''
37 | link: https://catalogue.bnf.fr/ark:/12148/cb39334880d
38 | - reference: ''
39 | link: https://catalogue.bnf.fr/ark:/12148/cb334128727
40 | volume:
41 | - metric: lines
42 | count: 4710
43 | transcription-guidelines: >-
44 | Ariane Pinche. Guide de transcription pour les manuscrits du Xe au XVe siècle.
45 | 2022. hal-03697382f
46 |
--------------------------------------------------------------------------------
/catalog/editer-la-correspondance-de-constance-de-salm-1767-1845/editer-la-correspondance-de-constance-de-salm-1767-1845.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Éditer la correspondance de Constance de Salm (1767-1845)
3 | url: https://github.com/sbiay/CdS-edition/tree/main/htr/verite-terrain
4 | authors:
5 | - name: Biay
6 | surname: Sébastien
7 | roles:
8 | - transcriber
9 | institutions: []
10 | description: >-
11 | La correspondance de Constance de Salm (femme de lettres française) comprend
12 | différents spécimens d’écriture du début du XIXe siècle. Le jeu de données
13 | atteste les mains de quatre copistes différents.
14 | project-website: https://dhiha.hypotheses.org/2945
15 | language:
16 | - fra
17 | production-software: eScriptorium + Kraken
18 | script:
19 | - iso: Latn
20 | script-type: only-manuscript
21 | time:
22 | notBefore: '1800'
23 | notAfter: '1825'
24 | hands:
25 | count: less-than-11
26 | precision: estimated
27 | license:
28 | - name: CC-BY 4.0
29 | url: https://creativecommons.org/licenses/by/4.0/
30 | format: Alto-XML
31 | sources:
32 | - reference: >-
33 | Salm, C. de (1767-1845). Correspondance. Société des Amis du Vieux Toulon
34 | et de sa Région, Fonds Salm. Archiv Schloss Dyck, fonds Constance de Salm.
35 | link: ''
36 | volume:
37 | - metric: lines
38 | count: 1754
39 | transcription-guidelines: >-
40 | Usages scribaux respectés : abréviations, fautes, accentuation respectés.
41 | Allographes normalisés (s long).
42 |
--------------------------------------------------------------------------------
/catalog/ciham-htr/dataset-for-late-medieval-castilian-text-recognition.yml:
--------------------------------------------------------------------------------
1 |
2 |
3 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
4 | title: 'Dataset for late medieval Castilian text recognition '
5 | url: https://doi.org/10.5281/zenodo.7386489
6 | authors:
7 | - name: Gille Levenson
8 | surname: Matthias
9 | orcid: 0000-0001-9488-5986
10 | roles:
11 | - transcriber
12 | - quality-control
13 | institutions: []
14 | description: >-
15 | HTR/OCR open access gold corpus for Spanish late medieval sources, based
16 |
17 | on the allographetic transcription of more than 300 pages of several
18 | manuscripts of the Regimiento de los
19 | Prínçipes, as well as a first set of general transcription models trained with
20 | kraken and out-of-domain test data. See https://doi.org/10.5281/zenodo.7387376 for full description of the dataset.
21 | language:
22 | - spa
23 | production-software: eScriptorium + Kraken
24 | script:
25 | - iso: Latn
26 | script-type: mainly-manuscript
27 | time:
28 | notBefore: '1300'
29 | notAfter: '1500'
30 | hands:
31 | count: more-than-10
32 | precision: estimated
33 | license:
34 | - name: CC-BY-SA 4.0
35 | url: https://creativecommons.org/licenses/by-sa/4.0/
36 | format: Alto-XML
37 | volume:
38 | - metric: lines
39 | count: 28000
40 | transcription-guidelines: >-
41 | Allographetic transcription. See the article
42 | (https://doi.org/10.5281/zenodo.7387376) for full transcription guidelines.
43 |
44 | 320 pages in-domain; 40 pages out-of-domain
45 |
46 |
--------------------------------------------------------------------------------
/catalog/enc-cours-git/hn-kovalewsky.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: "Maxime Kovalewsky - Coutume contemporaine et loi ancienne: droit coutumier\
3 | \ oss\xE9tien"
4 | url: https://github.com/PSL-Chartes-HTR-Students/HN2021-Kovalewsky-1893
5 | project-name: 'ENC - Bonnes pratiques du developpement collaboratif
6 |
7 | '
8 | authors:
9 | - name: "L\u2019Eveque"
10 | surname: "Zo\xE9"
11 | roles:
12 | - transcriber
13 | - name: Ekaterina
14 | surname: Kate
15 | roles:
16 | - transcriber
17 | - name: Kasparian
18 | surname: Anahide
19 | roles:
20 | - transcriber
21 | description: "Nous avons choisi de transcrire le deuxi\xE8me chapitre de l\u2019ouvrage\
22 | \ de Maxime Kovalewsky : Coutume contemporaine et loi ancienne : droit coutumier\
23 | \ oss\xE9tien, \xE9clair\xE9 par l\u2019histoire compar\xE9e. Paris, L. Larose,\
24 | \ 1893. \n"
25 | language:
26 | - fra
27 | script:
28 | - iso: Latn
29 | script-type: only-typed
30 | time:
31 | notBefore: '1893'
32 | notAfter: '1893'
33 | hands:
34 | count: '1'
35 | precision: exact
36 | license:
37 | - name: CC-BY 4.0
38 | url: https://creativecommons.org/licenses/by/4.0/
39 | format: Alto-XML
40 | citation-file-link: https://github.com/PSL-Chartes-HTR-Students/HN2021-Kovalewsky-1893/blob/main/CITATION.CFF
41 | volume:
42 | - metric: characters
43 | count: 45626
44 | - metric: files
45 | count: 28
46 | - metric: lines
47 | count: 983
48 | - metric: regions
49 | count: 72
50 | production-software: "eScriptorium + Kraken"
51 |
--------------------------------------------------------------------------------
/catalog/transcriboquest2024-literary-medieval/transcriboquest-2024-medieval-literary.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: TranscriboQuest 2024 Medieval Literary
3 | url: 10.5281/zenodo.13757440
4 | authors:
5 | - name: Jessie
6 | surname: Dummer
7 | - name: Emmanuelle
8 | surname: Kuhry
9 | - name: Zdzislaw
10 | surname: Koczarski
11 | - name: Sylvain
12 | surname: Besson
13 | - name: Caroline
14 | surname: Chevalier-Royet
15 | orcid: 0000-0002-7574-6742
16 | - name: Caroline
17 | surname: Vandyck
18 | roles:
19 | - project-manager
20 | institutions: []
21 | description: >-
22 | This dataset was created in the context of TranscriboQuest 2024 (Medieval
23 | Literary Team) held in Lyon (11/09/2024-13/09/2024). We opted to focus on
24 | medieval scientific documents that are damaged, in several different
25 | languages. The result is 808 lines transcribed by experts in the field. The
26 | dataset contains the images of the manuscripts and ALTO-XMLs.
27 | language:
28 | - lat
29 | - dum
30 | - fro
31 | - gmh
32 | production-software: eScriptorium + Kraken
33 | automatically-aligned: false
34 | script:
35 | - iso: Latn
36 | script-type: only-manuscript
37 | time:
38 | notBefore: '800'
39 | notAfter: '1500'
40 | hands:
41 | count: 1-per-folder
42 | precision: exact
43 | license:
44 | name: CC-BY 4.0
45 | url: https://creativecommons.org/licenses/by/4.0/
46 | format: Alto-XML
47 | volume:
48 | - metric: lines
49 | count: 800
50 | transcription-guidelines: CATMuS Guidelines (https://catmus-guidelines.github.io)
51 |
--------------------------------------------------------------------------------
/catalog/tubingen-library/southasia-malayalam.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: Ground Truth data for printed Malayalam
3 | url: https://doi.org/10.11588/data/L2KRZO
4 | authors: []
5 | institutions:
6 | - name: Tübingen University Library
7 | roles:
8 | - project-manager
9 | description: >-
10 | Ground Truth (GT) data (JPG and ALTO XML files) which can be used to train OCR
11 | models that recognize printed text in Malayalam script. The training material
12 | is gathered from 19th and 20th centuries prints.
13 |
14 |
15 | The GT data was trained in Transkribus with the HTR+ and the PyLaia engine
16 | with a resulting CER of 2.29% on validation set with HTR+ and 3.20% with
17 | PyLaia. The training was performed on 43 pages with appr. 9,000 words. The
18 | validation set consisted of 5 pages (ca. 1,000 words).
19 |
20 |
21 | Transcription was performed by Tübingen University Library, the Ground Truth
22 | data was created by Elena Mucciarelli (University of Groningen) with support
23 | and model training by Dorothee Huff (Tübingen University Library).
24 | (2022-11-02)
25 | project-name: DigitalSouthAsia
26 | project-website: http://idb.ub.uni-tuebingen.de/digitue/southasia
27 | language:
28 | - mal
29 | production-software: Transkribus
30 | script:
31 | - iso: Mlym
32 | script-type: only-typed
33 | time:
34 | notBefore: '1850'
35 | notAfter: '1996'
36 | hands:
37 | count: unknown
38 | precision: exact
39 | license:
40 | name: CC-BY 4.0
41 | url: https://creativecommons.org/licenses/by/4.0/
42 | format: Page-XML
43 | volume:
44 | - metric: pages
45 | count: 43
46 |
--------------------------------------------------------------------------------
/.github/workflows/Catalog.yaml:
--------------------------------------------------------------------------------
1 | # This workflow installs htruc, rebuilds the catalog from ./catalog, commits the generated files and publishes a release
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: HTRUC Catalog
5 | on:
6 | push:
7 | branches:
8 | - master
9 | workflow_dispatch: #Allows for manual triggering
10 | schedule:
11 | - cron: "0 23 * * 0"
12 | jobs:
13 | catalog:
14 | runs-on: ubuntu-latest
15 | env:
16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
17 | steps:
18 | - uses: actions/checkout@v2
19 | - name: Set up Python 3.9
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: "3.9"
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install htruc
27 | - name: Run HTRUC
28 | run: |
29 | htruc make ./catalog --access_token ${{ secrets. GITHUB_TOKEN }} --graph-csv data.csv --statistics statistics.csv --output htr-united.yml --graph graph.png --json catalog.json --ids catalog-ids.json --check-link --no-remote
30 | - name: Commit files
31 | run: |
32 | git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
33 | git config --local user.name "github-actions[bot]"
34 | python3 spid.py
35 | git add htr-united.yml graph.png statistics.csv catalog.json
36 | git commit -m "[Automatic] Update of the Catalog" || echo "Nothing to commit"
37 | git push || echo "Nothing to push"
38 | - uses: rymndhng/release-on-push-action@master
39 | with:
40 | bump_version_scheme: patch
41 | use_github_release_notes: true
42 |
--------------------------------------------------------------------------------
/catalog/fondue/fondue-kunsthistorisches-uzh-archivdatenbank.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: FoNDUE_Kunsthistorisches-UZH_Archivdatenbank
3 | url: https://github.com/FoNDUE-HTR/FoNDUE_Kunsthistorisches-UZH_Archivdatenbank
4 | authors:
5 | - name: Pauline
6 | surname: Jacsont
7 | orcid: 0000-0002-6296-3246
8 | roles:
9 | - project-manager
10 | - transcriber
11 | - aligner
12 | - quality-control
13 | - name: Simon
14 | surname: Gabay
15 | orcid: 0000-0001-9094-4475
16 | roles:
17 | - project-manager
18 | - quality-control
19 | - support
20 | - name: Tristan
21 | surname: Weddigen
22 | orcid: 0000-0002-4609-8950
23 | roles:
24 | - support
25 | institutions: []
26 | description: HTR data made with the Kunsthistorisches UZH corpus.
27 | project-name: FoNDUE
28 | project-website: https://www.unige.ch/lettres/humanites-numeriques/recherche/projets-de-la-chaire/fondue
29 | language:
30 | - deu
31 | - fra
32 | - ita
33 | production-software: eScriptorium + Kraken
34 | script:
35 | - iso: Latn
36 | script-type: evenly-mixed
37 | time:
38 | notBefore: '1900'
39 | notAfter: '1999'
40 | hands:
41 | count: more-than-10
42 | precision: estimated
43 | license:
44 | - name: CC-BY 4.0
45 | url: https://creativecommons.org/licenses/by/4.0/
46 | format: Alto-XML
47 | volume:
48 | - metric: pages
49 | count: 1100
50 | citation-file-link: >-
51 | https://github.com/FoNDUE-HTR/FoNDUE_Kunsthistorisches-UZH_Archivdatenbank/blob/main/CITATION.cff
52 | transcription-guidelines: "The transcription is strictly diplomatic: no abbreviations are resolved. \LItems that are crossed out or struck through will be transcribed with a \"€\"."
53 |
--------------------------------------------------------------------------------
/catalog/fondue/FONDUE-MLT-CAT.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Pradier
3 | orcid: 0000-0002-3476-7248
4 | roles:
5 | - transcriber
6 | surname: Frédérine
7 | - name: Gabay
8 | orcid: 0000-0001-9094-4475
9 | roles:
10 | - transcriber
11 | - project-manager
12 | - quality-control
13 | - support
14 | surname: Simon
15 | - name: Kervegan
16 | orcid: 0000-0003-2821-8821
17 | roles:
18 | - transcriber
19 | surname: Paul
20 | - name: Janès
21 | orcid: 0000-0002-8971-6173
22 | roles:
23 | - transcriber
24 | surname: Juliette
25 | - name: Sánchez Oeconomo
26 | orcid: 0000-0002-8591-5394
27 | roles:
28 | - transcriber
29 | surname: Esteban
30 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-MLT-CAT/blob/main/CITATION.cff
31 | description: 'Ground truth for 19th/20th-century sale/exhibition catalogues, mainly printed
32 | in France but not only.'
33 | transcription-guidelines: 'Segmentation includes an extra zone `CustomZone: entry`'
34 | format: Alto-XML
35 | hands:
36 | count: unknown
37 | precision: exact
38 | institutions: []
39 | language:
40 | - por
41 | - fra
42 | - ita
43 | license:
44 | - name: CC-BY 4.0
45 | url: https://creativecommons.org/licenses/by/4.0/
46 | production-software: eScriptorium + Kraken
47 | project-name: FoNDUE
48 | project-website: https://github.com/FoNDUE-HTR
49 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
50 | script:
51 | - iso: Latn
52 | script-type: only-typed
53 | time:
54 | notAfter: '1972'
55 | notBefore: '1818'
56 | title: FONDUE-MLT-CAT
57 | url: https://github.com/FoNDUE-HTR/FONDUE-MLT-CAT
58 | volume:
59 | - count: 1285120
60 | metric: characters
61 | - count: 1381
62 | metric: files
63 | - count: 43114
64 | metric: lines
65 | - count: 10713
66 | metric: regions
67 |
--------------------------------------------------------------------------------
/catalog/ocr-d/ocr-d_gt_structure_text.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: gt_structure_text
3 | url: https://github.com/OCR-D/gt_structure_text
4 | authors:
5 | - name: Matthias
6 | surname: Boenig
7 | orcid: 0000-0003-4615-4753
8 | roles:
9 | - transcriber
10 | - aligner
11 | - project-manager
12 | - quality-control
13 | - digitization
14 | - support
15 | institutions: []
16 | description: >-
17 | The OCR-D Ground Truth text and structure corpus was created between
18 | 2015-2017. In the years since 2017, this corpus has been further curated and
19 | supplemented with metadata where appropriate. The corpus includes PAGE XML
20 | files with annotations of the text and structure. The data is based
21 | on transcription data stored in the German Text Archive (DTA)
22 | (https://www.deutschestextarchiv.de/).
23 | project-name: OCR-D
24 | project-website: https://ocr-d.de/
25 | language:
26 | - eng
27 | - fra
28 | - deu
29 | - heb
30 | - lat
31 | production-software: Aletheia
32 | automatically-aligned: false
33 | script:
34 | - iso: Latn
35 | - iso: Latf
36 | script-type: only-typed
37 | time:
38 | notAfter: '1900'
39 | notBefore: '1500'
40 | hands:
41 | count: less-than-11
42 | precision: exact
43 | license:
44 | name: CC-BY-SA 4.0
45 | url: https://creativecommons.org/licenses/by-sa/4.0/
46 | format: Page-XML
47 | volume:
48 | - count: 640976
49 | metric: characters
50 | - count: 217
51 | metric: files
52 | - count: 6608
53 | metric: lines
54 | - count: 1647
55 | metric: regions
56 | citation-file-link: https://raw.githubusercontent.com/OCR-D/gt_structure_text/main/CITATION.cff
57 | transcription-guidelines: OCR-D Ground Truth Guidelines https://ocr-d.de/en/gt-guidelines/trans/
58 |
--------------------------------------------------------------------------------
/catalog/eutyches-grammaticus-glossed/eutyches.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Eutyches
3 | url: https://github.com/malamatenia/Eutyches
4 | authors:
5 | - name: Vlachou Efstathiou
6 | surname: Malamatenia
7 | roles:
8 | - transcriber
9 | - aligner
10 | - project-manager
11 | institutions: []
12 | description: >-
13 | Ground truth for minuscule caroline of the late 9th century from the
14 | grammatical work "de uerbo" of Eutychès.
15 | project-name: Eutyches grammaticus glossed
16 | language:
17 | - lat
18 | - grc
19 | production-software: eScriptorium + Kraken
20 | script:
21 | - iso: Latn
22 | qualify: Minuscule Caroline
23 | script-type: only-manuscript
24 | time:
25 | notBefore: '850'
26 | notAfter: '900'
27 | hands:
28 | count: less-than-11
29 | precision: estimated
30 | license:
31 | - name: CC-BY 4.0
32 | url: https://creativecommons.org/licenses/by/4.0/
33 | format: Alto-XML
34 | sources:
35 | - reference: Codices Vossiani Latini, Brill , VLO41
36 | link: >-
37 | https://primarysources.brillonline.com/browse/vossiani-latini/vlo-041-eutyches-grammaticalia-isidorus-alphabeta
38 | volume:
39 | - metric: pages
40 | count: 65
41 | citation-file-link: https://github.com/malamatenia/Eutyches/blob/main/CITATION.cff
42 | transcription-guidelines: >-
43 | Graphematic transcription, following the guidelines of CREMMA-medieval.
44 | Spacing has been reestablished when dealing with semicontinua, s for long s,
45 | loyal to the manuscript for capital letters, abbreviations preserved,
46 | punctuation reduced to ";" and ".". The few Greek passages have also been
47 | preserved, and some of the essais de plume as well (when forming full
48 | words). Annotation of the layout made with SegmOnto controlled vocabulary.
49 |
--------------------------------------------------------------------------------
/catalog/incunables-sevillans-1494-1500/incunables-sevillans-1494-1500.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Jeu de données OCR - Incunables sévillans 1494-1500
3 | url: https://doi.org/10.5281/zenodo.3643393
4 | authors:
5 | - name: Gille Levenson
6 | surname: Matthias
7 | orcid: 0000-0001-9488-5986
8 | roles:
9 | - transcriber
10 | - aligner
11 | - project-manager
12 | institutions: []
13 | description: >-
14 | The data set corresponds to 60 pages printed in 1494 by Estanislao Polono and Meinardo Ungut in Seville. These pages are taken from the Regimiento de los Prínçipes (also known as 'Glosa castellana al Regimiento de prínçipes'), and the exemplar used is the
15 | INC/901 of the Biblioteca Nacional de España. The type used for this incunabulum is 97G (Martín Abad and Moyano Andrés, Estanislao Polono, 2002, p. 61). This type was used between 1494 and 1500. For other incunabula produced in this period, see op. cit, p.112-121.
16 | language:
17 | - spa
18 | production-software: eScriptorium + Kraken
19 | script:
20 | - iso: Latn
21 | script-type: only-typed
22 | time:
23 | notBefore: '1494'
24 | notAfter: '1500'
25 | hands:
26 | count: '1'
27 | precision: exact
28 | license:
29 | - name: CC-BY 4.0
30 | url: https://creativecommons.org/licenses/by/4.0/
31 | format: Alto-XML
32 | sources:
33 | - reference: >-
34 | Matthias Gille Levenson. (2022). Jeu de données de segmentation et de reconnaissance optique de caractères - Kraken - Incunables sévillans 1494-1500 (Version v5) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.7006981
35 | link: ''
36 | volume:
37 | - metric: lines
38 | count: 4836
39 | transcription-guidelines: >-
40 | Transcription diplomatique, sans normalisation, sans résolution d'abréviations
41 | ni corrections.
42 |
--------------------------------------------------------------------------------
/catalog/enc-cours-git/tnah-expouniv.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Projet Exposition universelle de 1878
3 | url: https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-Expositions_Universelles
4 | project-name: 'ENC - Bonnes pratiques du developpement collaboratif
5 |
6 | '
7 | authors:
8 | - name: Christensen
9 | surname: Kelly
10 | roles:
11 | - transcriber
12 | - name: Davoury
13 | surname: Baudoin
14 | roles:
15 | - transcriber
16 | - name: Anahi
17 | surname: Haedo
18 | roles:
19 | - transcriber
20 | - name: Kervegan
21 | surname: Paul
22 | roles:
23 | - transcriber
24 | - name: Sanchez-Oeconomo
25 | surname: Esteban
26 | roles:
27 | - transcriber
28 | description: "Le Congr\xE8s international des sciences ethnographiques de 1878 a eu\
29 | \ lieu \xE0 l\u2019occasion de l'Exposition universelle de 1878, \xE0 Paris. \xC9\
30 | dit\xE9 en 1881 par l'Imprimerie nationale, le compte rendu de ce congr\xE8s a \xE9\
31 | t\xE9 mis \xE0 disposition par le Conservatoire num\xE9rique des Arts et M\xE9tiers.\n"
32 | language:
33 | - fra
34 | script:
35 | - iso: Latn
36 | - iso: Grek
37 | - iso: Deva
38 | - iso: Arab
39 | script-type: only-typed
40 | time:
41 | notBefore: '1881'
42 | notAfter: '1881'
43 | hands:
44 | count: '1'
45 | precision: exact
46 | license:
47 | - name: CC-BY 4.0
48 | url: https://creativecommons.org/licenses/by/4.0/
49 | format: Alto-XML
50 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/TNAH-2021-Expositions_Universelles/main/CITATION.cff
51 | transcription-guidelines: "Diplomatique, mais pas allograph\xE9tique."
52 | volume:
53 | - metric: characters
54 | count: 155022
55 | - metric: files
56 | count: 56
57 | - metric: lines
58 | count: 2620
59 | - metric: regions
60 | count: 158
61 | production-software: "eScriptorium + Kraken"
62 |
--------------------------------------------------------------------------------
/catalog/distinguo/distinguo-GT-metadata.yml:
--------------------------------------------------------------------------------
1 |
2 |
3 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
4 | title: >-
5 | DISTINGUO : Ground truth for Handwritten Text Recognition (HTR) on Collections
6 | of Distinctions (late 13th to late 15th century)
7 | url: https://nakala.fr/10.34847/nkl.48ad8b8d
8 | authors:
9 | - name: Svetlana
10 | surname: Yatsyk
11 | orcid: 0000-0001-5356-7746
12 | roles:
13 | - transcriber
14 | - aligner
15 | institutions: []
16 | description: >-
17 | This dataset contains normalized transcriptions of collections of
18 | distinctions, specifically "Summa de abstinentia" by Nicolas of Biard and
19 | "Dictionarium bovis" by Thomas of Pavia. They were prepared as part of the
20 | DISTINGUO project, dedicated to the study of distinctiones in medieval Latin
21 | preaching and led by Marjorie Burghart in 2019-2024.
22 | project-website: https://distinguo.huma-num.fr/
23 | language:
24 | - lat
25 | production-software: eScriptorium + Kraken
26 | automatically-aligned: false
27 | script:
28 | - iso: Latn
29 | script-type: only-manuscript
30 | time:
31 | notBefore: '1250'
32 | notAfter: '1499'
33 | hands:
34 | count: 1-per-folder
35 | precision: estimated
36 | license:
37 | name: CC-BY 4.0
38 | url: https://creativecommons.org/licenses/by/4.0/
39 | format: Page-XML
40 | sources:
41 | - reference: >-
42 | Yatsyk, S. (2024). DISTINGUO : Ground truth for Handwritten Text
43 | Recognition (HTR) on Collections of Distinctions (late 13th to late 15th
44 | century) (Version 1) [Data set]. NAKALA - https://nakala.fr (Huma-Num -
45 | CNRS).
46 | link: https://doi.org/10.34847/NKL.48AD8B8D
47 | volume:
48 | - metric: lines
49 | count: 15190
50 | - metric: characters
51 | count: 682486
52 | - metric: regions
53 | count: 1076
54 | - metric: pages
55 | count: 318
56 |
57 |
--------------------------------------------------------------------------------
/catalog/inha/LesPapiersBarye.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Les Papiers Barye
3 | url: https://gitlab.inha.fr/snr/LesPapiersBarye
4 | authors:
5 | - name: Claass
6 | surname: Victor
7 | roles:
8 | - transcriber
9 | - project-manager
10 | - quality-control
11 | - name: Gain
12 | surname: Justine
13 | roles:
14 | - transcriber
15 | - quality-control
16 | - name: Martin-Vigier
17 | surname: Suzanne
18 | roles:
19 | - transcriber
20 | - quality-control
21 | institutions:
22 | - name: Institut National de l'histoire de l'art (INHA)
23 | roles:
24 | - transcriber
25 | - aligner
26 | - project-manager
27 | - quality-control
28 | - digitization
29 | description: >-
30 | Ensemble de documents autour du sculpteur Antoine-Louis Barye. Paris,
31 | Bibliothèque de l’Institut national d’histoire de l’art, collections Jacques
32 | Doucet, Archives 166. Institut National de l’Histoire de l’art (INHA) /
33 | Set of documents about the sculptor Antoine-Louis Barye. Paris,
34 | Library of the Institut national d'histoire de l'art, Jacques
35 | Doucet, Archives 166. National Institute of Art History (INHA)
36 | project-name: PENSE@INHA
37 | project-website: https://skylab.inha.fr/PENSE/LesPapiersBarye/
38 | language:
39 | - fra
40 | production-software: Transkribus
41 | script:
42 | - iso: Latn
43 | script-type: mainly-manuscript
44 | time:
45 | notBefore: '1819'
46 | notAfter: '1914'
47 | hands:
48 | count: more-than-10
49 | precision: exact
50 | license:
51 | - name: Etalab OL 2.0
52 | url: https://spdx.org/licenses/etalab-2.0.html
53 | format: Alto-XML
54 | volume:
55 | - metric: characters
56 | count: 362629
57 | - metric: lines
58 | count: 17880
59 | - metric: pages
60 | count: 918
61 | - metric: files
62 | count: 918
63 |
--------------------------------------------------------------------------------
/catalog/ajmc/ajmc-layout.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: 'GT4HistCommentLayout: Layout Ground Truth for Historical Commentaries'
3 | url: https://github.com/AjaxMultiCommentary/GT-commentaries-OLR
4 | authors:
5 | - name: Matteo
6 | surname: Romanello
7 | orcid: 0000-0002-7406-6286
8 | roles:
9 | - project-manager
10 | - name: Sven
11 | surname: Najem-Meyer
12 | orcid: 0000-0002-3661-4579
13 | roles:
14 | - transcriber
15 | - quality-control
16 | - name: Carla
17 | surname: Amaya
18 | roles:
19 | - transcriber
20 | description: 'This dataset contains layout annotations for ca. 370 pages sampled from
21 | 8 public domain classical commentaries, published in the 19th century in English,
22 | German and Latin. The commentaries concern Ancient Greek and Latin works from prose
23 | and poetry (caveat: Ancient Greek poetry is slightly over-represented). Pages were annotated
24 | according to a taxonomy mapped to the SegmOnto controlled vocabulary.'
25 | project-name: Ajax Multi-Commentary
26 | project-website: https://mromanello.github.io/ajax-multi-commentary/
27 | language:
28 | - eng
29 | - deu
30 | - lat
31 | - grc
32 | production-software: Kraken + VGG Image Annotator (VIA)
33 | script:
34 | - iso: Latn
35 | - iso: Grek
36 | script-type: only-typed
37 | time:
38 | notBefore: '1835'
39 | notAfter: '1903'
40 | hands:
41 | count: '1'
42 | precision: exact
43 | license:
44 | - name: CC-BY 4.0
45 | url: https://creativecommons.org/licenses/by/4.0/
46 | format: Alto-XML
47 | volume:
48 | - metric: characters
49 | count: 0
50 | - metric: files
51 | count: 371
52 | - metric: lines
53 | count: 0
54 | - metric: regions
55 | count: 2386
56 | transcription-guidelines: SegmOnto guidelines (v. 0.9)
57 | citation-file-link: https://github.com/AjaxMultiCommentary/GT-commentaries-layout/blob/master/CITATION.cff
58 | characters:
59 | mode: NFD
60 | members: []
61 |
--------------------------------------------------------------------------------
/catalog/enc-cours-git/tnah-decameronfr.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: DecameronFR
3 | url: https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-DecameronFR
4 | project-name: 'ENC - Bonnes pratiques du developpement collaboratif
5 |
6 | '
7 | authors:
8 | - name: Biay
9 | surname: "S\xE9bastien"
10 | roles:
11 | - transcriber
12 | - name: Cappe
13 | surname: "Zo\xE9"
14 | roles:
15 | - transcriber
16 | - name: Konstantinova
17 | surname: Kristina
18 | roles:
19 | - transcriber
20 | - name: Boby
21 | surname: Victor
22 | roles:
23 | - transcriber
24 | - aligner
25 | description: "Le projet vise \xE0 la constitution de v\xE9rit\xE9s de terrain pour\
26 | \ l\u2019entra\xEEnement de mod\xE8les HTR \xE0 partir d'un manuscrit fran\xE7ais\
27 | \ des ann\xE9es 1430-1455 : le manuscrit 5070 de la Biblioth\xE8que de l'Arsenal\
28 | \ (reproduit sur Gallica). Ce manuscrit contient la traduction fran\xE7aise du Decameron\
29 | \ de Boccace par Laurent de Premierfait. Nos v\xE9rit\xE9s de terrain recouvrent\
30 | \ la description de la peste \xE0 Florence situ\xE9e dans le prologue de l'ouvrage.\n"
31 | language:
32 | - frm
33 | script:
34 | - iso: Latn
35 | script-type: only-manuscript
36 | time:
37 | notBefore: '1430'
38 | notAfter: '1455'
39 | hands:
40 | count: '1'
41 | precision: exact
42 | license:
43 | - name: CC-BY 4.0
44 | url: https://creativecommons.org/licenses/by/4.0/
45 | format: Alto-XML
46 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/TNAH-2021-DecameronFR/main/CITATION.cff
47 | transcription-guidelines: Cf. https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-DecameronFR/blob/main/normesTranscription.md
48 | volume:
49 | - metric: characters
50 | count: 19821
51 | - metric: files
52 | count: 9
53 | - metric: lines
54 | count: 751
55 | - metric: regions
56 | count: 41
57 | production-software: "eScriptorium + Kraken"
58 |
--------------------------------------------------------------------------------
/catalog/naval-kishore/naval-kishore.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Ground truth data for printed Devanagari
3 | url: https://doi.org/10.11588/data/EGOKEI
4 | authors:
5 | - name: Nicole
6 | surname: Merkel-Hilf
7 | orcid: 0000-0002-0344-6169
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - name: Daria
12 | surname: Peshcherova
13 | roles:
14 | - support
15 | institutions:
16 | - name: Heidelberg University Library
17 | description: >-
18 | Ground truth (GT) data (jpg and alto xml files) for an OCR model that
19 | recognizes printed text in Devanagari script.
20 |
21 |
22 | The GT data was trained on Transkribus with the HTR+ engine. The training was
23 | performed on appr. 220 pages with appr. 27,000 words. The validation set was
24 | 10% of the training set.
25 |
26 |
27 | The training material is comprised of letterpress printings from the Naval
28 | Kishore Press (Lakhnau, North India) from the late 19th and early 20th century
29 | in the Hindi, Sanskrit, Braj Bhasha and Awadhi languages.
30 |
31 |
32 | Transcription was performed by Nicole Merkel-Hilf (CATS Library / Heidelberg
33 | University Library) with support by Daria Peshcherova (CATS Library /
34 | Heidelberg University Library).
35 | project-name: Naval Kishore Press - digital
36 | project-website: https://digi.ub.uni-heidelberg.de/en/sammlungen/suedasien/navalkishore.html
37 | language:
38 | - hin
39 | - san
40 | - bra
41 | production-software: Transkribus
42 | script:
43 | - iso: Deva
44 | script-type: only-typed
45 | time:
46 | notBefore: '1880'
47 | notAfter: '1953'
48 | hands:
49 | count: less-than-11
50 | precision: exact
51 | license:
52 | - name: CC-BY 4.0
53 | url: https://creativecommons.org/licenses/by/4.0/
54 | format: Alto-XML
55 | volume:
56 | - metric: lines
57 | count: 4333
58 | transcription-guidelines: Diplomatic transcription, no correction of misspellings
59 |
--------------------------------------------------------------------------------
/catalog/burchards-dekret-digital/bdd-segmentation-data.yml:
--------------------------------------------------------------------------------
1 |
2 |
3 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
4 | title: Burchards Dekret Digital (BDD) Segmentation Data
5 | url: https://github.com/michaelscho/bdd-segmentation-data
6 | authors:
7 | - name: Michael
8 | surname: Schonhardt
9 | orcid: 0000-0002-2750-1900
10 | roles:
11 | - aligner
12 | - project-manager
13 | - quality-control
14 | - name: Leo
15 | surname: Felder
16 | orcid: 0009-0008-7230-4229
17 | roles:
18 | - support
19 | - name: Torben
20 | surname: Jordan
21 | orcid: 0009-0002-2143-0520
22 | roles:
23 | - support
24 | - name: Christopher
25 | surname: Oed
26 | orcid: 0009-0001-3910-1832
27 | roles:
28 | - support
29 | institutions: []
30 | description: >-
31 | This dataset comprises PageXML for training segmentation models in Transkribus
32 | and Kraken. It is designed to capture the specific layout of medieval canon
33 | law collections. Compiled from several 11th-century manuscripts of the
34 | Decretum Burchardi, it supports the ongoing edition project Burchards Dekret
35 | Digital. Annotations are tailored to project-specific needs but can be adapted
36 | for other use cases. The data was first prepared using Transkribus and then
37 | remasked in eScriptorium for usage in Kraken.
38 | project-name: Burchards Dekret Digital
39 | project-website: https://www.adwmainz.de/projekte/burchards-dekret-digital/informationen.html
40 | language:
41 | - lat
42 | production-software: eScriptorium + Kraken + Transkribus
43 | automatically-aligned: false
44 | script:
45 | - iso: Latn
46 | script-type: only-manuscript
47 | time:
48 | notBefore: '1000'
49 | notAfter: '1199'
50 | hands:
51 | count: unknown
52 | precision: exact
53 | license:
54 | name: CC-BY 4.0
55 | url: https://creativecommons.org/licenses/by/4.0/
56 | format: Page-XML
57 | volume:
58 | - metric: pages
59 | count: 3000
60 |
61 |
--------------------------------------------------------------------------------
/catalog/tarima/tarima.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: TariMa
3 | url: https://github.com/calfa-co/tarima
4 | authors:
5 | - name: Antoine
6 | surname: Perrier
7 | orcid: 0000-0002-5035-4283
8 | roles:
9 | - project-manager
10 | - name: Chahan
11 | surname: Vidal-Gorène
12 | orcid: 0000-0003-1567-6508
13 | roles:
14 | - project-manager
15 | institutions:
16 | - name: BULAC
17 | roles:
18 | - project-manager
19 | - name: Calfa
20 | roles:
21 | - project-manager
22 | - transcriber
23 | description: >-
24 | The dataset has been collated within the frame of the TariMa project (Tarih
25 | al-Maghrib. Writing History in the Maghreb in the modern and contemporary
26 | era), sponsored by the French agency Collex-Persee and led by Antoine
27 | Perrier (CNRS). It comprises different image resolutions and sizes (width from
28 | 982px to 8049px), different layouts (double page, multiple columns), and states
29 | of conservation. It also mixes microfilms, scans and lithographs. It
30 | presents a very wide variety representative of the Maghrebi Arabic production.
31 | project-website: https://www.collexpersee.eu/projet/tarima/
32 | language:
33 | - ara
34 | production-software: Calfa Vision
35 | script:
36 | - iso: Arab
37 | qualify: Maghrebi
38 | script-type: mainly-manuscript
39 | time:
40 | notBefore: '1500'
41 | notAfter: '1899'
42 | hands:
43 | count: more-than-10
44 | precision: estimated
45 | license:
46 | - name: CC-BY 4.0
47 | url: https://creativecommons.org/licenses/by/4.0/
48 | format: Page-XML
49 | sources:
50 | - reference: ''
51 | link: https://github.com/calfa-co/tarima
52 | volume:
53 | - metric: files
54 | count: 120
55 | - metric: lines
56 | count: 2673
57 | - metric: characters
58 | count: 146667
59 | transcription-guidelines: >-
60 | We follow the RASAM guidelines for the transcription of Arabic Maghrebi
61 | manuscripts.
62 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/add-a-new-dataset-description.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Add a new dataset description
3 | about: Template to add the description of a new dataset
4 | title: "[catalog] New repo {project-name/dataset-name}"
5 | labels: project
6 | assignees: ''
7 |
8 | ---
9 | ## Autonomy
10 |
11 | Check applicable situation:
12 |
13 | - [ ] I know how to make a Pull Request and will create the corresponding directory and files under "[htr-united/catalog/](https://github.com/HTR-United/htr-united/tree/master/catalog)"
14 | - [ ] I don't know how to do a Pull Request, I need assistance to add the description of my dataset under "[htr-united/catalog/](https://github.com/HTR-United/htr-united/tree/master/catalog)"
15 |
16 |
17 | ## Description of the dataset
18 |
19 | ### Checklist
20 | - [ ] name of the corpus is explicitly stated
21 | - [ ] name of the project is explicitly stated
22 | - [ ] authors and roles are explicitly stated
23 | - [ ] a license is associated with the dataset
24 | - [ ] the dataset is described in a clear and explicit way enabling other users to understand its content and context of creation
25 | - [ ] the dataset uses standard formats such as PAGE XML or ALTO XML and is aligned with images
26 |
27 | ### Relevant information
28 |
29 | - name of the corpus[1](#fn1):
30 | - name of the project[2](#fn2):
31 | - description generated with [our form](https://htr-united.github.io/document-your-data-en.html):
32 | ```
33 | [paste description here]
34 | ```
35 |
36 | ---
37 |
38 | 1: This name will be used to create a YAML file dedicated to this dataset. *For example: if your dataset is called "My Awesome Dataset", its description will be saved under "my-awesome-dataset.yml"*
39 |
40 | 2: This name will be used to create a folder under "catalog/" containing all the datasets related to your project. *For example: if your project is called "My Awesome Project", the YAML file(s) describing your datasets will be saved under "catalog/my-awesome-project/"*
41 |
--------------------------------------------------------------------------------
/catalog/bullinger/gwalther-htr.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Gwalther Handwriting Ground Truth
3 | url: https://zenodo.org/record/4780947#.YhN5pVvMLUQ
4 | project-name: 'Bullinger digital
5 |
6 | '
7 | project-website: https://www.bullinger-digital.ch/
8 | authors:
9 | - name: "Str\xF6bel"
10 | surname: Phillip Benjamin
11 | roles:
12 | - aligner
13 | - quality-control
14 | - support
15 | - name: Stotz
16 | surname: Peter
17 | roles:
18 | - transcriber
19 | description: "This is ground truth for Rudolph Gwalther\u2019s (1519-1586) handwriting\
20 | \ taken from his book \"Lateinische Gedichte\", where he accumulated writings\
21 | \ between 1540 and 1580. Data collection and ground truth creation: At the time\
22 | \ we collected the data, we found 150 images with corresponding transcriptions by\
23 | \ Peter Stotz on e-manuscripta (reference: Gwalther, Rudolf: Lateinische Gedichte.\
24 | \ Z\xFCrich, 1540-1580. Zentralbibliothek Z\xFCrich, Ms D 152, https://doi.org/10.7891/e-manuscripta-26750\
25 | \ / Public Domain Mark) . We removed 8 images with too many corrections or vertical\
26 | \ texts. Next, we uploaded the images into the Transkribus platform, applied the\
27 | \ line recognition tool and manually copied the transcribed text lines into the\
28 | \ recognised line boxes. During this process, we made some corrections, which were\
29 | \ mainly due to inconsistencies in punctuation and capitalised letters.\n"
30 | language:
31 | - lat
32 | script:
33 | - iso: Latn
34 | script-type: only-manuscript
35 | time:
36 | notBefore: '1540'
37 | notAfter: '1580'
38 | hands:
39 | count: '1'
40 | precision: exact
41 | license:
42 | name: CC-BY 4.0
43 | url: https://creativecommons.org/licenses/by/4.0/
44 | format: Alto-XML
45 | volume:
46 | - count: 4040
47 | metric: lines
48 | - count: 142
49 | metric: files
50 | - count: 155
51 | metric: regions
52 | - count: 144301
53 | metric: characters
54 | production-software: Transkribus
55 |
--------------------------------------------------------------------------------
/catalog/fondue/fondue-gasparosarditoponomasia-dataset.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: FoNDUE-GasparoSardiToponomasia-Dataset
3 | url: https://github.com/PaulineJac/GasparoSardiToponomasia/tree/main/HTR
4 | authors:
5 | - name: Jacsont
6 | surname: Pauline
7 | roles:
8 | - transcriber
9 | - quality-control
10 | - digitization
11 | - name: Mittenhuber
12 | surname: Florian
13 | institutions: []
14 | description: >-
15 | Dataset produced for the project to edit Gasparo Sardi’s Toponomasia from
16 | codex 174 of the Burgerbibliothek of Bern. Images are available on request by writing to: pauline.jacsont [ at ] unige.ch.
17 | project-name: FoNDUE
18 | language:
19 | - lat
20 | production-software: eScriptorium + Kraken
21 | script:
22 | - iso: Latn
23 | - iso: Grek
24 | script-type: only-manuscript
25 | time:
26 | notBefore: '1561'
27 | notAfter: '1570'
28 | hands:
29 | count: '1'
30 | precision: exact
31 | license:
32 | - name: CC-BY 4.0
33 | url: https://creativecommons.org/licenses/by/4.0/
34 | format: Alto-XML
35 | sources:
36 | - reference: ''
37 | link: http://katalog.burgerbib.ch/detail.aspx?ID=340662
38 | volume:
39 | - metric: pages
40 | count: 49
41 | citation-file-link: >-
42 | https://github.com/PaulineJac/GasparoSardiToponomasia/blob/main/HTR/CITATION.cff
43 | transcription-guidelines: ' The transcriptions were made following the rules of the github cremma-medieval repository - https://github.com/HTR-United/cremma-medieval. The transcription is strictly diplomatic and graphematic. No abbreviations are resolved, no standardization of ''i'' and ''v'' with ramist letters, and accents, punctuation, spaces, and line breaks are strictly adhered to. Following Leiden conventions, crossed out elements are transcribed with double brackets ⟦⟧, and elements that are illegible in the picture will not be restored but indicated by this type of bracket ⟨ ⟩. Special characters are encoded according to the MUFI fonts.'
44 |
--------------------------------------------------------------------------------
/catalog/transcriboquest-2025/transcriboquest-2025-medieval-latin.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: 'TranscriboQuest 2025: Medieval Latin'
3 | url: https://www.doi.org/10.5281/zenodo.17062009
4 | authors:
5 | - name: Boutreux
6 | surname: Agnès
7 | roles:
8 | - transcriber
9 | - name: Chevalier
10 | surname: Romain
11 | roles:
12 | - transcriber
13 | - name: Corongiu
14 | surname: Chiara
15 | roles:
16 | - transcriber
17 | - name: Gaucher
18 | surname: Sarah
19 | orcid: 0000-0002-1605-3583
20 | roles:
21 | - transcriber
22 | - name: Guéville
23 | surname: Estelle
24 | orcid: 0000-0003-2603-1051
25 | roles:
26 | - transcriber
27 | - name: Kienzl
28 | surname: Annabelle
29 | roles:
30 | - transcriber
31 | - name: Maliszewski
32 | surname: Jan
33 | roles:
34 | - transcriber
35 | - name: Gille Levenson
36 | surname: Matthias
37 | orcid: 0000-0001-9488-5986
38 | roles:
39 | - project-manager
40 | - support
41 | - quality-control
42 | description: Dataset from TranscriboQuest 2025, Medieval Latin group. This dataset focuses on layout. All manuscripts are glossed Latin manuscripts with complex layouts. The dataset contains 5000 typed lines, 700 of which have been transcribed.
43 | language:
44 | - lat
45 | production-software: eScriptorium + Kraken
46 | automatically-aligned: false
47 | script:
48 | - iso: Latn
49 | script-type: only-manuscript
50 | time:
51 | notBefore: '800'
52 | notAfter: '1499'
53 | hands:
54 | count: 'more-than-10'
55 | precision: estimated
56 | license:
57 | name: CC-BY-NC-SA 4.0
58 | url: https://creativecommons.org/licenses/by-sa/4.0/
59 | format: Alto-XML
60 | volume:
61 | - metric: files
62 | count: 37
63 | - metric: lines
64 | count: 5060
65 | - metric: regions
66 | count: 358
67 | transcription-guidelines: |-
68 | transcription — https://catmus-guidelines.github.io/
69 | segmentation — https://segmonto.github.io/
70 |
--------------------------------------------------------------------------------
/catalog/banq/copiste-d-un-jour.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: Copiste d’un jour
3 | url: https://github.com/banq-dcn/Copiste-d-un-jour
4 | authors:
5 | - name: Adèle
6 | surname: Aubin
7 | orcid: 0009-0009-3756-1606
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - name: Pascale
12 | surname: Montmartin
13 | orcid: 0009-0002-5683-2423
14 | roles:
15 | - project-manager
16 | institutions:
17 | - name: BAnQ
18 | description: >-
19 | This project draws inspiration from the CREMMA WIKIPEDIA data set, with the
20 | objective to create a ground truth repository of contemporary Québécois
21 | handwriting to train HTR models. It is based on a collection of randomly
22 | selected Wikipedia summaries. Each text comprises between 125 and 175 words
23 | and was copied by hand by volunteers. The texts were ordered in a way to
24 | prioritize texts that presented rare character 1- and 2-grams. Non-French
25 | characters were replaced with "-". In general, the copy of one text took
26 | between 1 and 2 pages. In total, 267 volunteers copied 265 texts (2 texts were
27 | unfortunately copied twice by two different volunteers). We took care of the
28 | alignment between the handwritten portion and the original text.
29 | project-name: Copiste d'un jour
30 | language:
31 | - fra
32 | production-software: eScriptorium + Kraken
33 | automatically-aligned: false
34 | script:
35 | - iso: Latn
36 | script-type: only-manuscript
37 | time:
38 | notBefore: '2024'
39 | notAfter: '2024'
40 | hands:
41 | count: 1-per-file
42 | precision: estimated
43 | license:
44 | name: CC-BY 4.0
45 | url: https://creativecommons.org/licenses/by/4.0/
46 | format: Alto-XML
47 | volume:
48 | - metric: files
49 | count: 333
50 | - metric: pages
51 | count: 333
52 | - metric: characters
53 | count: 316715
54 | - metric: lines
55 | count: 6989
56 | transcription-guidelines: https://gist.github.com/alix-tz/6f89444521bf1cab0522da520f7e4ff4
57 |
--------------------------------------------------------------------------------
/catalog/stabs-urfehdebuch/urfehdebuch-htr.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: "Handwritten Text Recognition Ground Truth Set: StABS Ratsb\xFCcher O10, Urfehdenbuch\
3 | \ X"
4 | url: https://doi.org/10.5281/zenodo.5153263
5 | authors:
6 | - name: Susanna
7 | surname: Burghartz
8 | roles:
9 | - project-manager
10 | - name: Calvi
11 | surname: Sonia
12 | roles:
13 | - project-manager
14 | - quality-control
15 | - name: Vogeler
16 | surname: Georg
17 | roles:
18 | - project-manager
19 | - name: Baur
20 | surname: Laila
21 | roles:
22 | - transcriber
23 | - name: Egli
24 | surname: Benedikt
25 | roles:
26 | - transcriber
27 | - name: Gehrig
28 | surname: Gabriela
29 | roles:
30 | - transcriber
31 | - name: Heini
32 | surname: Alexandra Isabelle
33 | roles:
34 | - transcriber
35 | - name: Rossi
36 | surname: Rosanna
37 | roles:
38 | - transcriber
39 | - name: Siegrist
40 | surname: Benjamin
41 | roles:
42 | - transcriber
43 | - name: Wasmer
44 | surname: Remo
45 | roles:
46 | - transcriber
47 | - name: Zimmermann
48 | surname: Lynn
49 | roles:
50 | - transcriber
51 | - name: Schoch
52 | surname: David
53 | roles:
54 | - aligner
55 | - name: "D\xE4ngeli"
56 | surname: Peter
57 | roles:
58 | - digitization
59 | - name: Hodel
60 | surname: Tobias
61 | roles:
62 | - project-manager
63 | - aligner
64 | description: Ground Truth for "Urfehdenbuch X der Stadt Basel (1563-1569)" at Staatsarchiv
65 | Basel-Stadt (StABS).
66 | project-website: hdl:11471/1010.2.1
67 | language:
68 | - deu
69 | script:
70 | - iso: Latn
71 | script-type: only-manuscript
72 | time:
73 | notBefore: '1563'
74 | notAfter: '1569'
75 | hands:
76 | count: unknown
77 | precision: estimated
78 | license:
79 | - name: CC-BY-SA 4.0
80 | url: https://creativecommons.org/licenses/by-sa/4.0/
81 | format: Page-XML
82 | volume:
83 | - metric: lines
84 | count: 8000
85 | transcription-guidelines: 'See: http://gams.uni-graz.at/o:ufbas.1563'
86 | production-software: Transkribus
87 |
--------------------------------------------------------------------------------
/catalog/impresso/nzz-ocr.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: "Ground truth for Neue Z\xFCrcher Zeitung black letter period"
3 | url: https://zenodo.org/record/3333627#.YhN1G1vMLUQ
4 | project-name: 'impresso''
5 |
6 | '
7 | project-website: https://impresso-project.ch/
8 | authors:
9 | - name: "Str\xF6bel"
10 | surname: Phillip Benjamin
11 | roles:
12 | - transcriber
13 | - aligner
14 | - project-manager
15 | - quality-control
16 | - support
17 | - name: Clematide
18 | surname: Simon
19 | roles:
20 | - transcriber
21 | - quality-control
22 | - name: Watter
23 | surname: Camille
24 | roles:
25 | - transcriber
26 | - name: Meraner
27 | surname: Isabell
28 | roles:
29 | - transcriber
30 | description: "The Neue Z\xFCrcher Zeitung (NZZ) has been publishing in black letter\
31 | \ from its very first issue in 1780 until 1947. From this time period, we randomly\
32 | \ sampled one frontpage per year, resulting in a total of 167 pages. We chose frontpages\
33 | \ because they typically contain highly relevant material and because we want to\
34 | \ make sure not to sample pages containing exclusively advertisements or stock information.\
35 | \ During certain periods, the NZZ was published several times a day, and there were\
36 | \ supplements, too. Due to incomplete metadata, the sampling included frontpages\
37 | \ from supplements. We then manually corrected the pages, so it can be used as a\
38 | \ ground truth to improve the OCR of black letter in historical newspapers.\n"
39 | language:
40 | - deu
41 | script:
42 | - iso: Latn
43 | script-type: only-typed
44 | time:
45 | notBefore: '1780'
46 | notAfter: '1946'
47 | hands:
48 | count: less-than-11
49 | precision: estimated
50 | license:
51 | - name: CC-BY 4.0
52 | url: https://creativecommons.org/licenses/by/4.0/
53 | format: Alto-XML
54 | volume:
55 | - count: 43173
56 | metric: lines
57 | - count: 167
58 | metric: files
59 | - count: 6318
60 | metric: regions
61 | - count: 1768146
62 | metric: characters
63 | production-software: Transkribus
64 |
--------------------------------------------------------------------------------
/catalog/alix-tz/peraire-ground-truth.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Peraire Ground Truth
3 | url: https://github.com/alix-tz/peraire-ground-truth
4 | authors:
5 | - name: Alix
6 | surname: Chagué
7 | orcid: 0000-0002-0136-4434
8 | roles:
9 | - transcriber
10 | - quality-control
11 | institutions:
12 | - name: Bibliothèque Sébert, Espéranto-France, Paris
13 | roles:
14 | - digitization
15 | description: >-
16 | This dataset was created in order to produce an HTR model for the Digital
17 | Peraire project. The documents are handwritten, dating from the second half of
18 | the 20th century, written by Lucien Péraire in French with a blue ink pen or,
19 | more frequently, with a blue pencil.
20 | project-name: Digital Peraire
21 | language:
22 | - fra
23 | production-software: eScriptorium + Kraken
24 | script:
25 | - iso: Latn
26 | script-type: only-manuscript
27 | time:
28 | notBefore: '1928'
29 | notAfter: '1971'
30 | hands:
31 | count: '1'
32 | precision: exact
33 | license:
34 | - name: CC-BY 4.0
35 | url: https://creativecommons.org/licenses/by/4.0/
36 | format: Alto-XML
37 | volume:
38 | - metric: characters
39 | count: 38793
40 | - metric: files
41 | count: 33
42 | - metric: lines
43 | count: 1059
44 | - metric: regions
45 | count: 80
46 | citation-file-link: https://github.com/alix-tz/peraire-ground-truth/blob/master/CITATION.cff
47 | transcription-guidelines: >-
48 | The transcription respects what is written on the document, including
49 | punctuation and spelling errors. The case is respected: capital letters are
50 | transcribed with capital letters. Crossed out words are signaled by # which
51 | isn't used to transcribe anything else. The SegmOnto ontology was used for the
52 | segmentation of this dataset. For regions, MainZone and MarginTextZone were
53 | used. For lines, DefaultLine and InterlinearLine were used. The original
54 | documents are held at the Bibliothèque Sébert, Espéranto-France, Paris. They
55 | should be mentioned every time the images are used.
56 |
--------------------------------------------------------------------------------
/catalog/TranscriboQuest_Arabic/htr-united.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: TranscriboQuest_Arabic_team
3 | url: https://doi.org/10.5281/zenodo.13757236
4 | authors:
5 | - name: Ephrem Aboud
6 | surname: Ishac
7 | orcid: 0000-0003-2943-6556
8 | roles:
9 | - transcriber
10 | - aligner
11 | - quality-control
12 | - name: Enki
13 | surname: Baptiste
14 | orcid: 0009-0004-3456-9796
15 | roles:
16 | - transcriber
17 | - aligner
18 | - quality-control
19 | institutions: []
20 | description: 'Dataset on an Arabic corpus of Christian-Islamic theology. '
21 | project-name: TranscriboQuest 2024
22 | language:
23 | - ara
24 | production-software: eScriptorium + Kraken
25 | automatically-aligned: false
26 | script:
27 | - iso: Arab
28 | script-type: only-manuscript
29 | time:
30 | notBefore: '1200'
31 | notAfter: '1600'
32 | hands:
33 | count: 1-per-folder
34 | precision: estimated
35 | license:
36 | name: CC-BY-SA 4.0
37 | url: https://creativecommons.org/licenses/by-sa/4.0/
38 | format: Alto-XML
39 | volume:
40 | - metric: lines
41 | count: 153
42 | transcription-guidelines: >-
43 | ▶ Data format: XML ALTO
44 |
45 | ▶ Number of transcribed lines: 153
46 |
47 | ▶ author/creator/curator of the dataset: Enki Baptiste and Ephrem Aboud Ishac
48 |
49 | ▶ Segmentation tools, HTR engine and interface: OpenITI model
50 | (https://github.com/OpenITI/acdc_results/blob/main/models/gen2-print-n7m5-union-ft_best.mlmodel);
51 | eScriptorium; Kraken
52 |
53 | ▶ Language of the corpus, Date: Arabic, end of the 16th century
54 |
55 | ▶ Type, support of documents, script: paper; mashriqi naskh
56 |
57 | ▶ Transcription method: diplomatic transcription respecting the tanwin, the
58 | shadda and the diacritic marks.
59 |
60 | ▶ Theme, collection, object of the dataset: theology; Maktabat al-Sālimī,
61 | Bidiyya, Oman, ms. AS 250 4v-5f
62 | (https://elibrary.mara.gov.om/en/omani-library/imam-nour-al-din-al-salmi-s-library/book/?id=324#book/7);
63 | St Mark Monastery, Jerusalem, SMMJ 00264 2v-5r
64 |
--------------------------------------------------------------------------------
/catalog/ifloral/ifloral-dataset.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Alexandre
3 | orcid: 0009-0007-4781-3294
4 | roles:
5 | - aligner
6 | - quality-control
7 | surname: Matos
8 | - name: Rui
9 | orcid: 0000-0001-5767-1583
10 | roles:
11 | - transcriber
12 | surname: Neves
13 | - name: Gonçalo
14 | roles:
15 | - transcriber
16 | surname: Monteiro
17 | - name: Catarina
18 | roles:
19 | - transcriber
20 | surname: Coelho
21 | - name: Pedro
22 | orcid: 0009-0004-9005-6688
23 | roles:
24 | - aligner
25 | surname: Bastos
26 | automatically-aligned: false
27 | description: >-
28 | This dataset was designed for training machine learning models in the context
29 | of the [iForal project](https://iforal.hypotheses.org/), which focuses on
30 | transcribing medieval Portuguese texts, specifically forais (charters). It
31 | includes images of medieval manuscripts, along with corresponding line-level
32 | transcription labels, to facilitate the development of models capable of
33 | recognizing and transcribing historical handwriting.
34 |
35 | The dataset is ideal for OCR/HTR tasks and segmentation tasks within the
36 | domain of medieval document transcription. It serves as a critical resource
37 | for advancing automated transcription tools for medieval texts, making
38 | historical archives more accessible.
39 | format: Page-XML
40 | hands:
41 | count: unknown
42 | precision: exact
43 | institutions: []
44 | language:
45 | - lat
46 | - por
47 | license:
48 | name: CC-BY 4.0
49 | url: https://creativecommons.org/licenses/by/4.0/
50 | production-software: eScriptorium + Kraken
51 | project-name: iForal
52 | project-website: https://iforal.hypotheses.org/
53 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
54 | script:
55 | - iso: Latn
56 | script-type: only-manuscript
57 | time:
58 | notAfter: '1491'
59 | notBefore: '1217'
60 | title: iForal-Dataset
61 | url: https://github.com/Arch-W/iForal-Dataset
62 | volume:
63 | - count: 776873
64 | metric: characters
65 | - count: 180
66 | metric: files
67 | - count: 8009
68 | metric: lines
69 | - count: 183
70 | metric: regions
71 |
--------------------------------------------------------------------------------
/catalog/cremma/mss-20.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: CREMMA Manuscrits du 20e
3 | url: https://github.com/HTR-United/CREMMA-MSS-20
4 | project-name: CREMMA
5 | authors:
6 | - name: "Cl\xE9rice"
7 | surname: Thibault
8 | roles:
9 | - project-manager
10 | - quality-control
11 | - name: "Chagu\xE9"
12 | surname: Alix
13 | roles:
14 | - project-manager
15 | - quality-control
16 | description: 'Manuscripts of the 20th century
17 |
18 | '
19 | language:
20 | - fra
21 | script:
22 | - iso: Latn
23 | script-type: only-manuscript
24 | time:
25 | notBefore: '1900'
26 | notAfter: '1999'
27 | hands:
28 | count: 1-per-folder
29 | precision: exact
30 | license:
31 | - name: CC-BY 4.0
32 | url: https://creativecommons.org/licenses/by/4.0/
33 | format: Alto-XML
34 | volume:
35 | - metric: characters
36 | count: 5764
37 | - metric: files
38 | count: 13
39 | - metric: lines
40 | count: 224
41 | - metric: regions
42 | count: 25
43 | transcription-guidelines: "Abr\xE9viations conserv\xE9es."
44 | production-software: eScriptorium + Kraken
45 | characters:
46 | mode: NFKD
47 | members:
48 | - e
49 | - a
50 | - s
51 | - n
52 | - t
53 | - r
54 | - i
55 | - u
56 | - l
57 | - o
58 | - d
59 | - c
60 | - m
61 | - p
62 | - "\u0301"
63 | - <
64 | - '>'
65 | - ''''
66 | - v
67 | - q
68 | - ','
69 | - .
70 | - "\u0300"
71 | - b
72 | - g
73 | - h
74 | - j
75 | - f
76 | - F
77 | - J
78 | - '1'
79 | - '-'
80 | - "\u0302"
81 | - M
82 | - A
83 | - E
84 | - x
85 | - T
86 | - y
87 | - C
88 | - D
89 | - ^
90 | - O
91 | - '8'
92 | - N
93 | - '7'
94 | - B
95 | - S
96 | - '0'
97 | - "\u0327"
98 | - P
99 | - G
100 | - R
101 | - H
102 | - L
103 | - '9'
104 | - z
105 | - I
106 | - '2'
107 | - ':'
108 | - U
109 | - '&'
110 | - k
111 | - +
112 | - ;
113 | - $
114 | - V
115 | - "\u0153"
116 | - '['
117 | - '?'
118 | - ']'
119 | - '4'
120 | - '3'
121 | - (
122 | - )
123 | - '6'
124 |
--------------------------------------------------------------------------------
/catalog/teklia/belfort.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: Belfort
3 | url: https://zenodo.org/record/8041668
4 | authors:
5 | - name: Solène
6 | surname: Tarride
7 | orcid: 0000-0001-6174-9865
8 | - name: Tristan
9 | surname: Faine
10 | - name: Mélodie
11 | surname: Boillet
12 | orcid: 0000-0002-0618-7852
13 | - name: Harold
14 | surname: Mouchère
15 | orcid: 0000-0001-6220-7216
16 | - name: Christopher
17 | surname: Kermorvant
18 | orcid: 0000-0002-7508-4080
19 | institutions: []
20 | description: >
21 | This dataset includes minutes of Belfort municipal council drawn up between
22 | 1790 and 1946. Documents include deliberations, lists of councillors,
23 | convocations, and agendas. The dataset includes 24,105 text-line images that
24 | were automatically detected from pages.
25 |
26 | Up to four transcriptions are available for each line image:
27 |
28 | * two from human annotators (in `Transcriptions/callico_1/` and
29 | `Transcriptions/callico_2/`)
30 |
31 | * two from automatic models (in `Transcriptions/dan/` and
32 | `Transcriptions/pylaia/`)
33 | project-name: Handwritten Text Recognition from Crowdsourced Annotations
34 | project-website: https://arxiv.org/abs/2306.10878
35 | language:
36 | - fra
37 | production-software: Callico
38 | script:
39 | - iso: Latn
40 | script-type: only-manuscript
41 | time:
42 | notBefore: '1790'
43 | notAfter: '1946'
44 | hands:
45 | count: more-than-10
46 | precision: estimated
47 | license:
48 | name: CC-BY 4.0
49 | url: https://creativecommons.org/licenses/by/4.0/
50 | format: Image-Text-Pairs
51 | sources:
52 | - reference: >-
53 | Solène Tarride, Tristan Faine, Mélodie Boillet, Harold Mouchère, &
54 | Christopher Kermorvant. (2023). The Belfort dataset: Handwritten Text
55 | Recognition from Crowdsourced Annotations [Data set]. 7th International
56 | Workshop on Historical Document Imaging and Processing (HIP'23), San
57 | José, California, USA. Zenodo. https://doi.org/10.5281/zenodo.8041668
58 | link: https://arxiv.org/abs/2306.10878
59 | volume:
60 | - metric: lines
61 | count: 24105
62 |
--------------------------------------------------------------------------------
/catalog/incunabula-reichenau/incunabula-reichenau.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: Incunabula Reichenau
3 | url: https://doi.org/10.5281/zenodo.11046061
4 | authors:
5 | - name: Annika
6 | surname: Stello
7 | orcid: 0000-0002-6305-4810
8 | roles:
9 | - project-manager
10 | - name: Gerit
11 | surname: Heim
12 | orcid: 0000-0002-5820-7771
13 | roles:
14 | - project-manager
15 | - name: Katharina
16 | surname: Ost
17 | orcid: 0000-0002-6234-9721
18 | roles:
19 | - transcriber
20 | institutions: []
21 | description: >-
22 | This data set contains the training data for the following three published
23 | Transkribus models:
24 |
25 | German Incunabula (Reichenau)
26 | Latin Incunabula (Reichenau)
27 | Latin/German Bilingual Incunabula (Reichenau)
28 |
29 | This data set represents an excerpt of a collection of incunabula and post-incunabula
30 | of the former Reichenau monastery, now held at the Badische Landesbibliothek in
31 | Karlsruhe (see https://digital.blb-karlsruhe.de/topic/view/7530707). As, typically,
32 | 1-20 pages were drawn from single prints, it reflects a wide range of typefaces used
33 | by early printers from the German language area and Northern Italy.
34 |
35 | The data was created as part of the project Digitalisierung und Volltexterkennung
36 | der ehemals Reichenauer Inkunabeln at the Badische Landesbibliothek, which was
37 | funded by the Stiftung Kulturgut Baden-Württemberg.
38 | project-name: Digitalisierung und Volltexterkennung der ehemals Reichenauer Inkunabeln
39 | language:
40 | - lat
41 | - deu
42 | production-software: Transkribus
43 | automatically-aligned: false
44 | script:
45 | - iso: Latn
46 | - iso: Latf
47 | script-type: only-typed
48 | time:
49 | notBefore: '1470'
50 | notAfter: '1510'
51 | hands:
52 | count: more-than-10
53 | precision: exact
54 | license:
55 | name: CC-BY-SA 4.0
56 | url: https://creativecommons.org/licenses/by-sa/4.0/
57 | format: Page-XML
58 | volume:
59 | - metric: pages
60 | count: 2200
61 | transcription-guidelines: Abbreviations are represented through special characters, please see the project repository for a full documentation.
62 |
--------------------------------------------------------------------------------
/catalog/htr-school-vienna/wien-onb-cod-2160-f-164-184-ground-truth-from-htr-winter-school-2022.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Wien ÖNB Cod. 2160 f. 164-184 Ground Truth from HTR Winter School 2022
3 | url: https://zenodo.org/record/7467027#.Y6LRj3bMK3B
4 | authors:
5 | - name: Geelhaar
6 | surname: Tim
7 | orcid: 0000-0002-7653-5859
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - name: D'Amico
12 | surname: Sara
13 | orcid: 0000-0002-8937-2040
14 | roles:
15 | - transcriber
16 | - name: Hofmann
17 | surname: Lara
18 | orcid: 0000-0003-4698-3906
19 | roles:
20 | - transcriber
21 | - name: Gnasso
22 | surname: Alessandro
23 | orcid: 0000-0001-5964-2989
24 | roles:
25 | - transcriber
26 | - name: Audebrand
27 | surname: Justine
28 | roles:
29 | - transcriber
30 | - name: Stitts
31 | surname: Jeremy
32 | orcid: 0000-0001-6988-1836
33 | roles:
34 | - transcriber
35 | - name: Sweeney
36 | surname: Mary
37 | orcid: 0000-0001-7028-2072
38 | roles:
39 | - transcriber
40 | - name: Atwood
41 | surname: Grace
42 | orcid: 0000-0002-1546-6546
43 | roles:
44 | - transcriber
45 | institutions: []
46 | description: >-
47 | This is Ground Truth data created during the HTR Winter School 2022 for the
48 | Cod. 2160 ÖNB that contains one version of the so called Lex Dei.
49 | project-name: HTR Winter School 2022, Vienna
50 | language:
51 | - lat
52 | production-software: Transkribus
53 | script:
54 | - iso: Latn
55 | qualify: Carolingian Minuscule
56 | script-type: only-manuscript
57 | time:
58 | notBefore: '850'
59 | notAfter: '900'
60 | hands:
61 | count: '1'
62 | precision: exact
63 | license:
64 | - name: CC-BY 4.0
65 | url: https://creativecommons.org/licenses/by/4.0/
66 | format: Alto-XML
67 | sources:
68 | - reference: ''
69 | link: http://data.onb.ac.at/rec/AC13956457
70 | volume:
71 | - metric: pages
72 | count: 40
73 | transcription-guidelines: >-
74 | Abbreviations resolved, but no normalization and no correcting of misspelling.
75 | No transcription of initials and interlinear script.
76 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/ajouter-la-description-d-un-nouveau-jeu-de-donn-es.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Ajouter la description d'un nouveau jeu de données
3 | about: Template pour ajouter la description d'un nouveau dataset
4 | title: "[catalog] Nouveau repo {project-name/dataset-name}"
5 | labels: project
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## Description du jeu de données
11 |
12 | ### Checklist
13 | - [ ] le nom du corpus est exprimé explicitement
14 | - [ ] le nom du projet est exprimé explicitement
15 | - [ ] les auteur-rices et les rôles sont exprimés explicitement
16 | - [ ] une licence est associée au jeu de données
17 | - [ ] le jeu de données est clairement et explicitement décrit, de manière à permettre aux autres utilisateurs de comprendre son contenu et le contexte de sa création
18 | - [ ] le jeu de données utilise des formats standards comme PAGE XML ou ALTO XML et les transcriptions sont alignées avec des images
19 |
20 | ### Informations importantes
21 |
22 | - nom du corpus[1](#fn1):
23 | - nom du projet[2](#fn2):
24 | - description générée à l'aide de [notre formulaire](https://htr-united.github.io/document-your-data.html):
25 | ```
26 | [Copier la description ici]
27 | ```
28 |
29 | ### Autonomie
30 |
31 | Cocher la situation applicable :
32 |
33 | - [ ] Je sais comment faire une Pull Request et je m'occupe de créer un dossier + fichier correspondant à mon dépôt dans "[htr-united/catalog/](https://github.com/HTR-United/htr-united/tree/master/catalog)"
34 | - [ ] Je ne sais pas comment faire une Pull Request, j'ai besoin d'aide pour ajouter une description de mon jeu de données sous "[htr-united/catalog/](https://github.com/HTR-United/htr-united/tree/master/catalog)"
35 |
36 | ---
37 |
38 | 1: Ce nom sera utilisé pour créer le fichier YAML dédié au jeu de données. *Par exemple : si votre jeu de données s'appelle "Mon Super Dataset", sa description sera enregistrée sous "mon-super-dataset.yml"*
39 |
40 | 2: Ce nom sera utilisé pour créer un dossier dans "catalog/", il contiendra toutes les descriptions des jeux de données liés à ce projet. *Par exemple : si votre projet s'appelle "Mon Super Projet", le(s) fichier(s) YAML sera(ont) enregistré(s) sous "catalog/mon-super-projet/"*
41 |
--------------------------------------------------------------------------------
/catalog/almanach/dahn.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Chiffoleau
3 | roles:
4 | - project-manager
5 | - aligner
6 | surname: Floriane
7 | characters:
8 | members:
9 | - e
10 | - s
11 | - a
12 | - n
13 | - r
14 | - i
15 | - t
16 | - u
17 | - o
18 | - l
19 | - d
20 | - c
21 | - m
22 | - p
23 | - ́
24 | - ','
25 | - v
26 | - .
27 | - f
28 | - q
29 | - g
30 | - ̀
31 | - '-'
32 | - E
33 | - b
34 | - ’
35 | - "'"
36 | - h
37 | - A
38 | - L
39 | - N
40 | - x
41 | - j
42 | - S
43 | - R
44 | - I
45 | - T
46 | - M
47 | - ̂
48 | - C
49 | - P
50 | - y
51 | - O
52 | - ;
53 | - '1'
54 | - £
55 | - U
56 | - D
57 | - B
58 | - F
59 | - J
60 | - G
61 | - '"'
62 | - '0'
63 | - z
64 | - V
65 | - '9'
66 | - '2'
67 | - ':'
68 | - X
69 | -
70 | - €
71 | - H
72 | - '5'
73 | - '!'
74 | - '3'
75 | - '4'
76 | - ̧
77 | - °
78 | - W
79 | - Y
80 | - '6'
81 | - '8'
82 | - '?'
83 | - '7'
84 | - K
85 | - Q
86 | - /
87 | - (
88 | - )
89 | - k
90 | - œ
91 | - w
92 | - ̈
93 | - …
94 | - Z
95 | - –
96 | - '&'
97 | - '%'
98 | - '='
99 | - $
100 | - _
101 | mode: NFD
102 | description: OCR ground truth dataset based on French 20th-century typewritten letters
103 | format: Alto-XML
104 | hands:
105 | count: less-than-11
106 | precision: exact
107 | language:
108 | - fra
109 | license:
110 | - name: CC-BY 4.0
111 | url: https://creativecommons.org/licenses/by/4.0/
112 | production-software: eScriptorium + Kraken
113 | project-name: DAHN
114 | project-website: https://digitalintellectuals.hypotheses.org/category/dahn
115 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
116 | script:
117 | - iso: Latn
118 | script-type: only-typed
119 | time:
120 | notAfter: '1924'
121 | notBefore: '1914'
122 | title: DAHN Corpus
123 | url: https://github.com/HTR-United/dahncorpus
124 | volume:
125 | - count: 475849
126 | metric: characters
127 | - count: 547
128 | metric: files
129 | - count: 12539
130 | metric: lines
131 | - count: 527
132 | metric: pages
133 | - count: 547
134 | metric: regions
135 |
--------------------------------------------------------------------------------
/catalog/hismodoc-htr/titres-nobiliaires-17-18-siecles-dataset.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: Jeu de données HTR « Titres nobiliaires 17e-18e siècles »
3 | url: https://github.com/HisMoDoc-HTR/TitresNobiliaires_17_18/tree/main
4 | authors:
5 | - name: Jean-François
6 | surname: Moufflet
7 | roles:
8 | - transcriber
9 | - project-manager
10 | - quality-control
11 | - digitization
12 | - name: Chloé
13 | surname: Fize
14 | roles:
15 | - transcriber
16 | - aligner
17 | - name: Lucas
18 | surname: Terriel
19 | orcid: 0000-0002-9189-258X
20 | roles:
21 | - transcriber
22 | - aligner
23 | - quality-control
24 | - support
25 | institutions: []
26 | description: >-
27 | Ce dataset pour la reconnaissance automatique des écritures est composé d’un
28 | mélange de transcriptions de documents des 17e-18e siècles (actes de mariage,
29 | preuves de noblesse etc.), essentiellement en français, et provenant de la
30 | série M, titre III "Titres nobiliaires" des Archives nationales de France.
31 | language:
32 | - fra
33 | production-software: eScriptorium + Kraken
34 | automatically-aligned: false
35 | script:
36 | - iso: Latn
37 | script-type: only-manuscript
38 | time:
39 | notBefore: '1600'
40 | notAfter: '1799'
41 | hands:
42 | count: less-than-11
43 | precision: estimated
44 | license:
45 | name: Etalab OL 2.0
46 | url: https://spdx.org/licenses/etalab-2.0.html
47 | format: Alto-XML
48 | volume:
49 | - metric: lines
50 | count: 726
51 | - metric: pages
52 | count: 44
53 | - metric: regions
54 | count: 242
55 | - metric: characters
56 | count: 25458
57 | citation-file-link: https://github.com/HisMoDoc-HTR/TitresNobiliaires_17_18/blob/main/CITATION.cff
58 | transcription-guidelines: >-
59 | Les transcriptions suivent les conventions éditoriales définies par :
60 |
61 | Bernard Barbiche, Conseils pour l’édition des textes de l’époque moderne
62 | (XVIe-XVIIIe siècle), École nationale des chartes, publié en ligne sur Thélème
63 | (consulté le 01/03/2025).
64 |
65 | Autres précisions :
66 |
67 | - Les abréviations ont été résolues.
68 |
69 | - L'orthographe d'origine a été conservée, y compris les fautes éventuelles.
70 |
--------------------------------------------------------------------------------
/catalog/cremma/mss-16.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Thibault
3 | orcid: 0000-0003-1852-9204
4 | roles:
5 | - project-manager
6 | - quality-control
7 | - support
8 | surname: Clérice
9 | - name: Alix
10 | orcid: 0000-0002-0136-4434
11 | roles:
12 | - project-manager
13 | - quality-control
14 | - support
15 | surname: Chagué
16 | - name: Anaïs
17 | roles:
18 | - transcriber
19 | surname: Mazoue
20 | automatically-aligned: false
21 | characters:
22 | members:
23 | - e
24 | - r
25 | - n
26 | - a
27 | - u
28 | - o
29 | - t
30 | - i
31 | - l
32 | - ſ
33 | - d
34 | - s
35 | - c
36 | - m
37 | - p
38 | - v
39 | - y
40 | - q
41 | - g
42 | - f
43 | - b
44 | - z
45 | - h
46 | - J
47 | - /
48 | - x
49 | - R
50 | - ^
51 | - L
52 | - I
53 | - .
54 | - E
55 | - ẜ
56 | - ⁊
57 | - M
58 | - '1'
59 | - ꝑ
60 | - A
61 | - ́
62 | - ̾
63 | - <
64 | - '>'
65 | - j
66 | - C
67 | - D
68 | - '3'
69 | - ꝙ
70 | - '9'
71 | - V
72 | - '7'
73 | - '6'
74 | - ’
75 | - P
76 | - '8'
77 | - Ꝑ
78 | - ̃
79 | - T
80 | - (
81 | - S
82 | - N
83 | - ;
84 | - Q
85 | - ̀
86 | - '5'
87 | - '0'
88 | - U
89 | mode: NFD
90 | citation-file-link: https://github.com/HTR-United/CREMMA-MSS-16/CITATION.cff
91 | description: Manuscripts of the 16th century
92 | format: Alto-XML
93 | hands:
94 | count: 1-per-folder
95 | precision: exact
96 | institutions: []
97 | language:
98 | - fra
99 | license:
100 | name: CC-BY 4.0
101 | url: https://creativecommons.org/licenses/by/4.0/
102 | production-software: eScriptorium + Kraken
103 | project-name: CREMMA
104 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
105 | script:
106 | - iso: Latn
107 | script-type: only-manuscript
108 | time:
109 | notAfter: '1599'
110 | notBefore: '1500'
111 | title: CREMMA MSS 16
112 | transcription-guidelines: Abréviations conservées.
113 | url: https://github.com/HTR-United/CREMMA-MSS-16
114 | volume:
115 | - count: 10911
116 | metric: characters
117 | - count: 9
118 | metric: files
119 | - count: 244
120 | metric: lines
121 | - count: 18
122 | metric: regions
123 |
--------------------------------------------------------------------------------
/catalog/inha/LettresDeJacquesDoucetAReneJean1908-1929.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: La Correspondance Jacques Doucet - René Jean
3 | url: https://gitlab.inha.fr/snr/LaCorrespondanceDoucetReneJean
4 | authors:
5 | - name: Cugy
6 | surname: Pascale
7 | roles:
8 | - transcriber
9 | - project-manager
10 | - quality-control
11 | - name: Fieschi
12 | surname: Caroline
13 | roles:
14 | - project-manager
15 | - quality-control
16 | - name: Peyrard
17 | surname: Alix
18 | roles:
19 | - transcriber
20 | - quality-control
21 | - name: Prohin
22 | surname: Lucie
23 | roles:
24 | - transcriber
25 | - quality-control
26 | - name: Sarda
27 | surname: Marie-Anne
28 | roles:
29 | - support
30 | institutions:
31 | - name: Institut National de l'histoire de l'art (INHA)
32 | roles:
33 | - transcriber
34 | - project-manager
35 | - quality-control
36 | - name: Bibliothèque nationale de France
37 | roles:
38 | - digitization
39 | description: >-
40 | Projet entrepris dans le cadre du programme La Bibliothèque d’art et
41 | d’archéologie de Jacques Doucet : corpus, savoirs et réseaux de l’Institut
42 | national d’histoire de l’art à partir d’un corpus de lettres et documents
43 | conservés au Département des manuscrits de la Bibliothèque nationale de France
44 | sous la cote NAF 13124, une des principales sources sur la relation entre
45 | Doucet et René Jean qu’il engagea comme bibliothécaire le 2 juin 1908.
46 | project-name: PENSE@INHA
47 | project-website: https://skylab.inha.fr/PENSE/LettresDeJacquesDoucetAReneJean1908-1929/
48 | language:
49 | - fra
50 | production-software: Transkribus
51 | script:
52 | - iso: Latn
53 | script-type: mainly-manuscript
54 | time:
55 | notBefore: '1908'
56 | notAfter: '1929'
57 | hands:
58 | count: less-than-11
59 | precision: exact
60 | license:
61 | - name: Etalab OL 2.0
62 | url: https://spdx.org/licenses/etalab-2.0.html
63 | format: Alto-XML
64 | volume:
65 | - metric: characters
66 | count: 83312
67 | - metric: lines
68 | count: 2987
69 | - metric: pages
70 | count: 200
71 | - metric: files
72 | count: 200
73 |
--------------------------------------------------------------------------------
/catalog/fondue/fondue-spanish-chapbooks-19th-c-dataset.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: FoNDUE Spanish chapbooks 19th c. Dataset
3 | url: https://github.com/DesenrollandoElCordel/FoNDUE-Spanish-chapbooks-Dataset
4 | authors:
5 | - name: Carta
6 | surname: Constance
7 | roles:
8 | - transcriber
9 | - project-manager
10 | - name: Leblanc
11 | surname: "\xC9lina"
12 | roles:
13 | - digitization
14 | - name: Jacsont
15 | surname: Pauline
16 | roles:
17 | - digitization
18 | - name: Palacios
19 | surname: Belinda
20 | roles:
21 | - transcriber
22 | - quality-control
23 | - name: Bermudez
24 | surname: Luana
25 | roles:
26 | - transcriber
27 | - quality-control
28 | description: Digital editions of the second part of the Genevan Spanish chapbooks
29 | collection (19th c.).
30 | project-name: Desenrollando El Cordel
31 | project-website: https://github.com/DesenrollandoElCordel
32 | language:
33 | - cat
34 | - spa
35 | - lat
36 | script:
37 | - iso: Latn
38 | script-type: only-typed
39 | time:
40 | notBefore: '1770'
41 | notAfter: '1920'
42 | hands:
43 | count: more-than-10
44 | precision: exact
45 | license:
46 | - name: CC-BY-SA 4.0
47 | url: https://creativecommons.org/licenses/by-sa/4.0/
48 | format: Alto-XML
49 | sources:
50 | - reference: ''
51 | link: https://unige.swisscovery.slsp.ch/permalink/41SLSP_UGE/btt5ev/alma991008229029705502
52 | - reference: ''
53 | link: https://unige.swisscovery.slsp.ch/permalink/41SLSP_UGE/kjkm12/alma991002834309705502
54 | volume:
55 | - metric: characters
56 | count: 270718
57 | - metric: lines
58 | count: 12526
59 | - metric: pages
60 | count: 198
61 | citation-file-link: https://github.com/DesenrollandoElCordel/FoNDUE-Spanish-chapbooks-Dataset/blob/main/Grountruth/CITATION.cff
62 | transcription-guidelines: "Les r\xE8gles de transcription suivantes ont \xE9t\xE9 adopt\xE9\
63 | es :\n- Respecter les accents ;\n- Respecter la casse ;\n- Respecter la ponctuation\
64 | \ ;\n- Respecter les espaces ;\n- Respecter les retours \xE0 la ligne ;\n- Respecter\
65 | \ la graphie des mots (ne pas corriger les erreurs s\u2019il y en a) ;\n- Supprimer\
66 | \ le bruit (taches qui ont \xE9t\xE9 prises pour du texte par l\u2019OCR)."
67 | production-software: "eScriptorium + Kraken"
68 |
--------------------------------------------------------------------------------
/catalog/antwerp_bias-in-history/arletta.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: ARletta
3 | url: https://zenodo.org/records/11191457
4 | authors:
5 | - name: Lith
6 | surname: Lefranc
7 | - name: Ilja
8 | surname: Van Damme
9 | - name: Thibault
10 | surname: Clérice
11 | - name: Mike
12 | surname: Kestemont
13 | institutions:
14 | - name: University of Antwerp
15 | - name: National Institute for Research in Digital Science and Technology, Paris
16 | description: Open-source handwritten text recognition models for historic Dutch
17 | project-name: Bias in History
18 | project-website: https://www.bias-in-history.eu/
19 | language:
20 | - nld
21 | - fra
22 | production-software: eScriptorium + Kraken
23 | automatically-aligned: false
24 | script:
25 | - iso: Latn
26 | script-type: only-manuscript
27 | time:
28 | notBefore: '1600'
29 | notAfter: '1940'
30 | hands:
31 | count: more-than-10
32 | precision: estimated
33 | license:
34 | name: CC-BY-SA 4.0
35 | url: https://creativecommons.org/licenses/by-sa/4.0/
36 | format: Page-XML
37 | volume:
38 | - metric: lines
39 | count: 431359
40 | - metric: regions
41 | count: 44536
42 | - metric: pages
43 | count: 10267
44 | - metric: characters
45 | count: 14253206
46 | transcription-guidelines: >-
47 | **Diplomatic transcription.** All of the text was transcribed verbatim, preserving all of its original features:
48 |
49 | - orthography: preserve original spelling
50 |
51 | - abbreviations: do not expand abbreviations
52 |
53 | - capitalization: retain original use of uppercase and lowercase letters
54 |
55 | - punctuation: transcribe punctuation marks exactly as they appear, even if they are unconventional by modern standards
56 |
57 | - special characters: include any special characters or symbols as they appear
58 |
59 | - formatting: maintain original formatting such as underlining or strikethrough
60 |
61 | - errors and corrections: include all errors and corrections found in the text
62 |
63 | - non-interpretative: avoid interpreting or modernizing the text
64 |
65 | - use the '@' symbol for characters you cannot read and tag them as 'unclear' on baseline level
66 |
67 | - tag marginal text as 'marginalia' and main body text as 'paragraph' on region level
68 |
--------------------------------------------------------------------------------
/catalog/greek-data/stavronikita-114.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: Stavronikita Monastery Collection No. 114
3 | url: https://zenodo.org/records/5578251
4 | authors:
5 | - name: Ioannis
6 | surname: Pratikakis
7 | orcid: 0000-0002-4124-3688
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - name: Aleksandros
12 | surname: Papazoglou
13 | roles:
14 | - transcriber
15 | - project-manager
16 | - name: Symeon
17 | surname: Symeonidis
18 | orcid: 0000-0002-3259-614X
19 | roles:
20 | - transcriber
21 | - project-manager
22 | - name: Lazaros
23 | surname: Tsochatzidis
24 | orcid: 0000-0002-4634-7419
25 | roles:
26 | - transcriber
27 | - project-manager
28 | institutions: []
29 | description: >-
30 | It comprises manuscripts made of paper, written at the end of the 15th century
31 | and its dimensions are 218X150 mm. In various pages, we find red initials and
32 | epititles which enrich the manuscript’s decoration.
33 |
34 | The dataset of ΧΦ114 consists of 1051 lines of text containing 5467 words
35 | (2877 unique words) that are distributed over 44 scanned handwritten text pages.
36 |
37 | For each page, a PageXML is provided containing the following ground-truth:
38 |
39 | 1. Text region polygon coordinates
40 | 2. Text line polygon coordinates with the corresponding transcription text
41 | 3. Word polygon coordinates with the corresponding transcription text
42 | language:
43 | - grc
44 | transcription-guidelines: |
45 | - Abbreviations and ligatures were resolved
46 | - Minuscules at the beginning of sentences were kept as such.
47 | - Polytonic spelling and diaeresis are kept
48 | production-software: Unknown
49 | automatically-aligned: false
50 | characters:
51 | mode: NFD
52 | script:
53 | - iso: Grek
54 | script-type: only-manuscript
55 | time:
56 | notBefore: '1401'
57 | notAfter: '1500'
58 | hands:
59 | count: less-than-11
60 | precision: exact
61 | license:
62 | name: CC-BY 4.0
63 | url: https://creativecommons.org/licenses/by/4.0/
64 | format: Page-XML
65 | volume:
66 | - {count: 1006, metric: "lines"}
67 | - {count: 44, metric: "files"}
68 | - {count: 44, metric: "regions"}
69 | - {count: 36898, metric: "characters"}
--------------------------------------------------------------------------------
/catalog/greek-data/stavronikita-53.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: Stavronikita Monastery Collection No. 53
3 | url: https://zenodo.org/records/5595669
4 | authors:
5 | - name: Ioannis
6 | surname: Pratikakis
7 | orcid: 0000-0002-4124-3688
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - name: Aleksandros
12 | surname: Papazoglou
13 | roles:
14 | - transcriber
15 | - project-manager
16 | - name: Symeon
17 | surname: Symeonidis
18 | orcid: 0000-0002-3259-614X
19 | roles:
20 | - transcriber
21 | - project-manager
22 | - name: Lazaros
23 | surname: Tsochatzidis
24 | orcid: 0000-0002-4634-7419
25 | roles:
26 | - transcriber
27 | - project-manager
28 | institutions: []
29 | description: >-
30 | The collection is one of the oldest of the Stavronikita Monastery on Mount Athos.
31 | It is a parchment, four-gospel manuscript which has been written between
32 | 1301 and 1350. It comprises 54 pages with dimensions that are approximately
33 | 250x185 mm. The script is elegant minuscule and the use of majuscule letters
34 | is rare. Tachygraphical symbols and abbreviations are encountered in the
35 | manuscript as well. Furthermore, the manuscript is enriched with
36 | chrysography, elegant epititles and initials.
37 |
38 | The dataset of ΧΦ53 consists of 1038 lines of text containing 5592 words
39 | (2374 unique words) that are distributed over 54 scanned handwritten text pages.
40 | language:
41 | - grc
42 | transcription-guidelines: |
43 | - Abbreviations and ligatures were resolved
44 | - Minuscules at the beginning of sentences were kept as such.
45 | - Polytonic spelling and diaeresis are kept
46 | production-software: Unknown
47 | automatically-aligned: false
48 | characters:
49 | mode: NFD
50 | script:
51 | - iso: Grek
52 | script-type: only-manuscript
53 | time:
54 | notBefore: '1301'
55 | notAfter: '1350'
56 | hands:
57 | count: less-than-11
58 | precision: exact
59 | license:
60 | name: CC-BY 4.0
61 | url: https://creativecommons.org/licenses/by/4.0/
62 | format: Page-XML
63 | volume:
64 | - {count: 1038, metric: "lines"}
65 | - {count: 54, metric: "files"}
66 | - {count: 54, metric: "regions"}
67 | - {count: 37070, metric: "characters"}
68 |
--------------------------------------------------------------------------------
/catalog/greek-data/stavronikita-79.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: Stavronikita Monastery Collection No. 79
3 | url: https://zenodo.org/records/5578136
4 | authors:
5 | - name: Ioannis
6 | surname: Pratikakis
7 | orcid: 0000-0002-4124-3688
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - name: Aleksandros
12 | surname: Papazoglou
13 | roles:
14 | - transcriber
15 | - project-manager
16 | - name: Symeon
17 | surname: Symeonidis
18 | orcid: 0000-0002-3259-614X
19 | roles:
20 | - transcriber
21 | - project-manager
22 | - name: Lazaros
23 | surname: Tsochatzidis
24 | orcid: 0000-0002-4634-7419
25 | roles:
26 | - transcriber
27 | - project-manager
28 | institutions: []
29 | description: >-
30 | It comprises manuscripts made of paper, written in the 16th century and its
31 | dimensions are 220X165 mm. The manuscript is embellished with epititles and
32 | red initials. Tachygraphical symbols and abbreviations are encountered in
33 | the manuscript as well. The dataset of XΦ79 consists of 803 lines of text
34 | containing 4389 words (2069 unique words) that are distributed over
35 | 40 scanned handwritten text pages.
36 | For each page, a PageXML is provided containing the following ground-truth:
37 | 1. Text region polygon coordinates
38 | 2. Text line polygon coordinates with the corresponding transcription text
39 | 3. Word polygon coordinates with the corresponding transcription text
40 | language:
41 | - grc
42 | transcription-guidelines: |
43 | - Abbreviations and ligatures were resolved
44 | - Minuscules at the beginning of sentences were kept as such.
45 | - Polytonic spelling and diaeresis are kept
46 | production-software: Unknown
47 | automatically-aligned: false
48 | characters:
49 | mode: NFD
50 | script:
51 | - iso: Grek
52 | script-type: only-manuscript
53 | time:
54 | notBefore: '1501'
55 | notAfter: '1600'
56 | hands:
57 | count: less-than-11
58 | precision: exact
59 | license:
60 | name: CC-BY 4.0
61 | url: https://creativecommons.org/licenses/by/4.0/
62 | format: Page-XML
63 | volume:
64 | - {count: 803, metric: "lines"}
65 | - {count: 40, metric: "files"}
66 | - {count: 40, metric: "regions"}
67 | - {count: 29112, metric: "characters"}
68 |
--------------------------------------------------------------------------------
/catalog/fondue/FONDUE-IT-PRINT-20.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Simon
3 | orcid: 0000-0001-9094-4475
4 | roles:
5 | - project-manager
6 | - quality-control
7 | - support
8 | surname: Gabay
9 | - name: Maddalena
10 | roles:
11 | - transcriber
12 | surname: Zaglio
13 | automatically-aligned: false
14 | characters:
15 | members:
16 | - e
17 | - a
18 | - i
19 | - o
20 | - r
21 | - n
22 | - t
23 | - l
24 | - s
25 | - c
26 | - d
27 | - u
28 | - p
29 | - m
30 | - v
31 | - ','
32 | - g
33 | - h
34 | - f
35 | - b
36 | - z
37 | - .
38 | - ̀
39 | - ¬
40 | - q
41 | - I
42 | - '-'
43 | - C
44 | - A
45 | - "'"
46 | - P
47 | - '"'
48 | - S
49 | - M
50 | - E
51 | - ’
52 | - L
53 | - '='
54 | - ;
55 | - T
56 | - R
57 | - D
58 | - V
59 | - O
60 | - G
61 | - N
62 | - ':'
63 | - '1'
64 | - B
65 | - '4'
66 | - )
67 | - '!'
68 | - (
69 | - '['
70 | - ']'
71 | - F
72 | - Q
73 | - '2'
74 | - '0'
75 | - '3'
76 | - '9'
77 | - '5'
78 | - U
79 | - '?'
80 | - °
81 | - ⬪
82 | - '6'
83 | - y
84 | - Z
85 | - k
86 | - ᗅ
87 | - K
88 | - x
89 | - H
90 | - '8'
91 | - X
92 | - W
93 | - —
94 | - '7'
95 | - “
96 | - ᑕ
97 | - ᗞ
98 | - w
99 | mode: NFD
100 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-IT-PRINT-20/blob/master/CITATION.cff
101 | description: Archives
102 | format: Alto-XML
103 | hands:
104 | count: unknown
105 | precision: exact
106 | institutions: []
107 | language:
108 | - ita
109 | license:
110 | name: CC-BY 4.0
111 | url: https://creativecommons.org/licenses/by/4.0/
112 | production-software: eScriptorium + Kraken
113 | project-name: FoNDUE
114 | project-website: https://github.com/FoNDUE-HTR
115 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
116 | script:
117 | - iso: Latn
118 | script-type: only-typed
119 | time:
120 | notAfter: '1999'
121 | notBefore: '1900'
122 | title: FONDUE-IT-PRINT-20
123 | transcription-guidelines: SegmOnto
124 | url: https://github.com/FoNDUE-HTR/FONDUE-IT-PRINT-20
125 | volume:
126 | - count: 49432
127 | metric: characters
128 | - count: 23
129 | metric: files
130 | - count: 1008
131 | metric: lines
132 | - count: 48
133 | metric: regions
134 |
--------------------------------------------------------------------------------
/catalog/joseph-hooker-correspondance-project/joseph-hooker-htr.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Joseph Hooker HTR
3 | url: https://github.com/jschaefer738b/JosephHookerHTR.git
4 | authors:
5 | - name: John
6 | surname: Schaefer
7 | orcid: 0009-0006-5751-9323
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - quality-control
12 | - support
13 | - name: Kiri
14 | surname: Ross-Jones
15 | roles:
16 | - support
17 | - name: Alexis
18 | surname: Litvine
19 | roles:
20 | - support
21 | institutions:
22 | - name: Royal Botanic Gardens, Kew
23 | - name: University of Cambridge
24 | description: >-
25 | XML transcriptions and JPEG images exported from Transkribus as ground truth
26 | for an eScriptorium-Kraken HTR model (CER 11-12%) trained on the correspondence of Joseph
27 | Dalton Hooker (1817-1911), primarily letters to William Turner Thiselton-Dyer
28 | (1843-1928) during the late-19th/early-20th century. Many transcriptions in
29 | this dataset were generated by a small team of anonymous volunteers as part of
30 | the Joseph Hooker Correspondence Project based at Kew Gardens. All images in
31 | this dataset are reproduced with the kind permission of the Board of Trustees
32 | of the Royal Botanic Gardens Kew (© RBG, Kew). Contact archives@kew.org for
33 | more information.
34 |
35 |
36 | HTR Model: Schaefer, John, & Litvine, Alexis. (2023). Joseph Hooker HTR Model.
37 | Zenodo. https://doi.org/10.5281/zenodo.8038689
38 | project-name: Joseph Hooker Correspondence Project
39 | project-website: >-
40 | https://www.kew.org/science/our-science/projects/joseph-hooker-correspondence-project
41 | language:
42 | - eng
43 | production-software: Transkribus
44 | script:
45 | - iso: Latn
46 | script-type: only-manuscript
47 | time:
48 | notBefore: '1850'
49 | notAfter: '1911'
50 | hands:
51 | count: '1'
52 | precision: estimated
53 | license:
54 | - name: CC-BY-SA 4.0
55 | url: https://creativecommons.org/licenses/by-sa/4.0/
56 | format: Page-XML
57 | volume:
58 | - metric: lines
59 | count: 7100
60 | - metric: files
61 | count: 337
62 | - metric: pages
63 | count: 337
64 | transcription-guidelines: >-
65 | All horizontal lines in Hooker's hand were transcribed as originally written.
66 | Most typescript and vertical lines in the margins were not included.
67 |
--------------------------------------------------------------------------------
/catalog/htr-school-vienna/htr-winter-school-2024-medieval-czech-prague-bible-1488.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: HTR Winter School 2024 - Medieval Czech - Prague Bible (1488)
3 | url: https://github.com/HTR-School-Vienna/2024--medieval-czech
4 | authors:
5 | - name: Martin
6 | surname: Plechatý
7 | orcid: 0009-0000-3305-2075
8 | roles:
9 | - transcriber
10 | - name: Daniel
11 | surname: Katscher
12 | orcid: 0009-0008-3475-2522
13 | roles:
14 | - transcriber
15 | - name: Václav
16 | surname: Steiner
17 | orcid: 0009-0004-8336-9846
18 | roles:
19 | - transcriber
20 | - name: Jan
21 | surname: Švarc
22 | orcid: 0009-0005-1274-0545
23 | roles:
24 | - transcriber
25 | - name: Martina
26 | surname: Spěváčková
27 | orcid: 0000-0002-9357-4614
28 | roles:
29 | - transcriber
30 | - name: Jan
31 | surname: Škvrňák
32 | orcid: 0000-0003-0985-4144
33 | - name: Marie
34 | surname: Hedvíková
35 | orcid: 0009-0008-3693-6288
36 | roles:
37 | - transcriber
38 | - name: Anna
39 | surname: Michalcová
40 | orcid: 0000-0003-4760-6950
41 | roles:
42 | - project-manager
43 | - quality-control
44 | institutions: []
45 | description: >-
46 | The Prague Bible (1488, Vienna, Österreichische Nationalbibliothek, shelfmark
47 | Ink 13.C.5, available from: http://data.onb.ac.at/rec/AC07537625, Old Czech)
48 |
49 | Print: Old Czech, Bastarda, end of the 15th C.
50 | language:
51 | - ces
52 | production-software: Transkribus
53 | automatically-aligned: false
54 | script:
55 | - iso: Latn
56 | script-type: only-typed
57 | time:
58 | notBefore: '1488'
59 | notAfter: '1488'
60 | hands:
61 | count: '1'
62 | precision: exact
63 | license:
64 | name: CC-BY 4.0
65 | url: https://creativecommons.org/licenses/by/4.0/
66 | format: Page-XML
67 | volume:
68 | - metric: files
69 | count: 30
70 | citation-file-link: >-
71 | https://github.com/HTR-School-Vienna/2024--medieval-czech/blob/38a20c857757150d8e2da0e8c865fbf7d026cdee/CITATION.cff
72 | transcription-guidelines: >-
73 | The transcription rules were based on semi-diplomatic transcription rules set
74 | by Pero OCR and Směrnice pro vydávání starších českých textů by Jiří Daňhelka
75 | (https://vokabular.ujc.cas.cz/moduly/edicnipoznamka.aspx?id=DanhelkaSmernice).
76 |
--------------------------------------------------------------------------------
/catalog/scripta-psl/biblia.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: BiblIA
3 | url: https://zenodo.org/record/5167263
4 | project-name: 'Scripta PSL
5 |
6 | '
7 | project-website: https://escripta.hypotheses.org/
8 | authors:
9 | - name: "St\xF6kl Ben Ezra"
10 | surname: Daniel
11 | roles:
12 | - transcriber
13 | - project-manager
14 | - name: Brown-DeVost
15 | surname: Bronson
16 | - name: Jablonski
17 | surname: Pawel
18 | - name: Kiessling
19 | surname: Benjamin
20 | - name: Lolli
21 | surname: Elena
22 | - name: Lapin
23 | surname: Hayim
24 | description: "This dataset for Handwritten Text Recognition includes layout segmentation\
25 | \ (regions, toplines and linepolygons) and unicode-transcriptions in alto 4.2 XML\
26 | \ for 202 images of Medieval Hebrew manuscripts from the Biblioth\xE8que nationale\
27 | \ de France (BnF, National Library of France) and the Biblioteca Apostolica Vaticana\
28 | \ (BAV, Vatican Library) corresponding to the article \"BiblIA - a General Model\
29 | \ for Medieval Hebrew Manuscripts and an Open Annotated Dataset\" by Daniel St\xF6\
30 | kl Ben Ezra, Bronson Brown-DeVost, Pawel Jablonski, Benjamin Kiessling, Elena Lolli,\
31 | \ and Hayim Lapin, published in HIP@ICDAR 2021 held in Lausanne, September 2021.\n"
32 | language:
33 | - heb
34 | script:
35 | - iso: Hebr
36 | script-type: only-manuscript
37 | time:
38 | notBefore: '1000'
39 | notAfter: '1499'
40 | hands:
41 | count: more-than-10
42 | precision: exact
43 | license:
44 | - name: CC-BY-SA 4.0
45 | url: https://creativecommons.org/licenses/by-sa/4.0/
46 | format: Alto-XML
47 | volume:
48 | - metric: files
49 | count: 202
50 | - metric: pages
51 | count: 202
52 | - metric: lines
53 | count: 12461
54 | - metric: regions
55 | count: 509
56 | - metric: characters
57 | count: 278641
58 | transcription-guidelines: "See the guidelines detailed in Stoekl Ben Ezra Daniel,\
59 | \ Brown-DeVost Bronson, Jablonski Pawel, Lapin Hayim, Kiessling Benjamin, and Lolli\
60 | \ Elena. 2021. BiblIA - a General Model for Medieval Hebrew Manuscripts and an Open\
61 | \ Annotated Dataset. In The 6th International Workshop on Historical Document Imaging\
62 | \ and Processing (HIP '21). Association for Computing Machinery, New York, NY, USA,\
63 | \ 61\u201366. DOI:https://doi.org/10.1145/3476887.3476896\n"
64 | production-software: "eScriptorium + Kraken"
65 |
--------------------------------------------------------------------------------
/catalog/fondue/FONDUE-ES-PRINT-19.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Simon
3 | orcid: 0000-0001-9094-4475
4 | roles:
5 | - project-manager
6 | - quality-control
7 | - support
8 | surname: Gabay
9 | - name: Carmen
10 | roles:
11 | - transcriber
12 | surname: Carrasco Luján
13 | automatically-aligned: false
14 | characters:
15 | members:
16 | - e
17 | - a
18 | - o
19 | - s
20 | - n
21 | - r
22 | - i
23 | - l
24 | - d
25 | - u
26 | - t
27 | - c
28 | - m
29 | - .
30 | - p
31 | - ́
32 | - ','
33 | - b
34 | - g
35 | - y
36 | - q
37 | - h
38 | - v
39 | - ¬
40 | - f
41 | - j
42 | - z
43 | - –
44 | - A
45 | - ;
46 | - E
47 | - '!'
48 | - x
49 | - S
50 | - ̃
51 | - I
52 | - P
53 | - B
54 | - U
55 | - C
56 | - D
57 | - L
58 | - T
59 | - '?'
60 | - ':'
61 | - '0'
62 | - O
63 | - R
64 | - N
65 | - H
66 | - Y
67 | - ¿
68 | - V
69 | - J
70 | - M
71 | - '1'
72 | - ¡
73 | - '2'
74 | - —
75 | - '"'
76 | - k
77 | - F
78 | - '8'
79 | - '7'
80 | - '4'
81 | - '5'
82 | - G
83 | - '-'
84 | - '3'
85 | - '6'
86 | - K
87 | - (
88 | - )
89 | - '9'
90 | - Q
91 | - ̀
92 | - ̈
93 | - X
94 | - W
95 | - '['
96 | - ']'
97 | - '&'
98 | - w
99 | - '*'
100 | - §
101 | - °
102 | - ǝ
103 | mode: NFD
104 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-ES-PRINT-19/blob/master/CITATION.cff
105 | description: Novels written in Spanish
106 | format: Alto-XML
107 | hands:
108 | count: unknown
109 | precision: exact
110 | institutions: []
111 | language:
112 | - spa
113 | license:
114 | name: CC-BY 4.0
115 | url: https://creativecommons.org/licenses/by/4.0/
116 | production-software: eScriptorium + Kraken
117 | project-name: FoNDUE
118 | project-website: https://github.com/FoNDUE-HTR
119 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
120 | script:
121 | - iso: Latn
122 | script-type: only-typed
123 | time:
124 | notAfter: '1899'
125 | notBefore: '1800'
126 | title: FONDUE-ES-PRINT-19
127 | transcription-guidelines: SegmOnto
128 | url: https://github.com/FoNDUE-HTR/FONDUE-ES-PRINT-19
129 | volume:
130 | - count: 53687
131 | metric: characters
132 | - count: 38
133 | metric: files
134 | - count: 1375
135 | metric: lines
136 | - count: 103
137 | metric: regions
138 |
--------------------------------------------------------------------------------
/catalog/fondue/FONDUE-FR-PRINT-20.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Simon
3 | orcid: 0000-0001-9094-4475
4 | roles:
5 | - project-manager
6 | - quality-control
7 | - support
8 | surname: Gabay
9 | - name: Sophie
10 | orcid: 0009-0005-6841-0158
11 | roles:
12 | - transcriber
13 | surname: Dolto
14 | automatically-aligned: false
15 | characters:
16 | members:
17 | - e
18 | - a
19 | - s
20 | - i
21 | - t
22 | - r
23 | - n
24 | - u
25 | - l
26 | - o
27 | - d
28 | - c
29 | - p
30 | - m
31 | - ́
32 | - ','
33 | - .
34 | - v
35 | - ’
36 | - g
37 | - f
38 | - b
39 | - q
40 | - h
41 | - ̀
42 | - ̂
43 | - x
44 | - j
45 | - L
46 | - y
47 | - '-'
48 | - I
49 | - "'"
50 | - —
51 | - A
52 | - G
53 | - E
54 | - M
55 | - P
56 | - C
57 | - B
58 | - J
59 | - D
60 | - z
61 | - ̧
62 | - S
63 | - '!'
64 | - T
65 | - '?'
66 | - ¬
67 | - V
68 | - ;
69 | - U
70 | - O
71 | - R
72 | - Q
73 | - ':'
74 | - '1'
75 | - k
76 | - F
77 | - H
78 | - œ
79 | - '0'
80 | - (
81 | - )
82 | - “
83 | - '2'
84 | - N
85 | - '6'
86 | - '9'
87 | - '8'
88 | - '5'
89 | - ̈
90 | - '3'
91 | - w
92 | - W
93 | - '4'
94 | - Y
95 | - ”
96 | -
97 | - '7'
98 | - Z
99 | - '*'
100 | - /
101 | - K
102 | - '"'
103 | - «
104 | - »
105 | mode: NFD
106 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-FR-PRINT-20/blob/master/CITATION.cff
107 | description: French novels
108 | format: Alto-XML
109 | hands:
110 | count: unknown
111 | precision: exact
112 | institutions: []
113 | language:
114 | - fra
115 | license:
116 | name: CC-BY 4.0
117 | url: https://creativecommons.org/licenses/by/4.0/
118 | production-software: eScriptorium + Kraken
119 | project-name: FoNDUE
120 | project-website: https://github.com/FoNDUE-HTR
121 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
122 | script:
123 | - iso: Latn
124 | script-type: only-typed
125 | time:
126 | notAfter: '1999'
127 | notBefore: '1900'
128 | title: FONDUE-FR-PRINT-20
129 | transcription-guidelines: SegmOnto
130 | url: https://github.com/FoNDUE-HTR/FONDUE-FR-PRINT-20
131 | volume:
132 | - count: 81599
133 | metric: characters
134 | - count: 55
135 | metric: files
136 | - count: 1604
137 | metric: lines
138 | - count: 64
139 | metric: regions
140 |
--------------------------------------------------------------------------------
/catalog-ids.json:
--------------------------------------------------------------------------------
1 | {"https://doi.org/10.5281/zenodo.5153263": "repo-00000", "https://zenodo.org/record/4780947#.YhN5pVvMLUQ": "repo-00001", "https://github.com/calfa-co/rasam-dataset": "repo-00002", "https://github.com/DesenrollandoElCordel/FoNDUE-Spanish-chapbooks-Dataset": "repo-00003", "https://zenodo.org/record/3333627#.YhN1G1vMLUQ": "repo-00004", "https://github.com/rescribe/carolineminuscule-groundtruth": "repo-00005", "http://dx.doi.org/10.34847/nkl.acb724xs": "repo-00006", "https://github.com/e-ditiones/OCR17plus": "repo-00007", "https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-Projet-Notre-Dame": "repo-00008", "https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-ArgusDesBrevets": "repo-00009", "https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-DecameronFR": "repo-00010", "https://github.com/PSL-Chartes-HTR-Students/HN2021-Kovalewsky-1893": "repo-00011", "https://github.com/PSL-Chartes-HTR-Students/HN2021-ChateauChavigny": "repo-00012", "https://github.com/PSL-Chartes-HTR-Students/HN2021-Boccace": "repo-00013", "https://github.com/PSL-Chartes-HTR-Students/HN2021-Memorials_Jane_Lathrop_Stanford": "repo-00014", "https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-Expositions_Universelles": "repo-00015", "https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-Projet-Correspondance-Berlioz": "repo-00016", "https://github.com/jpmjpmjpm/genauto-td-htr.git": "repo-00017", "https://doi.org/10.5281/zenodo.5179361": "repo-00018", "HTR-United/tapuscorpus": "repo-00019", "HTR-United/timeuscorpus": "repo-00020", "HTR-United/dahncorpus": "repo-00021", "HTR-United/cremma-medieval": "repo-00022", "HTR-United/cremma-16-17-print": "repo-00023", "HTR-United/CREMMA-Medieval-LAT": "repo-00024", "HTR-United/CREMMA-MSS-17": "repo-00025", "HTR-United/CREMMA-MSS-18": "repo-00026", "HTR-United/CREMMA-MSS-19": "repo-00027", "HTR-United/CREMMA-MSS-20": "repo-00028", "HTR-United/lectaurep-bronod": "repo-00029", "HTR-United/lectaurep-mariages-et-divorces": "repo-00030", 
"HTR-United/lectaurep-repertoires": "repo-00031", "HTR-United/CREMMA-AN-TestamentDePoilus": "repo-00032", "HTR-United/cremma-wikipedia": "repo-00033", "Gallicorpora/HTR-MSS-15e-Siecle": "repo-00034", "Gallicorpora/HTR-incunable-15e-siecle": "repo-00035", "Gallicorpora/HTR-imprime-16e-siecle": "repo-00036", "Gallicorpora/HTR-imprime-17e-siecle": "repo-00037", "Gallicorpora/HTR-imprime-gothique-16e-siecle": "repo-00038", "Gallicorpora/HTR-imprime-18e-siecle": "repo-00039", "FoNDUE-HTR/FONDUE-FR-PRINT-17": "repo-00040", "FoNDUE-HTR/FONDUE-FR-PRINT-16": "repo-00041"}
--------------------------------------------------------------------------------
/catalog/gallicorpora/gothic-16.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Pinche
3 | roles:
4 | - project-manager
5 | surname: Ariane
6 | - name: Gabay
7 | roles:
8 | - project-manager
9 | surname: Simon
10 | - name: Vlachou-Efstathiou
11 | roles:
12 | - transcriber
13 | surname: Malamatenia
14 | - name: Christensen
15 | roles:
16 | - support
17 | surname: Kelly
18 | characters:
19 | members:
20 | - e
21 | - u
22 | - a
23 | - i
24 | - t
25 | - r
26 | - n
27 | - o
28 | - s
29 | - l
30 | - d
31 | - c
32 | - m
33 | - p
34 | - ſ
35 | - q
36 | - y
37 | - ̃
38 | - f
39 | - g
40 | - b
41 | - .
42 | - h
43 | - ','
44 | - z
45 | - ⁊
46 | - x
47 | - E
48 | - ¬
49 | - ¶
50 | - C
51 | - S
52 | - L
53 | - D
54 | - P
55 | - A
56 | - I
57 | - ͥ
58 | - M
59 | - v
60 | - Q
61 | - ꝰ
62 | - O
63 | - T
64 | - ':'
65 | - V
66 | - B
67 | - '?'
68 | - ꝑ
69 | - H
70 | - N
71 | - ͬ
72 | - R
73 | - ;
74 | - G
75 | - F
76 | - ̌
77 | - ꝓ
78 | - J
79 | - '-'
80 | - ꝯ
81 | - (
82 | - )
83 | - '1'
84 | - U
85 | - '9'
86 | - ̾
87 | - æ
88 | - X
89 | - '4'
90 | - ꝙ
91 | - ̧
92 | - ͤ
93 | - '2'
94 | - '*'
95 | - '6'
96 | - "'"
97 | - Ι
98 | - '7'
99 | - ⟦
100 | - ⟧
101 | - '8'
102 | - Y
103 | - '5'
104 | - '0'
105 | mode: NFD
106 | description: Corpus d'entrainement pour l'HTR constitué d'imprimés du 16e siècle
107 | format: Alto-XML
108 | hands:
109 | count: 1-per-folder
110 | precision: estimated
111 | language:
112 | - fra
113 | license:
114 | - name: CC-BY 4.0
115 | url: https://creativecommons.org/licenses/by/4.0/
116 | production-software: eScriptorium + Kraken
117 | project-name: Gallicorpora
118 | project-website: https://github.com/Gallicorpora
119 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
120 | script:
121 | - iso: Latn
122 | script-type: evenly-mixed
123 | time:
124 | notAfter: '1599'
125 | notBefore: '1500'
126 | title: Données imprimés gothiques du 16e siècle
127 | transcription-guidelines: Les transcriptions suivent les normes de transcription du
128 | projet Gallicorpora
129 | url: https://github.com/Gallicorpora/HTR-imprime-gothique-16e-siecle
130 | volume:
131 | - count: 90731
132 | metric: characters
133 | - count: 80
134 | metric: files
135 | - count: 2971
136 | metric: lines
137 | - count: 233
138 | metric: regions
139 |
--------------------------------------------------------------------------------
/catalog/greek-data/hpgtr.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: HPGTR Dataset
3 | url: https://github.com/vivianpl/hpgtr
4 | authors:
5 | - name: Paraskevi
6 | surname: Platanou
7 | roles:
8 | - transcriber
9 | - project-manager
10 | - name: John
11 | surname: Pavlopoulos
12 | orcid: 0000-0001-9188-7425
13 | roles:
14 | - transcriber
15 | - project-manager
16 | - name: Georgios
17 | surname: Papaioannou
18 | orcid: 0000-0003-4774-0746
19 | roles:
20 | - transcriber
21 | - project-manager
22 | institutions: []
23 | description: >-
24 | The HPGT dataset consists of images of Handwritten Paleographic
25 | Greek Text, derived from the Bodleian Libraries' Greek manuscript
26 | collection, specifically the Barocci collection, which dates from
27 | the 8th to the 17th centuries. This dataset is divided into two
28 | editions: HPGTR.N, which contains 77 unsegmented images categorized
29 | by century from the 10th to the 16th, and HPGTR.S, which features
30 | carefully segmented lines from selected images to facilitate machine
31 | learning tasks. The dataset captures a range of characteristics,
32 | including variations in writing style, page conditions, and
33 | manuscript production details.
34 |
35 | This dataset is part of the following work: Paraskevi Platanou,
36 | John Pavlopoulos, and Georgios Papaioannou. 2022. Handwritten
37 | Paleographic Greek Text Recognition: A Century-Based Approach.
38 | In *Proceedings of the "Thirteenth Language Resources and Evaluation Conference"*,
39 | pages 6585–6589, Marseille, France. European Language Resources Association.
40 | language:
41 | - grc
42 | transcription-guidelines: |
43 | - Abbreviations and ligatures were resolved
44 | - Minuscules at the beginning of sentences were kept as such.
45 | - Polytonic spelling and diaeresis are kept
46 | production-software: Unknown
47 | automatically-aligned: false
48 | characters:
49 | mode: NFD
50 | script:
51 | - iso: Grek
52 | script-type: only-manuscript
53 | time:
54 | notBefore: '0901'
55 | notAfter: '1600'
56 | hands:
57 | count: less-than-11
58 | precision: exact
59 | license:
60 | name: CC-BY-NC-SA 3.0
61 | url: https://creativecommons.org/licenses/by-nc-sa/3.0/
62 | format: Page-XML
63 | volume:
64 | - {count: 1698, metric: "lines"}
65 | - {count: 70, metric: "files"}
66 | - {count: 178, metric: "regions"}
67 | - {count: 64952, metric: "characters"}
--------------------------------------------------------------------------------
/catalog/LiDi/LiDi1-0-project.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: LiDi1.0-project
3 | url: https://github.com/Giorgiaagostini/LiDi1.0-project
4 | authors:
5 | - name: Giorgia
6 | surname: Agostini
7 | orcid: 0009-0007-9887-5129
8 | roles:
9 | - transcriber
10 | - aligner
11 | - project-manager
12 | - quality-control
13 | institutions: []
14 | description: >-
15 | This repository contains all data relating to the LiDi 1.0 project. In
16 | particular HTR GT of 16th-century antiquarian Pirro Ligorio, used to create
17 | Transkribus public model Ligorio 0.3 PyL.
18 | project-name: LiDi 1.0
19 | project-website: https://lidiws-limes.cfs.unipi.it
20 | language:
21 | - ita
22 | production-software: Transkribus
23 | automatically-aligned: false
24 | script:
25 | - iso: Latn
26 | - iso: Grek
27 | script-type: only-manuscript
28 | time:
29 | notBefore: '1568'
30 | notAfter: '1580'
31 | hands:
32 | count: '1'
33 | precision: estimated
34 | license:
35 | name: CC-BY-SA 4.0
36 | url: https://creativecommons.org/licenses/by-sa/4.0/
37 | format: Alto-XML
38 | sources:
39 | - reference: ''
40 | link: >-
41 | https://archiviodistatotorino.beniculturali.it/dbadd/visvol_bibl.php?uid=300146
42 | volume:
43 | - metric: files
44 | count: 195
45 | citation-file-link: >-
46 | https://github.com/Giorgiaagostini/LiDi1.0-project/blob/main/Data/Ground%20Truth/CITATION.cff
47 | transcription-guidelines: >-
48 | - Normalisation of «V» to «U» except in Latin inscriptions;
49 |
50 | - Preservation of the diacritical marks and punctuation as used by the Author
51 | except for the part in Greek;
52 |
53 | - Where the use of capital and small caps is not distinguished, it is
54 | transcribed according to the grammatical rules of the Italian language;
55 |
56 | - Tagging of uncertain words with the «unclear» tag;
57 |
58 | - Tagging of illegible words with three dots (...) and the «unclear» tag;
59 |
60 | - Use of the angle dash, instead of the hyphen, to divide words into syllables
61 | at the end of a line.
62 |
63 | Moreover, due to some issues in the visualization of Unicode ancient symbols,
64 | the Roman Denarius (U+10196) and the Roman Sestertius (U+10198) signs were
65 | transcribed using other symbols not used by the author from the Astronomical
66 | chart:
67 |
68 | Roman denarius sign ➛♀(U+2640 Female sign)
69 |
70 | Roman sestertius sign➛☿ (U+263F Mercury)
71 |
72 | In order to change them to the correct one during post-processing.
73 |
--------------------------------------------------------------------------------
/catalog/fondue/FONDUE-EN-PRINT-20.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Simon
3 | orcid: 0000-0001-9094-4475
4 | roles:
5 | - transcriber
6 | - project-manager
7 | - quality-control
8 | - support
9 | surname: Gabay
10 | - name: Jessica
11 | roles:
12 | - transcriber
13 | surname: Da Silva Fernandes
14 | - name: Myriam
15 | roles:
16 | - transcriber
17 | surname: Perregaux
18 | automatically-aligned: false
19 | characters:
20 | members:
21 | - e
22 | - t
23 | - o
24 | - n
25 | - a
26 | - i
27 | - r
28 | - s
29 | - h
30 | - d
31 | - l
32 | - c
33 | - u
34 | - m
35 | - f
36 | - g
37 | - p
38 | - ','
39 | - y
40 | - w
41 | - b
42 | - v
43 | - .
44 | - k
45 | - '1'
46 | - I
47 | - ¬
48 | - C
49 | - S
50 | - T
51 | - '-'
52 | - '9'
53 | - A
54 | - ;
55 | - '8'
56 | - M
57 | - x
58 | - '4'
59 | - '2'
60 | - /
61 | - '6'
62 | - N
63 | - G
64 | - R
65 | - D
66 | - q
67 | - '0'
68 | - '"'
69 | - H
70 | - E
71 | - '5'
72 | - z
73 | - P
74 | - W
75 | - U
76 | - '7'
77 | - (
78 | - j
79 | - )
80 | - '3'
81 | - B
82 | - "'"
83 | - ’
84 | - L
85 | - ':'
86 | - Y
87 | - O
88 | - V
89 | - Q
90 | - –
91 | - '?'
92 | - F
93 | - J
94 | - '!'
95 | - K
96 | - “
97 | - '['
98 | - ']'
99 | - X
100 | - Z
101 | - ́
102 | - ”
103 | - —
104 | mode: NFD
105 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-EN-PRINT-20/blob/master/CITATION.cff
106 | description: Various prints (academic, archives, novels…)
107 | format: Alto-XML
108 | hands:
109 | count: unknown
110 | precision: exact
111 | institutions: []
112 | language:
113 | - eng
114 | license:
115 | name: CC-BY 4.0
116 | url: https://creativecommons.org/licenses/by/4.0/
117 | production-software: eScriptorium + Kraken
118 | project-name: FoNDUE
119 | project-website: https://github.com/FoNDUE-HTR
120 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
121 | script:
122 | - iso: Latn
123 | script-type: only-typed
124 | time:
125 | notAfter: '1999'
126 | notBefore: '1900'
127 | title: FONDUE-EN-PRINT-20
128 | transcription-guidelines: SegmOnto
129 | url: https://github.com/FoNDUE-HTR/FONDUE-EN-PRINT-20
130 | volume:
131 | - count: 82834
132 | metric: characters
133 | - count: 30
134 | metric: files
135 | - count: 1728
136 | metric: lines
137 | - count: 72
138 | metric: regions
139 |
--------------------------------------------------------------------------------
/catalog/almanach/lectaurep-bronod.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Limon-Bonnet
3 | roles:
4 | - transcriber
5 | - aligner
6 | - quality-control
7 | surname: Françoise
8 | - name: Chagué
9 | roles:
10 | - support
11 | - project-manager
12 | - quality-control
13 | surname: Alix
14 | - name: Rostaing
15 | roles:
16 | - project-manager
17 | surname: Aurélia
18 | characters:
19 | members:
20 | - e
21 | - t
22 | - a
23 | - /
24 | - '0'
25 | - c
26 | - n
27 | - r
28 | - m
29 | - h
30 | - p
31 | - s
32 | - o
33 | - g
34 | - '5'
35 | - '7'
36 | - '1'
37 | - E
38 | - .
39 | - i
40 | - '-'
41 | - '3'
42 | - '9'
43 | - '2'
44 | - f
45 | - d
46 | - '8'
47 | - <
48 | - l
49 | - '{'
50 | - ':'
51 | - P
52 | - A
53 | - G
54 | - '}'
55 | - U
56 | - x
57 | - '>'
58 | - b
59 | - '4'
60 | - '6'
61 | mode: NFD
62 | citation-file-link: https://raw.githubusercontent.com/HTR-United/lectaurep-bronod/master/CITATION.cff
63 | description: "Ground truth for Maître Bronod’s registers, notary in Paris during the\
64 | \ 18th century.\n"
65 | format: Page-XML
66 | hands:
67 | count: '1'
68 | precision: exact
69 | language:
70 | - fra
71 | license:
72 | - name: CC-BY 4.0
73 | url: https://creativecommons.org/licenses/by/4.0/
74 | production-software: eScriptorium + Kraken
75 | project-name: "LECTAUREP\n"
76 | project-website: https://lectaurep.hypotheses.org/
77 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
78 | script:
79 | - iso: Latn
80 | script-type: only-manuscript
81 | sources:
82 | - link: ''
83 | reference: Limon-Bonnet, M. (2021). Lectaurep-Bronod, ground truth for Maitre Bronod's
84 | documents (French XVIIIth century) (Version 1.0) [Computer software]. https://doi.org/10.5072/zenodo.977735
85 | time:
86 | notAfter: '1745'
87 | notBefore: '1742'
88 | title: Notaires de Paris - Bronod
89 | transcription-guidelines: "Transcription fidèle aux manuscrits : la casse et les abréviations\
90 | \ sont respectées. Les portions de texte suscrites sont précédées d'un symbole `^`.\
91 | \ Pas de traitement particulier des éventuels s longs.\n"
92 | url: https://github.com/HTR-United/lectaurep-bronod
93 | volume:
94 | - count: 359094
95 | metric: characters
96 | - count: 100
97 | metric: files
98 | - count: 3702
99 | metric: lines
100 | - count: 200
101 | metric: pages
102 | - count: 296
103 | metric: regions
104 |
--------------------------------------------------------------------------------
/catalog/ciham-htr/fabliaux.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Fabliaux
3 | url: https://github.com/CIHAM-HTR/Fabliaux
4 | authors:
5 | - name: Corinne
6 | surname: Pierreville
7 | orcid: 0009-0003-3074-3841
8 | roles:
9 | - project-manager
10 | - name: Ariane
11 | surname: Pinche
12 | orcid: 0000-0002-7843-5050
13 | roles:
14 | - transcriber
15 | - aligner
16 | - quality-control
17 | institutions: []
18 | description: HTR data sets from medieval manuscripts (13th-14th c.) collecting "fabliaux"
19 | project-website: https://projet.biblissima.fr/fr/appels-projets/projets-retenus/fabliaux
20 | language:
21 | - fro
22 | production-software: eScriptorium + Kraken
23 | script:
24 | - iso: Latn
25 | script-type: only-manuscript
26 | time:
27 | notBefore: '1200'
28 | notAfter: '1402'
29 | hands:
30 | count: 1-per-folder
31 | precision: exact
32 | license:
33 | - name: CC-BY 4.0
34 | url: https://creativecommons.org/licenses/by/4.0/
35 | format: Alto-XML
36 | citation-file-link: https://github.com/CIHAM-HTR/Fabliaux/blob/master/CITATION.cff
37 | transcription-guidelines: "The data follow the standards recommended by the CREMMALAB\
38 | \ project, see Ariane Pinche. Transcription Guide for 10th to 15th Century Manuscripts.\
39 | \ 2022. \u27E8hal-03697382\u27E9"
40 | volume:
41 | - metric: characters
42 | count: 19600
43 | - metric: files
44 | count: 10
45 | - metric: lines
46 | count: 904
47 | - metric: regions
48 | count: 40
49 | characters:
50 | mode: NFD
51 | members:
52 | - e
53 | - i
54 | - s
55 | - t
56 | - a
57 | - o
58 | - u
59 | - n
60 | - r
61 | - l
62 | - d
63 | - m
64 | - c
65 | - p
66 | - "\u0303"
67 | - f
68 | - q
69 | - b
70 | - .
71 | - h
72 | - z
73 | - g
74 | - "\u204A"
75 | - "\u033E"
76 | - "\uA751"
77 | - Q
78 | - "\u0365"
79 | - I
80 | - x
81 | - "\uA770"
82 | - S
83 | - C
84 | - E
85 | - "\uA76F"
86 | - T
87 | - L
88 | - N
89 | - O
90 | - y
91 | - M
92 | - D
93 | - "\u0363"
94 | - F
95 | - A
96 | - U
97 | - "\u0142"
98 | - "\u1E9C"
99 | - P
100 | - B
101 | - ':'
102 | - '9'
103 | - "\uF1AC"
104 | - '1'
105 | - '6'
106 | - '4'
107 | - "\u0366"
108 | - "\u27E6"
109 | - "\u27E7"
110 | - "\u205C"
111 | - ''''
112 | - G
113 | - "\u1DE4"
114 | - "\u036B"
115 | - '7'
116 | - '5'
117 | - '0'
118 | - "\uA753"
119 | - '8'
120 |
--------------------------------------------------------------------------------
/catalog/fondue/FONDUE-FR-MSS-18.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Peter
3 | roles:
4 | - transcriber
5 | surname: Nahon
6 | - name: Simon
7 | orcid: 0000-0001-9094-4475
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - quality-control
12 | - support
13 | surname: Gabay
14 | automatically-aligned: false
15 | characters:
16 | members:
17 | - e
18 | - s
19 | - a
20 | - t
21 | - u
22 | - i
23 | - r
24 | - n
25 | - o
26 | - l
27 | - d
28 | - c
29 | - m
30 | - p
31 | - ','
32 | - v
33 | - q
34 | - .
35 | - ́
36 | - f
37 | - g
38 | - b
39 | - h
40 | - "'"
41 | - ’
42 | - I
43 | - +
44 | - y
45 | - ¬
46 | - '1'
47 | - ̀
48 | - ̂
49 | - x
50 | - V
51 | - j
52 | - S
53 | - '2'
54 | - ':'
55 | - E
56 | - X
57 | - C
58 | - L
59 | - J
60 | - '3'
61 | - D
62 | - '4'
63 | - M
64 | - ;
65 | - ̈
66 | - A
67 | - '5'
68 | - '6'
69 | - '8'
70 | - '9'
71 | - '7'
72 | - '0'
73 | - P
74 | - O
75 | - ̧
76 | - R
77 | - '-'
78 | - N
79 | - G
80 | - T
81 | - '?'
82 | - B
83 | - œ
84 | - H
85 | -
86 | - Q
87 | - α
88 | - F
89 | - z
90 | - Z
91 | - U
92 | - ̓
93 | - ο
94 | - ν
95 | - μ
96 | - ω
97 | - τ
98 | - δ
99 | - ε
100 | - ρ
101 | - φ
102 | - (
103 | - )
104 | - '{'
105 | - k
106 | - Ψ
107 | - ι
108 | - υ
109 | - π
110 | - λ
111 | - Y
112 | - K
113 | mode: NFD
114 | citation-file-link: https://github.com/FoNDUE-HTR/FONDUE-FR-MSS-18/blob/master/CITATION.cff
115 | description: French manuscripts of the 18th century
116 | format: Alto-XML
117 | hands:
118 | count: unknown
119 | precision: exact
120 | institutions: []
121 | language:
122 | - fra
123 | license:
124 | name: CC-BY 4.0
125 | url: https://creativecommons.org/licenses/by/4.0/
126 | production-software: eScriptorium + Kraken
127 | project-name: FoNDUE
128 | project-website: https://github.com/FoNDUE-HTR
129 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
130 | script:
131 | - iso: Latn
132 | script-type: only-manuscript
133 | time:
134 | notAfter: '1799'
135 | notBefore: '1700'
136 | title: FONDUE-FR-MSS-18
137 | transcription-guidelines: SegmOnto
138 | url: https://github.com/FoNDUE-HTR/FONDUE-FR-MSS-18
139 | volume:
140 | - count: 108705
141 | metric: characters
142 | - count: 82
143 | metric: files
144 | - count: 2933
145 | metric: lines
146 | - count: 203
147 | metric: regions
148 |
--------------------------------------------------------------------------------
/catalog/enc-cours-git/tnah-notredame.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Projet Notre-Dame
3 | url: https://github.com/PSL-Chartes-HTR-Students/TNAH-2021-Projet-Notre-Dame
4 | project-name: 'ENC - Bonnes pratiques du developpement collaboratif
5 |
6 | '
7 | authors:
8 | - name: Doat
9 | surname: Soline
10 | roles:
11 | - transcriber
12 | - name: Menu
13 | surname: Ariane
14 | roles:
15 | - transcriber
16 | - name: Falcoz
17 | surname: Elsa
18 | roles:
19 | - transcriber
20 | - name: Faure
21 | surname: Margaux
22 | roles:
23 | - transcriber
24 | - name: "Mazou\xE9"
25 | surname: "Ana\xEFs"
26 | roles:
27 | - transcriber
28 | description: "Le Projet Notre-Dame consiste en une transcription des journaux quotidiens\
29 | \ de l\u2019ann\xE9e 1860 (https://mediatheque-patrimoine.culture.gouv.fr/sites/mediatheque/files/jnd_1860.pdf)\
30 | \ des travaux de restauration effectu\xE9s de 1844 \xE0 1865 \xE0 la cath\xE9drale\
31 | \ Notre-Dame de Paris sous la direction d'Eug\xE8ne Viollet-le-Duc et Jean-Baptiste\
32 | \ Lassus. Celle-ci a \xE9t\xE9 effectu\xE9e sur eScriptorium \xE0 partir de la num\xE9\
33 | risation des journaux des travaux (https://mediatheque-patrimoine.culture.gouv.fr/travaux-de-notre-dame-de-paris-1844-1865)\
34 | \ r\xE9alis\xE9e par la M\xE9diath\xE8que de l'architecture et du patrimoine. \n"
35 | language:
36 | - fra
37 | script:
38 | - iso: Latn
39 | script-type: only-manuscript
40 | time:
41 | notBefore: '1860'
42 | notAfter: '1860'
43 | hands:
44 | count: '1'
45 | precision: exact
46 | license:
47 | - name: CC-BY 4.0
48 | url: https://creativecommons.org/licenses/by/4.0/
49 | format: Alto-XML
50 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/TNAH-2021-Projet-Notre-Dame/main/CITATION.cff
51 | transcription-guidelines: "- respect des majuscules et minuscules - respect des ligatures\
52 | \ (par exemple, transcrire \"ch\u0153ur\") - mot qui est barr\xE9 : \u96BE (une\
53 | \ seule fois par mot) mais seulement s'ils sont totalement/\xE0 moiti\xE9 illisibles.\
54 | \ Les restranscrire entre accolades {} s'ils sont lisibles. - Pour mettre en exergue\
55 | \ les doutes de transcription : \n - mot incertain: [incertain]\n - mot que\
56 | \ l'on ne parvient pas \xE0 transcrire : [??]\n"
57 | volume:
58 | - metric: characters
59 | count: 29286
60 | - metric: files
61 | count: 12
62 | - metric: lines
63 | count: 735
64 | - metric: regions
65 | count: 86
66 | production-software: "eScriptorium + Kraken"
67 |
--------------------------------------------------------------------------------
/catalog/popp/the-popp-datasets.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: The POPP datasets
3 | url: https://zenodo.org/record/6581158
4 | authors:
5 | - name: Thomas
6 | surname: Constum
7 | roles:
8 | - aligner
9 | - quality-control
10 | - support
11 | - name: Nicolas
12 | surname: Kempf
13 | - name: Pierrick
14 | surname: Tranouez
15 | - name: Thierry
16 | surname: Paquet
17 | roles:
18 | - project-manager
19 | - name: Sandra
20 | surname: Brée
21 | orcid: 0000-0002-2802-5563
22 | roles:
23 | - transcriber
24 | - project-manager
25 | - name: François
26 | surname: Merveille
27 | roles:
28 | - transcriber
29 | institutions: []
30 | description: >-
31 | The POPP datasets are a set of 3 datasets created within the POPP project
32 | (Project for the Oceration of the Paris Population Census) for the task of
33 | handwriting text recognition. These datasets have been published in
34 | "Recognition and information extraction in historical handwritten tables:
35 | toward understanding early 20th century Paris census" at DAS 2022.
36 |
37 |
38 | The 3 datasets are called “Generic dataset”, “Belleville”, and “Chaussée
39 | d’Antin” and contain lines made from the extracted rows of census tables from
40 | 1926. Each table in the Paris census contains 30 rows, thus each page in these
41 | datasets corresponds to 30 lines.
42 | project-name: Project for the Oceration of the Paris Population Census
43 | project-website: https://popp.hypotheses.org
44 | language:
45 | - fra
46 | production-software: Pivan
47 | script:
48 | - iso: Latn
49 | script-type: only-manuscript
50 | time:
51 | notBefore: '1926'
52 | notAfter: '1926'
53 | hands:
54 | count: more-than-10
55 | precision: estimated
56 | license:
57 | - name: CC-BY 4.0
58 | url: https://creativecommons.org/licenses/by/4.0/
59 | format: Alto-XML
60 | volume:
61 | - metric: lines
62 | count: 7050
63 | transcription-guidelines: >
64 | The text is transcribed as in the image (no correction of misspelling, no
65 | resolution of abbreviation).
66 |
67 | Since the lines are extracted from table rows, we defined 4 special characters
68 | to describe the structure of the text:
69 | ¤ : indicates an empty cell
70 | / : indicates the separation into columns
71 | ? : indicates that the content of the cell following this symbol is written above the regular baseline
72 | ! : indicates that the content of the cell following this symbol is written below the regular baseline
73 |
--------------------------------------------------------------------------------
/catalog/almanach/tapuscorpus.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Chagué
3 | roles:
4 | - transcriber
5 | - project-manager
6 | surname: Alix
7 | characters:
8 | members:
9 | - e
10 | - a
11 | - s
12 | - n
13 | - t
14 | - r
15 | - i
16 | - u
17 | - o
18 | - l
19 | - d
20 | - c
21 | - m
22 | - p
23 | - ́
24 | - .
25 | - '~'
26 | - v
27 | - ','
28 | - "'"
29 | - '-'
30 | - f
31 | - g
32 | - h
33 | - q
34 | - b
35 | - ̀
36 | - _
37 | - E
38 | - L
39 | - A
40 | - I
41 | - C
42 | - x
43 | - S
44 | - M
45 | - j
46 | - T
47 | - ̂
48 | - R
49 | - N
50 | - '1'
51 | - O
52 | - P
53 | - y
54 | - '"'
55 | - U
56 | - J
57 | - D
58 | - '2'
59 | - ':'
60 | - )
61 | - (
62 | - B
63 | - '0'
64 | - '5'
65 | - '3'
66 | - '4'
67 | - z
68 | - '6'
69 | - F
70 | - H
71 | - Q
72 | - '!'
73 | - '9'
74 | - G
75 | - '7'
76 | - V
77 | - '8'
78 | - '?'
79 | - ⟦
80 | - ⟧
81 | - ̧
82 | - Y
83 | - ;
84 | - ’
85 | - °
86 | - k
87 | - X
88 | - ̈
89 | - +
90 | - '='
91 | - W
92 | - /
93 | - K
94 | - ^
95 | - w
96 | - Z
97 | - '%'
98 | - '*'
99 | mode: NFD
100 | citation-file-link: https://github.com/HTR-United/tapuscorpus/raw/main/citation.cff
101 | description: Ground truth based on a variety of French typewritten documents from
102 | the 20th century. Contains excerpts from plays, poems, letters and administrative reports.
103 | format: Page-XML
104 | hands:
105 | count: 1-per-folder
106 | precision: exact
107 | language:
108 | - fra
109 | license:
110 | - name: CC-BY 4.0
111 | url: https://creativecommons.org/licenses/by/4.0/
112 | production-software: eScriptorium + Kraken
113 | project-name: "HTR-United\n"
114 | project-website: https://htr-united.github.io/
115 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
116 | script:
117 | - iso: Latn
118 | script-type: only-typed
119 | sources:
120 | - link: ''
121 | reference: Chagué, A. (2021). Tapuscorpus (Version 1.0) [Computer software]. https://doi.org/10.5072/zenodo.977649
122 | time:
123 | notAfter: '1999'
124 | notBefore: '1900'
125 | title: Tapus Corpus
126 | transcription-guidelines: See README in repository.
127 | url: https://github.com/HTR-United/tapuscorpus
128 | volume:
129 | - count: 131511
130 | metric: characters
131 | - count: 151
132 | metric: files
133 | - count: 4376
134 | metric: lines
135 | - count: 150
136 | metric: pages
137 | - count: 375
138 | metric: regions
139 |
--------------------------------------------------------------------------------
/catalog/enc-cours-git/hn-chavigny.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Chateau de Chavigny
3 | url: https://github.com/PSL-Chartes-HTR-Students/HN2021-ChateauChavigny
4 | project-name: ENC - Bonnes pratiques du developpement collaboratif
5 | authors:
6 | - name: Pascual
7 | surname: Margot
8 | roles:
9 | - transcriber
10 | - name: "Franchet d'Esp\xE8rey"
11 | surname: Louis-Fiacre
12 | roles:
13 | - transcriber
14 | - digitization
15 | - name: Gabay
16 | surname: Simon
17 | roles:
18 | - quality-control
19 | description: "Le document sur lequel nous travaillons porte sur le Ch\xE2teau de Chavigny\
20 | \ \xE0 Lern\xE9 en Touraine. Au XVI\xE8me si\xE8cle, c\u2019est la famille des seigneurs\
21 | \ Leroy qui poss\xE8de ce ch\xE2teau. Avant 1568, en pleine guerre de religion,\
22 | \ Fran\xE7ois Leroy, du parti du roi et des catholiques, participe \xE0 la capture\
23 | \ et la ran\xE7on du prince de Cond\xE9, du parti protestant. En 1568, Fran\xE7\
24 | ois Leroy, en tant que capitaine de 50 lances au service du roi, part en campagne\
25 | \ avec lui. L'objectif est de transcrire cinq feuillets d'un manuscrit \xE0 l'aide\
26 | \ d'eScriptorium. Le but \xE9tant d'apprendre \xE0 utiliser git et github pour mener\
27 | \ \xE0 bien notre premier projet collaboratif.\n"
28 | language:
29 | - frm
30 | script:
31 | - iso: Latn
32 | script-type: only-manuscript
33 | time:
34 | notBefore: '1568'
35 | notAfter: '1599'
36 | hands:
37 | count: '1'
38 | precision: exact
39 | license:
40 | - name: CC-BY 4.0
41 | url: https://creativecommons.org/licenses/by/4.0/
42 | format: Alto-XML
43 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/HN2021-ChateauChavigny/main/CITATION.cff
44 | transcription-guidelines: "- Gestion des abr\xE9viations: \n - Si d\xE9veloppement\
45 | \ (pas toujours), les d\xE9velopper entre crochets.\n - L'orthographe originale\
46 | \ et les abr\xE9viations doivent \xEAtre conserv\xE9es.\n- Gestion des \xE9checs\
47 | \ de transcription de caract\xE8re : lorsqu'un caract\xE8re nous para\xEE\
48 | t non s\xFBr, nous pr\xE9f\xE9rons mettre un [?] pour indiquer qu'il y a un caract\xE8\
49 | re non transcrit dans un mot. Pour plusieurs caract\xE8res, faire autant de ? que\
50 | \ de caract\xE8re non reconnu : tel [???] pour 3 caract\xE8res.\n"
51 | volume:
52 | - metric: characters
53 | count: 9126
54 | - metric: files
55 | count: 6
56 | - metric: lines
57 | count: 253
58 | - metric: regions
59 | count: 22
60 | production-software: "eScriptorium + Kraken"
61 |
--------------------------------------------------------------------------------
/catalog/gallicorpora/incunable-15.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Gabay
3 | roles:
4 | - project-manager
5 | surname: Simon
6 | - name: Pinche
7 | roles:
8 | - project-manager
9 | surname: Ariane
10 | - name: Leroy
11 | roles:
12 | - transcriber
13 | surname: Noé
14 | - name: Christensen
15 | roles:
16 | - support
17 | surname: Kelly
18 | characters:
19 | members:
20 | - e
21 | - s
22 | - u
23 | - t
24 | - a
25 | - i
26 | - r
27 | - o
28 | - n
29 | - l
30 | - d
31 | - c
32 | - m
33 | - p
34 | - ̃
35 | - f
36 | - q
37 | - g
38 | - y
39 | - h
40 | - b
41 | - .
42 | - z
43 | - ⁊
44 | - x
45 | - E
46 | - '-'
47 | - ','
48 | - ¶
49 | - L
50 | - ͥ
51 | - D
52 | - C
53 | - ;
54 | - ᷤ
55 | - I
56 | - ꝰ
57 | - Q
58 | - A
59 | - S
60 | - ꝑ
61 | - P
62 | - M
63 | - O
64 | - T
65 | - U
66 | - N
67 | - F
68 | - R
69 | - ꝓ
70 | - B
71 | - G
72 | - ꝯ
73 | - ̾
74 | - H
75 | - ᷑
76 | - ͬ
77 | - ̌
78 | - ':'
79 | - (
80 | - '['
81 | - ']'
82 | - v
83 | - J
84 | - Ꝙ
85 | - )
86 | - k
87 | - ꝙ
88 | - ͣ
89 | - V
90 | - '4'
91 | - ͦ
92 | - w
93 | - ͨ
94 | - ͤ
95 | - Ι
96 | - ̧
97 | - '1'
98 | - '9'
99 | - '7'
100 | - ̶
101 | - "'"
102 | - ́
103 | - '|'
104 | mode: NFD
105 | citation-file-link: https://github.com/Gallicorpora/HTR-incunable-15e-siecle/CITATION.cff
106 | description: Corpus d'entrainement pour l'HTR composé d'incunables français du 15e
107 | s.
108 | format: Alto-XML
109 | hands:
110 | count: 1-per-folder
111 | precision: estimated
112 | language:
113 | - frm
114 | - fra
115 | license:
116 | - name: CC-BY 4.0
117 | url: https://creativecommons.org/licenses/by/4.0/
118 | production-software: eScriptorium + Kraken
119 | project-name: Gallicorpora
120 | project-website: https://github.com/Gallicorpora
121 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
122 | script:
123 | - iso: Latn
124 | script-type: only-typed
125 | time:
126 | notAfter: '1500'
127 | notBefore: '1400'
128 | title: Données HTR incunables du 15e siècle
129 | transcription-guidelines: 'Les normes de transcription suivent les préconisations
130 | du projet CREMMALAB : https://cremmalab.hypotheses.org'
131 | url: https://github.com/Gallicorpora/HTR-incunable-15e-siecle
132 | volume:
133 | - count: 245094
134 | metric: characters
135 | - count: 149
136 | metric: files
137 | - count: 7608
138 | metric: lines
139 | - count: 535
140 | metric: regions
141 |
--------------------------------------------------------------------------------
/catalog/meleagre/meleagre.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Maxime
3 | orcid: 0009-0006-2076-1220
4 | roles:
5 | - transcriber
6 | - aligner
7 | - quality-control
8 | surname: Guénette
9 | - name: Mathilde
10 | orcid: 0000-0003-1642-8610
11 | roles:
12 | - transcriber
13 | - aligner
14 | - quality-control
15 | surname: Verstraete
16 | - name: Alix
17 | orcid: 0000-0002-0136-4434
18 | roles:
19 | - quality-control
20 | - support
21 | surname: Chagué
22 | - name: Marcello
23 | orcid: 0000-0001-6424-3229
24 | roles:
25 | - project-manager
26 | surname: Vitali-Rosati
27 | automatically-aligned: false
28 | characters:
29 | members:
30 | - α
31 | - ι
32 | - ́
33 | - ο
34 | - ε
35 | - ν
36 | - σ
37 | - τ
38 | - ̓
39 | - υ
40 | - ρ
41 | - ·
42 | - κ
43 | - λ
44 | - η
45 | - ̀
46 | - π
47 | - μ
48 | - δ
49 | - ω
50 | - ͂
51 | - θ
52 | - γ
53 | - ̔
54 | - χ
55 | - φ
56 | - ':'
57 | - β
58 | - ᾽
59 | - ⋇
60 | - ⁛
61 | - ξ
62 | - ̈
63 | - '~'
64 | - ζ
65 | - ψ
66 | - ※
67 | - ∻
68 | - ͳ
69 | mode: NFD
70 | description: >-
71 | Ground Truth dataset for the Codex palatinus graecus 23 (Palatine Anthology),
72 | byzantine writing from the X^th^ century.
73 | format: Alto-XML
74 | hands:
75 | count: less-than-11
76 | precision: estimated
77 | institutions: []
78 | language:
79 | - grc
80 | license:
81 | name: CC-BY 4.0
82 | url: https://creativecommons.org/licenses/by/4.0/
83 | production-software: eScriptorium + Kraken
84 | project-website: https://anthologiagraeca.org/
85 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
86 | script:
87 | - iso: Grek
88 | qualify: byzantine
89 | script-type: only-manuscript
90 | sources:
91 | - link: https://doi.org/10.11588/diglit.3449
92 | reference: >-
93 | Cod. Pal. graec. 23 (10e s. av., Constantinople). Universitätsbibliothek
94 | Heidelberg, Germany.
95 | time:
96 | notAfter: '1000'
97 | notBefore: '900'
98 | title: Ground truth for the Palatine Anthology (HTR_CPgr23)
99 | transcription-guidelines: We do not resolve abbreviations, except when they are
100 | unambiguous. Full guidelines available at https://gitlab.huma-num.fr/ecrinum/anthologia/htr_cpgr23
101 | url: https://gitlab.huma-num.fr/ecrinum/anthologia/htr_cpgr23
102 | volume:
103 | - count: 114273
104 | metric: characters
105 | - count: 70
106 | metric: files
107 | - count: 3374
108 | metric: lines
109 | - count: 50
110 | metric: pages
111 | - count: 574
112 | metric: regions
113 |
--------------------------------------------------------------------------------
/catalog/rasam-2/rasam.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: RASAM 2
3 | url: https://github.com/calfa-co/rasam-dataset
4 | authors:
5 | - name: "Vidal-Gorène"
6 | surname: Chahan
7 | orcid: 0000-0003-1567-6508
8 | roles:
9 | - project-manager
10 | - name: Salah
11 | surname: "Clément"
12 | orcid: 0000-0002-7846-4054
13 | roles:
14 | - transcriber
15 | - quality-control
16 | - name: Lucas
17 | surname: "Noémie"
18 | orcid: 0000-0003-2236-6778
19 | roles:
20 | - project-manager
21 | - quality-control
22 | - name: Decours-Perez
23 | surname: "Aliénor"
24 | roles:
25 | - support
26 | - name: Antoine
27 | surname: Perrier
28 | orcid: 0000-0002-5035-4283
29 | roles:
30 | - project-manager
31 | - quality-control
32 | - transcriber
33 | institutions:
34 | - name: BULAC
35 | - name: Calfa
36 | - name: DISTAM
37 | - name: GIS MOMM
38 | description: 'The Dataset is made up of 250 images, with their related ground truth
39 | stored in an XML file (pageXML format). Images come from fifteen manuscripts selected
40 | among the collections of the BULAC Library (Paris), in Maghribi Arabic. It extends RASAM 1 by covering a very wide variety of hands, text density, and cursiveness. This dataset is the result of a collaborative transcription. All the
41 | participants are credited on the official deposit. With the support of the French
42 | Ministry of Higher Education, Research and Innovation, the Research Consortium Middle-East
43 | and Muslim Worlds (GIS MOMM), Calfa and the BULAC library.'
44 | language:
45 | - ara
46 | script:
47 | - iso: Arab
48 | script-type: only-manuscript
49 | time:
50 | notBefore: '1700'
51 | notAfter: '1899'
52 | hands:
53 | count: more-than-10
54 | precision: exact
55 | license:
56 | - name: Apache-2.0 License
57 | url: https://www.apache.org/licenses/LICENSE-2.0
58 | format: Page-XML
59 | volume:
60 | - metric: lines
61 | count: 3750
62 | - metric: files
63 | count: 250
64 | - metric: regions
65 | count: 839
66 | - metric: characters
67 | count: 522371
68 | sources:
69 | - reference: "Chahan Vidal-Gorène, Clément Salah, Noëmie Lucas, Aliénor Decours-Perez, Antoine Perrier. Enhancing Arabic Maghribi Handwritten Text Recognition with RASAM 2: A Comprehensive Dataset and Benchmarking. Computational Humanities Research (CHR), Dec 2024, Aarhus, Denmark. pp.200-216."
70 | link: https://ceur-ws.org/Vol-3834/paper35.pdf
71 | transcription-guidelines: 'Full description of specifications for transcription available on Github and in the paper. Following RASAM 1 specifications.'
72 | production-software: "Calfa Vision"
73 |
--------------------------------------------------------------------------------
/catalog/alix-tz/moonshines.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Moonshines
3 | url: https://github.com/alix-tz/moonshines
4 | authors:
5 | - name: Alix
6 | surname: "Chagu\xE9"
7 | orcid: 0000-0002-0136-4434
8 | roles:
9 | - transcriber
10 | - aligner
11 | - project-manager
12 | - digitization
13 | institutions: []
14 | description: This dataset is composed of pages of text written in 2023 by a single
15 | person, copying texts taken from Guillaume Apollinaire's poems published in Alcools,
16 | and taken from Guillaume Apollinaire's Wikipedia page.
17 | language:
18 | - fra
19 | production-software: eScriptorium + Kraken
20 | script:
21 | - iso: Latn
22 | script-type: only-manuscript
23 | time:
24 | notBefore: '2023'
25 | notAfter: '2023'
26 | hands:
27 | count: '1'
28 | precision: exact
29 | license:
30 | - name: CC-BY 4.0
31 | url: https://creativecommons.org/licenses/by/4.0/
32 | format: Alto-XML
33 | volume:
34 | - metric: characters
35 | count: 27734
36 | - metric: files
37 | count: 45
38 | - metric: lines
39 | count: 1016
40 | - metric: regions
41 | count: 45
42 | citation-file-link: https://github.com/alix-tz/moonshines/blob/master/CITATION.cff
43 | transcription-guidelines: The transcription strictly follows what is written on the
44 | images, including accentuation or capitalization errors. The segmentation follows
45 | the SegmOnto ontology and mostly relies on MainZone and DefaultLine. Beware that
46 | this dataset barely contains any punctuation and that most lines begin with a capital
47 | letter.
48 | characters:
49 | mode: NFD
50 | members:
51 | - e
52 | - s
53 | - a
54 | - n
55 | - r
56 | - i
57 | - t
58 | - u
59 | - o
60 | - l
61 | - d
62 | - m
63 | - c
64 | - p
65 | - "\u0301"
66 | - ''''
67 | - v
68 | - g
69 | - b
70 | - h
71 | - "\u0300"
72 | - f
73 | - L
74 | - q
75 | - E
76 | - '1'
77 | - A
78 | - C
79 | - x
80 | - y
81 | - "\u0302"
82 | - S
83 | - '9'
84 | - P
85 | - M
86 | - j
87 | - T
88 | - D
89 | - '-'
90 | - N
91 | - J
92 | - R
93 | - '0'
94 | - z
95 | - O
96 | - I
97 | - '2'
98 | - '8'
99 | - V
100 | - F
101 | - G
102 | - U
103 | - '5'
104 | - B
105 | - Q
106 | - )
107 | - H
108 | - '3'
109 | - (
110 | - '7'
111 | - '6'
112 | - w
113 | - k
114 | - '4'
115 | - "\u0327"
116 | - K
117 | - Z
118 | - "\u0308"
119 | - Y
120 | - '{'
121 | - '}'
122 | - W
123 | - .
124 | - X
125 | - ','
126 |
--------------------------------------------------------------------------------
/catalog/front-justice/front-justice-htr.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: HTR Front Justice
3 | url: https://github.com/Front-Justice/htr-front-justice.git
4 | authors:
5 | - name: Théo
6 | orcid: 0000-0003-2235-0565
7 | roles:
8 | - transcriber
9 | - digitization
10 | surname: Burnel
11 | - name: Giovanni Pietro
12 | orcid: 0000-0003-2722-6766
13 | roles:
14 | - project-manager
15 | surname: Vitali
16 | institutions: []
17 | description: >-
18 | Some transcriptions of minute books from military court councils during the
19 | First World War
20 | language:
21 | - fra
22 | production-software: eScriptorium + Kraken
23 | automatically-aligned: false
24 | script:
25 | - iso: Latn
26 | script-type: evenly-mixed
27 | time:
28 | notAfter: '1919'
29 | notBefore: '1914'
30 | hands:
31 | count: more-than-10
32 | precision: estimated
33 | license:
34 | name: CC-BY 4.0
35 | url: https://creativecommons.org/licenses/by/4.0/
36 | format: Alto-XML
37 | citation-file-link: >-
38 | https://github.com/Front-Justice/htr-front-justice/blob/31bd9342dc774b5c0c4b6fd9a704bb186430c6e3/CITATION.cff
39 | volume:
40 | - count: 795781
41 | metric: characters
42 | - count: 250
43 | metric: files
44 | - count: 13044
45 | metric: lines
46 | - count: 1333
47 | metric: regions
48 | transcription-guidelines: >-
49 | See README (Annotation and Transcription Guidelines section)
50 | characters:
51 | members:
52 | - e
53 | - .
54 | - i
55 | - r
56 | - s
57 | - n
58 | - a
59 | - t
60 | - u
61 | - l
62 | - o
63 | - d
64 | - c
65 | - ́
66 | - m
67 | - p
68 | - ','
69 | - f
70 | - "'"
71 | - v
72 | - '1'
73 | - q
74 | - g
75 | - C
76 | - ̀
77 | - b
78 | - E
79 | - j
80 | - ^
81 | - x
82 | - L
83 | - '2'
84 | - A
85 | - P
86 | - (
87 | - )
88 | - h
89 | - '-'
90 | - '3'
91 | - N
92 | - R
93 | - M
94 | - G
95 | - '9'
96 | - D
97 | - y
98 | - I
99 | - U
100 | - '4'
101 | - ̂
102 | - T
103 | - '0'
104 | - F
105 | - '6'
106 | - '8'
107 | - ;
108 | - O
109 | - S
110 | - J
111 | - '7'
112 | - ':'
113 | - '5'
114 | - +
115 | - B
116 | - Q
117 | - V
118 | - z
119 | - ̧
120 | - H
121 | - «
122 | - X
123 | - '?'
124 | - ̈
125 | - –
126 | - '['
127 | - ']'
128 | - »
129 | - _
130 | - '"'
131 | - /
132 | - k
133 | - '&'
134 | - Z
135 | - Y
136 | - K
137 | - W
138 | - '>'
139 | - <
140 | - '%'
141 | - '='
142 | - '|'
143 | mode: NFD
144 |
--------------------------------------------------------------------------------
/catalog/greek-data/eparchos.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: EPARCHOS
3 | url: https://zenodo.org/records/4095301
4 | authors:
5 | - name: Aleksandros
6 | surname: Papazoglou
7 | roles:
8 | - transcriber
9 | - project-manager
10 | - name: Ioannis
11 | surname: Pratikakis
12 | orcid: 0000-0002-4124-3688
13 | roles:
14 | - transcriber
15 | - project-manager
16 | - name: Kleopatra
17 | surname: Markou
18 | roles:
19 | - transcriber
20 | - project-manager
21 | - name: Lazaros
22 | surname: Tsochatzidis
23 | orcid: 0000-0002-4634-7419
24 | roles:
25 | - transcriber
26 | - project-manager
27 | institutions: []
28 | description: >-
29 | The dataset originates from a Greek handwritten codex that dates from around
30 | 1500-1530. This is the subset of the codex British Museum Addit. 6791, written
31 | by two hands, one by Antonius Eparchos and the other by Camillos Zanettus (ff.
32 | 104r-174v) and delivers texts by Hierocles (In Aureum carmen), Matthaeus
33 | Blastares (Collectio alphabetica) and, notably, texts by Michael Psellos (De
34 | omnifaria doctrina). The writing delivers the most important abbreviations,
35 | logograms and conjunctions, which are cited in virtually every Greek minuscule
36 | handwritten codex from the years of the manuscript transliteration and the
37 | prevalence of the minuscule script (9th century) to the post-Byzantine years.
38 | This dataset consists of 120 scanned handwritten text pages, containing 9285
39 | lines of text, 18809 words (6787 unique words). For each page, a PageXML is
40 | provided containing the following groundtruth:
41 | 1. Text region polygon coordinates
42 | 2. Text line polygon coordinates with the corresponding transcription text
43 | 3. Word polygon coordinates with the corresponding transcription text
44 | language:
45 | - grc
46 | transcription-guidelines: |
47 | - Abbreviations and ligatures were resolved
48 | - Minuscules at the beginning of sentences were kept as such.
49 | - Polytonic spelling and diaeresis are kept
50 | production-software: Unknown
51 | automatically-aligned: false
52 | characters:
53 | mode: NFD
54 | script:
55 | - iso: Grek
56 | script-type: only-manuscript
57 | time:
58 | notBefore: '1500'
59 | notAfter: '1530'
60 | hands:
61 | count: less-than-11
62 | precision: exact
63 | license:
64 | name: CC-BY 4.0
65 | url: https://creativecommons.org/licenses/by/4.0/
66 | format: Page-XML
67 | volume:
68 | - metric: lines
69 | count: 2272
70 | - metric: characters
71 | count: 116894
72 | - metric: files
73 | count: 120
--------------------------------------------------------------------------------
/catalog/bsc-cssh/AMSMB-HTR.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: AMSMB HTR
3 | url: https://dataverse.bsc.es/citation?persistentId=perma:BSC/0VB0MC
4 | authors:
5 | - name: Mariona
6 | surname: Coll Ardanuy
7 | orcid: 0000-0001-8455-7196
8 | roles:
9 | - project-manager
10 | - quality-control
11 | - support
12 | - name: Ramon
13 | surname: Sarobe
14 | orcid: 0000-0003-2099-3567
15 | roles:
16 | - transcriber
17 | - aligner
18 | - name: Coral
19 | surname: Cuadrada
20 | orcid: 0000-0003-4577-2381
21 | roles:
22 | - project-manager
23 | - digitization
24 | institutions:
25 | - name: Barcelona Supercomputing Center
26 | - name: Arxiu dels Marquesos de Santa Maria de Barberà
27 | - name: Arxiu Municipal de Vilassar de Dalt
28 | description: >-
29 | Dataset for handwritten text recognition on medieval notarial charters written
30 | on parchment (1208-1499). The dataset is comprised of 100 digitized
31 | manuscripts (3,369 lines), carefully selected to represent the large variation
32 | that is present in the sources, encompassing at least 80 distinct hands and
33 | various document types (from sales and inventories to last wills and marriage
34 | contracts). Written primarily in Medieval Latin with fragments in Medieval
35 | Catalan, these manuscripts exhibit varying stages of preservation and degrees
36 | of deterioration.
37 | project-website: https://www.bsc.es/discover-bsc/organisation/scientific-structure/cssh
38 | language:
39 | - lat
40 | - cat
41 | production-software: eScriptorium + Kraken
42 | automatically-aligned: false
43 | script:
44 | - iso: Latn
45 | script-type: only-manuscript
46 | time:
47 | notBefore: '1208'
48 | notAfter: '1499'
49 | hands:
50 | count: more-than-10
51 | precision: exact
52 | license:
53 | name: CC-BY-SA 4.0
54 | url: https://creativecommons.org/licenses/by-sa/4.0/
55 | format: Page-XML
56 | sources:
57 | - reference: >-
58 | Coll Ardanuy, M., Cuadrada, C., & Sarobe, R. (2025). AMSMB HTR: A Dataset
59 | for Handwritten Text Recognition in Medieval Notarial Charters Written on
60 | Parchment (1208-1499) [Dataset]. BSC Dataverse.
61 | link: https://dataverse.bsc.es/dataset.xhtml?persistentId=perma:BSC/0VB0MC
62 | volume:
63 | - metric: lines
64 | count: 3369
65 | - metric: files
66 | count: 100
67 | transcription-guidelines: >-
68 | The transcription follows a semi-diplomatic approach, in which abbreviations
69 | and symbols are expanded. Annotation and transcription decisions are
70 | documented in the datasheet accompanying the original dataset at:
71 | https://dataverse.bsc.es/citation?persistentId=perma:BSC/0VB0MC.
72 |
--------------------------------------------------------------------------------
/catalog/cremma/mss-19.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: CREMMA Manuscrits du 19e
3 | url: https://github.com/HTR-United/CREMMA-MSS-19
4 | project-name: CREMMA
5 | authors:
6 | - name: "Cl\xE9rice"
7 | surname: Thibault
8 | roles:
9 | - project-manager
10 | - quality-control
11 | - name: "Chagu\xE9"
12 | surname: Alix
13 | roles:
14 | - project-manager
15 | - quality-control
16 | - name: Davoury
17 | surname: Baudouin
18 | roles:
19 | - transcriber
20 | - aligner
21 | - name: Doat
22 | surname: Soline
23 | roles:
24 | - transcriber
25 | - aligner
26 | - name: Faure
27 | surname: Margaux
28 | roles:
29 | - transcriber
30 | - aligner
31 | - name: Humeau
32 | surname: Maxime
33 | roles:
34 | - transcriber
35 | - aligner
36 | description: Manuscripts of the 19th century
37 | language:
38 | - fra
39 | script:
40 | - iso: Latn
41 | script-type: only-manuscript
42 | time:
43 | notBefore: '1800'
44 | notAfter: '1899'
45 | hands:
46 | count: 1-per-folder
47 | precision: exact
48 | license:
49 | - name: CC-BY 4.0
50 | url: https://creativecommons.org/licenses/by/4.0/
51 | format: Alto-XML
52 | volume:
53 | - metric: characters
54 | count: 55581
55 | - metric: files
56 | count: 69
57 | - metric: lines
58 | count: 1807
59 | - metric: regions
60 | count: 167
61 | transcription-guidelines: "Abr\xE9viations conserv\xE9es."
62 | production-software: eScriptorium + Kraken
63 | characters:
64 | mode: NFD
65 | members:
66 | - e
67 | - s
68 | - a
69 | - i
70 | - u
71 | - n
72 | - r
73 | - t
74 | - o
75 | - l
76 | - d
77 | - m
78 | - c
79 | - p
80 | - v
81 | - ','
82 | - "\u0301"
83 | - ''''
84 | - q
85 | - f
86 | - .
87 | - g
88 | - b
89 | - h
90 | - "\u0300"
91 | - j
92 | - x
93 | - '-'
94 | - "\u0302"
95 | - L
96 | - C
97 | - M
98 | - y
99 | - J
100 | - z
101 | - A
102 | - D
103 | - P
104 | - '"'
105 | - '>'
106 | - <
107 | - E
108 | - '!'
109 | - N
110 | - S
111 | - Q
112 | - '1'
113 | - ;
114 | - '?'
115 | - ':'
116 | - R
117 | - I
118 | - T
119 | - B
120 | - V
121 | - "\u0153"
122 | - '6'
123 | - O
124 | - (
125 | - _
126 | - )
127 | - '2'
128 | - '3'
129 | - H
130 | - '4'
131 | - ^
132 | - '9'
133 | - '8'
134 | - '7'
135 | - F
136 | - '0'
137 | - G
138 | - '5'
139 | - "\u0327"
140 | - U
141 | - '&'
142 | - '['
143 | - ']'
144 | - "\xB0"
145 | - "\u0308"
146 | - k
147 | - $
148 | - w
149 | - X
150 | - W
151 | - Y
152 | - +
153 | - Z
154 |
--------------------------------------------------------------------------------
/catalog/enc-cours-git/hn-poesie-corse.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: OCR Corse
3 | url: https://github.com/PSL-Chartes-HTR-Students/HN2021-OCR-Poesie-Corse
4 | project-name: ENC - Bonnes pratiques du developpement collaboratif
5 | authors:
6 | - name: Sarbach-Pulicani
7 | surname: Vincent
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - name: "Sa\xEFag"
12 | surname: Violette
13 | - name: Escoda
14 | surname: Adrien
15 | roles:
16 | - transcriber
17 | - name: Miaille
18 | surname: "Th\xE9ophile"
19 | roles:
20 | - transcriber
21 | - project-manager
22 | description: "Le premier ouvrage, intitul\xE9 *Ponten\xF4vu*, a \xE9t\xE9 \xE9crit\
23 | \ par Petru Rocca et publi\xE9 par la \"Stamparia di a Muvra\" en 1927. Il s'agit\
24 | \ d'un recueil de po\xE8mes en corse et en fran\xE7ais dont les th\xE8mes varient.\
25 | \ *A Muvra* est un journal autonomiste corse d'influence maurrassienne qui a exist\xE9\
26 | \ pendant toute la p\xE9riode de l'entre-deux-guerres. Se revendiquant comme \xE9\
27 | tant une revue culturelle, la dimension politique de la revue (incarn\xE9e par le\
28 | \ PCA, ou Partitu corsu d'azione), en a fait un mouvement controvers\xE9. C'est\
29 | \ dans ce contexte de lutte politique et d'\xE9veil culturel corse que s'inscrit\
30 | \ ce recueil.\nLe second ouvrage s'intitule *A nostra Santa Fede - Catechismu Corsu*,\
31 | \ \xE9crit par Ageniu Grimaldi en 1926 sous le pseudonyme de Saveriu Malaspina.\
32 | \ Proche de Petru Rocca, ce-dernier est l'un des th\xE9oriciens de l'autonomisme\
33 | \ corse de l'entre-deux-guerres et fid\xE8le muvriste. Dans l'ouvrage, il est fait\
34 | \ mention notamment de la fa\xE7on dont un vrai corse doit se comporter vis-\xE0\
35 | -vis de sa foi envers Dieu et son \xEEle. Bien qu'il ne s'agisse pas r\xE9ellement\
36 | \ d'un recueil de po\xE8mes, le style d'\xE9criture de cet ouvrage est particuli\xE8\
37 | rement int\xE9ressant. Il reprend un style qui se rapproche des \xE9crits bibliques.\n"
38 | language:
39 | - cos
40 | - fra
41 | script:
42 | - iso: Latn
43 | script-type: only-typed
44 | time:
45 | notBefore: '1926'
46 | notAfter: '1927'
47 | hands:
48 | count: 1-per-folder
49 | precision: exact
50 | license:
51 | - name: CC-BY 4.0
52 | url: https://creativecommons.org/licenses/by/4.0/
53 | format: Alto-XML
54 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/HN2021-OCR-Poesie-Corse/main/CITATION.CFF
55 | transcription-guidelines: ''
56 | volume:
57 | - metric: characters
58 | count: 40957
59 | - metric: files
60 | count: 47
61 | - metric: lines
62 | count: 1664
63 | - metric: regions
64 | count: 146
65 | production-software: "eScriptorium + Kraken"
66 |
--------------------------------------------------------------------------------
/catalog/cremma/mss-18.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Chagué
3 | roles:
4 | - project-manager
5 | - quality-control
6 | surname: Alix
7 | - name: Clérice
8 | roles:
9 | - project-manager
10 | - quality-control
11 | surname: Thibault
12 | - name: Norindr
13 | roles:
14 | - transcriber
15 | surname: Jade
16 | - name: Norindr
17 | roles:
18 | - transcriber
19 | surname: Jade
20 | - name: Van Kote
21 | roles:
22 | - transcriber
23 | - aligner
24 | surname: Elsa
25 | - name: Faure
26 | roles:
27 | - transcriber
28 | - aligner
29 | surname: Margaux
30 | characters:
31 | members:
32 | - e
33 | - s
34 | - a
35 | - r
36 | - t
37 | - n
38 | - u
39 | - i
40 | - o
41 | - l
42 | - d
43 | - p
44 | - c
45 | - m
46 | - v
47 | - .
48 | - q
49 | - f
50 | - ́
51 | - "'"
52 | - ','
53 | - g
54 | - b
55 | - h
56 | - y
57 | - x
58 | - j
59 | - L
60 | - C
61 | - ̀
62 | - ^
63 | - '1'
64 | - M
65 | - S
66 | - ̂
67 | - z
68 | - E
69 | - R
70 | - ;
71 | - '2'
72 | - I
73 | - '6'
74 | - '0'
75 | - '>'
76 | - <
77 | - D
78 | - V
79 | - J
80 | - '4'
81 | - '3'
82 | - (
83 | - )
84 | - P
85 | - ̈
86 | - '5'
87 | - ̃
88 | - '-'
89 | - '7'
90 | - B
91 | - '8'
92 | - A
93 | - '['
94 | - ']'
95 | - '9'
96 | - N
97 | - F
98 | - G
99 | - T
100 | - '?'
101 | - X
102 | - ̧
103 | - /
104 | - ':'
105 | - O
106 | - H
107 | - ’
108 | - ¬
109 | - +
110 | -
111 | - œ
112 | - U
113 | - '&'
114 | - «
115 | - Q
116 | - '='
117 | - K
118 | - '!'
119 | - k
120 | - W
121 | - Z
122 | - w
123 | - °
124 | - ⁊
125 | - ꝑ
126 | - ſ
127 | - ‸
128 | - '#'
129 | - ̶
130 | - _
131 | - Y
132 | - ̄
133 | - »
134 | - ͦ
135 | mode: NFD
136 | description: Manuscripts of the 18th century
137 | format: Alto-XML
138 | hands:
139 | count: 1-per-folder
140 | precision: exact
141 | language:
142 | - fra
143 | license:
144 | - name: CC-BY 4.0
145 | url: https://creativecommons.org/licenses/by/4.0/
146 | production-software: eScriptorium + Kraken
147 | project-name: CREMMA
148 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
149 | script:
150 | - iso: Latn
151 | script-type: only-manuscript
152 | time:
153 | notAfter: '1799'
154 | notBefore: '1700'
155 | title: CREMMA Manuscrits du 18e
156 | transcription-guidelines: Abréviations conservées.
157 | url: https://github.com/HTR-United/CREMMA-MSS-18
158 | volume:
159 | - count: 141690
160 | metric: characters
161 | - count: 125
162 | metric: files
163 | - count: 4019
164 | metric: lines
165 | - count: 329
166 | metric: regions
167 |
--------------------------------------------------------------------------------
/catalog/rasam-1/rasam.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: RASAM 1
3 | url: https://github.com/calfa-co/rasam-dataset
4 | project-website: https://calfa.fr/blog/26
5 | authors:
6 | - name: "Vidal-Gorène"
7 | surname: Chahan
8 | orcid: 0000-0003-1567-6508
9 | roles:
10 | - project-manager
11 | - name: Lucas
12 | surname: "Noémie"
13 | orcid: 0000-0003-2236-6778
14 | roles:
15 | - project-manager
16 | - quality-control
17 | - name: Salah
18 | surname: "Clément"
19 | orcid: 0000-0002-7846-4054
20 | roles:
21 | - transcriber
22 | - quality-control
23 | - name: Decours-Perez
24 | surname: "Aliénor"
25 | roles:
26 | - support
27 | - name: Dupin
28 | surname: Boris
29 | roles:
30 | - support
31 | institutions:
32 | - name: BULAC
33 | - name: Calfa
34 | - name: DISTAM
35 | - name: GIS MOMM
36 | description: 'The Dataset is made up of 300 images, with their related ground truth
37 | stored in an XML file (pageXML format). Images come from three manuscripts selected
38 | among the collections of the BULAC Library (Paris). It covers a representative part
39 | of the handwritten production in Arabic Maghrebi scripts and includes an annotation
40 | of the layout (TextRegions, baselines and polygons) and the transcription of the
41 | main text. This dataset is the result of a collaborative transcription. All the
42 | participants are credited on the official deposit. With the support of the French
43 | Ministry of Higher Education, Research and Innovation, the Research Consortium Middle-East
44 | and Muslim Worlds (GIS MOMM), Calfa and the BULAC library.'
45 | language:
46 | - ara
47 | script:
48 | - iso: Arab
49 | script-type: only-manuscript
50 | time:
51 | notBefore: '1700'
52 | notAfter: '1899'
53 | hands:
54 | count: less-than-11
55 | precision: exact
56 | license:
57 | - name: Apache-2.0 License
58 | url: https://www.apache.org/licenses/LICENSE-2.0
59 | format: Page-XML
60 | volume:
61 | - metric: pages
62 | count: 300
63 | - count: 7540
64 | metric: lines
65 | - count: 300
66 | metric: files
67 | - count: 676
68 | metric: regions
69 | - count: 403034
70 | metric: characters
71 | sources:
72 | - reference: "Vidal-Gor\xE8ne, C., Lucas, N., Salah, C., Decours-Perez, A., & Dupin,\
73 | \ B. (2021, September). RASAM\u2013A Dataset for the Recognition and Analysis\
74 | \ of Scripts in Arabic Maghrebi. In International Conference on Document Analysis\
75 | \ and Recognition (pp. 265-281). Springer, Cham"
76 | link: https://link.springer.com/chapter/10.1007/978-3-030-86198-8_19
77 | transcription-guidelines: 'Full description of specifications for transcription available
78 | on Github and in the paper.
79 |
80 | '
81 | production-software: "Calfa Vision"
82 |
--------------------------------------------------------------------------------
/catalog/sloane_lab/sloane_lab_htr_model.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: The Sloane Lab HTR Model
3 | url: https://github.com/sloanelab-org/HTR-Model
4 | authors:
5 | - name: Marco
6 | surname: Humbel
7 | orcid: 0000-0003-1861-162X
8 | roles:
9 | - aligner
10 | - name: 'Andreas '
11 | surname: Vlachidis
12 | roles:
13 | - project-manager
14 | - name: 'Julianne '
15 | surname: Nyhan
16 | roles:
17 | - project-manager
18 | - name: 'The British Museum '
19 | surname: ''
20 | roles:
21 | - digitization
22 | institutions:
23 | - name: AEL Data Service
24 | roles:
25 | - transcriber
26 | description: >
27 | This repository contains Handwritten Text Recognition training data (layout
28 | segmentation and transcriptions) for the Sloane Lab HTR model. The HTR model
29 | is trained on the handwriting of Hans Sloane (1660-1753).
30 |
31 |
32 | Funding:
33 |
34 | Enlightenment Architectures: Leverhulme Trust Project Grant 2016-21
35 |
36 | The Sloane Lab: Towards a National Collection – AHRC AH/W003457/1
37 | project-name: 'The Sloane Lab: Looking back to build future shared collections'
38 | project-website: https://sloanelab.org/
39 | language:
40 | - eng
41 | production-software: Transkribus
42 | automatically-aligned: false
43 | script:
44 | - iso: Latn
45 | script-type: only-manuscript
46 | time:
47 | notBefore: '1680'
48 | notAfter: '1750'
49 | hands:
50 | count: less-than-11
51 | precision: estimated
52 | license:
53 | name: CC BY-NC-SA 4.0
54 | url: https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en
55 | format: Alto-XML
56 | sources:
57 | - reference: >-
58 | Sloan, K., Ortolja-Baird, A., Nyhan, J., Pickering, V., & Fleming, M.
59 | (Eds.). (2019). Sir Hans Sloane’s Miscellanea which comprises his
60 | catalogues of Miscellanies, Antiquities, Seals, Pictures, Mathematical
61 | Instruments, Agate Handles and Agate Cups, Bottles, Spoons (Digital
62 | Edition).
63 | link: >-
64 | https://enlightenmentarchitectures.reconstructingsloane.org/cataloguemiscellanies/index.html
65 | volume:
66 | - metric: pages
67 | count: 196
68 | citation-file-link: https://github.com/sloanelab-org/HTR-Model/blob/main/Citation_SL_HTR_Model.cff
69 | transcription-guidelines: >-
70 | Transcription rules can be found alongside the dataset. They include the
71 | following rules:
72 |
73 | - Exclusion of overwritten text from training data
74 |
75 | - Exclusion of text not identified by the automated layout recognition
76 |
77 | - Exclusion of faded text
78 |
79 | - Inserted words are treated as separate text lines
80 |
81 | - Exclusion of textual features such as dotted lines
82 |
83 | - Base line separation for text written apart
84 |
--------------------------------------------------------------------------------
/catalog/slub-dresden/mscr-dresd-k-117.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: >-
3 | Ground Truth Set for Handwritten Text Recognition (HTR/OCR): Dresdner
4 | Hofdiarium 1673 (Mscr.Dresd.K.117) - 17th century Kurrent manuscript
5 | url: https://doi.org/10.5281/zenodo.15303243
6 | authors:
7 | - name: Stefan
8 | surname: Beckert
9 | orcid: 0009-0005-2394-0075
10 | roles:
11 | - transcriber
12 | - aligner
13 | - project-manager
14 | - quality-control
15 | institutions: []
16 | description: >+
17 | Twenty pages of Ground Truth from the "Hofdiarium des Kurfürsten Johann Georgs
18 | II. 1673" (SLUB Mscr.Dresd.K.117; https://www.wikidata.org/wiki/Q134220291).
19 | The handwriting is a typical late 17th century Saxon kurrent
20 | ("Kanzleikurrent"), with occasional words written in bastarda or fraktur-like
21 | script.
22 |
23 |
24 | This transcription is part of a larger project regarding the Dresden court
25 | diaries. Check https://slub-dresden.academia.edu/StefanBeckert for further
26 | updates.
27 | language:
28 | - deu
29 | production-software: eScriptorium + Kraken
30 | automatically-aligned: false
31 | script:
32 | - iso: Latn
33 | script-type: only-manuscript
34 | time:
35 | notBefore: '1673'
36 | notAfter: '1673'
37 | hands:
38 | count: '1'
39 | precision: exact
40 | license:
41 | name: CC-BY 4.0
42 | url: https://creativecommons.org/licenses/by/4.0/
43 | format: Alto-XML
44 | volume:
45 | - metric: pages
46 | count: 20
47 | transcription-guidelines: >-
48 | Transcription guidelines are based on the DTABF-M schema
49 | (https://www.deutschestextarchiv.de/doku/basisformat/manuskript.html), but
50 | have been adapted as follows:
51 |
52 |
53 | - I and J majuscules are not distinguished
54 |
55 | - u and v are reproduced true to the original (e.g. vnd)
56 |
57 | - Long-s (ſ) and round-s (s) are distinguished
58 |
59 | - sz ligature is rendered as ß in Kurrent scripts and as sz (e.g. "Libusza")
60 | in Antiqua scripts
61 |
62 | - ij ligature is rendered as y
63 |
64 | - other ligatures, if they occur at all, are dissolved
65 |
66 | - r graphemes are rendered as r in their modern day form
67 |
68 | - an m with a nasal stroke was rendered as a simple mm
69 |
70 | - Where possible, abbreviation signs (Abbrechungszeichen) for the contemporary
71 | identification of abbreviations have been included as single letters and not
72 | marked separately. The subsequent punctuation mark (“.” or “:”) for further
73 | identification of the abbreviation has also been included (cf. also Cappelli,
74 | 1928, Lexicon abbreviaturarum I, p.X)
75 |
76 | - Diacritics in u were not marked
77 |
78 | - In the case of uncertain capitalization, an approximation is sought via the
79 | letter size
80 |
--------------------------------------------------------------------------------
/catalog/bullinger-htr-dataset/bullinger-htr-dataset.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: Bullinger HTR Dataset
3 | url: https://github.com/pstroe/bullinger-htr
4 | authors:
5 | - name: Phillip Benjamin
6 | surname: Ströbel
7 | orcid: 0000-0003-2063-5495
8 | roles:
9 | - aligner
10 | - support
11 | - name: Tobias
12 | surname: Hodel
13 | orcid: 0000-0002-2071-6407
14 | roles:
15 | - aligner
16 | - project-manager
17 | - name: Christian
18 | surname: Sieber
19 | orcid: 0000-0002-9364-6921
20 | roles:
21 | - digitization
22 | - name: Patricia
23 | surname: Scheurer
24 | roles:
25 | - quality-control
26 | - support
27 | - name: David Selim
28 | surname: Schoch
29 | orcid: 0000-0002-9936-8459
30 | roles:
31 | - aligner
32 | - name: Anna
33 | surname: Janka
34 | roles:
35 | - aligner
36 | - name: Raphael
37 | surname: Schwitter
38 | roles:
39 | - aligner
40 | - name: Beat
41 | surname: Wolf
42 | roles:
43 | - aligner
44 | - name: Jonas
45 | surname: Widmer
46 | roles:
47 | - aligner
48 | - name: Peter
49 | surname: Rechsteiner
50 | roles:
51 | - quality-control
52 | - support
53 | - name: Raphael
54 | surname: Müller
55 | roles:
56 | - quality-control
57 | - digitization
58 | - support
59 | institutions: []
60 | description: >-
61 | This dataset contains 165,673 image and corresponding text line files (.png
62 | for images and .txt for the texts) in a random 80/10/10 training, validation
63 | and test set split. The source is the extensive correspondence of Swiss
64 | reformer Heinrich Bullinger (1504-1575) and his over 800 different
65 | correspondents. It therefore contains great variety in handwriting styles.
66 | Furthermore, it is multilingual since there are Latin and Early New High
67 | German (and sometimes mixed) letters. The data is split into Latin and Early
68 | New High German (determined with langid) and put into separate folders (de for
69 | Early New High German and la for Latin).
70 | project-website: https://www.bullinger-digital.ch/
71 | language:
72 | - lat
73 | - deu
74 | production-software: Transkribus, own
75 | script:
76 | - iso: Latn
77 | script-type: only-manuscript
78 | time:
79 | notBefore: '1523'
80 | notAfter: '1575'
81 | hands:
82 | count: more-than-10
83 | precision: estimated
84 | license:
85 | name: CC-BY-SA 4.0
86 | url: https://creativecommons.org/licenses/by-sa/4.0/
87 | format: Image-Text-Pairs
88 | volume:
89 | - metric: lines
90 | count: 165673
91 | automatically-aligned: true
92 | transcription-guidelines: Automated transcript alignment with Transkribus
93 |
--------------------------------------------------------------------------------
/catalog/slub-dresden/mscr-dresd-k-113.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: >-
3 | Ground Truth Set for Handwritten Text Recognition (HTR/OCR): Dresdner
4 | Hofdiarium 1653-56 (Mscr.Dresd.K.113) - 17th century Kurrent manuscript
5 | url: https://doi.org/10.5281/zenodo.15303398
6 | authors:
7 | - name: Stefan
8 | surname: Beckert
9 | orcid: 0009-0005-2394-0075
10 | roles:
11 | - transcriber
12 | - aligner
13 | - project-manager
14 | - quality-control
15 | institutions: []
16 | description: >-
17 | Twelve pages of Ground Truth from the "Hofdiarium des Kurfürsten Johann Georgs
18 | II. 1653-1656" (SLUB Mscr.Dresd.K113;
19 | https://www.wikidata.org/wiki/Q133883726). The handwriting is a typical late
20 | 17th century Saxon kurrent ("Kanzleikurrent"), with occasional words written
21 | in bastarda or fraktur-like script.
22 |
23 |
24 | This transcription is part of a larger project regarding the Dresden court
25 | diaries. Check https://slub-dresden.academia.edu/StefanBeckert for further
26 | updates.
27 | language:
28 | - deu
29 | production-software: eScriptorium + Kraken
30 | automatically-aligned: false
31 | script:
32 | - iso: Latn
33 | script-type: only-manuscript
34 | time:
35 | notBefore: '1653'
36 | notAfter: '1656'
37 | hands:
38 | count: '1'
39 | precision: exact
40 | license:
41 | name: CC-BY 4.0
42 | url: https://creativecommons.org/licenses/by/4.0/
43 | format: Alto-XML
44 | volume:
45 | - metric: pages
46 | count: 12
47 | transcription-guidelines: >-
48 | Transcription guidelines are oriented on the DTABF-M schema
49 | (https://www.deutschestextarchiv.de/doku/basisformat/manuskript.html), but
50 | have been adapted as follows:
51 |
52 |
53 | - I and J majuscules are not distinguished
54 |
55 | - u and v are reproduced true to the original (e.g. vnd)
56 |
57 | - Long-s (ſ) and round-s (s) are distinguished
58 |
59 | - sz ligature is rendered as ß in Kurrent scripts and as sz (e.g. "Libusza")
60 | in Antiqua scripts
61 |
62 | - ij ligature is rendered as y
63 |
64 | - other ligatures, if they occur at all, are dissolved
65 |
66 | - r graphemes are rendered as r in their modern day form
67 |
68 | - an m with a nasal stroke was rendered as a simple mm
69 |
70 | - Where possible, abbreviation signs (Abbrechungszeichen) for the contemporary
71 | identification of abbreviations have been included as single letters and not
72 | marked separately. The subsequent punctuation mark (“.” or “:”) for further
73 | identification of the abbreviation has also been included (cf. also Cappelli,
74 | 1928, Lexicon abbreviaturarum I, p.X)
75 |
76 | - Diacritics in u were not marked
77 |
78 | - In the case of uncertain capitalization, an approximation is sought via the
79 | letter size
80 |
--------------------------------------------------------------------------------
/catalog/gallicorpora/mss-15.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Gabay
3 | roles:
4 | - project-manager
5 | surname: Simon
6 | - name: Pinche
7 | roles:
8 | - project-manager
9 | surname: Ariane
10 | - name: Leroy
11 | roles:
12 | - transcriber
13 | surname: Noé
14 | - name: Christensen
15 | roles:
16 | - support
17 | surname: Kelly
18 | characters:
19 | members:
20 | - e
21 | - i
22 | - s
23 | - t
24 | - u
25 | - n
26 | - a
27 | - r
28 | - o
29 | - l
30 | - d
31 | - c
32 | - m
33 | - p
34 | - q
35 | - f
36 | - g
37 | - .
38 | - ̃
39 | - h
40 | - b
41 | - z
42 | - y
43 | - I
44 | - x
45 | - ⁊
46 | - ','
47 | - R
48 | - E
49 | - C
50 | - ̾
51 | - Q
52 | - L
53 | - S
54 | - A
55 | - D
56 | - M
57 | - ͣ
58 | - ꝑ
59 | - ͥ
60 | - P
61 | - ꝯ
62 | - T
63 | - N
64 | - ¶
65 | - O
66 | - B
67 | - ͤ
68 | - U
69 | - '-'
70 | - '1'
71 | - ꝰ
72 | - ᷑
73 | - ̽
74 | - '2'
75 | - '3'
76 | - ẜ
77 | - F
78 | - ⟦
79 | - ⟧
80 | - '6'
81 | - ħ
82 | - ꝓ
83 | - '7'
84 | - '4'
85 | - ͨ
86 | - '9'
87 | - '8'
88 | - ;
89 | - G
90 | - '0'
91 | - ͦ
92 | - '5'
93 | - H
94 | - "'"
95 | - ̀
96 | - ł
97 | - đ
98 | - ́
99 | - ͫ
100 | - ‸
101 | - '&'
102 | - k
103 | - °
104 | - ẞ
105 | - ͬ
106 | - ᷤ
107 | - K
108 | - '['
109 | - ']'
110 | - ͯ
111 | - ̧
112 | - (
113 | - )
114 | - Y
115 | - Z
116 | - ':'
117 | - ͧ
118 | - ᷠ
119 | - X
120 | mode: NFD
121 | citation-file-link: https://github.com/Gallicorpora/HTR-MSS-15e-Siecle/CITATION.cff
122 | description: Corpus d'entrainement pour l'HTR composé de manuscrits français du 15e
123 | s.
124 | format: Alto-XML
125 | hands:
126 | count: 1-per-folder
127 | precision: estimated
128 | language:
129 | - frm
130 | - fra
131 | license:
132 | - name: CC-BY 4.0
133 | url: https://creativecommons.org/licenses/by/4.0/
134 | production-software: eScriptorium + Kraken
135 | project-name: Gallicorpora
136 | project-website: https://github.com/Gallicorpora
137 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
138 | script:
139 | - iso: Latn
140 | script-type: only-manuscript
141 | time:
142 | notAfter: '1500'
143 | notBefore: '1400'
144 | title: Données HTR manuscrits du 15e siècle
145 | transcription-guidelines: 'Les normes de transcription suivent les préconisations
146 | du projet CREMMALAB : https://cremmalab.hypotheses.org'
147 | url: https://github.com/Gallicorpora/HTR-MSS-15e-Siecle
148 | volume:
149 | - count: 169207
150 | metric: characters
151 | - count: 85
152 | metric: files
153 | - count: 5937
154 | metric: lines
155 | - count: 458
156 | metric: regions
157 |
--------------------------------------------------------------------------------
/catalog/chi-know-po/chi-know-po.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: CHI-KNOW-PO CORPUS
3 | url: https://github.com/calfa-co/chi-know-po
4 | authors:
5 | - name: Marie
6 | surname: Bizais-Lillig
7 | orcid: 0000-0002-2426-2641
8 | roles:
9 | - project-manager
10 | - quality-control
11 | - name: Hu
12 | surname: Xinmin
13 | roles:
14 | - transcriber
15 | - name: LIAO
16 | surname: Shueh-Ying
17 | roles:
18 | - transcriber
19 | - name: Cuillé
20 | surname: Elsa
21 | orcid: 0000-0002-6060-0724
22 | roles:
23 | - transcriber
24 | - name: Tanelian
25 | surname: Ani
26 | roles:
27 | - quality-control
28 | - support
29 | - name: Kasparian
30 | surname: Anahide
31 | roles:
32 | - quality-control
33 | - support
34 | - name: Vidal-Gorène
35 | surname: Chahan
36 | orcid: 0000-0003-1567-6508
37 | roles:
38 | - quality-control
39 | - support
40 | - name: Dupin
41 | surname: Boris
42 | roles:
43 | - support
44 | institutions:
45 | - name: Université de Strasbourg, GÉO (UR1340)
46 | - name: CNRS, UAR2999, Distam
47 | - name: Calfa
48 | description: >-
49 | HTR ground-truth of the CHI-KNOW-PO project (Collex-Persée), that aimed to digitize a corpus of belletristic anthologies, scholarly collections, dictionaries and encyclopedias from the Chinese medieval period (ca. 200-1000) and to process them using HTR.
50 | project-website: https://chi-know-po.gitpages.huma-num.fr
51 | language:
52 | - lzh
53 | automatically-aligned: false
54 | script:
55 | - iso: Hant
56 | script-type: only-manuscript
57 | time:
58 | notBefore: '1604'
59 | notAfter: '1921'
60 | hands:
61 | count: 1-per-folder
62 | precision: exact
63 | license:
64 | name: CC-BY 4.0
65 | url: https://creativecommons.org/licenses/by/4.0/
66 | format: Page-XML
67 | volume:
68 | - metric: lines
69 | count: 1248
70 | - metric: characters
71 | count: 104536
72 | - metric: files
73 | count: 327
74 | sources:
75 | - reference: "Bizais-Lillig, M., Vidal-Gorène, C., & Dupin, B. (2024, August).
76 | Optimizing HTR and Reading Order Strategies for Chinese Imperial Editions with
77 | Few-Shot Learning. In International Conference on Document Analysis and
78 | Recognition (pp. 37-56). Cham: Springer Nature Switzerland."
79 | link: https://link.springer.com/chapter/10.1007/978-3-031-70642-4_3
80 | transcription-guidelines: 'Regions and baselines are distinguished by types, mainly to differentiate between main text and commentaries. Diplomatic transcription with the following exceptions: characters are transcribed in their standard form based on the online dictionary of variants (https://dict.variants.moe.edu.tw/).'
81 | production-software: "Calfa Vision"
--------------------------------------------------------------------------------
/catalog/cremma/mss-17.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: CREMMA Manuscrits du 17e
3 | url: https://github.com/HTR-United/CREMMA-MSS-17
4 | project-name: CREMMA
5 | authors:
6 | - name: "Cl\xE9rice"
7 | surname: Thibault
8 | roles:
9 | - project-manager
10 | - quality-control
11 | - name: "Chagu\xE9"
12 | surname: Alix
13 | roles:
14 | - project-manager
15 | - quality-control
16 | - name: Faure
17 | surname: Margaux
18 | roles:
19 | - transcriber
20 | - name: Norindr
21 | surname: Jade
22 | roles:
23 | - transcriber
24 | - name: Mazoue
25 | surname: Anais
26 | roles:
27 | - transcriber
28 | - name: Davoury
29 | surname: Baudoin
30 | roles:
31 | - transcriber
32 | description: Various Manuscripts of the 17th century
33 | language:
34 | - fra
35 | script:
36 | - iso: Latn
37 | script-type: only-manuscript
38 | time:
39 | notBefore: '1600'
40 | notAfter: '1699'
41 | hands:
42 | count: 1-per-folder
43 | precision: exact
44 | license:
45 | - name: CC-BY 4.0
46 | url: https://creativecommons.org/licenses/by/4.0/
47 | format: Alto-XML
48 | volume:
49 | - metric: characters
50 | count: 81909
51 | - metric: files
52 | count: 111
53 | - metric: lines
54 | count: 2245
55 | - metric: regions
56 | count: 264
57 | transcription-guidelines: "Abr\xE9viations conserv\xE9es."
58 | production-software: eScriptorium + Kraken
59 | characters:
60 | mode: NFD
61 | members:
62 | - e
63 | - s
64 | - r
65 | - a
66 | - n
67 | - u
68 | - i
69 | - o
70 | - t
71 | - l
72 | - d
73 | - c
74 | - m
75 | - p
76 | - v
77 | - q
78 | - .
79 | - ','
80 | - y
81 | - ''''
82 | - f
83 | - b
84 | - g
85 | - "\u0301"
86 | - h
87 | - j
88 | - "\u0303"
89 | - M
90 | - x
91 | - R
92 | - z
93 | - C
94 | - '1'
95 | - J
96 | - ^
97 | - "\u0300"
98 | - P
99 | - L
100 | - S
101 | - V
102 | - '&'
103 | - A
104 | - E
105 | - '>'
106 | - I
107 | - <
108 | - '2'
109 | - X
110 | - '3'
111 | - T
112 | - '7'
113 | - D
114 | - '6'
115 | - ']'
116 | - B
117 | - '4'
118 | - '['
119 | - '0'
120 | - '?'
121 | - '-'
122 | - "\u0302"
123 | - "\u0308"
124 | - '9'
125 | - '5'
126 | - ;
127 | - G
128 | - N
129 | - '8'
130 | - ':'
131 | - F
132 | - "\u0327"
133 | - )
134 | - (
135 | - Q
136 | - O
137 | - H
138 | - W
139 | - "\u0153"
140 | - "\u2038"
141 | - "\u204A"
142 | - U
143 | - "\u0304"
144 | - /
145 | - "\uA757"
146 | - +
147 | - k
148 | - "\xB0"
149 | - "\_"
150 | - w
151 | - "\u05DD"
152 | - Z
153 | - "\u03C2"
154 | - '#'
155 | - "\xE6"
156 | - "\uA759"
157 | - "\u0363"
158 | - "\u03B5"
159 | - "\u03D5"
160 |
--------------------------------------------------------------------------------
/catalog/almanach/lectaurep-notaires.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Durand
3 | roles:
4 | - transcriber
5 | - aligner
6 | surname: Marc
7 | - name: Rostaing
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - quality-control
12 | surname: Aurélia
13 | - name: Chagué
14 | roles:
15 | - project-manager
16 | - quality-control
17 | - support
18 | surname: Alix
19 | characters:
20 | members:
21 | - e
22 | - r
23 | - a
24 | - i
25 | - n
26 | - t
27 | - o
28 | - u
29 | - s
30 | - d
31 | - l
32 | - c
33 | - p
34 | - '1'
35 | - m
36 | - S
37 | - ̀
38 | - ','
39 | - E
40 | - ́
41 | - '2'
42 | - P
43 | - .
44 | - M
45 | - '0'
46 | - A
47 | - C
48 | - '5'
49 | - '3'
50 | - h
51 | - T
52 | - v
53 | - g
54 | - D
55 | - '7'
56 | - )
57 | - (
58 | - R
59 | - N
60 | - f
61 | - I
62 | - b
63 | - L
64 | - '8'
65 | - '9'
66 | - ^
67 | - '4'
68 | - '6'
69 | - B
70 | - O
71 | - J
72 | - V
73 | - y
74 | - "'"
75 | - G
76 | - F
77 | - '-'
78 | - x
79 | - q
80 | - °
81 | - H
82 | - ̂
83 | - U
84 | - '"'
85 | - X
86 | - '&'
87 | - z
88 | - ;
89 | - ̧
90 | - ':'
91 | - j
92 | - +
93 | - Q
94 | - '|'
95 | - ̈
96 | - /
97 | - k
98 | - '='
99 | - '%'
100 | - W
101 | - K
102 | - Y
103 | - Z
104 | - w
105 | - '~'
106 | - ¥
107 | - ȼ
108 | - _
109 | - €
110 | - '`'
111 | - '['
112 | - ']'
113 | - œ
114 | - '?'
115 | - '*'
116 | - ̃
117 | - '>'
118 | - ½
119 | mode: NFD
120 | citation-file-link: https://github.com/HTR-United/lectaurep-repertoires/raw/main/CITATION.cff
121 | description: Ground truth for various Parisian registries of notary deeds written
122 | in French during the 19th century. The information is organized following pre-printed
123 | tables (with printed headers) and contain many names, addresses, numbers and abbreviations.
124 | format: Alto-XML
125 | hands:
126 | count: more-than-10
127 | precision: estimated
128 | language:
129 | - fra
130 | license:
131 | - name: CC-BY 4.0
132 | url: https://creativecommons.org/licenses/by/4.0/
133 | production-software: eScriptorium + Kraken
134 | project-name: LECTAUREP
135 | project-website: https://lectaurep.hypotheses.org/
136 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
137 | script:
138 | - iso: Latn
139 | script-type: mainly-manuscript
140 | time:
141 | notAfter: '1939'
142 | notBefore: '1830'
143 | title: Notaires de Paris - Répertoires
144 | url: https://github.com/HTR-United/lectaurep-repertoires
145 | volume:
146 | - count: 525786
147 | metric: characters
148 | - count: 218
149 | metric: files
150 | - count: 29410
151 | metric: lines
152 | - count: 1181
153 | metric: regions
154 |
--------------------------------------------------------------------------------
/catalog/htr-school-vienna/paderov-bible-handwriting-ground-truth.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Padeřov-Bible-handwriting-ground-truth
3 | url: https://zenodo.org/record/7467034#.Y6LQZBWZM2w
4 | authors:
5 | - name: Anna
6 | surname: Michalcová
7 | orcid: 0000-0003-4760-6950
8 | roles:
9 | - transcriber
10 | - aligner
11 | - project-manager
12 | - quality-control
13 | - support
14 | - name: Jan
15 | surname: Odstrčilík
16 | orcid: 0000-0001-9104-9827
17 | roles:
18 | - project-manager
19 | - support
20 | - name: Laura
21 | surname: Maniaková
22 | roles:
23 | - transcriber
24 | - name: Eliška
25 | surname: Pěnkavová
26 | orcid: 0000-0002-5494-8847
27 | - name: Kamil
28 | surname: Bazelides
29 | orcid: 0000-0002-5199-8726
30 | - name: Jan
31 | surname: Hajič
32 | orcid: 0000-0002-9207-567X
33 | - name: Hana
34 | surname: Kreisingerová
35 | orcid: 0000-0002-2924-598X
36 | - name: Jitka
37 | surname: Filipová
38 | orcid: 0000-0002-3570-4038
39 | - name: Chi-hung
40 | surname: Liu
41 | - name: Martina
42 | surname: Dvořáková
43 | institutions:
44 | - name: Institute of the Czech Language
45 | - name: Masaryk Institute and Archives
46 | description: >-
47 | This is ground truth based on the Padeřov Bible (Vienna, Austrian National
48 | Library, shelfmark Cod. 1175, 1432–1435), the bible of the third redaction of
49 | the Old Czech Bible translation. The transcription rules were based on
50 | semi-diplomatic transcription rules set by PERO OCR and Směrnice pro vydávání
51 | starších českých textů set by Jiří Daňhelka
52 | (https://vokabular.ujc.cas.cz/moduly/edicnipoznamka.aspx?id=DanhelkaSmernice).
53 | Abbreviations were tagged and expanded.
54 | project-name: HTR Winter School 2022, Vienna
55 | project-website: >-
56 | https://www.oeaw.ac.at/imafo/veranstaltungen/detail/introduction-into-handwritten-text-recognition-1
57 | language:
58 | - ces
59 | production-software: Transkribus
60 | script:
61 | - iso: Latn
62 | script-type: only-manuscript
63 | time:
64 | notBefore: '1432'
65 | notAfter: '1435'
66 | hands:
67 | count: '1'
68 | precision: exact
69 | license:
70 | - name: CC-BY 4.0
71 | url: https://creativecommons.org/licenses/by/4.0/
72 | format: Page-XML
73 | sources:
74 | - reference: ''
75 | link: >-
76 | https://search.onb.ac.at/primo-explore/fulldisplay?docid=ONB_alma21302405460003338&context=L&adaptor=Local%20Search%20Engine&vid=ONB&lang=de_DE&search_scope=ONB_gesamtbestand&tab=default_tab&query=addsrcrid,exact,AC13954505
77 | volume:
78 | - metric: pages
79 | count: 63
80 | transcription-guidelines: >-
81 | Transliteration. Differentiates long and short "s". Abbreviations tagged and
82 | expanded. No misspelling corrections.
83 |
--------------------------------------------------------------------------------
/catalog/rescribe/caroline-minuscule.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Caroline Minuscule by Rescribe
3 | url: https://github.com/rescribe/carolineminuscule-groundtruth
4 | project-name: Rescribe
5 | project-website: https://rescribe.xyz/
6 | authors:
7 | - name: White
8 | surname: Nick
9 | roles:
10 | - transcriber
11 | - project-manager
12 | - name: Karaisl
13 | surname: Antonia
14 | roles:
15 | - transcriber
16 | - project-manager
17 | - name: "Cl\xE9rice"
18 | surname: Thibault
19 | roles:
20 | - aligner
21 | description: 'This ground truth repository is a work in progress; it currently accounts
22 | for a part of our complete Caroline Minuscule training pool of around 70 manuscripts
23 | used for our OCRopus Caroline Minuscule model (see ocropus-models repository).
24 |
25 | '
26 | language:
27 | - lat
28 | script:
29 | - iso: Latn
30 | script-type: only-manuscript
31 | time:
32 | notBefore: '800'
33 | notAfter: '1199'
34 | hands:
35 | count: 1-per-file
36 | precision: exact
37 | license:
38 | - name: CC-BY 4.0
39 | url: https://creativecommons.org/licenses/by/4.0/
40 | format: Alto-XML
41 | volume:
42 | - count: 457
43 | metric: lines
44 | - count: 17
45 | metric: files
46 | - count: 45
47 | metric: regions
48 | - count: 16909
49 | metric: characters
50 | transcription-guidelines: "In general this meant deciding between diplomatic transcription\
51 | \ (i.e. sticking to what it says on the page) and gently modernized features (i.e.\
52 | \ reinterpreting medieval signs into modern equivalents) with a view to specific\
53 | \ categories. Read on for a summary of the rules and the respective rationale behind\
54 | \ them.\nSUMMARY\nPUNCTUATION\n\n Modern: medieval punctuation is transcribed\
55 | \ with modern equivalents; punctus elevatus transcribed as semicolon\n\nCAPITALIZATION\n\
56 | \n Diplomatic: Original capitalization retained\n\nABBREVIATIONS\n\n Diplomatic\
57 | \ where possible: Retain abbreviations and render glyphs as opposed to expanded\
58 | \ versions where possible\n \"*\" where original character isn't served: OCRopus\
59 | \ (at the point in time of transcription) could not handle some of the medieval\
60 | \ glyphs, even where a Unicode version was present. Abbreviations not in OCRopus\
61 | \ are uniformly transcribed as \"*\", in the case of a combined character (such\
62 | \ as a consonant with a macron) as the base character followed by \"*\" (e.g. \"\
63 | t*\"). The list of accepted characters in OCRopus can be found in this repository,\
64 | \ and downloaded and used as codec in the OCRopus training process.\n\nSPACING\n\
65 | \n Diplomatic: Preserve manuscript spacing, i.e. give diplomatic transcription\n\
66 | \nNUMBERS\n\n Diplomatic: retain original version of both Roman and Arabic numerals'\n"
67 | production-software: "eScriptorium + Kraken"
68 |
--------------------------------------------------------------------------------
/catalog/pbp/pbp.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: Paris Bible Project (PBP)
3 | url: https://github.com/parisbible/ground_truth
4 | authors:
5 | - name: Estelle
6 | surname: Guéville
7 | orcid: 0000-0003-2603-1051
8 | roles:
9 | - transcriber
10 | - aligner
11 | - project-manager
12 | - quality-control
13 | - name: David
14 | surname: Wrisley
15 | orcid: 0000-0002-0355-1487
16 | roles:
17 | - transcriber
18 | - aligner
19 | - project-manager
20 | - quality-control
21 | - name: Niccolò Acram
22 | surname: Cappelletto
23 | roles:
24 | - transcriber
25 | - aligner
26 | - quality-control
27 | institutions: []
28 | description: >-
29 | The Paris Bible Project aims to understand the production and diffusion of
30 | medieval Latin Bibles in Europe. The dataset includes ground truth from Paris
31 | Bibles produced in the 13th and 14th centuries. We also provide the most
32 | recent version of our list of Paris Bible manuscripts found in the world along
33 | with information about them.
34 | project-website: https://parisbible.github.io/
35 | language:
36 | - lat
37 | production-software: Transkribus
38 | script:
39 | - iso: Latn
40 | script-type: only-manuscript
41 | time:
42 | notBefore: '1200'
43 | notAfter: '1399'
44 | hands:
45 | count: more-than-10
46 | precision: estimated
47 | license:
48 | - name: CC-BY 4.0
49 | url: https://creativecommons.org/licenses/by/4.0/
50 | format: Alto-XML
51 | volume:
52 | - metric: lines
53 | count: 1700
54 | - metric: files
55 | count: 19
56 | - metric: regions
57 | count: 40
58 | - metric: characters
59 | count: 55970
60 | characters:
61 | mode: NFKD
62 | members:
63 | - i
64 | - e
65 | - t
66 | - u
67 | - a
68 | - s
69 | - o
70 | - n
71 | - ̄
72 | - c
73 | - m
74 | - r
75 | - l
76 | - ꝺ
77 | - "."
78 | - p
79 | - b
80 | - q
81 | - "⁊"
82 | - g
83 | - f
84 | - ́
85 | - ꝛ
86 | - h
87 | - "-"
88 | - d
89 | - ꝫ
90 | - ";"
91 | - x
92 | - ꝯ
93 | - ̾
94 | - ꝑ
95 | - ͥ
96 | - E
97 | - ̕
98 | - ꝝ
99 | - ̃
100 | - ꝓ
101 | - y
102 | - ̈
103 | - N
104 | - ̇
105 | - Q
106 | - "·"
107 | - D
108 | - S
109 | - I
110 | - A
111 | - ͦ
112 | - C
113 | - T
114 | - ᔆ
115 | - ꝙ
116 | - H
117 | - F
118 | - P
119 | - ͣ
120 | - '2'
121 | - V
122 | - M
123 | - ":"
124 | - R
125 | - z
126 | - L
127 | - O
128 | - U
129 | - v
130 | - "℟"
131 | - G
132 | - ͨ
133 | - ͧ
134 | - "&"
135 | - ẜ
136 | - ᷤ
137 | - ͤ
138 | - ʀ
139 | - B
140 | - X
141 | - Ꝙ
142 | - "?"
143 | - k
144 | - ᣳ
145 | - j
146 | - ͬ
147 | transcription-guidelines: 'See: https://parisbible.github.io/guidelines/'
148 |
--------------------------------------------------------------------------------
/catalog/enc-cours-git/hn-boccace.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: "De la g\xE9n\xE9alogie des dieux"
3 | url: https://github.com/PSL-Chartes-HTR-Students/HN2021-Boccace
4 | project-name: ENC - Bonnes pratiques du developpement collaboratif
5 | authors:
6 | - name: Vlachou Efstathiou
7 | surname: Malamatenia
8 | roles:
9 | - transcriber
10 | - project-manager
11 | - name: Leroy
12 | surname: "No\xE9"
13 | roles:
14 | - transcriber
15 | - project-manager
16 | - name: Maulu
17 | surname: Marco
18 | roles:
19 | - project-manager
20 | - quality-control
21 | description: "This repository hosts all the documents, including transcriptions, bibliographical\
22 | \ references and introduction that serve the team Boccace for the validation of\
23 | \ the course \"Bonnes pratiques du developpement collaboratif : initiation \xE0\
24 | \ Git\" (prof. Thibault Cl\xE9rice), of the first semester - Master Humanit\xE9\
25 | s Num\xE9riques ENC-PSL 2021-2022. At the same time it also constitutes part of\
26 | \ the biannual project \"Per un\u2019edizione digitale della Genealogia deorum gentilium\
27 | \ di Boccaccio\" (dir. F. Duval, M. Maulu). Financed in 2021, this project foresees\
28 | \ to put on line in XML format the unpublished translation in Middle French entitled\
29 | \ \"De la genealogie des dieux\".\n"
30 | language:
31 | - frm
32 | - lat
33 | script:
34 | - iso: Latn
35 | script-type: only-typed
36 | time:
37 | notBefore: '1472'
38 | notAfter: '1498'
39 | hands:
40 | count: 1-per-folder
41 | precision: exact
42 | license:
43 | - name: CC-BY 4.0
44 | url: https://creativecommons.org/licenses/by/4.0/
45 | format: Alto-XML
46 | volume:
47 | - metric: characters
48 | count: 109409
49 | - metric: files
50 | count: 47
51 | - metric: lines
52 | count: 3656
53 | - metric: regions
54 | count: 292
55 | sources:
56 | - reference: "Laurent Premierfait, Boccace (1498), \"De la genealogie des dieux\"\
57 | , Paris, A. V\xE9rard."
58 | link: 'https://gallica.bnf.fr/ark:/12148/bpt6k105063r?rk=21459;2'
59 | citation-file-link: https://raw.githubusercontent.com/PSL-Chartes-HTR-Students/HN2021-Boccace/main/CITATION.cff
60 | transcription-guidelines: 'No development of abbreviations. Special characters are
61 | used for the graphemic transcription, compatible with the Unicode mufi and the special
62 | character table of cremma-medieval. No correction of orthography errors, BUT proper
63 | transcription of inversed letters (for Inc59) such as character "n" printed as "u"
64 | in several cases. Spaces were added freely for word separation according to dictionaries
65 | of middle French and Latin (latin forms verified on Collatinus). For more documentation
66 | regarding the transcription norms and guidelines head to the repository and the
67 | report file.''''
68 |
69 | '
70 | production-software: "eScriptorium + Kraken"
71 |
--------------------------------------------------------------------------------
/catalog/parisTimeMachine/addresses-et-annuaires.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: "Donn\xE9es v\xE9rit\xE9 de terrain HTR+ Annuaire des propri\xE9taires et des\
3 | \ propri\xE9t\xE9s de Paris et du d\xE9partement de la Seine (1898-1923)"
4 | url: http://dx.doi.org/10.34847/nkl.acb724xs
5 | project-name: 'Groupe annuaires et adresses - Consortium Huma-num Paris Time Machine
6 |
7 | '
8 | project-website: https://paris-timemachine.huma-num.fr/groupe-adresses-et-annuaires/
9 | authors:
10 | - name: Elgarrista
11 | surname: Gabriela
12 | roles:
13 | - transcriber
14 | - quality-control
15 | - name: "M\xE9lanie-Becquet"
16 | surname: "Fr\xE9d\xE9rique"
17 | roles:
18 | - project-manager
19 | - quality-control
20 | - name: Brando
21 | surname: Carmen
22 | roles:
23 | - project-manager
24 | - quality-control
25 | description: "Annuaire des propri\xE9taires et des propri\xE9t\xE9s de Paris et du\
26 | \ d\xE9partement de la Seine. Lien dans le catalogue de la BNF : https://catalogue.bnf.fr/ark:/12148/cb32697229h.\
27 | \ Cr\xE9dits : Biblioth\xE8que nationale de France. Donn\xE9es v\xE9rit\xE9 de terrain\
28 | \ r\xE9sultant de la transcription et la segmentation manuelle d\u2019un \xE9chantillon\
29 | \ de 169 pages des annuaires appartenant aux volumes 1898 et 1923. Un mod\xE8le\
30 | \ de transcription HTR+ a \xE9t\xE9 entrain\xE9 \xE0 partir de cet \xE9chantillon\
31 | \ gr\xE2ce \xE0 Transkribus et est disponible sur cette plateforme en mode public.\
32 | \ Ce mod\xE8le est valable pour transcrire automatiquement les volumes de 1903 et\
33 | \ 1913 et tout autre document imprim\xE9 \xE0 deux colonnes et en utilisant l'alphabet\
34 | \ latin et particuli\xE8rement en fran\xE7ais. Le choix de l'\xE9chantillon est\
35 | \ fait par crit\xE8re alphab\xE9tique car c'est le mode d'organisation de l'information\
36 | \ dans ce document. Les accolades pr\xE9sentes dans le document n'ont pas \xE9t\xE9\
37 | \ segment\xE9es. 118 pages pour entrainer et 51 pages pour validation.\nContexte\
38 | \ et financement : Subvention DAHN (Dispositif de soutien \xE0 l'archivistique et\
39 | \ aux humanit\xE9s num\xE9riques) par le MESRI. Equipes : Consortium Paris Time\
40 | \ Machine - TGIR Humanum EHESS / CNRS / LATTICE / INRIA Contact si besoin d'anonymiser\
41 | \ les noms de personnes : carmen.brando@ehess.fr.\n"
42 | language:
43 | - fra
44 | script:
45 | - iso: Latn
46 | script-type: only-typed
47 | time:
48 | notBefore: '1898'
49 | notAfter: '1923'
50 | hands:
51 | count: less-than-11
52 | precision: estimated
53 | license:
54 | - name: CC-BY-SA 4.0
55 | url: https://creativecommons.org/licenses/by-sa/4.0/
56 | format: Alto-XML
57 | volume:
58 | - count: 169
59 | metric: pages
60 | - count: 19022
61 | metric: lines
62 | - count: 641401
63 | metric: characters
64 | transcription-guidelines: "Transcription diplomatique. Les accolades n'ont pas \xE9\
65 | t\xE9 segment\xE9es.\n"
66 | production-software: Transkribus
67 |
--------------------------------------------------------------------------------
/catalog/gallicorpora/print-16.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2022-04-15/schema.json
2 | title: "Donn\xE9es imprim\xE9s du 16e si\xE8cle"
3 | description: "Corpus d'entrainement pour l'HTR constitu\xE9 d'imprim\xE9s du 16e si\xE8\
4 | cle"
5 | url: https://github.com/Gallicorpora/HTR-imprime-16e-siecle
6 | authors:
7 | - name: Simon
8 | surname: Gabay
9 | roles:
10 | - project-manager
11 | - name: Ariane
12 | roles:
13 | - project-manager
14 | surname: Pinche
15 | - name: Malamatenia
16 | surname: Vlachou-Efstathiou
17 | roles:
18 | - transcriber
19 | - name: Kelly
20 | surname: Christensen
21 | roles:
22 | - support
23 | format: Alto-XML
24 | hands:
25 | count: 1-per-folder
26 | precision: estimated
27 | language:
28 | - frm
29 | - fra
30 | license:
31 | - name: CC-BY 4.0
32 | url: https://creativecommons.org/licenses/by/4.0/
33 | project-name: Gallicorpora
34 | project-website: https://github.com/Gallicorpora
35 | script:
36 | - iso: Latn
37 | script-type: only-typed
38 | time:
39 | notAfter: '1599'
40 | notBefore: '1500'
41 | transcription-guidelines: "Les normes de transcription suivent les pr\xE9conisations\
42 | \ du projet Gallicorpora"
43 | volume:
44 | - metric: characters
45 | count: 186202
46 | - metric: files
47 | count: 180
48 | - metric: lines
49 | count: 4918
50 | - metric: regions
51 | count: 591
52 | citation-file-link: https://github.com/Gallicorpora/HTR-imprime-16e-siecle/CITATION.cff
53 | production-software: eScriptorium + Kraken
54 | characters:
55 | mode: NFD
56 | members:
57 | - e
58 | - u
59 | - r
60 | - a
61 | - n
62 | - i
63 | - t
64 | - o
65 | - l
66 | - s
67 | - "\u017F"
68 | - d
69 | - c
70 | - m
71 | - p
72 | - ','
73 | - q
74 | - y
75 | - v
76 | - f
77 | - g
78 | - b
79 | - h
80 | - .
81 | - "\u2019"
82 | - '&'
83 | - E
84 | - x
85 | - ''''
86 | - z
87 | - "\u0301"
88 | - "\u0300"
89 | - A
90 | - "\xAC"
91 | - "\u0303"
92 | - D
93 | - C
94 | - R
95 | - ':'
96 | - L
97 | - I
98 | - S
99 | - P
100 | - N
101 | - M
102 | - O
103 | - Q
104 | - T
105 | - V
106 | - G
107 | - H
108 | - B
109 | - F
110 | - '-'
111 | - "\u0327"
112 | - j
113 | - '?'
114 | - (
115 | - "\u0308"
116 | - )
117 | - "\xBB"
118 | - '1'
119 | - "\u0153"
120 | - "\xB6"
121 | - '!'
122 | - U
123 | - '2'
124 | - X
125 | - ;
126 | - '9'
127 | - Y
128 | - '4'
129 | - '3'
130 | - "\xDF"
131 | - '5'
132 | - '"'
133 | - '7'
134 | - J
135 | - '8'
136 | - "\xE6"
137 | - "\uA770"
138 | - '6'
139 | - '0'
140 | - "\u0302"
141 | - "\u02B3"
142 | - "\u204A"
143 | - Z
144 | - "\xAB"
145 | - '*'
146 | - "\uA757"
147 | - "\uA753"
148 | - "\_"
149 | - "\u204B"
150 | - "\u0399"
151 | - "\uA751"
152 | - ']'
153 | - "\u0365"
154 | - "\u1D49"
155 | - "\u0395"
156 | - '['
157 | - "\u03A4"
158 | - /
159 |
--------------------------------------------------------------------------------
/catalog/ground-truth-set-for-handwritten-text-recognition-htr-ocr-dresdner-hofdiarium-1665-mscrdresdk80-17th-century-kurrent-manuscript/ground-truth-set-for-handwritten-text-recognition-htr-ocr-dresdner-hofdiarium-1665-mscrdresdk80-17th-century-kurrent-manuscript.yml:
--------------------------------------------------------------------------------
1 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
2 | title: >-
3 | Ground Truth Set for Handwritten Text Recognition (HTR/OCR): Dresdner
4 | Hofdiarium 1665 (Mscr.Dresd.K.80) - 17th century Kurrent manuscript
5 | url: https://doi.org/10.5281/zenodo.14356190
6 | authors:
7 | - name: Stefan
8 | surname: Beckert
9 | orcid: 0009-0005-2394-0075
10 | roles:
11 | - transcriber
12 | - aligner
13 | - project-manager
14 | - quality-control
15 | institutions: []
16 | description: >-
17 | This dataset contains ten pages of Ground Truth from the Dresden Court Diaries
18 | of elector Johann Georg II. as Page XML, Alto XML and jpg.
19 | language:
20 | - deu
21 | production-software: eScriptorium + Kraken
22 | automatically-aligned: false
23 | script:
24 | - iso: Latn
25 | qualify: Kurrent
26 | script-type: only-manuscript
27 | time:
28 | notBefore: '1665'
29 | notAfter: '1665'
30 | hands:
31 | count: '1'
32 | precision: exact
33 | license:
34 | name: CC-BY-NC-SA 4.0
35 | url: https://creativecommons.org/licenses/by/4.0/
36 | format: Alto-XML
37 | sources:
38 | - reference: >-
39 | Beckert, S. (2024). Ground Truth Set for Handwritten Text Recognition
40 | (HTR/OCR): Dresdner Hofdiarium 1665 (Mscr.Dresd.K.80) - 17th century
41 | Kurrent manuscript [Data set]. Zenodo.
42 | https://doi.org/10.5281/zenodo.14356190
43 | link: ''
44 | volume:
45 | - metric: pages
46 | count: 10
47 | transcription-guidelines: >-
48 | Transcription guidelines are based on the DTABF-M schema
49 | (https://www.deutschestextarchiv.de/doku/basisformat/manuskript.html), but
50 | have been adapted as follows:
51 |
52 |
53 | - I and J majuscules are not distinguished
54 |
55 | - u and v are reproduced true to the original (e.g. vnd)
56 |
57 | - Long-s (ſ) and round-s (s) are distinguished
58 |
59 | - sz ligature is rendered as ß in Kurrent scripts and as sz (e.g. "Libusza")
60 | in Antiqua scripts
61 |
62 | - ij ligature is rendered as y
63 |
64 | - other ligatures, if they occur at all, are dissolved
65 |
66 | - r graphemes are rendered as r in their modern day form
67 |
68 | - an m with a nasal stroke was rendered as a simple m
69 |
70 | - Where possible, abbreviation signs (Abbrechungszeichen) for the contemporary
71 | identification of abbreviations have been included as single letters and not
72 | marked separately. The subsequent punctuation mark (“.” or “:”) for further
73 | identification of the abbreviation has also been included (cf. also Cappelli,
74 | 1928, Lexicon abbreviaturarum I, p.X)
75 |
76 | - Diacritics in u were not marked
77 |
78 | - In the case of uncertain capitalization, an approximation is sought via the
79 | letter size
80 |
81 |
--------------------------------------------------------------------------------
/catalog/htromance/ita.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | - name: Rachele
3 | roles:
4 | - transcriber
5 | surname: Alba
6 | - name: Giorgia
7 | roles:
8 | - transcriber
9 | surname: Rubin
10 | - name: Federico
11 | orcid: 0000-0002-7810-7735
12 | roles:
13 | - project-manager
14 | - quality-control
15 | surname: Boschetti
16 | - name: Franz
17 | roles:
18 | - project-manager
19 | surname: Fischer
20 | - name: Alix
21 | orcid: 0000-0002-0136-4434
22 | roles:
23 | - project-manager
24 | surname: Chagué
25 | - name: Thibault
26 | orcid: 0000-0003-1852-9204
27 | roles:
28 | - project-manager
29 | surname: Clérice
30 | automatically-aligned: false
31 | characters:
32 | members:
33 | - e
34 | - a
35 | - o
36 | - i
37 | - l
38 | - n
39 | - r
40 | - t
41 | - u
42 | - s
43 | - c
44 | - d
45 | - m
46 | - p
47 | - g
48 | - h
49 | - f
50 | - .
51 | - ̃
52 | - q
53 | - b
54 | - ⁊
55 | - ','
56 | - ꝑ
57 | - E
58 | - C
59 | - z
60 | - x
61 | - ̾
62 | - A
63 | - I
64 | - ̧
65 | - D
66 | - L
67 | - M
68 | - ͤ
69 | - O
70 | - S
71 | - R
72 | - ͧ
73 | - y
74 | - ꝙ
75 | - ͬ
76 | - ł
77 | - F
78 | - N
79 | - U
80 | - T
81 | - Q
82 | - ͦ
83 | - P
84 | - B
85 | - ́
86 | - ͥ
87 | - '='
88 | - ':'
89 | - ꝯ
90 | - X
91 | - ẜ
92 | - G
93 | - ͣ
94 | - H
95 | - '2'
96 | - '9'
97 | - '1'
98 | - ¶
99 | - '4'
100 | - ꝓ
101 | - '3'
102 | - '5'
103 | - k
104 | - ͭ
105 | - '7'
106 | - '8'
107 | - /
108 | - "'"
109 | - ε
110 | - ɨ
111 | - đ
112 | - '6'
113 | - ι
114 | - ο
115 | - '0'
116 | - ̓
117 | - ν
118 | - ꝗ
119 | - ̈
120 | - μ
121 | - λ
122 | - ꝰ
123 | - α
124 | - ω
125 | - π
126 | - σ
127 | - ͫ
128 | - Y
129 | - '-'
130 | - θ
131 | - γ
132 | - η
133 | - Ο
134 | - υ
135 | - ρ
136 | - ̔
137 | - ͂
138 | - β
139 | - +
140 | - Z
141 | mode: NFD
142 | description: Transcription of samples of Medieval Italian manuscripts
143 | format: Alto-XML
144 | hands:
145 | count: 1-per-folder
146 | precision: estimated
147 | language:
148 | - ita
149 | - vec
150 | license:
151 | name: CC-BY 4.0
152 | url: https://creativecommons.org/licenses/by/4.0/
153 | production-software: eScriptorium + Kraken
154 | project-name: HTRomance
155 | schema: https://htr-united.github.io/schema/2023-06-27/schema.json
156 | script:
157 | - iso: Latn
158 | script-type: only-manuscript
159 | time:
160 | notAfter: '1499'
161 | notBefore: '1100'
162 | title: HTRomance, Medieval Italian corpus of ground-truth for Handwritten Text Recognition
163 | and Layout Segmentation
164 | url: https://github.com/HTRomance-Project/medieval-italian
165 | volume:
166 | - count: 84366
167 | metric: characters
168 | - count: 60
169 | metric: files
170 | - count: 3086
171 | metric: lines
172 | - count: 60
173 | metric: pages
174 | - count: 353
175 | metric: regions
176 |
--------------------------------------------------------------------------------