├── requirements.txt ├── README.md └── reader.py /requirements.txt: -------------------------------------------------------------------------------- 1 | lxml 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Referenzkorpus Mittelhochdeutsch 2 | 3 | ## Source and license 4 | [Main page of the project](https://www.linguistics.rub.de/rem/access/index.html) 5 | 6 | License: 7 | 8 | > Das Referenzkorpus Mittelhochdeutsch ist lizenziert unter einer [Creative Commons Namensnennung - Weitergabe unter gleichen Bedingungen 4.0 International Lizenz](https://creativecommons.org/licenses/by-sa/4.0/). 9 | 10 | No changes are made to the corpus. This code is intended to parse the corpus. 11 | 12 | ## Corpus retrieval 13 | 14 | 1. Go to https://www.linguistics.rub.de/rem/access/index.html. 15 | 2. Click on "CORA-XML AKS .TAR.XZ" or "CORA-XML ALS .ZIP" 16 | 3. Click on "Herunterladen". 17 | 4. Uncompress the downloaded file. 18 | 5. You have a folder, named **rem-corraled-20161222** (2019-09-18) with a list of XML files which are annotated texts. 19 | 20 | ## Code 21 | The available code will parse individual XML files. 
22 | -------------------------------------------------------------------------------- /reader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reader of the Nibelungenlied in Referenzkorpus Mittelhochdeutsch 3 | 4 | Source: https://www.linguistics.rub.de/rem/access/index.html 5 | """ 6 | 7 | 8 | import os 9 | from lxml import etree 10 | 11 | __author__ = ["Clément Besnier ", ] 12 | 13 | nibelungenlied_filename = "M321-G1.xml" 14 | 15 | 16 | def get_root(filename): 17 | parser = etree.XMLParser(load_dtd=True, no_network=False) 18 | tree = etree.parse(os.path.join(filename), parser=parser) 19 | return tree.getroot() 20 | 21 | 22 | def extract_annotations(entry): 23 | return {child.tag: child.get("tag") for child in entry.getchildren()} 24 | 25 | 26 | def extract_by_tag(tag, tokens): 27 | return [token[tag] for token in tokens if tag in token] 28 | 29 | 30 | def extract_normalized_text(root): 31 | return [token.get("utf") for token in root.findall(".//tok_anno")] 32 | 33 | 34 | def extract_token_ranges(token_range: str): 35 | pass 36 | 37 | 38 | def extract_line_ranges(line_ranges): 39 | pass 40 | 41 | 42 | def reconstitute_text(root): 43 | pages_ranges = {page.get("id")[1:]: page.get("range")[1:] for page in root.findall(".//page")} 44 | columns_ranges = {line.get("id")[1:]: line.get("range") for line in root.findall(".//column")} 45 | lines_ranges = {line.get("id"): line.get("range") for line in root.findall(".//line")} 46 | lines = [[[lines_ranges[line] for line in lines_ranges] for column in columns_ranges 47 | if pages_ranges[page] == column] for page in pages_ranges] 48 | # TODO extract from each column, a range of line 49 | # TODO extract from each line, a range of tokens 50 | 51 | 52 | if __name__ == "__main__": 53 | root = get_root(nibelungenlied_filename) 54 | tokens = [extract_annotations(entry) for entry in root.findall(".//tok_anno")] 55 | complete_text = [token.get("utf") for token in 
root.findall(".//tok_anno")] 56 | normalized_text = extract_by_tag("norm", tokens) 57 | # lemmata = extract_by_tag("lemma", tokens) 58 | # pos_tags = extract_by_tag("pos", tokens) 59 | # inflections = extract_by_tag("infl", tokens) 60 | print(complete_text[:100]) 61 | reconstitute_text(root) 62 | 63 | --------------------------------------------------------------------------------