"""Sentence normalisation (utils.py) and corpus reading (reader.py)."""
from xml.etree import ElementTree

__author__ = ["Clément Besnier "]

# Characters treated as sentence-final punctuation.
PUNCTUATION = ",;.?!"


def normalize_sentence(sentence: list) -> str:
    """Join the tokens of one sentence into a display string.

    A final token made up solely of characters from ``PUNCTUATION`` is glued
    to the word before it, the first character of the sentence is
    upper-cased, and the tokens are joined with single spaces.

    :param sentence: list of tokens; ``None`` and empty tokens are ignored.
        The list itself is never modified.
    :return: the normalised sentence, or ``""`` when there is no usable token.
    """
    # Empty XML elements yield ``None`` as text; such tokens cannot be joined.
    tokens = [token for token in sentence if token]
    if not tokens:
        return ""

    if len(tokens) >= 2 and all(char in PUNCTUATION for char in tokens[-1]):
        closing = tokens.pop()
        tokens[-1] += closing

    # ``str.capitalize`` would also lower-case the rest of the word, so only
    # the first character is touched here.
    tokens[0] = tokens[0][:1].upper() + tokens[0][1:]
    return " ".join(tokens)


def read_corpus(path: str = "corpora/fsv-aldrelagar.xml") -> list:
    """Read a corpus XML file and return one string per top-level element.

    The tree is walked four levels deep below the root (named text,
    paragraph, sentence and word after the loop variables). The words of a
    sentence are normalised with ``normalize_sentence``, sentences of a
    paragraph are joined with a space and paragraphs of a text with a blank
    line.

    :param path: location of the XML file; defaults to the file the function
        originally hard-coded.
    :return: list of strings, one per child of the XML root.
    """
    # Parsing from the file lets the XML parser deal with the encoding itself.
    root = ElementTree.parse(path).getroot()
    texts = []
    for text in root:
        paragraphs = []
        for paragraph in text:
            sentences = (
                normalize_sentence([word.text for word in sentence])
                for sentence in paragraph
            )
            # Skip sentences without tokens so they do not add stray spaces.
            paragraphs.append(" ".join(s for s in sentences if s))
        texts.append("\n\n".join(paragraphs))
    return texts


if __name__ == "__main__":
    print(normalize_sentence(["bonjour", "ok", "oui", "non", "."]))
    print(read_corpus())
"""Download and unpack bz2-compressed XML corpora from Språkbanken."""
import bz2
import os

__author__ = ["Clément Besnier "]


def _fetch_and_unpack(link, archive_dir, corpus_dir):
    """Download one ``.bz2`` archive, store it, and write its unpacked content."""
    # Imported here so the module stays importable without ``requests``
    # until a download is actually requested.
    from requests import get

    response = get(link, timeout=60)
    # Fail on HTTP errors instead of saving an error page as an archive.
    response.raise_for_status()
    archive = response.content

    archive_name = link.split("/")[-1]
    with open(os.path.join(archive_dir, archive_name), "wb") as f:
        f.write(archive)
    # Decompress the bytes already in memory rather than re-reading the file.
    with open(os.path.join(corpus_dir, archive_name.split(".bz2")[0]), "wb") as f:
        f.write(bz2.decompress(archive))


def download_spraakbanken_corpora(links=None, folder="corpora", files="files"):
    """Download bz2 archives and unpack them.

    Each archive is saved unchanged in ``files`` and its decompressed content
    is written to ``folder`` under the same name minus the ``.bz2`` suffix.
    Both directories are created when missing.

    :param links: iterable of archive URLs; ``None`` selects the five
        Språkbanken corpora this function originally hard-coded.
    :param folder: directory receiving the decompressed files.
    :param files: directory receiving the downloaded archives.
    :return: None
    :raises requests.HTTPError: when a download answers with an error status.
    """
    os.makedirs(files, exist_ok=True)
    os.makedirs(folder, exist_ok=True)

    if links is None:
        links = [
            "http://spraakbanken.gu.se/lb/resurser/meningsmangder/fsv-verser.xml.bz2",
            "http://spraakbanken.gu.se/lb/resurser/meningsmangder/fsv-aldrelagar.xml.bz2",
            "http://spraakbanken.gu.se/lb/resurser/meningsmangder/fsv-aldrereligiosprosa.xml.bz2",
            "http://spraakbanken.gu.se/lb/resurser/meningsmangder/fsv-yngretankebocker.xml.bz2",
            "http://spraakbanken.gu.se/lb/resurser/meningsmangder/fsv-yngrelagar.xml.bz2",
        ]
    for link in links:
        _fetch_and_unpack(link, files, folder)


if __name__ == "__main__":
    download_spraakbanken_corpora()
[Metadata](https://spraakbanken.gu.se/eng/resource/fsv-yngrelagar/json) 11 | * [Download](http://spraakbanken.gu.se/lb/resurser/meningsmangder/fsv-yngrelagar.xml.bz2) 12 | * [Fornsvenska äldre lagar](https://spraakbanken.gu.se/eng/resource/fsv-aldrelagar) 13 | * [Download](http://spraakbanken.gu.se/lb/resurser/meningsmangder/fsv-aldrelagar.xml.bz2) 14 | * [Metadata](https://spraakbanken.gu.se/eng/resource/fsv-aldrelagar/json) 15 | * [Fornsvenska äldre religiosprosa](https://spraakbanken.gu.se/eng/resource/fsv-aldrereligiosprosa) 16 | * [Metadata](https://spraakbanken.gu.se/eng/resource/fsv-aldrereligiosprosa/json) 17 | * [Download](http://spraakbanken.gu.se/lb/resurser/meningsmangder/fsv-aldrereligiosprosa.xml.bz2) 18 | * [Fornsvenska yngre tankeböcker](https://spraakbanken.gu.se/eng/resource/fsv-yngretankebocker) 19 | * [Metadata](https://spraakbanken.gu.se/eng/resource/fsv-yngretankebocker/json) 20 | * [Download](http://spraakbanken.gu.se/lb/resurser/meningsmangder/fsv-yngretankebocker.xml.bz2) 21 | * Articles 22 | * [Old Swedish Part-of-Speech Tagging between Variation and External Knowledge](http://www.aclweb.org/anthology/W16-2104) 23 | * [Part-of-speech and Morphology Tagging Old Swedish](http://people.cs.umu.se/johanna/sltc2016/abstracts/SLTC_2016_paper_46.pdf) 24 | * [Something Old, Something New – Applying a Pre-trained Parsing Model to Clinical Swedish](http://www.aclweb.org/anthology/W11-4641) 25 | 26 | 27 | 28 | --------------------------------------------------------------------------------