├── LICENSE ├── README.md └── Make human-editable POS text from TLG.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Classical Language Toolkit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # About 2 | The file `pos_editable_xenophon_anabasis.md` is a human-editable POS-tagged text of [Xenophon's *Anabasis*](https://en.wikipedia.org/wiki/Anabasis_%28Xenophon%29). Any GitHub user can check the quality of a sentence's POS tags, fix or confirm them, and add their name. 3 | 4 | The text was tagged using [the CLTK's TnT tagger](http://docs.cltk.org/en/latest/greek.html#tnt-tagger). 
The file `Make human-editable POS text from TLG.py` generated `pos_editable_xenophon_anabasis.md`, which was then broken down into files for each book. 5 | 6 | Once the *Anabasis* has been tagged, in part or in full, it will be added to [the CLTK's POS tagging training set](https://github.com/cltk/greek_treebank_perseus/blob/master/greek_training_set.pos). 7 | 8 | # How to edit 9 | To begin editing, make a free GitHub user account, [fork this repository](https://help.github.com/articles/fork-a-repo/), pick a book, find a sentence which has not been edited, and check and fix the tags. Then, [submit a pull request](https://help.github.com/articles/creating-a-pull-request/). 10 | 11 | 12 | # POS tags 13 | This text uses the Perseus project's POS tags ([read about them here](http://nlp.perseus.tufts.edu/syntax/treebank/greek.html)). 14 | 15 | 1: part of speech 16 | 17 | n noun 18 | v verb 19 | t participle 20 | a adjective 21 | d adverb 22 | l article 23 | g particle 24 | c conjunction 25 | r preposition 26 | p pronoun 27 | m numeral 28 | i interjection 29 | e exclamation 30 | u punctuation 31 | 32 | 2: person 33 | 34 | 1 first person 35 | 2 second person 36 | 3 third person 37 | 38 | 3: number 39 | 40 | s singular 41 | p plural 42 | d dual 43 | 44 | 4: tense 45 | 46 | p present 47 | i imperfect 48 | r perfect 49 | l pluperfect 50 | t future perfect 51 | f future 52 | a aorist 53 | 54 | 5: mood 55 | 56 | i indicative 57 | s subjunctive 58 | o optative 59 | n infinitive 60 | m imperative 61 | p participle 62 | 63 | 6: voice 64 | 65 | a active 66 | p passive 67 | m middle 68 | e medio-passive 69 | 70 | 7: gender 71 | 72 | m masculine 73 | f feminine 74 | n neuter 75 | 76 | 8: case 77 | 78 | n nominative 79 | g genitive 80 | d dative 81 | a accusative 82 | v vocative 83 | l locative 84 | 85 | 9: degree 86 | 87 | c comparative 88 | s superlative 89 | 90 | --- 91 | 92 | For example, the POS tag for the noun "a)/ndra" is "n-s---ma-", 93 | which corresponds to the following 
# features:
#
#     1: n noun
#     2: -
#     3: s singular
#     4: -
#     5: -
#     6: -
#     7: m masculine
#     8: a accusative
#     9: -
#
#
# # License
# The MIT License (see `LICENSE`). The Greek text is public domain, [Marchant's *Xenophontis opera omnia* (Oxford's OCT, 1904)](http://books.google.com/books?id=4rQ4AQAAMAAJ&printsec=frontcover&dq=Xenophontis+opera+omnia+marchant&hl=en&sa=X&ei=i4NdVK28J4X1iQL13IHADg&ved=0CB0Q6AEwAA#v=onepage&q=Xenophontis%20opera%20omnia%20marchant&f=false).
#
# --------------------------------------------------------------------------------
# /Make human-editable POS text from TLG.py:
# --------------------------------------------------------------------------------

# coding: utf-8

# In[1]:

from cltk.tag.pos.pos_tagger import POSTag
from cltk.tokenize.sentence.tokenize_sentences import TokenizeSentence
import os
import re


# In[2]:

def extract_tlg_work(file_path, regex_match):
    """Return the text of one work cut out of a compiled TLG author file.

    :param file_path: Path to the author file; a leading ``~`` is expanded.
    :param regex_match: Pattern with at least two capture groups. The second
        group of the match is what gets returned.
    :returns: The string captured by the second group.
    :raises ValueError: If the pattern matches nothing in the file.
    """
    abs_path = os.path.expanduser(file_path)
    # The corpus is Greek text, so do not depend on the platform's default
    # encoding when decoding it.
    with open(abs_path, encoding='utf-8') as f:
        text = f.read()
    matches = re.findall(regex_match, text)
    if not matches:
        # Without this guard the function fails later with an obscure
        # unbound-variable error instead of naming the real problem.
        raise ValueError('no match for %r in %s' % (regex_match, abs_path))
    # NOTE(review): the last match is used; with the single-match pattern this
    # script passes in, first and last are the same. Confirm if reused.
    return matches[-1][1]


# In[3]:

anabasis_path = '~/cltk_data/compiled/tlg/TLG0032.txt'
anabasis_regex = r'(@1 \{1ΚΥΡΟΥ ΑΝΑΒΑΣΕΩΣ Α\}1 @)(.*)( @1 \{1ΚΥΡΟΥ ΠΑΙΔΕΙΑΣ Α\}1 @)'
anabasis_raw = extract_tlg_work(anabasis_path, anabasis_regex)


# In[4]:

def cleanup_tlg_txt(tlg_str):
    """Strip TLG markup and repair transliteration leftovers in *tlg_str*.

    The substitutions run in a fixed order: diaeresis repairs first, then
    removal of book titles, bracketed words, digits and the ``@``, ``%`` and
    NUL characters, and finally spacing around em dashes.

    :param tlg_str: Raw text of a work.
    :returns: The cleaned text.
    """
    # fix beta code transliteration problems
    tlg_str = re.sub(r'ι\+', 'ϊ', tlg_str)
    tlg_str = re.sub(r'ί\+', 'ΐ', tlg_str)
    # NOTE(review): this pattern matches a backslash followed by ANY
    # character and replaces both with '.'. If only a literal "\." was meant,
    # the pattern should be r'\\\.' -- confirm before changing.
    tlg_str = re.sub(r'\\.', '.', tlg_str)
    # fix tlg markup
    tlg_str = re.sub(r'@1 \{1.+?\}1 @', '', tlg_str)  # rm book titles
    tlg_str = re.sub(r'\[.+?\]', '', tlg_str)  # rm words in square brackets
    tlg_str = re.sub(r'[0-9]', '', tlg_str)
    tlg_str = re.sub(r'@|%|\x00', '', tlg_str)
    # NOTE(review): the reviewed copy of this file was split in the middle of
    # the replacement literal below, so its exact trailing character could not
    # be recovered -- confirm against the original.
    tlg_str = re.sub('—', ' — ', tlg_str)
    return tlg_str


# In[5]:

anabasis_clean = cleanup_tlg_txt(anabasis_raw)


# In[6]:

def tokenize_sentences(in_str):
    """Split *in_str* into a list of sentences with the CLTK Greek tokenizer.

    :param in_str: Text to split.
    :returns: Whatever ``TokenizeSentence.sentence_tokenizer`` returns for
        the ``'greek'`` language.
    """
    t = TokenizeSentence()
    return t.sentence_tokenizer(in_str, 'greek')


# In[7]:

anabasis_sentences = tokenize_sentences(anabasis_clean)


# In[8]:

def append_to_file(file_name, pos_str):
    """Append *pos_str* to ``~/cltk_data/user_data/pos_editable_<file_name>.md``.

    The target directory is created if it does not exist yet.

    :param file_name: Middle part of the output file name.
    :param pos_str: Text to append.
    """
    user_data = os.path.expanduser('~/cltk_data/user_data/')
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(user_data, exist_ok=True)
    file_name = 'pos_editable_' + str(file_name) + '.md'
    file_path = os.path.join(user_data, file_name)
    # Greek output: write UTF-8 explicitly rather than the platform default.
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(pos_str)


# In[9]:

def editable_pos_text(untagged_sentences):
    """POS-tag each sentence and append a Markdown section for it to disk.

    Every section holds the sentence number (starting at 1), the plain
    sentence, one tagged word per line, the words tagged ``'Unk'``, and an
    empty "Corrected by" slot. Sections are written one at a time, so output
    produced so far is kept if a later sentence fails.

    :param untagged_sentences: Iterable of sentence strings.
    """
    p = POSTag()
    for counter, sentence in enumerate(untagged_sentences, start=1):
        tagged_words = p.tnt_tagger(sentence, 'greek')  # ~ 6 sec. per sent
        tag_lines = []
        unknowns = []  # words the tagger marked as unknown
        for tagged_word in tagged_words:
            tag_lines.append(str(tagged_word) + '\n')
            if tagged_word[1] == 'Unk':
                unknowns.append(tagged_word[0])
        tags_newlines = ''.join(tag_lines)
        # Template spelled out line by line so its exact whitespace is
        # explicit.
        # NOTE(review): assumes the original triple-quoted template had no
        # leading indentation on its lines -- confirm.
        sent_str_out = (
            '## Sentence %s\n'
            '### Plaintext\n'
            '%s\n'
            '```\n'
            '### Tagged\n'
            '%s```\n'
            '### Unknown words\n'
            '%s\n'
            '### Corrected by\n'
            "['']\n"
            '\n'
        ) % (counter, sentence, tags_newlines, unknowns)
        append_to_file('xenophon_anabasis', sent_str_out)


# In[10]:

editable_pos_text(anabasis_sentences)

# --------------------------------------------------------------------------------