├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── api_json.py ├── api_old.py ├── convert_tei_json_to_simple_json ├── __init__.py ├── book_chapter.py ├── book_line.py └── convert_all_perseus_xml.py ├── gunicorn_start.sh ├── metadata ├── __init__.py ├── commentary │ └── __init__.py ├── criticism │ ├── __init__.py │ └── criticism.py ├── definition │ ├── _init_.py │ └── views.py ├── entities │ ├── __init__.py │ ├── dbpedia.py │ ├── entity.py │ ├── pleiades.py │ ├── viaf.py │ └── wikipedia.py ├── media │ └── __init__.py ├── pos │ ├── __init__.py │ ├── constants.py │ └── views.py ├── prosody │ ├── __init__.py │ ├── scansion.py │ └── scansion_to_html.py ├── stem │ ├── __init__.py │ └── views.py ├── text_reuse │ └── __init__.py ├── tokenize │ └── __init__.py ├── translations │ ├── __init__.py │ └── map_translation.py └── vector │ └── __init__.py ├── perseus_parsing_notes.txt ├── requirements.txt ├── tests.py └── util ├── __init__.py ├── jsonp.py ├── numerals.py └── text.py /.gitignore: -------------------------------------------------------------------------------- 1 | *\~ 2 | venv 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # PyBuilder 60 | target/ 61 | 62 | ######################### 63 | # Emacs temporary files # 64 | ######################### 65 | *~ 66 | \#*\# 67 | /.emacs.desktop 68 | /.emacs.desktop.lock 69 | *.elc 70 | auto-save-list 71 | tramp 72 | .\#* 73 | 74 | ################# 75 | # Other Editors # 76 | ################# 77 | *.sw[po] 78 | .idea/ 79 | *.iml 80 | *.iws 81 | 82 | ########################## 83 | # Temporary backup files # 84 | ########################## 85 | Backup of*.docx 86 | 87 | ###################### 88 | # OS generated files # 89 | ###################### 90 | .DS_Store 91 | .DS_Store? 
92 | ._* 93 | .Spotlight-V100 94 | .Trashes 95 | ehthumbs.db 96 | Thumbs.db 97 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | 3 | language: python 4 | 5 | python: 6 | - "3.5" 7 | 8 | before_script: 9 | - pip install --upgrade pip 10 | - pip install -r requirements.txt 11 | 12 | script: 13 | # Notes on nose: 14 | # Travis CI pre-installs `nose` 15 | - nosetests 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Classical Language Toolkit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/cltk/cltk_api.svg?branch=master)](https://travis-ci.org/cltk/cltk_api) 2 | 3 | [![Join the chat at https://gitter.im/cltk/cltk_api](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/cltk/cltk_api?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 4 | 5 | # Notice 6 | 7 | The Classics Archive application is currently under active development and is not ready for production. 8 | 9 | # About 10 | 11 | A simple Flask app for serving texts from the CLTK corpora. Currently under development. 12 | 13 | To run with gunicorn: `gunicorn -w 4 -b 0.0.0.0:5000 api_json:app`. 14 | 15 | ## Development 16 | 17 | To get started developing, you'll need Python 3.5 and MongoDB installed. 
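The API serves JSON texts out of `~/cltk_data`, so you'll also want at least one Perseus corpus installed locally. A minimal sketch using the CLTK corpus importer (assumes the `cltk` package is installed; `latin_text_perseus` is the corpus name the rest of this repo expects):

`$ python -c "from cltk.corpus.utils.importer import CorpusImporter; CorpusImporter('latin').import_corpus('latin_text_perseus')"`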
18 | 19 | Create a virtual environment and activate it: 20 | 21 | `$ pyvenv venv` 22 | `$ source venv/bin/activate` 23 | 24 | Install dependencies: 25 | 26 | `$ pip install -r requirements.txt` 27 | 28 | Finally, start the app with the following command: 29 | 30 | `$ python api_json.py` 31 | -------------------------------------------------------------------------------- /api_json.py: -------------------------------------------------------------------------------- 1 | """Open JSON file and serve.""" 2 | 3 | import json 4 | import os 5 | 6 | from flask import Flask 7 | from flask import request # for getting query string 8 | # eg: request.args.get('user') will get '?user=some-value' 9 | from flask_restful import Resource, Api 10 | from util.jsonp import jsonp 11 | from metadata.pos.views import POSTagger 12 | from metadata.stem.views import Stem 13 | from metadata.definition.views import Definition 14 | 15 | from flask_restful import reqparse 16 | 17 | app = Flask(__name__) 18 | api = Api(app) 19 | 20 | 21 | # example 22 | class HelloWorld(Resource): 23 | def get(self): 24 | return {'hello': 'world'} 25 | 26 | 27 | # example 28 | class TodoSimple(Resource): 29 | def get(self, todo_id): 30 | return {'example with token': todo_id} 31 | 32 | 33 | def open_json(fp): 34 | """Open a JSON file, return its contents as a dict.""" 35 | with open(fp) as fo: 36 | return json.load(fo) 37 | 38 | 39 | def get_cltk_text_dir(lang, corpus='perseus'): 40 | """Take a language (and corpus), return the absolute path to its JSON texts.""" 41 | cltk_home = os.path.expanduser('~/cltk_data') 42 | text_dir = os.path.join(cltk_home, lang.casefold(), 'text', lang.casefold() + '_text_' + corpus, 'json') 43 | return text_dir 44 | 45 | def get_cltk_translation_dir(lang, translation_lang, corpus='perseus'): 46 | """Take a language (and corpus), return the absolute path to its translations.""" 47 | cltk_home = os.path.expanduser('~/cltk_data') 48 | translation_dir = os.path.join(cltk_home, lang.casefold(), 'text', lang.casefold() + '_text_' + corpus, 'translation', translation_lang) 49 | return translation_dir 50 | 51 | def get_cltk_commentary_dir(lang, corpus='perseus'): 52 | """Take a language (and corpus), return the absolute path to its commentaries.""" 53 | cltk_home = os.path.expanduser('~/cltk_data') 54 | commentary_dir = os.path.join(cltk_home, lang.casefold(), 'text', lang.casefold() + '_text_' + corpus, 'commentary') 55 | return commentary_dir 56 | 57 | class Text(Resource): 58 | 59 | def get(self, lang, corpus, author, work): 60 | 61 | parser = reqparse.RequestParser() 62 | parser.add_argument('translation') 63 | parser.add_argument('commentary') 64 | args = parser.parse_args() 65 | translation_lang = args.get('translation') 66 | commentary_author = args.get('commentary') 67 | 68 | if commentary_author: 69 | _dir = get_cltk_commentary_dir(lang) 70 | file = author + "__" + work + ".json" 71 | json_fp = os.path.join(_dir, file) 72 | 73 | try: 74 | file_dict = open_json(json_fp) 75 | except Exception: 76 | return 77 | 78 | commentary = [] 79 | if commentary_author == "all": 80 | # Add all commentary 81 | commentary = file_dict["commentary"] 82 | else: 83 | # Add commentary by specific author 84 | for item in file_dict["commentary"]: 85 | print(item) 86 | if item['author'] == commentary_author: 87 | commentary.append(item) 88 | 89 | return {'language': lang, 90 | 'corpus': corpus, 91 | 'author': author, 92 | 'work': work, 93 | 'commentary': commentary, 94 | 'meta': file_dict['meta'], 95 | } 96 | 97 | elif translation_lang: 98 | # Translation files live under translation/<language>/ and are named "author__work.json" 99 | _dir = 
get_cltk_translation_dir(lang, translation_lang) 100 | file = author + "__" + work + ".json" 101 | json_fp = os.path.join(_dir, file) 102 | 103 | try: 104 | file_dict = open_json(json_fp) 105 | except Exception: 106 | return 107 | 108 | return {'language': lang, 109 | 'corpus': corpus, 110 | 'author': author, 111 | 'work': work, 112 | 'translations': file_dict['translations'], 113 | 'meta': file_dict['meta'], 114 | } 115 | 116 | else: 117 | _dir = get_cltk_text_dir(lang) 118 | file = author + "__" + work + ".json" 119 | 120 | json_fp = os.path.join(_dir, file) 121 | 122 | try: 123 | file_dict = open_json(json_fp) 124 | except Exception: 125 | return 126 | 127 | text = file_dict['text'] 128 | 129 | chunk1 = request.args.get('chunk1') 130 | chunk2 = request.args.get('chunk2') 131 | chunk3 = request.args.get('chunk3') 132 | 133 | if chunk1: 134 | text = text[chunk1] 135 | 136 | if chunk2: 137 | text = text[chunk2] 138 | 139 | if chunk3: 140 | text = text[chunk3] 141 | 142 | return {'language': lang, 143 | 'corpus': corpus, 144 | 'author': author, 145 | 'work': work, 146 | 'text': text, 147 | 'meta': file_dict['meta'], 148 | } 149 | 150 | 151 | class Lang(Resource): 152 | def get(self): 153 | 154 | cltk_home = os.path.expanduser('~/cltk_data') 155 | dirs = os.listdir(cltk_home) 156 | langs_with_perseus_corpus = [] 157 | for _dir_lang in dirs: 158 | is_perseus_corpus = get_cltk_text_dir(_dir_lang) 159 | if os.path.isdir(is_perseus_corpus): 160 | langs_with_perseus_corpus.append(_dir_lang) 161 | 162 | return {'languages': langs_with_perseus_corpus} 163 | 164 | 165 | class Corpus(Resource): 166 | 167 | def get(self, lang): 168 | 169 | possible_perseus_corpora_json = get_cltk_text_dir(lang) 170 | possible_perseus_corpora = os.path.split(possible_perseus_corpora_json)[0] 171 | is_perseus = os.path.isdir(possible_perseus_corpora) 172 | corpora = [] 173 | if is_perseus and possible_perseus_corpora.endswith('_perseus'): 174 | corpus_name = os.path.split(possible_perseus_corpora)[1] 175 | corpora.append('perseus') 176 | 177 | return {'language': lang, 178 | 'corpora': corpora} 179 | 180 | class Author(Resource): 181 | def get(self, lang, corpus): 182 | 183 | possible_perseus_corpora_json = get_cltk_text_dir(lang) 184 | 185 | authors = set() # use set to avoid dupes 186 | if os.path.isdir(possible_perseus_corpora_json): 187 | files = os.listdir(possible_perseus_corpora_json) 188 | for file in files: 189 | author = file.split('__')[0] 190 | authors.add(author) 191 | else: 192 | print('Corpus not installed into "~/cltk_data".') 193 | 194 | return {'language': lang, 195 | 'authors': list(authors)} # cast to list, set() not serializable 196 | 197 | class Texts(Resource): 198 | def get(self, lang, corpus, author): 199 | home_dir = os.path.expanduser('~/cltk_data') 200 | possible_corpus = os.path.join(home_dir, lang, 'text', lang + '_text_' + corpus, 'json') 201 | dir_contents = os.listdir(possible_corpus) 202 | 203 | texts = [] 204 | for file in dir_contents: 205 | if file.startswith(author): 206 | text = file.split('__')[1][:-5] 207 | texts.append(text) 208 | 209 | return {'language': lang, 210 | 'corpus': corpus, 211 | 'author': author, 212 | 'texts': texts} 213 | 214 | # http://localhost:5000/lang/latin/corpus/perseus/author/vergil/text 215 | # http://localhost:5000/lang/greek/corpus/perseus/author/homer/text 216 | api.add_resource(Texts, '/lang/<string:lang>/corpus/<string:corpus>/author/<string:author>/text') 217 | 218 | # http://localhost:5000/lang/latin/corpus/perseus/author 219 | api.add_resource(Author, '/lang/<string:lang>/corpus/<string:corpus>/author')
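# Illustrative sketch (comments only, not executed by the app): how the Text
# resource's chunk1/chunk2/chunk3 query parameters drill into a Perseus JSON
# 'text' mapping. The sample value below is hypothetical.
# >>> sample_text = {'1': {'1': {'1': 'Arma virumque cano, Troiae qui primus ab oris'}}}
# >>> sample_text['1']['1']['1']  # i.e. ?chunk1=1&chunk2=1&chunk3=1
# 'Arma virumque cano, Troiae qui primus ab oris'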
220 | 221 | # http://localhost:5000/lang/latin/corpus 222 | api.add_resource(Corpus, '/lang/<string:lang>/corpus') 223 | 224 | 225 | # http://localhost:5000/lang 226 | api.add_resource(Lang, '/lang') 227 | 228 | 229 | # http://localhost:5000/lang/greek/corpus/perseus/author/achilles_tatius/text/leucippe_et_clitophon?chunk1=1&chunk2=1&chunk3=1 230 | # http://localhost:5000/lang/greek/corpus/perseus/author/homer/text/odyssey 231 | # http://localhost:5000/lang/greek/corpus/perseus/author/homer/text/odyssey?chunk1=1&chunk2=1 232 | # http://localhost:5000/lang/greek/corpus/perseus/author/homer/text/odyssey?translation=english 233 | # http://localhost:5000/lang/greek/corpus/perseus/author/homer/text/odyssey?commentary=all 234 | # http://localhost:5000/lang/greek/corpus/perseus/author/homer/text/odyssey?commentary=E. T. Merril 235 | api.add_resource(Text, '/lang/<string:lang>/corpus/<string:corpus>/author/<string:author>/text/<string:work>') 236 | #api.add_resource(Text, '/lang//corpus//author//text//') 237 | 238 | # CLTK core pos 239 | api.add_resource(POSTagger, '/core/pos', endpoint='pos') 240 | 241 | # CLTK core stemmer 242 | api.add_resource(Stem, '/core/stem/') 243 | 244 | # CLTK definitions 245 | # http://localhost:5000/lang/latin/define/abante 246 | api.add_resource(Definition, '/lang/<string:lang>/define/<string:word>') 247 | 248 | # simple examples 249 | api.add_resource(TodoSimple, '/todo/<string:todo_id>') 250 | api.add_resource(HelloWorld, '/hello') 251 | 252 | if __name__ == '__main__': 253 | #app.run(debug=True) 254 | app.run(host='0.0.0.0', debug=True) 255 | -------------------------------------------------------------------------------- /api_old.py: -------------------------------------------------------------------------------- 1 | """Main API file for backend CLTK webapp. 2 | 3 | The Texts class parses files to get their metadata. This is super kludgy and needs to be redone somehow. 
4 | """ 5 | 6 | import os 7 | from flask import Flask 8 | from flask import request # for getting query string 9 | from flask import json, jsonify 10 | # eg: request.args.get('user') will get '?user=some-value' 11 | from flask_restful import Resource, Api 12 | from flask.ext.pymongo import PyMongo 13 | from ingest.resources import Ingest 14 | from api.resources import Query 15 | from util.jsonp import jsonp 16 | 17 | app = Flask(__name__) 18 | mongo = PyMongo(app) 19 | api = Api(app) 20 | 21 | 22 | class Authors(Resource): 23 | 24 | @jsonp 25 | def get(self, lang, corpus_name): 26 | # assert lang in ['greek', 'latin'] 27 | text_path = os.path.expanduser('~/cltk_data/' + lang + '/text/' + lang + '_text_' + corpus_name) 28 | 29 | dir_contents = os.listdir(text_path) 30 | 31 | # Sulpicia dir has no Latin texts 32 | # Isocrates dir has no Greek texts 33 | remove_files = ['README.md', '.git', 'LICENSE.md', 'perseus_compiler.py', '.DS_Store', 'Sulpicia' , 'Isocrates'] 34 | 35 | dir_contents = [f for f in dir_contents if f not in remove_files] 36 | 37 | return {'authors': sorted(dir_contents) } 38 | 39 | 40 | class Texts(Resource): 41 | 42 | @jsonp 43 | def get(self, lang, corpus_name, author_name): 44 | text_path = os.path.expanduser( 45 | '~/cltk_data/' + lang + '/text/' + lang + '_text_' + corpus_name + '/' + author_name.casefold() + '/opensource') # casefold() prob not nec 46 | dir_contents = os.listdir(text_path) 47 | ending = '' 48 | if corpus_name == 'perseus' and lang == 'greek': 49 | ending = '_gk.xml.json' 50 | if author_name.casefold() == 'aratus': 51 | ending = '.xml.json' 52 | elif author_name.casefold() == 'jebborators': 53 | ending = '.xml.json' 54 | elif author_name.casefold() == 'lucretius': 55 | ending = '_lat.xml.json' 56 | elif author_name.casefold() == 'lycophron': 57 | ending = '.xml.json' 58 | elif author_name.casefold() == 'nonnos': 59 | ending = '.xml.json' 60 | elif author_name.casefold() == 'tryphiodorus': 61 | ending = '.xml.json' 62 | elif author_name.casefold() == 'callimachus': 63 | ending = '.xml.json' 64 | elif corpus_name == 'perseus' and lang == 'latin': 65 | ending = '_lat.xml.json' 66 | # weird exceptions 67 | if author_name.casefold() == 'histaugust': 68 | ending = '.xml.json' 69 | elif author_name.casefold() == 'quintus': 70 | ending = '.xml.json' 71 | dir_contents = [f for f in dir_contents if f.endswith(ending)] 72 | dir_contents = [f.casefold() for f in dir_contents] # this probably isn't nec 73 | return json.dumps( {'texts': sorted(dir_contents)} ) 74 | 75 | 76 | class Text(Resource): 77 | 78 | @jsonp 79 | def get(self, lang, corpus_name, author_name, fname): 80 | 81 | text_path = os.path.expanduser( 82 | '~/cltk_data/') + lang + '/text/' + lang + '_text_' + corpus_name + '/' + author_name + '/opensource/' + fname 83 | ending = '' 84 | if corpus_name == 'perseus' and lang == 'greek': 85 | ending = '_gk.xml.json' 86 | if author_name.casefold() == 'aratus': 87 | ending = '.xml.json' 88 | elif author_name.casefold() == 'jebborators': 89 | ending = '.xml.json' 90 | elif author_name.casefold() == 'lucretius': 91 | ending = '_lat.xml.json' 92 | elif author_name.casefold() == 'lycophron': 93 | ending = '.xml.json' 94 | elif author_name.casefold() == 'nonnos': 95 | ending = '.xml.json' 96 | elif author_name.casefold() == 'tryphiodorus': 97 | ending = '.xml.json' 98 | elif author_name.casefold() == 'callimachus': 99 | if fname.startswith('call_0'): 100 | ending = '.xml.json' 101 | elif corpus_name == 'perseus' and lang == 'latin': 102 | ending = '_lat.xml.json' 
103 | # weird exceptions 104 | if author_name.casefold() == 'histaugust' or author_name.casefold() == 'quintus': 105 | ending = '.xml.json' 106 | 107 | text_path += ending 108 | with open(text_path, "r") as f: # TODO: use json.loads() for all this 109 | file_string = f.read() 110 | file_json = json.loads(file_string) 111 | 112 | # Some files are odd 113 | if author_name.casefold() in ['quintus', 'aratus', 'callimachus', 'colluthus', 'lycophron', 'nonnos', 'tryphiodorus']: 114 | encoding_desc = file_json['TEI.2']['teiHeader']['encodingDesc'] 115 | if type(encoding_desc) is list: 116 | for desc in encoding_desc: 117 | try: 118 | quintus = True 119 | refs_decls = desc.get('refsDecl') 120 | break 121 | except Exception: 122 | pass 123 | # everyone else 124 | else: 125 | refs_decls = file_json['TEI.2']['teiHeader']['encodingDesc']['refsDecl'] 126 | 127 | section_types = [] # list of lists 128 | if type(refs_decls) is list: 129 | for refs_decl in refs_decls: 130 | if refs_decl.get('@doctype') == 'TEI.2' and 'state' in refs_decl: 131 | states = refs_decl['state'] 132 | if type(states) is list: 133 | units = [] 134 | for state in states: 135 | unit = state['@unit'] 136 | units.append(unit) 137 | section_types.append(units) 138 | elif type(states) is dict: 139 | state = states 140 | unit = state['@unit'] 141 | section_types.append([unit]) 142 | elif 'state' in refs_decl: 143 | states = refs_decl['state'] 144 | if type(states) is list: 145 | units = [] 146 | for state in states: 147 | unit = state['@unit'] 148 | units.append(unit) 149 | section_types.append(units) 150 | 151 | elif type(refs_decls) is dict: 152 | refs_decl = refs_decls 153 | if refs_decl.get('@doctype') == 'TEI.2' and 'state' in refs_decl: 154 | states = refs_decl['state'] 155 | if type(states) is list: 156 | units = [] 157 | for state in states: 158 | unit = state['@unit'] 159 | units.append(unit) 160 | section_types = [units] 161 | elif type(states) is dict: 162 | state = refs_decl['state'] 163 | unit = state['@unit'] 164 | section_types.append([unit]) 165 | elif refs_decl.get('@doctype') == 'TEI.2' and 'step' in refs_decl: 166 | steps = refs_decl['step'] 167 | if type(steps) is list: 168 | units = [] 169 | for state in steps: 170 | unit = state['@refunit'] 171 | units.append(unit) 172 | section_types = [units] 173 | elif type(steps) is dict: 174 | step = refs_decl['step'] 175 | unit = step['@refunit'] 176 | section_types.append([unit]) 177 | elif refs_decl.get('@doctype') != 'TEI.2' and 'step' in refs_decl: 178 | print('*' * 40) 179 | steps = refs_decl['step'] 180 | if type(steps) is list: 181 | units = [] 182 | for state in steps: 183 | unit = state['@refunit'] 184 | units.append(unit) 185 | section_types = [units] 186 | elif type(steps) is dict: 187 | step = refs_decl['step'] 188 | unit = step['@refunit'] 189 | section_types.append([unit]) 190 | 191 | # Some entries missing `{'@doctype': 'TEI.2'}` (eg, Pliny's `pliny.min.letters`) 192 | elif refs_decl.get('@doctype') != 'TEI.2' and 'state' in refs_decl: 193 | states = refs_decl['state'] 194 | if type(states) is list: 195 | units = [] 196 | for state in states: 197 | unit = state['@unit'] 198 | units.append(unit) 199 | section_types = [units] 200 | elif type(states) is dict: 201 | state = refs_decl['state'] 202 | unit = state['@unit'] 203 | section_types.append([unit]) 204 | 205 | 206 | # Parse query strings 207 | q_section_1 = request.args.get('section_1') 208 | q_section_2 = request.args.get('section_2') 209 | q_section_3 = request.args.get('section_3') 210 | q_section_4 = 
request.args.get('section_4') 211 | q_section_5 = request.args.get('section_5') 212 | 213 | # If no query string, return text object 214 | if not q_section_1: 215 | return {'refs_decl': refs_decls, 216 | 'filepath': text_path, 217 | 'section_types': section_types, 218 | 'text': file_json['TEI.2']['text'] 219 | } 220 | 221 | # Parse text according to query string 222 | section_1_object = file_json['TEI.2']['text']['body']['div1'] 223 | 224 | if type(section_1_object) is list: 225 | for section_1 in section_1_object: 226 | try: 227 | section_1_number = section_1['@n'] # str 228 | except KeyError: 229 | # http://localhost:5000/lang/greek/corpus/perseus/author/Aeschylus/text/aesch.ag?section_1=1 230 | # Something funny. Redefine section_1 to something embedded more deeply 231 | #! This pathway is broken, and fixing it would make this more convoluted than it already is. 232 | section_1 = section_1['div2']['sp'] 233 | 234 | if section_1_number == q_section_1: 235 | section_1_object = section_1['l'] # list 236 | 237 | # cleanup lines 238 | return_section_1_object = [] 239 | for line in section_1_object: 240 | if type(line) is dict: 241 | line = line['#text'] 242 | return_section_1_object.append(line) 243 | 244 | if not q_section_2: 245 | # http://localhost:5000/lang/latin/corpus/perseus/author/Vergil/text/verg.a?section_1=12 246 | # http://localhost:5000/lang/greek/corpus/perseus/author/Homer/text/hom.od?section_1=1 247 | return {'refs_decl': refs_decls, 248 | 'filepath': text_path, 249 | 'section_types': section_types, 250 | 'text': return_section_1_object 251 | } 252 | 253 | for counter, section_2_item in enumerate(section_1_object): 254 | if type(section_2_item) is dict: 255 | section_2_item = section_2_item['#text'] 256 | if counter + 1 == int(q_section_2): 257 | returned_text = section_2_item 258 | 259 | if not q_section_3: 260 | return {'refs_decl': refs_decls, 261 | 'filepath': text_path, 262 | 'section_types': section_types, 263 | 'text': returned_text, 264 | } 265 | 266 | elif type(section_1_object) is dict: 267 | # http://localhost:5000/lang/greek/corpus/perseus/author/Hesiod/text/hes.th?section_1=1 268 | section_1_type = section_1_object['@type'] 269 | section_1_number = section_1_object['@n'] 270 | section_1_list = section_1_object['l'] 271 | 272 | # cleanup lines 273 | return_section_1_object = [] 274 | for line in section_1_list: 275 | if type(line) is dict: 276 | line = line['#text'] 277 | return_section_1_object.append(line) 278 | 279 | for counter, section_1_item in enumerate(section_1_list): 280 | if type(section_1_item) is dict: 281 | section_1_item = section_1_item['#text'] 282 | if counter + 1 == int(q_section_1): 283 | returned_text = section_1_item 284 | 285 | return {'refs_decl': refs_decls, 286 | 'filepath': text_path, 287 | 'section_types': section_types, 288 | 'text': returned_text 289 | } 290 | 291 | 292 | # http://localhost:5000/lang/greek/corpus/perseus/authors 293 | api.add_resource(Authors, '/lang/<string:lang>/corpus/<string:corpus_name>/authors') 294 | 295 | # http://localhost:5000/lang/greek/corpus/perseus/author/Homer/texts 296 | api.add_resource(Texts, '/lang/<string:lang>/corpus/<string:corpus_name>/author/<string:author_name>/texts') 297 | 298 | # http://localhost:5000/lang/latin/corpus/perseus/author/Vergil/text/verg.a 299 | # http://localhost:5000/lang/greek/corpus/perseus/author/Homer/text/hom.od 300 | 301 | # http://localhost:5000/lang/latin/corpus/perseus/author/Vergil/text/verg.a?section_1=1&section_2=1 302 | # http://localhost:5000/lang/greek/corpus/perseus/author/Homer/text/hom.od?section_1=1&section_2=1 303 | api.add_resource(Text, 304 |
'/lang/<string:lang>/corpus/<string:corpus_name>/author/<string:author_name>/text/<string:fname>') 305 | 306 | # Trigger new document ingest 307 | api.add_resource(Ingest, '/ingest') 308 | 309 | # Feed GET params as a query to the DB 310 | api.add_resource(Query, '/query') 311 | 312 | if __name__ == '__main__': 313 | app.run(debug=True) 314 | #app.run(host='0.0.0.0') 315 | -------------------------------------------------------------------------------- /convert_tei_json_to_simple_json/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/convert_tei_json_to_simple_json/__init__.py -------------------------------------------------------------------------------- /convert_tei_json_to_simple_json/book_chapter.py: -------------------------------------------------------------------------------- 1 | """Example: Ammianus""" 2 | 3 | import json 4 | import os 5 | 6 | from book_line import file_to_dict 7 | from book_line import dict_to_file 8 | 9 | 10 | 11 | def book_chapter_convert(fp): 12 | """Take filepath, try to make new file. 13 | {'author': author_name, 14 | 'text': [ 15 | {'book': 1, 16 | 'chapters': 17 | [{'chapter': 1, 'text': real_text}, …] 18 | } 19 | ] 20 | } 21 | """ 22 | final_file_dict = {} 23 | books_list = [] 24 | file_dict = file_to_dict(fp) 25 | 26 | tei = file_dict['TEI.2'] # dict 27 | text = tei['text'] # dict 28 | header = tei['teiHeader'] # dict 29 | 30 | # Get work's title, add to final dict 31 | title_list = header['fileDesc']['titleStmt']['title'] 32 | for obj in title_list: 33 | if type(obj) is str: 34 | title_name = obj 35 | final_file_dict['title'] = title_name 36 | break 37 | try: 38 | if obj['@type'] == 'work': 39 | title_name = obj['#text'] 40 | final_file_dict['title'] = title_name 41 | break 42 | except KeyError: 43 | raise 44 | 45 | encoding = header['encodingDesc'] # dict 46 | body = text['body'] # dict 47 | div1 = body['div1'] # list of dict 48 | #print(len(div1)) # eg 12 for Aeneid, 24 for Iliad 49 | for div1_dict in div1: # !Book loop 50 | #print(div1_dict.keys()) 51 | book_dict = {} 52 | div1_dict_div2 = div1_dict['div2'] # list of dict; where the text is 53 | div1_dict_type = div1_dict['@type'] # book 54 | try: 55 | div1_dict_pb = div1_dict['pb'] # dict or list of dict: [{'@id': 'v2.p.16'}, {'@id': 'v2.p.30'}, {'@id': 'v2.p.68'}] 56 | except KeyError: 57 | div1_dict_pb = None 58 | div1_dict_head = div1_dict['head'] # str, eg: 'Liber XVII' 59 | div1_dict_number = div1_dict['@n'] # str, eg: 17, 'val1' 60 | book_number = div1_dict_number 61 | #print('Book:', book_number) 62 | book_dict['book'] = book_number 63 | 64 | chapters_list = [] # a list of {chapter_number: chapter_text} 65 | for div2 in div1_dict_div2: # !Chapter Loop 66 | 67 | chapter_dict = {} 68 | chapter_text = [] 69 | #print(type(div2)) # dict 70 | div2_type = div2['@type'] # str: chapter 71 | div2_number = div2['@n'] # 6, 12, 4 72 | chapter_number = div2_number 73 | #print('Chapter:', chapter_number) 74 | try: 75 | div2_argument = div2['argument'] # dict: {'p': 'Quo patre natus sit, et quas res princeps gesserit.'} (text in here) 76 | div2_text_section = div2_argument['p'] # ! 
Summary text here, not useful (I think) 77 | #print('div2_text_section', div2_text_section) 78 | if type(div2_text_section) is dict: 79 | #print(div2_text_section.keys()) # ['note', '#text'] or ['corr', '#text'] 80 | div2_text_section_note = div2_text_section['note'] # summaries 81 | div2_text_section_text = div2_text_section['text'] # empty 82 | div2_text_section_corr = div2_text_section['corr'] # empty 83 | elif type(div2_text_section) is str: 84 | pass 85 | #print(div2_text_section) # ! real text here! (I think) 86 | except KeyError: 87 | div2_argument = None 88 | div2_ps = div2['p'] # list of dicts or dict (text in here) 89 | if type(div2_ps) is dict: 90 | #print(div2_ps.keys()) # ['note', 'quote', 'milestone', 'pb', '#text'] 91 | try: 92 | div2_ps_note = div2_ps['note'] # [{'hi': {'#text': 'et ad molliora,', '@rend': 'italics'}, '#text': 'added in G; V omits.'}, … ] 93 | except KeyError: 94 | div2_ps_note = None 95 | try: 96 | div2_ps_quote = div2_ps['quote'] # ['Nemo', 'vereatur: habeo firmiter quod tenebam.', {'@rend': 'blockquote', 'l': [{'foreign': {'@lang': 'greek', '#text': 'Zeu\\s o(/tan ei)s platu\\ te/rma mo/lh| klutou= u(droxo/oio,'}}, … ] 97 | except KeyError: 98 | div2_ps_quote = None 99 | div2_ps_milestone = div2_ps['milestone'] 100 | try: 101 | div2_ps_pb = div2_ps['pb'] # [{'@id': 'v3.p.316'}, {'@id': 'v3.p.318'}] or {'@id': 'v2.p.190'} 102 | except KeyError: 103 | div2_ps_pb = None 104 | div2_ps_text = div2_ps['#text'] # ! actual text! 105 | real_text = div2_ps_text 106 | chapter_text.append(real_text) 107 | #print('div2_ps_text', div2_ps_text) 108 | elif type(div2_ps) is list: 109 | for div2_ps_item in div2_ps: # all dicts 110 | #print(div2_ps_item.keys()) # ['pb', 'milestone', 'note', '#text', 'quote'] 111 | #div2_ps_item_pb = div2_ps_item['pb'] 112 | #div2_ps_item_milestone = div2_ps_item['milestone'] 113 | #div2_ps_item_note = div2_ps_item['note'] 114 | try: 115 | div2_ps_item_text = div2_ps_item['#text'] # ! 
real text here 116 | real_text = div2_ps_item_text 117 | chapter_text.append(real_text) 118 | #print(div2_ps_item_text) 119 | except KeyError: 120 | div2_ps_item_text = None 121 | #div2_ps_item_quote = div2_ps_item['quote'] 122 | chapter_text_str = ' '.join(chapter_text) 123 | chapter_dict[chapter_number] = chapter_text_str 124 | chapters_list.append(chapter_dict) 125 | book_dict['book'] = book_number 126 | book_dict['chapters']= chapters_list 127 | books_list.append(book_dict) 128 | 129 | # Get author name from 'latin_key.json' 130 | key_fp = os.path.expanduser('~/cltk_data/latin/text/latin_text_perseus/latin_key.json') 131 | with open(key_fp) as fo: 132 | meta_authors = json.load(fo) 133 | for meta_author in meta_authors: 134 | orig_filename = meta_author['title'] 135 | if orig_filename == os.path.split(fp)[1]: 136 | author_name = meta_author['name'] 137 | #print(author_name) 138 | structure_meta = meta_author['encoding']['state'] 139 | #book_dict['structure_meta'] = structure_meta 140 | #final_file_dict['structure_meta'] = structure_meta 141 | #book_dict['author_name'] = author_name 142 | final_file_dict['author'] = author_name 143 | break 144 | 145 | final_file_dict['text'] = books_list 146 | 147 | 148 | author_dir, author_file = os.path.split(fp)[0], os.path.split(fp)[1] 149 | author_file = author_file.replace('xml.', '') 150 | opensource_dir = os.path.split(author_dir)[0] 151 | perseus_root = os.path.split(opensource_dir)[0] 152 | # next write new perseus dir and put in there; check if present 153 | cltk_perseus_dir = 'cltk_formatted' 154 | cltk_perseus_path = os.path.expanduser(os.path.join(perseus_root, cltk_perseus_dir, author_name.casefold() + '_' + author_file)) 155 | print('Wrote new file to: "{}".'.format(cltk_perseus_path)) 156 | try: 157 | dict_to_file(final_file_dict, cltk_perseus_path) 158 | except FileNotFoundError: 159 | _dir = os.path.split(cltk_perseus_path)[0] 160 | os.mkdir(_dir) 161 | dict_to_file(final_file_dict, cltk_perseus_path) 162 | 163 | 164 | if __name__ == "__main__": 165 | fp = '/Users/kyle/cltk_data/latin/text/latin_text_perseus/Ammianus/opensource/amm_lat.xml.json' 166 | book_chapter_convert(fp) 167 | 168 | -------------------------------------------------------------------------------- /convert_tei_json_to_simple_json/book_line.py: -------------------------------------------------------------------------------- 1 | """Take the JSON conversion of the original Perseus XML, then convert it into 2 | easier-to-parse JSON. 3 | 4 | TODO: Perhaps get full author name and work name out of XML. 5 | """ 6 | 7 | import json 8 | import os 9 | import sys 10 | 11 | 12 | def file_to_dict(fp): 13 | """Open a json file and return Python dict.""" 14 | with open(os.path.expanduser(fp)) as fo: 15 | return json.load(fo) 16 | 17 | 18 | def dict_to_file(obj, fp): 19 | """Write dict to json file.""" 20 | with open(os.path.expanduser(fp), 'w') as fo: 21 | json.dump(obj, fo) 22 | 23 | 24 | def book_line_convert(fp): 25 | """Take filepath, try to make new file. 
26 | {'author': 'Vergil', 27 | 'text': [ 28 | {'book': 1, 29 | 'line': ['aaaaa', 'bbbbb', 'cccc'] 30 | } 31 | ] 32 | } 33 | """ 34 | final_file_dict = {} 35 | text_books_list = [] 36 | file_dict = file_to_dict(fp) 37 | 38 | tei = file_dict['TEI.2'] # dict 39 | text = tei['text'] # dict 40 | header = tei['teiHeader'] # dict 41 | 42 | # Get work's title, add to final dict 43 | title_list = header['fileDesc']['titleStmt']['title'] 44 | for obj in title_list: 45 | if type(obj) is str: 46 | title_name = obj 47 | final_file_dict['title'] = title_name 48 | break 49 | try: 50 | if obj['@type'] == 'work': 51 | title_name = obj['#text'] 52 | final_file_dict['title'] = title_name 53 | break 54 | except KeyError: 55 | raise 56 | 57 | encoding = header['encodingDesc'] # dict 58 | body = text['body'] # dict 59 | div1 = body['div1'] # list of dict 60 | #print(len(div1)) # eg 12 for Aeneid, 24 for Iliad 61 | for div1_dict in div1: 62 | book_object = {} 63 | text_lines = [] 64 | milestone = div1_dict['milestone'] # list, not useful 65 | _type = div1_dict['@type'] # str, 'Book' 66 | book_number = int(div1_dict['@n']) # str cast as int 67 | div1_dict_list = div1_dict['l'] # list of str or dict 68 | for counter, div1_dict_list_object in enumerate(div1_dict_list, start=1): 69 | if type(div1_dict_list_object) is dict: 70 | try: 71 | div1_dict_list_object_number = div1_dict_list_object['@n'] # str 72 | except KeyError: 73 | div1_dict_list_object_number = None 74 | try: 75 | div1_dict_list_object_milestone = div1_dict_list_object['milestone'] # dict, eg Aen and Il: {'@ed': 'P', '@unit': 'para'} 76 | except KeyError: 77 | div1_dict_list_object_milestone = None 78 | div1_dict_list_object_text = div1_dict_list_object['#text'] # the actual text 79 | div1_dict_list_object = div1_dict_list_object_text # str 80 | else: 81 | pass 82 | #print(book_number, counter, div1_dict_list_object) 83 | text_lines.append(div1_dict_list_object) 84 | book_object['text'] = text_lines 85 | book_object['book'] = book_number 86 | 87 | # Get author name from 'latin_key.json' 88 | key_fp = os.path.expanduser('~/cltk_data/latin/text/latin_text_perseus/latin_key.json') 89 | with open(key_fp) as fo: 90 | meta_authors = json.load(fo) 91 | for meta_author in meta_authors: 92 | orig_filename = meta_author['title'] 93 | if orig_filename == os.path.split(fp)[1]: 94 | author_name = meta_author['name'] 95 | #structure_meta = meta_author['encoding']['state'] 96 | #book_object['structure_meta'] = structure_meta 97 | #book_object['author_name'] = author_name 98 | final_file_dict['author'] = author_name 99 | break 100 | 101 | text_books_list.append(book_object) 102 | #print(len(text_books_list)) # eg 12 for Aen, 4 for Georgics, 24 for Od 103 | 104 | final_file_dict['text'] = text_books_list 105 | 106 | author_dir, author_file = os.path.split(fp)[0], os.path.split(fp)[1] 107 | author_file = author_file.replace('xml.', '') 108 | opensource_dir = os.path.split(author_dir)[0] 109 | perseus_root = os.path.split(opensource_dir)[0] 110 | # next write new perseus dir and put in there; check if present 111 | cltk_perseus_dir = 'cltk_formatted' 112 | cltk_perseus_path = os.path.expanduser(os.path.join(perseus_root, cltk_perseus_dir, author_file)) 113 | print('Wrote new file to: "{}".'.format(cltk_perseus_path)) 114 | try: 115 | dict_to_file(final_file_dict, cltk_perseus_path) 116 | except FileNotFoundError: 117 | _dir = os.path.split(cltk_perseus_path)[0] 118 | os.mkdir(_dir) 119 | dict_to_file(final_file_dict, cltk_perseus_path) 120 | 121 | if __name__ == 
"__main__": 122 | 123 | examples_files = ['~/cltk_data/latin/text/latin_text_perseus/Vergil/opensource/verg.a_lat.xml.json', 124 | #'~/cltk_data/latin/text/latin_text_perseus/Vergil/opensource/verg.ecl_lat.xml.json', # KeyError: 'milestone' 125 | '~/cltk_data/latin/text/latin_text_perseus/Vergil/opensource/verg.g_lat.xml.json', 126 | '~/cltk_data/latin/text/latin_text_perseus/Ovid/opensource/ovid.met_lat.xml.json', 127 | #'~/cltk_data/latin/text/latin_text_perseus/Ovid/opensource/ovid.am_lat.xml.json', # KeyError: 'body' 128 | #'~/cltk_data/latin/text/latin_text_perseus/Ovid/opensource/ovid.fast_lat.xml.json', # KeyError: 'milestone' 129 | #'~/cltk_data/latin/text/latin_text_perseus/Ovid/opensource/ovid.ibis_lat.xml.json', # TypeError: string indices must be integers 130 | #'~/cltk_data/latin/text/latin_text_perseus/Ovid/opensource/ovid.pont_lat.xml.json', # KeyError: 'milestone' 131 | #'~/cltk_data/latin/text/latin_text_perseus/Ovid/opensource/ovid.tr_lat.xml.json', # KeyError: 'milestone' 132 | '~/cltk_data/greek/text/greek_text_perseus/Homer/opensource/hom.il_gk.xml.json', 133 | '~/cltk_data/greek/text/greek_text_perseus/Homer/opensource/hom.od_gk.xml.json' 134 | ] 135 | 136 | for fp in examples_files: 137 | book_line_convert(fp) -------------------------------------------------------------------------------- /convert_tei_json_to_simple_json/convert_all_perseus_xml.py: -------------------------------------------------------------------------------- 1 | """Look for all Perseus files, then try to convert with available converters. 2 | If error rises, then try another converter. 3 | 4 | Outputs to: '~/cltk_data/greek/text/greek_text_perseus/cltk_formatted' and 5 | '~/cltk_data/latin/text/latin_text_perseus/cltk_formatted'. 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from book_line import book_line_convert 12 | from book_chapter import book_chapter_convert 13 | 14 | 15 | def os_walk(fp, ending='_lat.xml.json'): 16 | """Recursively find files in path.""" 17 | for dir_path, dir_names, files in os.walk(fp): # pylint: disable=W0612 18 | for name in files: 19 | if name.endswith(ending): 20 | yield os.path.join(dir_path, name) 21 | 22 | 23 | if __name__ == "__main__": 24 | perseus_dirs = ['~/cltk_data/latin/text/latin_text_perseus/', '~/cltk_data/greek/text/greek_text_perseus/'] 25 | #perseus_dirs = ['~/cltk_data/latin/text/latin_text_perseus/'] 26 | xml_converter = [book_line_convert, book_chapter_convert] 27 | success_count = 0 28 | fail_count = 0 29 | for perseus_dir in perseus_dirs: 30 | for fp in os_walk(os.path.expanduser(perseus_dir)): 31 | for converter in xml_converter: 32 | try: 33 | converter(fp) 34 | success_count += 1 35 | break 36 | except: 37 | pass 38 | fail_count += 1 39 | print('Sucess:', success_count) 40 | print('Fail:', fail_count) -------------------------------------------------------------------------------- /gunicorn_start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME="CLTK API" 4 | SOCKFILE=/home/cltk/cltk_api/binding.sock 5 | NUM_WORKERS=4 6 | 7 | echo "Starting $NAME" 8 | 9 | #activate virtual environment 10 | source /home/cltk/venv/bin/activate 11 | cd /home/cltk/cltk_api 12 | 13 | # Start gunicorn server 14 | exec gunicorn api_json:app -b 127.0.0.1:5000 \ 15 | --workers $NUM_WORKERS \ 16 | --bind=unix:$SOCKFILE 17 | -------------------------------------------------------------------------------- /metadata/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/__init__.py -------------------------------------------------------------------------------- /metadata/commentary/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/commentary/__init__.py -------------------------------------------------------------------------------- /metadata/criticism/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/criticism/__init__.py -------------------------------------------------------------------------------- /metadata/criticism/criticism.py: -------------------------------------------------------------------------------- 1 | """Ingest citations for criticism""" 2 | import time 3 | import string 4 | import random 5 | import urllib.request 6 | from bs4 import BeautifulSoup 7 | from cltk_api.util.db import mongo 8 | 9 | class Criticism: 10 | 11 | def __init__(self, dbname): 12 | """Setup db connection to mongo""" 13 | self.dbname = dbname 14 | self.punctuation_transtable = {ord(c): None for c in string.punctuation} 15 | 16 | return 17 | 18 | def ingest(self, line): 19 | """Ingest citation data to the database and mark done for later processing""" 20 | 21 | try: 22 | 23 | cites = self.search_jstor(line) 24 | for cite in cites: 25 | cite['line'] = line 26 | self.save(cite) 27 | 28 | except: 29 | return False 30 | 31 | 32 | return True 33 | 34 | def search_jstor(self, line): 35 | """Search for line via JSTOR API""" 36 | cites = [] 37 | pages = [] 38 | 39 | # Make URL to query 40 | sline = line['line']['text'].translate(self.punctuation_transtable).lower() 41 | sline = sline.replace(" ","+").lower() 42 | sline = sline.replace("—", "") 43 | 44 | url = "http://dfr.jstor.org/?view=text&qk0=ft&qw0=1.0&qv0=%22" + sline + "%22&qf0=any&sk=ca" 45 | 46 | # Get the page 47 | res = urllib.request.urlopen(url) 48 | html = res.read() 49 | soup = BeautifulSoup(html) 50 | pagination = soup.select(".pagination a") 51 | cites.extend(self._parse_jstor_page(soup)) 52 | 53 | # Get the paginated results 54 | for elem in pagination: 55 | #If elem doesn't have classes "prevnextlink" and "currentpage" 56 | try: 57 | if "prevnextlink" not in elem['class'] and "currentpage" not in elem['class']: 58 | pages.append("http://dfr.jstor.org/" + elem['href']) 59 | except: 60 | try: 61 | pages.append("http://dfr.jstor.org/" + elem['href']) 62 | except: 63 | pass 64 | 65 | time.sleep(random.randint( 2, 5 )) 66 | for i, page_link in enumerate(pages): 67 | print(" -- querying page", i + 2) 68 | res = urllib.request.urlopen(page_link) 69 | html = res.read() 70 | soup = BeautifulSoup(html) 71 | cites.extend(self._parse_jstor_page(soup)) 72 | time.sleep(random.randint( 2, 5 )) 73 | 74 | return cites 75 | 76 | def _parse_jstor_page(self, soup): 77 | c = [] 78 | res = soup.select("ul.results_item") 79 | for el in res: 80 | c.append({ 81 | 'title' : el.select(".title")[0].text, 82 | 'author' : el.select(".author")[0].text, 83 | 'cite' : el.select('li')[2].text 84 | }) 85 | 86 | return c 87 | 88 | def save(self, cite): 89 | """Save the citation to the db for processing""" 90 | db = mongo(self.dbname) 91 | db.criticism.insert(cite) 92 | return 93 | 
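# Usage sketch (illustrative, not part of the module; the 'cltk_api' database
# name is hypothetical). ingest() expects the nested shape that search_jstor()
# reads above, i.e. {'line': {'text': ...}}:
#
# critic = Criticism('cltk_api')
# critic.ingest({'line': {'text': 'Arma virumque cano, Troiae qui primus ab oris'}})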
-------------------------------------------------------------------------------- /metadata/definition/_init_.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/definition/_init_.py -------------------------------------------------------------------------------- /metadata/definition/views.py: -------------------------------------------------------------------------------- 1 | from flask_restful import Resource 2 | import json 3 | import os 4 | 5 | # File name suffix for the json files 6 | DATA_FILE_SUFFIX = "-analyses.json" 7 | 8 | def get_cltk_treebank_dir(lang, corpus='perseus'): 9 | """Take a language (and corpus), return the absolute path to its treebank directory.""" 10 | cltk_home = os.path.expanduser('~/cltk_data') 11 | treebank_path = lang.casefold() + '_treebank_' + corpus 12 | treebank_dir = os.path.join(cltk_home, lang.casefold(), 'treebank', treebank_path, treebank_path) 13 | return treebank_dir 14 | 15 | class Definition(Resource): 16 | 17 | ''' 18 | GET /lang/<lang>/define/<word> 19 | Return the available definitions for a word in the given language 20 | ''' 21 | def get(self, lang, word): 22 | # File name would be something like "latin-analyses.json" 23 | filename = lang + DATA_FILE_SUFFIX 24 | _dir = get_cltk_treebank_dir(lang) 25 | file = os.path.join(_dir, filename) 26 | with open(file, "r") as infile: 27 | word_list = json.load(infile) 28 | try: 29 | return word_list[word] 30 | except KeyError: 31 | return [] 32 | -------------------------------------------------------------------------------- /metadata/entities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/entities/__init__.py -------------------------------------------------------------------------------- /metadata/entities/dbpedia.py: -------------------------------------------------------------------------------- 1 | """ 2 | Interface with DBpedia 3 | """ 4 | 5 | class DBpedia: 6 | 7 | def __init__(self): 8 | 9 | 10 | return -------------------------------------------------------------------------------- /metadata/entities/entity.py: -------------------------------------------------------------------------------- 1 | """ 2 | A class for working with Entities retrieved from the NER core functionality 3 | of the CLTK 4 | """ 5 | 6 | import string 7 | import os 8 | import json 9 | import re 10 | import random 11 | from time import sleep 12 | from urllib.request import urlopen, urlretrieve 13 | from urllib import error 14 | from bs4 import BeautifulSoup 15 | from cltk_api.metadata.entities.wikipedia import Wikipedia 16 | 17 | 18 | class Entity: 19 | 20 | def __init__(self, name_english, name_original): 21 | 22 | self.name_english = name_english 23 | self.name_original = name_original 24 | self.punctuation_transtable = {ord(c): None for c in string.punctuation} 25 | 26 | # External resources 27 | self.wikipedia_entity = {} 28 | 29 | return 30 | 31 | def fetch_wikipedia(self): 32 | """ 33 | Fetch metadata, images, and summaries about an entity from Wikipedia 34 | """ 35 | self.wikipedia_entity = Wikipedia.query(self.name_english) 36 | 37 | return 38 | -------------------------------------------------------------------------------- /metadata/entities/pleiades.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Interface with Pleiades 4 | """ 5 | 6 | class 
Pleiades: 7 | 8 | def __init__(self): 9 | 10 | 11 | return 12 | -------------------------------------------------------------------------------- /metadata/entities/viaf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Interface with Viaf 3 | """ 4 | 5 | class VIAF: 6 | 7 | def __init__(self): 8 | 9 | 10 | return 11 | -------------------------------------------------------------------------------- /metadata/entities/wikipedia.py: -------------------------------------------------------------------------------- 1 | """ 2 | A class for interfacing with Wikipedia to associate metadata and images to 3 | the CLTK named entities 4 | 5 | Example usage: 6 | >>> from cltk_api.metadata.entities.wikipedia import Wikipedia 7 | >>> Wikipedia.query("Aeneas") 8 | { 9 | 'name': 'Aeneas', 10 | 'summary': 'In Greco-Roman mythology, Aeneas (/ᵻˈniːəs/; Greek: Αἰνείας, Aineías, possibly derived from Greek αἰνή meaning "praised") was a Trojan hero, the son of the prince Anchises and the goddess Venus (Aphrodite). His father was the second cousin of King Priam of Troy, making Aeneas Priam\'s second cousin, once removed. He is a character in Greek mythology and is mentioned in Homer\'s Iliad. Aeneas receives full treatment in Roman mythology, most extensively in Virgil\'s Aeneid where he is an ancestor of Romulus and Remus. He became the first true hero of Rome.' 11 | 'images': ['https://upload.wikimedia.org/wikipedia/commons/c/c0/Denier_frapp%C3%A9_sous_C%C3%A9sar_c%C3%A9l%C3%A9brant_le_mythe_d%27En%C3%A9e_et_d%27Anchise.jpg', 'https://upload.wikimedia.org/wikipedia/commons/a/aa/Capitoline_she-wolf_Musei_Capitolini_MC1181.jpg', 'https://upload.wikimedia.org/wikipedia/commons/9/9f/Aineias_Ankhises_Louvre_F118.jpg', 'https://upload.wikimedia.org/wikipedia/commons/3/3c/William_Blake_Richmond_-_Venus_and_Anchises_-_Google_Art_Project.jpg', 'https://upload.wikimedia.org/wikipedia/commons/4/4c/Wikisource-logo.svg', 'https://upload.wikimedia.org/wikipedia/commons/2/2f/B._PINELLI%2C_Enea_e_il_Tevere.jpg', 'https://upload.wikimedia.org/wikipedia/commons/7/76/Aeneas_and_Turnus.jpg', 'https://upload.wikimedia.org/wikipedia/commons/e/e0/Gu%C3%A9rin_%C3%89n%C3%A9e_racontant_%C3%A0_Didon_les_malheurs_de_la_ville_de_Troie_Louvre_5184.jpg', 'https://upload.wikimedia.org/wikipedia/commons/a/a8/Venus_as_Huntress_Appears_to_Aeneas.jpg', 'https://upload.wikimedia.org/wikipedia/en/4/4a/Commons-logo.svg', 'https://upload.wikimedia.org/wikipedia/commons/f/f7/Aeneas%27_Flight_from_Troy_by_Federico_Barocci.jpg'], 12 | } 13 | 14 | """ 15 | 16 | import wikipedia 17 | 18 | class Wikipedia: 19 | 20 | @staticmethod 21 | def query(entity_name): 22 | """ 23 | Retrieve data from Wikipedia for a given input entity name 24 | :return wikipedia_entity: dict 25 | """ 26 | 27 | # Return a wikipedia entity dictionary 28 | wikipedia_entity = {} 29 | 30 | # Get a list of results from wikipedia for the input entity name 31 | entity_results = wikipedia.search(entity_name, suggestion=True) 32 | 33 | # For the moment, just use the first wikipedia entry 34 | # Perhaps work in wikipedia.suggest in the future 35 | try: 36 | wikipedia_entity['name'] = entity_results[0] 37 | 38 | # Get the summary 39 | wikipedia_entity['summary'] = wikipedia.summary(wikipedia_entity['name']) 40 | 41 | # Get the page and images 42 | wikipedia_page = wikipedia.page(wikipedia_entity['name']) 43 | wikipedia_entity['images'] = wikipedia_page.images 44 | 45 | 46 | # Get anything else we might need... 
47 | 48 | except Exception: 49 | wikipedia_entity = {} 50 | finally: 51 | return wikipedia_entity 52 | 53 | -------------------------------------------------------------------------------- /metadata/media/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/media/__init__.py -------------------------------------------------------------------------------- /metadata/pos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/pos/__init__.py -------------------------------------------------------------------------------- /metadata/pos/constants.py: -------------------------------------------------------------------------------- 1 | # list of available POS tagging methods in CLTK 2 | POS_METHODS = {'greek': ['unigram', 'bigram', 'trigram', 'ngram123', 'tnt'], 3 | 'latin': ['unigram', 'bigram', 'trigram', 'ngram123', 'tnt']} 4 | DEFAULT_POS_METHOD = 'ngram123' 5 | -------------------------------------------------------------------------------- /metadata/pos/views.py: -------------------------------------------------------------------------------- 1 | from .constants import POS_METHODS, DEFAULT_POS_METHOD 2 | from cltk.tag.pos import POSTag 3 | from flask_restful import Resource, reqparse 4 | 5 | """ 6 | GET /core/pos View available POS tagging methods 7 | 8 | POST /core/pos Return POS tags for the given string, using the specified 9 | language and tagging method. 10 | Data: {'string': string to tag, 'lang': language, 11 | 'method': tagging method} 12 | """ 13 | class POSTagger(Resource): 14 | def get(self): 15 | return {'methods': POS_METHODS} 16 | 17 | def post(self): 18 | self.reqparse = reqparse.RequestParser() 19 | self.reqparse.add_argument('string', required=True) 20 | self.reqparse.add_argument('lang', required=True, choices=POS_METHODS.keys()) 21 | self.reqparse.add_argument('method', required=False, 22 | default=DEFAULT_POS_METHOD) 23 | 24 | args = self.reqparse.parse_args() 25 | string = args['string'] 26 | lang = args['lang'] 27 | method = args['method'] 28 | 29 | if method not in POS_METHODS[lang]: 30 | return {'message': {'method': method + ' is not a valid choice'}} 31 | 32 | tagger = POSTag(lang) 33 | tagged = [] 34 | if method == 'unigram': 35 | tagged = tagger.tag_unigram(string) 36 | elif method == 'bigram': 37 | tagged = tagger.tag_bigram(string) 38 | elif method == 'trigram': 39 | tagged = tagger.tag_trigram(string) 40 | elif method == 'ngram123': 41 | tagged = tagger.tag_ngram_123_backoff(string) 42 | elif method == 'tnt': 43 | tagged = tagger.tag_tnt(string) 44 | 45 | return {'tags': [{'word': word, 'tag': tag} 46 | if tag is not None else {'word': word, 'tag': 'None'} 47 | for word, tag in tagged]} 48 | -------------------------------------------------------------------------------- /metadata/prosody/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/prosody/__init__.py -------------------------------------------------------------------------------- /metadata/prosody/scansion.py: -------------------------------------------------------------------------------- 1 | """ 2 | A class for predicting scansion in a line of Latin dactylic hexameter 3 | 4 | May possibly be reworked 
with CLTK prosody modules in the future. 5 | """ 6 | 7 | 8 | from cltk.util.syllabifier import Syllabifier 9 | import string 10 | import re 11 | 12 | Patterns = { 13 | 'dactylic_hexameter' : { 14 | 'n_feet' : 6, 15 | 'feet' : [[1,0,0],[1,1]], 16 | 'pattern' :[ 17 | [[1,0,0],[1,1]], 18 | [[1,0,0],[1,1]], 19 | [[1,0,0],[1,1]], 20 | [[1,0,0],[1,1]], 21 | [[1,0,0],[1,1]], 22 | [[1,1]] 23 | ] 24 | }, 25 | 'elegiac_pentameter' : { 26 | 'n_feet' : 5, 27 | 'feet' : [[1,0,0],[1,1]], 28 | 'pattern' :[ 29 | [[1,0,0],[1,1]], 30 | [[1,0,0],[1,1]], 31 | [[1]], 32 | [[1,0,0]], 33 | [[1,0,0]], 34 | [[1]] 35 | ] 36 | } 37 | 38 | } 39 | 40 | Latin = { 41 | 42 | 'diphthongs' : ["ae", "au", "ei", "eu", "oe"], 43 | 'two_consonants' : ["x", "z"], 44 | 'digraphs' : ["ch", "ph", "th", "qu", "gu", "su"], 45 | 'mute_consonants_and_f' : ['b', 'c', 'd', 'g', 'p', 't', 'f'], 46 | 'liquid_consonants' : ['l', 'r'], 47 | 'vowels' : [ 48 | 'a', 'e', 'i', 'o', 'u', 49 | 'á', 'é', 'í', 'ó', 'ú', 50 | 'æ', 'œ', 51 | 'ǽ', # no accented œ in unicode? 52 | 'y' # y is treated as a vowel; not native to Latin but useful for words borrowed from Greek 53 | ] 54 | 55 | } 56 | 57 | class Scansion(object): 58 | """Predict scansion for a line of classical Greek or Latin poetry""" 59 | 60 | def __init__(self, patterns=Patterns, language=Latin): 61 | 62 | self.patterns = patterns 63 | self.language = language 64 | self.punctuation_transtable = {ord(c): " " for c in string.punctuation} 65 | self.line = [] 66 | 67 | return 68 | 69 | def scan(self, line, pattern="dactylic_hexameter"): 70 | """Input a line of poetry and receive its scansion: feet of syllables marked long (1) or short (0)""" 71 | 72 | s = Syllabifier() 73 | line_sylls = [] 74 | scansion = [] 75 | # Strip any punctuation and lower 76 | line = line.translate(self.punctuation_transtable).lower() 77 | line = line.replace("—", " ") 78 | 79 | 80 | # Build list of line syllables 81 | line = line.split() 82 | for word in line: 83 | if len( word ): 84 | line_sylls.extend( s.syllabify( word ) ) 85 | 86 | # Build scansion for syllables, based on pattern 87 | # If a syllable is not long, it is short 88 | sylls_len = len(line_sylls) 89 | for i, syll in enumerate(line_sylls): 90 | scansion.append({ 91 | 's' : syll, 92 | 'l' : 0 93 | }) 94 | 95 | if i < sylls_len - 1 and self._is_elided( syll, line_sylls[ i + 1 ], line ): 96 | scansion[i]['l'] = "-" 97 | #scansion[i]['r'] = "elided" 98 | continue 99 | 100 | elif self._long_by_nature( i, syll, line_sylls, line ): 101 | scansion[i]['l'] = 1 102 | #scansion[i]['r'] = "by nature" 103 | continue 104 | 105 | elif i < sylls_len - 1 and self._long_by_position( syll, line_sylls[ i + 1 ] ): 106 | scansion[i]['l'] = 1 107 | #scansion[i]['r'] = "by position" 108 | continue 109 | 110 | # For next step, remove elided syllables (rebuild the list rather 111 | # than calling remove() mid-iteration, which would skip items) 112 | scansion = [syll for syll in scansion 113 | if syll['l'] != "-"] 114 | 115 | # Compare scansion against selected pattern 116 | scansion = self._scan_against_pattern( pattern, scansion, line ) 117 | 118 | return scansion 119 | 120 | def _scan_against_pattern(self, pattern, scansion, line, depth=0): 121 | """Make judgements about feet regularized to pattern""" 122 | 123 | # Load permissible feet for scansion pattern 124 | feet = self.patterns[ pattern ]['feet'] 125 | n_feet = self.patterns[ pattern ]['n_feet'] 126 | ft_cos = self._find_feet_commonalities(feet) 127 | new_scansion = [] 128 | 129 | if depth == 1: 130 | scansion = self._check_synizesis(scansion) 131 | 132 | # Make a copy of the input scansion 133 | 
        prev_scansion = scansion[:]

        # Primary loop for checking scansion against pattern feet
        while len(scansion) > 0:
            match = False

            # First, check if a foot matches the start of syllable list
            for foot in feet:
                #foot.reverse()
                has_elided = 0
                sylls = scansion[:len(foot)]

                # If the syllable list starts with a foot
                if self._comp_syll_foot( sylls, foot ):

                    # Add the syllables to the scansion foot
                    #sylls.reverse()
                    new_scansion.append(sylls)

                    # And remove the syllables from the original scansion list
                    for sy in sylls:
                        scansion.remove(sy)

                    match = True
                    break

            # If we don't have a match from the feet in our allowed feet
            if not match:
                # Apply common rules among the feet to syllables at start of list
                for c in ft_cos:
                    scansion[c['i']]['l'] = c['val']
                    #scansion[c['i']]['r'] = "commonality between feet"

                # If short between two longs (and no iambic foot from earlier loop), make that short long
                if len(scansion) > 2:
                    if scansion[0]['l'] == 1 and scansion[2]['l'] == 1:
                        scansion[1]['l'] = 1
                        #scansion[1]['r'] = "scansion context"
                # If long short at the end of the line (and no iambic foot), make that short long
                elif len(scansion) == 2:
                    scansion[1]['l'] = 1
                    #scansion[1]['r'] = "end of line"

                # Catch the remainder to prevent inf loop
                else:
                    # Add the syllables to the scansion foot
                    new_scansion.append(scansion)
                    # And remove the syllables from the original scansion list
                    scansion = []

        # If there's more feet in the new scansion than are allowed in the meter
        scan_len = len(new_scansion)
        new_scansion = {'scansion':new_scansion}
        if n_feet < scan_len:
            new_scansion['error'] = "too many"
            if depth == 0:
                new_scansion = self._scan_against_pattern(pattern, prev_scansion, line, depth + 1)

        elif n_feet > scan_len:
            new_scansion['error'] = "too few"
            if depth == 0:
                new_scansion = self._scan_against_pattern(pattern, prev_scansion, line, depth + 1)

        # Return the scansion with feet in the correct order
        #new_scansion.reverse()
        return new_scansion

    def _comp_syll_foot(self, sylls, pattern):
        """Check if the possible pattern for the foot matches syllables"""
        match = []
        if len(sylls) == len(pattern):
            for p_index, value in enumerate(pattern):
                if value == sylls[ p_index ]['l']:
                    match.append(True)
                else:
                    match.append(False)

        if len(match):
            return all(item==True for item in match)
        else:
            return False

    def _find_feet_commonalities(self, scansion_feet):
        """Find the commonalities between the feet: e.g. dactyl and spondee have commonality of long syllable in first position"""
        commonalities = []
        for i_foot, foot in enumerate(scansion_feet):
            for i, val in enumerate(foot):
                common = True

                for i_comp_foot, comp_foot in enumerate(scansion_feet):
                    if i >= len(comp_foot) or comp_foot[i] != val:
                        common = False

                if common == True:
                    c = {'i':i, 'val':val}
                    included = False
                    for comp_c in commonalities:
                        if c['i'] == comp_c['i'] and c['val'] == comp_c['val']:
                            included = True
                    if not included:
                        commonalities.append({'i':i,'val':val})

        return commonalities
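    # e.g. for feet [[1,0,0],[1,1]] (dactyl and spondee) the result is
    # [{'i': 0, 'val': 1}]: only the opening long is shared by every foot.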
    def _is_elided(self, syll, next_syll, line):
        """Is the syllable elided, based on its ending and the next syllable's beginning"""
        is_elided = False
        line_len = len(line)

        # Only check the syllables that are at word boundaries (not interior syllables)
        for i, word in enumerate(line):
            if word.endswith(syll):
                if (
                    (
                        # If the target syllable ends with 'm' or a vowel
                        syll.endswith("m")
                        or self._is_vowel( syll[-1] )
                    )
                    and
                    (
                        # And if the next word exists and it starts with the next syllable
                        i < line_len - 1
                        and line[i + 1].startswith( next_syll )
                    )
                ):

                    # And next word starts with a vowel or 'h'
                    if (
                        self._is_vowel( next_syll[0] )
                        or next_syll[0] == "h"
                    ):

                        # And if the next word starts with an i, and the i isn't a consonant
                        if next_syll[0] == "i":
                            if len( next_syll ) > 1 and not self._is_vowel( next_syll[1] ):
                                is_elided = True
                            elif len( next_syll ) == 1:
                                is_elided = True

                        else:
                            is_elided = True

        return is_elided

    def _long_by_nature(self, i, syll, line_sylls, line):
        """Is the syllable long by nature"""
        is_long = False
        # Long_ends could also contain o, i, and u
        long_ends = ["as","es","os"]
        syll = syll.lstrip("qu")

        # If it contains a diphthong
        for diphthong in self.language['diphthongs']:
            if diphthong in syll:
                is_long = True
                break

        if not is_long:
            line_len = len(line)
            line_sylls_len = len(line_sylls)

            # If it's a final o, i, u, as, es, or os
            for e in long_ends:
                if syll.endswith(e):

                    # Except tibi / mihi
                    # If it has a preceding syllable
                    if i > 0:
                        if syll == "hi" and line_sylls[ i - 1 ] == "mi":
                            return is_long

                        elif syll == "bi" and line_sylls[ i - 1 ] == "ti":
                            return is_long

                    # Ensure the syll is an end of a word
                    for l_i, word in enumerate(line):
                        if word.endswith(syll):

                            # If there's a next word and next syllable
                            if l_i < line_len - 1:
                                # If there's a next syllable
                                if i < line_sylls_len - 1:
                                    if word.endswith( syll ) and line[ l_i + 1 ].startswith( line_sylls[ i + 1 ] ):
                                        is_long = True

                            # Else, if there's not another syllable after it in the line, mark as long
                            else:
                                if i == line_sylls_len - 1 and word.endswith( syll ):
                                    is_long = True

        return is_long
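    # e.g. "ar" + "ma": _return_consonants_vowels gives "vc" and "cv"; one
    # consonant closes the first syllable and one opens the next, so the
    # check below counts two consonants between vowels and marks "ar" long.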
    def _long_by_position(self, syll, next_syll):
        """Is the syllable long by position, with two or more consonants between its vowel and the next"""
        is_long = False

        if syll.endswith("x") or next_syll.startswith("x"):
            is_long = True

        else:
            syll_cvs = self._return_consonants_vowels( syll )
            next_syll_cvs = self._return_consonants_vowels( next_syll )
            if ( syll_cvs.lstrip("c").count("c") + next_syll_cvs.rstrip("c").count("c") ) >= 2:
                is_long = True

        #print(syll, syll_cvs, is_long)
        #print(next_syll, next_syll_cvs)

        return is_long

    def _return_consonants_vowels(self, input_string):
        """Return a string of Cs and Vs for the consonants and vowels in the string"""
        cvs = ''


        for i, char in enumerate(input_string):
            has_prev_char = i > 0
            has_next_char = i < len(input_string) - 1

            # First check for vowels with the u and i exceptions
            if self._is_vowel(char) and char not in ["u", "i"]:
                cvs = cvs + "v"

            # If it's a 'u', it's a vowel unless preceded by a q, g, or s
            elif char == "u":
                if has_prev_char:
                    if not (
                        not has_next_char
                        and input_string[ i - 1 ] in ["q","g","s"]
                    ):
                        cvs = cvs + "v"
                else:
                    cvs = cvs + "v"

            # Handle the i/y/j exception
            elif char == "i":
                if has_next_char and i == 0 and self._is_vowel( input_string[ i + 1 ]):
                    cvs = cvs + "c"
                else:
                    cvs = cvs + "v"

            # x and z are double consonants
            elif char in ["x","z"]:
                cvs = cvs + "cc"

            # ch, ph, th are single
            elif has_prev_char and char == "h" and input_string[ i - 1 ] in ["c","p","t"]:
                pass

            # mute followed by a liquid is single
            elif has_prev_char and self._is_liquid_consonant( char ) and self._is_mute_consonant_or_f( input_string[ i - 1 ] ):
                pass

            elif char == "h":
                pass

            # failing all of the above, it's a normal consonant
            else:
                cvs = cvs + "c"

        return cvs

    def _check_synizesis(self, scansion):
        new_scansion = []
        remove_next_syll = False
        for i, syll in enumerate(scansion):

            len_scansion = len(scansion)
            has_next_syll = i < len_scansion - 1
            has_prev_syll = i > 0

            if remove_next_syll:
                remove_next_syll = False
                continue

            if has_next_syll:
                next_syll = scansion[i + 1]
                if syll["s"].endswith("u"):
                    if next_syll["s"].startswith("u") or next_syll["s"].startswith("i") or next_syll["s"].startswith("e"):
                        syll["s"] = syll['s'] + next_syll["s"][1:]
                        remove_next_syll = True

                        if "uu" in syll['s']:
                            syll['s'] = syll['s'].replace("uu","u")
                        elif "ui" in syll['s']:
                            syll['s'] = syll['s'].replace("ui","u")
                        elif "ue" in syll['s']:
                            syll['s'] = syll['s'].replace("ue","u")

            new_scansion.append(syll)

        return new_scansion[:]

    def _is_consonant(self, char):
        """Checks if char is not in the list of vowels in the language"""
        return char not in self.language['vowels']

    def _is_vowel(self, char):
        """Checks if char is in the list of vowels in the language"""
        return char in self.language['vowels']

    def _is_mute_consonant_or_f(self, char):
        """Checks if char is in the mute_consonants_and_f list"""
        return char in self.language['mute_consonants_and_f']

    def _is_liquid_consonant(self, char):
        """Checks if char is in the liquid_consonants list"""
        return char in self.language['liquid_consonants']
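
# Minimal usage sketch (the line is illustrative; requires the CLTK
# syllabifier imported above):
#
#   scanner = Scansion()
#   result = scanner.scan("Arma virumque cano, Troiae qui primus ab oris")
#   for foot in result['scansion']:
#       print([syll['s'] for syll in foot], [syll['l'] for syll in foot])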
--------------------------------------------------------------------------------
/metadata/prosody/scansion_to_html.py:
--------------------------------------------------------------------------------
"""
For a given input string and scansion value, return a line of HTML with
<span>s around syllables and syllable-long and syllable-short classes
"""

import re
import string
import sys

class ScansionToHTML:

    def __init__(self, line, scansion):
        """Store the original line and its scansion for conversion"""
        self.line = line
        self.line_orig = line
        self.scansion = scansion

        return

    def scansion_to_html(self, line, scansion):
        """
        For a given input string and scansion, generate an HTML response of syllables wrapped in
        <span> elements with classes denoting long and short syllables.
        :param line: str
        :param scansion: Line scansion (needs to be reworked to be like the CLTK scansion)
        :return html_line: str (formatted HTML string)
        """
        html_line = ""

        while len( self.scansion ) > 0:
            foot = self.scansion[0]
            while len( foot ) > 0:
                syll = foot[0]

                if self.line.lower().startswith( syll['s'] ):
                    len_syll_s = len( syll['s'] )

                    if syll['l']:
                        # long
                        html_line += "<span class=\"syllable-long\">" + self.line[0:len_syll_s] + "</span>"

                    else:
                        # short
                        html_line += "<span class=\"syllable-short\">" + self.line[0:len_syll_s] + "</span>"

                    self.line = self.line[len_syll_s:]

                    # finally remove the syll
                    foot.remove( syll )


                else:
                    # skip one forward (spaces, punct, &c.)
                    if len(self.line) > 0:
                        html_line += self.line[0]
                        self.line = self.line[1:]
                    else:
                        foot = []
                        self.scansion = []
                        print(" -- error with transferring to html for", self.line_orig, html_line)
                        break

            # If there's more scansion
            if len(self.scansion):
                # Remove the empty foot
                self.scansion.remove(foot)

            # If scansion length is now no more
            if len(self.scansion) == 0:
                # add the remainder of line (final punctuation!!)
                html_line += self.line


        return html_line
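
# Rough usage sketch (hypothetical values; expects the feet produced by
# Scansion.scan, and consumes them destructively):
#
#   feet = Scansion().scan(line)['scansion']
#   html = ScansionToHTML(line, feet).scansion_to_html(line, feet)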
--------------------------------------------------------------------------------
/metadata/stem/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/stem/__init__.py
--------------------------------------------------------------------------------
/metadata/stem/views.py:
--------------------------------------------------------------------------------
from cltk.stem.latin.stem import Stemmer
from flask_restful import Resource

class Stem(Resource):
    """
    GET /core/stem/<sentence>
    Takes sentence input and strips suffixes using CLTK's core Stemmer
    """

    def get(self, sentence):
        stemmer = Stemmer()
        return {'stemmed_output': stemmer.stem(sentence.lower())}
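
# Example (from tests.py, truncated):
#
#   GET /core/stem/Est interdum praestare mercaturis rem quaerere, ...
#   -> {'stemmed_output': 'est interd praestar mercatur r quaerere, ...'}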
--------------------------------------------------------------------------------
/metadata/text_reuse/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/text_reuse/__init__.py
--------------------------------------------------------------------------------
/metadata/tokenize/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/tokenize/__init__.py
--------------------------------------------------------------------------------
/metadata/translations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/translations/__init__.py
--------------------------------------------------------------------------------
/metadata/translations/map_translation.py:
--------------------------------------------------------------------------------
"""

Map a translation of a Latin document to the original Latin document

Must already have definitions ingested for this to work

"""


import optparse
import pymongo
import re
import copy
import string
import numpy as np
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer


def dedupe_list(seq):
    """De-duplicate a list, preserving order (helper assumed by the calls
    below; no definition or import for it appears in the original module)"""
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]


class MapTranslation:

    def __init__(self, settings):


        # What data to import and where to put it
        self.trans_fname = settings.fname
        self.work = settings.work
        self.subwork = { 'n' : int( settings.subwork ) }
        self.translators = [ settings.author ]
        self.edition_slug = settings.author

        # Get the length of the original work
        self.len_orig = 0

        # Get the length of the translation
        self.len_trans = 0

        # Threshold settings for association
        self.r = 5  # range to search in lines before/after
        self.si_thresh = 0.0  # must have a higher similarity index than this to save

        # Helps
        self.punctuation_transtable = {ord(c): " " for c in string.punctuation}
        self.stops = stopwords.words("english")
        self.lmtzr = WordNetLemmatizer()

        # Load translation and map
        self.load_trans()
        self.map_trans()

        return



    def load_trans(self):

        self.translation = []

        with open( self.trans_fname, "r" ) as f:
            trans = f.readlines()

        for i, line in enumerate( trans ):
            if len( line ):
                self.translation.append( line.strip() )

        self.len_trans = len( self.translation )

        return


    def map_trans(self):

        # calculate the ratio of the length of the original to the translation
        self.ratio = self.len_orig / self.len_trans

        for i, text_unit in enumerate( self.translation ):

            # nix 'd
            text_unit_orig = text_unit
            text_unit = text_unit.replace("'d", "")

            # strip punctuation
            text_unit = text_unit.translate(self.punctuation_transtable).lower()
            text_unit = text_unit.replace("—", " ")

            # split at words
            words = text_unit.split(" ")

            # lemmas
            lemmas = []
            for word in words:
                if len(word):
                    word = self.lmtzr.lemmatize(word)

                    if word not in self.stops:
                        lemmas.append(word)

            # syns
            syns = []
            for lemma in lemmas:
                synsets = wn.synsets(lemma)

                word_syns = []
                for syn in synsets:
                    word_syns = word_syns + syn.lemma_names()

                syns = syns + word_syns

            syns = dedupe_list( syns )
            self._map_unit( i, syns, text_unit_orig )



        return

    def _map_unit( self, i, syns, text_unit_orig ):


        target_n = self.ratio * i
        l_n_min = np.floor( target_n - self.r )
        l_n_max = np.ceil( target_n + self.r )

        # This is where we need to load lines from the original work
        lines = []

        line_ms = []
        for line in lines:

            line_senses = []
            line_defs_lemmas = []
            line_defs_syns = []
            m = 0

            # Flatten the line definition senses
            for word in line['definitions']:
                for definition in word['defs']:
                    line_senses = line_senses + definition['senses']

            # Build list of lemmas from the word definitions
            for sense in line_senses:
                # nix 'd
                sense = sense.replace("'d", "")

                # strip punctuation
                sense = sense.translate(self.punctuation_transtable).lower()
                sense = sense.replace("—", " ")

                # split at words
                sense_words = sense.split(" ")

                # lemmatize and check stoplist
                for word in sense_words:
                    if len(word):
                        word = self.lmtzr.lemmatize(word)
                        if word not in self.stops:
                            line_defs_lemmas.append(word)
            # syns
            line_defs_lemmas = dedupe_list( line_defs_lemmas )
            for lemma in line_defs_lemmas:
                synsets = wn.synsets(lemma)
                word_syns = []
                for syn in synsets:
                    word_syns = word_syns + syn.lemma_names()
                line_defs_syns = line_defs_syns + word_syns
            line_defs_syns = dedupe_list( line_defs_syns )

            # Compare the line definition senses to our syn list
            for lem_syn in syns:
                for lem_def in line_defs_syns:
                    if lem_syn == lem_def:
                        m += 1

            # Adjust m for the total number of syns compared
            m_rel = m / ( len( syns ) + len( line_defs_syns ) )

            # Finally, add the comparison matching to the list of line matches
            line_ms.append( [ m, m_rel ] )


        # Figure the min/max, rel * 100
        m_max = 0
        m_min = 100
        for m_ls in line_ms:
            m_ls[1] = m_ls[1] * 100
            m_rel = m_ls[1]

            if m_rel > m_max:
                m_max = m_ls[1]
            if m_rel < m_min:
                m_min = m_ls[1]

        # Scale m_rel and if above significance thresh, add to line nos
        trans_l_ns = []
        for m_i, m_ls in enumerate( line_ms ):
            # rel is scaled to min/max (20%)
            if ( m_max - m_min ) > 0:
                m_ls[1] = ( ( m_ls[1] - m_min ) / ( m_max - m_min ) ) * 0.20
            # Final adjust for bigger rels (80%)
            m_ls[1] = m_ls[1] + ( ( m_ls[0] / 100 ) * 0.80 )

            if m_ls[1] >= self.si_thresh:
                line_n = int( l_n_min + m_i )

                if( l_n_min < 0 ):
                    line_n = int( l_n_min + m_i ) + self.r
                elif ( l_n_min > self.len_orig ):
                    line_n = int( l_n_min + m_i )

                # Append for base-1 counting
                trans_l_ns.append( line_n + 1 )

        return trans_l_ns
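
# Usage sketch (hypothetical values; `settings` mirrors the optparse-style
# attributes read in __init__, and instantiation runs the whole pipeline):
#
#   from types import SimpleNamespace
#   settings = SimpleNamespace(fname='dryden_aeneid_book1.txt', work='aeneid',
#                              subwork=1, author='dryden')
#   MapTranslation(settings)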
--------------------------------------------------------------------------------
/metadata/vector/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/metadata/vector/__init__.py
--------------------------------------------------------------------------------
/perseus_parsing_notes.txt:
--------------------------------------------------------------------------------
# Notes
Some units are called 'card' (for sections in Seneca and line numbers in Vergil). Some rewriting will be required.

# Checked Authors
Metadata has been parsed from the following files (at least one text for each author):

## Latin
Ammianus
http://localhost:5000/lang/latin/corpus/perseus/author/Ammianus/text/amm

Apuleius
http://localhost:5000/lang/latin/corpus/perseus/author/apuleius/text/apuleius.ap

Augustine
http://localhost:5000/lang/latin/corpus/perseus/author/Augustine/text/august.sellet

Bede
http://localhost:5000/lang/latin/corpus/perseus/author/bede/text/bede.hega

Caesar
http://localhost:5000/lang/latin/corpus/perseus/author/caesar/text/caes.bg

Catullus
http://localhost:5000/lang/latin/corpus/perseus/author/Catullus/text/cat

Celsus
http://localhost:5000/lang/latin/corpus/perseus/author/celsus/text/cels_darem

Cicero
http://localhost:5000/lang/latin/corpus/perseus/author/cicero/text/cic.ac
http://localhost:5000/lang/latin/corpus/perseus/author/cicero/text/cic.fam

Columella
http://localhost:5000/lang/latin/corpus/perseus/author/columella/text/col.agr01

Curtius
http://localhost:5000/lang/latin/corpus/perseus/author/curtius/text/curtius.alex

Flaccus
http://localhost:5000/lang/latin/corpus/perseus/author/flaccus/text/v.fl

Florus
http://localhost:5000/lang/latin/corpus/perseus/author/florus/text/florus.epit

Gellius
http://localhost:5000/lang/latin/corpus/perseus/author/Gellius/text/gel

Glass
http://localhost:5000/lang/latin/corpus/perseus/author/glass/text/washington.bio

HistAugust
http://localhost:5000/lang/latin/corpus/perseus/author/HistAugust/text/sha01.1

Horace
http://localhost:5000/lang/latin/corpus/perseus/author/horace/text/hor.ap

Jerome
http://localhost:5000/lang/latin/corpus/perseus/author/jerome/text/jerome.sellet

Juvenal
http://localhost:5000/lang/latin/corpus/perseus/author/juvenal/text/juv

Livy
http://localhost:5000/lang/latin/corpus/perseus/author/livy/text/livy.foster01-02

Lucan
http://localhost:5000/lang/latin/corpus/perseus/author/lucan/text/pharsalia

Martial
http://localhost:5000/lang/latin/corpus/perseus/author/martial/text/martial

MinuciusFelix
http://localhost:5000/lang/latin/corpus/perseus/author/MinuciusFelix/text/minfel.octav

Nepos
http://localhost:5000/lang/latin/corpus/perseus/author/nepos/text/lives

Ovid
http://localhost:5000/lang/latin/corpus/perseus/author/ovid/text/ovid.am
http://localhost:5000/lang/latin/corpus/perseus/author/ovid/text/ovid.met?section_1=1

Persius
http://localhost:5000/lang/latin/corpus/perseus/author/persius/text/persius.sat

Petronius
http://localhost:5000/lang/latin/corpus/perseus/author/petronius/text/petr

Plautus
http://localhost:5000/lang/latin/corpus/perseus/author/Plautus/text/pl.aul

Pliny
http://localhost:5000/lang/latin/corpus/perseus/author/pliny/text/pliny.min.letters

Propertius
http://localhost:5000/lang/latin/corpus/perseus/author/propertius/text/prop

Prudentius
http://localhost:5000/lang/latin/corpus/perseus/author/prudentius/text/prud.01praef

Quintilian
http://localhost:5000/lang/latin/corpus/perseus/author/quintilian/text/quint.butler1-3

Quintus
http://localhost:5000/lang/latin/corpus/perseus/author/quintus/text/quintsmyrn_01

Sallust
http://localhost:5000/lang/latin/corpus/perseus/author/sallust/text/sallust.catil

Seneca
http://localhost:5000/lang/latin/corpus/perseus/author/seneca/text/sen.ag

Seneca1
http://localhost:5000/lang/latin/corpus/perseus/author/seneca1/text/seneca.contr

SiliusItalicus
http://localhost:5000/lang/latin/corpus/perseus/author/SiliusItalicus/text/silius.punica

Statius
http://localhost:5000/lang/latin/corpus/perseus/author/statius/text/stat.achill

Suetonius
http://localhost:5000/lang/latin/corpus/perseus/author/suetonius/text/suet.caes

Sulpicia
Note: Sulpicia has no Latin text file, so I removed it from /authors; however, routing is still available at
http://localhost:5000/lang/latin/corpus/perseus/author/Sulpicia/texts

Tacitus
http://localhost:5000/lang/latin/corpus/perseus/author/Tacitus/text/tac.ann

Terence
http://localhost:5000/lang/latin/corpus/perseus/author/terence/text/ad

Tertullian
http://localhost:5000/lang/latin/corpus/perseus/author/tertullian/text/tert.apol

Tibullus
http://localhost:5000/lang/latin/corpus/perseus/author/tibullus/text/tibullus.el

Vergil
http://localhost:5000/lang/latin/corpus/perseus/author/Vergil/text/verg.a

Vitruvius
http://localhost:5000/lang/latin/corpus/perseus/author/vitruvius/text/vitruv

## Greek

Aeschines
http://localhost:5000/lang/greek/corpus/perseus/author/Aeschines/text/aeschin

Aeschylus
http://localhost:5000/lang/greek/corpus/perseus/author/Aeschylus/text/aesch.ag

Andocides
http://localhost:5000/lang/greek/corpus/perseus/author/Andocides/text/andoc

Anth
http://localhost:5000/lang/greek/corpus/perseus/author/anth/text/01

Apollodorus
http://localhost:5000/lang/greek/corpus/perseus/author/Apollodorus/text/apollod

Apollonius
http://localhost:5000/lang/greek/corpus/perseus/author/apollonius/text/argo

Appian
http://localhost:5000/lang/greek/corpus/perseus/author/appian/text/appian.cw

Aratus
http://localhost:5000/lang/greek/corpus/perseus/author/aratus/text/aratus_01

Aretaeus
http://localhost:5000/lang/greek/corpus/perseus/author/Aretaeus/text/aret

Aristides
http://localhost:5000/lang/greek/corpus/perseus/author/Aristides/text/aristid.orat

Aristophanes
http://localhost:5000/lang/greek/corpus/perseus/author/Aristophanes/text/aristoph.ach

Aristotle
http://localhost:5000/lang/greek/corpus/perseus/author/Aristotle/text/aristot.ath.pol

Arrian
http://localhost:5000/lang/greek/corpus/perseus/author/Arrian/text/arrian.acies
http://localhost:5000/lang/greek/corpus/perseus/author/Arrian/text/arrian.indica

Athenaeus
http://localhost:5000/lang/greek/corpus/perseus/author/Athenaeus/text/ath01

Bacchylides
http://localhost:5000/lang/greek/corpus/perseus/author/Bacchylides/text/bacchyl

Bible
http://localhost:5000/lang/greek/corpus/perseus/author/Bible/text/nt

Callimachus
http://localhost:5000/lang/greek/corpus/perseus/author/Callimachus/text/callimachus
http://localhost:5000/lang/greek/corpus/perseus/author/Callimachus/text/call_02


Colluthus
http://localhost:5000/lang/greek/corpus/perseus/author/Colluthus/text/colluthus.01

Demades
http://localhost:5000/lang/greek/corpus/perseus/author/Demades/text/demad

Demosthenes
http://localhost:5000/lang/greek/corpus/perseus/author/Demosthenes/text/dem01-10

Dinarchus
http://localhost:5000/lang/greek/corpus/perseus/author/Dinarchus/text/din

DioChrys
http://localhost:5000/lang/greek/corpus/perseus/author/DioChrys/text/diochr01

Diodorus
http://localhost:5000/lang/greek/corpus/perseus/author/Diodorus/text/diod.hist01-05

Diogenes
http://localhost:5000/lang/greek/corpus/perseus/author/Diogenes/text/dl

Dionysius
http://localhost:5000/lang/greek/corpus/perseus/author/Dionysius/text/dh.002
http://localhost:5000/lang/greek/corpus/perseus/author/Dionysius/text/dh.hist01

Dobson
! broken: What is this? Remove from /authors?

Elegy
http://localhost:5000/lang/greek/corpus/perseus/author/elegy/text/1

Epictetus
http://localhost:5000/lang/greek/corpus/perseus/author/epictetus/text/epictetus

Euclid
http://localhost:5000/lang/greek/corpus/perseus/author/Euclid/text/euc.elem

Euripides
http://localhost:5000/lang/greek/corpus/perseus/author/Euripides/text/eur.orest

Galen
http://localhost:5000/lang/greek/corpus/perseus/author/Galen/text/gal.nat.fac

Herodotus
http://localhost:5000/lang/greek/corpus/perseus/author/Herodotus/text/hdt

Hesiod
http://localhost:5000/lang/greek/corpus/perseus/author/Hesiod/text/hes.sh
http://localhost:5000/lang/greek/corpus/perseus/author/Hesiod/text/hes.th
http://localhost:5000/lang/greek/corpus/perseus/author/Hesiod/text/hes.wd
http://localhost:5000/lang/greek/corpus/perseus/author/Hesiod/text/hes.wd?section_1=1

Hippocrates
http://localhost:5000/lang/greek/corpus/perseus/author/Hippocrates/text/hp.jones

Homer
http://localhost:5000/lang/greek/corpus/perseus/author/Homer/text/hom.il

Homeric_Hymns
http://localhost:5000/lang/greek/corpus/perseus/author/Homeric_Hymns/text/hh

Hyperides
http://localhost:5000/lang/greek/corpus/perseus/author/Hyperides/text/hyp

Isocrates
No Greek text, so rm from /authors

JebbOrators
http://localhost:5000/lang/greek/corpus/perseus/author/JebbOrators/text/attic_orators

Josephus
http://localhost:5000/lang/greek/corpus/perseus/author/Josephus/text/j.bj

Lucian
http://localhost:5000/lang/greek/corpus/perseus/author/Lucian/text/01

Lucretius
!! Why the hell is Lucretius under Greek???
!! This is something we'll have to fix while indexing; I'll leave him under Greek for now
http://localhost:5000/lang/greek/corpus/perseus/author/Lucretius/text/lucretius

Lycophron
http://localhost:5000/lang/greek/corpus/perseus/author/Lycophron/text/lycophron_01

Lycurgus
http://localhost:5000/lang/greek/corpus/perseus/author/Lycurgus/text/lyc

Lysias
http://localhost:5000/lang/greek/corpus/perseus/author/Lysias/text/lys

Nonnos
http://localhost:5000/lang/greek/corpus/perseus/author/Nonnos/text/nonnos_01

Oppian
http://localhost:5000/lang/greek/corpus/perseus/author/Oppian/texts

Pausanias
http://localhost:5000/lang/greek/corpus/perseus/author/Pausanias/text/paus

Phaedrus
http://localhost:5000/lang/greek/corpus/perseus/author/Phaedrus/texts

Pindar
http://localhost:5000/lang/greek/corpus/perseus/author/Pindar/text/pind

Plato
http://localhost:5000/lang/greek/corpus/perseus/author/Plato/text/plat.l

Plutarch
http://localhost:5000/lang/greek/corpus/perseus/author/Plutarch/text/plut.0094.002_teubner
http://localhost:5000/lang/greek/corpus/perseus/author/Plutarch/text/plut.082b_loeb

Polybius
http://localhost:5000/lang/greek/corpus/perseus/author/Polybius/text/hist

Sidonius
http://localhost:5000/lang/greek/corpus/perseus/author/Sidonius/texts

Sophocles
http://localhost:5000/lang/greek/corpus/perseus/author/Sophocles/text/soph.aj

Strabo
http://localhost:5000/lang/greek/corpus/perseus/author/Strabo/text/strab

Theocritus
http://localhost:5000/lang/greek/corpus/perseus/author/Theocritus/text/idylls

Theophrastus
http://localhost:5000/lang/greek/corpus/perseus/author/Theophrastus/text/char

Thucydides
! There are no Greek files for Thucydides: http://localhost:5000/lang/greek/corpus/perseus/author/Thucydides/texts
/Users/kyle/cltk_data/greek/text/greek_text_perseus/Thucydides/opensource/

Tryphiodorus
http://localhost:5000/lang/greek/corpus/perseus/author/Tryphiodorus/text/tryphiodorus_01

Xenophon
http://localhost:5000/lang/greek/corpus/perseus/author/Xenophon/text/xen.anab
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
amqp==1.4.7
aniso8601==1.1.0
anyjson==0.3.3
beautifulsoup4==4.4.1
billiard==3.3.0.21
celery==3.1.19
cltk==0.1.29
Flask==0.10.1
Flask-PyMongo==0.4.0
Flask-RESTful==0.3.4
gitdb==0.6.4
GitPython==1.0.1
itsdangerous==0.24
Jinja2==2.8
kombu==3.0.29
MarkupSafe==0.23
nltk==3.1
pymongo==3.1.1
python-dateutil==2.4.2
python-slugify==1.1.4
pytz==2015.7
regex==2015.11.14
six==1.10.0
smmap==0.9.0
Unidecode==0.4.18
Werkzeug==0.11.2
wheel==0.24.0
wikipedia
--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
import os
import unittest
import api_json
import json
from cltk.corpus.utils.importer import CorpusImporter
from metadata.pos.constants import POS_METHODS

class TestAPIMethods(unittest.TestCase):
    """Requires latin_text_perseus folder in ~/cltk_data/latin/text/latin_text_perseus"""

    def setUp(self):
        file_rel_text = os.path.join('~/cltk_data/latin/text/latin_text_perseus/README.md')
        file_text = os.path.expanduser(file_rel_text)
        if not os.path.isfile(file_text):
            corpus_importer = CorpusImporter('latin')
            corpus_importer.import_corpus('latin_text_perseus')
            corpus_importer.import_corpus('latin_models_cltk')
        file_exists = os.path.isfile(file_text)
        self.assertTrue(file_exists)

        file_rel_treebank = os.path.join('~/cltk_data/latin/treebank/latin_treebank_perseus/README.md')
        file_treebank = os.path.expanduser(file_rel_treebank)
        if not os.path.isfile(file_treebank):
            corpus_importer = CorpusImporter('latin')
            corpus_importer.import_corpus('latin_treebank_perseus')
        file_exists = os.path.isfile(file_treebank)
        self.assertTrue(file_exists)

        self.app = api_json.app.test_client()
        self.headers = [('Content-Type', 'application/json')]

    def test_home(self):
        response = self.app.get('/')
        self.assertEqual(response.status, '404 NOT FOUND')

    def test_hello_api(self):
        response = self.app.get('/hello')
        self.assertEqual(response.status, '200 OK')
        self.assertEqual(eval(response.data), dict(hello='world'))

    def test_todo_api(self):
        response = self.app.get('/todo/cltk_testing')
        self.assertEqual(response.status, '200 OK')
        self.assertEqual(eval(response.data), {'example with token': 'cltk_testing'})

    def test_lang_api(self):
        response = self.app.get('/lang')
        self.assertEqual(response.status, '200 OK')
        response_lang = eval(response.data)['languages']
        self.assertTrue('latin' in response_lang)

    def test_corpus_api(self):
        response = self.app.get('/lang/latin/corpus')
        self.assertEqual(response.status, '200 OK')
        self.assertEqual(eval(response.data)['language'], 'latin')
        self.assertTrue('perseus' in eval(response.data)['corpora'])

    def test_author_api(self):
        response = self.app.get('/lang/latin/corpus/perseus/author')
        self.assertEqual(response.status, '200 OK')
        self.assertEqual(eval(response.data)['language'], 'latin')
        self.assertTrue('glass' in eval(response.data)['authors'])

    def test_texts_api(self):
        response = self.app.get('/lang/latin/corpus/perseus/author/glass/text')
        self.assertEqual(response.status, '200 OK')
        self.assertEqual(eval(response.data)['language'], 'latin')
        self.assertEqual(eval(response.data)['corpus'], 'perseus')
        self.assertEqual(eval(response.data)['author'], 'glass')
        self.assertTrue('washingtonii_vita' in eval(response.data)['texts'])

    def test_text_api(self):
        response = self.app.get('/lang/latin/corpus/perseus/author/tacitus/text/germania')
        self.assertEqual(response.status, '200 OK')
        self.assertEqual(eval(response.data)['language'], 'latin')
        self.assertEqual(eval(response.data)['corpus'], 'perseus')
        self.assertEqual(eval(response.data)['author'], 'tacitus')
        self.assertEqual(eval(response.data)['meta'], 'book-chapter')
        self.assertEqual(eval(response.data)['work'], 'germania')
        self.assertEqual(eval(response.data)['text']['2']['1'].strip(), 'Ipsos Germanos indigenas crediderim minimeque aliarum gentium adventibus et hospitiis mixtos, quia nec terra olim sed classibus advehebantur qui mutare sedes quaerebant, et immensus ultra utque sic dixerim adversus Oceanus raris ab orbe nostro navibus aditur.')

        response_chunk1 = self.app.get('/lang/latin/corpus/perseus/author/tacitus/text/germania?chunk1=2')
        self.assertEqual(response_chunk1.status, '200 OK')
        self.assertEqual(eval(response_chunk1.data)['text']['2'].strip(), 'quis porro, praeter periculum horridi et ignoti maris, Asia aut Africa aut Italia relicta Germaniam peteret, informem terris, asperam caelo, tristem cultu aspectuque nisi si patria sit?')

        response_chunk2 = self.app.get('/lang/latin/corpus/perseus/author/tacitus/text/germania?chunk1=2&chunk2=4')
        self.assertEqual(response_chunk2.status, '200 OK')
        self.assertEqual(eval(response_chunk2.data)['text'].strip(), 'quidam, ut in licentia vetustatis, plures deo ortos pluresque gentis appellationes, Marsos Gambrivios Suebos Vandilios adfirmant, eaque vera et antiqua nomina.')

        response_chunk3 = self.app.get('/lang/latin/corpus/perseus/author/tacitus/text/germania?chunk1=2&chunk2=4&chunk3=1')
        self.assertEqual(response_chunk3.status, '500 INTERNAL SERVER ERROR')

    def test_pos_latin_ngram123(self):
        # test GET response
        response = self.app.get('/core/pos')
        expected_response = {'methods': POS_METHODS}
        self.assertEqual(eval(response.data), expected_response)

        # test POST response
        data = json.dumps({'string': 'Gallia est omnis divisa in partes tres',
                           'lang': 'latin',
                           'method': 'ngram123'})
        response = self.app.post('/core/pos', data=data, headers=self.headers)
        expected_response = {u'tags': [{'word': 'Gallia', 'tag': 'None'},
                                       {'word': 'est', 'tag': 'V3SPIA---'},
                                       {'word': 'omnis', 'tag': 'A-S---MN-'},
                                       {'word': 'divisa', 'tag': 'T-PRPPNN-'},
                                       {'word': 'in', 'tag': 'R--------'},
                                       {'word': 'partes', 'tag': 'N-P---FA-'},
                                       {'word': 'tres', 'tag': 'M--------'}]}
        self.assertEqual(response.status, '200 OK')
        self.assertEqual(eval(response.data), expected_response)

    def test_core_stem(self):
        response = self.app.get('/core/stem/Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum. Maiores nostri sic habuerunt et ita in legibus posiuerunt: furem dupli condemnari, foeneratorem quadrupli. Quanto peiorem ciuem existimarint foeneratorem quam furem, hinc licet existimare. Et uirum bonum quom laudabant, ita laudabant: bonum agricolam bonumque colonum; amplissime laudari existimabatur qui ita laudabatur. Mercatorem autem strenuum studiosumque rei quaerendae existimo, uerum, ut supra dixi, periculosum et calamitosum. At ex agricolis et uiri fortissimi et milites strenuissimi gignuntur, maximeque pius quaestus stabilissimusque consequitur minimeque inuidiosus, minimeque male cogitantes sunt qui in eo studio occupati sunt. Nunc, ut ad rem redeam, quod promisi institutum principium hoc erit.')
        self.assertEqual(response.status, "200 OK")
        self.assertEqual(eval(response.data)['stemmed_output'], 'est interd praestar mercatur r quaerere, nisi tam periculos sit, et it foenerari, si tam honestum. maior nostr sic habueru et ita in leg posiuerunt: fur dupl condemnari, foenerator quadrupli. quant peior ciu existimari foenerator quam furem, hinc lice existimare. et uir bon quo laudabant, ita laudabant: bon agricol bon colonum; amplissim laudar existimaba qui ita laudabatur. mercator autem strenu studios re quaerend existimo, uerum, ut supr dixi, periculos et calamitosum. at ex agricol et uir fortissim et milit strenuissim gignuntur, maxim p quaest stabilissim consequi minim inuidiosus, minim mal cogitant su qui in e studi occupat sunt. nunc, ut ad r redeam, quod promis institut principi hoc erit. ')

    def test_definition_api(self):
        response = self.app.get('lang/latin/define/abante')
        self.assertEqual(response.status, '200 OK')
        self.assertEqual(eval(response.data)[0]['headword'], 'Abas')
        self.assertEqual(eval(response.data)[0]['definition'], 'The twelfth king of Argos, son of Lynceus and Hypermnestra')
        self.assertEqual(eval(response.data)[0]['pos'], 'noun sg masc abl')

    def test_translation_api(self):
        response = self.app.get('/lang/latin/corpus/perseus/author/catullus/text/poemata?translation=english')
        self.assertEqual(response.status, '200 OK')
        data = json.loads(response.get_data(as_text=True))
        self.assertEqual(data['language'], 'latin')
        self.assertEqual(data['corpus'], 'perseus')
        self.assertEqual(data['author'], 'catullus')
        self.assertEqual(data['meta'], 'poem-line')
        self.assertEqual(data['work'], 'poemata')
        self.assertEqual(data['translations'][0]['translator'], 'Sir R. F. Burton')
        self.assertEqual(data['translations'][0]['text']['1']['1'], 'To thee (Cornelius!); for wast ever fain')

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cltk/cltk_api/eb736ec9f20c008436e93cd0be4bcd05be7d455c/util/__init__.py
--------------------------------------------------------------------------------
/util/jsonp.py:
--------------------------------------------------------------------------------

from functools import wraps
from flask import request, current_app

def jsonp(func):
    """Wraps JSONified output for JSONP requests."""

    @wraps(func)
    def decorated_function(*args, **kwargs):

        callback = request.args.get('callback', False)
        if callback:
            data = str(func(*args, **kwargs))
            content = str(callback) + '(' + data + ')'
            mimetype = 'application/javascript'
            return current_app.response_class(content, mimetype=mimetype)

        else:
            return func(*args, **kwargs)

    return decorated_function
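
# Usage sketch (hypothetical route; the decorator sits below the route
# registration so the JSONP check wraps the view itself):
#
#   @app.route('/hello')
#   @jsonp
#   def hello():
#       return jsonify(hello='world')
#
# GET /hello?callback=cb would then return: cb({"hello": "world"})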
--------------------------------------------------------------------------------
/util/numerals.py:
--------------------------------------------------------------------------------
"""Convert to and from Roman numerals"""

__author__ = "Mark Pilgrim (f8dy@diveintopython.org)"
__version__ = "1.4"
__date__ = "8 August 2001"
__copyright__ = """Copyright (c) 2001 Mark Pilgrim

This program is part of "Dive Into Python", a free Python tutorial for
experienced programmers. Visit http://diveintopython.org/ for the
latest version.

This program is free software; you can redistribute it and/or modify
it under the terms of the Python 2.1.1 license, available at
http://www.python.org/2.1.1/license.html
"""

import re

# Define exceptions
class RomanError(Exception): pass
class OutOfRangeError(RomanError): pass
class NotIntegerError(RomanError): pass
class InvalidRomanNumeralError(RomanError): pass

# Define digit mapping
romanNumeralMap = (('M', 1000),
                   ('CM', 900),
                   ('D', 500),
                   ('CD', 400),
                   ('C', 100),
                   ('XC', 90),
                   ('L', 50),
                   ('XL', 40),
                   ('X', 10),
                   ('IX', 9),
                   ('V', 5),
                   ('IV', 4),
                   ('I', 1))

def toRoman(n):
    """convert integer to Roman numeral"""
    if not (0 < n < 5000):
        raise OutOfRangeError("number out of range (must be 1..4999)")
    if int(n) != n:
        raise NotIntegerError("decimals can not be converted")

    result = ""
    for numeral, integer in romanNumeralMap:
        while n >= integer:
            result += numeral
            n -= integer
    return result

# Define pattern to detect valid Roman numerals
romanNumeralPattern = re.compile("""
    ^                   # beginning of string
    M{0,4}              # thousands - 0 to 4 M's
    (CM|CD|D?C{0,3})    # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 C's),
                        # or 500-800 (D, followed by 0 to 3 C's)
    (XC|XL|L?X{0,3})    # tens - 90 (XC), 40 (XL), 0-30 (0 to 3 X's),
                        # or 50-80 (L, followed by 0 to 3 X's)
    (IX|IV|V?I{0,3})    # ones - 9 (IX), 4 (IV), 0-3 (0 to 3 I's),
                        # or 5-8 (V, followed by 0 to 3 I's)
    $                   # end of string
    """, re.VERBOSE)

def fromRoman(s):
    """convert Roman numeral to integer"""
    if not s:
        raise InvalidRomanNumeralError('Input can not be blank')
    if not romanNumeralPattern.search(s):
        raise InvalidRomanNumeralError('Invalid Roman numeral: %s' % s)

    result = 0
    index = 0
    for numeral, integer in romanNumeralMap:
        while s[index:index+len(numeral)] == numeral:
            result += integer
            index += len(numeral)
    return result
--------------------------------------------------------------------------------
/util/text.py:
--------------------------------------------------------------------------------
"""
Sundry utility functions for sanitizing textual data
"""
import re
import string
import unicodedata as ud

class TextUtil:

    # cache of characters already classified as Latin-script or not
    latin_letters = {}

    def is_latin(self, uchr):
        try:
            return self.latin_letters[uchr]
        except KeyError:
            return self.latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

    def only_roman_chars(self, unistr):
        return all(self.is_latin(uchr)
                   for uchr in unistr
                   if uchr.isalpha())

    def only_iso88591(self, string):
        flag = True
        try:
            string.encode("iso-8859-1")
        except UnicodeEncodeError:
            flag = False

        return flag

    def strip_punctuation(self, s):

        exclude = set(string.punctuation)

        return ''.join(ch for ch in s if ch not in exclude)
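
# Quick sanity examples for the helpers above (values illustrative):
#
#   from util.numerals import toRoman, fromRoman
#   toRoman(2016)       # -> 'MMXVI'
#   fromRoman('MMXVI')  # -> 2016
#
#   TextUtil().only_roman_chars('arma virumque cano')     # -> True
#   TextUtil().strip_punctuation('Quo usque, Catilina?')  # -> 'Quo usque Catilina'
--------------------------------------------------------------------------------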