├── requirements.txt ├── test_data └── repo │ ├── data │ ├── groupe │ │ ├── __cts__.xml │ │ └── oeuvre │ │ │ ├── __cts__.xml │ │ │ └── groupe.oeuvre.version-lat1.xml │ └── textgroup │ │ ├── __cts__.xml │ │ └── work │ │ ├── __cts__.xml │ │ └── textgroup.work.version-lat1.xml │ └── full_inventory.xml ├── .travis.yml ├── setup.py ├── .gitignore ├── LICENSE ├── README.md ├── test.py └── cltk_capitains_corpora_converter.py /requirements.txt: -------------------------------------------------------------------------------- 1 | MyCapytain>=1.0.1 2 | GitPython==1.0.2 -------------------------------------------------------------------------------- /test_data/repo/data/groupe/__cts__.xml: -------------------------------------------------------------------------------- 1 | 2 | Groupe de texte 3 | -------------------------------------------------------------------------------- /test_data/repo/data/textgroup/__cts__.xml: -------------------------------------------------------------------------------- 1 | 2 | Textgroup 3 | Groupe de texte 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.4" 5 | - "3.5" 6 | 7 | install: 8 | - python setup.py sdist install 9 | - pip install coveralls 10 | 11 | # command to run tests 12 | script: 13 | - coverage run --source=cltk_capitains_corpora_converter setup.py test 14 | after_success: 15 | - coveralls -------------------------------------------------------------------------------- /test_data/repo/data/groupe/oeuvre/__cts__.xml: -------------------------------------------------------------------------------- 1 | 2 | Oeuvre 3 | 4 | WorkLabel 5 | description 6 | ma description 7 | 8 | -------------------------------------------------------------------------------- /test_data/repo/data/textgroup/work/__cts__.xml: 
-------------------------------------------------------------------------------- 1 | 2 | Work 3 | Oeuvre 4 | 5 | WorkLabel 6 | description 7 | ma description 8 | 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='cltk_capitains_corpora_converter', 5 | version="0.0.1", 6 | description='CLTK Converter for Capitains Guidelines Repository', 7 | url='http://github.com/cltk/capitains_corpora_converter', 8 | author='Thibault Clerice', 9 | author_email='leponteineptique@gmail.com', 10 | license='MIT', 11 | py_modules=['cltk_capitains_corpora_converter'], 12 | install_requires=[ 13 | "MyCapytain>=1.0.1", 14 | "gitpython==1.0.2" 15 | ], 16 | entry_points={ 17 | 'console_scripts': ['capitains-cltk-converter=cltk_capitains_corpora_converter:cmd'], 18 | }, 19 | test_suite="test" 20 | ) -------------------------------------------------------------------------------- /test_data/repo/full_inventory.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Textgroup 4 | Groupe de texte 5 | 6 | Work 7 | Oeuvre 8 | 9 | WorkLabel 10 | description 11 | ma description 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | cloning 6 | json-converted 7 | .idea 8 | venv 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | 65 | #Ipython Notebook 66 | .ipynb_checkpoints 67 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Classical Language Toolkit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /test_data/repo/data/groupe/oeuvre/groupe.oeuvre.version-lat1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 8 |

This pointer pattern extracts book, chapter and section

9 |
10 | 13 |

This pointer pattern extracts book and chapter

14 |
15 |
16 |
17 |
18 | 19 | 20 |
23 |
24 | 25 | 26 | Liber VIII, 27 | ad Serenvm: 28 | de otio 29 | 30 |
31 |
32 |

33 | 34 |

35 | 36 | cit, nobis magno consensu vitia 37 | commendant. Licet nihil aliud, quod sit salutare, 38 | temptemus, proderit tamen per se ipsum secedere ; 39 | meliores erimus singuli. Quid, quod secedere ad 40 | optimos viros et aliquod exemplum eligere, ad 41 | quod vitam derigamus, licet ? Quod nisi nisi added by Gronovius. in otio 42 | non fit. Tunc potest obtineri quod semel placuit, 43 | ubi nemo intervenit, qui iudicium adhuc imbecillum 44 | populo adiutore detorqueat; tunc potest vita 45 | aequali et uno tenore procedere, quam propositis 46 | diversissimis scindimus. 47 |

48 |
49 | 50 |
51 |

Nam inter cetera mala 52 | illud pessimum est, quod vitia ipsa mutamus. Sic 53 | ne hoc quidem nobis contingit permanere in malo 54 | iam familiari. Aliud ex alio placet vexatque nos hoc 55 | quoque, quod iudicia nostra non tantum prava, sed 56 | etiam levia sunt. Fluctuamur aliudque ex alio com- 57 | 58 | 59 | prendimus, petita relinquimus, relicta repetimus, 60 | alternae inter cupiditatem nostram et paenitentiam 61 | vices sunt ; 62 |

63 |
64 |
65 |
66 |
67 | 68 |
69 |
70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Capitains Corpora Converter 2 | 3 | [![Build Status](https://travis-ci.org/cltk/capitains_corpora_converter.svg?branch=master)](https://travis-ci.org/cltk/capitains_corpora_converter) 4 | [![Coverage Status](https://coveralls.io/repos/github/cltk/capitains_corpora_converter/badge.svg?branch=master)](https://coveralls.io/github/cltk/capitains_corpora_converter?branch=master) 5 | 6 | Converts CapiTainS-based Repository ( http://capitains.github.io ) to JSON for CLTK 7 | 8 | ## How to install ? 9 | 10 | ### Install as a package 11 | 12 | ```shell 13 | git clone https://github.com/cltk/capitains_corpora_converter.git 14 | cd capitains_corpora_converter 15 | pyvenv venv 16 | source venv/bin/activate 17 | python setup.py install 18 | ``` 19 | 20 | ### Install for development 21 | 22 | ```shell 23 | git clone https://github.com/cltk/capitains_corpora_converter.git 24 | cd capitains_corpora_converter 25 | pyvenv venv 26 | source venv/bin/activate 27 | python setup.py develop 28 | ``` 29 | 30 | ### Install as global commandline 31 | 32 | **Not recommended** 33 | 34 | ```shell 35 | git clone https://github.com/cltk/capitains_corpora_converter.git 36 | cd capitains_corpora_converter 37 | sudo python setup.py install 38 | ``` 39 | 40 | ## Command Line Interface 41 | 42 | capitains-cltk-converter [-h] [--output OUTPUT] [--git REPOSITORY] 43 | [--credit CREDIT] 44 | [--exclude-nodes NODES [NODES ...]] [--silent] 45 | directory 46 | 47 | CLTK Converter for CapiTainS based resources 48 | 49 | **Positional arguments:** 50 | 51 | | Argument name | Description | 52 | |----------------------------------:|------------------------------------------------------------------------------------------| 53 | | directory | List of path to use to populate the repository or destination directory for cloning | 54 | 55 | 
**Optional Arguments:** 56 | 57 | | Argument name | Description | 58 | |----------------------------------:|------------------------------------------------------------------------------------------| 59 | | -h, --help | Show this help message and exit | 60 | | --output OUTPUT | Directory in which the converted JSON files are written (default: json-converted) | 61 | | --git REPOSITORY | Address of a repository | 62 | | --credit CREDIT | Credit line to use in json | 63 | | --exclude-nodes NODES [NODES ...] | Nodes to exclude from passages with "tei:" prefix, eg: --exclude-nodes tei:note tei:orig | 64 | | --silent | Show only errors | 65 | 66 | 67 | ## Example 68 | 69 | ### Converting Open Greek And Latin's CSEL 70 | 71 | With the virtual env activated or with global commandline : 72 | 73 | ```shell 74 | capitains-cltk-converter cloning --git https://github.com/OpenGreekAndLatin/csel-dev.git --credit "Open Philology, Humboldt Chair of Digital Humanities ( https://github.com/OpenGreekAndLatin/csel-dev )" --exclude-nodes tei:note tei:orig 75 | ``` 76 | -------------------------------------------------------------------------------- /test_data/repo/data/textgroup/work/textgroup.work.version-lat1.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | 8 | 11 |

This pointer pattern extracts book and poem and line

12 |
13 | 16 |

This pointer pattern extracts book and poem

17 |
18 | 21 |

This pointer pattern extracts book

22 |
23 |
24 |
25 |
26 | 27 | 28 |
29 |
30 |
31 | Spero me secutum in libellis meis tale temperamen- 32 | tum, ut de illis queri non possit quisquis de se bene 33 | senserit, cum salva infimarum quoque personarum re- 34 | verentia ludant; quae adeo antiquis auctoribus defuit, ut 35 | nominibus non tantum veris abusi sint, sed et magnis. 36 | Mihi fama vilius constet et probetur in me novissimum 37 | ingenium. Absit a iocorum nostrorum simplicitate ma- 38 | lignus interpres nec epigrammata mea scribat: inprobe 39 | facit qui in alieno libro ingeniosus est. Lascivam ver- 40 | borum veritatem, id est epigrammaton linguam, excu- 41 | sarem, si meum esset exemplum: sic scribit Catullus, sic 42 | Marsus, sic Pedo, sic Gaetulicus, sic quicumque perlegi- 43 | tur. Si quis tamen tam ambitiose tristis est, ut apud 44 | illum in nulla pagina latine loqui fas sit, potest epistula 45 | vel potius titulo contentus esse. Epigrammata illis scri- 46 | buntur, qui solent spectare Florales. Non intret Cato 47 | theatrum meum, aut si intraverit, spectet. Videor mihi 48 | meo iure facturus, si epistulam versibus clusero: 49 | Nosses iocosae dulce cum sacrum Florae 50 | Festosque lusus et licentiam volgi, 51 | Cur in theatrum, Cato severe, venisti? 52 | An ideo tantum veneras, ut exires? 53 |
54 |
55 | I 56 | Hic est quem legis ille, quem requiris, 57 | Toto notus in orbe Martialis 58 | Argutis epigrammaton libellis: 59 | 60 | Cui, lector studiose, quod dedisti 61 | Viventi decus atque sentienti, 62 | Rari post cineres habent poetae. 63 |
64 |
65 |
66 |
67 | II 68 | Qui tecum cupis esse meos ubicumque libellos 69 | Et comites longae quaeris habere viae, Something 70 | Hos eme, quos artat brevibus membrana tabellis: 71 | Scrinia da magnis, me manus una capit. 72 | Ne tamen ignores ubi sim venalis, et erres 73 | Urbe vagus tota, me duce certus eris: 74 | Libertum docti Lucensis quaere Secundum 75 | Limina post Pacis Palladiumque forum. 76 |
77 |
78 | III 79 | Argiletanas mavis habitare tabernas, 80 | Cum tibi, parve liber, scrinia nostra vacent. 81 | Nescis, heu, nescis dominae fastidia Romae: 82 | Crede slug.mihi, nimium Martia turba sapit. 83 | Maiores nusquam rhonchi: iuvenesque senesque 84 | Et pueri nasum rhinocerotis habent. 85 | Audieris cum grande sophos, dum basia iactas, 86 | Ibis ab excusso missus in astra sago. 87 | Sed tu ne totiens domini patiare lituras 88 | Neve notet lusus tristis harundo tuos, 89 | Aetherias, lascive, cupis volitare per auras: 90 | I, fuge; sed poteras tutior esse domi. 91 |
92 |
93 |
94 | 95 |
96 |
97 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from cltk_capitains_corpora_converter import make_json, toNumber, parse_directory, cmd, clone 2 | from collections import OrderedDict 3 | from unittest import TestCase 4 | import json 5 | 6 | from MyCapytain.resources.inventory import TextInventory 7 | from MyCapytain.common.reference import URN 8 | from MyCapytain.resources.texts.local import Text 9 | 10 | 11 | def get_test_resources(): 12 | """ Create a set of test resources 13 | 14 | :return: dict with keywords for make_json 15 | """ 16 | with open("test_data/repo/data/textgroup/work/textgroup.work.version-lat1.xml") as file: 17 | text = Text(resource=file, urn="urn:cts:latinLit:textgroup.work.version-lat1") 18 | 19 | with open("test_data/repo/full_inventory.xml") as file: 20 | inventory = TextInventory(resource=file) 21 | work = inventory["urn:cts:latinLit:textgroup.work"] 22 | textgroup = inventory["urn:cts:latinLit:textgroup"] 23 | edition = inventory["urn:cts:latinLit:textgroup.work.version-lat1"] 24 | 25 | return { 26 | "text": text, 27 | "work": work, 28 | "textgroup": textgroup, 29 | "edition": edition 30 | } 31 | 32 | 33 | class TestFunctions(TestCase): 34 | """ Test individual functions and not the whole process 35 | """ 36 | def test_toDic(self): 37 | """ Ensure toDic function creates nested dict with int as keys from ordered dict with string keys 38 | """ 39 | test_dict = OrderedDict([ 40 | ("a", OrderedDict([ 41 | ("1", "Some text"), 42 | ("2", "Some other Text") 43 | ])), 44 | ("b", OrderedDict([ 45 | ("7", "Lorem"), 46 | ("e", "Ipsum") 47 | ])) 48 | ]) 49 | a = toNumber(test_dict) 50 | expected = { 51 | 0: { 52 | 0: "Some text", 53 | 1: "Some other Text" 54 | }, 55 | 1: { 56 | 0: "Lorem", 57 | 1: "Ipsum" 58 | } 59 | } 60 | self.assertEqual(a, expected, "Nested should be converted to nested dictionary with int indexes") 
61 | 62 | def test_make_json_simple(self): 63 | """ Test make json with default values 64 | """ 65 | resources = get_test_resources() 66 | output, filename = make_json(**resources) 67 | output = json.loads(output) 68 | self.assertEqual( 69 | output["text"]["0"]["0"]["0"], "Spero me secutum in libellis meis tale temperamen-", 70 | "Text passages should be parsed correctly" 71 | ) 72 | self.assertEqual( 73 | output["text"]["1"]["0"]["0"], "Qui tecum cupis esse meos ubicumque libellos ", 74 | "Text passages should be parsed correctly" 75 | ) 76 | 77 | self.assertEqual( 78 | output["text"]["1"]["0"]["1"], "Et comites longae quaeris habere viae, Something", 79 | "Text passages should be parsed correctly and note kept" 80 | ) 81 | self.assertEqual( 82 | output["text"]["1"]["1"]["3"], "Crede slug. mihi, nimium Martia turba sapit. ", 83 | "Text passages should be parsed correctly and abbr kept" 84 | ) 85 | self.assertEqual( 86 | filename, "textgroup__work__lat.json", 87 | "Filename should be created in a stable and understandable manner" 88 | ) 89 | self.assertEqual( 90 | output["original-urn"], "urn:cts:latinLit:textgroup.work.version-lat1", 91 | "Original URN should be fed" 92 | ) 93 | self.assertEqual( 94 | output["urn"], "urn:cts:latinLit:textgroup.work.version-lat1-simple", 95 | "CLTK URN should be suffixed" 96 | ) 97 | self.assertEqual( 98 | output["credit"], "", 99 | "Credit should be empty by default" 100 | ) 101 | self.assertEqual( 102 | output["meta"], "book-poem-line", 103 | "meta should reflect the citation scheme" 104 | ) 105 | self.assertEqual( 106 | output["author"], "textgroup", 107 | "Author name should be the English textgroup name" 108 | ) 109 | self.assertEqual( 110 | output["work"], "work", 111 | "Work name should be the English work name" 112 | ) 113 | self.assertEqual( 114 | output["edition"], "description", 115 | "We should have the English description" 116 | ) 117 | 118 | def test_make_json_advanced(self): 119 | """ Test make json with default 
values 120 | """ 121 | resources = get_test_resources() 122 | output, filename = make_json(commit="1245", exclude=["tei:note", "tei:orig"], credit="PerseusDL", **resources) 123 | output = json.loads(output) 124 | self.assertEqual( 125 | output["text"]["0"]["0"]["0"], "Spero me secutum in libellis meis tale temperamen-", 126 | "Text passages should be parsed correctly" 127 | ) 128 | self.assertEqual( 129 | output["text"]["1"]["0"]["1"], "Et comites longae quaeris habere viae, ", 130 | "Text passages should be parsed correctly and note removed" 131 | ) 132 | self.assertEqual( 133 | output["text"]["1"]["1"]["3"], "Crede mihi, nimium Martia turba sapit. ", 134 | "Text passages should be parsed correctly and note removed" 135 | ) 136 | self.assertEqual( 137 | output["text"]["1"]["0"]["0"], "Qui tecum cupis esse meos ubicumque libellos ", 138 | "Text passages should be parsed correctly" 139 | ) 140 | self.assertEqual( 141 | filename, "textgroup__work__lat.json", 142 | "Filename should be created in a stable and understandable manner" 143 | ) 144 | self.assertEqual( 145 | output["original-urn"], "urn:cts:latinLit:textgroup.work.version-lat1", 146 | "Original URN should be fed" 147 | ) 148 | self.assertEqual( 149 | output["urn"], "urn:cts:latinLit:textgroup.work.version-lat1-simple", 150 | "CLTK URN should be suffixed" 151 | ) 152 | self.assertEqual( 153 | output["credit"], "PerseusDL", 154 | "Credit should be empty by default" 155 | ) 156 | self.assertEqual( 157 | output["meta"], "book-poem-line", 158 | "meta should reflect the citation scheme" 159 | ) 160 | self.assertEqual( 161 | output["author"], "textgroup", 162 | "Author name should be the English textgroup name" 163 | ) 164 | self.assertEqual( 165 | output["work"], "work", 166 | "Work name should be the English work name" 167 | ) 168 | self.assertEqual( 169 | output["edition"], "description", 170 | "We should have the English description" 171 | ) 172 | self.assertEqual( 173 | output["commit"], "1245", 174 | "We 
should have the commit information" 175 | ) 176 | 177 | def test_parse_directory(self): 178 | """ Ensure parse directory works 179 | """ 180 | parsed = [i for i in parse_directory("./test_data/repo")] 181 | self.assertEqual( 182 | len(parsed), 2, 183 | "There should be two texts which are found" 184 | ) 185 | 186 | def test_parse_directory_and_make_json(self): 187 | """ Test that we can reuse this for makejson 188 | """ 189 | parsed = [i for i in parse_directory("./test_data/repo")] 190 | martial = [ 191 | item for item in parsed if str(item[0].urn) == "urn:cts:latinLit:textgroup.work.version-lat1" 192 | ][0] 193 | french = [ 194 | item for item in parsed if str(item[0].urn) == "urn:cts:latinLit:groupe.oeuvre.version-lat1" 195 | ][0] 196 | json_obj, filename = make_json(*martial) 197 | json_parsed = json.loads(json_obj) 198 | self.assertEqual( 199 | json_parsed["text"]["0"]["0"]["0"], "Spero me secutum in libellis meis tale temperamen-", 200 | "Text passages should be parsed correctly" 201 | ) 202 | self.assertEqual( 203 | filename, "textgroup__work__lat.json", 204 | "Filename should be created in a stable and understandable manner" 205 | ) 206 | 207 | json_obj, filename = make_json(*french) 208 | json_parsed = json.loads(json_obj) 209 | self.assertIn( 210 | "cit, nobis magno consensu vitia", json_parsed["text"]["0"]["0"], 211 | "Text passages should be parsed correctly" 212 | ) 213 | self.assertEqual( 214 | filename, "groupe_de_texte__oeuvre__lat.json", 215 | "Filename should be created in a stable and understandable manner" 216 | ) 217 | 218 | 219 | class TestCommand(TestCase): 220 | def test_something(self): 221 | pass -------------------------------------------------------------------------------- /cltk_capitains_corpora_converter.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | """ 4 | from glob import glob 5 | import os 6 | import git 7 | import argparse 8 | import logging 9 | import json 10 | import 
traceback  # NOTE: completes the `import` keyword that ends the previous physical line of this flattened dump


from MyCapytain.resources.inventory import TextGroup, TextInventory, Work, Citation
from MyCapytain.common.reference import URN
from MyCapytain.resources.texts.local import Text


logger = logging.getLogger("cltk_capitains_corpora_converter")


def toNumber(passages):
    """ Change the reference system of MyCapytain nested dict

    Keys of the (possibly nested) dictionary are replaced by their zero-based
    position in iteration order; leaf values are kept untouched.

    :param passages: Nested dictionary of passages (typically an OrderedDict)
    :type passages: dict
    :return: Nested dictionary with integer positions as keys
    :rtype: dict
    """
    # enumerate() gives each entry its own position. Looking the value up with
    # list.index() returned the FIRST equal element, so two identical passages
    # (e.g. a repeated verse) collapsed onto one key and text was lost.
    return {
        position: toNumber(passage) if isinstance(passage, dict) else passage
        for position, passage in enumerate(passages.values())
    }


def make_json(text, textgroup, work, edition, exclude=None, credit="", commit=None):
    """ Make a json object out of a text and an inventory record

    :param text: Text object imported from CapiTainS standard
    :type text: MyCapytain.resources.texts.local.Text
    :param textgroup: Textgroup Metadata according to CapiTainS standards
    :type textgroup: MyCapytain.resources.inventory.Textgroup
    :param work: Work Metadata according to CapiTainS standards
    :type work: MyCapytain.resources.inventory.Work
    :param edition: Edition metadata according to CapiTainS standard
    :type edition: MyCapytain.resources.inventory.Edition or MyCapytain.resources.inventory.Translation
    :param exclude: Node to exclude such as tei:note
    :type exclude: list(str)
    :param credit: Line for Credit Attribution
    :type credit: str
    :param commit: Commit version
    :type commit: str
    :return: Json representation and filename
    :rtype: (str, str)
    """
    author = textgroup.metadata["groupname"]["eng"].lower()
    lang = (edition.lang or "unk").lower()
    # Kept in its own name so the `work` parameter is not silently rebound
    work_title = work.metadata["title"]["eng"].lower()

    document = {
        "original-urn": str(text.urn),
        # Make a difference between both because losing TEI is changing the object
        "urn": "{}-simple".format(str(text.urn)),
        "credit": credit,
        "meta": "-".join([citation.name or "unknown" for citation in text.citation]),
        "author": author,
        "work": work_title,
        "edition": edition.metadata["description"]["eng"],
        "text": toNumber(text.nested_dict(exclude=exclude))
    }
    if commit:
        document["commit"] = commit

    filename = "{}__{}__{}.json".format(author, work_title, lang).replace(" ", "_")
    return json.dumps(document, ensure_ascii=False, indent=4, separators=(',', ':')), filename


def parse_directory(directory):
    """ Parse a directory and yield required informations

    The directory is expected to contain ``data/<textgroup>/__cts__.xml`` and
    ``data/<textgroup>/<work>/__cts__.xml`` metadata files next to the texts.
    Parsing errors are logged and the offending resource is skipped.

    :param directory: Directory to parse
    :yields: Yields a tuple with the parsed texts and its parsed metadata
    :ytype: tuple(Text, Textgroup, Work, Edition)
    """
    textgroups = glob("{base_folder}/data/*/__cts__.xml".format(base_folder=directory))
    inventory = TextInventory()
    for __cts__ in textgroups:
        try:
            with open(__cts__) as __xml__:
                textgroup = TextGroup(resource=__xml__)
                textgroup.urn = URN(textgroup.xml.get("urn"))
            inventory.textgroups[str(textgroup.urn)] = textgroup

            for __subcts__ in glob("{parent}/*/__cts__.xml".format(parent=os.path.dirname(__cts__))):
                with open(__subcts__) as __xml__:
                    work = Work(
                        resource=__xml__,
                        parents=[inventory.textgroups[str(textgroup.urn)]]
                    )
                    work.urn = URN(work.xml.get("urn"))

                inventory.textgroups[str(textgroup.urn)].works[str(work.urn)] = work

                for __text__ in inventory.textgroups[str(textgroup.urn)].works[str(work.urn)].texts.values():
                    __text__.path = "{directory}/{textgroup}.{work}.{version}.xml".format(
                        directory=os.path.dirname(__subcts__),
                        textgroup=__text__.urn.textgroup,
                        work=__text__.urn.work,
                        version=__text__.urn.version
                    )
                    if not os.path.isfile(__text__.path):
                        # Metadata declares a text that is not on disk: nothing to convert
                        continue
                    try:
                        with open(__text__.path) as f:
                            t = Text(resource=f, urn=__text__.urn)

                        # Rebuild the citation chain from the deepest level up, so each
                        # new Citation can receive the previously built one as its child.
                        # Single quotes are normalised to double quotes in the XPaths.
                        cites = list()
                        for cite in list(t.citation)[::-1]:
                            if cites:
                                cites.append(Citation(
                                    xpath=cite.xpath.replace("'", '"'),
                                    scope=cite.scope.replace("'", '"'),
                                    name=cite.name,
                                    child=cites[-1]
                                ))
                            else:
                                cites.append(Citation(
                                    xpath=cite.xpath.replace("'", '"'),
                                    scope=cite.scope.replace("'", '"'),
                                    name=cite.name
                                ))
                        # An empty citation scheme raises IndexError here and is
                        # reported by the handler below, as before.
                        __text__.citation = cites[-1]
                        yield (
                            t,
                            inventory[str(textgroup.urn)],
                            inventory[str(work.urn)],
                            __text__
                        )
                    except Exception as E:
                        # Best effort: one broken text must not stop the conversion
                        logger.error(
                            "%s does not accept parsing at some level (most probably citation) ",
                            __text__.path
                        )
                        logger.debug("Exact error message : %s", E)
        except Exception:
            # Best effort: one broken textgroup must not stop the conversion
            logger.error("Error parsing %s ", __cts__)


def clone(repository, dest, branch=None, ref=None):
    """ Clone repository in dest folder

    :param repository: Repository to clone (eg. HTTPS addresses from GitHub)
    :param dest: Directory to clone to
    :param branch: Branch to pull, inserted as ``refs/<branch>`` (default: heads/master)
    :param ref: Exact Reference to pull (default: refs/heads/master)
    :returns: Git Repository
    :rtype: git.repo

    """
    logger.info("Cloning %s into %s", repository, dest)
    repo = git.repo.base.Repo.clone_from(
        url=repository,
        to_path=dest
    )

    # NOTE(review): nesting reconstructed from a flattened source. An explicit
    # `ref` is used verbatim; otherwise it is derived from `branch`.
    if ref is None:
        if branch is None:
            ref = "refs/heads/master"
        else:
            ref = "refs/{0}".format(branch)

    repo.remote().pull(ref)
    logger.info("Cloning done.")

    return repo


def run(directory, output=None, repository=None, nodes=None, credit=None, silent=False):
    """ Run a full repository cloning

    :param directory: Directory in which to retrieve CapiTainS resources
    :param output: Output directory where we store the converted resources (default: json-converted)
    :param repository: GIT Repository to clone
    :param nodes: Nodes to remove from TEI using a list. eg. ["tei:note"]
    :param credit: Credit line to use in output json
    :param silent: Disable logging except for errors
    """
    # Attach the stream handler only once, otherwise every call to run()
    # duplicates each log line.
    if not logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    if silent is True:
        logger.setLevel(logging.ERROR)
    else:
        logger.setLevel(logging.INFO)

    if not output:
        output = "json-converted"

    # makedirs also creates missing parents and tolerates an existing directory
    os.makedirs(output, exist_ok=True)

    if not credit:
        if repository:
            credit = "Downloaded from {}".format(repository)
        else:
            credit = ""

    # Stays None when converting a local directory. Previously `repo` was only
    # bound after cloning, so every local conversion failed with
    # UnboundLocalError and no file was ever written.
    last_commit = None
    if repository:
        repo = clone(repository, directory)
        # Per-file commit lookup is too consuming: store the last commit only
        last_commit = repo.head.commit.hexsha

    for text, textgroup, work, edition in parse_directory(directory):
        try:
            _json, filename = make_json(text, textgroup, work, edition, nodes, credit=credit, commit=last_commit)
            filepath = os.path.join(output, filename)
            # The JSON is dumped with ensure_ascii=False, so the encoding must
            # be explicit instead of depending on the locale.
            with open(filepath, "w", encoding="utf-8") as f:
                logger.info("Writing %s", filepath)
                f.write(_json)
        except Exception as E:
            # Best effort: report the failing text and carry on with the next one
            logger.error(
                "%s issued an error \n %s",
                edition.path,
                "\n".join([str(E)] + traceback.format_list(traceback.extract_tb(E.__traceback__)))
            )


def cmd():
    """ Commandline function to convert a CapiTainS Guidelines-based repository to a CLTK Corpus.
    """
    parser = argparse.ArgumentParser(description='CLTK Converter for CapiTainS based resources')
    parser.add_argument('directory', type=str,
                        help='List of path to use to populate the repository or destination directory for cloning')
    parser.add_argument('--output', type=str,
                        help='Directory in which the converted JSON files are written (default: json-converted)')
    parser.add_argument('--git', type=str, default=None, dest="repository",
                        help="Address of a repository")
    parser.add_argument('--credit', type=str, default=None,
                        help="Credit line to use in json")
    parser.add_argument('--exclude-nodes', type=str, nargs="+", default=None, dest="nodes",
                        help='Nodes to exclude from passages with "tei:" prefix, eg : --exclude-nodes tei:note tei:orig')
    parser.add_argument('--silent', action="store_true", default=False, dest="silent",
                        help='Show only errors')
    args = parser.parse_args()

    # Argument names map one-to-one onto run()'s keyword parameters
    if args.directory:
        run(**vars(args))


if __name__ == "__main__":
    cmd()