├── docs
│   └── __init__.py
├── tests
│   └── __init__.py
├── catf_feature_extractor
│   ├── __init__.py
│   ├── textApi
│   │   └── __init__.py
│   └── extractor
│       ├── __init__.py
│       └── cAtfFeatExtractor.py
├── .travis.yml
├── LICENSE
└── README.md
/docs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/catf_feature_extractor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/catf_feature_extractor/textApi/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/catf_feature_extractor/extractor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "3.3"
4 |   - "3.4"
5 |   - "3.5"
6 |   - "3.6"
7 |
8 | before_script:
9 |   - pip install --upgrade pip
10 |   - pip install codecov
11 |   - pip install coveralls
12 |
13 | script: nosetests --no-skip --with-coverage
14 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 DKE
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # c-atf-feature-extractor
2 | A Preprocessor for C-ATF files
3 |
4 | The extractor is pure Python: it only uses four standard-library modules (itertools, re, json, unittest), so it is about as light as it gets. It takes a C-ATF text and spits out a huge, heavily nested Python dictionary. An API that makes the output easier for humans to use will be added in the future.
5 |
6 | **The preprocessor is in its alpha stage; don't use it for scholarly work yet. Once the Version 0.1 Beta milestone is reached, it will be usable for scholarly work. Major releases, like 0.1b or 0.1, will also be contributed to the cltk repository.**
7 |
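To give an idea of the input format, here is a small, made-up C-ATF fragment (not a real CDLI text) that uses only constructs discussed below: an id line, the language protocol, object and part markers, numbered transliteration lines, an underscore-delimited Another Language stretch, and a structure comment.

```
&P000001 = Made-up example text
#atf: lang akk
@tablet
@obverse
1. a-na {d}utu be-li2-ia
2. _an kur_u2 4(asz) sze
$ rest broken
```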
8 | **The resulting dictionary contains the following information about the text**:
9 |
10 | ### On Text Level:
11 |
12 | - text_textPartCount: Number of parts in a text, like @obverse, @reverse, etc.
13 |
14 | - text_totalSignOccuranceCount: Total count of sign occurrences. This is to be distinguished from the number of distinct signs used in a document: a text might use 30 signs, each of them occurring 3 times, so the total occurrence count would be 90. The number of distinct signs will also be provided later on.
15 |
16 | - text_RelSignPositions: Relative sign positions with respect to text level, meaning that the signs are counted throughout the text without regard to changes of line or part. If line 1 has 20 sign occurrences, the counter for line 2 continues from where line 1 left off.
17 |
18 | - text_RelWordPositions: Relative word positions with respect to text level. Same as the signs, but for words.
19 |
20 | - text_language: Gives the language specified in the protocol section of the C-ATF document, e.g. #atf: lang akk.
21 |
22 | - text_totalLineCount: Gives the total number of lines in the text.
23 |
24 | - text_objectType: Gives the object type stated with @, e.g. @tablet, etc.
25 |
26 | - text_id: Gives the text id, e.g. P480793.
27 |
28 | - text_RelLinePositions: Relative line positions with respect to text level. Same as the signs, but for lines.
29 |
30 | - text_textParts: A list containing the parts of the text. Parts are stored as dictionaries. A sketch of the top-level dictionary is given after this list.
31 |
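To give a rough idea of the shape of the output, the top level of the dictionary might look roughly like the sketch below. The key names are the ones listed above; all values are invented for illustration.

```python
# Illustrative only: key names as documented above, values invented.
example_text_dict = {
    "text_id": "P480793",
    "text_language": "akk",
    "text_objectType": "@tablet",
    "text_textPartCount": 2,             # e.g. @obverse and @reverse
    "text_totalLineCount": 34,
    "text_totalSignOccuranceCount": 90,  # e.g. 30 distinct signs occurring 3 times each
    "text_RelSignPositions": [],         # sign positions counted across the whole text
    "text_RelWordPositions": [],
    "text_RelLinePositions": [],
    "text_textParts": [],                # one part dict per @obverse, @reverse, ...
}
```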
32 | -----
33 |
34 | ### On Part Level:
35 |
36 | - part_partTitle: Gives the title of the part indicated with @, e.g. @obverse, @reverse, etc.
37 |
38 | - part_partString: Gives the text of the part as a string.
39 |
40 | - part\_AL\_occurances: Gives the list of Another Language (AL) occurrences. These are indicated with _ in C-ATF; they can span multiple lines and contain a language switch. The features they contain are explained below.
41 |
42 | - part_RelSignPositions: Relative sign positions with respect to part level. Same as at text level, but this time the counter starts from zero for each part.
43 |
44 | - part_RelWordPositions: Relative word positions with respect to part level. Same as the signs, but for words.
45 |
46 | - part_partLines: A list of dictionaries. Each dictionary holds information about an individual line.
47 |
48 | #### AL Occurrences:
49 |
50 | Another Language occurrences can be spread over multiple lines, so they are handled at part level rather than at line level.
51 |
52 | The key part\_AL\_occurances gives a list of lists of dicts. Each list of dicts represents an AL occurrence, and each dict represents a word. A sketch of such a dict is given after the key list below.
53 | **The structure of the dict is as follows**:
54 |
55 | - alWord\_AlOc: A string representation of the entire Another Language occurrence. For an AL occurrence of \_an kur\_u2, the dictionary might belong to the word kur\_u2, but this key still contains \_an kur\_u2.
56 |
57 | - alWord\_AlOc\_Position: Contains a dictionary with the keys alWord\_Position and totalWords\_AlOc. The first indicates the position of the AL word inside the AL occurrence; the second gives the total number of AL words inside the AL occurrence.
58 |
59 | - alWord\_LineNumber: Stores the number of the line that contains the AL word.
60 |
61 | - alWord\_LinePosition: Contains a dictionary with the keys alWord\_Position and totalWords\_Line. The first indicates the position of the AL word inside the line; the second gives the total number of words in the line.
62 |
63 | - alWord\_language: Stores the language of the Another Language occurrence if it is stated; e.g. if there is something like %hit right after the underscore, the value of this key would be hit.
64 |
65 | - alWord\_textLanguage: Stores the language of the text, as indicated in the protocol.
66 |
67 | - alWord\_word: Stores the AL word, so for an AL occurrence of \_an kur\_u2, this key would contain only kur\_u2.
68 |
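Putting these keys together, the dict for the word kur\_u2 inside the \_an kur\_u2 example might look roughly like this (line numbers, positions, and the AL language are invented for illustration):

```python
# Illustrative only: keys as documented above, values invented.
example_alWord_dict = {
    "alWord_word": "kur_u2",
    "alWord_AlOc": "_an kur_u2",
    "alWord_AlOc_Position": {"alWord_Position": 1, "totalWords_AlOc": 2},
    "alWord_LineNumber": 3,
    "alWord_LinePosition": {"alWord_Position": 4, "totalWords_Line": 6},
    "alWord_language": "sux",      # language of the AL occurrence, if one can be determined
    "alWord_textLanguage": "akk",  # language stated in the #atf: lang protocol
}
```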
69 | ---
70 |
71 | ### On Line Level:
72 |
73 | - lineText: Contains the string representation of the line.
74 | - lineNumber: Contains the line number.
75 | - isLineStructure: Stores a boolean value. Indicates whether the line is a comment about the structure, indicated with $ in C-ATF.
76 | - isLineContent: Stores a boolean value. Indicates whether the line is a comment about the content of another line, indicated with # in C-ATF.
77 | - lineWordCount: Stores the total word count of the line.
78 | - lineWordPos: Line word positions. Stores the words together with their positions in the line; unlike lineWords, order and duplicates are kept.
79 | - lineWords: Stores a list of dictionaries. Each dictionary represents a unique word in the line.
80 | - line\_RelSignPositions: Relative sign positions with respect to line level. Same as at part level, but this time the counter resets to 0 at the beginning of each line.
81 |
82 | -------
83 |
84 | ### On Word Level:
85 |
86 | The word dictionary includes some features from the sign dictionary to facilitate later processing.
87 |
88 | - word\_hasComplement: Boolean value. True if the word has a complement indicated with +.
89 | - word\_Signs: Stores a list of dictionaries. Each dictionary represents a unique sign.
90 | - word\_hasUnknownReading: Boolean value. True if the word has an uppercase reading outside of a compound sign.
91 | - word\_hasNutillu: Boolean value. True if the word has a sign with the nutillu modifier.
92 | - word\_hasRotated: Boolean value. True if the word has a sign with the rotation modifier.
93 | - word\_hasAllograph: Boolean value. True if the word has a sign with the allograph indicator ~.
94 | - word\_hasDamage: Boolean value. True if the word has a sign with #.
95 | - word\_punctuationDict: Stores a dictionary with information about the punctuation. Its keys are punctuation\_punctElement and punctuation\_punctGrapheme. The first contains the value of the punctuation without the qualifying grapheme, e.g. *, /; the second contains the grapheme qualifying the punctuation, e.g. the disz in *(disz).
96 | - word\_hasSpecification: Boolean value. True if the word has parentheses in it.
97 | - word\_wordSignCount: Stores the sign occurrence count for a word.
98 | - word\_hasGunu: Boolean value. True if the word has a sign with the gunu modifier.
99 | - word\_isSpecifiedWordDivider: Boolean value. True if the word has the structure /(GRAPHEME).
100 | - word\_signRelations: Stores a list of dictionaries. Each dictionary represents a relation indicated with an operator. The contained relation types are sign-sign, sign-group, group-sign, and group-group.
101 | - word\_hasKabatenu: Boolean value. True if the word has a sign with the kabatenu modifier.
102 | - word\_isDColon: Boolean value. True if the word is the double colon punctuation '::'.
103 | - word\_hasJoining: Boolean value. True if the word has the joining operator in it.
104 | - word\_hasFlat: Boolean value. True if the word has a sign with the flat modifier.
105 | - word\_hasCrossing: Boolean value. True if the word has the crossing operator.
106 | - word\_hasCollation: Boolean value. True if the word has a collation indicated with '*'.
107 | - word\_hasComposite: Boolean value. True if the word has a compound sign in it.
108 | - word\_hasVertReflected: Boolean value. True if the word has a sign with the vertically reflected modifier.
109 | - word\_hasQuery: Boolean value. True if the word has a sign with ?.
110 | - word\_hasFormVariant: Boolean value. True if the word has \.
111 | - word\_hasSpecialAllograph: Boolean value. True if the word has a special allograph.
112 | - word\_determinatives: Stores a list of tuples of dicts. Each dict represents a sign in a determinative, and each tuple represents a determinative. The keys of the determinative sign dictionary are explained below.
113 | - word\_numberDict: Stores a dict with the keys number\_repetitionCount and number\_grapheme. The first indicates the repetition count of a number, e.g. n+1, n, 4, etc.; the second indicates the grapheme of the number, e.g. asz in 4(asz).
114 | - word\_hasCurved: Boolean value. True if the word has a sign with the curved modifier.
115 | - word\_hasContaining: Boolean value. True if the word has the containing operator.
116 | - word\_hasCorrection: Boolean value. True if the word has a sign with !.
117 | - word\_word: String representation of the whitespace-delimited word.
118 | - word\_hasSheshig: Boolean value. True if the word has a sign with the sheshig modifier.
119 | - word\_isColon: Boolean value. True if the word is punctuation of the type :.
120 | - word\_isBulletSpecified: Boolean value. True if the word is punctuation of the type *(GRAPHEME).
121 | - word\_wordSignsPos: List of tuples. Stores the signs with their relative positions inside the word.
122 | - word\_isNumber: Boolean value. True if the word belongs to one of the three number types specified by the Grapheme Description Language of ORACC.
123 | - word\_wordLang: Stores the language of the word. This can be different from the text language if the word is inside an AL occurrence.
124 | - word\_hasVariant: Boolean value. True if the word has a sign with the variant modifier.
125 | - word\_hasAbove: Boolean value. True if the word has the above operator.
126 | - word\_hasTenu: Boolean value. True if the word has a sign with the tenu modifier.
127 | - word\_isColonDQ: Boolean value. True if the word is punctuation of the type :".
128 | - word\_hasHorReflected: Boolean value. True if the word has a sign with the horizontally reflected modifier.
129 | - word\_isColonRQ: Boolean value. True if the word is punctuation of the type :' or MZL592~b.
130 | - word\_hasBeside: Boolean value. True if the word has the beside operator.
131 | - word\_isBullet: Boolean value. True if the word is punctuation of the type *.
132 | - word\_hasContainingGroup: Boolean value. True if the word has the containing operator with parentheses.
133 | - word\_isWordDivider: Boolean value. True if the word is an unspecified word divider.
134 | - word\_hasZidatenu: Boolean value. True if the word has a sign with the zidatenu modifier.
135 |
136 | #### Determinatives:
137 |
138 | Each determinative of the word is a tuple, which contains dictionaries representing signs. An example is given after the key list below.
139 |
140 | **The structure of the dictionary is as follows**:
141 |
142 | - detSign\_DetPosition: Stores a dictionary with the keys detSign\_position and totalSigns\_determinative. The first contains the position of the sign inside the determinative; the second contains the total number of signs inside the determinative.
143 | - detSign\_WordPosition: Stores a dictionary with the keys detSign\_position and totalSigns\_word. The first stores the position of the sign inside the word; the second contains the total number of signs inside the word.
144 | - detSign\_det: Stores the string representation of the determinative.
145 | - detSign\_detMark: Stores a string that can have three values: Inpos, postpos, prepos. Prepos is for determinatives at the beginning of a word, postpos for determinatives at the end of a word, and Inpos for determinatives that are neither at the beginning nor at the end of the word. The latter may be used, for example, for determinatives that follow other determinatives inside a word.
146 | - detSign\_detSign: Stores the string representation of the sign of the determinative to which the dictionary is dedicated.
147 | - detSign\_det\_WordPos: Stores a tuple containing the beginning and the end of the sign range of the determinative that contains the above-mentioned sign. E.g. for a made-up word like {gesz}{an-il-hal}sza-pa-ra-ku2-me-{mesz}, the dictionary concerning il of {an-il-hal} would contain (1,3), since gesz is in position 0.
148 |
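For the made-up word above, the dict for the sign il inside {an-il-hal} might look roughly like this (positions are 0-based, following the (1,3) example; the exact counts are illustrative):

```python
# Illustrative only: keys as documented above, counts invented but consistent
# with the {gesz}{an-il-hal}sza-pa-ra-ku2-me-{mesz} example.
example_detSign_dict = {
    "detSign_detSign": "il",
    "detSign_det": "{an-il-hal}",
    "detSign_detMark": "Inpos",   # follows the prepositioned {gesz}
    "detSign_DetPosition": {"detSign_position": 1, "totalSigns_determinative": 3},
    "detSign_WordPosition": {"detSign_position": 2, "totalSigns_word": 10},
    "detSign_det_WordPos": (1, 3),
}
```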
149 | ----
150 |
151 | ### On Sign Level:
152 |
153 | Apart from the 'is' versions of the sign features indicated at word level (e.g. isDamaged instead of hasDamage), the sign dictionary has the following
154 | information keys:
155 |
156 | - sign\_isPartOfCompound: Boolean value. True if the sign is part of a compound sign, e.g. KA in KAxIR2.
157 | - sign\_nestLevel: Stores the nest level of the sign if the sign is contained in a compound sign involving groups, e.g. 1 for KA in IR3x(AN.KA). The complete compound sign is considered level 0, and each balanced pair of parentheses counts as one additional nesting level.
158 | - sign\_relatedSigns: Stores a dictionary. Its keys are explained below.
159 | - sign\_sign: Stores the string representation of the sign.
160 | - sign\_compoundSign: Stores the string representation of the compound sign if the sign is part of a compound sign.
161 |
162 | #### Sign Relations:
163 |
164 | These are recorded at two levels: word level and sign level. The word-level representation contains group-group relations, which cannot be captured at sign level. A more elegant solution would be to implement a compoundSignHandler class for this case; this will most probably be done in the future. However, the sign relation dictionaries contained at both levels have the same keys. A sketch of such a dictionary is given after the key list below.
165 |
166 | **The structure of the sign relation dictionary is as follows**:
167 |
168 | - SR\_operator: String representation of the operator, e.g. +, x, %, etc.
169 | - SR\_operator\_antec: String representation of the characters before the operator.
170 | - SR\_operator\_subsq: String representation of the characters after the operator.
171 | - SR\_nest\_level: Indicates the nest level of the sign relation occurrence.
172 | - SR\_nest\_content: Stores the string representation of the text content in which a sign relation occurs.
173 | - SR\_compoundSign: Stores the string representation of the compound sign in which the nested sign relation occurrence is observed.
174 | - SR\_nest\_range: Stores the character range of the nest in which the sign relation is observed.
175 | - SR\_relation\_type: Stores a dictionary with the keys operator\_antecedent and operator\_subsequent. The first indicates whether a group or a sign comes before the operator; the second indicates whether a group or a sign comes after the operator.
176 | - SR\_operator\_position: Stores the character position of the operator. The position is relative to the compound sign's character range.
177 | - SR\_operator\_type: Stores the operator type, e.g. crossing, above, joining, etc.
178 | - SR\_operator\_antec\_range: Stores the character range of the elements that come before the operator.
179 | - SR\_operator\_subseq\_range: Stores the character range of the elements that come after the operator.
180 |
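For a simple compound sign like KAxIR2, a sign relation dict might look roughly like this. Only the key names come from the list above; the character offsets and the exact range convention shown here are purely illustrative.

```python
# Illustrative only: offsets invented, key names as documented above.
example_SR_dict = {
    "SR_operator": "x",
    "SR_operator_type": "containing",
    "SR_operator_antec": "KA",
    "SR_operator_subsq": "IR2",
    "SR_relation_type": {"operator_antecedent": "sign", "operator_subsequent": "sign"},
    "SR_compoundSign": "KAxIR2",
    "SR_nest_level": 0,            # the complete compound sign counts as level 0
    "SR_nest_content": "KAxIR2",
    "SR_nest_range": (0, 6),
    "SR_operator_position": 2,
    "SR_operator_antec_range": (0, 2),
    "SR_operator_subseq_range": (3, 6),
}
```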
181 | ----
182 |
183 | ## Usage Example:
184 |
185 | For now the parser is designed for documents containing a single text, like the one [here](http://cdli.ucla.edu/search/archival_view.php?ObjectID=P480793 "Random Example from CDLI").
186 |
187 | For now I am more concerned with fine-tuning the parser than with supporting multiple documents at once, because supporting multiple documents is quite easy: it would only take a couple of lines added to the initial section getter.
188 |
189 | To use the feature extractor on a raw text like this [one](https://gist.github.com/D-K-E/dc7f5fcb7815b1e52bfb4c763bb0b3ac "Downloaded Single Text From CDLI"), do the following:
190 |
191 | Import cAtfTextBuilder from cAtfFeatExtractor.py in the extractor module, then build the text dictionary:
192 |
193 | ```python
194 | from catf_feature_extractor.extractor.cAtfFeatExtractor import cAtfTextBuilder
195 | with open("Archival view of P462811.txt", "r", encoding="utf-8", newline="\n") as cAtfFile:
196 |     test_file = cAtfFile.read()
197 |
198 | test_textClass = cAtfTextBuilder(test_file)
199 |
200 | test_text = test_textClass.buildTextDict_SP()
201 |
202 | ```
203 |
204 | If you use `FP` (First Pass) instead of `SP` (Second Pass) at the end of `buildTextDict_`, the resulting dictionary will not contain the relative sign positions and the total counts for the different levels. Use it if you don't need those.
205 |
--------------------------------------------------------------------------------
/catf_feature_extractor/extractor/cAtfFeatExtractor.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | A pure python feature extractor for c-atf encoded texts in order to
4 | train classifiers.
5 |
6 | """
7 |
8 | # Packages ------------------------------
9 |
10 | import re
11 | import itertools
12 |
13 | # ---------------------------------
14 |
15 | # Credits --------------------------------
16 |
17 | __author__ = "Doğu Kaan Eraslan, "
18 | __license__ = "MIT, see LICENSE"
19 |
20 | # -----------------------------------------
21 |
22 |
23 |
24 | class cAtfLineTester(object):
25 |     """
26 |     a class for testing lines of c-atf texts
27 |     """
28 |     def __init__(self, atf_line):
29 |         #
30 |         self.cAtf_line = atf_line
31 |         #
32 |         #
33 |     def test_id_line(self):
34 |         """
35 |         params: atf_line, str.
36 |         return: boolean
37 |
38 |         Tests if the line starts with &, the id marker.
39 |         """
40 |         #
41 |         find_id_line = re.search(r"(^&P\d+)", self.cAtf_line)
42 |         #
43 |         if find_id_line is None:
44 |             return False
45 |         else:
46 |             return True
47 |         #
48 |         #
49 |     def test_language_line(self):
50 |         """
51 |         params: atf_line, str.
52 | return: boolean 53 | 54 | tests if the line gives the language of the text 55 | """ 56 | # 57 | find_lang_line = re.search("atf: lang", self.cAtf_line) 58 | # 59 | if find_lang_line is None: 60 | return False 61 | else: 62 | return True 63 | # 64 | # 65 | def test_line_content(self): 66 | """ 67 | params: atf_line, str. 68 | return: boolean 69 | 70 | Tests if the line is commentary about the line content 71 | """ 72 | # 73 | find_line_content_comment = re.search("^#.*", self.cAtf_line) 74 | # 75 | if find_line_content_comment is None: 76 | return False 77 | else: 78 | return True 79 | # 80 | # 81 | def test_object_type_object_part(self): 82 | """ 83 | params: atf_line, str. 84 | return: boolean 85 | 86 | Tests if the line indicates the object type or object part 87 | """ 88 | # 89 | find_object_type_part = re.search("^@.*", self.cAtf_line) 90 | # 91 | if find_object_type_part is None: 92 | return False 93 | else: 94 | return True 95 | # 96 | # 97 | def test_text_structure(self): 98 | """ 99 | params: atf_line, str. 100 | return: boolean 101 | 102 | Tests if the line belongs to a commentary on the text structure 103 | """ 104 | # 105 | find_text_structure = re.search("^\$.*", self.cAtf_line) 106 | # 107 | if find_text_structure is None: 108 | return False 109 | else: 110 | return True 111 | # 112 | # 113 | def test_text_line(self): 114 | """ 115 | params: atf_line, str. 116 | return: boolean 117 | 118 | Tests if the line belongs to a translitteration of a text 119 | """ 120 | # 121 | find_text_line = re.search("^\d.*", self.cAtf_line) 122 | # 123 | if find_text_line is None: 124 | return False 125 | else: 126 | return True 127 | # 128 | # 129 | def test_lineHas_anotherLanguage(self): 130 | """ 131 | params: atf_line, str. 132 | return: boolean 133 | """ 134 | # 135 | find_logogram = re.search("_.*?_", self.cAtf_line) 136 | # 137 | if find_logogram is None: 138 | return False 139 | else: 140 | return True 141 | 142 | 143 | 144 | class cAtfALTester(object): 145 | """ 146 | class for handling another languages in the lines 147 | """ 148 | # 149 | def __init__(self): 150 | # 151 | self.cAtf_line = "" 152 | self.al_oc = "" 153 | self.al_word = "" 154 | # 155 | # 156 | # 157 | def testALHasPreSign(self, al_oc): 158 | """ 159 | params: atf_line, str. 160 | return: boolean 161 | 162 | Test to see if the another language occurence 163 | has a preeceding sign 164 | """ 165 | # 166 | if "-_" in self.cAtf_line or "-_" in al_oc: 167 | return True 168 | else: 169 | return False 170 | # 171 | def test_ALHasFolSign(self,al_oc): 172 | """ 173 | params: atf_line, str. 174 | return: boolean 175 | 176 | Test to see if the another language occurence 177 | has a preeceding sign 178 | 179 | """ 180 | # 181 | if "_-" in self.cAtf_line or "_-" in al_oc: 182 | return True 183 | else: 184 | return False 185 | # 186 | # 187 | @staticmethod 188 | def test_ALSwitch(cAtf_alWord): 189 | """ 190 | params: atf_logogram, str. 
191 | return: boolean 192 | 193 | Tests if the occurence have a 194 | language switch 195 | """ 196 | # 197 | if "%" in cAtf_alWord: 198 | return True 199 | else: 200 | return False 201 | 202 | 203 | # ----------------------------------------- 204 | 205 | 206 | class cAtfWordTester(object): 207 | """ 208 | Class for testing the signs in a word 209 | """ 210 | # 211 | def __init__(self, catf_word): 212 | # 213 | self.cAtf_word = catf_word 214 | # 215 | # 216 | @staticmethod 217 | def test_String(string1,string2): 218 | """ 219 | Returns true if string2 220 | contains string1 221 | """ 222 | # 223 | if string1 in string2: 224 | return True 225 | else: 226 | return False 227 | # 228 | def test_damaged_sign(self): 229 | """ 230 | params: atf_word, str. 231 | return: boolean 232 | """ 233 | # 234 | find_damage_sign = re.search("\w+#", self.cAtf_word) 235 | # 236 | if find_damage_sign is None: 237 | return False 238 | else: 239 | return True 240 | # 241 | # 242 | def test_determinative_sign(self): 243 | """ 244 | params: atf_word, str. 245 | return: boolean 246 | """ 247 | # 248 | find_determinative_sign = re.search("\{\w.*?\}",self.cAtf_word) 249 | # 250 | if find_determinative_sign is None: 251 | return False 252 | else: 253 | return True 254 | # 255 | # 256 | def test_isNumber(self): 257 | """ 258 | params: atf_word, str. 259 | return: boolean 260 | """ 261 | # 262 | number_form_1 = re.search("\d+\(\w+.*?\)", self.cAtf_word) 263 | number_form_2 = re.search("n\(\w+.*?\)", self.cAtf_word) 264 | number_form_3 = re.search("n\+\d+\(\w+.*?\)", self.cAtf_word) 265 | # 266 | if number_form_1 is None and number_form_2 is None and number_form_3 is None: 267 | return False 268 | else: 269 | return True 270 | # 271 | def test_isNumberF1(self): 272 | """ 273 | params: atf_word, str. 274 | return: boolean 275 | """ 276 | number_form_1 = re.search("\d+\(\w+.*?\)", self.cAtf_word) 277 | # 278 | if number_form_1 is None: 279 | return False 280 | else: 281 | return True 282 | # 283 | def test_isNumberF2(self): 284 | """ 285 | params: atf_word, str. 286 | return: boolean 287 | """ 288 | # 289 | number_form_2 = re.search("n\(\w+.*?\)", self.cAtf_word) 290 | # 291 | if number_form_2 is None: 292 | return False 293 | else: 294 | return True 295 | # 296 | def test_isNumberF3(self): 297 | """ 298 | params: atf_word, str. 
299 | return: boolean 300 | """ 301 | # 302 | number_form_3 = re.search("n\+\d+\(\w+.*?\)", self.cAtf_word) 303 | # 304 | if number_form_3 is None: 305 | return False 306 | else: 307 | return True 308 | 309 | # Punctuation Tests ----------- 310 | # 311 | def test_isColon(self): 312 | """ 313 | returns true if the word 314 | has : 315 | """ 316 | # 317 | if ":" == self.cAtf_word or "P2" == self.cAtf_word: 318 | return True 319 | else: 320 | return False 321 | # 322 | # 323 | def test_isDColon(self): 324 | """ 325 | returns true if the word 326 | is :: 327 | """ 328 | # 329 | if "::" == self.cAtf_word: 330 | return True 331 | else: 332 | return False 333 | # 334 | def test_isColonRQ(self): 335 | """ 336 | returns true if the word 337 | is :' 338 | """ 339 | # 340 | if ":'" == self.cAtf_word or "MZL592~b" == self.cAtf_word: 341 | return True 342 | else: 343 | return False 344 | # 345 | def test_isColonDQ(self): 346 | """ 347 | returns true if the word 348 | is :" 349 | """ 350 | # 351 | if ':"' == self.cAtf_word or "P3" == self.cAtf_word: 352 | return True 353 | else: 354 | return False 355 | # 356 | def test_isDoubleColon(self): 357 | """ 358 | returns true if the word 359 | is :: 360 | """ 361 | if "::" == self.cAtf_word: 362 | return True 363 | else: 364 | return False 365 | # 366 | # 367 | def test_isColonPoint(self): 368 | """ 369 | returns true if the word 370 | is :. 371 | """ 372 | # 373 | if ":." == self.cAtf_word or "P4" == self.cAtf_word: 374 | return True 375 | else: 376 | return False 377 | # 378 | def test_isWordDivider(self): 379 | """ 380 | returns true if the word 381 | is / 382 | """ 383 | if "/" == self.cAtf_word or "P1" == self.cAtf_word: 384 | return True 385 | else: 386 | return False 387 | # 388 | def test_isWordDivider_Specified(self): 389 | """ 390 | returns true if the word has 391 | /( 392 | """ 393 | # 394 | return self.test_String("/(", self.cAtf_word) 395 | # 396 | def test_isBullet(self): 397 | """ 398 | returns true if the word is 399 | * 400 | """ 401 | # 402 | if "*" == self.cAtf_word: 403 | return True 404 | else: 405 | return False 406 | # 407 | def test_isBulletSpeficied(self): 408 | """ 409 | checks if the word has the 410 | structure of *(Grapheme) 411 | """ 412 | # 413 | if re.search("^\*\(.*?\)", self.cAtf_word) is not None: 414 | return True 415 | else: 416 | return False 417 | # 418 | 419 | 420 | # 421 | # Individual Sign Tests ------------ 422 | # 423 | def test_has_complement(self): 424 | """ 425 | Returns true if the sign has 426 | + 427 | """ 428 | # 429 | if "+" in self.cAtf_word: 430 | return True 431 | else: 432 | return False 433 | # 434 | # 435 | def test_has_unknownReading(self): 436 | """ 437 | Returns true if the sign 438 | is uppercase 439 | """ 440 | # 441 | if self.cAtf_word.isupper() is True: 442 | return True 443 | else: 444 | return False 445 | # 446 | # 447 | def test_has_composite(self): 448 | """ 449 | Returns true if the sign 450 | has | 451 | """ 452 | # 453 | return self.test_String("|", self.cAtf_word) 454 | # 455 | def test_has_specification(self): 456 | """ 457 | Returns true if the sign 458 | has ( 459 | """ 460 | # 461 | return self.test_String("(", self.cAtf_word) 462 | # 463 | def test_has_query(self): 464 | """ 465 | Returns true if the sign 466 | has ? 
467 | """ 468 | # 469 | return self.test_String("?", self.cAtf_word) 470 | # 471 | def test_has_collation(self): 472 | """ 473 | returns true if the sign 474 | has * 475 | """ 476 | # 477 | return self.test_String("*", self.cAtf_word) 478 | # 479 | def test_has_correction(self): 480 | """ 481 | returns true if the sign 482 | has ! 483 | """ 484 | # 485 | return self.test_String("!", self.cAtf_word) 486 | # 487 | def test_hasCurved(self): 488 | """ 489 | returns true if the sign 490 | has @c 491 | """ 492 | # 493 | return self.test_String("@c", self.cAtf_word) 494 | # 495 | def test_hasFlat(self): 496 | """ 497 | returns true if the sign 498 | has @f 499 | """ 500 | # 501 | return self.test_String("@f", self.cAtf_word) 502 | # 503 | def test_hasGunu(self): 504 | """ 505 | returns true if the sign has 506 | @g 507 | """ 508 | return self.test_String("@g", self.cAtf_word) 509 | # 510 | def test_hasSheshig(self): 511 | """ 512 | returns true if the sign has 513 | @s 514 | """ 515 | # 516 | return self.test_String("@s", self.cAtf_word) 517 | # 518 | def test_hasTenu(self): 519 | """ 520 | returns true if the sign has 521 | @t 522 | """ 523 | # 524 | return self.test_String("@t", self.cAtf_word) 525 | # 526 | def test_hasNutillu(self): 527 | """ 528 | returns true if the sign has 529 | @n 530 | """ 531 | # 532 | return self.test_String("@n", self.cAtf_word) 533 | # 534 | def test_hasZidatenu(self): 535 | """ 536 | returns true if the sign has 537 | @z 538 | """ 539 | # 540 | return self.test_String("@z", self.cAtf_word) 541 | # 542 | def test_hasKabatenu(self): 543 | """ 544 | returns true if the sign 545 | has @k 546 | """ 547 | # 548 | return self.test_String("@k", self.cAtf_word) 549 | # 550 | def test_hasVertReflected(self): 551 | """ 552 | returns true if the sign 553 | has @r 554 | """ 555 | # 556 | return self.test_String("@r", self.cAtf_word) 557 | # 558 | def test_hasHorReflected(self): 559 | """ 560 | returns true if the sign 561 | has @h 562 | """ 563 | # 564 | return self.test_String("@h", self.cAtf_word) 565 | # 566 | def test_hasVariant(self): 567 | """ 568 | returns true if the sign 569 | has @v 570 | """ 571 | # 572 | return self.test_String("@v", self.cAtf_word) 573 | # 574 | def test_hasRotated(self): 575 | """ 576 | returns true if the 577 | sign has @\d+ 578 | """ 579 | # 580 | if re.search("@\d+",self.cAtf_word) is not None: 581 | return True 582 | else: 583 | return False 584 | # 585 | # Compound Sign Tests ------------------ 586 | # 587 | def test_hasBeside(self): 588 | """ 589 | returns true if the 590 | sign has . 
591 | """ 592 | # 593 | return self.test_String(".", self.cAtf_word) 594 | # 595 | def test_hasJoining(self): 596 | """ 597 | returns true if 598 | the sign has + 599 | """ 600 | # 601 | return self.test_String("+", self.cAtf_word) 602 | # 603 | def test_hasAbove(self): 604 | """ 605 | returns true if the sign 606 | has & 607 | """ 608 | # 609 | return self.test_String("&", self.cAtf_word) 610 | # 611 | def test_hasCrossing(self): 612 | """ 613 | returns true if the sign 614 | has % 615 | """ 616 | # 617 | return self.test_String("%", self.cAtf_word) 618 | # 619 | def test_hasAllograph(self): 620 | """ 621 | returns true if the sign 622 | has ~ 623 | """ 624 | # 625 | return self.test_String("~", self.cAtf_word) 626 | # 627 | def test_hasSpecialAllograph(self): 628 | """ 629 | returns true if the sign 630 | has ~v 631 | """ 632 | # 633 | return self.test_String("~v", self.cAtf_word) 634 | # 635 | def test_hasFormVariant(self): 636 | """ 637 | returns true if the sign 638 | has \ 639 | """ 640 | # 641 | return self.test_String("\\", self.cAtf_word) 642 | # 643 | def test_hasContaining(self): 644 | """ 645 | returns true if the sign 646 | has x 647 | """ 648 | # 649 | return self.test_String("x", self.cAtf_word) 650 | # 651 | def test_hasContaining_Group(self): 652 | """ 653 | returns true if the sign 654 | has x( 655 | """ 656 | # 657 | return self.test_String("x(", self.cAtf_word) 658 | # 659 | 660 | # TODO Take the sign from numbers 661 | 662 | 663 | # ---------------------------------------------------- 664 | 665 | class cAtfSignTester(object): 666 | """ 667 | Class for testing signs in order to buildinga sign dict afterwards 668 | """ 669 | # 670 | def __init__(self, cAtf_Sign): 671 | # 672 | self.catf_sign = cAtf_Sign 673 | # 674 | # 675 | @staticmethod 676 | def test_String(string1,string2): 677 | """ 678 | Returns true if string2 679 | contains string1 680 | """ 681 | # 682 | if string1 in string2: 683 | return True 684 | else: 685 | return False 686 | # 687 | def test_isDamaged(self): 688 | """ 689 | Returns true if the self.catf_sign 690 | has # 691 | """ 692 | # 693 | return self.test_String("#", self.catf_sign) 694 | # 695 | # 696 | def test_isComplement(self): 697 | """ 698 | Returns true if the self.catf_sign has 699 | + 700 | """ 701 | # 702 | if self.test_String("+", self.catf_sign) and self.test_isComposite(): 703 | return True 704 | else: 705 | return False 706 | # 707 | def test_isUnknownReading(self): 708 | """ 709 | Returns true if the self.catf_sign 710 | is uppercase 711 | """ 712 | # 713 | if self.catf_sign.isupper() is True: 714 | return True 715 | else: 716 | return False 717 | # 718 | # 719 | def test_isComposite(self): 720 | """ 721 | Returns true if the self.catf_sign 722 | has | 723 | """ 724 | # 725 | return self.test_String("|", self.catf_sign) 726 | # 727 | def test_isSpecification(self): 728 | """ 729 | Returns true if the self.catf_sign 730 | has ( 731 | """ 732 | # 733 | return self.test_String("(", self.catf_sign) 734 | # 735 | def test_is_query(self): 736 | """ 737 | Returns true if the self.catf_sign 738 | has ? 739 | """ 740 | # 741 | return self.test_String("?", self.catf_sign) 742 | # 743 | def test_is_collation(self): 744 | """ 745 | returns true if the self.catf_sign 746 | has * 747 | """ 748 | # 749 | return self.test_String("*", self.catf_sign) 750 | # 751 | def test_is_correction(self): 752 | """ 753 | returns true if the self.catf_sign 754 | has ! 
755 | """ 756 | # 757 | return self.test_String("!", self.catf_sign) 758 | # 759 | # Modifier Tests ------------------------ 760 | # 761 | def test_isCurved(self): 762 | """ 763 | returns true if the self.catf_sign 764 | has @c 765 | """ 766 | # 767 | return self.test_String("@c", self.catf_sign) 768 | # 769 | def test_isFlat(self): 770 | """ 771 | returns true if the self.catf_sign 772 | has @f 773 | """ 774 | # 775 | return self.test_String("@f", self.catf_sign) 776 | # 777 | def test_isGunu(self): 778 | """ 779 | returns true if the self.catf_sign has 780 | @g 781 | """ 782 | return self.test_String("@g", self.catf_sign) 783 | # 784 | def test_isSheshig(self): 785 | """ 786 | returns true if the self.catf_sign has 787 | @s 788 | """ 789 | # 790 | return self.test_String("@s", self.catf_sign) 791 | # 792 | def test_isTenu(self): 793 | """ 794 | returns true if the self.catf_sign has 795 | @t 796 | """ 797 | # 798 | return self.test_String("@t", self.catf_sign) 799 | # 800 | def test_isNutillu(self): 801 | """ 802 | returns true if the self.catf_sign has 803 | @n 804 | """ 805 | # 806 | return self.test_String("@n", self.catf_sign) 807 | # 808 | def test_isZidatenu(self): 809 | """ 810 | returns true if the self.catf_sign has 811 | @z 812 | """ 813 | # 814 | return self.test_String("@z", self.catf_sign) 815 | # 816 | def test_isKabatenu(self): 817 | """ 818 | returns true if the self.catf_sign 819 | has @k 820 | """ 821 | # 822 | return self.test_String("@k", self.catf_sign) 823 | # 824 | def test_isVertReflected(self): 825 | """ 826 | returns true if the self.catf_sign 827 | has @r 828 | """ 829 | # 830 | return self.test_String("@r", self.catf_sign) 831 | # 832 | def test_isHorReflected(self): 833 | """ 834 | returns true if the self.catf_sign 835 | has @h 836 | """ 837 | # 838 | return self.test_String("@h", self.catf_sign) 839 | # 840 | def test_isVariant(self): 841 | """ 842 | returns true if the self.catf_sign 843 | has @v 844 | """ 845 | # 846 | return self.test_String("@v", self.catf_sign) 847 | # 848 | def test_isRotated(self): 849 | """ 850 | returns true if the 851 | self.catf_sign has @\d+ 852 | """ 853 | # 854 | if re.search("@\d+",self.catf_sign) is not None: 855 | return True 856 | else: 857 | return False 858 | # 859 | def test_isModifier(self): 860 | """ 861 | returns true 862 | if the self.catf_sign passes all 863 | the tests related to modifiers 864 | """ 865 | # 866 | if self.test_isRotated(self.catf_sign) is True or self.test_isVariant(self.catf_sign) is True or self.test_isHorReflected(self.catf_sign) is True or self.test_isCurved(self.catf_sign) is True or self.test_isFlat(self.catf_sign) is True or self.test_isGunu(self.catf_sign) is True or self.test_isSheshig(self.catf_sign) is True or self.test_isTenu(self.catf_sign) is True or self.test_isNutillu(self.catf_sign) is True or self.test_isZidatenu(self.catf_sign) is True or self.test_isKabatenu(self.catf_sign) is True or self.test_isVertReflected(self.catf_sign) is True: 867 | return True 868 | else: 869 | return False 870 | # Compound Self.Catf_Sign Tests ------------------ 871 | # 872 | @staticmethod 873 | def test_isBinaryScope(operator): 874 | """ 875 | Tests if the operator has 876 | binary scope 877 | the x and the @ will be 878 | handled individually 879 | """ 880 | # 881 | if operator == "&" or operator == "%": 882 | return True 883 | else: 884 | return False 885 | # 886 | # 887 | def test_hasBeside(self): 888 | """ 889 | returns true if the 890 | self.catf_sign has . 
891 | """ 892 | # 893 | return self.test_String(".", self.catf_sign) 894 | # 895 | def test_hasJoining(self): 896 | """ 897 | returns true if 898 | the self.catf_sign has + 899 | """ 900 | # 901 | return self.test_String("+", self.catf_sign) 902 | # 903 | def test_hasContaining(self): 904 | """ 905 | returns true if the self.catf_sign 906 | has x 907 | """ 908 | # 909 | return self.test_String("x", self.catf_sign) 910 | # 911 | def test_hasContaining_Group(self): 912 | """ 913 | returns true if the self.catf_sign 914 | has x( 915 | """ 916 | # 917 | return self.test_String("x(", self.catf_sign) 918 | # 919 | def test_hasAbove(self): 920 | """ 921 | returns true if the self.catf_sign 922 | has & 923 | """ 924 | # 925 | return self.test_String("&", self.catf_sign) 926 | # 927 | def test_hasCrossing(self): 928 | """ 929 | returns true if the self.catf_sign 930 | has % 931 | """ 932 | # 933 | return self.test_String("%", self.catf_sign) 934 | # 935 | def test_hasOpposing(self): 936 | """ 937 | returns true if the seperated strings 938 | are in uppercase 939 | """ 940 | # 941 | test_list = [] 942 | if self.test_String("@",self.catf_sign) is True: 943 | rep_string = self.catf_sign.replace("@", " ") 944 | no_number = re.sub("\d+","", rep_string) 945 | no_whiteSpace = no_number.replace(" ","") 946 | if no_whiteSpace.isupper() is True: 947 | return True 948 | else: 949 | return False 950 | else: 951 | return False 952 | # 953 | def test_hasAllograph(self): 954 | """ 955 | returns true if the self.catf_sign 956 | has ~ 957 | """ 958 | # 959 | return self.test_String("~", self.catf_sign) 960 | # 961 | def test_hasSpecialAllograph(self): 962 | """ 963 | returns true if the self.catf_sign 964 | has ~v 965 | """ 966 | # 967 | return self.test_String("~v", self.catf_sign) 968 | # 969 | def test_hasFormVariant(self): 970 | """ 971 | returns true if the self.catf_sign 972 | has \ 973 | """ 974 | # 975 | return self.test_String("\\", self.catf_sign) 976 | # 977 | def test_hasRepeated(self): 978 | """ 979 | returns true if the first 980 | seperated character is digit 981 | """ 982 | # 983 | if self.test_String("x", self.catf_sign) is True: 984 | str_split = self.catf_sign.split("x") 985 | if str_split[0].isdigit(): 986 | return True 987 | else: 988 | return False 989 | else: 990 | return False 991 | # 992 | 993 | 994 | class cAtfLineGetter(cAtfLineTester): 995 | """ 996 | a class for getting text lines 997 | according to tests 998 | """ 999 | # 1000 | def __init__(self, atf_line): 1001 | super().__init__(atf_line) 1002 | self.cAtf_line = atf_line 1003 | self.text_id = "" 1004 | self.text_id_alternatives = [] 1005 | self.text_lang = "" 1006 | self.content_comment_line = "" 1007 | self.objectSurface_title = "" 1008 | self.structure_comment = "" 1009 | self.text_line = "" 1010 | self.lineNumber = int() 1011 | self.lineWordCount = int() 1012 | self.lineWords = [] 1013 | self.lineText = "" 1014 | # 1015 | # 1016 | def get_id_line(self): 1017 | """ 1018 | checks the line for 1019 | conforming the id no syntax, 1020 | then gets it. 1021 | """ 1022 | # 1023 | atf_line = self.cAtf_line 1024 | if self.test_id_line() == True: 1025 | text_id_search = re.search("&P\d+\s", atf_line) 1026 | text_id_brut = text_id_search.group(0) 1027 | text_id = text_id_brut[:-1] # Cleans the last space 1028 | self.text_id = text_id[1:] # Cleans the & 1029 | else: 1030 | pass 1031 | # 1032 | return self.text_id 1033 | # 1034 | # 1035 | def get_id_alternatives(self): 1036 | """ 1037 | Checks the line for id syntax. 
1038 | Gets the id alternatives 1039 | separated with the "=". 1040 | """ 1041 | # 1042 | atf_line = self.cAtf_line 1043 | # 1044 | if self.test_id_line() == True: 1045 | text_id_alternative_split = atf_line.split("=") 1046 | text_id_alternative_brut = text_id_alternative_split[1:] 1047 | text_id_alternative = [alternative.strip() for alternative in text_id_alternative_brut] 1048 | self.text_id_alternatives = text_id_alternative 1049 | else: 1050 | pass 1051 | # 1052 | return self.text_id_alternatives 1053 | # 1054 | # 1055 | def get_language_line(self): 1056 | """ 1057 | Checks the line for 1058 | language protocol syntax 1059 | Gets the indicated language 1060 | """ 1061 | # 1062 | atf_line = self.cAtf_line 1063 | # 1064 | if self.test_language_line() == True: 1065 | text_lang_search = re.search("atf: lang.*", atf_line) 1066 | text_lang_brut = text_lang_search.group(0) 1067 | text_lang = text_lang_brut[len("atf: lang "):].strip() 1068 | self.text_lang = text_lang 1069 | else: 1070 | pass 1071 | # 1072 | return self.text_lang 1073 | # 1074 | # 1075 | def get_content_comment(self): 1076 | """ 1077 | Checks the line for 1078 | content comment syntax 1079 | ie #. 1080 | Gets the content comment line 1081 | """ 1082 | # 1083 | atf_line = self.cAtf_line 1084 | # 1085 | if self.test_line_content() == True: 1086 | content_comment_search = re.search("^#.*", atf_line) 1087 | content_comment = content_comment_search.group(0) 1088 | self.content_comment_line = content_comment 1089 | else: 1090 | pass 1091 | # 1092 | return self.content_comment_line 1093 | # 1094 | # 1095 | def get_object_part_title(self): 1096 | """ 1097 | Checks if the line starts with @. 1098 | Gets the line if it does. 1099 | """ 1100 | # 1101 | atf_line = self.cAtf_line 1102 | # 1103 | if self.test_object_type_object_part() == True: 1104 | object_title_search = re.search("^@.*", atf_line) 1105 | object_surface_title = object_title_search.group(0) 1106 | self.objectSurface_title = object_surface_title 1107 | # 1108 | else: 1109 | pass 1110 | # 1111 | return self.objectSurface_title 1112 | # 1113 | # 1114 | def get_structure_comment(self): 1115 | """ 1116 | Checks if the line starts with $ 1117 | Gets the line if it does. 1118 | """ 1119 | # 1120 | atf_line = self.cAtf_line 1121 | # 1122 | if self.test_text_structure() == True: 1123 | structure_comment_search = re.search("^\$.*", atf_line) 1124 | structure_comment = structure_comment_search.group(0) 1125 | self.structure_comment = structure_comment 1126 | # 1127 | else: 1128 | pass 1129 | # 1130 | return self.structure_comment 1131 | # 1132 | # 1133 | def get_text_line(self): 1134 | """ 1135 | Checks if the line starts with a \d+. 1136 | Gets the line if it does. 1137 | """ 1138 | # 1139 | atf_line = self.cAtf_line 1140 | # 1141 | if self.test_text_line() == True: 1142 | text_line_search = re.search("^\d+\.\s.*", atf_line) 1143 | text_line = text_line_search.group(0) 1144 | self.text_line = text_line 1145 | # 1146 | # 1147 | else: 1148 | pass 1149 | # 1150 | return self.text_line 1151 | # 1152 | # 1153 | def get_line_text(self): 1154 | """ 1155 | Gets the line text 1156 | excluding the line number. 
1157 | """ 1158 | # 1159 | if self.test_text_line() == True: 1160 | # 1161 | # Getting rid of the line number 1162 | # 1163 | line_no_search = re.search("^\d+\.\s", self.cAtf_line) 1164 | line_no_brut = line_no_search.group(0) 1165 | text_line = self.cAtf_line[len(line_no_brut):] 1166 | self.lineText = text_line 1167 | else: 1168 | pass 1169 | return self.lineText 1170 | # 1171 | # 1172 | def get_line_number(self): 1173 | """ 1174 | return: self.lineNumber, int. 1175 | Checks if the line is text line 1176 | gets the line number if it is. 1177 | """ 1178 | # 1179 | if self.test_text_line() == True: 1180 | line_no_search = re.search("^\d+\.\s", self.cAtf_line) 1181 | line_no_brut = line_no_search.group(0) 1182 | line_no_str = line_no_brut[:-2] # Cleans the white space and the dot. 1183 | line_no = int(line_no_str) 1184 | self.lineNumber = line_no 1185 | # 1186 | else: 1187 | self.lineNumber = None 1188 | # 1189 | return self.lineNumber 1190 | # 1191 | # 1192 | def get_line_word_count(self): 1193 | """ 1194 | gets the number of words in text line 1195 | assuming that they are 1196 | seperated by whitespace 1197 | """ 1198 | # 1199 | text_line_no_number = self.get_line_text() 1200 | text_line_split = text_line_no_number.split(" ") 1201 | # 1202 | # See if there is anything empty 1203 | # 1204 | for text_line in text_line_split: 1205 | if len(text_line) == 0: 1206 | text_line_split.remove(text_line) 1207 | # 1208 | # 1209 | # 1210 | word_count = len(text_line_split) 1211 | self.lineWordCount = word_count 1212 | # 1213 | # 1214 | return self.lineWordCount 1215 | # 1216 | # 1217 | def get_line_words(self): 1218 | """ 1219 | params: lineText, str. 1220 | return: lineWords, [] 1221 | Gets the whitespace delimited 1222 | words in line 1223 | """ 1224 | # 1225 | text_line_no_number = self.get_line_text() 1226 | text_line_split = text_line_no_number.split(" ") 1227 | # 1228 | # See if there is anything empty 1229 | # 1230 | for text_line in text_line_split: 1231 | if len(text_line) == 0: 1232 | text_line_split.remove(text_line) 1233 | # 1234 | # 1235 | # 1236 | line_words = text_line_split 1237 | self.lineWords = line_words 1238 | # 1239 | # 1240 | return self.lineWords 1241 | # 1242 | 1243 | 1244 | 1245 | class cAtfLineDictBuilder(cAtfLineGetter): 1246 | """ 1247 | class for building the line_dict, 1248 | dictionary. 
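    The resulting dict carries the line-level keys: isLineStructure,
    isLineContent, lineNumber, lineWordCount, lineText, lineWords and
    lineWordPos.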
1249 | """ 1250 | # 1251 | def __init__(self, atf_line): 1252 | # 1253 | super().__init__(atf_line) 1254 | # 1255 | self.cAtf_line = atf_line 1256 | self.isLineStructure = False 1257 | self.isLineComment = False 1258 | self.lineDict = {} 1259 | # 1260 | # 1261 | def isLineStruc(self): 1262 | """ 1263 | Test if the line is a 1264 | structure comment 1265 | """ 1266 | # 1267 | if self.test_text_structure() == True: 1268 | self.isLineStructure = True 1269 | # 1270 | else: 1271 | self.isLineStructure = False 1272 | # 1273 | return self.isLineStructure 1274 | # 1275 | def isLineCom(self): 1276 | """ 1277 | test if the line is a 1278 | comment about the content 1279 | """ 1280 | # 1281 | if self.test_line_content() == True: 1282 | self.isLineComment = True 1283 | else: 1284 | self.isLineComment = False 1285 | # 1286 | return self.isLineComment 1287 | # 1288 | # 1289 | def lineDictBuild(self): 1290 | """ 1291 | builds the line dict 1292 | based on preeceding 1293 | methods 1294 | """ 1295 | # 1296 | self.lineDict["isLineStructure"] = self.isLineStruc() 1297 | self.lineDict["isLineContent"] = self.isLineCom() 1298 | self.lineDict["lineNumber"] = self.get_line_number() 1299 | self.lineDict["lineWordCount"] = self.get_line_word_count() 1300 | self.lineDict["lineText"] = self.get_line_text() 1301 | self.lineDict["lineWords"] = list(set(self.get_line_words())) 1302 | # Removed duplicates, for efficiency. 1303 | self.lineDict["lineWordPos"] = list(enumerate(self.get_line_words())) 1304 | if len(self.lineDict["lineWords"]) == 0 and self.lineDict["lineNumber"] is None: 1305 | return None 1306 | else: 1307 | pass 1308 | # 1309 | return self.lineDict 1310 | # 1311 | 1312 | 1313 | class cAtfALHandler(cAtfALTester): 1314 | """ 1315 | Handle Another Language occurrences. 1316 | """ 1317 | # 1318 | def __init__(self, cAtf_part): 1319 | super().__init__() 1320 | # 1321 | self.cAtf_part = cAtf_part 1322 | self.lineDict_list = [] 1323 | self.cAtf_part_lines = [] 1324 | self.alRef_list = [] 1325 | self.alGroup_list = [] 1326 | self.mulAlOc_group_list = [] 1327 | self.singAlOc_group_list = [] 1328 | self.mulAlOc_line_list = [] 1329 | self.mulAlOc_lineDict_list = [] 1330 | self.mulAlOcS = [] 1331 | self.singAlOcS = [] 1332 | self.alOc_list = [] 1333 | self.AlOcS = [] 1334 | self.alLanguage = "" 1335 | self.textLang = "" 1336 | # 1337 | # 1338 | # 1339 | def set_ALOC_lang(self, lang): 1340 | """ 1341 | Sets the value of self.alLanguage 1342 | """ 1343 | # 1344 | self.alLanguage = lang 1345 | # 1346 | return self.alLanguage 1347 | # 1348 | def set_textLang(self, lang): 1349 | """ 1350 | Sets the value of self.textLang 1351 | """ 1352 | # 1353 | self.textLang = lang 1354 | # 1355 | return self.textLang 1356 | # 1357 | def splitPartLines(self): 1358 | """ 1359 | params: self.cAtf_part, str. 
1360 | return: self.cAtf_part_lines, [] 1361 | splits the part into lines 1362 | """ 1363 | # 1364 | self.cAtf_part_lines = self.cAtf_part.splitlines() 1365 | # 1366 | return self.cAtf_part_lines 1367 | # 1368 | @staticmethod 1369 | def lineDictBuild(cAtf_line): 1370 | """ 1371 | Uses the lineDictBuilder class 1372 | method 1373 | """ 1374 | # 1375 | line_class = cAtfLineDictBuilder(cAtf_line) 1376 | line_dict = line_class.lineDictBuild() 1377 | # 1378 | return line_dict 1379 | # 1380 | # 1381 | def get_lineDict_list(self): 1382 | """ 1383 | params: self.cAtf_part_lines, [] 1384 | return: self.lineDict_list, [] 1385 | 1386 | gets the lines in dict form 1387 | """ 1388 | # 1389 | for cAtf_line in self.cAtf_part_lines: 1390 | lineDict = self.lineDictBuild(cAtf_line) 1391 | if lineDict is not None: 1392 | self.lineDict_list.append(lineDict) 1393 | # 1394 | return self.lineDict_list 1395 | # 1396 | @staticmethod 1397 | def test_twoTimesUnScore(lineWord): 1398 | """ 1399 | Tests if a word has the underscore 1400 | two times or not. 1401 | """ 1402 | # 1403 | unscoCount = lineWord.count("_") 1404 | if unscoCount == 2: 1405 | return True 1406 | elif unscoCount == 1: 1407 | return False 1408 | elif unscoCount < 1: 1409 | return None 1410 | else: 1411 | pass 1412 | # 1413 | return None 1414 | # 1415 | def get_ALRefs_lineLevel(self): 1416 | """ 1417 | Searches whether words of a line 1418 | contain a another language switch 1419 | If the word contains the underscore 2 times 1420 | it is added 2 times for facilitating grouping 1421 | after. 1422 | """ 1423 | # 1424 | lineDict_list_sorted = sorted(self.lineDict_list, key=lambda lineDict:lineDict["lineNumber"]) 1425 | for lineDict in lineDict_list_sorted: 1426 | lw_list = list(lineDict["lineWordPos"]) 1427 | line_word_list_sorted = sorted(lw_list, key=lambda wpTuple:wpTuple[0]) 1428 | for WordP, lineWord in line_word_list_sorted: 1429 | if self.test_twoTimesUnScore(lineWord) is True: 1430 | self.alRef_list.append((WordP, lineWord, lineDict["lineNumber"])) 1431 | self.alRef_list.append((WordP, lineWord, lineDict["lineNumber"])) 1432 | elif self.test_twoTimesUnScore(lineWord) is False: 1433 | # (1, WORD, lineNO) 1434 | self.alRef_list.append((WordP, lineWord, lineDict["lineNumber"])) 1435 | # 1436 | # 1437 | @staticmethod 1438 | def grouper(iterable, n, fillvalue=None): 1439 | "Collect data into fixed-length chunks or blocks" 1440 | # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" 1441 | args = [iter(iterable)] * n 1442 | # 1443 | return itertools.zip_longest(*args, fillvalue=fillvalue) 1444 | # 1445 | def group_ALRefs(self): 1446 | """ 1447 | Groups the AL references for 1448 | marking the AL occurrences 1449 | """ 1450 | # 1451 | al_ref_groups = self.grouper(self.alRef_list, 2) 1452 | # There should be no need for a fill value, but ... 1453 | # I am hesitating... 
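        # Each AL occurrence contributes exactly two entries to self.alRef_list:
        # get_ALRefs_lineLevel appends a word twice when it contains both the opening
        # and the closing underscore, and once when it contains only one of them.
        # Pairing the flat list two by two therefore yields (start, end) reference
        # groups, and the fill value should never be needed as long as the
        # underscores in the part are balanced.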
1454 | self.alGroup_list = list(al_ref_groups) 1455 | # 1456 | return self.alGroup_list 1457 | # 1458 | @staticmethod 1459 | def test_multilineALGroup(ALGroup): 1460 | """ 1461 | params: ALGroup, ((()),(())) 1462 | return: boolean 1463 | Tests if the AL references 1464 | stocked in the al group 1465 | points to a AL occurrence 1466 | that spreads into multiple 1467 | lines 1468 | """ 1469 | #((11, '_re-e2-um', 1), (11, 're-e2-um_', 2)) 1470 | start_point = ALGroup[0] 1471 | end_point = ALGroup[1] 1472 | # (1, WORD, lineNO), (1, WORD, lineNO) 1473 | # 1474 | if start_point[2] != end_point[2]: 1475 | return True 1476 | else: 1477 | return False 1478 | # 1479 | # 1480 | @staticmethod 1481 | def test_singALGroup(ALGroup): 1482 | """ 1483 | params: ALGroup, ((()),(())) 1484 | return: boolean 1485 | Tests if the AL references 1486 | stocked in the al group 1487 | points to a AL occurrence 1488 | that is confined to 1 line 1489 | """ 1490 | # 1491 | # ((11, '_re-e2-um_', 1), (11, '_re-e2-um_', 1)) 1492 | start_point = ALGroup[0] 1493 | end_point = ALGroup[1] 1494 | # (1, WORD, lineNO), (1, WORD, lineNO) 1495 | # 1496 | if start_point[2] == end_point[2]: 1497 | return True 1498 | else: 1499 | return False 1500 | # 1501 | def populate_mulALOC_refs(self): 1502 | """ 1503 | Populates the multiline 1504 | AL occurrence reference list. 1505 | """ 1506 | # 1507 | self.mulAlOc_group_list = [] 1508 | for alGroup in self.alGroup_list: 1509 | if self.test_multilineALGroup(alGroup) is True: 1510 | self.mulAlOc_group_list.append(alGroup) 1511 | # 1512 | else: 1513 | pass 1514 | # 1515 | return self.mulAlOc_group_list 1516 | # 1517 | # 1518 | def populate_singALOC_refs(self): 1519 | """ 1520 | Populates the single line 1521 | AL occurrence reference list. 1522 | """ 1523 | # 1524 | self.singAlOc_group_list = [] 1525 | for alGroup in self.alGroup_list: 1526 | if self.test_singALGroup(alGroup) is True: 1527 | self.singAlOc_group_list.append(alGroup) 1528 | # 1529 | # 1530 | return self.singAlOc_group_list 1531 | # 1532 | # 1533 | @staticmethod 1534 | def get_mulAlOc_lines(alGroup, lineDictList): 1535 | """ 1536 | Gets the related lines from the lineDictList, by 1537 | using the alGroup elements as point of reference. 1538 | """ 1539 | # 1540 | start_point = alGroup[0] 1541 | end_point = alGroup[1] 1542 | # 1543 | mulAlOc_line_range = range(start_point[2],end_point[2]+1) 1544 | # (1, WORD, lineNO), (1, WORD, lineNO) 1545 | # +1 compensates the function's exclusion of the final element 1546 | mulAlOc_group_line_dict_list = [] 1547 | # 1548 | for lineDict in lineDictList: 1549 | if lineDict["lineNumber"] in mulAlOc_line_range: 1550 | mulAlOc_group_line_dict_list.append(lineDict) 1551 | # 1552 | # 1553 | return mulAlOc_group_line_dict_list 1554 | # 1555 | def get_mulAlOc_lineDict_list(self): 1556 | """ 1557 | Gets the related lineDicts for 1558 | AL occurrences that spread to multiple lines 1559 | """ 1560 | # 1561 | self.mulAlOc_lineDict_list = [] 1562 | # 1563 | for mulAlOc in self.mulAlOc_group_list: 1564 | line_list = self.get_mulAlOc_lines(mulAlOc,self.lineDict_list) 1565 | self.mulAlOc_lineDict_list.append(line_list) 1566 | # 1567 | # 1568 | return self.mulAlOc_lineDict_list 1569 | # 1570 | @staticmethod 1571 | def get_FW_mulAlOc(mulAlOc_group): 1572 | """ 1573 | Gets the First Word and its position of the 1574 | AL Occurrence that spreads to multiple 1575 | lines. 
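        return: first_item, ((alWord, wordPosition), lineNumber)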
1576 | """ 1577 | # 1578 | first_item_dict = {} 1579 | mulAlOc_group_sort = sorted(mulAlOc_group, key=lambda lineDict:lineDict["lineNumber"]) 1580 | # 1581 | mulAlOc_first_lineDict = mulAlOc_group_sort[0] 1582 | fLineDict_words = mulAlOc_first_lineDict["lineWordPos"] 1583 | # 1584 | for wordPos, flineWord in fLineDict_words: 1585 | if "_" in flineWord: 1586 | first_item_dict[flineWord] = wordPos 1587 | # 1588 | first_item_sort = sorted(tuple(first_item_dict.items()), key=lambda wordWP:wordWP[1]) 1589 | first_item = (first_item_sort[-1],mulAlOc_first_lineDict["lineNumber"]) 1590 | # 1591 | return first_item 1592 | # 1593 | @staticmethod 1594 | def get_LW_mulAlOc(mulAlOc_group): 1595 | """ 1596 | Gets the Last Word and its position of 1597 | the AL Occurrence that spreads to multiple 1598 | lines 1599 | """ 1600 | # 1601 | last_item_dict = {} 1602 | mulAlOc_group_sort = sorted(mulAlOc_group, key=lambda lineDict:lineDict["lineNumber"]) 1603 | mulAlOc_last_lineDict = mulAlOc_group_sort[-1] 1604 | # 1605 | laLineDict_words = mulAlOc_last_lineDict["lineWords"] 1606 | # 1607 | for lalineWord in laLineDict_words: 1608 | if "_" in lalineWord: 1609 | last_item_dict[lalineWord] = laLineDict_words.index(lalineWord) 1610 | # 1611 | last_item_sort = sorted(tuple(last_item_dict.items()), key=lambda wordWP:wordWP[1]) 1612 | last_item = (last_item_sort[0],mulAlOc_last_lineDict["lineNumber"]) 1613 | # 1614 | return last_item 1615 | # 1616 | def get_ALOC_lang(self,alOc): 1617 | """ 1618 | Gets the AL occurrence language 1619 | if it has one specified with 1620 | %, 1621 | if not, we get the specified AL language 1622 | in the constructor 1623 | """ 1624 | alWord = alOc[0] 1625 | # 1626 | if self.test_ALSwitch(alWord) is True: 1627 | alword_find = re.search("%\w+",alWord) 1628 | alword_get = alword_find.group(0) 1629 | else: 1630 | alword_get = self.alLanguage 1631 | # 1632 | return alword_get 1633 | # 1634 | def mk_mulAlOc(self, first_item, last_item, mulAlOc_group): 1635 | """ 1636 | params: 1637 | first_item, () 1638 | last_item, () 1639 | mulAlOc_group, [{},{}, ... ] 1640 | 1641 | Creates multiline AL Occurrence from the parameters. 1642 | alWord_word, str. Another Language word in AL_occurrence 1643 | alWord_LineNumber, int. The line number for the al_word 1644 | alWord_AlOc_Position, dict. Relative position of the alWord inside the AL_occurrence. 1645 | alWord_AlOc, str. Al_occurrence in which the al_word is observed 1646 | alWord_AlOc_LineNumber, list. Line number(s) in which the al_oc is observed 1647 | alWord_LinePosition, dict. Relative position of the alWord inside the Line. 
1648 | 1649 | """ 1650 | # 1651 | alWord_dict_list = [] 1652 | # 1653 | alOc_words = [] 1654 | # 1655 | for lineDict in mulAlOc_group: 1656 | lineNo = lineDict["lineNumber"] 1657 | lineWordPos = lineDict["lineWordPos"] 1658 | lineWCount = lineDict["lineWordCount"] 1659 | # 1660 | for wordPos, lineWord in lineWordPos: 1661 | # 1662 | if lineNo == first_item[1] and wordPos >= first_item[0][1]: 1663 | alOc_words.append((lineWord,wordPos,lineNo,lineWCount)) 1664 | elif first_item[1] < lineNo < last_item[1]: 1665 | alOc_words.append((lineWord,wordPos,lineNo, lineWCount)) 1666 | elif lineNo == last_item[1] and wordPos <= last_item[0][1]: 1667 | alOc_words.append((lineWord,wordPos,lineNo, lineWCount)) 1668 | else: 1669 | pass 1670 | # 1671 | # 1672 | alOc_words_sorted = sorted(alOc_words, key=lambda al:(al[2],al[1])) 1673 | alOc_word_list = [al[0] for al in alOc_words_sorted] 1674 | alOc_line_list = [al[2] for al in alOc_words_sorted] 1675 | alOc_text = " ".join(alOc_word_list) 1676 | alOc_wordPos = enumerate(alOc_words_sorted) 1677 | # 1678 | for wordP, alOc_tuple in alOc_wordPos: 1679 | alWord_dict = {} 1680 | alWord_dict["alWord_word"] = alOc_tuple[0] 1681 | alWord_dict["alWord_LineNumber"] = alOc_tuple[2] 1682 | alWord_dict["alWord_AlOc"] = alOc_text 1683 | alWord_dict["alWord_language"] = self.get_ALOC_lang(alOc_word_list) 1684 | alWord_dict["alWord_textLanguage"] = self.textLang 1685 | alWord_dict["alWord_alOc_LineNumber"] = alOc_line_list 1686 | alOc_pos_dict = {} 1687 | alOc_pos_dict["totalWords_AlOc"] = len(alOc_word_list) 1688 | alOc_pos_dict["alWord_Position"] = wordP 1689 | alWord_dict["alWord_AlOc_Position"] = alOc_pos_dict 1690 | alOc_line_dict = {} 1691 | alOc_line_dict["totalWords_Line"] = alOc_tuple[3] 1692 | alOc_line_dict["alWord_Position"] = alOc_tuple[1] 1693 | alWord_dict["alWord_LinePosition"] = alOc_line_dict 1694 | alWord_dict_list.append(alWord_dict) 1695 | # 1696 | return alWord_dict_list 1697 | # 1698 | # 1699 | def get_mulAlOcS(self): 1700 | """ 1701 | Gets the AL Occurrences that spread into multiple lines 1702 | as lists of another language word dictionary 1703 | """ 1704 | # 1705 | self.mulAlOcS = [] 1706 | # 1707 | for mulAlOc_group in self.get_mulAlOc_lineDict_list(): 1708 | first_point = self.get_FW_mulAlOc(mulAlOc_group) 1709 | last_point = self.get_LW_mulAlOc(mulAlOc_group) 1710 | mulAlOc = self.mk_mulAlOc(first_point, last_point, mulAlOc_group) 1711 | self.mulAlOcS.append(mulAlOc) 1712 | # 1713 | # 1714 | return self.mulAlOcS 1715 | # 1716 | @staticmethod 1717 | def get_AlRefs_WordLevel(lineWP): 1718 | """ 1719 | params: lineWP, () 1720 | Gets the starting point and 1721 | end point of the AL Occurrence observed 1722 | in a single line 1723 | 1724 | """ 1725 | # lineWP == (WordPOS, WORD, LineNumber ) 1726 | # 1727 | alRef_WP_list = [] 1728 | # 1729 | if "_" in lineWP[1]: 1730 | alRef_WP_list.append((lineWP[0], lineWP[1])) 1731 | # (WORDPOS, WORD) 1732 | # 1733 | return alRef_WP_list 1734 | # 1735 | def group_ALRef_sing_Wordlevel(self, alRef_WP_list): 1736 | """ 1737 | groups the AL occurrence references 1738 | observed in a single line 1739 | """ 1740 | # 1741 | alRef_WP_groups = self.grouper(alRef_WP_list,2) 1742 | # 1743 | return alRef_WP_groups 1744 | # 1745 | def mk_singAlOc(self, lineDict_list,alRef_WP_group): 1746 | """ 1747 | params: lineDict, {} 1748 | alRef_WP_group, () 1749 | Creates the AL occurrence from the lineDict, 1750 | by using the values in the alRef_WP_groups 1751 | """ 1752 | # 1753 | alWord_dict_list = [] 1754 | # 1755 | 
alRef_WP_group_sort = sorted(alRef_WP_group, key=lambda alRef:alRef[0]) 1756 | alRef_WP_range = range(alRef_WP_group_sort[0][0], alRef_WP_group_sort[1][0]+1) 1757 | # 1758 | lineDict = list(filter(lambda Ldicts: Ldicts.get("lineNumber") == alRef_WP_group[0][2], lineDict_list))[0] 1759 | # Gets the lineDict from the lineDict list for the relative 1760 | # al occurrence 1761 | # 1762 | alOc_words = [] 1763 | # 1764 | lineWordPos = lineDict["lineWordPos"] 1765 | # 1766 | for WP, word in lineWordPos: 1767 | if WP in alRef_WP_range: 1768 | alOc_words.append((WP, word)) 1769 | # 1770 | # 1771 | alOc_words_sorted = sorted(alOc_words, key=lambda alWords:alWords[0]) 1772 | alOc_word_list = [al[1] for al in alOc_words_sorted] 1773 | alOc_text = " ".join(alOc_word_list) 1774 | alOc_wordPos = enumerate(alOc_words_sorted) 1775 | # 1776 | for WP, alWordTuple in alOc_wordPos: 1777 | alWord_dict = {} 1778 | alWord_dict["alWord_word"] = alWordTuple[1] 1779 | alWord_dict["alWord_textLanguage"] = self.textLang 1780 | alWord_dict["alWord_language"] = self.get_ALOC_lang(alOc_word_list) 1781 | alWord_dict["alWord_LineNumber"] = lineDict["lineNumber"] 1782 | alWord_dict["alWord_AlOc"] = alOc_text 1783 | alWord_dict["alWord_alOc_LineNumber"] = lineDict["lineNumber"] 1784 | alOc_pos_dict = {} 1785 | alOc_pos_dict["totalWords_AlOc"] = len(alOc_word_list) 1786 | alOc_pos_dict["alWord_Position"] = WP 1787 | alWord_dict["alWord_AlOc_Position"] = alOc_pos_dict 1788 | alOc_line_dict = {} 1789 | alOc_line_dict["totalWords_Line"] = lineDict["lineWordCount"] 1790 | alOc_line_dict["alWord_Position"] = alWordTuple[0] 1791 | alWord_dict["alWord_LinePosition"] = alOc_line_dict 1792 | alWord_dict_list.append(alWord_dict) 1793 | # 1794 | return alWord_dict_list 1795 | # 1796 | def get_singALOcS(self): 1797 | """ 1798 | Gets AL Occurrences confined to a single 1799 | line as list of AL word dictionary. 1800 | """ 1801 | # 1802 | self.singAlOcS = [] 1803 | # 1804 | for singAlOc_group in self.singAlOc_group_list: 1805 | # singAlOc_group == ((10, '_kur_', 62), (10, '_kur_', 62)) 1806 | # (WORDPOS, WORD) 1807 | singAlOc = self.mk_singAlOc(self.lineDict_list,singAlOc_group) 1808 | self.singAlOcS.append(singAlOc) 1809 | # 1810 | return self.singAlOcS 1811 | # 1812 | def get_ALOcS(self): 1813 | """ 1814 | General Method for regrouping 1815 | The methods above. 
1816 | """ 1817 | # 1818 | self.splitPartLines() 1819 | self.get_lineDict_list() 1820 | self.get_ALRefs_lineLevel() 1821 | self.group_ALRefs() 1822 | self.populate_mulALOC_refs() 1823 | self.populate_singALOC_refs() 1824 | self.get_mulAlOc_lineDict_list() 1825 | self.get_mulAlOcS() 1826 | self.get_singALOcS() 1827 | # 1828 | self.alOc_list = self.mulAlOcS + self.singAlOcS 1829 | flatten_alOc_list = list(itertools.chain.from_iterable(self.alOc_list)) 1830 | sort_aloc_list = sorted(flatten_alOc_list, key=lambda alword_dict:(alword_dict["alWord_LineNumber"],alword_dict["alWord_LinePosition"]["alWord_Position"])) 1831 | self.AlOcS = [] 1832 | for key, group in itertools.groupby(sort_aloc_list, key=lambda alWord_dict:alWord_dict["alWord_AlOc"]): 1833 | self.AlOcS.append(list(group)) 1834 | # 1835 | # 1836 | return self.AlOcS 1837 | 1838 | 1839 | class cAtfWordDictBuilder(cAtfWordTester): 1840 | """ 1841 | Class for building Word dictionaries 1842 | of a normal text line 1843 | """ 1844 | # 1845 | def __init__(self,cAtf_Word): 1846 | super().__init__(cAtf_Word) 1847 | self.wordPos_list = [] 1848 | self.word = cAtf_Word 1849 | self.lineDict_list = [] 1850 | self.det_signList = [] 1851 | self.detMarkList = [] 1852 | self.detRef_general_list = [] 1853 | self.detRef_Group_list = [] 1854 | self.signList = [] 1855 | self.signList_pos = [] 1856 | self.textLang = "" 1857 | self.wordLang = "" 1858 | self.detLang = "" 1859 | self.clean_word = "" 1860 | self.detDict_list = [] 1861 | self.wordDict = {} 1862 | # 1863 | # 1864 | def set_textLang(self, lang): 1865 | """ 1866 | Text language attribute 1867 | """ 1868 | # 1869 | self.textLang = lang 1870 | # 1871 | return self.textLang 1872 | # 1873 | def set_wordLang(self, value): 1874 | """ 1875 | Word Language property 1876 | """ 1877 | # 1878 | self.wordLang = value 1879 | # 1880 | return self.wordLang 1881 | # 1882 | def set_detLang(self,value): 1883 | """ 1884 | Set Determinative Language 1885 | """ 1886 | # 1887 | self.detLang = value 1888 | # 1889 | return self.detLang 1890 | # 1891 | @staticmethod 1892 | def set_sign_seperator_curvR(cAtf_Word): 1893 | """ 1894 | Sets the sign seperator - 1895 | to the entities with 1896 | parantheses 1897 | """ 1898 | # 1899 | if "}" in cAtf_Word and "}-" in cAtf_Word and "}#" in cAtf_Word: 1900 | rep_string = cAtf_Word.replace("}#","#}") 1901 | rep_word = rep_string.replace("}-","}") 1902 | curv_par_sep = rep_word.split("}") 1903 | curv_par = "}-".join(curv_par_sep) 1904 | elif "}" in cAtf_Word and "}-" not in cAtf_Word and "}#" in cAtf_Word: 1905 | rep_word = cAtf_Word.replace("}#","#}") 1906 | curv_par = rep_word.replace("}","}-") 1907 | elif "}#" in cAtf_Word: 1908 | curv_par = cAtf_Word.replace("}#","#}") 1909 | else: 1910 | curv_par = cAtf_Word 1911 | 1912 | # 1913 | return curv_par 1914 | # 1915 | @staticmethod 1916 | def set_sign_seperator_curvL(cAtf_Word): 1917 | """ 1918 | Sets the sign seperator - 1919 | to the entities with 1920 | parantheses 1921 | """ 1922 | # 1923 | if "{" in cAtf_Word and "-{" in cAtf_Word: 1924 | rep_word = cAtf_Word.replace("-{","{") 1925 | curv_par_sep = rep_word.split("{") 1926 | curv_par = "-{".join(curv_par_sep) 1927 | elif "{" in cAtf_Word and "-{" not in cAtf_Word: 1928 | curv_par = cAtf_Word.replace("{","-{") 1929 | else: 1930 | curv_par = cAtf_Word 1931 | 1932 | return curv_par 1933 | # 1934 | @staticmethod 1935 | def set_sign_seperator_corBL(cAtf_Word): 1936 | """ 1937 | Sets the sign seperator - 1938 | to the entities with 1939 | parantheses 1940 | """ 1941 | # 1942 | if "[" 
in cAtf_Word and "-[" in cAtf_Word: 1943 | rep_word = cAtf_Word.replace("-[","[") 1944 | corn_par_sep = rep_word.split("[") 1945 | corn_par = "-[".join(corn_par_sep) 1946 | elif "[" in cAtf_Word and "-[" not in cAtf_Word: 1947 | corn_par = cAtf_Word.replace("[","-[") 1948 | else: 1949 | corn_par = cAtf_Word 1950 | # 1951 | return corn_par 1952 | # 1953 | @staticmethod 1954 | def set_sign_seperator_corBR(cAtf_Word): 1955 | """ 1956 | Sets the sign seperator - 1957 | to the entities with 1958 | parantheses 1959 | """ 1960 | # 1961 | if "]" in cAtf_Word and "]-" in cAtf_Word: 1962 | rep_word = cAtf_Word.replace("]-","]") 1963 | corn_par_sep = rep_word.split("]") 1964 | corn_par = "]-".join(corn_par_sep) 1965 | elif "]" in cAtf_Word and "]-" not in cAtf_Word: 1966 | corn_par = cAtf_Word.replace("]","]-") 1967 | else: 1968 | corn_par = cAtf_Word 1969 | # 1970 | return corn_par 1971 | # 1972 | @staticmethod 1973 | def cleanWord(cWord): 1974 | """ 1975 | Cleans the excessive 1976 | sign seperators that might 1977 | have been generated by the 1978 | set_sign_seperators method 1979 | """ 1980 | # 1981 | first_el = cWord[0] 1982 | last_el = cWord[-1] 1983 | # 1984 | if "-" == first_el: 1985 | cWord = cWord[1:] 1986 | elif "-" == last_el: 1987 | cWord = cWord[:-1] 1988 | else: 1989 | pass 1990 | # 1991 | return cWord 1992 | # 1993 | # 1994 | def set_sign_seperators(self): 1995 | """ 1996 | Uses the previous sign 1997 | seperator methods to add 1998 | sign seperator - to right 1999 | places 2000 | """ 2001 | # 2002 | cvl_word = self.set_sign_seperator_curvL(self.cAtf_word) 2003 | cvr_word = self.set_sign_seperator_curvR(cvl_word) 2004 | crl_word = self.set_sign_seperator_corBL(cvr_word) 2005 | crr_word = self.set_sign_seperator_corBR(crl_word) 2006 | self.clean_word = self.cleanWord(crr_word) 2007 | # 2008 | return self.clean_word 2009 | # 2010 | @staticmethod 2011 | def seperate_signs(clean_word): 2012 | """ 2013 | Seperates the signs and assigns 2014 | them an index number. 2015 | """ 2016 | # 2017 | sign_list_brut = clean_word.split("-") 2018 | sign_list = [sign.strip() for sign in sign_list_brut if sign.strip()] 2019 | sign_list = sign_list 2020 | # 2021 | return sign_list 2022 | # 2023 | def get_detRefs(self): 2024 | """ 2025 | Gets the starting point and the end point 2026 | of determinatives 2027 | """ 2028 | # 2029 | signList_unsort = self.seperate_signs(self.clean_word) 2030 | self.signList = signList_unsort 2031 | signList_pos = list(enumerate(signList_unsort)) 2032 | self.signList_pos = sorted(signList_pos, key=lambda signPos:signPos[0]) 2033 | # (0,'lu'),(1, 'mesz'), etc. 
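        # Editor's note -- an illustrative sketch, not part of the original
        # source: for a hypothetical word "lu2-{d}-en-lil2" the separator
        # methods leave the word unchanged and seperate_signs() yields
        # ['lu2', '{d}', 'en', 'lil2'], so signList_pos becomes
        # [(0, 'lu2'), (1, '{d}'), (2, 'en'), (3, 'lil2')].
        # The sign '{d}' contains both braces, so the loop below records it
        # twice; the doubled reference later pairs with itself in
        # group_detRefs(), detRanger() reduces it to range(1, 2), and
        # mrk_dets() marks the resulting one-sign determinative as "inpos".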
2034 | # 2035 | self.detRef_general_list = [] 2036 | # 2037 | for signPos, sign in self.signList_pos: 2038 | if "{" in sign and "}" in sign: 2039 | self.detRef_general_list.append((signPos,sign)) 2040 | self.detRef_general_list.append((signPos,sign)) 2041 | elif "{" in sign or "}" in sign: 2042 | self.detRef_general_list.append((signPos,sign)) 2043 | # 2044 | else: 2045 | pass 2046 | # 2047 | # 2048 | return self.detRef_general_list 2049 | # 2050 | @staticmethod 2051 | def grouper(iterable, n, fillvalue=None): 2052 | "Collect data into fixed-length chunks or blocks" 2053 | # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" 2054 | args = [iter(iterable)] * n 2055 | # 2056 | return itertools.zip_longest(*args, fillvalue=fillvalue) 2057 | # 2058 | def group_detRefs(self): 2059 | """ 2060 | Groups the AL references for 2061 | marking the AL occurrences 2062 | """ 2063 | # 2064 | det_ref_groups = self.grouper(self.detRef_general_list, 2) 2065 | # 2066 | self.detRef_Group_list = list(det_ref_groups) 2067 | # (signPos,sign), (signPos,sign) 2068 | # 2069 | return self.detRef_Group_list 2070 | # 2071 | @staticmethod 2072 | def detRanger(detRef_Group): 2073 | """ 2074 | Gives the range of sign positions 2075 | included in the determinative 2076 | """ 2077 | # 2078 | first_sign = detRef_Group[0] 2079 | last_sign = detRef_Group[1] 2080 | # 2081 | detRange = range(first_sign[0], last_sign[0]+1) 2082 | # 2083 | return detRange 2084 | # 2085 | def get_detSigns(self): 2086 | """ 2087 | gets the signs of the determinatives 2088 | """ 2089 | # 2090 | for detRef_group in self.detRef_Group_list: 2091 | detSign_list = [] 2092 | detRange = self.detRanger(detRef_group) 2093 | for SP, sign in self.signList_pos: 2094 | #(SignPos, Sign),(SignPos, Sign), etc. 2095 | if SP in detRange: 2096 | detSign_list.append((SP, sign)) 2097 | self.det_signList.append(tuple(detSign_list)) 2098 | # 2099 | # 2100 | #self.det_signList.append(tuple(detSign_list)) 2101 | # 2102 | return self.det_signList 2103 | # 2104 | def uniqDetSigns(self): 2105 | """ 2106 | Filter duplicates from det_signList 2107 | """ 2108 | # 2109 | detSy = set() 2110 | det_list = [] 2111 | # 2112 | for detl in self.det_signList: 2113 | if detl not in detSy: 2114 | detSy.add(detl) 2115 | det_list.append(detl) 2116 | # 2117 | self.det_signList = det_list 2118 | # 2119 | return self.det_signList 2120 | # 2121 | def mrk_dets(self):# detSignlist element of self.det_signList 2122 | """ 2123 | params: detSignlist, [(signPos, sign),(), ...] 2124 | Marks the determinatives as 2125 | prepos, postpos, inpos 2126 | """ 2127 | # 2128 | mark_set = set() 2129 | # 2130 | signList = sorted(self.signList_pos, key=lambda x:x[0]) 2131 | # 2132 | for detSignlist in self.det_signList: 2133 | detList = sorted(detSignlist, key=lambda x:x[0]) 2134 | # detSignlist == [(signPos, sign),(), ...] 2135 | # sort according to sign position 2136 | # sort according to sign position 2137 | if detList[0][0] > signList[0][0] and detList[-1][0] < signList[-1][0]: 2138 | detList.append("inpos") 2139 | mark_set.add(tuple(detList)) 2140 | elif detList[0][0] == 0: 2141 | detList.append("prepos") 2142 | mark_set.add(tuple(detList)) 2143 | elif detList[-1][0] == signList[-1][0]: 2144 | detList.append("postpos") 2145 | mark_set.add(tuple(detList)) 2146 | # 2147 | self.detMarkList = list(mark_set) 2148 | # 2149 | return self.detMarkList 2150 | # 2151 | @staticmethod 2152 | def mk_detDict(detMark, sign_list): 2153 | """ 2154 | params: detMark, ((),(),(), ...,"") 2155 | Constructs the determinatives dictionary. 
2156 | """ 2157 | # 2158 | # detMark == [(signPos, sign),(signPos, sign),MARK] 2159 | det_signList = [detm for detm in detMark if isinstance(detm, tuple)] 2160 | detList_sort = sorted(det_signList, key=lambda x:x[0]) 2161 | det_mark_str = detMark[-1] 2162 | totalSigns = len(sign_list) 2163 | detSigns = [det[1] for det in detList_sort] 2164 | detText = "-".join(detSigns) 2165 | detSignPos = list(enumerate(detList_sort)) 2166 | detPos_list = [det[0] for det in detList_sort] 2167 | detPos = (detPos_list[0],detPos_list[-1]) 2168 | detLength = len(detSigns) 2169 | # 2170 | detEntity_list = [] 2171 | # 2172 | for detSign in detSignPos: 2173 | # detSign == (0,(3,an)),(1,(4,mesz)), etc 2174 | detSign_dict = {} 2175 | detSign_dict["detSign_det"] = detText 2176 | detSign_dict["detSign_det_WordPos"] = detPos 2177 | detSign_dict["detSign_detMark"] = det_mark_str 2178 | detSign_dict["detSign_detSign"] = detSign[1][1] 2179 | detSign_word_pos = {} 2180 | detSign_word_pos["totalSigns_word"] = totalSigns 2181 | detSign_word_pos["detSign_position"] = detSign[1][0] 2182 | detSign_dict["detSign_WordPosition"] = detSign_word_pos 2183 | detSign_sign_pos = {} 2184 | detSign_sign_pos["totalSigns_determinative"] = detLength 2185 | detSign_sign_pos["detSign_position"] = detSign[0] 2186 | detSign_dict["detSign_DetPosition"] = detSign_sign_pos 2187 | detEntity_list.append(detSign_dict) 2188 | # 2189 | detEntity_tuple = tuple(detEntity_list) 2190 | # 2191 | return detEntity_tuple 2192 | # 2193 | def get_detDictS(self): 2194 | """ 2195 | Populates the determinative list 2196 | in the form of list of list of dicts. 2197 | Dicts represent a sign of a determinative 2198 | list of dicts represent the determinative 2199 | list of list of dicts represent the 2200 | determinatives of the word. 2201 | """ 2202 | # 2203 | self.get_detRefs() 2204 | self.group_detRefs() 2205 | signlist = self.signList_pos 2206 | # 2207 | self.get_detSigns() 2208 | self.uniqDetSigns() 2209 | #detSignlist == [[(signPos, sign),(), ...], [(signPos, sign),(), ...] ] 2210 | # detsign == [(signPos, sign),(), ...] 
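        # Editor's note -- an illustrative sketch, not part of the original
        # source: continuing the hypothetical word "lu2-{d}-en-lil2", a single
        # detMark of ((1, '{d}'), 'inpos') passed to mk_detDict() together with
        # the word's four signs would yield one tuple holding one dict, roughly:
        #
        #   ({'detSign_det': '{d}',
        #     'detSign_det_WordPos': (1, 1),
        #     'detSign_detMark': 'inpos',
        #     'detSign_detSign': '{d}',
        #     'detSign_WordPosition': {'totalSigns_word': 4,
        #                              'detSign_position': 1},
        #     'detSign_DetPosition': {'totalSigns_determinative': 1,
        #                             'detSign_position': 0}},)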
2211 | detMarkList = self.mrk_dets() 2212 | # 2213 | for detMark in detMarkList: 2214 | # detMark == [(signPos, sign),(signPos, sign),MARK] 2215 | detDicts = self.mk_detDict(detMark, signlist) 2216 | self.detDict_list.append(detDicts) 2217 | # 2218 | return self.detDict_list 2219 | # 2220 | def set_numberDict(self): 2221 | """ 2222 | Sets the number dict if the word is a number 2223 | """ 2224 | # 2225 | numberDict = {} 2226 | # 2227 | if self.test_isNumberF1() is True: 2228 | number_repetCount = re.search("\d+\(", self.cAtf_word) 2229 | numberDict["number_repetitionCount"] = int(number_repetCount.group(0)[:-1]) 2230 | # -1 for excluding the ( 2231 | number_grapheme = re.search("\(.*?\)", self.cAtf_word) 2232 | numberDict["number_grapheme"] = number_grapheme.group(0)[1:-1] 2233 | # 1, -1 for excluding the () 2234 | elif self.test_isNumberF2() is True: 2235 | number_repetCount = re.search(".*\(", self.cAtf_word) 2236 | numberDict["number_repetitionCount"] = number_repetCount.group(0)[:-1] 2237 | number_grapheme = re.search("\(.*?\)", self.cAtf_word) 2238 | numberDict["number_grapheme"] = number_grapheme.group(0)[1:-1] 2239 | elif self.test_isNumberF3() is True: 2240 | number_repetCount = re.search(".*\(", self.cAtf_word) 2241 | numberDict["number_repetitionCount"] = number_repetCount.group(0)[:-1] 2242 | number_grapheme = re.search("\(.*?\)", self.cAtf_word) 2243 | numberDict["number_grapheme"] = number_grapheme.group(0)[1:-1] 2244 | else: 2245 | pass 2246 | # 2247 | return numberDict 2248 | # 2249 | def set_punctDict(self): 2250 | """ 2251 | Sets the punctuation dictionary 2252 | if there is any qualification 2253 | of the grapheme 2254 | """ 2255 | # 2256 | punctDict = {} 2257 | # 2258 | if self.test_isWordDivider_Specified() is True: 2259 | puncElement = re.search(".*\(", self.cAtf_word) 2260 | punctDict["punctuation_punctElement"] = puncElement.group(0)[:-1] 2261 | # 2262 | puncGrapheme = re.search("\(.*?\)", self.cAtf_word) 2263 | punctDict["punctuation_punctGrapheme"] = puncGrapheme.group(0)[1:-1] 2264 | # 2265 | elif self.test_isBulletSpeficied() is True: 2266 | puncElement = re.search(".*\(", self.cAtf_word) 2267 | punctDict["punctuation_punctElement"] = puncElement.group(0)[:-1] 2268 | # 2269 | puncGrapheme = re.search("\(.*?\)", self.cAtf_word) 2270 | punctDict["punctuation_punctGrapheme"] = puncGrapheme.group(0)[1:-1] 2271 | # 2272 | else: 2273 | pass 2274 | # 2275 | return punctDict 2276 | # 2277 | def wordDictBuild(self): 2278 | """ 2279 | Builds the wordDict 2280 | """ 2281 | # 2282 | self.set_sign_seperators() 2283 | self.get_detRefs() 2284 | self.detDict_list = [] 2285 | self.get_detDictS() 2286 | # 2287 | self.wordDict = {} 2288 | self.wordDict["word_wordSignCount"] = len(self.signList) 2289 | self.wordDict["word_word"] = self.cAtf_word 2290 | self.wordDict["word_determinatives"] = self.detDict_list 2291 | self.wordDict["word_wordSignsPos"] = self.signList_pos 2292 | self.wordDict["word_Signs"] = list(set(self.signList)) 2293 | # Removed duplicates for efficiency 2294 | self.wordDict["word_hasDamage"] = self.test_damaged_sign() 2295 | self.wordDict["word_wordLang"] = self.wordLang 2296 | self.wordDict["word_isNumber"] = self.test_isNumber() 2297 | self.wordDict["word_numberDict"] = self.set_numberDict() 2298 | self.wordDict["word_hasComplement"] = self.test_has_complement() 2299 | self.wordDict["word_hasUnknownReading"] = self.test_has_unknownReading() 2300 | self.wordDict["word_hasComposite"] = self.test_has_composite() 2301 | self.wordDict["word_hasSpecification"] = 
self.test_has_specification() 2302 | self.wordDict["word_hasQuery"] = self.test_has_query() 2303 | self.wordDict["word_hasCollation"] = self.test_has_collation() 2304 | self.wordDict["word_hasCorrection"] = self.test_has_correction() 2305 | self.wordDict["word_isColon"] = self.test_isColon() 2306 | self.wordDict["word_isDColon"] = self.test_isDColon() 2307 | self.wordDict["word_isColonRQ"] = self.test_isColonRQ() 2308 | self.wordDict["word_isColonDQ"] = self.test_isColonDQ() 2309 | self.wordDict["word_isWordDivider"] = self.test_isWordDivider() 2310 | self.wordDict["word_isBullet"] = self.test_isBullet() 2311 | self.wordDict["word_isBulletSpecified"] = self.test_isBulletSpeficied() 2312 | self.wordDict["word_punctuationDict"] = self.set_punctDict() 2313 | self.wordDict["word_isSpecifiedWordDivider"] = self.test_isWordDivider_Specified() 2314 | self.wordDict["word_hasComplement"] = self.test_has_complement() 2315 | self.wordDict["word_hasUnknownReading"] = self.test_has_unknownReading() 2316 | self.wordDict["word_hasCurved"] = self.test_hasCurved() 2317 | self.wordDict["word_hasFlat"] = self.test_hasFlat() 2318 | self.wordDict["word_hasGunu"] = self.test_hasGunu() 2319 | self.wordDict["word_hasSheshig"] = self.test_hasSheshig() 2320 | self.wordDict["word_hasTenu"] = self.test_hasTenu() 2321 | self.wordDict["word_hasNutillu"] = self.test_hasNutillu() 2322 | self.wordDict["word_hasZidatenu"] = self.test_hasZidatenu() 2323 | self.wordDict["word_hasKabatenu"] = self.test_hasKabatenu() 2324 | self.wordDict["word_hasVertReflected"] = self.test_hasVertReflected() 2325 | self.wordDict["word_hasHorReflected"] = self.test_hasHorReflected() 2326 | self.wordDict["word_hasVariant"] = self.test_hasVariant() 2327 | self.wordDict["word_hasRotated"] = self.test_hasRotated() 2328 | self.wordDict["word_hasBeside"] = self.test_hasBeside() 2329 | self.wordDict["word_hasJoining"] = self.test_hasJoining() 2330 | self.wordDict["word_hasAbove"] = self.test_hasAbove() 2331 | self.wordDict["word_hasCrossing"] = self.test_hasCrossing() 2332 | self.wordDict["word_hasAllograph"] = self.test_hasAllograph() 2333 | self.wordDict["word_hasSpecialAllograph"] = self.test_hasSpecialAllograph() 2334 | self.wordDict["word_hasFormVariant"] = self.test_hasFormVariant() 2335 | self.wordDict["word_hasContaining"] = self.test_hasContaining() 2336 | self.wordDict["word_hasContainingGroup"] = self.test_hasContaining_Group() 2337 | # 2338 | return self.wordDict 2339 | 2340 | # ---------------------------------- 2341 | 2342 | 2343 | class cAtfSignDictBuilder(cAtfSignTester): 2344 | """ 2345 | Class regrouping methods for building a signDict 2346 | """ 2347 | # 2348 | # Operator types for Compound Signs ---------------------- 2349 | operator_dict = { 2350 | "beside":".", 2351 | "joining":"+", 2352 | "containing":"x", # This is also used for indicating repetitions. 2353 | # Thus needs to be handled DONE # Binary scope 2354 | "above":"&", # Binary scope 2355 | "crossing":"%", # Binary scope 2356 | "opposing":"@", # This needs to be handled, it is also used in 2357 | # modifiers and part titles. 
modifiers DONE
2358 |                     # binary scope
2359 |     }
2360 |     modifier_dict = {
2361 |         "curved":"@c",
2362 |         "flat":"@f",
2363 |         "gunu":"@g", # 4 extra wedges
2364 |         "sheshig":"@s", # added sze sign
2365 |         "tenu":"@t", # slanting
2366 |         "nutillu":"@n", # unfinished
2367 |         "zidatenu":"@z", # slanting right
2368 |         "kabatenu":"@k", # slanting left
2369 |         "verticallyReflected":"@r",
2370 |         "horizontallyReflected":"@h",
2371 |         "variant":"@v"
2372 |         # Rotations need to be handled separately DONE
2373 |     }
2374 |     #
2375 |     def __init__(self, catf_sign):
2376 |         super().__init__(catf_sign)
2377 |         self.catf_sign = catf_sign
2378 |         self.signDict = {}
2379 |         self.compositeSign = ""
2380 |         self.prnthsPosition_list = []
2381 |         self.sign_dict_list = []
2382 |         self.signRelation_dict_list = []
2383 |     #
2384 |     #
2385 |     #
2386 |     """
2387 |     TODO Specifications are treated as
2388 |     words when they are delimited by space
2389 |     signs when they are delimited by -
2390 |     I need to make a decision about numbers.
2391 |     They may also consist of compound signs.
2392 |     """
2393 |     #
2394 |     def get_compositeSign(self):
2395 |         """
2396 |         Gets the composite sign.
2397 |         """
2398 |         #
2399 |         if self.test_isComposite() is True:
2400 |             composite_sign_search = re.search("\|.*?\|", self.catf_sign)
2401 |             self.signDict["sign_isDamaged"] = self.test_isDamaged()
2402 |             # This test is done here because
2403 |             # C-ATF treats compound signs as atoms
2404 |             # If one would like to extend this extractor to
2405 |             # O-ATF then this has to be moved elsewhere.
2406 |             composite_sign = composite_sign_search.group(0)
2407 |             self.compositeSign = composite_sign[1:-1]
2408 |             # 1 - -1 for getting rid of | on both sides
2409 |         else:
2410 |             pass
2411 |         #
2412 |         return self.compositeSign
2413 |     #
2414 |     @staticmethod
2415 |     def get_nestElements(nestedString):
2416 |         """
2417 |         Generates the parenthesis content
2418 |         with its associated level
2419 |         if the composite sign is nested.
2420 | 
2421 |         Code adapted from SO:
2422 |         author: Gareth Rees
2423 |         date Published: 2010-11-26-12-32
2424 |         date Retrieved: 2017-04-23-19-54
2425 |         url: http://stackoverflow.com/questions/4284991/parsing-nested-parentheses-in-python-grab-content-by-level
2426 |         """
2427 |         #
2428 |         paren_stack = []
2429 |         for i, char in enumerate(nestedString):
2430 |             # Ex. CompositeSign == |AN.(ANxAN)&((AN.AN)%AN)|
2431 |             if char == "(":
2432 |                 paren_stack.append(i)
2433 |                 # Adds the position of (
2434 |             elif char == ")" and paren_stack:
2435 |                 # Comes the next )
2436 |                 start = paren_stack.pop()
2437 |                 # Gives the last added ( position
2438 |                 # The logic is that the last added ( would correspond to
2439 |                 # the first ) and by using pop we ensure
2440 |                 # that the second ) doesn't mismatch with the ( of
2441 |                 # the previous right parenthesis.
2442 |                 yield (len(paren_stack),list(range(start, i+1)), nestedString[start+1:i])
2443 |                 # the last expression inside the [] excludes the i and
2444 |                 # adds one to the position of the ( so that we have the
2445 |                 # content.
2446 |                 # **WARNING** Range values include the parentheses
2447 |     #
2448 |     def get_OpPositions(self, compoundSign):
2449 |         """
2450 |         gets the operator positions from the
2451 |         compound sign.
2452 | """ 2453 | # 2454 | opPosition_list = [] 2455 | # 2456 | for charPos, char in enumerate(compoundSign): 2457 | if char in self.operator_dict.values(): 2458 | opPosition_list.append((charPos, char)) 2459 | # 2460 | # 2461 | return opPosition_list 2462 | # 2463 | @staticmethod 2464 | def get_nestLevelDict(nestList): 2465 | """ 2466 | Maps the output of the generator 2467 | expression to a dictionary 2468 | for facilitating later use. 2469 | """ 2470 | # 2471 | nestLevel_dict_list = [] 2472 | # 2473 | for nestL in nestList: 2474 | nestLDict = {} 2475 | nestLDict["nest_level"] = nestL[0] 2476 | nestLDict["nest_range"] = nestL[1] 2477 | nestLDict["nest_content"] = nestL[2] 2478 | nestLevel_dict_list.append(nestLDict) 2479 | # 2480 | return nestLevel_dict_list 2481 | # 2482 | @staticmethod 2483 | def get_nestDict(nestList): 2484 | """ 2485 | Creates a dictionary based on nest levels. 2486 | """ 2487 | # 2488 | nestDict = {} 2489 | # 2490 | sort_nestList = sorted(nestList, key=lambda x:x[0]) # 2491 | # 2492 | for nestEl in sort_nestList: 2493 | nestDict.setdefault(nestEl[0], []).append(nestEl[1:]) 2494 | # 2495 | return nestDict 2496 | # 2497 | @staticmethod 2498 | def nestDict_LevelRangeCreator(nestDict): 2499 | """ 2500 | Regroups the range list of nest elements 2501 | for each level and appends it to the end 2502 | of the value associated with the nest level 2503 | """ 2504 | # 2505 | nestDict_Ranges = {} 2506 | for key, nestEl in nestDict.items(): 2507 | nestLevel_range_list = [] 2508 | for nestTuple in nestEl: 2509 | nestLevel_range_list.extend(nestTuple[0]) 2510 | # nestTuple[0] should correspond to list of char positions 2511 | # 2512 | nestDict_Ranges[key] = nestEl 2513 | nestDict_Ranges[key].append(nestLevel_range_list) 2514 | # 2515 | return nestDict_Ranges 2516 | # 2517 | # "|(AN.((IR2%IR3).((AN&AN)+(IR3xAN))).((AN.IR3)xNITA))|" Test sign 2518 | # 2519 | @staticmethod 2520 | def get_OpDict_list(nestDict_Ranges, opPosition_list): 2521 | """ 2522 | Gets the operator levels plus one position before and after the 2523 | operator position. Maps all of this to a dictionary. 2524 | Appends the dictionary to a list 2525 | """ 2526 | # 2527 | opDictList = [] 2528 | # 2529 | for opPosition in opPosition_list: 2530 | for level, nestEl in nestDict_Ranges.items(): 2531 | nestRangeList = nestEl[-1] 2532 | if opPosition[0] in nestRangeList: 2533 | posPlace = nestRangeList.index(opPosition[0]) 2534 | posDict = {} 2535 | posDict["operatorPosition_nestlevel"] = level 2536 | posDict["operatorPosition_after"] = nestRangeList[posPlace+1:posPlace+4] 2537 | # This for checking modifier types afterwards 2538 | # Especially the rotation. 2539 | posDict["operatorPosition_before"] = nestRangeList[posPlace-1] 2540 | # Might come in handy for checking 'repeated' operator 2541 | posDict["operatorPosition_position"] = opPosition[0] 2542 | posDict["operatorPosition_operator"] = opPosition[1] 2543 | opDictList.append(posDict) 2544 | # 2545 | # 2546 | return opDictList 2547 | # 2548 | @staticmethod 2549 | def get_OpLevelPosition(opDictList): 2550 | """ 2551 | Eliminates the duplicate occurrences 2552 | for the operators. Only the 2553 | highest level in which the 2554 | operator occured is retained. 2555 | Function groups the operators 2556 | according to their positions 2557 | then makes a list with the highest levels 2558 | within the group. 
2559 | """ 2560 | # 2561 | opdictsSorted = sorted(opDictList, key=lambda opDict:opDict["operatorPosition_position"]) # Sort list according to operator positions 2562 | opDictsGrouped = [list(group) for key, group in itertools.groupby(opdictsSorted, key=lambda x:x["operatorPosition_position"])] 2563 | # Group elements according to operator positions 2564 | opDictGroupsSort = [sorted(groupList, key=lambda opDict:opDict["operatorPosition_nestlevel"]) for groupList in opDictsGrouped] 2565 | # Sort group list according to the nest level 2566 | operatorPos_level_list = [sorted_group[-1] for sorted_group in opDictGroupsSort] 2567 | # 2568 | return operatorPos_level_list 2569 | # 2570 | def get_SignRelationBS(self, 2571 | operatorPos_level_list, 2572 | nestLevel_dict_list, 2573 | compositeSign): 2574 | """ 2575 | Gets the sign or sign groups that 2576 | are associated with each other through 2577 | a binary scoped operator 2578 | """ 2579 | # 2580 | signRelation_dict_list = [] 2581 | # 2582 | for operatorPos_level in operatorPos_level_list: 2583 | operatorNestLevel = operatorPos_level["operatorPosition_nestlevel"] 2584 | operatorPos = operatorPos_level["operatorPosition_position"] 2585 | operator = operatorPos_level["operatorPosition_operator"] 2586 | for nestLevel_dict in nestLevel_dict_list: 2587 | nestRange = nestLevel_dict["nest_range"] 2588 | nestLevel = nestLevel_dict["nest_level"] 2589 | nestContent = nestLevel_dict["nest_content"] 2590 | if self.test_isBinaryScope(operator) is True: 2591 | # x and @ will be handled individually 2592 | # we test only for % and & 2593 | if operatorPos in nestRange and operatorNestLevel == nestLevel: 2594 | opPosinRange = nestRange.index(operatorPos) 2595 | opPrecedents = nestRange[1:opPosinRange] 2596 | # 1 for excluding the ( 2597 | opFollowers = nestRange[opPosinRange+1:-1] 2598 | # -1 for excluding ) 2599 | opPrecLength = len(opPrecedents) 2600 | opPrecChars = nestContent[:opPrecLength] 2601 | opFolChars = nestContent[opPrecLength+1:] 2602 | # +1 for excluding the operator 2603 | signRelation_dict = {} 2604 | signRelation_dict["SR_operator"] = operator 2605 | signRelation_dict["SR_operator_antec"] = opPrecChars 2606 | signRelation_dict["SR_operator_subsq"] = opFolChars 2607 | signRelation_dict["SR_nest_level"] = nestLevel 2608 | signRelation_dict["SR_nest_content"] = nestContent 2609 | signRelation_dict["SR_compositeSign"] = compositeSign 2610 | signRelation_dict["SR_nest_range"] = nestRange 2611 | if "(" in opPrecChars and ")" in opPrecChars and ")" in opFolChars and "(" in opFolChars: 2612 | signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Group", "operator_subsequent":"Group"} 2613 | elif "(" in opPrecChars and ")" in opPrecChars and ")" not in opFolChars and not "(" in opFolChars: 2614 | signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Group", "operator_subsequent":"Sign"} 2615 | elif "(" not in opPrecChars and ")" not in opPrecChars and ")" in opFolChars and "(" in opFolChars: 2616 | signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Sign", "operator_subsequent":"Group"} 2617 | elif "(" not in opPrecChars and ")" not in opPrecChars and ")" not in opFolChars and "(" not in opFolChars: 2618 | signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Sign", "operator_subsequent":"Sign"} 2619 | signRelation_dict["SR_operator_position"] = operatorPos 2620 | if operator == "%": 2621 | signRelation_dict["SR_operator_type"] = "crossing" 2622 | elif operator == "&": 2623 | signRelation_dict["SR_operator_type"] 
= "above" 2624 | signRelation_dict["SR_operator_antec_range"] = opPrecedents 2625 | signRelation_dict["SR_operator_subseq_range"] = opFollowers 2626 | self.signRelation_dict_list.append(signRelation_dict) 2627 | # 2628 | return self.signRelation_dict_list 2629 | # 2630 | def get_SignRelationSpeCases(self,operatorPos_level_list, nestLevel_dict_list, compositeSign): 2631 | """ 2632 | Gets the sign or sign groups that 2633 | are associated with each other through 2634 | x and @ operators 2635 | """ 2636 | # 2637 | signRelation_dict_list = [] 2638 | # 2639 | for operatorPos_level in operatorPos_level_list: 2640 | operatorNestLevel = operatorPos_level["operatorPosition_nestlevel"] 2641 | operatorPos = operatorPos_level["operatorPosition_position"] 2642 | operator = operatorPos_level["operatorPosition_operator"] 2643 | for nestLevel_dict in nestLevel_dict_list: 2644 | nestRange = nestLevel_dict["nest_range"] 2645 | nestLevel = nestLevel_dict["nest_level"] 2646 | nestContent = nestLevel_dict["nest_content"] 2647 | if operatorPos in nestRange and operatorNestLevel == nestLevel: 2648 | opPosinRange = nestRange.index(operatorPos) 2649 | opPrecedents = nestRange[1:opPosinRange] 2650 | # 1 for excluding the ( 2651 | opFollowers = nestRange[opPosinRange+1:-1] 2652 | # -1 for excluding ) 2653 | opPrecLength = len(opPrecedents) 2654 | opPrecChars = nestContent[:opPrecLength] 2655 | opFolChars = nestContent[opPrecLength+1:] 2656 | # +1 for excluding the operator 2657 | signRelation_dict = {} 2658 | signRelation_dict["SR_operator"] = operator 2659 | signRelation_dict["SR_operator_antec"] = opPrecChars 2660 | signRelation_dict["SR_operator_subsq"] = opFolChars 2661 | signRelation_dict["SR_nest_level"] = nestLevel 2662 | signRelation_dict["SR_nest_content"] = nestContent 2663 | signRelation_dict["SR_compositeSign"] = compositeSign 2664 | signRelation_dict["SR_nest_range"] = nestRange 2665 | if "(" in opPrecChars and ")" in opPrecChars and ")" in opFolChars and "(" in opFolChars: 2666 | signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Group", "operator_subsequent":"Group"} 2667 | elif "(" in opPrecChars and ")" in opPrecChars and ")" not in opFolChars and not "(" in opFolChars: 2668 | signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Group", "operator_subsequent":"Sign"} 2669 | elif "(" not in opPrecChars and ")" not in opPrecChars and ")" in opFolChars and "(" in opFolChars: 2670 | signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Sign", "operator_subsequent":"Group"} 2671 | elif "(" not in opPrecChars and ")" not in opPrecChars and ")" not in opFolChars and "(" not in opFolChars: 2672 | signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Sign", "operator_subsequent":"Sign"} 2673 | signRelation_dict["SR_operator_position"] = operatorPos 2674 | signRelation_dict["SR_operator_antec_range"] = opPrecedents 2675 | signRelation_dict["SR_operator_subseq_range"] = opFollowers 2676 | if operator == ".": 2677 | signRelation_dict["SR_operator_type"] = "beside" 2678 | elif operator == "+": 2679 | signRelation_dict["SR_operator_type"] = "joining" 2680 | elif operator == "x" and opPrecChars.isdigit(): 2681 | signRelation_dict["SR_operator_type"] = "repeated" 2682 | elif operator == "x" and not opPrecChars.isdigit(): 2683 | 2684 | signRelation_dict["SR_operator_type"] = "containing" 2685 | elif operator == "@": 2686 | if re.search("^\d+", opFolChars) is not None: 2687 | # This means that the @ sign is 2688 | # a modifier here so we restart looping 2689 | continue 2690 
| else: 2691 | opFolCharsOper = nestContent[opPrecLength:opPrecLength+3] 2692 | # Includes the operator @ 2693 | if "@c" in opFolCharsOper or "@f" in opFolCharsOper or "@g" in opFolCharsOper or "@s" in opFolCharsOper or "@s" in opFolCharsOper or "@t" in opFolCharsOper or "@n" in opFolCharsOper or "@z" in opFolCharsOper or "@k" in opFolCharsOper or "@r" in opFolCharsOper or "@h" in opFolCharsOper or "@v" in opFolCharsOper: 2694 | # This means that @ sign is 2695 | # a modifier so we restart looping 2696 | continue 2697 | else: 2698 | signRelation_dict["SR_operator_type"] = "opposing" 2699 | # 2700 | self.signRelation_dict_list.append(signRelation_dict) 2701 | # 2702 | # 2703 | return self.signRelation_dict_list 2704 | # 2705 | @staticmethod 2706 | def get_unNestedCompSigns(compositeSign, opPosition_list): 2707 | """ 2708 | gets the signs of composite sign 2709 | that is not nested. 2710 | """ 2711 | # 2712 | signRelation_dict_list = [] 2713 | # 2714 | for opPos in opPosition_list: 2715 | opP = opPos[0] 2716 | opChar = opPos[1] 2717 | opAnte = compositeSign[:opP] 2718 | opSubseq = compositeSign[opP:] 2719 | signRelation_dict = {} 2720 | signRelation_dict["SR_operator"] = opChar 2721 | signRelation_dict["SR_operator_antec"] = opAnte 2722 | signRelation_dict["SR_operator_subsq"] = opSubseq[1:] 2723 | # 1 for excluding the operator in mapping 2724 | signRelation_dict["SR_compositeSign"] = compositeSign 2725 | signRelation_dict["SR_nest_level"] = 0 2726 | signRelation_dict["SR_nest_content"] = compositeSign 2727 | signRelation_dict["SR_nest_range"] = list(range(0,len(compositeSign))) 2728 | 2729 | signRelation_dict["SR_operator_position"] = opP 2730 | signRelation_dict["SR_relation_type"] = {"operator_antecedent":"Sign", "operator_subsequent":"Sign"} 2731 | if opChar == "%": 2732 | signRelation_dict["SR_operator_type"] = "crossing" 2733 | elif opChar == "&": 2734 | signRelation_dict["SR_operator_type"] = "above" 2735 | elif opChar == ".": 2736 | signRelation_dict["SR_operator_type"] = "beside" 2737 | elif opChar == "+": 2738 | signRelation_dict["SR_operator_type"] = "joining" 2739 | elif opChar == "x" and opAnte.isdigit(): 2740 | signRelation_dict["SR_operator_type"] = "repeated" 2741 | elif opChar == "x" and not opAnte.isdigit(): 2742 | signRelation_dict["SR_operator_type"] = "containing" 2743 | elif operator == "@": 2744 | if re.search("^\d+",opSubseq[1:]) is not None: 2745 | # Starts from 1 because opSubseq[0]== operator 2746 | # This means that the @ sign is 2747 | # a modifier here so we restart looping 2748 | continue 2749 | else: 2750 | opFolCharsOper = opSubseq[0:2] 2751 | # Includes the operator @ 2752 | if "@c" in opFolCharsOper or "@f" in opFolCharsOper or "@g" in opFolCharsOper or "@s" in opFolCharsOper or "@s" in opFolCharsOper or "@t" in opFolCharsOper or "@n" in opFolCharsOper or "@z" in opFolCharsOper or "@k" in opFolCharsOper or "@r" in opFolCharsOper or "@h" in opFolCharsOper or "@v" in opFolCharsOper: 2753 | # This means that @ sign is 2754 | # a modifier so we restart looping 2755 | continue 2756 | else: 2757 | signRelation_dict["SR_operator_type"] = "opposing" 2758 | signRelation_dict["SR_operator_antec_range"] = list(range(0, opP)) 2759 | signRelation_dict["SR_operator_subseq_range"] = list(range(opP, len(compositeSign))) 2760 | # 2761 | self.signRelation_dict_list.append(signRelation_dict) 2762 | # 2763 | return self.signRelation_dict_list 2764 | # 2765 | @staticmethod 2766 | def get_signsSR(signRelDict): 2767 | """ 2768 | Gets signs from the sign dict. 
2769 | """ 2770 | # 2771 | compoundSign_signList = [] 2772 | if signRelDict["SR_relation_type"]["operator_antecedent"] == "Sign" and signRelDict["SR_relation_type"]["operator_subsequent"] == "Sign": 2773 | compoundSign_signList.append(signRelDict["SR_operator_antec"]) 2774 | compoundSign_signList.append(signRelDict["SR_operator_subsq"]) 2775 | compoundSign_signList.append(signRelDict) 2776 | # 2777 | elif signRelDict["SR_relation_type"]["operator_antecedent"] == "Group" and signRelDict["SR_relation_type"]["operator_subsequent"] == "Sign": 2778 | compoundSign_signList.append(signRelDict["SR_operator_subsq"]) 2779 | compoundSign_signList.append(signRelDict) 2780 | # 2781 | elif signRelDict["SR_relation_type"]["operator_antecedent"] == "Sign" and signRelDict["SR_relation_type"]["operator_subsequent"] == "Group": 2782 | compoundSign_signList.append(signRelDict) 2783 | # 2784 | return compoundSign_signList 2785 | # 2786 | def get_signComplement(self): 2787 | """ 2788 | Gets the signs from a sign 2789 | that has a complement 2790 | """ 2791 | # 2792 | complement_sign_list = [] 2793 | 2794 | if self.test_isComplement(self.catf_sign) is True: 2795 | compSplit = self.catf_sign.split("+") 2796 | complement_sign = compSplit[1] 2797 | complement_sign_list.append(complement_sign) 2798 | # 2799 | return complement_sign_list 2800 | # 2801 | @staticmethod 2802 | def char_convert(sign): 2803 | """ 2804 | Convert CDLI C-ATF characters 2805 | to unicode 2806 | """ 2807 | # 2808 | text_sz = sign.replace("sz","\u0161") # sz -> š 2809 | text_SZ = text_sz.replace("SZ", "\u0160") # SZ -> Š 2810 | text_sPo = text_SZ.replace("s,", "\u1e63") # s, -> ṣ 2811 | text_SPo = text_sPo.replace("S,", "\u1e62") # S, -> Ṣ 2812 | text_tch = text_SPo.replace("t,", "\u1e6d") # t, -> ṭ 2813 | text_TCH = text_tch.replace("T,", "\u1e6c") # T, -> Ṭ 2814 | text_s = text_TCH.replace("s'", "\u015b") # s' -> ś 2815 | text_S = text_s.replace("S'","\u015a") # S' -> Ś 2816 | text_ayn = text_S.replace("'", "\u02be") # ' -> ʾ 2817 | text_sub0 = text_ayn.replace("0","\u2080")# Subscript numbers 2818 | text_sub1 = text_sub0.replace("1","\u2081") 2819 | text_sub2 = text_sub1.replace("2","\u2082") 2820 | text_sub3 = text_sub2.replace("3","\u2083") 2821 | text_sub4 = text_sub3.replace("4","\u2084") 2822 | text_sub5 = text_sub4.replace("5","\u2085") 2823 | text_sub6 = text_sub5.replace("6","\u2086") 2824 | text_sub7 = text_sub6.replace("7","\u2087") 2825 | text_sub8 = text_sub7.replace("8","\u2088") 2826 | text_sub9 = text_sub8.replace("9","\u2089") 2827 | text_subx = text_sub9.replace("x²","\u208a") # subscript x 2828 | text_subX = text_subx.replace("X²","\u208a") 2829 | text_h = text_subX.replace("h,", "\u1e2b") # h, -> ḫ 2830 | text_H = text_h.replace("H,", "\u1e2a") # H, -> Ḫ 2831 | text_j = text_H.replace("j","\u014b") # j -> ŋ 2832 | text_J = text_j.replace("J","\u014a") # J -> Ŋ 2833 | # 2834 | return text_J 2835 | # 2836 | @staticmethod 2837 | def signDictBuild(sign): 2838 | """ 2839 | params: 2840 | sign, str. 2841 | C(ompound/complement) S(ign), boolean 2842 | 2843 | Returns the sign dict 2844 | with all the features. 
2845 |         """
2846 |         #
2847 |         signDict = {}
2848 |         tester_class = cAtfSignTester(sign)
2849 |         signDict["sign_sign"] = sign
2850 |         signDict["sign_isComplement"] = tester_class.test_isComplement()
2851 |         signDict["sign_isQuery"] = tester_class.test_is_query()
2852 |         signDict["sign_isCorrection"] = tester_class.test_is_correction()
2853 |         signDict["sign_isCollation"] = tester_class.test_is_collation()
2854 |         signDict["sign_isCurved"] = tester_class.test_isCurved()
2855 |         signDict["sign_isFlat"] = tester_class.test_isFlat()
2856 |         signDict["sign_isGunu"] = tester_class.test_isGunu()
2857 |         signDict["sign_isSheshig"] = tester_class.test_isSheshig()
2858 |         signDict["sign_isTenu"] = tester_class.test_isTenu()
2859 |         signDict["sign_isNutillu"] = tester_class.test_isNutillu()
2860 |         signDict["sign_isZidatenu"] = tester_class.test_isZidatenu()
2861 |         signDict["sign_isKabatenu"] = tester_class.test_isKabatenu()
2862 |         signDict["sign_isVertReflected"] = tester_class.test_isVertReflected()
2863 |         signDict["sign_hasAllograph"] = tester_class.test_hasAllograph()
2864 |         signDict["sign_hasSpecialAllograph"] = tester_class.test_hasSpecialAllograph()
2865 |         signDict["sign_isHorReflected"] = tester_class.test_isHorReflected()
2866 |         signDict["sign_isVariant"] = tester_class.test_isVariant()
2867 |         signDict["sign_isRotated"] = tester_class.test_isRotated()
2868 |         #signDict["sign_isPartOfComposite"] = test_isComposite()
2869 |         #signDict["sign_nestLevel"] = 0 # if it is not a composite
2870 |         #signDict["sign_isUnknownReading"] = test_isUnknownReading() # if it is not a composite
2871 |         #signDict["sign_relatedSigns"] = {} # the list of signs that form
2872 |         # the composite will be added here
2873 |         return signDict
2874 |     #
2875 |     def buildSignDict(self):
2876 |         """
2877 |         Wraps the methods defined throughout the class.
2878 |         """
2879 |         #
2880 |         sign_dict_list = []
2881 |         #
2882 |         if self.test_isComposite() is True and self.test_isSpecification() is True:
2883 |             # Basically it is a nested composite sign
2884 |             compositeSign = self.get_compositeSign()
2885 |             nestedElements = self.get_nestElements(compositeSign)
2886 |             opPositonList = self.get_OpPositions(compositeSign)
2887 |             nestList = list(nestedElements)
2888 |             nestLevelDictList = self.get_nestLevelDict(nestList)
2889 |             nest_dict = self.get_nestDict(nestList)
2890 |             nest_dict_levelRange = self.nestDict_LevelRangeCreator(nest_dict)
2891 |             opDict_list = self.get_OpDict_list(
2892 |                 nest_dict_levelRange,
2893 |                 opPositonList
2894 |             )
2895 |             opLvlPosition = self.get_OpLevelPosition(opDict_list)
2896 |             SR_dictList_BS = self.get_SignRelationBS(
2897 |                 opLvlPosition,
2898 |                 nestLevelDictList,
2899 |                 compositeSign
2900 |             )
2901 |             SR_dictList_SCases = self.get_SignRelationSpeCases(
2902 |                 opLvlPosition,
2903 |                 nestLevelDictList,
2904 |                 compositeSign
2905 |             )
2906 |             SR_dictList = SR_dictList_SCases + SR_dictList_BS
2907 |             compoundSign_SR_lists_brut = [self.get_signsSR(SignDict) for SignDict in SR_dictList]
2908 |             # There are empty lists in the raw result
2909 |             # created by the group - group associations
2910 |             compoundSign_SR_lists = list(filter(None, compoundSign_SR_lists_brut))
2911 |             # They are filtered now.
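            # Editor's note -- an illustrative sketch, not part of the original
            # source: intermediate structures for a small hypothetical compound
            # sign "AN.(ANxAN)" (the content between the pipes):
            #
            #   list(self.get_nestElements("AN.(ANxAN)"))
            #   # -> [(0, [3, 4, 5, 6, 7, 8, 9], 'ANxAN')]
            #   self.get_OpPositions("AN.(ANxAN)")
            #   # -> [(2, '.'), (6, 'x')]
            #
            # i.e. one nest at level 0 spanning characters 3-9 with content
            # 'ANxAN', a "beside" operator at position 2 and a "containing"
            # operator at position 6.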
2912 |             for compoundSignList in compoundSign_SR_lists:
2913 |                 SR_dict = compoundSignList[-1]
2914 |                 for signElement in compoundSignList:
2915 |                     if not isinstance(signElement, dict):
2916 |                         self.signDict = self.signDictBuild(signElement)
2917 |                         self.signDict["sign_isPartOfComposite"] = True
2918 |                         self.signDict["sign_isUnknownReading"] = False
2919 |                         self.signDict["sign_relatedSigns"] = SR_dict
2920 |                         self.signDict["sign_nestLevel"] = SR_dict["SR_nest_level"]
2921 |                         self.signDict["sign_compositeSign"] = SR_dict["SR_compositeSign"]
2922 |                         sign_dict_list.append(self.signDict)
2923 |             # Compound Nested DONE
2924 |         #
2925 |         elif self.test_isComposite() is True and self.test_isSpecification() is False:
2926 |             # Compound Not Nested
2927 |             compositeSign = self.get_compositeSign()
2928 |             opPositonList = self.get_OpPositions(compositeSign)
2929 |             unNestedList = self.get_unNestedCompSigns(
2930 |                 compositeSign, opPositonList
2931 |             )
2932 |             compoundSign_SR_lists = [self.get_signsSR(SignDict) for SignDict in unNestedList]
2933 |             for compoundSignList in compoundSign_SR_lists:
2934 |                 SR_dict = compoundSignList.pop()
2935 |                 for signElement in compoundSignList:
2936 |                     self.signDict = self.signDictBuild(signElement)
2937 |                     self.signDict["sign_isPartOfComposite"] = True
2938 |                     self.signDict["sign_isUnknownReading"] = False
2939 |                     self.signDict["sign_relatedSigns"] = SR_dict
2940 |                     self.signDict["sign_nestLevel"] = SR_dict["SR_nest_level"]
2941 |                     self.signDict["sign_compositeSign"] = SR_dict["SR_compositeSign"]
2942 |                     sign_dict_list.append(self.signDict)
2943 |         #
2944 |         # Compound not Nested DONE
2945 |         #
2946 |         elif self.test_isComposite() is False and self.test_isComplement() is True:
2947 |             # Not a Compound Sign but is a complement
2948 |             complementSignList = self.get_signComplement()
2949 |             for complementSign in complementSignList:
2950 |                 self.signDict = self.signDictBuild(complementSign)
2951 |                 self.signDict["sign_isPartOfComposite"] = False
2952 |                 self.signDict["sign_isUnknownReading"] = self.test_isUnknownReading()
2953 |                 self.signDict["sign_relatedSigns"] = {} # TODO get Related Sign for Complement Signs
2954 |                 self.signDict["sign_nestLevel"] = 0
2955 |                 self.signDict["sign_compositeSign"] = ""
2956 |                 sign_dict_list.append(self.signDict)
2957 |         # Complement sign DONE
2958 |         #
2959 |         elif self.test_isComplement() is False and self.test_isComposite() is False:
2960 |             self.signDict = self.signDictBuild(self.catf_sign)
2961 |             self.signDict["sign_isPartOfComposite"] = False
2962 |             self.signDict["sign_isUnknownReading"] = self.test_isUnknownReading()
2963 |             self.signDict["sign_isDamaged"] = self.test_isDamaged()
2964 |             self.signDict["sign_relatedSigns"] = {} # TODO get Related Sign
2965 |             self.signDict["sign_nestLevel"] = 0
2966 |             self.signDict["sign_compositeSign"] = ""
2967 |             sign_dict_list.append(self.signDict)
2968 |         #
2969 |         return sign_dict_list
2970 |     #
2971 |     # Algorithm DONE
2972 |     # Tests! DONE
2973 | 
2974 | 
2975 | 
2976 | class cAtfTextBuilder(object):
2977 |     """
2978 |     Builds the raw text as a feature
2979 |     dictionary, by calling the methods
2980 |     from the classes above.
2981 | """ 2982 | # 2983 | def __init__(self, text): 2984 | # 2985 | self.text_brut = text 2986 | self.atf_section = "" 2987 | self.textLang = "" 2988 | self.wordLang = "" 2989 | self.alLanguage = "" 2990 | self.detLang = "" 2991 | self.object_parts_list = [] 2992 | self.objectIdPart = [] 2993 | self.catf_text_dict = {} 2994 | self.objectPartLines_list = [] 2995 | self.objectTextParts = [] 2996 | self.textPart_dict_list = [] 2997 | # 2998 | # Section Methods 2999 | # 3000 | def get_atf_section(self): 3001 | """ 3002 | params: atf_file, str. 3003 | return: atf_section, str. 3004 | 3005 | Takes a text given as the text output 3006 | of the cdli splits the atf section 3007 | for later use. 3008 | """ 3009 | # 3010 | find_atf_section = re.search("&P\d+.*", self.text_brut, re.DOTALL) 3011 | # 3012 | self.atf_section = find_atf_section.group(0) 3013 | # 3014 | return self.atf_section 3015 | # 3016 | def get_object_parts(self): 3017 | """ 3018 | params: atf_section, str. 3019 | return: object_part_list, [] 3020 | """ 3021 | # 3022 | try: 3023 | if "\n" not in self.atf_section: 3024 | raise ValueError("Newline character doesn't match to expected unix input type") 3025 | else: 3026 | pass 3027 | except ValueError as newlineError: 3028 | print(newlineError) 3029 | print("\n\n check if you have indeed specified \\n as \n the newline character while opening the text.") 3030 | return 3031 | else: 3032 | pass 3033 | object_part_split = self.atf_section.split("\n@") 3034 | object_part_id_part = object_part_split[0] 3035 | object_part_parts = object_part_split[1:] 3036 | self.object_parts_list = ["@" + part for part in object_part_parts] 3037 | self.object_parts_list.insert(0,object_part_id_part) 3038 | # 3039 | return self.object_parts_list 3040 | # 3041 | def splitLinesOParts(self): 3042 | """ 3043 | Splits the object part 3044 | into lines 3045 | """ 3046 | # 3047 | self.objectPartLines_list = [objectPart.splitlines() for objectPart in self.object_parts_list] 3048 | # 3049 | return self.objectPartLines_list 3050 | # 3051 | def get_ObjetIdPart(self): 3052 | """ 3053 | Gets the part in which 3054 | the id of the text occurs 3055 | 3056 | # In objectPartLines_list: 3057 | # [0] is the id part, [1] is the type part 3058 | # [2] is the text parts 3059 | 3060 | """ 3061 | # 3062 | self.objectIdPart = self.objectPartLines_list[0] 3063 | # 3064 | return self.objectIdPart 3065 | # 3066 | def get_text_id(self): 3067 | """ 3068 | Gets the text id from the 3069 | object id part 3070 | """ 3071 | # 3072 | for line in self.objectIdPart: 3073 | c_atf_line = cAtfLineGetter(line) 3074 | if len(c_atf_line.get_id_line()) != 0: 3075 | self.catf_text_dict["text_id"] = c_atf_line.get_id_line() 3076 | elif len(c_atf_line.get_id_alternatives()) != 0: 3077 | self.catf_text_dict["text_id_alternatives"] = c_atf_line.get_id_alternatives() 3078 | elif len(c_atf_line.get_language_line()) != 0: 3079 | self.catf_text_dict["text_language"] = c_atf_line.get_language_line() 3080 | # 3081 | return self.catf_text_dict 3082 | # 3083 | def set_textLang(self): 3084 | """ 3085 | sets the text language for 3086 | passing it to the other 3087 | sections 3088 | """ 3089 | # 3090 | self.textLang = self.catf_text_dict["text_language"] 3091 | # 3092 | return self.textLang 3093 | # 3094 | def get_objectTypePart(self): 3095 | """ 3096 | Gets the parts of 3097 | the text indicated by @ 3098 | 3099 | # In objectPartLines_list: 3100 | # [0] is the id part, [1] is the type part 3101 | # [2] is the text parts 3102 | 3103 | """ 3104 | # 3105 | 
self.objectTypePart = self.objectPartLines_list[1][0].strip() 3106 | # [1] corresponds to the list which contains only the type string 3107 | # Hence [0].strip() 3108 | # 3109 | return self.objectTypePart 3110 | # 3111 | def get_textParts(self): 3112 | """ 3113 | Gets the list of text parts 3114 | from the object part list 3115 | 3116 | This should correspond to [2:] 3117 | """ 3118 | # 3119 | self.objectTextParts = self.objectPartLines_list[2:] 3120 | # 3121 | return self.objectTextParts 3122 | # 3123 | def set_text_PartInfo(self): 3124 | """ 3125 | Sets what we have so far 3126 | to the text dictionary 3127 | """ 3128 | # 3129 | self.catf_text_dict["text_objectType"] = self.objectTypePart 3130 | self.catf_text_dict["text_textPartCount"] = len(self.objectTextParts) 3131 | #[2:] because [0] is the id part and [1] is the type part 3132 | # 3133 | return self.catf_text_dict 3134 | # 3135 | @staticmethod 3136 | def textPartString(textPart): 3137 | """ 3138 | params: textPart, [] 3139 | return: textPart_str, '' 3140 | 3141 | Regroups the lines 3142 | belonging to the part in 3143 | string form for handling 3144 | Another Language Occurrences 3145 | """ 3146 | # 3147 | partLines = textPart[1:] 3148 | # Since [0] is the part title indicated with @ 3149 | # the rest should be text lines, comments, etc. 3150 | textPart_str = "\n".join(partLines) 3151 | # 3152 | return textPart_str 3153 | # 3154 | def get_ALs(self, textPart_str): 3155 | """ 3156 | Passes the textPart_str to AL 3157 | handler for getting Another Language 3158 | occurrences 3159 | """ 3160 | # 3161 | alClass = cAtfALHandler(textPart_str) 3162 | alClass.set_textLang(self.textLang) 3163 | alClass.alLanguage = self.alLanguage 3164 | alOcS = alClass.get_ALOcS() 3165 | # 3166 | return alOcS 3167 | # 3168 | @staticmethod 3169 | def lineDicts(textPartLine): 3170 | """ 3171 | Converts the text part line 3172 | to a line dict 3173 | """ 3174 | # 3175 | lineClass = cAtfLineDictBuilder(textPartLine) 3176 | lineDict = lineClass.lineDictBuild() 3177 | # 3178 | return lineDict 3179 | # 3180 | def worDictBuilder(self,lineWord): 3181 | """ 3182 | Converts the words inside 3183 | a line dict to a 3184 | wordDict by using cAtfWordDictBuilder 3185 | """ 3186 | # 3187 | wordClass = cAtfWordDictBuilder(lineWord) 3188 | wordClass.set_textLang(self.textLang) 3189 | wordClass.set_wordLang(self.wordLang) 3190 | wordClass.set_detLang(self.detLang) 3191 | # 3192 | word_dict = wordClass.wordDictBuild() 3193 | # 3194 | return word_dict 3195 | # 3196 | @staticmethod 3197 | def signDictBuilder(WordSign): 3198 | """ 3199 | Converts the signs inside 3200 | a word dict to 3201 | a signDict by using 3202 | cAtfSignDictBuilder 3203 | """ 3204 | # 3205 | signClass = cAtfSignDictBuilder(WordSign) 3206 | sign_dict = signClass.buildSignDict() 3207 | sign_relations_dict_list = signClass.signRelation_dict_list 3208 | # 3209 | return (sign_dict,sign_relations_dict_list) 3210 | # 3211 | def get_SignDicts(self, wordDict): 3212 | """ 3213 | Builds sign dicts for the signs 3214 | in a word dict. 
3215 | """
3216 | signs = wordDict["word_Signs"]
3217 | signSignRel_tuple_list = [self.signDictBuilder(sign) for sign in signs]
3218 | sign_dict_list = [signSignRelTuple[0] for signSignRelTuple in signSignRel_tuple_list]
3219 | #
3220 | sign_relations_dict_list = [signSignRelTuple[1] for signSignRelTuple in signSignRel_tuple_list]
3221 | # sign relations dict list includes group - group relations
3222 | wordDict["word_Signs"] = sign_dict_list
3223 | wordDict["word_signRelations"] = sign_relations_dict_list
3224 | #
3225 | return wordDict
3226 | #
3227 | def get_WordDicts(self, lineDict):
3228 | """
3229 | Builds word dicts for words
3230 | in a line dict
3231 | """
3232 | #
3233 | words = lineDict["lineWords"]
3234 | wordDict_list = [self.worDictBuilder(word) for word in words]
3235 | lineDict["lineWords"] = wordDict_list
3236 | #
3237 | return lineDict
3238 | #
3239 | def set_partDict(self, textPart):
3240 | """
3241 | Creates the part dictionary
3242 | from the textPart, which is an
3243 | element of the object part list
3244 | """
3245 | #
3246 | part_dict = {}
3247 | #
3248 | part_dict["part_partTitle"] = textPart[0].strip()
3249 | part_string = self.textPartString(textPart)
3250 | part_dict["part_partString"] = part_string
3251 | partlines = textPart[1:]
3252 | alOccurrences = self.get_ALs(part_string)
3253 | # pass text language to al occurrences TODO
3254 | part_dict["part_AL_occurrences"] = alOccurrences
3255 | # and the Adventure of Iteration starts ...
3256 | partLine_dict_list = []
3257 | for line in partlines:
3258 | line_dict = self.lineDicts(line)
3259 | lineWord_dict = self.get_WordDicts(line_dict)
3260 | lineWordDict_list = lineWord_dict["lineWords"]
3261 | linewordsign_dict_list = []
3262 | for lineWordDict in lineWordDict_list:
3263 | wordSigndict = self.get_SignDicts(lineWordDict)
3264 | linewordsign_dict_list.append(wordSigndict)
3265 | lineWord_dict["lineWords"] = linewordsign_dict_list
3266 | partLine_dict_list.append(lineWord_dict)
3267 | #
3268 | #
3269 | #
3270 | part_dict["part_partLines"] = partLine_dict_list
3271 | #
3272 | return part_dict
3273 | #
3274 | def buildTextDict_FP(self):
3275 | """
3276 | Wraps the methods above for
3277 | building the text dictionary
3278 | in one pass.
3279 | The final dictionary doesn't
3280 | include the relative positioning of
3281 | signs, words, and lines
3282 | """
3283 | #
3284 | self.get_atf_section()
3285 | self.get_object_parts()
3286 | # Text is split into parts
3287 | self.splitLinesOParts()
3288 | # Each object part is split into lines
3289 | self.get_ObjetIdPart()
3290 | # The part in which one observes the object id
3291 | # is separated
3292 | self.get_text_id()
3293 | # From the object id part
3294 | # the text id is taken
3295 | self.get_objectTypePart()
3296 | # From the object part list
3297 | # the object type part is taken
3298 | self.get_textParts()
3299 | # From the object parts that
3300 | # have been divided into lines
3301 | # the text parts are taken
3302 | self.set_text_PartInfo()
3303 | # The type information
3304 | # and the part count are added to
3305 | # the text dictionary
3306 | self.textPart_dict_list = [self.set_partDict(textpart) for textpart in self.objectTextParts]
3307 | # A part dict is created for each text part.
3308 | self.catf_text_dict["text_textParts"] = self.textPart_dict_list 3309 | # 3310 | return self.catf_text_dict 3311 | # 3312 | @staticmethod 3313 | def set_lineStack(lineDict): 3314 | """ 3315 | params: lineDict, {} 3316 | return: lineStack, [] 3317 | Creates a lineStack list 3318 | for signs from the word dict 3319 | """ 3320 | # 3321 | lineWordPos_list = lineDict["lineWordPos"] 3322 | lineWord_list = lineDict["lineWords"] 3323 | lineStack_list = [] 3324 | # 3325 | lineStack = [] 3326 | for wordP, word in lineWordPos_list: 3327 | for wordDict in lineWord_list: 3328 | signPos_list = wordDict["word_wordSignsPos"] 3329 | word_str = wordDict["word_word"] 3330 | if word == word_str: 3331 | for signPos, sign in signPos_list: 3332 | signCount = [wordP, signPos, sign] 3333 | lineStack.append(signCount) 3334 | # 3335 | return lineStack 3336 | # 3337 | @staticmethod 3338 | def lineStackDicter(lineStack): 3339 | """ 3340 | Builds a dictionary from lineStack 3341 | to be mapped to a key in lineDict 3342 | """ 3343 | # 3344 | lineStackEnum = list(enumerate(lineStack)) 3345 | lineStackDict_list = [] 3346 | # 3347 | for linEnum in lineStackEnum: 3348 | lineStackDict = {} 3349 | linePos = linEnum[0] 3350 | lineEls = linEnum[1] 3351 | lineStackDict["line_RelSignPosition"] = linePos 3352 | # Sign position with respect to line 3353 | lineStackDict["line_wordPosition"] = lineEls[0] 3354 | # Position of the word with respect to line 3355 | lineStackDict["line_signWordPosition"] = lineEls[1] 3356 | # Position of the sign with respect to *word* 3357 | lineStackDict["line_sign"] = lineEls[2] 3358 | # The sign, like an, ir, ANSZE, etc. 3359 | lineStackDict_list.append(lineStackDict) 3360 | # 3361 | return lineStackDict_list 3362 | # 3363 | def set_lineStackDict(self,lineDict): 3364 | """ 3365 | Sets the lineStackDict to lineDict 3366 | by using methods above 3367 | """ 3368 | # 3369 | lineStack = self.set_lineStack(lineDict) 3370 | lineStack_dict_list = self.lineStackDicter(lineStack) 3371 | lineDict["line_RelSignPositions"] = lineStack_dict_list 3372 | # 3373 | return lineDict 3374 | # 3375 | @staticmethod 3376 | def set_partSignStack(partDict): 3377 | """ 3378 | Gets the partSignStack from the partDict 3379 | """ 3380 | # 3381 | partSignStack_dictList = [] 3382 | partLines_list = partDict["part_partLines"] 3383 | # 3384 | for lineDict in partLines_list: 3385 | lineNo = lineDict["lineNumber"] 3386 | lineSignStack_dictList = lineDict["line_RelSignPositions"] 3387 | for lineSignStack in lineSignStack_dictList: 3388 | lineSignStack["line_lineNumber"] = lineNo 3389 | partSignStack_dictList.append(lineSignStack) 3390 | # 3391 | # 3392 | return partSignStack_dictList 3393 | # 3394 | @staticmethod 3395 | def partStackDicter(partSignStack_dictList): 3396 | """ 3397 | Creates the partSignStackDict_list 3398 | """ 3399 | # 3400 | partSignSEnum = list(enumerate(partSignStack_dictList)) 3401 | part_signStack_list = [] 3402 | # 3403 | for partEnum in partSignSEnum: 3404 | partPos = partEnum[0] 3405 | partSignStacks = partEnum[1] 3406 | # 3407 | partSignStacks["part_RelSignPosition"] = partPos 3408 | part_signStack_list.append(partSignStacks) 3409 | # 3410 | # 3411 | return part_signStack_list 3412 | # 3413 | def set_partSignStackDict(self, partDict): 3414 | """ 3415 | Sets the partSignStack to partDict 3416 | """ 3417 | # 3418 | stackList = self.set_partSignStack(partDict) 3419 | partSignDict_list = self.partStackDicter(stackList) 3420 | # 3421 | partDict["part_RelSignPositions"] = partSignDict_list 3422 | # 3423 | return 
partDict
3424 | #
3425 | @staticmethod
3426 | def set_textSignStack(textDict):
3427 | """
3428 | Sets the textSignStack from the textDict
3429 | """
3430 | #
3431 | textSignStack_dictList = []
3432 | #
3433 | textPart_list = textDict["text_textParts"]
3434 | textPartEnum = list(enumerate(textPart_list))
3435 | for textEn in textPartEnum:
3436 | partPos = textEn[0]
3437 | partDict = textEn[1]
3438 | partSignDict_list = partDict["part_RelSignPositions"]
3439 | #
3440 | for partSignDict in partSignDict_list:
3441 | partSignDict["text_partPosition"] = partPos
3442 | textSignStack_dictList.append(partSignDict)
3443 | #
3444 | #
3445 | return textSignStack_dictList
3446 | #
3447 | @staticmethod
3448 | def textStackDicter(textSignStack_dictList):
3449 | """
3450 | Creates the textSignStackDict from the list
3451 | """
3452 | #
3453 | textSignSEnum = list(enumerate(textSignStack_dictList))
3454 | text_signStack_list = []
3455 | #
3456 | for textEnum in textSignSEnum:
3457 | textPos = textEnum[0]
3458 | textSignStacks = textEnum[1]
3459 | textSignStacks["text_RelSignPosition"] = textPos
3460 | text_signStack_list.append(textSignStacks)
3461 | #
3462 | #
3463 | return text_signStack_list
3464 | #
3465 | def set_textStackDict(self, textDict):
3466 | """
3467 | Sets the textStackDict to the textDict
3468 | """
3469 | #
3470 | stackList = self.set_textSignStack(textDict)
3471 | textSignDict_list = self.textStackDicter(stackList)
3472 | #
3473 | textDict["text_RelSignPositions"] = textSignDict_list
3474 | #
3475 | return textDict
3476 | #
3477 | def relSignSetter(self):
3478 | """
3479 | Wraps up the methods above
3480 | """
3481 | #
3482 | textPart_list = self.catf_text_dict["text_textParts"]
3483 | text_part_dict_list = []
3484 | #
3485 | for textPart in textPart_list:
3486 | partLines = textPart["part_partLines"]
3487 | part_line_dicts = []
3488 | for lineDict in partLines:
3489 | lineModi = self.set_lineStackDict(lineDict)
3490 | part_line_dicts.append(lineModi)
3491 | #
3492 | textPart["part_partLines"] = part_line_dicts
3493 | partModi = self.set_partSignStackDict(textPart)
3494 | text_part_dict_list.append(partModi)
3495 | #
3496 | self.catf_text_dict["text_textParts"] = text_part_dict_list
3497 | #
3498 | text_dict_modi = self.set_textStackDict(self.catf_text_dict)
3499 | #
3500 | self.catf_text_dict = text_dict_modi
3501 | #
3502 | return self.catf_text_dict
3503 | #
3504 | def textSignCount(self):
3505 | """
3506 | Gets and sets the sign count for the text.
3507 | """
3508 | #
3509 | textSignList = self.catf_text_dict["text_RelSignPositions"]
3510 | lenSigns = textSignList[-1]["text_RelSignPosition"] + 1
3511 | # Total length is equal to the last element position plus 1
3512 | #
3513 | self.catf_text_dict["text_totalSignCount"] = lenSigns
3514 | #
3515 | return self.catf_text_dict
3516 | #
3517 | # TODO There is a problem in the parsing of determinatives.
3518 | # Things like line_sign': '{d}en' occur; watch out.
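# Illustrative note (added commentary, not part of the original source):
# after relSignSetter() has run, each entry of "text_RelSignPositions" is a
# flat dict that combines the line-, part-, and text-level counters built by
# the stack methods above. The key names below come from lineStackDicter,
# set_partSignStack, partStackDicter, set_textSignStack, and textStackDicter;
# the values are hypothetical and only sketch the resulting structure.
#
# {
#     "line_RelSignPosition": 4,   # position of the sign within its line
#     "line_wordPosition": 2,      # position of its word within the line
#     "line_signWordPosition": 1,  # position of the sign within its word
#     "line_sign": "an",           # the sign itself
#     "line_lineNumber": 3,        # line number, added by set_partSignStack
#     "part_RelSignPosition": 27,  # running counter within the part
#     "text_partPosition": 0,      # index of the part within the text
#     "text_RelSignPosition": 27,  # running counter over the whole text
# }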
3519 | #
3520 | @staticmethod
3521 | def set_partWordStack(partDict):
3522 | """
3523 | Sets the word stack from the part dict
3524 | """
3525 | #
3526 | partWordStack_list = []
3527 | #
3528 | partLines_list = partDict["part_partLines"]
3529 | #
3530 | for partLine_dict in partLines_list:
3531 | lineWordPos_list = partLine_dict["lineWordPos"]
3532 | lineNo = partLine_dict["lineNumber"]
3533 | for lineWordPos in lineWordPos_list:
3534 | word = lineWordPos[1]
3535 | wordP = lineWordPos[0]
3536 | wordCount = [lineNo, wordP, word]
3537 | partWordStack_list.append(wordCount)
3538 | #
3539 | #
3540 | return partWordStack_list
3541 | #
3542 | @staticmethod
3543 | def partWordStackDicter(partWordStack_list):
3544 | """
3545 | Creates the dicts from the partWordStack and
3546 | stores everything in a list.
3547 | """
3548 | #
3549 | partWEnum = list(enumerate(partWordStack_list))
3550 | partWordStack_dictList = []
3551 | #
3552 | for partWe in partWEnum:
3553 | partPos = partWe[0]
3554 | stackDict = {}
3555 | wordCount = partWe[1]
3556 | stackDict["part_RelWordPosition"] = partPos
3557 | stackDict["line_lineNumber"] = wordCount[0]
3558 | stackDict["line_lineWordPosition"] = wordCount[1]
3559 | stackDict["line_word"] = wordCount[2]
3560 | partWordStack_dictList.append(stackDict)
3561 | #
3562 | return partWordStack_dictList
3563 | #
3564 | def set_partWordStackDict(self, partDict):
3565 | """
3566 | Sets the partWordStack_dictList to partDict
3567 | """
3568 | #
3569 | stackList = self.set_partWordStack(partDict)
3570 | stackDict_list = self.partWordStackDicter(stackList)
3571 | partDict["part_RelWordPositions"] = stackDict_list
3572 | #
3573 | return partDict
3574 | #
3575 | @staticmethod
3576 | def set_textWordStack(textDict):
3577 | """
3578 | Sets the textWordStack_list from the textDict
3579 | """
3580 | #
3581 | textWordStack_list = []
3582 | textParts = textDict["text_textParts"]
3583 | #
3584 | textPEnum = list(enumerate(textParts))
3585 | #
3586 | for textP in textPEnum:
3587 | partPos = textP[0]
3588 | partDict = textP[1]
3589 | partRels = partDict["part_RelWordPositions"]
3590 | #
3591 | for partR in partRels:
3592 | partR["text_partPosition"] = partPos
3593 | textWordStack_list.append(partR)
3594 | #
3595 | #
3596 | return textWordStack_list
3597 | #
3598 | @staticmethod
3599 | def textWordStackDicter(textWordStack_list):
3600 | """
3601 | Creates the dicts from textWordStack_list
3602 | """
3603 | #
3604 | textWordStack_dictList = []
3605 | textWordEnum = list(enumerate(textWordStack_list))
3606 | #
3607 | for textWE in textWordEnum:
3608 | textPos = textWE[0]
3609 | stackDict = textWE[1]
3610 | stackDict["text_RelWordPosition"] = textPos
3611 | textWordStack_dictList.append(stackDict)
3612 | #
3613 | return textWordStack_dictList
3614 | #
3615 | def set_textWordStackDict(self, textDict):
3616 | """
3617 | Sets the textWordStack_dictList to the textDict
3618 | """
3619 | #
3620 | stackList = self.set_textWordStack(textDict)
3621 | stack_dictList = self.textWordStackDicter(stackList)
3622 | textDict["text_RelWordPositions"] = stack_dictList
3623 | #
3624 | return textDict
3625 | #
3626 | def relWordSetter(self):
3627 | """
3628 | Sets the relative word positions
3629 | to catf_text_dict.
3630 | Wraps up the methods above.
3631 | """
3632 | #
3633 | textPart_list = self.catf_text_dict["text_textParts"]
3634 | text_part_dict_list = []
3635 | #
3636 | for textPart in textPart_list:
3637 | textPartModi = self.set_partWordStackDict(textPart)
3638 | text_part_dict_list.append(textPartModi)
3639 | #
3640 | self.catf_text_dict["text_textParts"] = text_part_dict_list
3641 | textModi = self.set_textWordStackDict(self.catf_text_dict)
3642 | self.catf_text_dict = textModi
3643 | #
3644 | return self.catf_text_dict
3645 | #
3646 | @staticmethod
3647 | def set_textLineStack(textDict):
3648 | """
3649 | Creates the stack list for the lines in the textDict
3650 | """
3651 | #
3652 | textLineStack_list = []
3653 | #
3654 | textparts = textDict["text_textParts"]
3655 | textpartEnum = list(enumerate(textparts))
3656 | #
3657 | for textpartE in textpartEnum:
3658 | partPos = textpartE[0]
3659 | textpart = textpartE[1]
3660 | lines = textpart["part_partLines"]
3661 | for line in lines:
3662 | lineNo = line["lineNumber"]
3663 | lineCount = [partPos, lineNo]
3664 | textLineStack_list.append(lineCount)
3665 | #
3666 | #
3667 | return textLineStack_list
3668 | #
3669 | @staticmethod
3670 | def textLineStackDicter(textLineStack_list):
3671 | """
3672 | Builds the dict list from textLineStack_list
3673 | """
3674 | #
3675 | textLineStack_dictList = []
3676 | #
3677 | textLinEn = list(enumerate(textLineStack_list))
3678 | #
3679 | for textLin in textLinEn:
3680 | textPos = textLin[0]
3681 | stackList = textLin[1]
3682 | stackDict = {}
3683 | stackDict["text_RelLinePosition"] = textPos
3684 | stackDict["text_partPosition"] = stackList[0]
3685 | stackDict["line_lineNumber"] = stackList[1]
3686 | textLineStack_dictList.append(stackDict)
3687 | #
3688 | #
3689 | return textLineStack_dictList
3690 | #
3691 | def set_textLineStackDict(self, textDict):
3692 | """
3693 | Sets the textLineStack_dictList to the textDict
3694 | """
3695 | #
3696 | stackList = self.set_textLineStack(textDict)
3697 | stackDict_list = self.textLineStackDicter(stackList)
3698 | textDict["text_RelLinePositions"] = stackDict_list
3699 | #
3700 | return textDict
3701 | #
3702 | def relLineSetter(self):
3703 | """
3704 | Sets the relative line positions
3705 | to catf_text_dict.
3706 | Wraps up the methods above
3707 | """
3708 | textModi = self.set_textLineStackDict(self.catf_text_dict)
3709 | self.catf_text_dict = textModi
3710 | #
3711 | return self.catf_text_dict
3712 | #
3713 | def buildTextDict_SP(self):
3714 | """
3715 | Builds the text dictionary
3716 | in two passes.
3717 | Includes the relative positioning
3718 | of signs, words, and lines
3719 | """
3720 | self.buildTextDict_FP()
3721 | self.relSignSetter()
3722 | self.relWordSetter()
3723 | self.relLineSetter()
3724 | lineCount = self.catf_text_dict["text_RelLinePositions"]
3725 | self.catf_text_dict["text_totalLineCount"] = lineCount[-1]["text_RelLinePosition"] + 1
3726 | # Total length is equal to the last item position + 1 to compensate for zero indexing
3727 | wordCount = self.catf_text_dict["text_RelWordPositions"]
3728 | self.catf_text_dict["text_totalWordOccurrenceCount"] = wordCount[-1]["text_RelWordPosition"] + 1
3729 | # Counts each occurrence of a word. Doesn't filter anything
3730 | signCount = self.catf_text_dict["text_RelSignPositions"]
3731 | self.catf_text_dict["text_totalSignOccurrenceCount"] = signCount[-1]["text_RelSignPosition"] + 1
3732 | # Counts each occurrence of a sign. Doesn't filter anything.
3733 | # 3734 | return self.catf_text_dict 3735 | # 3736 | 3737 | --------------------------------------------------------------------------------
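A minimal usage sketch (not part of the repository): assuming cAtfTextBuilder can be imported from catf_feature_extractor.extractor.cAtfFeatExtractor and that my_text.atf is a CDLI text export saved with Unix ("\n") newlines and containing the usual "&P..." id line, the two-pass build could be driven as follows. The import path and file name are placeholders, not documented API.

# Usage sketch only; the import path and file name below are assumptions.
from catf_feature_extractor.extractor.cAtfFeatExtractor import cAtfTextBuilder

# The input is expected to contain the "&P..." id line and Unix newlines.
with open("my_text.atf", encoding="utf-8", newline="\n") as atf_file:
    raw_text = atf_file.read()

builder = cAtfTextBuilder(raw_text)
text_dict = builder.buildTextDict_SP()  # pass 1 builds the dict, pass 2 adds relative positions

print(text_dict["text_id"])
print(text_dict["text_totalLineCount"])
print(text_dict["text_totalSignOccurrenceCount"])

The keys printed above are the ones set in get_text_id and buildTextDict_SP; the full nested structure (text parts, lines, words, signs) sits under "text_textParts" as described in the README.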