├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── setup.py ├── src └── sinopy │ ├── __init__.py │ ├── data.py │ ├── data │ ├── README.md │ ├── Unihan_Readings.txt │ ├── chinese.json │ ├── chinese.tsv │ ├── ids-analysis.txt │ ├── ids.txt │ ├── profiles │ │ └── mch.prf │ ├── tls.json │ ├── wang1980.tsv │ ├── yinku.qlc │ └── yiti.tsv │ ├── hanzi.py │ ├── seaquence.py │ ├── util.py │ └── yinyun.py └── tests ├── test_data.py ├── test_hanzi.py ├── test_seaquence.py └── test_yinyun.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *.bin 5 | _tests 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md LICENSE 2 | graft src 3 | global-exclude *.py[co] 4 | 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SinoPy: Python Library for quantitative tasks in Chinese historical linguistics 2 | 3 | [![DOI](https://zenodo.org/badge/30593438.svg)](https://zenodo.org/badge/latestdoi/30593438) 4 | ![PyPI](https://img.shields.io/pypi/v/sinopy.svg) 5 | 6 | SinoPy is an attempt to provide useful functionality for users working with Chinese dialects and Sino-Tibetan language data and struggling with tasks like converting characters to Pinyin, analysing characters, or analysing readings in Chinese dialects and other SEA languages. 7 | 8 | If you use the library in your research, please quote it as: 9 | 10 | > List, Johann-Mattis (2018): SinoPy: Python Library for quantitative tasks in Chinese historical linguistics. Version 0.3.0. Jena: Max Planck Institute for the Science of Human History. DOI: https://zenodo.org/badge/latestdoi/30593438 11 | 12 | This is intended as a plugin for LingPy, or an addon. The library gives utility functions that prove useful to handle Chinese data in a very broad context, ranging from Chinese character readings up to proposed readings in Middle Chinese and older stages of the language. 13 | 14 | ## Quick Usage Examples 15 | 16 | Convert Baxter's (1992) Middle Chinese transcription system to plain IPA (with tone marks). 17 | 18 | ```python 19 | >>> from sinopy import baxter2ipa 20 | >>> baxter2ipa('bjang') 21 | 'bjaŋ¹' 22 | >>> baxter2ipa('bjang', segmented=True) 23 | ['b', 'j', 'a', 'ŋ', '¹'] 24 | ``` 25 | 26 | Convert Chinese characters to Pīnyīn 27 | 28 | ```python 29 | >>> from sinopy import pinyin 30 | >>> pinyin('我', variant='cantonese') 31 | 'ngo5' 32 | >>> pinyin('我', variant='mandarin') 33 | 'wǒ' 34 | ``` 35 | 36 | Try to find character by combining two characters: 37 | 38 | ```python 39 | >>> from sinopy import character_from_structure 40 | >>> character_from_structure('+人我') 41 | '俄' 42 | ``` 43 | 44 | ## More examples 45 | 46 | At the moment, you may have difficulties finding a common idea behind SinoPy, 47 | as the collection of scripts is very diverse. The general topic, however, are 48 | basic operations one frequently encounters when working with Chinese and SEA 49 | linguistic data. 50 | 51 | But let's just look at a couple of examples: 52 | 53 | ```python 54 | >>> from sinopy import * 55 | >>> char = "我" 56 | >>> pinyin(char, variant="mandarin") 57 | wǒ 58 | ``` 59 | 60 | So obviously, we can convert characters to Pīnyīn. 61 | 62 | ```python 63 | >>> is_chinese(char) 64 | True 65 | >>> is_chinese('b') 66 | False 67 | ``` 68 | 69 | So the library also checks if a character belongs to Chinese Unicode range. 70 | 71 | But we have also a range of functions for handling Middle Chinese and related problems. For example the following: 72 | 73 | ```python 74 | >>> parse_baxter('ngaH') 75 | ('ng', '', 'a', 'H') 76 | ``` 77 | 78 | So this function will read in a Middle Chinese string (as encoded in the system of Baxter 1992) and return its main constituents (initial, medial, final, and tone). 79 | 80 | But we can also directly convert a character to its Middle Chinese reading: 81 | 82 | ```python 83 | >>> chars2baxter(char) 84 | ['ngaX'] 85 | ``` 86 | 87 | Or we can retrieve a basic gloss. 88 | 89 | ```python 90 | >>> chars2gloss(char) 91 | ['our, us, i, me, my, we'] 92 | ``` 93 | 94 | A rather complex function is the `sixtuple2baxter` function, which reads in the classical six-character descriptions of the Middle Chinese reading of a given character and yields the Middle Chinese value following Baxter's system. You find a lot of sixtuple readings in the DOC database (published with the [Tower of Babel project](http://starling.rinet.ru/cgi-bin/response.cgi?root=config&morpho=0&basename=\data\china\doc&first=1)). 95 | 96 | ```python 97 | >>> sixtuple2baxter('蟹開一上海泥') 98 | ['n', '', 'oj', 'X'] 99 | >>> chars2baxter('乃') 100 | ['nojX'] 101 | ``` 102 | 103 | You can also directly try to retrieve the MC reading from passing two fǎnqiè characters, for example: 104 | 105 | ```python 106 | >>> fanqie2mch('海泥') 107 | 'xej' 108 | >>> fanqie2mch('泥海') 109 | 'nojX' 110 | ``` 111 | 112 | And if you don't like Baxter's MCH transcriptions, you can simply turn it to IPA: 113 | 114 | ```python 115 | >>> baxter2ipa('nojX') 116 | noj² 117 | >>> baxter2ipa('tsyang') 118 | 'ʨaŋ¹' 119 | ``` 120 | 121 | As a final important function, consider the parser for morphemes: 122 | 123 | ```python 124 | >>> parse_chinese_morphemes('ʨaŋ¹') 125 | ['ʨ', '-', 'a', 'ŋ', '¹'] 126 | ``` 127 | 128 | The quintuple that he method returns splits the sequence into its five main constituents, initial, medial, nucleus, coda, and tone. 129 | 130 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #import distribute_setup 2 | #distribute_setup.use_setuptools() 3 | 4 | from setuptools import setup, find_packages,Extension 5 | import codecs 6 | # setup package name etc as a default 7 | pkgname = 'sinopy' 8 | 9 | 10 | setup( 11 | name=pkgname, 12 | description="A Python library for quantitative tasks in Chinese historical linguistics.", 13 | version='0.3.4', 14 | packages=find_packages(where='src'), 15 | package_dir={'': 'src'}, 16 | zip_safe=False, 17 | license="GPL", 18 | include_package_data=True, 19 | install_requires=['lingpy', 'segments'], 20 | url='https://github.com/lingpy/sinopy', 21 | long_description=codecs.open('README.md', 'r', 'utf-8').read(), 22 | long_description_content_type='text/markdown', 23 | entry_points={ 24 | 'console_scripts': ['sinopy=sinopy.cli:main'], 25 | }, 26 | author='Johann-Mattis List', 27 | author_email='list@shh.mpg.de', 28 | keywords='Chinese linguistics, historical linguistics, computer-assisted language comparison' 29 | ) 30 | -------------------------------------------------------------------------------- /src/sinopy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingpy/sinopy/de74b43886ba24ad6169427c5f455f75c2a13637/src/sinopy/__init__.py -------------------------------------------------------------------------------- /src/sinopy/data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sinopy 3 | import lingpy 4 | import os 5 | import pickle 6 | import codecs 7 | from collections import defaultdict 8 | from segments import tokenizer 9 | 10 | _path = lambda *x: os.path.join( 11 | os.path.dirname(sinopy.__file__), 12 | 'data', *x) 13 | 14 | def load(path, alternative): 15 | try: 16 | with codecs.open(_path('_'+path+'.bin'), 'rb') as f: 17 | return pickle.load(f) 18 | except: 19 | with codecs.open(_path('_'+path+'.bin'), 'wb') as f: 20 | out = alternative() 21 | pickle.dump(out, f) 22 | return out 23 | 24 | def get_unihan(): 25 | _unihan = lingpy.csv2list(_path('Unihan_Readings.txt')) 26 | unihan = defaultdict(dict) 27 | for line in _unihan: 28 | key = eval('"""'+line[0]+'"""') 29 | if line[1] == 'kHanyuPinyin': 30 | unihan[key]['pinyin'] = line[2].split(':')[1] 31 | else: 32 | unihan[key][line[1][1:].lower()] = line[2] 33 | return unihan 34 | 35 | 36 | def get_ids(): 37 | _ids = lingpy.csv2list(_path('ids.txt')) 38 | ids = {} 39 | for line in _ids: 40 | char = line[1] 41 | motivations = line[2:] 42 | for motivation in motivations: 43 | if '[' in motivation: 44 | motivation = motivation[:motivation.index('[')] 45 | ids[motivation] = char 46 | return ids 47 | 48 | 49 | def get_tls(): 50 | return json.load(codecs.open(_path('tls.json'), 'r', 'utf-8')) 51 | 52 | # a dictionary representing middle chinese rime categories 53 | def get_gy(): 54 | return { 55 | "yun" : { 56 | '東': "uwng", 57 | '冬': "owng", 58 | '鍾': "jowng", 59 | '江': "aewng", 60 | '支': "j!e", # chongniu 61 | '脂': "!ij", 62 | '之': "i", 63 | '微': "j+j", # was miscoded before 64 | '魚': "jo", 65 | '虞': "ju", 66 | '模': "u", 67 | '齊': "ej", 68 | '佳': "ea", # used to be ea+ 69 | '皆': "eaj", 70 | '灰': "*oj", # hekou 71 | '咍': "oj", 72 | '真': "j!in", 73 | '諄': "win", 74 | '臻': "in", 75 | '文': "jun", 76 | '欣': "j+n", 77 | '元': "j?on", # unpure hekou (???) 78 | '魂': "*on", #hekou 79 | '痕': "on", 80 | '寒': "an", 81 | '桓': "wan", 82 | '刪': "aen", 83 | '山': "ean", 84 | '先': "en", 85 | '仙': "j!en", # maybe this should be simply stated as en???? 86 | '蕭': "ew", 87 | '宵': "j!ew", 88 | '肴': "aew", 89 | '豪': "aw", 90 | '歌': "a", 91 | '戈': "wa", 92 | '麻': "ae", 93 | '陽': "jang", 94 | '唐': "ang", 95 | '庚': "aeng", 96 | '耕': "eang", 97 | '清': "j!eng", #potential chongniu rime 98 | '青': "eng", 99 | '蒸': "ing", 100 | '登': "*ong", # unpure hekou? 101 | '尤': "juw", 102 | '侯': "uw", 103 | '幽': "!iw", 104 | '侵': "im", 105 | '談': "am", 106 | '覃': "om", 107 | '鹽': "jem", 108 | '添': "em", 109 | '咸': "eam", 110 | '銜': "aem", 111 | '嚴': "jaem", 112 | '凡': "jom", 113 | '董': "uwng", 114 | '腫': "jowng", 115 | '講': "aewng", 116 | '紙': "j!e", # chongniu 117 | '旨': "!ij", 118 | '止': "i", 119 | '尾': "j+j", # was miscoded before 120 | '語': "jo", 121 | '麌': "ju", 122 | '姥': "u", 123 | '薺': "ej", 124 | '蟹': "ea", # used to be ea+ 125 | '駭': "eaj", 126 | '賄': "*oj", # guess this is a hekou 127 | '海': "oj", 128 | '軫': "j!in", 129 | '准': "win", 130 | '吻': "jun", 131 | '隱': "j+n", 132 | '阮': "j?on", # j*on not a pure hekou 133 | '混': "*on", # hekou 134 | '很': "on", 135 | '旱': "an", 136 | '緩': "wan", # hekou 137 | '潸': "aen", 138 | '產': "ean", 139 | '銑': "en", 140 | '獮': "j!en", 141 | '筱': "ew", 142 | '小': "j!ew", 143 | '巧': "aew", 144 | '晧': "aw", 145 | '哿': "a", 146 | '果': "wa", 147 | '馬': "ae", 148 | '養': "jang", 149 | '蕩': "ang", 150 | '梗': "aeng", 151 | '耿': "eang", 152 | '靜': "j!eng", 153 | '迥': "eng", 154 | '拯': "ing", 155 | '等': "*ong", # unpore hekou? 156 | '有': "juw", 157 | '厚': "uw", 158 | '黝': "!iw", 159 | '寢': "im", 160 | '感': "am", 161 | '敢': "om", 162 | '琰': "jem", 163 | '忝': "em", 164 | '豏': "eam", 165 | '檻': "aem", 166 | '儼': "jaem", 167 | '梵': "jom", 168 | '送': "uwng", 169 | '宋': "owng", 170 | '用': "jowng", 171 | '絳': "aewng", 172 | '寘': "j!e", #chongniu 173 | '至': "!ij", # chongniu-rime 174 | '志': "i", 175 | '未': "j+j", # this was apparently miscoded as jij before!!! 176 | '御': "jo", 177 | '遇': "ju", 178 | '暮': "u", 179 | '霽': "ej", 180 | '祭': "j!ej", # chongniu 181 | '泰': "aj", 182 | '卦': "ea", # used to be ea+ 183 | '怪': "eaj", 184 | '夬': "aej", 185 | '隊': "*oj", # this seems to be an original hekou rime in Baxter's notion!!! 186 | '代': "oj", 187 | '廢': "joj", 188 | '震': "j!in", 189 | '稕': "win", 190 | '問': "jun", 191 | '焮': "j+n", 192 | '愿': "j?on", # j*ot not a pure hekou??? 193 | '慁': "*on", #hekou 194 | '恨': "on", 195 | '翰': "an", 196 | '換': "wan", 197 | '諫': "aen", 198 | '襉': "ean", 199 | '霰': "en", 200 | '線': "j!en", 201 | '嘯': "ew", 202 | '笑': "j!ew", 203 | '效': "aew", 204 | '號': "aw", 205 | '個': "a", 206 | '過': "wa", 207 | '禡': "ae", 208 | '漾': "jang", 209 | '宕': "ang", 210 | '映': "aeng", 211 | '諍': "eang", 212 | '勁': "j!eng", 213 | '徑': "eng", 214 | '證': "ing", 215 | '嶝': "*ong", # not a pure hekou ??? 216 | '宥': "juw", 217 | '候': "uw", 218 | '幼': "!iw", 219 | '沁': "im", 220 | '勘': "am", 221 | '鬫': "om", 222 | '艷': "jem", 223 | '陷': "eam", 224 | '鑒': "aem", 225 | '釅': "jaem", 226 | '范': "jom", 227 | '屋': "uwk", 228 | '沃': "owk", 229 | '燭': "jowk", 230 | '覺': "aewk", 231 | '質': "!it", # chongniu 232 | '術': "w!it", 233 | '櫛': "!it", 234 | '物': "jut", 235 | '迄': "j+t", 236 | '月': "j?ot", # j*ot not a pure hekou!!! 237 | '沒': "*ot", # original hekou 238 | '曷': "at", 239 | '末': "wat", 240 | '黠': "eat", 241 | '轄': "aet", 242 | '屑': "et", 243 | '薛': "j!et", #chongniu 244 | '藥': "jak", 245 | '鐸': "ak", 246 | '陌': "aek", 247 | '麥': "eak", 248 | '昔': "jek", # jek should be maintained!!! 249 | '錫': "ek", 250 | '職': "ik", 251 | '德': "*ok", 252 | '緝': "!ip", # chongniu 253 | '合': "ap", 254 | '盍': "op", 255 | '葉': "jep", 256 | '帖': "ep", 257 | '洽': "eap", 258 | '狎': "aep", 259 | '業': "jaep", 260 | '乏': "jop", 261 | '篠': "ew", 262 | '準': "win", 263 | '噳': "ju", 264 | '箇': "a", 265 | '獼': 'j!en', # probably chongniu? jen 266 | '願': 'on', # no w 267 | '合': 'op', # no w 268 | '皓': 'aw', 269 | '煔': 'em', 270 | '節': '!it', # 271 | '豔': 'j!em', # probably chongniu 272 | }, 273 | 274 | "diao" : { "入": 'R', 275 | '平': 'P', 276 | '上': 'X', 277 | '去': 'H' 278 | }, 279 | 280 | "sheng" : { '幫': "p", 281 | '滂': "ph", 282 | '並': "b", 283 | '明': "m", 284 | '端': "t", 285 | '知': "tr", 286 | '透': "th", 287 | '徹': "trh", 288 | '定': "d", 289 | '澄': "dr", 290 | '泥': "n", 291 | '娘': "nr", 292 | '見': "k", 293 | '溪': "kh", 294 | '群': "g", 295 | '疑': "ng", 296 | '精': "ts", 297 | '莊': "tsr", 298 | '章': "tsy", 299 | '清': "tsh", 300 | '初': "tsrh", 301 | '昌': "tsyh", 302 | '從': "dz", 303 | '崇': "dzr", 304 | '禪': "dzy", 305 | '心': "s", 306 | '生': "sr", 307 | '書': "sy", 308 | '邪': "z", 309 | '俟': "zr", 310 | '船': "zy", 311 | '影': "'", 312 | '曉': "x", 313 | '匣': "h", 314 | '云': "hj", 315 | '雲': "hj", 316 | '以': "y", 317 | '來': "l", 318 | '日': "ny", 319 | '非': "pj", 320 | '敷': "phj", 321 | '奉': "bj", 322 | '微': "mj", 323 | '襌': "dzy" 324 | }, 325 | "hu" : { 326 | '開':'', 327 | '合':'w' 328 | }, 329 | "deng" : { '一':'1', 330 | '二':'2', 331 | '三':'3', 332 | '四':'4' 333 | }, 334 | "ipa" : [ 335 | ("tsrh","tsʰ"), 336 | ("tsyh","tɕʰ"), 337 | ("tsy","tɕ"), 338 | ("dzy","dʑ"), 339 | ("trh","ʈʰ"), 340 | ("tsh","tsʰ"), 341 | ("tsr","ʈʂ"), 342 | ("dzr","ɖʐ"), 343 | ("jw","y"), 344 | ("ae","æ"), 345 | ("ea","ɛ"), 346 | ("ph","pʰ"), 347 | ("th","tʰ"), 348 | ("tr","ʈ"), 349 | ("ts","ts"), 350 | ("dr","ɖ"), 351 | ("nr","ɳ"), 352 | ("dz","dz"), 353 | ("sr","ʂ"), 354 | ("zr","ʐ"), 355 | ("kh","kʰ"), 356 | ("ny","ȵ"), 357 | ("sy","ɕ"), 358 | ("zy","ʑ"), 359 | ("ng","ŋ"), 360 | ("d","d"), 361 | ("n","n"), 362 | ("s","s"), 363 | ("z","z"), 364 | ("y","j"), 365 | ("k","k"), 366 | ("g","g"), 367 | ("x","x"), 368 | ("h","ɣ"), 369 | ("+","ɨ"), 370 | ("'","ʔ"), 371 | ("p","p"), 372 | ("b","b"), 373 | ("m","m"), 374 | ("t","t"), 375 | ("X","²"), 376 | ("H","³"), 377 | ("P","¹"), 378 | ("R","⁴") 379 | ] 380 | } 381 | 382 | def get_gbk(): 383 | return "\u9515\u7691\u853c\u788d\u7231\u55f3\u5ad2\u7477\u66a7\u972d\u8c19\u94f5\u9e4c\u80ae\u8884\u5965\u5aaa\u9a9c\u9ccc\u575d\u7f62\u94af\u6446\u8d25\u5457\u9881\u529e\u7eca\u94a3\u5e2e\u7ed1\u9551\u8c24\u5265\u9971\u5b9d\u62a5\u9c8d\u9e28\u9f85\u8f88\u8d1d\u94a1\u72c8\u5907\u60eb\u9e4e\u8d32\u951b\u7ef7\u7b14\u6bd5\u6bd9\u5e01\u95ed\u835c\u54d4\u6ed7\u94cb\u7b5a\u8df8\u8fb9\u7f16\u8d2c\u53d8\u8fa9\u8fab\u82c4\u7f0f\u7b3e\u6807\u9aa0\u98d1\u98d9\u9556\u9573\u9cd4\u9cd6\u522b\u762a\u6fd2\u6ee8\u5bbe\u6448\u50a7\u7f24\u69df\u6ba1\u8191\u9554\u9acc\u9b13\u997c\u7980\u62e8\u94b5\u94c2\u9a73\u997d\u94b9\u9e41\u8865\u94b8\u8d22\u53c2\u8695\u6b8b\u60ed\u60e8\u707f\u9a96\u9eea\u82cd\u8231\u4ed3\u6ca7\u5395\u4fa7\u518c\u6d4b\u607b\u5c42\u8be7\u9538\u4faa\u9497\u6400\u63ba\u8749\u998b\u8c17\u7f20\u94f2\u4ea7\u9610\u98a4\u5181\u8c04\u8c36\u8487\u5fcf\u5a75\u9aa3\u89c7\u7985\u9561\u573a\u5c1d\u957f\u507f\u80a0\u5382\u7545\u4f25\u82cc\u6005\u960a\u9cb3\u949e\u8f66\u5f7b\u7817\u5c18\u9648\u886c\u4f27\u8c0c\u6987\u789c\u9f80\u6491\u79f0\u60e9\u8bda\u9a8b\u67a8\u67fd\u94d6\u94db\u75f4\u8fdf\u9a70\u803b\u9f7f\u70bd\u996c\u9e31\u51b2\u51b2\u866b\u5ba0\u94f3\u7574\u8e0c\u7b79\u7ef8\u4fe6\u5e31\u96e0\u6a71\u53a8\u9504\u96cf\u7840\u50a8\u89e6\u5904\u520d\u7ecc\u8e70\u4f20\u948f\u75ae\u95ef\u521b\u6006\u9524\u7f0d\u7eaf\u9e51\u7ef0\u8f8d\u9f8a\u8f9e\u8bcd\u8d50\u9e5a\u806a\u8471\u56f1\u4ece\u4e1b\u82c1\u9aa2\u679e\u51d1\u8f8f\u8e7f\u7a9c\u64ba\u9519\u9509\u9e7e\u8fbe\u54d2\u9791\u5e26\u8d37\u9a80\u7ed0\u62c5\u5355\u90f8\u63b8\u80c6\u60ee\u8bde\u5f39\u6b9a\u8d55\u7605\u7baa\u5f53\u6321\u515a\u8361\u6863\u8c20\u7800\u88c6\u6363\u5c9b\u7977\u5bfc\u76d7\u7118\u706f\u9093\u956b\u654c\u6da4\u9012\u7f14\u7c74\u8bcb\u8c1b\u7ee8\u89cc\u955d\u98a0\u70b9\u57ab\u7535\u5dc5\u94bf\u766b\u9493\u8c03\u94eb\u9cb7\u8c0d\u53e0\u9cbd\u9489\u9876\u952d\u8ba2\u94e4\u4e22\u94e5\u4e1c\u52a8\u680b\u51bb\u5cbd\u9e2b\u7aa6\u728a\u72ec\u8bfb\u8d4c\u9540\u6e0e\u691f\u724d\u7b03\u9ee9\u953b\u65ad\u7f0e\u7c16\u5151\u961f\u5bf9\u603c\u9566\u5428\u987f\u949d\u7096\u8db8\u593a\u5815\u94ce\u9e45\u989d\u8bb9\u6076\u997f\u8c14\u57a9\u960f\u8f6d\u9507\u9537\u9e57\u989a\u989b\u9cc4\u8bf6\u513f\u5c14\u9975\u8d30\u8fe9\u94d2\u9e38\u9c95\u53d1\u7f5a\u9600\u73d0\u77fe\u9492\u70e6\u8d29\u996d\u8bbf\u7eba\u94ab\u9c82\u98de\u8bfd\u5e9f\u8d39\u7eef\u9544\u9cb1\u7eb7\u575f\u594b\u6124\u7caa\u507e\u4e30\u67ab\u950b\u98ce\u75af\u51af\u7f1d\u8bbd\u51e4\u6ca3\u80a4\u8f90\u629a\u8f85\u8d4b\u590d\u8d1f\u8ba3\u5987\u7f1a\u51eb\u9a78\u7ec2\u7ecb\u8d59\u9eb8\u9c8b\u9cc6\u9486\u8be5\u9499\u76d6\u8d45\u6746\u8d76\u79c6\u8d63\u5c34\u64c0\u7ec0\u5188\u521a\u94a2\u7eb2\u5c97\u6206\u9550\u777e\u8bf0\u7f1f\u9506\u6401\u9e3d\u9601\u94ec\u4e2a\u7ea5\u9549\u988d\u7ed9\u4e98\u8d53\u7ee0\u9ca0\u9f9a\u5bab\u5de9\u8d21\u94a9\u6c9f\u82df\u6784\u8d2d\u591f\u8bdf\u7f11\u89cf\u86ca\u987e\u8bc2\u6bc2\u94b4\u9522\u9e2a\u9e44\u9e58\u5250\u6302\u9e39\u63b4\u5173\u89c2\u9986\u60ef\u8d2f\u8bd6\u63bc\u9e73\u9ccf\u5e7f\u72b7\u89c4\u5f52\u9f9f\u95fa\u8f68\u8be1\u8d35\u523d\u5326\u523f\u59ab\u6867\u9c91\u9cdc\u8f8a\u6eda\u886e\u7ef2\u9ca7\u9505\u56fd\u8fc7\u57da\u5459\u5e3c\u6901\u8748\u94ea\u9a87\u97e9\u6c49\u961a\u7ed7\u9889\u53f7\u704f\u98a2\u9602\u9e64\u8d3a\u8bc3\u9616\u86ce\u6a2a\u8f70\u9e3f\u7ea2\u9ec9\u8ba7\u836d\u95f3\u9c8e\u58f6\u62a4\u6caa\u6237\u6d52\u9e55\u54d7\u534e\u753b\u5212\u8bdd\u9a85\u6866\u94e7\u6000\u574f\u6b22\u73af\u8fd8\u7f13\u6362\u5524\u75ea\u7115\u6da3\u5942\u7f33\u953e\u9ca9\u9ec4\u8c0e\u9cc7\u6325\u8f89\u6bc1\u8d3f\u79fd\u4f1a\u70e9\u6c47\u8bb3\u8bf2\u7ed8\u8bd9\u835f\u54d5\u6d4d\u7f0b\u73f2\u6656\u8364\u6d51\u8be8\u9984\u960d\u83b7\u8d27\u7978\u94ac\u956c\u51fb\u673a\u79ef\u9965\u8ff9\u8ba5\u9e21\u7ee9\u7f09\u6781\u8f91\u7ea7\u6324\u51e0\u84df\u5242\u6d4e\u8ba1\u8bb0\u9645\u7ee7\u7eaa\u8ba6\u8bd8\u8360\u53fd\u54dc\u9aa5\u7391\u89ca\u9f51\u77f6\u7f81\u867f\u8dfb\u9701\u9c9a\u9cab\u5939\u835a\u988a\u8d3e\u94be\u4ef7\u9a7e\u90cf\u6d43\u94d7\u9553\u86f2\u6b7c\u76d1\u575a\u7b3a\u95f4\u8270\u7f04\u8327\u68c0\u78b1\u7877\u62e3\u6361\u7b80\u4fed\u51cf\u8350\u69db\u9274\u8df5\u8d31\u89c1\u952e\u8230\u5251\u996f\u6e10\u6e85\u6da7\u8c0f\u7f23\u620b\u622c\u7751\u9e63\u7b15\u9ca3\u97af\u5c06\u6d46\u848b\u6868\u5956\u8bb2\u9171\u7edb\u7f30\u80f6\u6d47\u9a84\u5a07\u6405\u94f0\u77eb\u4fa5\u811a\u997a\u7f34\u7ede\u8f7f\u8f83\u6322\u5ce4\u9e6a\u9c9b\u9636\u8282\u6d01\u7ed3\u8beb\u5c4a\u7596\u988c\u9c92\u7d27\u9526\u4ec5\u8c28\u8fdb\u664b\u70ec\u5c3d\u52b2\u8346\u830e\u537a\u8369\u9991\u7f19\u8d46\u89d0\u9cb8\u60ca\u7ecf\u9888\u9759\u955c\u5f84\u75c9\u7ade\u51c0\u522d\u6cfe\u8ff3\u5f2a\u80eb\u9753\u7ea0\u53a9\u65e7\u9604\u9e20\u9e6b\u9a79\u4e3e\u636e\u952f\u60e7\u5267\u8bb5\u5c66\u6989\u98d3\u949c\u9514\u7aad\u9f83\u9e43\u7ee2\u9529\u954c\u96bd\u89c9\u51b3\u7edd\u8c32\u73cf\u94a7\u519b\u9a8f\u76b2\u5f00\u51ef\u5240\u57b2\u5ffe\u607a\u94e0\u9534\u9f9b\u95f6\u94aa\u94d0\u9897\u58f3\u8bfe\u9a92\u7f02\u8f72\u94b6\u951e\u9894\u57a6\u6073\u9f88\u94ff\u62a0\u5e93\u88e4\u55be\u5757\u4fa9\u90d0\u54d9\u810d\u5bbd\u72ef\u9acb\u77ff\u65f7\u51b5\u8bd3\u8bf3\u909d\u5739\u7ea9\u8d36\u4e8f\u5cbf\u7aa5\u9988\u6e83\u532e\u8489\u6126\u8069\u7bd1\u9603\u951f\u9cb2\u6269\u9614\u86f4\u8721\u814a\u83b1\u6765\u8d56\u5d03\u5f95\u6d9e\u6fd1\u8d49\u7750\u94fc\u765e\u7c41\u84dd\u680f\u62e6\u7bee\u9611\u5170\u6f9c\u8c30\u63fd\u89c8\u61d2\u7f06\u70c2\u6ee5\u5c9a\u6984\u6593\u9567\u8934\u7405\u9606\u9512\u635e\u52b3\u6d9d\u5520\u5d02\u94d1\u94f9\u75e8\u4e50\u9cd3\u956d\u5792\u7c7b\u6cea\u8bd4\u7f27\u7bf1\u72f8\u79bb\u9ca4\u793c\u4e3d\u5389\u52b1\u783e\u5386\u6ca5\u96b6\u4fea\u90e6\u575c\u82c8\u8385\u84e0\u5456\u9026\u9a8a\u7f21\u67a5\u680e\u8f79\u783a\u9502\u9e42\u75a0\u7c9d\u8dde\u96f3\u9ca1\u9ce2\u4fe9\u8054\u83b2\u8fde\u9570\u601c\u6d9f\u5e18\u655b\u8138\u94fe\u604b\u70bc\u7ec3\u8539\u5941\u6f4b\u740f\u6b93\u88e2\u88e3\u9ca2\u7cae\u51c9\u4e24\u8f86\u8c05\u9b49\u7597\u8fbd\u9563\u7f2d\u948c\u9e69\u730e\u4e34\u90bb\u9cde\u51db\u8d41\u853a\u5eea\u6aa9\u8f9a\u8e8f\u9f84\u94c3\u7075\u5cad\u9886\u7eeb\u68c2\u86cf\u9cae\u998f\u5218\u6d4f\u9a9d\u7efa\u954f\u9e68\u9f99\u804b\u5499\u7b3c\u5784\u62e2\u9647\u830f\u6cf7\u73d1\u680a\u80e7\u783b\u697c\u5a04\u6402\u7bd3\u507b\u848c\u55bd\u5d5d\u9542\u7618\u8027\u877c\u9ac5\u82a6\u5362\u9885\u5e90\u7089\u63b3\u5364\u864f\u9c81\u8d42\u7984\u5f55\u9646\u5786\u64b8\u565c\u95fe\u6cf8\u6e0c\u680c\u6a79\u8f73\u8f82\u8f98\u6c07\u80ea\u9e2c\u9e6d\u823b\u9c88\u5ce6\u631b\u5b6a\u6ee6\u4e71\u8114\u5a08\u683e\u9e3e\u92ae\u62a1\u8f6e\u4f26\u4ed1\u6ca6\u7eb6\u8bba\u56f5\u841d\u7f57\u903b\u9523\u7ba9\u9aa1\u9a86\u7edc\u8366\u7321\u6cfa\u6924\u8136\u9559\u9a74\u5415\u94dd\u4fa3\u5c61\u7f15\u8651\u6ee4\u7eff\u6988\u891b\u950a\u5452\u5988\u739b\u7801\u8682\u9a6c\u9a82\u5417\u551b\u5b37\u6769\u4e70\u9ea6\u5356\u8fc8\u8109\u52a2\u7792\u9992\u86ee\u6ee1\u8c29\u7f26\u9558\u98a1\u9cd7\u732b\u951a\u94c6\u8d38\u9ebd\u6ca1\u9541\u95e8\u95f7\u4eec\u626a\u7116\u61d1\u9494\u9530\u68a6\u772f\u8c1c\u5f25\u89c5\u5e42\u8288\u8c27\u7315\u7962\u7ef5\u7f05\u6e11\u817c\u9efe\u5e99\u7f08\u7f2a\u706d\u60af\u95fd\u95f5\u7f17\u9e23\u94ed\u8c2c\u8c1f\u84e6\u998d\u6b81\u9546\u8c0b\u4ea9\u94bc\u5450\u94a0\u7eb3\u96be\u6320\u8111\u607c\u95f9\u94d9\u8bb7\u9981\u5185\u62df\u817b\u94cc\u9cb5\u64b5\u8f87\u9cb6\u917f\u9e1f\u8311\u8885\u8042\u556e\u954a\u954d\u9667\u8616\u55eb\u989f\u8e51\u67e0\u72de\u5b81\u62e7\u6cde\u82ce\u549b\u804d\u94ae\u7ebd\u8113\u6d53\u519c\u4fac\u54dd\u9a7d\u9495\u8bfa\u50a9\u759f\u6b27\u9e25\u6bb4\u5455\u6ca4\u8bb4\u6004\u74ef\u76d8\u8e52\u5e9e\u629b\u75b1\u8d54\u8f94\u55b7\u9e4f\u7eb0\u7f74\u94cd\u9a97\u8c1d\u9a88\u98d8\u7f25\u9891\u8d2b\u5ad4\u82f9\u51ed\u8bc4\u6cfc\u9887\u948b\u6251\u94fa\u6734\u8c31\u9564\u9568\u6816\u8110\u9f50\u9a91\u5c82\u542f\u6c14\u5f03\u8bab\u8572\u9a90\u7eee\u6864\u789b\u9880\u9883\u9ccd\u7275\u948e\u94c5\u8fc1\u7b7e\u8c26\u94b1\u94b3\u6f5c\u6d45\u8c34\u5811\u4f65\u8368\u60ad\u9a9e\u7f31\u6920\u94a4\u67aa\u545b\u5899\u8537\u5f3a\u62a2\u5af1\u6a2f\u6217\u709d\u9516\u9535\u956a\u7f9f\u8dc4\u9539\u6865\u4e54\u4fa8\u7fd8\u7a8d\u8bee\u8c2f\u835e\u7f32\u7857\u8df7\u7a83\u60ec\u9532\u7ba7\u94a6\u4eb2\u5bdd\u9513\u8f7b\u6c22\u503e\u9877\u8bf7\u5e86\u63ff\u9cad\u743c\u7a77\u8315\u86f1\u5def\u8d47\u866e\u9cc5\u8d8b\u533a\u8eaf\u9a71\u9f8b\u8bce\u5c96\u9612\u89d1\u9e32\u98a7\u6743\u529d\u8be0\u7efb\u8f81\u94e8\u5374\u9e4a\u786e\u9615\u9619\u60ab\u8ba9\u9976\u6270\u7ed5\u835b\u5a06\u6861\u70ed\u97e7\u8ba4\u7eab\u996a\u8f6b\u8363\u7ed2\u5d58\u877e\u7f1b\u94f7\u98a6\u8f6f\u9510\u86ac\u95f0\u6da6\u6d12\u8428\u98d2\u9cc3\u8d5b\u4f1e\u6bf5\u7cc1\u4e27\u9a9a\u626b\u7f2b\u6da9\u556c\u94ef\u7a51\u6740\u5239\u7eb1\u94e9\u9ca8\u7b5b\u6652\u917e\u5220\u95ea\u9655\u8d61\u7f2e\u8baa\u59d7\u9a9f\u9490\u9cdd\u5892\u4f24\u8d4f\u57a7\u6b87\u89de\u70e7\u7ecd\u8d4a\u6444\u6151\u8bbe\u538d\u6ee0\u7572\u7ec5\u5ba1\u5a76\u80be\u6e17\u8bdc\u8c02\u6e16\u58f0\u7ef3\u80dc\u5e08\u72ee\u6e7f\u8bd7\u65f6\u8680\u5b9e\u8bc6\u9a76\u52bf\u9002\u91ca\u9970\u89c6\u8bd5\u8c25\u57d8\u83b3\u5f11\u8f7c\u8d33\u94c8\u9ca5\u5bff\u517d\u7ef6\u67a2\u8f93\u4e66\u8d4e\u5c5e\u672f\u6811\u7ad6\u6570\u6445\u7ebe\u5e05\u95e9\u53cc\u8c01\u7a0e\u987a\u8bf4\u7855\u70c1\u94c4\u4e1d\u9972\u53ae\u9a77\u7f0c\u9536\u9e36\u8038\u6002\u9882\u8bbc\u8bf5\u64de\u85ae\u998a\u98d5\u953c\u82cf\u8bc9\u8083\u8c21\u7a23\u867d\u968f\u7ee5\u5c81\u8c07\u5b59\u635f\u7b0b\u836a\u72f2\u7f29\u7410\u9501\u5522\u7743\u736d\u631e\u95fc\u94ca\u9cce\u53f0\u6001\u949b\u9c90\u644a\u8d2a\u762b\u6ee9\u575b\u8c2d\u8c08\u53f9\u6619\u94bd\u952c\u9878\u6c64\u70eb\u50a5\u9967\u94f4\u9557\u6d9b\u7ee6\u8ba8\u97ec\u94fd\u817e\u8a8a\u9511\u9898\u4f53\u5c49\u7f07\u9e48\u9617\u6761\u7c9c\u9f86\u9ca6\u8d34\u94c1\u5385\u542c\u70c3\u94dc\u7edf\u6078\u5934\u94ad\u79c3\u56fe\u948d\u56e2\u629f\u9893\u8715\u9968\u8131\u9e35\u9a6e\u9a7c\u692d\u7ba8\u9f0d\u889c\u5a32\u817d\u5f2f\u6e7e\u987d\u4e07\u7ea8\u7efe\u7f51\u8f8b\u97e6\u8fdd\u56f4\u4e3a\u6f4d\u7ef4\u82c7\u4f1f\u4f2a\u7eac\u8c13\u536b\u8bff\u5e0f\u95f1\u6ca9\u6da0\u73ae\u97ea\u709c\u9c94\u6e29\u95fb\u7eb9\u7a33\u95ee\u960c\u74ee\u631d\u8717\u6da1\u7a9d\u5367\u83b4\u9f8c\u545c\u94a8\u4e4c\u8bec\u65e0\u829c\u5434\u575e\u96fe\u52a1\u8bef\u90ac\u5e91\u6003\u59a9\u9a9b\u9e49\u9e5c\u9521\u727a\u88ad\u4e60\u94e3\u620f\u7ec6\u9969\u960b\u73ba\u89cb\u867e\u8f96\u5ce1\u4fa0\u72ed\u53a6\u5413\u7856\u9c9c\u7ea4\u8d24\u8854\u95f2\u663e\u9669\u73b0\u732e\u53bf\u9985\u7fa1\u5baa\u7ebf\u82cb\u83b6\u85d3\u5c98\u7303\u5a34\u9e47\u75eb\u869d\u7c7c\u8df9\u53a2\u9576\u4e61\u8be6\u54cd\u9879\u8297\u9977\u9aa7\u7f03\u98e8\u8427\u56a3\u9500\u6653\u5578\u54d3\u6f47\u9a81\u7ee1\u67ad\u7bab\u534f\u631f\u643a\u80c1\u8c10\u5199\u6cfb\u8c22\u4eb5\u64b7\u7ec1\u7f2c\u950c\u8845\u5174\u9649\u8365\u51f6\u6c79\u9508\u7ee3\u9990\u9e3a\u865a\u5618\u987b\u8bb8\u53d9\u7eea\u7eed\u8be9\u987c\u8f69\u60ac\u9009\u7663\u7eda\u8c16\u94c9\u955f\u5b66\u8c11\u6cf6\u9cd5\u52cb\u8be2\u5bfb\u9a6f\u8bad\u8baf\u900a\u57d9\u6d54\u9c9f\u538b\u9e26\u9e2d\u54d1\u4e9a\u8bb6\u57ad\u5a05\u6860\u6c29\u9609\u70df\u76d0\u4e25\u5ca9\u989c\u960e\u8273\u538c\u781a\u5f66\u8c1a\u9a8c\u53a3\u8d5d\u4fe8\u5156\u8c33\u6079\u95eb\u917d\u9b47\u990d\u9f39\u9e2f\u6768\u626c\u75a1\u9633\u75d2\u517b\u6837\u7080\u7476\u6447\u5c27\u9065\u7a91\u8c23\u836f\u8f7a\u9e5e\u9cd0\u7237\u9875\u4e1a\u53f6\u9765\u8c12\u90ba\u6654\u70e8\u533b\u94f1\u9890\u9057\u4eea\u8681\u827a\u4ebf\u5fc6\u4e49\u8be3\u8bae\u8c0a\u8bd1\u5f02\u7ece\u8bd2\u5453\u5cc4\u9974\u603f\u9a7f\u7f22\u8f76\u8d3b\u9487\u9552\u9571\u7617\u8223\u836b\u9634\u94f6\u996e\u9690\u94df\u763e\u6a31\u5a74\u9e70\u5e94\u7f28\u83b9\u8424\u8425\u8367\u8747\u8d62\u9896\u8314\u83ba\u8426\u84e5\u6484\u5624\u6ee2\u6f46\u748e\u9e66\u763f\u988f\u7f42\u54df\u62e5\u4f63\u75c8\u8e0a\u548f\u955b\u4f18\u5fe7\u90ae\u94c0\u72b9\u8bf1\u83b8\u94d5\u9c7f\u8206\u9c7c\u6e14\u5a31\u4e0e\u5c7f\u8bed\u72f1\u8a89\u9884\u9a6d\u4f1b\u4fe3\u8c00\u8c15\u84e3\u5d5b\u996b\u9608\u59aa\u7ea1\u89ce\u6b24\u94b0\u9e46\u9e6c\u9f89\u9e33\u6e0a\u8f95\u56ed\u5458\u5706\u7f18\u8fdc\u6a7c\u9e22\u9f0b\u7ea6\u8dc3\u94a5\u7ca4\u60a6\u9605\u94ba\u90e7\u5300\u9668\u8fd0\u8574\u915d\u6655\u97f5\u90d3\u82b8\u607d\u6120\u7ead\u97eb\u6b92\u6c32\u6742\u707e\u8f7d\u6512\u6682\u8d5e\u74d2\u8db1\u933e\u8d43\u810f\u9a75\u51ff\u67a3\u8d23\u62e9\u5219\u6cfd\u8d5c\u5567\u5e3b\u7ba6\u8d3c\u8c2e\u8d60\u7efc\u7f2f\u8f67\u94e1\u95f8\u6805\u8bc8\u658b\u503a\u6be1\u76cf\u65a9\u8f97\u5d2d\u6808\u6218\u7efd\u8c35\u5f20\u6da8\u5e10\u8d26\u80c0\u8d75\u8bcf\u948a\u86f0\u8f99\u9517\u8fd9\u8c2a\u8f84\u9e67\u8d1e\u9488\u4fa6\u8bca\u9547\u9635\u6d48\u7f1c\u6862\u8f78\u8d48\u796f\u9e29\u6323\u7741\u72f0\u4e89\u5e27\u75c7\u90d1\u8bc1\u8be4\u5ce5\u94b2\u94ee\u7b5d\u7ec7\u804c\u6267\u7eb8\u631a\u63b7\u5e1c\u8d28\u6ede\u9a98\u6809\u6800\u8f75\u8f7e\u8d3d\u9e37\u86f3\u7d77\u8e2c\u8e2f\u89ef\u949f\u7ec8\u79cd\u80bf\u4f17\u953a\u8bcc\u8f74\u76b1\u663c\u9aa4\u7ea3\u7ec9\u732a\u8bf8\u8bdb\u70db\u77a9\u5631\u8d2e\u94f8\u9a7b\u4f2b\u69e0\u94e2\u4e13\u7816\u8f6c\u8d5a\u556d\u9994\u989e\u6869\u5e84\u88c5\u5986\u58ee\u72b6\u9525\u8d58\u5760\u7f00\u9a93\u7f12\u8c06\u51c6\u7740\u6d4a\u8bfc\u956f\u5179\u8d44\u6e0d\u8c18\u7f01\u8f8e\u8d40\u7726\u9531\u9f87\u9cbb\u8e2a\u603b\u7eb5\u506c\u90b9\u8bf9\u9a7a\u9cb0\u8bc5\u7ec4\u955e\u94bb\u7f35\u8e9c\u9cdf\u7ff1\u5e76\u535c\u6c89\u4e11\u6dc0\u8fed\u6597\u8303\u5e72\u768b\u7845\u67dc\u540e\u4f19\u79f8\u6770\u8bc0\u5938\u91cc\u51cc\u4e48\u9709\u637b\u51c4\u6266\u5723\u5c38\u62ac\u6d82\u6d3c\u5582\u6c61\u9528\u54b8\u874e\u5f5d\u6d8c\u6e38\u5401\u5fa1\u613f\u5cb3\u4e91\u7076\u624e\u672d\u7b51\u4e8e\u5fd7\u6ce8\u51cb\u8ba0\u8c2b\u90c4\u52d0\u51fc\u5742\u5785\u57b4\u57ef\u57dd\u82d8\u836c\u836e\u839c\u83bc\u83f0\u85c1\u63f8\u5412\u5423\u5494\u549d\u54b4\u5658\u567c\u56af\u5e5e\u5c99\u5d74\u5f77\u5fbc\u72b8\u72cd\u9980\u9987\u9993\u9995\u6123\u61b7\u61d4\u4e2c\u6e86\u6edf\u6eb7\u6f24\u6f74\u6fb9\u752f\u7e9f\u7ed4\u7ef1\u73c9\u67a7\u684a\u6849\u69d4\u6a65\u8f71\u8f77\u8d4d\u80b7\u80e8\u98da\u7173\u7145\u7198\u610d\u6dfc\u781c\u78d9\u770d\u949a\u94b7\u94d8\u94de\u9503\u950d\u950e\u950f\u9518\u951d\u952a\u952b\u953f\u9545\u954e\u9562\u9565\u9569\u9572\u7a06\u9e4b\u9e5b\u9e71\u75ac\u75b4\u75d6\u766f\u88e5\u8941\u8022\u98a5\u87a8\u9eb4\u9c85\u9c86\u9c87\u9c9e\u9cb4\u9cba\u9cbc\u9cca\u9ccb\u9cd8\u9cd9\u9792\u97b4\u9f44" 384 | 385 | def get_big5(): 386 | return "\u9312\u769a\u85f9\u7919\u611b\u566f\u5b21\u74a6\u66d6\u9744\u8af3\u92a8\u9d6a\u9aaf\u8956\u5967\u5abc\u9a41\u9c32\u58e9\u7f77\u9200\u64fa\u6557\u5504\u9812\u8fa6\u7d46\u9211\u5e6b\u7d81\u938a\u8b17\u525d\u98fd\u5bf6\u5831\u9b91\u9d07\u9f59\u8f29\u8c9d\u92c7\u72fd\u5099\u618a\u9d6f\u8cc1\u931b\u7e43\u7b46\u7562\u6583\u5e63\u9589\u84fd\u55f6\u6f77\u924d\u7bf3\u8e55\u908a\u7de8\u8cb6\u8b8a\u8faf\u8fae\u8290\u7df6\u7c69\u6a19\u9a43\u98ae\u98c6\u93e2\u9463\u9c3e\u9c49\u5225\u765f\u7015\u6ff1\u8cd3\u64ef\u5110\u7e7d\u6ab3\u6baf\u81cf\u944c\u9ad5\u9b22\u9905\u7a1f\u64a5\u7f3d\u9251\u99c1\u9911\u9238\u9d53\u88dc\u923d\u8ca1\u53c3\u8836\u6b98\u615a\u6158\u71e6\u9a42\u9ef2\u84bc\u8259\u5009\u6ec4\u5ec1\u5074\u518a\u6e2c\u60fb\u5c64\u8a6b\u9364\u5115\u91f5\u6519\u647b\u87ec\u995e\u8b92\u7e8f\u93df\u7522\u95e1\u986b\u56c5\u8ac2\u8b96\u8546\u61fa\u5b0b\u9a4f\u8998\u79aa\u9414\u5834\u5617\u9577\u511f\u8178\u5ee0\u66a2\u5000\u8407\u60b5\u95b6\u9be7\u9214\u8eca\u5fb9\u7868\u5875\u9673\u896f\u5096\u8af6\u6aec\u78e3\u9f54\u6490\u7a31\u61f2\u8aa0\u9a01\u68d6\u6a89\u92ee\u943a\u7661\u9072\u99b3\u6065\u9f52\u71be\u98ed\u9d1f\u6c96\u885d\u87f2\u5bf5\u9283\u7587\u8e8a\u7c4c\u7da2\u5114\u5e6c\u8b8e\u6ae5\u5eda\u92e4\u96db\u790e\u5132\u89f8\u8655\u82bb\u7d40\u8e95\u50b3\u91e7\u7621\u95d6\u5275\u6134\u9318\u7d9e\u7d14\u9d89\u7dbd\u8f1f\u9f6a\u8fad\u8a5e\u8cdc\u9dbf\u8070\u8525\u56ea\u5f9e\u53e2\u84ef\u9a44\u6a05\u6e4a\u8f33\u8ea5\u7ac4\u651b\u932f\u92bc\u9e7a\u9054\u5660\u97c3\u5e36\u8cb8\u99d8\u7d3f\u64d4\u55ae\u9132\u64a3\u81bd\u619a\u8a95\u5f48\u6bab\u8ce7\u7649\u7c1e\u7576\u64cb\u9ee8\u8569\u6a94\u8b9c\u78ad\u8960\u6417\u5cf6\u79b1\u5c0e\u76dc\u71fe\u71c8\u9127\u9419\u6575\u6ecc\u905e\u7de0\u7cf4\u8a46\u8ae6\u7d88\u89bf\u93d1\u985b\u9ede\u588a\u96fb\u5dd4\u923f\u7672\u91e3\u8abf\u929a\u9bdb\u8adc\u758a\u9c08\u91d8\u9802\u9320\u8a02\u92cc\u4e1f\u92a9\u6771\u52d5\u68df\u51cd\u5d20\u9d87\u7ac7\u72a2\u7368\u8b80\u8ced\u934d\u7006\u6add\u7258\u7be4\u9ef7\u935b\u65b7\u7dde\u7c6a\u514c\u968a\u5c0d\u61df\u9413\u5678\u9813\u920d\u71c9\u8e89\u596a\u58ae\u9438\u9d5d\u984d\u8a1b\u60e1\u9913\u8ae4\u580a\u95bc\u8edb\u92e8\u9354\u9d9a\u984e\u9853\u9c77\u8a92\u5152\u723e\u990c\u8cb3\u9087\u927a\u9d2f\u9b9e\u767c\u7f70\u95a5\u743a\u792c\u91e9\u7169\u8ca9\u98ef\u8a2a\u7d21\u9201\u9b74\u98db\u8ab9\u5ee2\u8cbb\u7dcb\u9428\u9be1\u7d1b\u58b3\u596e\u61a4\u7cde\u50e8\u8c50\u6953\u92d2\u98a8\u760b\u99ae\u7e2b\u8af7\u9cf3\u7043\u819a\u8f3b\u64ab\u8f14\u8ce6\u5fa9\u8ca0\u8a03\u5a66\u7e1b\u9ce7\u99d9\u7d31\u7d3c\u8cfb\u9ea9\u9b92\u9c12\u91d3\u8a72\u9223\u84cb\u8cc5\u687f\u8d95\u7a08\u8d1b\u5c37\u641f\u7d3a\u5ca1\u525b\u92fc\u7db1\u5d17\u6207\u93ac\u776a\u8aa5\u7e1e\u92ef\u64f1\u9d3f\u95a3\u927b\u500b\u7d07\u9398\u6f41\u7d66\u4e99\u8ce1\u7d86\u9bc1\u9f94\u5bae\u978f\u8ca2\u9264\u6e9d\u830d\u69cb\u8cfc\u5920\u8a6c\u7df1\u89af\u8831\u9867\u8a41\u8f42\u9237\u932e\u9d23\u9d60\u9dbb\u526e\u639b\u9d30\u6451\u95dc\u89c0\u9928\u6163\u8cab\u8a7f\u645c\u9e1b\u9c25\u5ee3\u7377\u898f\u6b78\u9f9c\u95a8\u8ecc\u8a6d\u8cb4\u528a\u532d\u528c\u5aaf\u6a9c\u9bad\u9c56\u8f25\u6efe\u889e\u7dc4\u9bc0\u934b\u570b\u904e\u581d\u54bc\u5e57\u69e8\u87c8\u927f\u99ed\u97d3\u6f22\u95de\u7d4e\u9821\u865f\u705d\u9865\u95a1\u9db4\u8cc0\u8a36\u95d4\u8823\u6a6b\u8f5f\u9d3b\u7d05\u9ecc\u8a0c\u8452\u958e\u9c5f\u58fa\u8b77\u6eec\u6236\u6ef8\u9d98\u5629\u83ef\u756b\u5283\u8a71\u9a4a\u6a3a\u93f5\u61f7\u58de\u6b61\u74b0\u9084\u7de9\u63db\u559a\u7613\u7165\u6e19\u5950\u7e6f\u9370\u9bc7\u9ec3\u8b0a\u9c09\u63ee\u8f1d\u6bc0\u8cc4\u7a62\u6703\u71f4\u532f\u8af1\u8aa8\u7e6a\u8a7c\u8588\u5666\u6fae\u7e62\u743f\u6689\u8477\u6e3e\u8ae2\u991b\u95bd\u7372\u8ca8\u798d\u9225\u944a\u64ca\u6a5f\u7a4d\u9951\u8de1\u8b4f\u96de\u7e3e\u7ddd\u6975\u8f2f\u7d1a\u64e0\u5e7e\u858a\u5291\u6fdf\u8a08\u8a18\u969b\u7e7c\u7d00\u8a10\u8a70\u85ba\u5630\u568c\u9a65\u74a3\u89ac\u9f4f\u78ef\u7f88\u8806\u8e8b\u973d\u9c6d\u9bfd\u593e\u83a2\u9830\u8cc8\u9240\u50f9\u99d5\u90df\u6d79\u92cf\u93b5\u87ef\u6bb2\u76e3\u5805\u7b8b\u9593\u8271\u7dd8\u7e6d\u6aa2\u583f\u9e7c\u63c0\u64bf\u7c21\u5109\u6e1b\u85a6\u6abb\u9452\u8e10\u8ce4\u898b\u9375\u8266\u528d\u991e\u6f38\u6ffa\u6f97\u8aeb\u7e11\u6214\u6229\u77bc\u9dbc\u7b67\u9c39\u97c9\u5c07\u6f3f\u8523\u69f3\u734e\u8b1b\u91ac\u7d73\u97c1\u81a0\u6f86\u9a55\u5b0c\u652a\u9278\u77ef\u50e5\u8173\u9903\u7e73\u7d5e\u8f4e\u8f03\u649f\u5da0\u9de6\u9bab\u968e\u7bc0\u6f54\u7d50\u8aa1\u5c46\u7664\u981c\u9b9a\u7dca\u9326\u50c5\u8b39\u9032\u6649\u71fc\u76e1\u52c1\u834a\u8396\u5df9\u85ce\u9949\u7e09\u8d10\u89b2\u9be8\u9a5a\u7d93\u9838\u975c\u93e1\u5f91\u75d9\u7af6\u51c8\u5244\u6d87\u9015\u5f33\u811b\u975a\u7cfe\u5ec4\u820a\u9b2e\u9ce9\u9df2\u99d2\u8209\u64da\u92f8\u61fc\u5287\u8a4e\u5c68\u6af8\u98b6\u9245\u92e6\u7ab6\u9f5f\u9d51\u7d79\u9308\u942b\u96cb\u89ba\u6c7a\u7d55\u8b4e\u73a8\u921e\u8ecd\u99ff\u76b8\u958b\u51f1\u5274\u584f\u613e\u6137\u93a7\u9347\u9f95\u958c\u9227\u92ac\u9846\u6bbc\u8ab2\u9a0d\u7dd9\u8efb\u9233\u9301\u9837\u58be\u61c7\u9f66\u93d7\u6473\u5eab\u8932\u56b3\u584a\u5108\u9136\u5672\u81be\u5bec\u736a\u9ad6\u7926\u66e0\u6cc1\u8a86\u8a91\u913a\u58d9\u7e8a\u8cba\u8667\u5dcb\u7aba\u994b\u6f70\u5331\u8562\u6192\u8075\u7c23\u95ab\u9315\u9be4\u64f4\u95ca\u8810\u881f\u81d8\u840a\u4f86\u8cf4\u5d0d\u5fa0\u6df6\u7028\u8cda\u775e\u9338\u7669\u7c5f\u85cd\u6b04\u6514\u7c43\u95cc\u862d\u703e\u8b95\u652c\u89bd\u61f6\u7e9c\u721b\u6feb\u5d50\u6b16\u6595\u946d\u8964\u746f\u95ac\u92c3\u6488\u52de\u6f87\u562e\u5d97\u92a0\u9412\u7646\u6a02\u9c33\u9433\u58d8\u985e\u6dda\u8a84\u7e32\u7c6c\u8c8d\u96e2\u9bc9\u79ae\u9e97\u53b2\u52f5\u792b\u6b77\u701d\u96b8\u5137\u9148\u58e2\u85f6\u849e\u863a\u56a6\u9090\u9a6a\u7e2d\u6aea\u6adf\u8f62\u792a\u92f0\u9e1d\u7658\u7cf2\u8e92\u9742\u9c7a\u9c67\u5006\u806f\u84ee\u9023\u942e\u6190\u6f23\u7c3e\u6582\u81c9\u93c8\u6200\u7149\u7df4\u861e\u5969\u7032\u7489\u6bae\u8933\u895d\u9c31\u7ce7\u6dbc\u5169\u8f1b\u8ad2\u9b4e\u7642\u907c\u9410\u7e5a\u91d5\u9def\u7375\u81e8\u9130\u9c57\u51dc\u8cc3\u85fa\u5ee9\u6a81\u8f54\u8eaa\u9f61\u9234\u9748\u5dba\u9818\u7dbe\u6b1e\u87f6\u9bea\u993e\u5289\u700f\u9a2e\u7db9\u93a6\u9dda\u9f8d\u807e\u56a8\u7c60\u58df\u650f\u96b4\u8622\u7027\u74cf\u6af3\u6727\u7931\u6a13\u5a41\u645f\u7c0d\u50c2\u851e\u560d\u5d81\u93e4\u763a\u802c\u87bb\u9acf\u8606\u76e7\u9871\u5eec\u7210\u64c4\u9e75\u865c\u9b6f\u8cc2\u797f\u9304\u9678\u58da\u64fc\u5695\u95ad\u7018\u6de5\u6ae8\u6ad3\u8f64\u8f05\u8f46\u6c0c\u81da\u9e15\u9dfa\u826b\u9c78\u5dd2\u6523\u5b7f\u7064\u4e82\u81e0\u5b4c\u6b12\u9e1e\u947e\u6384\u8f2a\u502b\u4f96\u6dea\u7db8\u8ad6\u5707\u863f\u7f85\u908f\u947c\u7c6e\u9a3e\u99f1\u7d61\u7296\u7380\u6ffc\u6b0f\u8161\u93cd\u9a62\u5442\u92c1\u4fb6\u5c62\u7e37\u616e\u6ffe\u7da0\u6ada\u8938\u92dd\u5638\u5abd\u746a\u78bc\u879e\u99ac\u7f75\u55ce\u561c\u5b24\u69aa\u8cb7\u9ea5\u8ce3\u9081\u8108\u52f1\u779e\u9945\u883b\u6eff\u8b3e\u7e35\u93dd\u9859\u9c3b\u8c93\u9328\u925a\u8cbf\u9ebc\u6c92\u9382\u9580\u60b6\u5011\u636b\u71dc\u61e3\u9346\u9333\u5922\u7787\u8b0e\u5f4c\u8993\u51aa\u7f8b\u8b10\u737c\u79b0\u7dbf\u7dec\u6fa0\u9766\u9efd\u5edf\u7df2\u7e46\u6ec5\u61ab\u95a9\u9594\u7de1\u9cf4\u9298\u8b2c\u8b28\u9a40\u9943\u6b7f\u93cc\u8b00\u755d\u926c\u5436\u9209\u7d0d\u96e3\u6493\u8166\u60f1\u9b27\u9403\u8a25\u9912\u5167\u64ec\u81a9\u922e\u9be2\u6506\u8f26\u9bf0\u91c0\u9ce5\u8526\u88ca\u8076\u5699\u9477\u93b3\u9689\u8617\u56c1\u9862\u8ea1\u6ab8\u7370\u5be7\u64f0\u6fd8\u82e7\u5680\u8079\u9215\u7d10\u81bf\u6fc3\u8fb2\u5102\u5665\u99d1\u91f9\u8afe\u513a\u7627\u6b50\u9dd7\u6bc6\u5614\u6f1a\u8b33\u616a\u750c\u76e4\u8e63\u9f90\u62cb\u76b0\u8ce0\u8f61\u5674\u9d6c\u7d15\u7f86\u9239\u9a19\u8ade\u99e2\u98c4\u7e39\u983b\u8ca7\u5b2a\u860b\u6191\u8a55\u6f51\u9817\u91d9\u64b2\u92ea\u6a38\u8b5c\u93f7\u9420\u68f2\u81cd\u9f4a\u9a0e\u8c48\u555f\u6c23\u68c4\u8a16\u8604\u9a0f\u7dba\u69bf\u78e7\u980e\u980f\u9c2d\u727d\u91ec\u925b\u9077\u7c3d\u8b19\u9322\u9257\u6f5b\u6dfa\u8b74\u5879\u50c9\u8541\u6173\u9a2b\u7e7e\u69e7\u9210\u69cd\u55c6\u58bb\u8594\u5f37\u6436\u5b19\u6aa3\u6227\u7197\u9306\u93d8\u93f9\u7fa5\u8e4c\u936c\u6a4b\u55ac\u50d1\u7ff9\u7ac5\u8a9a\u8b59\u854e\u7e70\u78fd\u8e7a\u7aca\u611c\u9365\u7bcb\u6b3d\u89aa\u5be2\u92df\u8f15\u6c2b\u50be\u9803\u8acb\u6176\u64b3\u9bd6\u74ca\u7aae\u7162\u86fa\u5df0\u8cd5\u87e3\u9c0d\u8da8\u5340\u8ec0\u9a45\u9f72\u8a58\u5d87\u95c3\u89b7\u9d1d\u9874\u6b0a\u52f8\u8a6e\u7da3\u8f07\u9293\u537b\u9d72\u78ba\u95cb\u95d5\u6128\u8b93\u9952\u64fe\u7e5e\u8558\u5b08\u6a48\u71b1\u97cc\u8a8d\u7d09\u98ea\u8ed4\u69ae\u7d68\u5db8\u8811\u7e1f\u92a3\u9870\u8edf\u92b3\u8706\u958f\u6f64\u7051\u85a9\u98af\u9c13\u8cfd\u5098\u6bff\u7cdd\u55aa\u9a37\u6383\u7e45\u6f80\u55c7\u92ab\u7a61\u6bba\u524e\u7d17\u93a9\u9bca\u7be9\u66ec\u91c3\u522a\u9583\u965c\u8d0d\u7e55\u8a15\u59cd\u9a38\u91e4\u9c54\u5891\u50b7\u8cde\u5770\u6ba4\u89f4\u71d2\u7d39\u8cd2\u651d\u61fe\u8a2d\u5399\u7044\u756c\u7d33\u5be9\u5b38\u814e\u6ef2\u8a75\u8ad7\u700b\u8072\u7e69\u52dd\u5e2b\u7345\u6fd5\u8a69\u6642\u8755\u5be6\u8b58\u99db\u52e2\u9069\u91cb\u98fe\u8996\u8a66\u8b1a\u5852\u8494\u5f12\u8efe\u8cb0\u9230\u9c23\u58fd\u7378\u7dac\u6a1e\u8f38\u66f8\u8d16\u5c6c\u8853\u6a39\u8c4e\u6578\u6504\u7d13\u5e25\u9582\u96d9\u8ab0\u7a05\u9806\u8aaa\u78a9\u720d\u9460\u7d72\u98fc\u5edd\u99df\u7de6\u9376\u9de5\u8073\u616b\u980c\u8a1f\u8aa6\u64fb\u85ea\u993f\u98bc\u93aa\u8607\u8a34\u8085\u8b16\u7a4c\u96d6\u96a8\u7d8f\u6b72\u8ab6\u5b6b\u640d\u7b4d\u84c0\u733b\u7e2e\u7463\u9396\u55e9\u8127\u737a\u64bb\u95e5\u9248\u9c28\u81fa\u614b\u9226\u9b90\u6524\u8caa\u7671\u7058\u58c7\u8b5a\u8ac7\u5606\u66c7\u926d\u931f\u9807\u6e6f\u71d9\u513b\u9933\u940b\u93dc\u6fe4\u7d73\u8a0e\u97dc\u92f1\u9a30\u8b04\u92bb\u984c\u9ad4\u5c5c\u7df9\u9d5c\u95d0\u689d\u7cf6\u9f60\u9c37\u8cbc\u9435\u5ef3\u807d\u70f4\u9285\u7d71\u615f\u982d\u9204\u79bf\u5716\u91f7\u5718\u6476\u9839\u86fb\u98e9\u812b\u9d15\u99b1\u99dd\u6a62\u7c5c\u9f09\u896a\u5aa7\u8183\u5f4e\u7063\u9811\u842c\u7d08\u7db0\u7db2\u8f1e\u97cb\u9055\u570d\u70ba\u6ff0\u7dad\u8466\u5049\u507d\u7def\u8b02\u885b\u8ac9\u5e43\u95c8\u6e88\u6f7f\u744b\u97d9\u7152\u9baa\u6eab\u805e\u7d0b\u7a69\u554f\u95bf\u7515\u64be\u8778\u6e26\u7aa9\u81e5\u8435\u9f77\u55da\u93a2\u70cf\u8aa3\u7121\u856a\u5433\u5862\u9727\u52d9\u8aa4\u9114\u5ee1\u61ae\u5af5\u9a16\u9d61\u9da9\u932b\u72a7\u8972\u7fd2\u9291\u6232\u7d30\u993c\u9b29\u74bd\u89a1\u8766\u8f44\u5cfd\u4fe0\u72f9\u5ec8\u5687\u7864\u9bae\u7e96\u8ce2\u929c\u9591\u986f\u96aa\u73fe\u737b\u7e23\u9921\u7fa8\u61b2\u7dda\u83a7\u859f\u861a\u5cf4\u736b\u5afb\u9df4\u7647\u8814\u79c8\u8e9a\u5ec2\u9472\u9109\u8a73\u97ff\u9805\u858c\u9909\u9a64\u7dd7\u9957\u856d\u56c2\u92b7\u66c9\u562f\u5635\u701f\u9a4d\u7d83\u689f\u7c2b\u5354\u633e\u651c\u8105\u8ae7\u5beb\u7009\u8b1d\u893b\u64f7\u7d32\u7e88\u92c5\u91c1\u8208\u9658\u6ece\u5147\u6d36\u92b9\u7e61\u9948\u9d42\u865b\u5653\u9808\u8a31\u6558\u7dd2\u7e8c\u8a61\u980a\u8ed2\u61f8\u9078\u766c\u7d62\u8afc\u9249\u93c7\u5b78\u8b14\u6fa9\u9c48\u52db\u8a62\u5c0b\u99b4\u8a13\u8a0a\u905c\u5864\u6f6f\u9c58\u58d3\u9d09\u9d28\u555e\u4e9e\u8a1d\u57e1\u5a6d\u690f\u6c2c\u95b9\u7159\u9e7d\u56b4\u5dd6\u984f\u95bb\u8277\u53ad\u786f\u5f65\u8afa\u9a57\u53b4\u8d17\u513c\u5157\u8b9e\u61e8\u9586\u91c5\u9b58\u995c\u9f34\u9d26\u694a\u63da\u760d\u967d\u7662\u990a\u6a23\u716c\u7464\u6416\u582f\u9059\u7aaf\u8b20\u85e5\u8efa\u9dc2\u9c29\u723a\u9801\u696d\u8449\u9768\u8b01\u9134\u66c4\u71c1\u91ab\u92a5\u9824\u907a\u5100\u87fb\u85dd\u5104\u61b6\u7fa9\u8a63\u8b70\u8abc\u8b6f\u7570\u7e79\u8a52\u56c8\u5da7\u98f4\u61cc\u9a5b\u7e0a\u8efc\u8cbd\u91d4\u93b0\u943f\u761e\u8264\u852d\u9670\u9280\u98f2\u96b1\u92a6\u766e\u6afb\u5b30\u9df9\u61c9\u7e93\u7469\u87a2\u71df\u7192\u8805\u8d0f\u7a4e\u584b\u9daf\u7e08\u93a3\u6516\u56b6\u7005\u7020\u74d4\u9e1a\u766d\u9826\u7f4c\u55b2\u64c1\u50ad\u7670\u8e34\u8a60\u93de\u512a\u6182\u90f5\u923e\u7336\u8a98\u8555\u92aa\u9b77\u8f3f\u9b5a\u6f01\u5a1b\u8207\u5dbc\u8a9e\u7344\u8b7d\u9810\u99ad\u50b4\u4fc1\u8adb\u8aed\u8577\u5d33\u98eb\u95be\u5ad7\u7d06\u89a6\u6b5f\u923a\u9d52\u9df8\u9f6c\u9d1b\u6df5\u8f45\u5712\u54e1\u5713\u7de3\u9060\u6ade\u9cf6\u9eff\u7d04\u8e8d\u9470\u7cb5\u6085\u95b1\u925e\u9116\u52fb\u9695\u904b\u860a\u919e\u6688\u97fb\u9106\u8553\u60f2\u614d\u7d1c\u97de\u6b9e\u6c33\u96dc\u707d\u8f09\u6522\u66ab\u8d0a\u74da\u8db2\u93e8\u8d13\u81df\u99d4\u947f\u68d7\u8cac\u64c7\u5247\u6fa4\u8cfe\u5616\u5e58\u7c00\u8cca\u8b56\u8d08\u7d9c\u7e52\u8ecb\u9358\u9598\u67f5\u8a50\u9f4b\u50b5\u6c08\u76de\u65ac\u8f3e\u5d84\u68e7\u6230\u7dbb\u8b6b\u5f35\u6f32\u5e33\u8cec\u8139\u8d99\u8a54\u91d7\u87c4\u8f4d\u937a\u9019\u8b2b\u8f12\u9dd3\u8c9e\u91dd\u5075\u8a3a\u93ae\u9663\u6e5e\u7e1d\u6968\u8eeb\u8cd1\u798e\u9d06\u6399\u775c\u7319\u722d\u5e40\u7665\u912d\u8b49\u8acd\u5d22\u9266\u931a\u7b8f\u7e54\u8077\u57f7\u7d19\u646f\u64f2\u5e5f\u8cea\u6eef\u9a2d\u6adb\u6894\u8ef9\u8f0a\u8d04\u9dd9\u8784\u7e36\u8e93\u8e91\u89f6\u9418\u7d42\u7a2e\u816b\u773e\u937e\u8b05\u8ef8\u76ba\u665d\u9a5f\u7d02\u7e10\u8c6c\u8af8\u8a85\u71ed\u77da\u56d1\u8caf\u9444\u99d0\u4f47\u6ae7\u9296\u5c08\u78da\u8f49\u8cfa\u56c0\u994c\u9873\u6a01\u838a\u88dd\u599d\u58ef\u72c0\u9310\u8d05\u589c\u7db4\u9a05\u7e0b\u8ac4\u6e96\u8457\u6fc1\u8ad1\u9432\u8332\u8cc7\u6f2c\u8aee\u7dc7\u8f1c\u8cb2\u7725\u9319\u9f5c\u9bd4\u8e64\u7e3d\u7e31\u50af\u9112\u8acf\u9a36\u9beb\u8a5b\u7d44\u93c3\u9246\u7e98\u8ea6\u9c52\u7ffa\u4e26\u8514\u6c88\u919c\u6fb1\u53e0\u9b25\u7bc4\u5e79\u81ef\u77fd\u6ac3\u5f8c\u5925\u7a2d\u5091\u8a23\u8a87\u88cf\u6de9\u9ebd\u9ef4\u649a\u6dd2\u6261\u8056\u5c4d\u64e1\u5857\u7aaa\u9935\u6c59\u9341\u9e79\u880d\u5f5c\u6e67\u904a\u7c72\u79a6\u9858\u5dbd\u96f2\u7ac8\u7d2e\u5284\u7bc9\u65bc\u8a8c\u8a3b\u96d5\u8a01\u8b7e\u90e4\u731b\u6c39\u962a\u58df\u5816\u57b5\u588a\u6abe\u8552\u8464\u84e7\u8493\u83c7\u69c1\u6463\u54a4\u551a\u54e2\u565d\u5645\u6485\u5288\u8b14\u8946\u5db4\u810a\u4eff\u50e5\u7341\u9e85\u9918\u9937\u994a\u9962\u695e\u6035\u61cd\u723f\u6f35\u7069\u6df7\u6feb\u7026\u6de1\u5be7\u7cf8\u7d5d\u7dd4\u7449\u6898\u68ec\u6848\u6a70\u6aeb\u8ef2\u8ee4\u8ceb\u8181\u8156\u98c8\u7cca\u7146\u6e9c\u6e63\u6e3a\u78b8\u6efe\u7798\u9208\u9255\u92e3\u92b1\u92e5\u92f6\u9426\u9427\u9369\u9340\u9343\u9307\u9384\u9387\u93bf\u941d\u9465\u9479\u9454\u7a6d\u9d93\u9da5\u9e0c\u7667\u5c59\u7602\u81d2\u8947\u7e48\u802e\u986c\u87ce\u9eaf\u9b81\u9b83\u9b8e\u9bd7\u9bdd\u9bf4\u9c5d\u9bff\u9c20\u9c35\u9c45\u97bd\u97dd\u9f47" 387 | 388 | UNIHAN = load('unihan', get_unihan) 389 | GY = load('guangyun', get_gy) 390 | GBK = load('gbk', get_gbk) 391 | BIG5 = load('big5', get_big5) 392 | TLS = load('tls', get_tls) 393 | tone_converter = dict(zip( 394 | list('áéíóúýāēīōūȳàèìòùỳǎěǐǒǔ'), 395 | list('aeiouyaeiouyaeiouyaeiou') 396 | )) 397 | IDS = load('ids', get_ids) 398 | cp = tokenizer.Tokenizer(_path('chinese.tsv')) 399 | -------------------------------------------------------------------------------- /src/sinopy/data/README.md: -------------------------------------------------------------------------------- 1 | # Data Sources for SinoPy 2 | 3 | * TLS data stems from the Thesaurus Linguae Sericae 4 | * Unihan data stems from the Unihan project 5 | * Data on Chinese characters (motivation and structure) stems from the [CJKV project](https://github.com/cjkvi/) 6 | -------------------------------------------------------------------------------- /src/sinopy/data/profiles/mch.prf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingpy/sinopy/de74b43886ba24ad6169427c5f455f75c2a13637/src/sinopy/data/profiles/mch.prf -------------------------------------------------------------------------------- /src/sinopy/data/wang1980.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingpy/sinopy/de74b43886ba24ad6169427c5f455f75c2a13637/src/sinopy/data/wang1980.tsv -------------------------------------------------------------------------------- /src/sinopy/data/yiti.tsv: -------------------------------------------------------------------------------- 1 | 襉 襇 2 | -------------------------------------------------------------------------------- /src/sinopy/hanzi.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for operations with Chinese characters 3 | """ 4 | from sinopy.data import * 5 | from sinopy.util import is_chinese 6 | from lingpy.sequence.ngrams import get_all_ngrams 7 | from lingpy.sequence.ngrams import trigrams 8 | from sinopy.util import is_chinese 9 | 10 | ids = get_ids() 11 | idsr = {v: k for k, v in ids.items()} 12 | helper = lambda x: len(x) == 1 and ord(x[0]) in range(12250, 12300) 13 | 14 | def compose(chars, ids=ids, _return_list=False): 15 | """ 16 | Find the complex character for character components. 17 | """ 18 | # test: ⿰亻⿱立日 -> 偣 19 | # replace triples by their counterparts 20 | #triples = [ 21 | # ('⿳', '⿱⿱'), 22 | # ] 23 | #for s, t in triples: 24 | # chars = chars.replace(s, t) 25 | # search for xAB pairs 26 | while True: 27 | found = False 28 | if len(chars) > 1: 29 | for i, grams in enumerate( 30 | sorted( 31 | get_all_ngrams(chars), 32 | key=lambda x: len(x), 33 | reverse=True 34 | ) 35 | ): 36 | if helper(grams[0]) and grams in ids and len(grams) > 1: 37 | idx = chars.index(grams) 38 | chars = chars[:idx]+ids[grams]+chars[idx+len(grams):] 39 | found = True 40 | break 41 | if not found: 42 | return '?'+chars 43 | else: 44 | break 45 | return chars 46 | 47 | def decompose(character, ids=idsr, _return_list=False, depth=2): 48 | """ 49 | Split a character into its components. 50 | """ 51 | 52 | chars = [(character, 0)] 53 | out = '' 54 | if depth == 0: 55 | return character 56 | 57 | while chars: 58 | charls, d = chars.pop(0) 59 | nchars = [] 60 | for char in charls: 61 | if helper(char): 62 | nchars += ['--'+char] 63 | elif char in ids and ids[char] != char: 64 | nchars += list(ids[char]) 65 | else: 66 | nchars += ['--'+char] 67 | if d < depth and [x for x in nchars if not x.startswith('--')]: 68 | chars += [(nchars, d+1)] 69 | return ''.join([x.lstrip('-') for x in nchars]) 70 | 71 | 72 | def character_from_structure(motivation): 73 | """ 74 | Find a character for a given structure. 75 | 76 | Note 77 | ---- 78 | This is a utility function that makes it easier to parse the more complex 79 | motivation structures that are handled by compose and decompose. 80 | """ 81 | assert len(motivation) == 3 82 | 83 | _c = { 84 | "+": "⿰", 85 | "-": "⿱", 86 | '>': "⿱", 87 | "手": "扌", 88 | "人": "亻", 89 | "刀": "刂", 90 | "丝": "糹", 91 | "水": "氵", 92 | "0": "⿴", 93 | } 94 | structure = ''.join([_c.get(x, x) for x in motivation]) 95 | return IDS.get(structure, '?') 96 | 97 | 98 | def gbk2big5(chars): 99 | """ 100 | Convert from gbk format to big5 representation of chars. 101 | """ 102 | out = '' 103 | for char in chars: 104 | if char in GBK: 105 | out += BIG5[GBK.index(char)] 106 | else: 107 | out += char 108 | return out 109 | 110 | def pinyin(char, variant='mandarin', sep=' ', out='tones'): 111 | """ 112 | Retrieve Pinyin of a character. 113 | """ 114 | if len(char) > 1: 115 | return sep.join([pinyin(c, variant=variant, sep=sep, out=out) for c in char]) 116 | 117 | if not is_chinese(char): 118 | return char 119 | 120 | if char in GBK: 121 | char = gbk2big5(char) 122 | 123 | out_char = UNIHAN.get(char, {variant: '?({0}'.format(char)}).get(variant, '!({0})'.format(char)) 124 | 125 | if out != 'tones': 126 | out_char = ''.join([tone_converter.get(x, x) for x in out_char]) 127 | 128 | return out_char 129 | 130 | 131 | def chars2baxter(chars): 132 | """ 133 | Convert a sequence of Characters to their MCH values. 134 | """ 135 | 136 | out = [] 137 | chars = gbk2big5(chars) 138 | 139 | for char in chars: 140 | tmp = [] 141 | if char in TLS: 142 | for entry in TLS[char]: 143 | baxter = TLS[char][entry]['BAXTER'] 144 | if baxter != '?': 145 | tmp += [baxter] 146 | out += [','.join(tmp)] 147 | return out 148 | 149 | 150 | def big52gbk(chars): 151 | """ 152 | Convert from long chars to short chars. 153 | """ 154 | out = '' 155 | for char in chars: 156 | if char in BIG5: 157 | out += GBK[BIG5.index(char)] 158 | else: 159 | out += char 160 | return out 161 | 162 | 163 | def chars2gloss(chars): 164 | """ 165 | Get the TLS basic gloss for a characters. 166 | """ 167 | out = [] 168 | chars = gbk2big5(chars) 169 | for char in chars: 170 | tmp = [] 171 | if char in TLS: 172 | for entry in TLS[char]: 173 | baxter = TLS[char][entry]['UNIHAN_GLOSS'] 174 | if baxter != '?': 175 | tmp += [baxter] 176 | out += [','.join(tmp)] 177 | return out 178 | -------------------------------------------------------------------------------- /src/sinopy/seaquence.py: -------------------------------------------------------------------------------- 1 | """ 2 | Operations on data from South-East-Asian languages 3 | """ 4 | import lingpy 5 | from lingpy.sequence import sound_classes as lsc 6 | from clldutils.text import strip_chars 7 | 8 | def parse_chinese_morphemes( 9 | seq, 10 | context=False, 11 | model=False 12 | ): 13 | """ 14 | Parse a Chinese syllable and return its basic structure. 15 | 16 | """ 17 | 18 | if not model: 19 | model = lingpy.rc('art') 20 | 21 | # get the tokens 22 | if isinstance(seq, list): 23 | tokens = [s for s in seq] 24 | else: 25 | tokens = lingpy.ipa2tokens(seq, merge_vowels=False) 26 | 27 | # get the sound classes according to the art-model 28 | arts = [int(x) for x in lingpy.tokens2class(tokens, model, cldf=True)] 29 | 30 | # get the pro-string 31 | prostring = lingpy.prosodic_string(arts, cldf=True) 32 | 33 | # parse the zip of tokens and arts 34 | I,M,N,C,T = '','','','','' 35 | 36 | ini = False 37 | med = False 38 | nuc = False 39 | cod = False 40 | ton = False 41 | 42 | triples = [('?','?','?')]+list(zip( 43 | tokens, arts, prostring))+[('?','?','?')] 44 | 45 | for i in range(1,len(triples)-1): 46 | t, c, p = triples[i] 47 | _t, _c, _p = triples[i-1] 48 | t_, c_, p_ = triples[i+1] 49 | 50 | # check for initial entry first 51 | if p == 'A' and _t == '?': 52 | # now, if we have a j-sound and a vowel follows, we go directly to 53 | # medial environment 54 | if t[0] in 'jɥw': 55 | med = True 56 | ini,nuc,cod,ton = False,False,False,False 57 | else: 58 | ini = True 59 | med,nuc,doc,ton = False,False,False,False 60 | 61 | # check for initial vowel 62 | elif p == 'X' and _t == '?': 63 | if t[0] in 'iuy' and c_ == '7': 64 | med = True 65 | ini, nuc, cod, ton = False,False,False,False 66 | else: 67 | nuc = True 68 | ini, med, cod, ton = False,False,False,False 69 | 70 | # check for medial after initial 71 | elif p == 'C': 72 | med = True 73 | ini, nuc, cod, ton = False,False,False,False 74 | 75 | # check for vowel medial 76 | elif p == 'X' and p_ == 'Y': 77 | 78 | # if we have a medial vowel, we classify it as medial 79 | if t in 'iyu': 80 | med = True 81 | ini, nuc, cod, ton = False,False,False,False 82 | else: 83 | nuc = True 84 | ini, med, cod, ton = False,False,False,False 85 | 86 | # check for vowel without medial 87 | elif p == 'X' or p == 'Y': 88 | if p_ in 'LTY' or p_ == '?': 89 | nuc = True 90 | ini, med, cod, ton = False,False,False,False 91 | elif p == 'Y': 92 | nuc = True 93 | ini, med, cod, ton = 4 * [False] 94 | else: 95 | cod = True 96 | ini, med, nuc, ton = 4 * [False] 97 | 98 | # check for consonant 99 | elif p == 'L': 100 | cod = True 101 | ini, med, nuc, ton = 4 * [False] 102 | 103 | # check for tone 104 | elif p == 'T': 105 | ton = True 106 | ini, med, nuc, cod = 4 * [False] 107 | 108 | if ini: 109 | I += t 110 | elif med: 111 | M += t 112 | elif nuc: 113 | N += t 114 | elif cod: 115 | C += t 116 | else: 117 | T += t 118 | 119 | # bad conversion for output, but makes what it is supposed to do 120 | out = [I,M,N,C,T] 121 | tf = lambda x: x if x else '-' 122 | out = [tf(x) for x in out] 123 | 124 | # transform tones to normal letters 125 | tones = dict(zip('¹²³⁴⁵⁶⁷⁸⁹⁰₁₂₃₄₅₆₇₈₉₀', '1234567890123456789')) 126 | 127 | # now, if context is wanted, we'll yield that 128 | ic = '1' if [x for x in I if x in 'bdgmnŋȵɳɴ'] else '0' 129 | mc = '1' if [m for m in M+N if m in 'ijyɥ'] else '0' 130 | cc = '1' if C in 'ptkʔ' else '0' 131 | tc = ''.join([tones.get(x, x) for x in T]) 132 | 133 | IC = '/'.join(['I',ic, mc, cc, tc]) if I else '' 134 | MC = '/'.join(['M',ic, mc, cc, tc]) if M else '' 135 | NC = '/'.join(['N',ic, mc, cc, tc]) if N else '' 136 | CC = '/'.join(['C',ic, mc, cc, tc]) if C else '' 137 | TC = '/'.join(['T',ic, mc, cc, tc]) if T else '' 138 | 139 | if context: 140 | return out, [x for x in [IC, MC, NC, CC, TC] if x] 141 | return out 142 | 143 | 144 | def get_structure( 145 | word, 146 | sep='+', 147 | zipped=False, 148 | semi_diacritics='hsʃʂʒʐzθɕʑfvθðnmȵ' 149 | ): 150 | if not isinstance(word, (list, tuple)): 151 | word = lingpy.ipa2tokens( 152 | word, 153 | expand_nasals=True, 154 | merge_vowels=False, 155 | semi_diacritics=semi_diacritics 156 | ) 157 | 158 | # check for unknown chars 159 | try: 160 | lingpy.tokens2class(word, 'cv', cldf=True) 161 | except ValueError: 162 | print('problem with {0}'.format(''.join(word))) 163 | return [] 164 | 165 | # get the morphemes 166 | if sep in word: 167 | words = lsc.tokens2morphemes(word, cldf=True) 168 | morphemes = [] 169 | for w in words: 170 | morphemes += lsc.tokens2morphemes(w, sep=sep) 171 | else: 172 | morphemes = lsc.tokens2morphemes(word, cldf=True) 173 | # get the basic structure for each morpheme 174 | for morpheme in morphemes: 175 | try: 176 | segments = parse_chinese_morphemes(morpheme) 177 | except: 178 | if not zipped: 179 | yield ['NULL'] 180 | else: 181 | yield ([('NULL', 'NULL')], morpheme) 182 | if not zipped: 183 | yield [x for x, y in zip('imnct', segments) if y != '-'] 184 | else: 185 | yield ([x for x in zip('imnct', segments) if x[1] != '-'], 186 | morpheme) 187 | 188 | 189 | def get_structure_profile( 190 | wordlist, 191 | column='ipa', 192 | text=False, 193 | semi_diacritics='hsʃʂʒʐzθɕʑfvθð', 194 | debug=False, 195 | language=None 196 | ): 197 | profile = defaultdict(list) 198 | modify = lambda x: x 199 | if column == 'ipa': 200 | modify = lambda x: x.replace(' ', '_') 201 | 202 | for idx, lang, segments in lingpy.iter_rows( 203 | wordlist, 'doculect', column): 204 | if debug: print(idx, lang, segments) 205 | if not language or language == lang: 206 | for structure, morpheme in get_structure( 207 | modify(segments), zipped=True, 208 | semi_diacritics=semi_diacritics): 209 | im, nc, t = [[], []], [[], []], [[], []] 210 | for pos, seg in structure: 211 | if pos in 'i': 212 | im[0] += [pos] 213 | im[1] += [seg] 214 | elif pos in 'mnc': 215 | nc[0] += [pos] 216 | nc[1] += [seg] 217 | else: 218 | t[0] += [pos] 219 | t[1] += [seg] 220 | if im[0]: 221 | profile[' '.join(im[0]), ' '.join(im[1])] += [(lang, 222 | morpheme)] 223 | if nc[0]: 224 | profile[' '.join(nc[0]), ' '.join(nc[1])] += [(lang, 225 | morpheme)] 226 | if t[0]: 227 | profile[' '.join(t[0]), ' '.join(t[1])] += [(lang, 228 | morpheme)] 229 | for (pos, seg), langs_ in sorted(profile.items(), key=lambda x: (x[0][0], 230 | len(x[1])), 231 | reverse=True): 232 | langs = [x[0] for x in langs_] 233 | examples = [''.join(x[1]) for x in langs_] 234 | if not text: 235 | yield (seg.replace(' ', ''), seg, seg, pos, codepoint(s), len(langs), 236 | ','.join(sorted(set(langs), 237 | key=lambda x: langs.count(x))), 238 | ', '.join(examples[:5])) 239 | else: 240 | yield '\t'.join([ 241 | strip_chars(' ∼', seg), seg, seg, pos, codepoint(seg), str(len(langs)), 242 | ','.join(sorted(set(langs), key=lambda x: langs.count(x))), 243 | ', '.join(examples[:5])]) 244 | 245 | 246 | def write_structure_profile(wordlist, column='ipa', 247 | language=None, 248 | filename='orthography.tsv', semi_diacritics='hsʃʂʒʐzθɕʑfvθðnmȵŋ', debug=False): 249 | content = ['Grapheme\tSegments\tCLPA\tStructure\tUnicode\tFrequency\tReflexes\tExamples'] 250 | content += list(get_structure_profile( 251 | wordlist, language=language, column=column, text=True, semi_diacritics=semi_diacritics, 252 | debug=debug)) 253 | 254 | lingpy.util.write_text_file(filename, content, normalize='NFC') 255 | -------------------------------------------------------------------------------- /src/sinopy/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions that can be used without loading extensive amounts of data. 3 | """ 4 | 5 | def is_chinese(name): 6 | """ 7 | Check if a symbol is a Chinese character. 8 | 9 | Note 10 | ---- 11 | 12 | Taken from http://stackoverflow.com/questions/16441633/python-2-7-test-if-characters-in-a-string-are-all-chinese-characters 13 | """ 14 | if not name: 15 | return False 16 | for ch in name: 17 | ordch = ord(ch) 18 | if not (0x3400 <= ordch <= 0x9fff) and not (0x20000 <= ordch <= 0x2ceaf) \ 19 | and not (0xf900 <= ordch <= ordch) and not (0x2f800 <= ordch <= 0x2fa1f): 20 | return False 21 | return True 22 | -------------------------------------------------------------------------------- /src/sinopy/yinyun.py: -------------------------------------------------------------------------------- 1 | import lingpy 2 | import json 3 | from sinopy import data as sdata 4 | from sinopy.util import is_chinese 5 | import re 6 | from sinopy.hanzi import gbk2big5 7 | 8 | def parse_baxter(reading): 9 | """ 10 | Parse a Baxter string and render it with all its contents, namely 11 | initial, medial, final, and tone. 12 | """ 13 | 14 | initial = '' 15 | medial = '' 16 | final = '' 17 | tone = '' 18 | 19 | # determine environments 20 | inienv = True 21 | medienv = False 22 | finenv = False 23 | tonenv = False 24 | 25 | inichars = "pbmrtdnkgnsyhzl'x" 26 | 27 | 28 | chars = list(reading) 29 | for char in chars: 30 | 31 | # switch environments 32 | if char in 'jw' and not finenv: 33 | inienv,medienv,finenv,tonenv = False,True,False,False 34 | elif char not in inichars or finenv: 35 | if char in 'XH': 36 | inienv,medienv,finenv,tonenv = False,False,False,True 37 | else: 38 | inienv,medienv,finenv,tonenv = False,False,True,False 39 | 40 | # fill in slots 41 | if inienv: 42 | initial += char 43 | 44 | if medienv: 45 | medial += char 46 | 47 | if finenv: 48 | final += char 49 | 50 | if tonenv: 51 | tone += char 52 | 53 | # post-parse tone 54 | if not tone and final[-1] in 'ptk': 55 | tone = 'R' 56 | elif not tone: 57 | tone = 'P' 58 | 59 | # post-parse medial 60 | if 'j' not in medial and 'y' in initial: 61 | medial += 'j' 62 | 63 | # post-parse labial 64 | if final[0] in 'u' and 'w' not in medial: 65 | medial = 'w' + medial 66 | 67 | return initial, medial, final, tone 68 | 69 | 70 | def sixtuple2baxter(chars, debug=False, rhymebook=None): 71 | """ 72 | Convert the classicial six-tuple representation of MCH readings into IPA 73 | (or Baxter's ASCII system). 74 | This function is more or less implemented in MiddleChinese. 75 | """ 76 | if not rhymebook: 77 | rhymebook = sdata.GY 78 | 79 | if len(chars) != 6: 80 | raise ValueError('chars should be a sixtuple') 81 | 82 | # convert chars to long chars 83 | chars = gbk2big5(chars) 84 | 85 | # assign basic values 86 | she,hu,deng,diao,yun,sheng = list(chars) 87 | 88 | # try converting the values to mch representations 89 | initial = rhymebook['sheng'].get(sheng, '?') 90 | final = rhymebook['yun'].get(yun, '?') 91 | tone = rhymebook['diao'].get(diao, '?') 92 | medial = rhymebook['hu'].get(hu, '?') 93 | division = rhymebook['deng'].get(deng, '?') 94 | 95 | # debug is for cross-checking 96 | if debug: 97 | return [(sheng, initial), (hu, medial), (deng, division),(yun, final), 98 | (diao, tone)] 99 | 100 | # check and raise error if things are not handled 101 | if "?" in [initial, final, tone, medial, division]: 102 | raise ValueError("Unrecognized elements in {0}.".format( 103 | ' '.join([initial, final, tone, medial, division]))) 104 | 105 | # treat the final if division is 3 and they start with 'j', note that so 106 | # far, we don't handle chongnius 107 | final = final[1:] if final[0] == 'j' and division in '4' \ 108 | else final 109 | final = final[1:] if final[0] == 'j' and division in '3' \ 110 | else final 111 | 112 | # reduce finals starting with 'w' 113 | final = final[1:] if final[0] == 'w' else final 114 | 115 | # resolve the medial (the hu) by checking for labial initial 116 | medial = '' if (initial[0] in 'pbm' and '*' not in final) \ 117 | or final[0] in 'u' \ 118 | or 'o' in final and not '*' in final and not '?' in final \ 119 | else medial 120 | 121 | # correct for initials with sandeng-i 122 | initial = initial[:-1] if initial.endswith('j') else initial 123 | 124 | # get the medial corrected by deng 125 | medial = "j" + medial if division == '3' \ 126 | and 'i' not in final \ 127 | and 'y' not in initial \ 128 | else medial 129 | 130 | # deprive the rime from its leading "j" if we have a medial 131 | final = final[1:] if final[0] in 'j' and 'j' in medial else final 132 | final = final[1:] if final[0] in 'w' and 'w' in medial else final 133 | final = final[1:] if final[0] == '*' or final[0] == '?' else final 134 | final = 'i' + final[1:] if final[0] == '!' \ 135 | and division == '4' \ 136 | and 'i' not in final \ 137 | and (initial[0] in "pbmkgx'" or initial.startswith('ng')) \ 138 | else final 139 | 140 | # chongniu medial-re-order 141 | medial = 'j' + medial if division == '4' \ 142 | and '!' in final \ 143 | and 'j' not in medial \ 144 | and (initial[0] in "pbmkgx'" or initial.startswith('ng')) \ 145 | else medial 146 | 147 | final = final[1:] if final[0] == '!' else final 148 | 149 | # put everything together 150 | return [initial,medial,final,tone] 151 | 152 | 153 | def baxter2ipa(mch, segmented=False): 154 | """ 155 | Very simple aber convient-enough conversion from baxter MCH to IPA MCH. 156 | this is also more or less already implemented in MiddleChinese 157 | """ 158 | 159 | out = mch 160 | if out[-1] in 'ptk': 161 | out += 'R' 162 | elif out[-1] not in 'XHP': 163 | out += 'P' 164 | 165 | for s, t in sdata.GY['ipa']: 166 | out = out.replace(s, t) 167 | if segmented: 168 | return parse_chinese_morphemes(out) 169 | return out 170 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals, division 2 | 3 | def test_unihan(): 4 | from sinopy.data import UNIHAN 5 | assert UNIHAN['分']['mandarin'] == 'fēn' 6 | 7 | def test_character_from_structure(): 8 | from sinopy.hanzi import character_from_structure 9 | assert character_from_structure('+手羅') == '攞' 10 | -------------------------------------------------------------------------------- /tests/test_hanzi.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the hanzi module. 3 | """ 4 | from sinopy.hanzi import ( 5 | compose, decompose, character_from_structure, 6 | pinyin, chars2baxter, chars2gloss 7 | ) 8 | 9 | def test_compose(): 10 | start = '⿰亻⿱立日' 11 | 12 | char = compose(start) 13 | assert char == "偣" 14 | 15 | 16 | def test_decompose(): 17 | char = decompose("偣") 18 | assert char == "⿰亻⿱⿱⿱亠丷一日" 19 | 20 | 21 | def test_character_from_structure(): 22 | assert character_from_structure('+人我') == '俄' 23 | 24 | 25 | def test_pinyin(): 26 | assert pinyin('俄') == "é" 27 | assert pinyin('认得') == "rèn dé" 28 | 29 | 30 | def test_chars2baxter(): 31 | 32 | assert chars2baxter('认得')[0] == "nyinH" 33 | assert chars2baxter("认得")[1] == "tok" 34 | 35 | 36 | def test_chars2gloss(): 37 | 38 | assert chars2gloss('认')[0] == "recognize, know, understand" 39 | 40 | -------------------------------------------------------------------------------- /tests/test_seaquence.py: -------------------------------------------------------------------------------- 1 | from sinopy.seaquence import parse_chinese_morphemes, get_structure 2 | 3 | for seq in [ 4 | 'waŋ⁵⁵', 5 | 'fəŋ³¹', 6 | 'piao⁴', 7 | 'put⁵', 8 | 'tsʰ a i j ⁵¹', 9 | 'kiaoŋ⁰' 10 | ]: 11 | print(seq) 12 | print('\t'.join([x for x in parse_chinese_morphemes(seq) if x != '-'])) 13 | print('\t'.join(list(get_structure(seq))[0])) 14 | print('---') 15 | -------------------------------------------------------------------------------- /tests/test_yinyun.py: -------------------------------------------------------------------------------- 1 | from sinopy.yinyun import parse_baxter, sixtuple2baxter, baxter2ipa 2 | 3 | def test_parse_baxter(): 4 | 5 | assert parse_baxter('tsrhangH') == ('tsrh', '', 'ang', 'H') 6 | 7 | 8 | def test_baxter2ipa(): 9 | 10 | assert baxter2ipa('tsrhungX') == "ʦʰuŋ²" 11 | 12 | 13 | def test_sixtuple2baxter(): 14 | 15 | assert "/".join(sixtuple2baxter('臻開三入質影')) == "'//it/R" 16 | --------------------------------------------------------------------------------