├── .gitignore ├── .travis.yml ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── cdata ├── __init__.py ├── core.py ├── entity.py ├── misc.py ├── region.py ├── region_data.json ├── region_dict.txt ├── summary.py ├── table.py ├── web.py └── wikify.py ├── requirements.txt ├── setup.py └── tests ├── ex1.json ├── ex2.xls ├── ex3-region-test.xls ├── test_core.py ├── test_core_stat.jsonld ├── test_entity.py ├── test_region.py ├── test_summary.py ├── test_table.py ├── test_web.py └── test_wikify.py /.gitignore: -------------------------------------------------------------------------------- 1 | # customized skip 2 | local 3 | .DS_Store 4 | 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | env/ 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # dotenv 88 | .env 89 | 90 | # virtualenv 91 | .venv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | python: 4 | - "2.7" 5 | 6 | install: 7 | - pip install --upgrade . 8 | - pip install --upgrade -r requirements.txt 9 | - pip install coveralls 10 | 11 | script: python setup.py test 12 | 13 | after_success: 14 | coveralls 15 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. 
:changelog:
2 | 
3 | History
4 | -------
5 | 
6 | 0.1.9 (2017-09-07)
7 | ++++++++++++++++++
8 | * core.py: add stat_jsonld, a function to count triples in a JSON object
9 | 
10 | 0.1.8 (2017-08-16)
11 | ++++++++++++++++++
12 | * fixed summary.summarize_entity_person to handle the empty-nationality situation
13 | * changed core.json_get_first_item, adding defaultValue as empty string
14 | * add core.json_append, which appends an item to a list
15 | 
16 | 0.1.7 (2017-07-20)
17 | ++++++++++++++++++
18 | * bugfix summary.summarize_entity_person
19 | 
20 | 0.1.6 (2017-07-20)
21 | ++++++++++++++++++
22 | * add summary.summarize_entity_person function
23 | 
24 | 0.1.5 (2017-07-18)
25 | ++++++++++++++++++
26 | * bugfix normalize_region_name
27 | * pack region data with the code
28 | 
29 | 0.1.4 (2017-07-17)
30 | ++++++++++++++++++
31 | * add module wikify with wikidata_search, wikidata_get
32 | * update module core with json_dict_copy
33 | * update module entity with get_primary_entity
34 | * add one more district in region; add strict_mode for skipping exit() on error
35 | 
36 | 0.1.1 (2017-06-22)
37 | ++++++++++++++++++
38 | * add module entity with SimpleEntity.ner(text)
39 | * add module region with RegionEntity.guess_all([address, name])
40 | 
41 | 0.1.0 (2017-06-19)
42 | ++++++++++++++++++
43 | 
44 | * initial PyPI release
45 | * add modules json, table (excel), web for data manipulation
46 | * provide cli ui via misc.main_subtask
47 | * connect to travis CI
48 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by
13 |       the copyright owner that is granting the License.
14 | 
15 |       "Legal Entity" shall mean the union of the acting entity and all
16 |       other entities that control, are controlled by, or are under common
17 |       control with that entity. For the purposes of this definition,
18 |       "control" means (i) the power, direct or indirect, to cause the
19 |       direction or management of such entity, whether by contract or
20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 |       outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 |       "You" (or "Your") shall mean an individual or Legal Entity
24 |       exercising permissions granted by this License.
25 | 
26 |       "Source" form shall mean the preferred form for making modifications,
27 |       including but not limited to software source code, documentation
28 |       source, and configuration files.
29 | 
30 |       "Object" form shall mean any form resulting from mechanical
31 |       transformation or translation of a Source form, including but
32 |       not limited to compiled object code, generated documentation,
33 |       and conversions to other media types.
34 | 
35 |       "Work" shall mean the work of authorship, whether in Source or
36 |       Object form, made available under the License, as indicated by a
37 |       copyright notice that is included in or attached to the work
38 |       (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include cdata/region_data.json 3 | include cdata/region_dict.txt 4 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | cdata 2 | ------------- 3 | 4 | "see data", see data, handy snippets for conversion, cleaning and integration. 
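quick start
-------------
a minimal end-to-end sketch (assumes the package is installed, see install below; ``demo.jsonl`` is only an example file name):

.. code-block:: python

    import json
    from cdata.core import items2file, file2iter, stat

    items = [{"name": u"张三"}, {"name": u"张三"}, {"name": u"李四"}]
    items2file(items, "demo.jsonl")   # one canonical json object per line
    loaded = [json.loads(line) for line in file2iter("demo.jsonl")]
    stat(loaded, ["name"])            # logs counters such as "all" and "name_unique"
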
5 | 
6 | install
7 | -------------
8 | pip install cdata
9 | 
10 | 
11 | json data manipulation
12 | -------------
13 | 
14 | * json (and json stream) file IO, e.g. items2file(...)
15 | * json data access, e.g. json_get(...), any2utf8, json_dict_copy
16 | * json array statistics, e.g. stat(...)
17 | 
18 | .. code-block:: python
19 | 
20 |     from cdata.core import any2utf8
21 |     the_input = {"hello": u"世界"}
22 |     the_output = any2utf8(the_input)
23 |     logging.info((the_input, the_output))
24 | 
25 | .. code-block:: python
26 | 
27 |     property_list = [
28 |         { "name":"name", "alternateName": ["name","title"]},
29 |         { "name":"birthDate", "alternateName": ["dob","dateOfBirth"] },
30 |         { "name":"description" }
31 |     ]
32 |     json_object = {"dob":"2010-01-01","title":"John","interests":"data","description":"a person"}
33 |     ret = json_dict_copy(json_object, property_list)
34 | 
35 | 
36 | table data manipulation
37 | -------------
38 | 
39 | * json array to/from excel
40 | 
41 | .. code-block:: python
42 | 
43 |     import json
44 |     from cdata.table import excel2json, json2excel
45 |     filename = "test.xls"
46 |     items = [{"first":"hello", "last":"world" }]
47 |     json2excel(items, ["first","last"], filename)
48 |     ret = excel2json(filename)
49 |     print json.dumps(ret)
50 | 
51 | 
52 | 
53 | JSON data from reading a single-sheet excel file:
54 | 
55 | .. code-block:: json
56 | 
57 |     {
58 |         "fields": {
59 |             "00": [
60 |                 "name",
61 |                 "年龄",
62 |                 "notes"
63 |             ]
64 |         },
65 |         "data": {
66 |             "00": [
67 |                 {
68 |                     "notes": "",
69 |                     "年龄": 18.0,
70 |                     "name": "张三"
71 |                 },
72 |                 {
73 |                     "notes": "this is li si",
74 |                     "年龄": 18.0,
75 |                     "name": "李四"
76 |                 }
77 |             ]
78 |         }
79 |     }
80 | 
81 | web stuff
82 | -------------
83 | 
84 | * url domain extraction
85 | 
86 | entity manipulation
87 | -------------
88 | 
89 | * entity.SimpleEntity.ner()
90 | 
91 | .. code-block:: python
92 | 
93 |     from cdata.entity import SimpleEntity
94 |     entity_list = [{"@id":"1","name":u"张三"},{"@id":"2","name":u"李四"}]
95 |     ner = SimpleEntity(entity_list)
96 |     sentence = "张三给了李四一个苹果"
97 |     ret = ner.ner(sentence)
98 |     logging.info(json.dumps(ret, ensure_ascii=False, indent=4))
99 |     """
100 |     [{
101 |         "text": "张三",
102 |         "entities": [
103 |             {
104 |                 "@id": "1",
105 |                 "name": "张三"
106 |             }
107 |         ],
108 |         "index": 0
109 |     },
110 |     {
111 |         "text": "李四",
112 |         "entities": [
113 |             {
114 |                 "@id": "2",
115 |                 "name": "李四"
116 |             }
117 |         ],
118 |         "index": 4
119 |     }]
120 |     """
121 | 
122 | * region.RegionEntity.guess_all()
123 | 
124 | .. code-block:: python
125 | 
126 |     from cdata.region import RegionEntity
127 |     addresses = ["北京海淀区阜成路52号(定慧寺)", "北京大学肿瘤医院"]
128 | 
129 |     city_data = RegionEntity()
130 |     result = city_data.guess_all(addresses)
131 |     logging.info(json.dumps(result, ensure_ascii=False))
132 |     """
133 |     {"province": "北京市",
134 |      "city": "市辖区",
135 |      "name": "海淀区",
136 |      "district": "海淀区",
137 |      "cityid": "110108",
138 |      "type": "district"}
139 |     """
140 | 
141 | wikification
142 | -------------
143 | 
144 | * search wikidata to locate the matching entity, then look up properties such as its Chinese label and aliases: wikidata_search (item/property) and wikidata_get
145 | 
146 | .. 
code-block:: python
147 | 
148 |     query = u"居里夫人"
149 |     ret = wikidata_search(query, lang="zh")
150 |     logging.info(ret)
151 | 
152 |     nodeid = ret["itemList"][0]["identifier"]
153 |     ret = wikidata_get(nodeid)
154 |     label_zh = ret["entities"][nodeid]["labels"]["zh"]["value"]
155 |     logging.info(label_zh)
156 | 
157 | 
158 | misc
159 | -------------
160 | 
161 | * support simple cli functions using argparse
162 | 
163 | 
164 | notes
165 | -------------
166 | release packages using https://github.com/pypa/twine
167 | 
--------------------------------------------------------------------------------
/cdata/__init__.py:
--------------------------------------------------------------------------------
1 | def info():
2 |     return {
3 |         'version': 'v0.0.1',
4 |         'dataModified': '2017-06-19',
5 |         'author': 'Li Ding'}
6 | 
--------------------------------------------------------------------------------
/cdata/core.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # Author: Li Ding
4 | # JSON data manipulation
5 | 
6 | # base packages
7 | import os
8 | import sys
9 | import json
10 | import logging
11 | import codecs
12 | import hashlib
13 | import datetime
14 | import time
15 | import argparse
16 | import urlparse
17 | import re
18 | import collections
19 | 
20 | # global constants
21 | VERSION = 'v20170713'
22 | CONTEXTS = [os.path.basename(__file__), VERSION]
23 | 
24 | ####################################
25 | # file path
26 | 
27 | 
28 | def file2abspath(filename, this_file=__file__):
29 |     """
30 |     generate the absolute path for the given filename, relative to this_file's directory
31 |     """
32 |     return os.path.abspath(
33 |         os.path.join(os.path.dirname(os.path.abspath(this_file)), filename))
34 | 
35 | 
36 | ####################################
37 | # read from file
38 | 
39 | def file2json(filename, encoding='utf-8'):
40 |     """
41 |     load a json object from a file
42 |     """
43 |     with codecs.open(filename, "r", encoding=encoding) as f:
44 |         return json.load(f)
45 | 
46 | 
47 | def file2iter(filename, encoding='utf-8', comment_prefix="#",
48 |               skip_empty_line=True):
49 |     """
50 |     iterate the lines of a text/json-stream file, skipping empty lines and comments
51 |     """
52 | 
53 | 
54 |     with codecs.open(filename, encoding=encoding) as f:
55 |         for line in f:
56 |             line = line.strip()
57 |             # skip empty line
58 |             if skip_empty_line and len(line) == 0:
59 |                 continue
60 | 
61 |             # skip comment line
62 |             if comment_prefix and line.startswith(comment_prefix):
63 |                 continue
64 | 
65 |             yield line
66 | 
67 | 
68 | ####################################
69 | # write to file
70 | 
71 | def json2file(data, filename, encoding='utf-8'):
72 |     """
73 |     write json in canonical json format
74 |     """
75 |     with codecs.open(filename, "w", encoding=encoding) as f:
76 |         json.dump(data, f, ensure_ascii=False, indent=4, sort_keys=True)
77 | 
78 | 
79 | def lines2file(lines, filename, encoding='utf-8'):
80 |     """
81 |     write an iterable of lines (e.g. a json stream) to file, one line each
82 |     """
83 |     with codecs.open(filename, "w", encoding=encoding) as f:
84 |         for line in lines:
85 |             f.write(line)
86 |             f.write("\n")
87 | 
88 | 
89 | def items2file(items, filename, encoding='utf-8', modifier='w'):
90 |     """
91 |     write a json array to file, one canonical json object per line
92 |     """
93 |     with codecs.open(filename, modifier, encoding=encoding) as f:
94 |         for item in items:
95 |             f.write(u"{}\n".format(json.dumps(
96 |                 item, ensure_ascii=False, sort_keys=True)))
97 | 
98 | 
99 | ####################################
100 | # json data access
101 | 
102 | def json_get(json_object, property_path, 
default=None):
103 |     """
104 |     get the value at property_path from a json object; property_path is a list of fields, e.g. ["person", "father", "name"]
105 |     * return None if the path is invalid (an intermediate value is not a dict)
106 |     * return default if the path is valid but the final field is missing
107 |     """
108 |     temp = json_object
109 |     for field in property_path[:-1]:
110 |         if not isinstance(temp, dict):
111 |             return None
112 |         temp = temp.get(field, {})
113 |     if not isinstance(temp, dict):
114 |         return None
115 |     return temp.get(property_path[-1], default)
116 | 
117 | 
118 | def json_get_list(json_object, p):
119 |     v = json_object.get(p, [])
120 |     if isinstance(v, list):
121 |         return v
122 |     else:
123 |         return [v]
124 | 
125 | 
126 | def json_get_first_item(json_object, p, defaultValue=''):
127 |     # return defaultValue (an empty string by default) if the item does not exist
128 |     v = json_object.get(p, [])
129 |     if isinstance(v, list):
130 |         if len(v) > 0:
131 |             return v[0]
132 |         else:
133 |             return defaultValue
134 |     else:
135 |         return v
136 | 
137 | 
138 | def json_dict_copy(json_object, property_list, defaultValue=None):
139 |     """
140 |     property_list = [
141 |         { "name":"name", "alternateName": ["name","title"]},
142 |         { "name":"birthDate", "alternateName": ["dob","dateOfBirth"] },
143 |         { "name":"description" }
144 |     ]
145 |     """
146 |     ret = {}
147 |     for prop in property_list:
148 |         p_name = prop["name"]
149 |         for alias in prop.get("alternateName", []):
150 |             if json_object.get(alias) is not None:
151 |                 ret[p_name] = json_object.get(alias)
152 |                 break
153 |         if p_name not in ret:
154 |             if p_name in json_object:
155 |                 ret[p_name] = json_object[p_name]
156 |             elif defaultValue is not None:
157 |                 ret[p_name] = defaultValue
158 | 
159 |     return ret
160 | 
161 | def json_append(obj, p, v):  # append v to the list at obj[p]; no-op if obj[p] exists but is not a list
162 |     vlist = obj.get(p, [])
163 |     if not isinstance(vlist, list):
164 |         return
165 | 
166 |     if vlist:
167 |         vlist.append(v)
168 |     else:
169 |         obj[p] = [v]
170 | 
171 | ####################################
172 | # data conversion
173 | 
174 | 
175 | def any2utf8(data):
176 |     """
177 |     recursively rewrite json object values (unicode) into utf-8 encoded strings
178 |     """
179 |     if isinstance(data, dict):
180 |         ret = {}
181 |         for k, v in data.items():
182 |             k = any2utf8(k)
183 |             ret[k] = any2utf8(v)
184 |         return ret
185 |     elif isinstance(data, list):
186 |         return [any2utf8(x) for x in data]
187 |     elif isinstance(data, unicode):
188 |         return data.encode("utf-8")
189 |     elif type(data) in [str, basestring]:
190 |         return data
191 |     elif type(data) in [int, float]:
192 |         return data
193 |     else:
194 |         logging.error("unexpected {} {}".format(type(data), data))
195 |         return data
196 | 
197 | 
198 | def any2unicode(data):
199 |     """
200 |     recursively rewrite json object values (assumed utf-8) into unicode
201 |     """
202 |     if isinstance(data, dict):
203 |         ret = {}
204 |         for k, v in data.items():
205 |             k = any2unicode(k)
206 |             ret[k] = any2unicode(v)
207 |         return ret
208 |     elif isinstance(data, list):
209 |         return [any2unicode(x) for x in data]
210 |     elif isinstance(data, unicode):
211 |         return data
212 |     elif type(data) in [str, basestring]:
213 |         return data.decode("utf-8")
214 |     elif type(data) in [int, float]:
215 |         return data
216 |     else:
217 |         logging.error("unexpected {} {}".format(type(data), data))
218 |         return data
219 | 
220 | 
221 | def any2sha1(text):
222 |     """
223 |     convert a string into its sha1 hash. For a json object/array, first convert
224 |     it into a canonical json string. 
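    for example, the following holds because json objects are canonicalized
    with sort_keys before hashing:
        any2sha1({"a": 1, "b": 2}) == any2sha1({"b": 2, "a": 1})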
225 |     """
226 |     # canonicalize json object or json array
227 |     if type(text) in [dict, list]:
228 |         text = json.dumps(text, sort_keys=True)
229 | 
230 |     # encode unicode text as utf-8 bytes before hashing
231 |     if isinstance(text, unicode):
232 |         text = text.encode('utf-8')
233 | 
234 |     return hashlib.sha1(text).hexdigest()
235 | 
236 | 
237 | ####################################
238 | # file statistics
239 | 
240 | def stat(items, unique_fields, value_fields=[], printCounter=True):
241 |     counter = collections.Counter()
242 |     unique_counter = collections.defaultdict(list)
243 | 
244 |     for item in items:
245 |         counter["all"] += 1
246 |         for field in unique_fields:
247 |             if item.get(field):
248 |                 unique_counter[field].append(item[field])
249 |         for field in value_fields:
250 |             value = item.get(field)
251 |             if value is None:
252 |                 continue
253 |             elif type(value) in [float, int]:
254 |                 vx = "%1.0d" % value
255 |             else:
256 |                 vx = value
257 |             if len(vx) > 0:
258 |                 counter[u"{}_{}".format(field, value)] += 1
259 |     for field in unique_fields:
260 |         counter[u"{}_unique".format(field)] = len(set(unique_counter[field]))
261 |         counter[u"{}_nonempty".format(field)] = len(unique_counter[field])
262 | 
263 |     if printCounter:
264 |         logging.info(json.dumps(counter, ensure_ascii=False,
265 |                                 indent=4, sort_keys=True))
266 | 
267 |     return counter
268 | 
269 | def stat_jsonld(data, key=None, counter=None):
270 |     """
271 |     provide statistics for jsonld; right now it only counts triples
272 |     see also https://json-ld.org/playground/
273 |     note: the attributes @id and @context do not contribute any triple
274 |     """
275 |     if counter is None:
276 |         counter = collections.Counter()
277 | 
278 |     if isinstance(data, dict):
279 | 
280 |         for k, v in data.items():
281 |             stat_jsonld(v, k, counter)
282 |             counter[u"p_{}".format(k)] += 0
283 |         if key:
284 |             counter["triple"] += 1
285 |             counter[u"p_{}".format(key)] += 1
286 |     elif isinstance(data, list):
287 |         [stat_jsonld(x, key, counter) for x in data]
288 |         if key in ["tag"]:
289 |             for x in data:
290 |                 if isinstance(x, dict) and x.get("name"):
291 |                     counter[u"{}_{}".format(key, x["name"])] += 1
292 |                 elif type(x) in [basestring, unicode]:
293 |                     counter[u"{}_{}".format(key, x)] += 1
294 | 
295 |     else:
296 |         if key and key not in ["@id", "@context"]:
297 |             counter["triple"] += 1
298 |             counter[u"p_{}".format(key)] += 1
299 | 
300 |     return counter
301 | 
--------------------------------------------------------------------------------
/cdata/entity.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # Author: Li Ding
4 | 
5 | # utility stuff
6 | 
7 | # base packages
8 | import os
9 | import sys
10 | import json
11 | import logging
12 | import codecs
13 | import hashlib
14 | import datetime
15 | 
16 | import time
17 | import re
18 | import collections
19 | 
20 | import jieba
21 | from core import any2unicode, stat
22 | from misc import main_subtask
23 | 
24 | 
25 | class SimpleEntity():
26 |     def __init__(self, entity_list):
27 |         """
28 |         [{"@id":"1","name":"张三"},{"@id":"2","name":"李四"}]
29 |         all input text is assumed to be (or will be converted into) unicode
30 |         """
31 |         # init entity index
32 |         self.entities = collections.defaultdict(list)
33 |         entity_list_unicode = []
34 |         for entity in entity_list:
35 |             entity_list_unicode.append(any2unicode(entity))
36 | 
37 |         for entity in entity_list_unicode:
38 |             name = entity["name"]
39 |             self.entities[name].append(entity)
40 | 
41 |         for entity in entity_list_unicode:
42 |             for name in 
entity.get("alternateName", []): 43 | self.entities[name].append(entity) 44 | 45 | stat(entity_list_unicode, ["name"]) 46 | 47 | # init jieba 48 | self.tokenizer = jieba.Tokenizer() 49 | for name in self.entities: 50 | self.tokenizer.add_word(name) 51 | 52 | def ner(self, sentence): 53 | # normalize to unicode 54 | sentence = any2unicode(sentence) 55 | 56 | # split 57 | segments = self.tokenizer.cut(sentence, HMM=False) 58 | 59 | # generate output 60 | word_index = 0 61 | ret = [] 62 | for segment in segments: 63 | logging.debug(segment) 64 | 65 | matched_entities = self.entities.get(unicode(segment)) 66 | if matched_entities: 67 | temp = {"text": segment, 68 | "index": word_index, 69 | "entities": matched_entities} 70 | ret.append(temp) 71 | word_index += len(segment) 72 | return ret 73 | 74 | # 提取文本列表中主要的实体 75 | def get_primary_entity(self, text_list, threshold=0.24): 76 | if not text_list: 77 | return [] 78 | 79 | # 统计各个实体在每个文本中出现的频率 80 | counter_list = [] 81 | for sentence in text_list: 82 | ret = self.ner(sentence) 83 | if ret: 84 | counter = collections.Counter() 85 | length = len(ret) 86 | for entity in ret: 87 | counter[entity["text"]] += 1.0 / length 88 | counter_list.append(counter) 89 | 90 | # 各个文本中同一实体的频率相加,归一化处理 91 | sum_counter = collections.Counter() 92 | for counter in counter_list: 93 | for name in counter: 94 | sum_counter[name] += counter[name] / len(text_list) 95 | 96 | result_entity_list = [] 97 | sorted_counter = sum_counter.most_common() # 按照分数从大到小排序 98 | for name, score in sorted_counter: 99 | if score >= threshold: 100 | tmp = { 101 | "text": name, 102 | "score": score, 103 | "entity": self.entities[name] 104 | } 105 | result_entity_list.append(tmp) 106 | else: 107 | break 108 | return result_entity_list 109 | 110 | 111 | def task_ner_test(args=None): 112 | entity_list = [{"@id": "1", "name": "张三"}, {"@id": "2", "name": "李四"}] 113 | ner = SimpleEntity(entity_list) 114 | sentence = "张三给了李四一个苹果" 115 | ret = ner.ner(sentence) 116 | logging.info(json.dumps(ret, ensure_ascii=False, indent=4)) 117 | 118 | sentence = "张三丰给了李四一个苹果" 119 | ret = ner.ner(sentence) 120 | logging.info(json.dumps(ret, ensure_ascii=False, indent=4)) 121 | 122 | sentence_list = ["张三给了李四一个苹果","王五给了李四一个橘子"] 123 | primary_entity = ner.get_primary_entity(sentence_list) 124 | logging.info(json.dumps(primary_entity, ensure_ascii=False, indent=4)) 125 | 126 | 127 | if __name__ == "__main__": 128 | logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.DEBUG) # noqa 129 | 130 | main_subtask(__name__) 131 | 132 | """ 133 | python cdata/entity.py task_ner_test 134 | """ 135 | -------------------------------------------------------------------------------- /cdata/misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Li Ding 4 | 5 | # utility stuff 6 | 7 | # base packages 8 | import os 9 | import sys 10 | import json 11 | import logging 12 | import codecs 13 | import hashlib 14 | import datetime 15 | import logging 16 | import time 17 | import argparse 18 | import urlparse 19 | import re 20 | import collections 21 | 22 | 23 | #################################################### 24 | def main_subtask(module_name, method_prefixs=["task_"], optional_params={}): 25 | """ 26 | http://stackoverflow.com/questions/3217673/why-use-argparse-rather-than-optparse 27 | As of 2.7, optparse is deprecated, and will hopefully go away in the future 28 | """ 
29 | parser = argparse.ArgumentParser(description="") 30 | parser.add_argument('method_name', help='') 31 | for optional_param_key, optional_param_help in optional_params.items(): 32 | parser.add_argument(optional_param_key, 33 | required=False, 34 | help=optional_param_help) 35 | # parser.add_argument('--reset_cache', required=False, help='') 36 | args = parser.parse_args() 37 | 38 | for prefix in method_prefixs: 39 | if args.method_name.startswith(prefix): 40 | if prefix == "test_": 41 | # Remove all handlers associated with the root logger object. 42 | for handler in logging.root.handlers[:]: 43 | logging.root.removeHandler(handler) 44 | 45 | # Reconfigure logging again, this time with a file. 46 | logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.DEBUG) # noqa 47 | 48 | # http://stackoverflow.com/questions/17734618/dynamic-method-call-in-python-2-7-using-strings-of-method-names 49 | the_method = getattr(sys.modules[module_name], args.method_name) 50 | if the_method: 51 | the_method(args=vars(args)) 52 | 53 | logging.info("done") 54 | return 55 | else: 56 | break 57 | 58 | logging.info("unsupported") 59 | 60 | 61 | def task_subtask(args): 62 | print "called task_subtask" 63 | 64 | 65 | if __name__ == "__main__": 66 | logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.DEBUG) # noqa 67 | logging.getLogger("requests").setLevel(logging.WARNING) 68 | 69 | main_subtask(__name__) 70 | 71 | 72 | """ 73 | python misc.py task_subtask 74 | 75 | """ 76 | -------------------------------------------------------------------------------- /cdata/region.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Li Ding 4 | 5 | import os 6 | import sys 7 | import json 8 | import logging 9 | import codecs 10 | import hashlib 11 | import datetime 12 | import logging 13 | import time 14 | import argparse 15 | import urlparse 16 | import re 17 | import collections 18 | 19 | from core import * 20 | from table import * 21 | from misc import main_subtask 22 | import jieba 23 | 24 | LIST_NATIONAL = [ 25 | u'壮族', 26 | u'满族', 27 | u'回族', 28 | u'苗族', 29 | u'维吾尔族', 30 | u'土家族', 31 | u'彝族', 32 | u'蒙古族', 33 | u'藏族', 34 | u'布依族', 35 | u'侗族', 36 | u'瑶族', 37 | u'朝鲜族', 38 | u'白族', 39 | u'哈尼族', 40 | u'哈萨克族', 41 | u'黎族', 42 | u'傣族', 43 | u'畲族', 44 | u'傈僳族', 45 | u'仡佬族', 46 | u'东乡族', 47 | u'高山族', 48 | u'拉祜族', 49 | u'水族', 50 | u'佤族', 51 | u'纳西族', 52 | u'羌族', 53 | u'土族', 54 | u'仫佬族', 55 | u'锡伯族', 56 | u'柯尔克孜族', 57 | u'达斡尔族', 58 | u'景颇族', 59 | u'毛南族', 60 | u'撒拉族', 61 | u'布朗族', 62 | u'塔吉克族', 63 | u'阿昌族', 64 | u'普米族', 65 | u'鄂温克族', 66 | u'怒族', 67 | u'京族', 68 | u'基诺族', 69 | u'德昂族', 70 | u'保安族', 71 | u'俄罗斯族', 72 | u'裕固族', 73 | u'乌孜别克族', 74 | u'门巴族', 75 | u'鄂伦春族', 76 | u'独龙族', 77 | u'塔塔尔族', 78 | u'赫哲族', 79 | u'珞巴族', 80 | u'各族' 81 | ] 82 | 83 | PATTERN_NATIONAL = u'({})'.format(u'|'.join(LIST_NATIONAL)) 84 | PATTERN_NATIONAL2 = u'({})'.format(u'?|'.join( 85 | [x for x in LIST_NATIONAL if len(x) > 2])) 86 | 87 | SPECIAL_ADDRESS_NAME = [ 88 | # 县 89 | u"葵潭", 90 | u"靖海", 91 | u"隆江", 92 | u"城月", # ["城月镇建新路49号", "城月药店第二门市部"] 93 | u"黄略", # ["黄略镇南亭圩三角路", "黄略药店5南亭门市部"] 94 | u"杨柑", # ["杨柑镇豆坡大石牛", "杨柑药店大石牛门市部"] 95 | u"黄岭", # ["杨柑镇豆坡大石牛", "杨柑药店大石牛门市部"] 96 | u"神泉", # ["神泉镇新观路八号", "神泉汉龙药店"] 97 | 98 | # 区 99 | u"新林", # ["新林区翠岗镇", "新林区旭东药店"] 100 | u"加格达奇", # ["加格达奇区曙光康庄小区18#楼车库1-15号", "加格达奇一正中西药店"] 101 | # 102 | 
u"拱北", # ["拱北夏湾中珠新村二期6号商铺", "守仁药店"] 103 | u"守仁", 104 | u"平沙", # ["平沙所平沙前进分场", "平沙前进药店"] 105 | u"三灶", # ["三灶所珠海机场海澄市场", "海澄健恒药店"] 106 | u"海澄", # ["三灶所珠海机场海澄市场", "海澄健恒药店"] 107 | u"大亚湾", # ["大亚湾霞涌市场", "大亚湾霞涌方方药店"] 108 | u"乌塘", # ["乌塘圩", "城月药店乌塘第一门市部"] 109 | 110 | 111 | # bad case 112 | u"大参林", # ["", "大参林医药集团股份有限公司第六百零九分店"] 113 | u"龙归", # ["", "龙归利农药店"] 114 | u"光明托老", # ["光明托老中心综合楼商服0113号", "加格达奇区寅河大药房"] 115 | u"云管端互联网", # ["", "云管端互联网软件有限公司"] 116 | u"康美健康云服务有限公司", # ["", "康美健康云服务有限公司"] 117 | ] 118 | 119 | 120 | def is_special_address(xinput): 121 | if type(xinput) == list: 122 | for addr in xinput: 123 | if not is_special_address(addr): 124 | return False 125 | return True 126 | else: 127 | if not xinput: 128 | return True 129 | 130 | regex = ur"[服装店药店药房医院集团有限公司股份有限责任科技第一门市分店总店]+$" 131 | temp = re.sub(regex, "", xinput) 132 | if len(temp) <= 3 and not re.search(ur"[圩省市县镇]", temp): 133 | logging.warn(u"skip {} => {}".format(xinput, temp)) 134 | return True 135 | 136 | m = re.search(ur"^(.{2,6}[镇区])", xinput) 137 | if m: 138 | logging.debug(m.group(1)) 139 | return True 140 | 141 | for name in SPECIAL_ADDRESS_NAME: 142 | if xinput.startswith(name): 143 | return True 144 | 145 | return False 146 | 147 | 148 | def normalize_national(name): 149 | temp = name 150 | temp = re.sub(u'东乡族自治县', u'东乡县', temp) 151 | temp = re.sub(PATTERN_NATIONAL2, '', temp) 152 | temp = re.sub(PATTERN_NATIONAL, '', temp) 153 | 154 | if len(temp) == 1: 155 | return name 156 | else: 157 | return temp 158 | 159 | 160 | def normalize_misspell(name): 161 | name = name.replace(u"恵", u"惠") 162 | return name 163 | 164 | 165 | def normalize_province(name): 166 | name_norm = name 167 | 168 | name_norm = normalize_national(name_norm) 169 | name_norm = re.sub(ur'(自治区|特别行政区)', '', name_norm) 170 | 171 | if name_norm == u'内': 172 | name_norm = u'内蒙古' 173 | 174 | if name_norm == u'内蒙': 175 | name_norm = u'内蒙古' 176 | 177 | name_compact = name_norm 178 | if len(name_compact) > 2: 179 | name_compact = re.sub(ur'(省|市)$', '', name_compact) 180 | 181 | return [name_norm, name_compact] 182 | 183 | 184 | def normalize_city(name): 185 | name_norm = name 186 | 187 | name_norm = normalize_national(name_norm) 188 | name_norm = re.sub(ur'^(市辖区|自治区直辖县级行政区划|省直辖县级行政区划|自治旗|自治州|矿区|县|区)$', '', name_norm) # noqa 189 | name_norm = re.sub(ur'自治', '', name_norm) 190 | 191 | name_compact = name_norm 192 | if len(name_compact) > 2: 193 | name_compact = re.sub(ur'(州|地区|市)$', '', name_compact) 194 | 195 | ret = [name_norm, name_compact] 196 | if name == u"哈尔滨市": 197 | ret.append(u"哈尔傧") 198 | 199 | if re.search(ur"市$", name): 200 | ret.append(re.sub(ur"市$", ur"市区", name)) 201 | 202 | return ret 203 | 204 | 205 | def normalize_district(name): 206 | name_norm = name 207 | 208 | name_norm = normalize_national(name_norm) 209 | 210 | # logging.info(len(name_norm)) 211 | 212 | name_norm = re.sub(ur'(市辖区)', '', name_norm) 213 | 214 | if len(name_norm) > 3: 215 | name_norm = re.sub(ur'(自治|郊区|城区)', '', name_norm) 216 | 217 | name_compact = name_norm 218 | if len(name_compact) > 3: 219 | name_compact = re.sub(ur'(新区|林区|矿区)$', '', name_compact) 220 | if len(name_compact) > 2: 221 | name_compact = re.sub(ur'(区|县|市)$', '', name_compact) 222 | # if name.startswith(u"富拉尔基"): 223 | # logging.info( name_compact ) 224 | 225 | ret = [name_norm, name_compact] 226 | # if name == u"增城区": 227 | # ret.append(u"增城市") 228 | 229 | if name_norm == name and re.search(ur"区$", name): 230 | ret.append(name_norm.replace(u"区", u"县")) 231 | ret.append(name_norm.replace(u"区", u"市")) 232 | 
ret.append(re.sub(ur"区$", u"县", name)) 233 | ret.append(re.sub(ur"县$", u"区", name)) 234 | 235 | ret.append(normalize_misspell(name_norm)) 236 | 237 | return ret 238 | 239 | 240 | def normalize_address(address, province, city, district): 241 | assert address is not None 242 | 243 | if not type(address) == unicode: 244 | address = address.decode("utf-8") 245 | 246 | region_list = [] 247 | if province: 248 | if not type(province) == unicode: 249 | province = province.decode("utf-8").strip() 250 | region_list.append(province) 251 | region_list.extend(normalize_province(province)) 252 | 253 | if city: 254 | if not type(city) == unicode: 255 | city = city.decode("utf-8") 256 | region_list.append(city) 257 | region_list.extend(normalize_city(city)) 258 | 259 | if district: 260 | if not type(district) == unicode: 261 | district = district.decode("utf-8") 262 | region_list.append(district) 263 | region_list.extend(normalize_district(district)) 264 | 265 | ret = { 266 | "address": address, 267 | "addressNorm": address, 268 | "province": province, 269 | "city": city, 270 | "district": district, 271 | } 272 | 273 | region_list = sorted(list(set(region_list)), reverse=True) 274 | region_list.append(u"区") 275 | # logging.info(json.dumps(region_list, ensure_ascii=False)) 276 | # logging.info(json.dumps(ret,ensure_ascii=False)) 277 | 278 | if region_list: 279 | regex = u"^({})+".format(u"|".join(region_list)) 280 | # logging.info(regex) 281 | ret["addressNorm"] = re.sub(regex, "", ret["addressNorm"]).strip() 282 | 283 | return ret 284 | 285 | 286 | class RegionEntity(): 287 | def _get_list_province_unique(self, list_cityid): 288 | cancityidates = set() 289 | for cityid in list_cityid: 290 | cancityidates.add(self.data['items'][cityid]['province']) 291 | if len(cancityidates) == 1: 292 | [pnorm, pcompact] = normalize_province(list(cancityidates)[0]) 293 | # print pcompact 294 | return pcompact 295 | 296 | def __init__(self, strict_mode=True): 297 | data = file2json(file2abspath('region_data.json', __file__)) 298 | counter = collections.Counter() 299 | self.strict_mode = strict_mode 300 | 301 | self.data = { 302 | 'items': {}, # 原始数据,基于cityid(多种指代,可以市省,市,区县级别) 303 | 304 | # 基于别名的索引, NER使用 305 | 'province': {}, # 无重名 306 | 'city': {}, # 有重名 307 | 'district': {}, # 有重名 308 | 309 | 310 | 'alias': {}, # 别名索引 311 | 312 | 'lookup': collections.defaultdict(set), 313 | } 314 | 315 | # copy data 316 | for item in data: 317 | self.data['items'][item['cityid']] = item 318 | 319 | # process province 320 | map_province = collections.defaultdict(set) 321 | for item in data: 322 | p = item.get('province') 323 | c = item.get('city') 324 | d = item.get('district') 325 | if p and not c: 326 | # cityid 为省的ID 327 | item["type"] = "province" 328 | item["name"] = p 329 | map_province[p].add(item['cityid']) 330 | assert 34 == len(map_province), len(map_province) 331 | # logging.info(json.dumps(list(map_province.keys()), ensure_ascii=False)) 332 | 333 | for p in sorted(list(map_province)): 334 | alias_list = normalize_province(p) 335 | pnorm = alias_list[1] 336 | self.data['province'][p] = { 337 | 'province': self._get_list_province_unique(map_province[p]), 338 | 'cityid_list': list(map_province[p]), 339 | 'alias': [p] + alias_list} 340 | map_province[p] = pnorm 341 | if pnorm.startswith(u'安徽'): 342 | logging.info(json.dumps(alias_list)) 343 | # print json.dumps(list(set([p,pnorm,pnorm2])),ensure_ascii=False) 344 | 345 | # process city 346 | map_city = collections.defaultdict(set) 347 | for item in data: 348 | c = item.get('city') 
349 | d = item.get('district') 350 | if c in [u"市辖区", u"县", u"省直辖县级行政区划", u"自治区直辖县级行政区划"]: 351 | continue 352 | """ 353 | { 354 | "city": "市辖区", 355 | "cityid": "310105", 356 | "district": "长宁区", 357 | "province": "上海市" 358 | } 359 | 360 | { 361 | "city": "南通市", 362 | "cityid": "320601", 363 | "district": "市辖区", 364 | "province": "江苏省" 365 | }, 366 | """ 367 | 368 | if c and not d: 369 | item["type"] = "city" 370 | item["name"] = c 371 | map_city[c].add(item['cityid']) 372 | if len(map_city[c]) != 1: 373 | logging.error(json.dumps(item, ensure_ascii=False)) 374 | logging.error(len(map_city[c])) 375 | assert len(map_city[c]) == 1 376 | 377 | assert 333 == len(map_city), len(map_city) 378 | # logging.info(json.dumps(list(map_city.keys()), ensure_ascii=False)) 379 | 380 | for p in sorted(list(map_city)): 381 | alias_list = normalize_city(p) 382 | assert pnorm 383 | # print p, '-->',pnorm, '-->', pcompact 384 | self.data['city'][p] = { 385 | 'province': self._get_list_province_unique(map_city[p]), 386 | 'cityid_list': list(map_city[p]), 387 | 'alias': [p] + alias_list} 388 | assert len(map_city) == len(self.data['city']), len(self.data['city']) 389 | 390 | # process district 391 | map_district = collections.defaultdict(set) 392 | for item in data: 393 | d = item.get('district') 394 | if d in [u"市辖区"]: 395 | # check above 市辖区 is used both as value of city and district 396 | # simply drop them since they already defined in city level 397 | continue 398 | 399 | if d: 400 | item["type"] = "district" 401 | item["name"] = d 402 | map_district[d].add(item['cityid']) 403 | assert 2821 == len(map_district), len(map_district) 404 | 405 | for p in sorted(list(map_district)): 406 | alias_list = normalize_district(p) 407 | # print p, '-->',pnorm, '-->', pcompact 408 | cityid_list = list(map_district[p]) 409 | if len(cityid_list) > 1: 410 | # logging.info( len(cityid_list) ) 411 | # logging.info( p ) 412 | pass 413 | 414 | self.data['district'][p] = { 415 | 'province': self._get_list_province_unique(map_district[p]), 416 | 'cityid_list': cityid_list, 417 | 'alias': [p] + alias_list} 418 | 419 | # process duplicated name 别名索引 420 | for index in ['province', 'city', 'district']: 421 | for name, data in self.data[index].items(): 422 | for alias in set(data['alias']): 423 | # if alias.startswith(u"清"): 424 | # logging.info(alias) 425 | self.data['lookup'][alias].update(data['cityid_list']) 426 | 427 | for alias, alias_cityid_list in self.data['lookup'].items(): 428 | alias_cityid_list_unique = set(alias_cityid_list) 429 | if len(alias_cityid_list_unique) > 1: 430 | # logging.debug(u"{} {}".format(alias, len(alias_cityid_list_unique))) 431 | # print alias 432 | for code in alias_cityid_list_unique: 433 | # print json.dumps(self.data['items'][code], ensure_ascii=False) 434 | pass 435 | 436 | # 有唯一省的地点名, 歧义地点名不管 437 | for alias, alias_cityid_list in self.data['lookup'].items(): 438 | alias_cityid_list_unique = set(alias_cityid_list) 439 | province = self._get_list_province_unique(alias_cityid_list_unique) 440 | if province: 441 | self.data['alias'][alias] = province 442 | 443 | # with codecs.open(getTheFile('libcity_cn.new.json'),'w',encoding='utf-8') as f: 444 | # json.dump(self.data, f,ensure_ascii=False, indent=4) 445 | # 统计 446 | for index in self.data: 447 | counter[index] = len(self.data[index]) 448 | 449 | # validation 450 | for alias, entities in self.data['lookup'].items(): 451 | if len(alias) == 1: 452 | logging.error(json.dumps( 453 | entities, ensure_ascii=False, indent=4, sort_keys=True)) 454 | if 
self.strict_mode: 455 | exit() 456 | 457 | if alias in [u'自治']: 458 | logging.error(json.dumps( 459 | entities, ensure_ascii=False, indent=4, sort_keys=True)) 460 | if self.strict_mode: 461 | exit() 462 | 463 | if len(entities) > 1: 464 | counter["one-alias-many-entities"] += 1 465 | # logging.info(u"{}[{}] {}".format(alias, len(entities), u",".join([x["name"]+x["type"] for x in entities]))) 466 | 467 | # prepare for NER 468 | for index in ['province', 'city', 'district']: 469 | for name, data in self.data[index].items(): 470 | for alias in set(data['alias']): 471 | if re.search(ur"[省市县]$", alias): 472 | jieba.add_word(alias, 10000000) 473 | elif re.search(ur"[区]$", alias): 474 | jieba.add_word(alias, 1000000) 475 | else: 476 | jieba.add_word(alias, 100000) 477 | 478 | for suffix in u"路镇乡圩河区村": 479 | jieba.add_word(u"{}{}".format( 480 | alias, suffix), 1000000) 481 | 482 | names = file2iter(file2abspath('region_dict.txt', __file__)) 483 | for name in names: 484 | jieba.add_word(name.strip(), 1) 485 | 486 | # jieba.del_word(u"广州药业") 487 | 488 | def normalize_region_name(self, name, xtype): 489 | if not hasattr(self, "normalizeRegion_mapped"): 490 | setattr(self, "normalizeRegion_mapped", collections.Counter()) 491 | mapped = getattr(self, "normalizeRegion_mapped") 492 | 493 | if len(name) > 2: 494 | name = re.sub(u"[省市]+$", "", name) 495 | 496 | if name in [u"市辖区"]: 497 | return name 498 | 499 | if name in ["", u"省市"]: 500 | return "" 501 | 502 | # rewrite 503 | if name in [u"内蒙", u"蒙古"]: 504 | name = u"内蒙古" 505 | 506 | cityid_list = self.data["lookup"].get(name) 507 | if not cityid_list: 508 | logging.error("cannot find reigion name") 509 | logging.error(name) 510 | logging.error(xtype) 511 | if self.strict_mode: 512 | exit(0) 513 | 514 | matched = [] 515 | for cityid in cityid_list: 516 | item = self.data["items"][cityid] 517 | if item["type"] == xtype: 518 | matched.append(item) 519 | 520 | for item in matched: 521 | if item["name"] == name: 522 | return name 523 | 524 | for item in matched: 525 | if item["name"] != name: 526 | msg = u"normalized {} ->{}".format(name, item["name"]) 527 | if msg not in mapped: 528 | mapped[msg] += 1 529 | logging.info(msg) 530 | return item["name"] 531 | 532 | def guess_province(self, addresses): 533 | for address in addresses: 534 | if not address: 535 | continue 536 | 537 | if address.startswith(u"内蒙"): 538 | return u"内蒙古" 539 | 540 | for index in ['province', 'city']: 541 | for name in self.data[index]: 542 | for alias in set(self.data[index][name]['alias']): 543 | if address.startswith(alias): 544 | # print address, '-->', name, self.data[index][name]['province'] 545 | return self.data[index][name].get('province') 546 | 547 | for index in ['province', 'city']: 548 | for name in self.data[index]: 549 | for alias in set(self.data[index][name]['alias']): 550 | if re.search(ur'({})'.format(alias), address): 551 | # print address, '-->', name, self.data[index][name]['province'] 552 | return self.data[index][name].get('province') 553 | 554 | for alias in self.data['alias']: 555 | if address.startswith(alias): 556 | return self.data['alias'][alias] 557 | if re.search(ur'({})'.format(alias), address): 558 | return self.data['alias'][alias] 559 | 560 | print 'guess_province failed', json.dumps(addresses, ensure_ascii=False) 561 | return u"" 562 | 563 | def guess_all(self, addresses): 564 | 565 | # 解析实体 NER 566 | matched_alias = [] 567 | candidates_name_weight = collections.Counter() 568 | matched_alias_cityid_list = {} 569 | 570 | visited_seg = [] 571 | for 
address in addresses: 572 | if not type(address) == unicode: 573 | address = address.decode("utf-8") 574 | 575 | # skip shot name 576 | if len(address) < 3: 577 | continue 578 | 579 | # skip name without blacklist 580 | regex = ur"^[^省市县]{2,3}([庄村镇乡])" 581 | if re.search(regex, address): 582 | # logging.info(u"skip村镇乡 {}".format(address)) 583 | continue 584 | 585 | regex = ur"^[^省市县]{2,5}([街路巷弄组]|大道|花园|市场)" 586 | if re.search(regex, address): 587 | # logging.info(u"skip村镇乡 {}".format(address)) 588 | continue 589 | 590 | seg_list = list(jieba.cut(address, cut_all=False, HMM=False)) 591 | logging.debug("Full Mode: " + "/ ".join(seg_list)) 592 | 593 | # merge the first two seg if their combined into an alias 594 | if len(seg_list) > 1: 595 | # 清/ 新县/ 太和镇/ 滨江路/ 东/ 三/ 街/ 13/ 号/ 首层/ 5/ 号/ 铺 596 | # 恵/ 东县/ 大岭/ 镇/ 新园/ 路/ 145/ 号 597 | temp = u"{}{}".format(seg_list[0], seg_list[1]) 598 | # logging.info(temp) 599 | temp = normalize_misspell(temp) 600 | if self.data["lookup"].get(temp): 601 | logging.info(temp) 602 | temp_list = [temp] 603 | temp_list.extend(seg_list[2:]) 604 | seg_list = temp_list 605 | 606 | logging.debug("After Merge: " + "/ ".join(seg_list)) 607 | 608 | # 中山/ 市南区/ 寮/ 后/ 村/ 龙子/ 街/ 14/ 号 609 | if re.search(ur"^[省市县]", seg_list[1]): 610 | temp = u"{}{}".format(seg_list[0], seg_list[1][0]) 611 | # logging.info(temp) 612 | if self.data["lookup"].get(temp): 613 | # logging.info(temp) 614 | temp_list = [temp, seg_list[1][1:]] 615 | temp_list.extend(seg_list[2:]) 616 | seg_list = temp_list 617 | 618 | logging.debug("After Merge: " + "/ ".join(seg_list)) 619 | 620 | is_continuous_match = True 621 | for idx, seg in enumerate(seg_list): 622 | logging.debug(seg) 623 | if seg in visited_seg: 624 | continue 625 | else: 626 | visited_seg.append(seg) 627 | 628 | cityid_list = self.data["lookup"].get(seg) 629 | logging.debug(cityid_list) 630 | 631 | # if idx > 0: 632 | # skip name without whitelist 633 | # regex = ur"(.{2,6}(自治)?[省市县]|^.{2,6}(自治)?[省市县区])" 634 | # if not re.search(regex, address): 635 | # logging.info(u"skip省市县区 {}".format(address)) 636 | # break 637 | 638 | if not cityid_list and len(seg) > 2 and idx == 0: 639 | temp = re.sub(u"(市|县|经济特区).?$", "", seg) 640 | cityid_list = self.data["lookup"].get(temp) 641 | # logging.info(temp) 642 | 643 | if cityid_list: 644 | # logging.info(seg) 645 | matched_alias.append(seg) 646 | 647 | matched_alias_cityid_list[seg] = cityid_list 648 | weight_default = 1.0 / len(cityid_list) 649 | for cityid in cityid_list: 650 | 651 | name = self.data["items"][cityid]["name"] 652 | candidates_name_weight[name] += weight_default 653 | 654 | # dirty hack 2017-04-01 655 | # add seg one more time if the address starts with it 656 | # a very strong indicator 657 | if is_continuous_match: 658 | candidates_name_weight[name] += weight_default 659 | if idx == 0: 660 | logging.debug("idx0") 661 | candidates_name_weight[name] += 2 * weight_default 662 | if re.search(ur"[省市县]$", seg): 663 | logging.debug("省市县") 664 | candidates_name_weight[name] += 2 * \ 665 | weight_default 666 | elif re.search(ur"[区]$", seg): 667 | logging.debug("区") 668 | candidates_name_weight[name] += 1 * \ 669 | weight_default 670 | 671 | if seg == name: 672 | pass 673 | elif seg[-1] == name[-1]: # kind of matched by alias 674 | pass 675 | elif seg in name: # kind of matched by alias 676 | logging.debug("seg is part of name") 677 | candidates_name_weight[name] /= 2 678 | else: 679 | logging.debug("seg is different from name ") 680 | candidates_name_weight[name] /= 4 681 | 682 | else: 683 | 
is_continuous_match = False 684 | if not re.search(ur"([县州]|公司)", address): 685 | break 686 | 687 | # logging.info(json.dumps(matched_alias, ensure_ascii=False)) 688 | 689 | # select the best entity (most specific, most fit) 690 | # 统计支持率 691 | best_entity = None 692 | best_match_score = 0 693 | # logging.info(json.dumps(candidates_name_weight, ensure_ascii=False)) 694 | for seg in matched_alias: 695 | for city_id in matched_alias_cityid_list[seg]: 696 | entity = self.data["items"][city_id] 697 | # logging.info(json.dumps(entity.values(), ensure_ascii=False)) 698 | match_score = sum([w for x, w in candidates_name_weight.items() if x in entity.values()]) 699 | logging.debug(match_score) 700 | logging.debug(json.dumps(entity, ensure_ascii=False)) 701 | if match_score > best_match_score: 702 | best_entity = entity 703 | best_match_score = match_score 704 | logging.debug(json.dumps(best_entity, ensure_ascii=False)) 705 | 706 | # print 'guess_province failed', json.dumps(addresses, ensure_ascii=False) 707 | # if best_entity: 708 | # if len(addresses) == 2: 709 | # msg = u"\t".join(any2unicode([ addresses[1], 710 | # addresses[0], 711 | # best_entity["province"], 712 | # best_entity.get("city", u""), 713 | # best_entity.get("district", u""), 714 | # best_entity["type"] 715 | # ])) 716 | # print msg 717 | return best_entity 718 | 719 | 720 | def task_guess_all(args=None): 721 | city_data = RegionEntity() 722 | # confused 723 | addresses = ["太平区红河七街区711栋6单元1楼1号", "哈尔滨人民同泰医药连锁店宏伟分店"] 724 | # missed 725 | addresses = ["龙江路之路村集资楼1号楼4号门市", "合作区德仁堂药店"] 726 | addresses = ["", "北京同仁堂广州药业连锁有限公司农林店"] 727 | addresses = ["北京海淀区阜成路52号(定慧寺)", "北京大学肿瘤医院"] 728 | addresses = ["水东镇东阳北街50号", "水东镇长安药店(已迁入三角所)"] 729 | addresses = ["保定市长城北大街头台村2109号门脸", "保定市莲池区中昊翔启蒙大药房"] 730 | 731 | result = city_data.guess_all(addresses) 732 | if result: 733 | logging.info(json.dumps(result, ensure_ascii=False)) 734 | logging.info(render_result(result, addresses[1], addresses[0])) 735 | 736 | 737 | def render_result(result, name=None, address=None): 738 | data = [name, 739 | address, 740 | result["province"], 741 | result.get("city", u""), 742 | result.get("district", u""), 743 | result["type"]] 744 | data = [x for x in data if x] 745 | msg = u"\t".join(any2unicode(data)) 746 | return msg 747 | 748 | 749 | def task_guess_all_batch(args): 750 | ner = RegionEntity() 751 | filename = "../tests/ex3-region-test.xls" 752 | filename = file2abspath(filename) 753 | excel_data = excel2json(filename, non_empty_col=-1) 754 | sheet_data = excel_data["data"].values()[0] 755 | sheet_fields = excel_data["fields"].values()[0] 756 | test_results = [] 757 | 758 | for item in sheet_data: 759 | addresses = [item["address"], item["name"]] 760 | addresses = [x for x in addresses if x] 761 | result = ner.guess_all(addresses) 762 | msg = u"\n============\nexpect{}\nfound{}".format( 763 | json.dumps(item, ensure_ascii=False), 764 | json.dumps(result, ensure_ascii=False)) 765 | logging.info(msg) 766 | 767 | one_result = {} 768 | test_results.append(one_result) 769 | 770 | # new entry 771 | if item["type"] == "" and result: 772 | logging.warn(render_result(result)) 773 | 774 | match_errors = [] 775 | if not result: 776 | result_type = "none" 777 | else: 778 | result_type = result["type"] 779 | 780 | if item["type"] != result_type: 781 | match_errors.append("type") 782 | one_result["type_diff"] = "{}->{}".format(item["type"], result_type) 783 | 784 | if result: 785 | if item["province"] != result["province"]: 786 | match_errors.append("province") 787 | 788 | if 
item["type"] in ["city", "district"]: 789 |                 if item["city"] != result.get("city", ""): 790 |                     match_errors.append("city") 791 | 792 |             if item["type"] in ["district"]: 793 |                 if item["district"] != result.get("district", ""): 794 |                     match_errors.append("district") 795 | 796 |         one_result["match_error_count"] = len(match_errors) 797 |         one_result["result_type"] = result_type 798 | 799 |     logging.info("accuracy = {} (rows with zero match errors / all rows)".format( 800 |         1.0 * len([x for x in test_results if x["match_error_count"] == 0])/len(test_results))) 801 |     stat(test_results, [], ["match_error_count", "type_diff", "result_type"]) 802 | 803 | 804 | if __name__ == "__main__": 805 |     logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.INFO)  # noqa 806 | 807 |     main_subtask(__name__) 808 | 809 | """ 810 | python cdata/region.py task_guess_all_batch 811 | 812 | python cdata/region.py task_guess_all 813 | """ 814 | -------------------------------------------------------------------------------- /cdata/region_dict.txt: -------------------------------------------------------------------------------- 1 | #药业 2 | 北京同仁堂 3 | 国药控股国大药房 4 | 北京神州汽车租赁有限公司 5 | 盘锦阳光大药房医药连锁有限公司 6 | 太仓市三庆医药连锁有限公司 7 | 桂林市春和堂医药连锁有限责任公司 8 | 湖北春天大药房连锁有限公司 9 | #哈尔滨人民同泰医药连锁店 10 | 湖北春天大药房 11 | -------------------------------------------------------------------------------- /cdata/summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: 4 | # summarize a paragraph or an entity into a short text description 5 | 6 | import os 7 | import sys 8 | import json 9 | import logging 10 | import codecs 11 | import hashlib 12 | import datetime 13 | import logging 14 | import time 15 | import re 16 | import collections 17 | 18 | from misc import main_subtask 19 | from core import * 20 | from table import * 21 | 22 | def summarize_paragraph_person(text): 23 |     pass 24 | 25 | def summarize_entity_person(person): 26 |     """ 27 |     assume the person entity uses the cnschema Person vocabulary, http://cnschema.org/Person 28 |     """ 29 |     ret = [] 30 |     value = person.get("name") 31 |     if not value: 32 |         return False 33 |     ret.append(value) 34 | 35 |     prop = "courtesyName" 36 |     value = json_get_first_item(person, prop) 37 |     if value == u"不详": 38 |         value = "" 39 |     if value: 40 |         ret.append(u'字{}'.format(value)) 41 | 42 |     value = person.get("alternateName") 43 |     if value: 44 |         #ret.append(u'别名{}'.format(value)) 45 |         # disabled: buggy, alternateName may be a list 46 |         pass 47 | 48 |     prop = "artName" 49 |     value = json_get_first_item(person, prop) 50 |     if value: 51 |         ret.append(u'号{}'.format(value)) 52 | 53 |     value = person.get("dynasty") 54 |     if value: 55 |         ret.append(u'{}人'.format(value)) 56 | 57 |     prop = "ancestralHome" 58 |     value = json_get_first_item(person, prop) 59 |     if value: 60 |         ret.append(u'祖籍{}'.format(value)) 61 | 62 |     birth_date = person.get("birthDate", "") 63 |     birth_place = person.get("birthPlace", "") 64 | 65 |     # Special case for unknown birth date 66 |     if birth_date == u"不详": 67 |         birth_date = "" 68 | 69 |     if birth_place: 70 |         ret.append(u'{}出生于{}'.format(birth_date, birth_place)) 71 |     elif birth_date: 72 |         ret.append(u'{}出生'.format(birth_date)) 73 | 74 |     prop = "nationality" 75 |     nationality = json_get_first_item(person, prop) 76 |     prop = "occupation" 77 |     occupation = json_get_first_item(person, prop) 78 |     if occupation: 79 |         if nationality: 80 |             ret.append(u'{}{}'.format(nationality, occupation)) 81 |         else: 82 |
ret.append(u'{}'.format(occupation)) 83 | elif nationality: 84 | ret.append(u'{}人'.format(nationality)) 85 | 86 | prop = "authorOf" 87 | value = json_get_list(person, prop) 88 | if value: 89 | logging.info(value) 90 | value = u"、".join(value) 91 | ret.append(u'主要作品:{}'.format(value) ) 92 | 93 | prop = "accomplishment" 94 | value = json_get_list(person, prop) 95 | if value: 96 | value = u"、".join(value) 97 | if len(value) < 30: 98 | # Colon is handled by text reading software 99 | ret.append( u"主要成就:{}".format(value) ) 100 | 101 | ret = u",".join(ret) 102 | 103 | # Make all commas Chinese 104 | ret = ret.replace(u',', u',') 105 | ret = re.sub(u",+", u",", ret) # Removes repeat commas 106 | # Handles periods at end 107 | ret = re.sub(ur"[。,]+$", u"", ret) 108 | 109 | # Converts brackets to Chinese 110 | ret = ret.replace(u'(', u'(') 111 | ret = ret.replace(u')', u')') 112 | # Removes brackets and all contained info 113 | ret = re.sub(ur"([^)]*)", u"", ret) 114 | 115 | ret = u''.join([ret, u"。"]) 116 | 117 | return ret 118 | 119 | def task_summarize_entity_person(args): 120 | #print "called task_test_summarize_entity_person" 121 | person = { 122 | "name": u"张三", 123 | "accomplishment": u"三好学生" 124 | } 125 | ret = summarize_entity_person(person) 126 | logging.info(ret) 127 | 128 | def task_summarize_all_person(args): 129 | path2person = file2abspath('../local/person/person.json') 130 | 131 | result_person_list = [] 132 | for line in file2iter(path2person): 133 | person = json.loads(line) 134 | ret = summarize_entity_person(person) 135 | if ret: 136 | person["shortDescription"] = ret 137 | result_person_list.append(person) 138 | 139 | logging.info( "write to JSON and excel") 140 | 141 | KEYS = [u"@type",u"artName",u"ethnicGroup",u"student",u"courtesyName",u"religion",u"cnProfessionalTitle",u"occupation",u"jobTitle",u"sibling",u"weight",u"nationality",u"birthPlace",u"height",u"alumniOf",u"keywords", u"schoolsOfbuddhism",u"image",u"parent",u"children",u"accomplishment",u"academicDegree",u"dharmaName",u"deathDate",u"academicMajor",u"nobleTitle",u"posthumousName",u"familyName",u"memberOfPoliticalParty",u"award",u"description", u"shortDescription", u"placeOfBurial",u"cnEducationalAttainment",u"alternateName",u"pseudonym",u"templeName", u"birthDate",u"gender",u"worksFor",u"name",u"dynasty",u"earName",u"ancestralHome",u"birthName",u"studentOf",u"spouse",u"nobleFamily",u"authorOf",u"@id",u"colleague",u"fieldOfWork",u"mother",u"father"] 142 | 143 | out_path = "../local/person/" 144 | 145 | json2excel( 146 | result_person_list, KEYS, 147 | os.path.join(out_path, 'person_shortDescription.xls') 148 | ) 149 | 150 | items2file( 151 | result_person_list, 152 | os.path.join(out_path, 'person_shortDescription.json') 153 | ) 154 | 155 | if __name__ == "__main__": 156 | logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.DEBUG) # noqa 157 | logging.getLogger("requests").setLevel(logging.WARNING) 158 | 159 | main_subtask(__name__) 160 | 161 | """ 162 | python cdata/summary.py task_summarize_entity_person 163 | python cdata/summary.py task_summarize_all_person 164 | 165 | """ 166 | -------------------------------------------------------------------------------- /cdata/table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Li Ding 4 | # table/excel data manipulation 5 | 6 | import os 7 | import sys 8 | import json 9 | import logging 10 | import 
codecs 11 | import hashlib 12 | import datetime 13 | import logging 14 | import time 15 | import re 16 | import collections 17 | 18 | import xlwt 19 | import xlrd 20 | 21 | 22 | def json2excel(items, keys, filename, page_size=60000): 23 |     """ page_size stays below 65536 because we output the old excel .xls format, which caps a sheet at 65536 rows 24 |     """ 25 |     wb = xlwt.Workbook() 26 |     rowindex = 0 27 |     sheetindex = 0 28 |     for item in items: 29 |         if rowindex % page_size == 0: 30 |             sheetname = "%02d" % sheetindex 31 |             ws = wb.add_sheet(sheetname) 32 |             rowindex = 0 33 |             sheetindex += 1 34 | 35 |             colindex = 0 36 |             for key in keys: 37 |                 ws.write(rowindex, colindex, key) 38 |                 colindex += 1 39 |             rowindex += 1 40 | 41 |         colindex = 0 42 |         for key in keys: 43 |             v = item.get(key, "") 44 |             if type(v) == list: 45 |                 v = ','.join(v) 46 |             if type(v) == set: 47 |                 v = ','.join(v) 48 |             ws.write(rowindex, colindex, v) 49 |             colindex += 1 50 |         rowindex += 1 51 | 52 |     logging.debug(filename) 53 |     wb.save(filename) 54 | 55 | 56 | def excel2json(filename, non_empty_col=-1, file_contents=None): 57 |     """ 58 |     http://www.lexicon.net/sjmachin/xlrd.html 59 |     non_empty_col=-1 loads all rows; when set to a non-negative column index, 60 |     this function will skip rows having an empty cell in that column. 61 |     """ 62 | 63 |     if file_contents: 64 |         workbook = xlrd.open_workbook(file_contents=file_contents) 65 |     else: 66 |         workbook = xlrd.open_workbook(filename) 67 | 68 |     start_row = 0 69 |     ret = collections.defaultdict(list) 70 |     fields = {} 71 |     for name in workbook.sheet_names(): 72 |         sh = workbook.sheet_by_name(name) 73 |         headers = [] 74 |         for col in range(len(sh.row(start_row))): 75 |             headers.append(sh.cell(start_row, col).value) 76 | 77 |         logging.info(u"sheet={} rows={} cols={}".format( 78 |             name, sh.nrows, len(headers))) 79 |         logging.info(json.dumps(headers, ensure_ascii=False)) 80 | 81 |         fields[name] = headers 82 | 83 |         for row in range(start_row + 1, sh.nrows): 84 |             item = {} 85 |             rowdata = sh.row(row) 86 |             if len(rowdata) < len(headers): 87 |                 msg = "skip mismatched row {}".format( 88 |                     json.dumps([c.value for c in rowdata], ensure_ascii=False)) 89 |                 logging.warning(msg) 90 |                 continue 91 | 92 |             for col in range(len(headers)): 93 |                 value = sh.cell(row, col).value 94 |                 if isinstance(value, basestring): 95 |                     value = value.strip() 96 |                 item[headers[col]] = value 97 | 98 |             if non_empty_col >= 0 and not item[headers[non_empty_col]]: 99 |                 logging.debug("skip empty cell") 100 |                 continue 101 | 102 |             ret[name].append(item) 103 |         # stat 104 |         logging.info(u"loaded {} {} (non_empty_col={})".format( 105 |             filename, len(ret[name]), non_empty_col)) 106 |     return {'data': ret, 'fields': fields} 107 | -------------------------------------------------------------------------------- /cdata/web.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Li Ding 4 | 5 | # utility stuff 6 | 7 | import os 8 | import sys 9 | import json 10 | import logging 11 | import codecs 12 | import hashlib 13 | import datetime 14 | import logging 15 | import time 16 | import urlparse 17 | import re 18 | 19 | 20 | def url2domain(url): 21 |     """ extract the domain (host) from a url, dropping userinfo and port 22 |     """ 23 |     parsed_uri = urlparse.urlparse(url) 24 |     domain = '{uri.netloc}'.format(uri=parsed_uri) 25 |     domain = re.sub("^.+@", "", domain) 26 |     domain = re.sub(":.+$", "", domain) 27 |     return domain 28 | -------------------------------------------------------------------------------- /cdata/wikify.py:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Li Ding 4 | 5 | # wikification apis 6 | 7 | import os 8 | import sys 9 | import json 10 | import logging 11 | import datetime 12 | import logging 13 | import time 14 | import urllib 15 | import re 16 | 17 | import requests 18 | 19 | from misc import main_subtask 20 | from core import * 21 | 22 | 23 | def task_compare(args): 24 |     queries = [ 25 |         "autodealer", 26 |         "birthplace", 27 |         u"居里夫人", 28 |         u"爱因斯坦", 29 |     ] 30 |     for query in queries: 31 |         args = {"query": query} 32 |         logging.info(u"-----{}------".format(query)) 33 |         task_wikipedia_test(args) 34 | 35 |         task_wikidata_test(args) 36 | 37 | def task_wikipedia_test(args): 38 |     ret = wikipedia_search(args["query"]) 39 |     logging.info(json.dumps(ret, ensure_ascii=False, sort_keys=True, indent=4)) 40 |     # ret = wikipedia_search_slow(query) 41 |     # logging.info(json.dumps(ret, ensure_ascii=False, sort_keys=True, indent=4)) 42 | 43 | def task_wikidata_test(args): 44 |     ret = wikidata_search(args["query"]) 45 |     logging.info(json.dumps(ret, ensure_ascii=False, sort_keys=True, indent=4)) 46 |     if ret["itemList"]: 47 |         nodeid = ret["itemList"][0]["identifier"] 48 |         ret = wikidata_get(nodeid) 49 |         logging.info(json.dumps(ret["entities"][nodeid]["labels"]["zh"]["value"], ensure_ascii=False, sort_keys=True, indent=4)) 50 | 51 | def wikidata_get(identifier): 52 |     """ fetch the full entity record from wikidata, e.g. 53 |     https://www.wikidata.org/wiki/Special:EntityData/P248.json 54 |     """ 55 |     url = 'https://www.wikidata.org/wiki/Special:EntityData/{}.json'.format(identifier) 56 |     #logging.info(url) 57 |     return json.loads(requests.get(url).content) 58 | 59 | def wikidata_search(query, lang="zh", output_lang="en", searchtype="item", max_result=1): 60 |     """ 61 |     wikification: search wikidata entities for the given query 62 |     https://www.wikidata.org/w/api.php?action=help&modules=wbsearchentities 63 | 64 |     result format 65 |     { 66 |         searchinfo: { 67 |             search: "birthday" 68 |         }, 69 |         search: [ 70 |             { 71 |                 repository: "", 72 |                 id: "P3150", 73 |                 concepturi: "http://www.wikidata.org/entity/P3150", 74 |                 url: "//www.wikidata.org/wiki/Property:P3150", 75 |                 title: "Property:P3150", 76 |                 pageid: 28754653, 77 |                 datatype: "wikibase-item", 78 |                 label: "birthday", 79 |                 description: "item for day and month on which the subject was born. Used when full "date of birth" (P569) isn't known.", 80 |                 match: { 81 |                     type: "label", 82 |                     language: "en", 83 |                     text: "birthday" 84 |                 } 85 |             } 86 |     """ 87 |     query = any2unicode(query) 88 |     params = { 89 |         "action":"wbsearchentities", 90 |         "search": query, 91 |         "format":"json", 92 |         "language":lang, 93 |         "uselang":output_lang, 94 |         "type":searchtype 95 |     } 96 |     urlBase = "https://www.wikidata.org/w/api.php?"
97 | url = urlBase + urllib.urlencode(any2utf8(params)) 98 | #logging.info(url) 99 | r = requests.get(url) 100 | results = json.loads(r.content).get("search",[]) 101 | #logging.info(items) 102 | 103 | property_list = [ 104 | {"name":"name", "alternateName":["label"]}, 105 | {"name":"url", "alternateName":["concepturi"]}, 106 | {"name":"identifier", "alternateName":["id"]}, 107 | {"name":"description"}, 108 | ] 109 | items = [] 110 | ret = {"query": query, "itemList":items} 111 | for result in results[0:max_result]: 112 | #logging.info(result) 113 | item = json_dict_copy(result, property_list) 114 | items.append(item) 115 | return ret 116 | 117 | def wikipedia_search_slow(query, lang="en", max_result=1): 118 | import wikipedia 119 | #wikification 120 | query = any2unicode(query) 121 | items = [] 122 | ret = {"query":query, "itemList":items} 123 | wikipedia.set_lang(lang) 124 | wikiterm = wikipedia.search(query) 125 | #logging.info(wikiterm) 126 | for idx, term in enumerate(wikiterm[0:max_result]): 127 | wikipage = wikipedia.page(term) 128 | item = { 129 | "name": wikipage.title, 130 | "description": wikipedia.summary(term, sentences=1), 131 | "url": wikipage.url, 132 | } 133 | items.append(item) 134 | 135 | return ret 136 | 137 | def wikipedia_search(query, lang="en", max_result=1): 138 | """ 139 | https://www.mediawiki.org/wiki/API:Opensearch 140 | """ 141 | query = any2unicode(query) 142 | params = { 143 | "action":"opensearch", 144 | "search": query, 145 | "format":"json", 146 | #"formatversion":2, 147 | #"namespace":0, 148 | "suggest":"true", 149 | "limit": 10 150 | } 151 | urlBase = "https://{}.wikipedia.org/w/api.php?".format(lang) 152 | url = urlBase + urllib.urlencode(any2utf8(params)) 153 | #logging.info(url) 154 | r = requests.get(url) 155 | jsonData = json.loads(r.content) 156 | #logging.info(jsonData) 157 | 158 | items = [] 159 | ret = {"query":query, "itemList":items} 160 | for idx, label in enumerate(jsonData[1][0:max_result]): 161 | description = jsonData[2][idx] 162 | url = jsonData[3][idx] 163 | 164 | item = { 165 | "name": label, 166 | "description":description, 167 | "url": url, 168 | } 169 | items.append(item) 170 | 171 | return ret 172 | 173 | if __name__ == "__main__": 174 | logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.INFO) 175 | logging.getLogger("requests").setLevel(logging.WARNING) 176 | 177 | optional_params = { 178 | '--query': 'query' 179 | } 180 | main_subtask(__name__, optional_params=optional_params) 181 | 182 | """ 183 | python cdata/wikify.py task_wikipedia_test --query="birth place" 184 | python cdata/wikify.py task_wikidata_test --query="birth place" 185 | python cdata/wikify.py task_wikidata_test --query="birthplace" 186 | python cdata/wikify.py task_wikidata_test --query=居里夫人 187 | 188 | python cdata/wikify.py task_compare 189 | 190 | """ 191 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jieba==0.38 2 | nose==1.3.7 3 | xlrd==1.0.0 4 | xlwt==1.2.0 5 | requests==2.18.1 6 | wikipedia==1.4.0 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | def readme(): 5 | with open('README.rst') as f: 6 | return f.read() 7 | 8 | 9 | setup(name='cdata', 10 | version='0.1.9', 11 | 
description='see data, handy snippets for conversion, and ETL.', 12 |       long_description=readme(), 13 |       classifiers=[ 14 |           'Development Status :: 3 - Alpha', 15 |           'License :: OSI Approved :: Apache Software License', 16 |           'Programming Language :: Python :: 2.7', 17 |           'Topic :: Text Processing', 18 |       ], 19 |       url='http://github.com/cnschema/cdata', 20 |       author='Li Ding', 21 |       author_email='lidingpku@gmail.com', 22 |       license='Apache 2.0', 23 |       packages=['cdata'], 24 |       install_requires=[ 25 |           'xlrd', 'xlwt', 'jieba', 'requests', 'wikipedia' 26 |       ], 27 |       package_data={'cdata': ['*.json', '*.txt']}, 28 |       test_suite='nose.collector', 29 |       tests_require=['nose'], 30 |       zip_safe=False) 31 | -------------------------------------------------------------------------------- /tests/ex1.json: -------------------------------------------------------------------------------- 1 | { 2 |     "name": "张三", 3 |     "nickname": "three", 4 |     "age": 28 5 | } 6 | -------------------------------------------------------------------------------- /tests/ex2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cnschema/cdata/893e2e1e27b61c8551c8b5f5f9bf05ec61490e23/tests/ex2.xls -------------------------------------------------------------------------------- /tests/ex3-region-test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cnschema/cdata/893e2e1e27b61c8551c8b5f5f9bf05ec61490e23/tests/ex3-region-test.xls -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | try: 9 |     import unittest2 as unittest 10 | except ImportError: 11 |     import unittest 12 | 13 | from cdata.core import *  # noqa 14 | 15 | 16 | class CoreTestCase(unittest.TestCase): 17 |     def setUp(self): 18 |         pass 19 | 20 |     def test_file2abspath(self): 21 |         tin = "test.json" 22 |         tout = file2abspath(tin, __file__) 23 |         logging.info(" {} => {}".format(tin, tout)) 24 |         assert tout.endswith(u"tests/" + tin), tout 25 | 26 |         tin = "../test.json" 27 |         tout = file2abspath(tin) 28 |         logging.info(" {} => {}".format(tin, tout)) 29 |         assert tout.endswith( 30 |             u"cdata/" + os.path.basename(tin)), tout 31 | 32 |     def test_file2json(self): 33 |         filename = "ex1.json" 34 |         filename = file2abspath(filename, __file__) 35 |         ret = file2json(filename) 36 |         assert len(ret) == 3 37 | 38 |     def test_file2iter(self): 39 |         filename = "ex1.json" 40 |         filename = file2abspath(filename, __file__) 41 |         str_iter = file2iter(filename) 42 |         assert len(list(str_iter)) == 5 43 | 44 |     def test_json_get(self): 45 | 46 |         json_data = {"a": {"b": 1}, "c": ["d"], "e": "f"} 47 |         assert type(json_get(json_data, ["a"])) == dict 48 |         assert json_get(json_data, ["k"]) is None 49 |         assert json_get(json_data, ["k"], 10) == 10 50 |         assert json_get(json_data, ["a", "b"], 10) == 1 51 |         assert json_get(json_data, ["a", "k"], 10) == 10 52 |         assert json_get(json_data, ["c", "d"], 10) is None 53 |         assert json_get(json_data, ["e", "k"], 10) is None 54 |         assert type(json_get(json_data, ["c"])) == list 55 | 56 |         json_data = { 57 |             "father": {"name": "john"}, 58 |             "birthPlace": "Beijing" 59 |         } 60 | 61 |         assert json_get(json_data, ["father", "name"]) == "john" 62 |         assert json_get(json_data, ["father", "image"],
default="n/a") == "n/a" 63 | assert json_get(json_data, ["father", "father"]) is None 64 | assert json_get(json_data, ["birthPlace"]) == "Beijing" 65 | assert json_get( 66 | json_data, ["birthPlace", "name"], default="n/a") is None 67 | 68 | def test_json_get_list(self): 69 | 70 | json_data = { 71 | "name": "john", 72 | "birthPlace": ["Beijing"] 73 | } 74 | assert json_get_list(json_data, "name") == ["john"] 75 | assert json_get_list(json_data, "birthPlace") == ["Beijing"] 76 | 77 | def test_json_get_first_item(self): 78 | 79 | json_data = { 80 | "name": "john", 81 | "birthPlace": ["Beijing"], 82 | "interests": [] 83 | } 84 | assert json_get_first_item(json_data, "name") == "john" 85 | assert json_get_first_item(json_data, "birthPlace") == "Beijing" 86 | assert json_get_first_item(json_data, "birthDate") == '' 87 | assert json_get_first_item(json_data, "interests", defaultValue=None) is None 88 | 89 | def test_json_append(self): 90 | 91 | json_data = { 92 | "name": "john", 93 | "birthPlace": ["Beijing"], 94 | "interests": [] 95 | } 96 | 97 | json_append(json_data, "name", "a") 98 | assert json_data["name"] == "john" 99 | 100 | json_append(json_data, "birthPlace", "a") 101 | assert json_data["birthPlace"] == ["Beijing","a"] 102 | 103 | json_append(json_data, "keywords", "a") 104 | assert json_data["keywords"] == ["a"] 105 | 106 | def test_any2utf8(self): 107 | tin = "你好世界" 108 | tout = any2utf8(tin) 109 | logging.info(" {} => {}".format(tin, tout)) 110 | 111 | tin = u"你好世界" 112 | tout = any2utf8(tin) 113 | logging.info((tin, tout)) 114 | 115 | tin = "hello world" 116 | tout = any2utf8(tin) 117 | logging.info((tin, tout)) 118 | 119 | tin = ["hello", "世界"] 120 | tout = any2utf8(tin) 121 | logging.info((tin, tout)) 122 | 123 | tin = {"hello": u"世界"} 124 | tout = any2utf8(tin) 125 | logging.info((tin, tout)) 126 | 127 | tin = {"hello": u"世界", "number": 90} 128 | tout = any2utf8(tin) 129 | logging.info((tin, tout)) 130 | 131 | def test_any2unicode(self): 132 | tin = "你好世界" 133 | tout = any2unicode(tin) 134 | logging.info((tin, tout)) 135 | 136 | tin = u"你好世界" 137 | tout = any2unicode(tin) 138 | logging.info((tin, tout)) 139 | 140 | tin = "hello world" 141 | tout = any2unicode(tin) 142 | logging.info((tin, tout)) 143 | 144 | tin = ["hello", "世界"] 145 | tout = any2unicode(tin) 146 | logging.info((tin, tout)) 147 | 148 | tin = {"hello": u"世界"} 149 | tout = any2unicode(tin) 150 | logging.info((tin, tout)) 151 | 152 | def test_any2sha1(self): 153 | tin = "你好世界" 154 | tout = any2sha1(tin) 155 | assert "dabaa5fe7c47fb21be902480a13013f16a1ab6eb" == tout, tout 156 | 157 | tin = u"你好世界" 158 | tout = any2sha1(tin) 159 | assert "dabaa5fe7c47fb21be902480a13013f16a1ab6eb" == tout, tout 160 | 161 | tin = "hello world" 162 | tout = any2sha1(tin) 163 | assert "2aae6c35c94fcfb415dbe95f408b9ce91ee846ed" == tout, tout 164 | 165 | tin = ["hello", "world"] 166 | tout = any2sha1(tin) 167 | assert "238d2b0d23b6b4fb22934792bec13448d12df3cf" == tout, tout 168 | 169 | tin = {"hello": "world"} 170 | tout = any2sha1(tin) 171 | assert "d3b09abe30cfe2edff4ee9e0a141c93bf5b3af87" == tout, tout 172 | 173 | def test_json_dict_copy(self): 174 | property_list = [ 175 | { "name":"name", "alternateName": ["name","title"]}, 176 | { "name":"birthDate", "alternateName": ["dob","dateOfBirth"] }, 177 | { "name":"description" } 178 | ] 179 | json_object = {"dob":"2010-01-01","title":"John","interests":"data","description":"a person"} 180 | ret = json_dict_copy(json_object, property_list) 181 | assert json_object["title"] == ret["name"] 182 
| assert json_object["dob"] == ret["birthDate"] 183 |         assert json_object["description"] == ret["description"] 184 |         assert ret.get("interests") is None 185 | 186 |     def test_statJsonld(self): 187 |         tin = "test_core_stat.jsonld" 188 |         tout = file2abspath(tin, __file__) 189 |         with open(tout) as f: 190 |             data = json.load(f) 191 |         ret = stat_jsonld(data) 192 |         print json.dumps(ret) 193 |         assert ret["triple"] == 29 194 |         assert ret[u"tag_抒情"] == 1 195 | 196 |     def test_stat(self): 197 |         data = [{u"名称": u"张三", u"年龄": u"13.0"}, {u"名称": u"李四", u"年龄": u"20"}] 198 |         ret = stat(data, [u"名称", u"年龄"], [u"名称", u"年龄"]) 199 | 200 | 201 | if __name__ == '__main__': 202 |     unittest.main() 203 | -------------------------------------------------------------------------------- /tests/test_core_stat.jsonld: -------------------------------------------------------------------------------- 1 | { 2 | "byArtist":[], 3 | "name":"暧昧", 4 | "keywords":["抒情"], 5 | "tag":[{"name":"抒情"},{"name":"愉快"}], 6 | "mergedFrom":[ 7 | { 8 | "alternateName":[], 9 | "name":"暧昧", 10 | "lyricist":[], 11 | "referenceUrl":"http://music.163.com/song?id=471385043", 12 | "@type":"MusicRecording", 13 | "durationInSeconds":312, 14 | "statedIn":"music.163.com", 15 | "shareCount":0, 16 | "inAlbum":{ 17 | "identifier":"35347475", 18 | "name":"暧昧", 19 | "datePublished": "", 20 | "image": "" 21 | }, 22 | "composer":[], 23 | "keywords":[], 24 | "byArtist":[ 25 | { 26 | "@id":"46a0e65c-bd69-327a-a9ca-93789ae6a473", 27 | "name": "薛之谦" 28 | } 29 | ], 30 | "identifier":"471385043", 31 | "@id":"022858de-f892-373a-bb0f-668c8e50d16f", 32 | "dateModified":"2017-05-15T15:54:44+08:00", 33 | "listenCount":0, 34 | "lyrics": "", 35 | "position": 0 36 | } 37 | ], 38 | "entityScore": 0, 39 | "lyrics": "", 40 | "keywords":[], 41 | "alternateName":[], 42 | "@id":"", 43 | "@context": "http://schema.org/", 44 | "@type":["MusicRecording","CreativeWork","Thing"] 45 | } 46 | -------------------------------------------------------------------------------- /tests/test_entity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | from cdata.entity import SimpleEntity  # noqa 9 | 10 | try: 11 |     import unittest2 as unittest 12 | except ImportError: 13 |     import unittest 14 | 15 | 16 | class EntityTestCase(unittest.TestCase): 17 |     def setUp(self): 18 |         pass 19 | 20 |     def test_ner_utf8(self): 21 |         entity_list = [{"@id": "1", "name": "张三"}, {"@id": "2", "name": "李四"}] 22 |         ner = SimpleEntity(entity_list) 23 |         sentence = "张三给了李四一个苹果" 24 |         ret = ner.ner(sentence) 25 |         assert len(ret) == 2 26 | 27 |     def test_ner(self): 28 |         entity_list = [{"@id": "1", "name": u"张三"}, 29 |                        {"@id": "2", "name": u"李四"}] 30 |         ner = SimpleEntity(entity_list) 31 |         sentence = u"张三给了李四一个苹果" 32 |         ret = ner.ner(sentence) 33 |         assert len(ret) == 2 34 | 35 |         # "张三丰" must not be segmented and matched as "张三" + "丰" 36 |         sentence = u"张三丰给了李四一个苹果" 37 |         ret = ner.ner(sentence) 38 |         assert len(ret) == 1 39 | 40 |     def test_get_primary_entity(self): 41 |         entity_list = [{"@id": "1", "name": u"张三"}, 42 |                        {"@id": "2", "name": u"李四"}] 43 |         ner = SimpleEntity(entity_list) 44 |         sentence_list = ["张三给了李四一个苹果", "王五给了李四一个橘子"] 45 |         # support ratio: 张三 0.75, 李四 0.25 46 |         primary_entity = ner.get_primary_entity(sentence_list, 0.4) 47 |         assert len(primary_entity) == 1 48 | 49 | if __name__ == '__main__': 50 |     unittest.main() 51 | --------------------------------------------------------------------------------
/tests/test_region.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | from cdata.region import RegionEntity  # noqa 9 | 10 | try: 11 |     import unittest2 as unittest 12 | except ImportError: 13 |     import unittest 14 | 15 | 16 | class EntityTestCase(unittest.TestCase): 17 |     def setUp(self): 18 |         self.ner = RegionEntity() 19 |         pass 20 | 21 |     def test_misc(self): 22 |         ret = self.ner.normalize_region_name(u"市辖区", "province") 23 |         assert ret == u"市辖区", u"[{}]".format(ret) 24 |         # TODO self.ner.normalize_region_name(u"安huisheng", "province") 25 |         assert u"安徽省" == self.ner.normalize_region_name(u"安徽", "province") 26 |         assert u"内蒙古自治区" == self.ner.normalize_region_name(u"内蒙古", "province") 27 |         assert u"哈尔滨市" == self.ner.normalize_region_name(u"哈尔滨", "city") 28 |         assert u"乌鲁木齐市" == self.ner.normalize_region_name(u"乌鲁木齐", "city") 29 |         assert u"道里区" == self.ner.normalize_region_name(u"道里区", "district") 30 |         assert u"海淀区" == self.ner.normalize_region_name(u"海淀区", "district") 31 |         assert u"海淀区" == self.ner.normalize_region_name(u"海淀", "district") 32 |         assert u"浦东新区" == self.ner.normalize_region_name(u"浦东", "district") 33 |         assert u"浦东新区" == self.ner.normalize_region_name(u"浦东新区", "district") 34 |         assert u"芒市" == self.ner.normalize_region_name(u"芒市", "district") 35 | 36 |         assert u"上海" == self.ner.guess_province([u"上海西红柿集团"]) 37 |         assert u"上海" == self.ner.guess_province([u"浦东新区软件园"]) 38 |         assert u"辽宁" == self.ner.guess_province([u"朝阳市软件园"]) 39 |         assert u"内蒙古" == self.ner.guess_province([u"内蒙古自治区乌兰察布市丰镇市新标路丰美北小区232号"]) 40 |         assert u"天津" == self.ner.guess_province([u"天津市食品药品监督管理局"]) 41 |         assert u"内蒙古" == self.ner.guess_province([u"内蒙乌兰察布市丰镇市新标路丰美北小区232号"]) 42 | 43 |     def test_guess_all(self): 44 | 45 |         city_info = self.ner.guess_all([u"内蒙古自治区乌兰察布市丰镇市新标路丰美北小区232号"]) 46 |         assert u"district" == city_info.get("type") 47 |         assert u"内蒙古自治区" == city_info.get("province") 48 |         assert u"乌兰察布市" == city_info.get("city") 49 |         assert u"丰镇市" == city_info.get("district") 50 | 51 |         city_info = self.ner.guess_all(["保定市长城北大街头台村2109号门脸", "保定市莲池区中昊翔启蒙大药房"]) 52 |         assert u"district" == city_info.get("type") 53 |         assert u"河北省" == city_info.get("province") 54 |         assert u"保定市" == city_info.get("city") 55 | 56 |         city_info = self.ner.guess_all([u"高州市平山木禾塘大塘村"]) 57 |         assert u"district" == city_info.get("type") 58 |         assert u"广东省" == city_info.get("province") 59 | 60 |         city_info = self.ner.guess_all([u"珠海市拱北新市花园16栋102铺"]) 61 |         assert u"city" == city_info.get("type") 62 |         assert u"广东省" == city_info.get("province") 63 | 64 |         city_info = self.ner.guess_all([u"南溪镇扬美刘大道中段老祠村道脚008号"]) 65 |         assert None is city_info 66 | 67 |         city_info = self.ner.guess_all(["新塘镇大敦村", "增城市新塘众生药店"]) 68 |         assert u"district" == city_info.get("type") 69 |         assert u"广东省" == city_info.get("province") 70 | 71 |         city_info = self.ner.guess_all(["曲江县马坝城南", "曲江县马坝镇金良兽药店"]) 72 |         assert u"district" == city_info.get("type") 73 |         assert u"广东省" == city_info.get("province") 74 | 75 |         city_info = self.ner.guess_all(["镇平路46号、48号", "汕头经济特区粤东药品公司镇平商店"]) 76 |         assert u"city" == city_info.get("type") 77 |         assert u"广东省" == city_info.get("province") 78 | 79 |         inputdata = ["遂溪县河头镇文明街12号", "遂溪县河头回春堂药店"] 80 |         city_info = self.ner.guess_all(inputdata) 81 |         assert u"district" == city_info.get("type") 82 |         assert u"广东省" == city_info.get("province") 83 | 84 |         inputdata = [u"延寿镇南东风路", u"旺旺兽药店(延寿县)"] 85 |         city_info =
self.ner.guess_all(inputdata) 86 | assert u"district" == city_info.get("type") 87 | assert u"黑龙江省" == city_info.get("province") 88 | 89 | inputdata = ["", "富拉尔基秀坤百货商店药品专柜"] 90 | city_info = self.ner.guess_all(inputdata) 91 | assert u"district" == city_info.get("type") 92 | assert u"黑龙江省" == city_info.get("province") 93 | 94 | inputdata = ["兴隆工商局家属楼", "巴彦县鑫丰兽药饲料商店"] 95 | city_info = self.ner.guess_all(inputdata) 96 | assert u"district" == city_info.get("type") 97 | assert u"黑龙江省" == city_info.get("province") 98 | 99 | inputdata = ["下城子镇中心街", "穆棱市下城子镇宋大夫兽药饲料店"] 100 | city_info = self.ner.guess_all(inputdata) 101 | assert u"district" == city_info.get("type") 102 | assert u"黑龙江省" == city_info.get("province") 103 | 104 | inputdata = ["加格达奇区前进路(红旗东风一号楼)", "加格达奇区温馨大药店"] 105 | city_info = self.ner.guess_all(inputdata) 106 | assert u"黑龙江省" == city_info.get("province") 107 | assert u"district" == city_info.get("type") 108 | assert u"加格达奇区" == city_info.get("name") 109 | 110 | inputdata = ["", "北京神州汽车租赁有限公司深圳雅园分公司"] 111 | city_info = self.ner.guess_all(inputdata) 112 | assert u"city" == city_info.get("type") 113 | assert u"广东省" == city_info.get("province") 114 | 115 | inputdata = ["横山横安路镇政府出租屋第一间", "廉江市横山济生堂药店"] 116 | city_info = self.ner.guess_all(inputdata) 117 | assert u"district" == city_info.get("type") 118 | assert u"广东省" == city_info.get("province") 119 | 120 | inputdata = ["水东镇东阳北街50号", "水东镇长安药店(已迁入三角所)"] 121 | city_info = self.ner.guess_all(inputdata) 122 | assert None is city_info 123 | 124 | inputdata = ["龙门县龙城林园街33号", "龙城新利药店"] 125 | city_info = self.ner.guess_all(inputdata) 126 | assert u"district" == city_info.get("type") 127 | assert u"广东省" == city_info.get("province") 128 | 129 | inputdata = ["黄石街道", "龙川县药材公司黄石药店"] 130 | city_info = self.ner.guess_all(inputdata) 131 | assert u"district" == city_info.get("type") 132 | assert u"广东省" == city_info.get("province") 133 | 134 | inputdata = ["", "盘锦阳光大药房医药连锁有限公司清远麦围店"] 135 | city_info = self.ner.guess_all(inputdata) 136 | assert u"city" == city_info.get("type") 137 | assert u"广东省" == city_info.get("province") 138 | 139 | inputdata = ["", "盘锦阳光大药房医药连锁有限公司清远市清城区城市广场店"] 140 | city_info = self.ner.guess_all(inputdata) 141 | assert u"district" == city_info.get("type") 142 | assert u"广东省" == city_info.get("province") 143 | 144 | inputdata = ["四会市东城区四会大道南时代商贸广场141号(首层)", "广州仁参医药连锁有限公司四会时代分店"] 145 | city_info = self.ner.guess_all(inputdata) 146 | assert u"district" == city_info.get("type") 147 | assert u"广东省" == city_info.get("province") 148 | 149 | inputdata = ["中山市南朗镇岭南小区", "中山市南朗镇启发农药化肥店"] 150 | city_info = self.ner.guess_all(inputdata) 151 | assert u"city" == city_info.get("type") 152 | assert u"广东省" == city_info.get("province") 153 | 154 | inputdata = ["信宜市镇隆圩解放街29号", "信宜市镇隆回春药店"] 155 | city_info = self.ner.guess_all(inputdata) 156 | assert u"district" == city_info.get("type") 157 | assert u"广东省" == city_info.get("province") 158 | 159 | inputdata = ["乳源县大桥镇乳阳林业局溪头河西区域避暑林庄温泉大饭店主楼一楼", "东阳光药零售连锁(东莞)有限公司南岭店"] 160 | city_info = self.ner.guess_all(inputdata) 161 | assert u"district" == city_info.get("type") 162 | assert u"广东省" == city_info.get("province") 163 | 164 | inputdata = ["从化市太平镇神岗木棉村永三社(龟塘)", "从化市太平民健药店"] 165 | city_info = self.ner.guess_all(inputdata) 166 | assert u"district" == city_info.get("type") 167 | assert u"广东省" == city_info.get("province") 168 | 169 | inputdata = ["荔城镇和平路33号首层", "增城市荔城育善堂药店"] 170 | city_info = self.ner.guess_all(inputdata) 171 | assert u"district" == city_info.get("type") 172 | assert u"广东省" == 
city_info.get("province") 173 | assert u"广州市" == city_info.get("city") 174 | 175 | inputdata = ["清新县太和镇滨江路东三街13号首层5号铺", "清新县太和安顺堂药店"] 176 | city_info = self.ner.guess_all(inputdata) 177 | assert u"district" == city_info.get("type") 178 | assert u"广东省" == city_info.get("province") 179 | 180 | inputdata = ["广州市番禺区小谷围街广州大学城外环西路230号广大商业中心A区首层1015", "桂林市春和堂医药连锁有限责任公司广州大学城分店"] 181 | city_info = self.ner.guess_all(inputdata) 182 | assert u"district" == city_info.get("type") 183 | assert u"广东省" == city_info.get("province") 184 | 185 | inputdata = ["深圳市龙岗区龙岗街道南联社区向银路与怡丰路交叉路口南龙综合楼首层之三", "太仓市三庆医药连锁有限公司深圳南联店"] 186 | city_info = self.ner.guess_all(inputdata) 187 | assert u"district" == city_info.get("type") 188 | assert u"广东省" == city_info.get("province") 189 | 190 | inputdata = ["中山市南区寮后村龙子街14号", "中山市南区仁德堂药店"] 191 | city_info = self.ner.guess_all(inputdata) 192 | assert u"city" == city_info.get("type") 193 | assert u"广东省" == city_info.get("province") 194 | 195 | inputdata = ["", "珠海嘉伦药业集团光彩大药房连锁有限公司红旗分店"] 196 | city_info = self.ner.guess_all(inputdata) 197 | assert u"city" == city_info.get("type") 198 | assert u"广东省" == city_info.get("province") 199 | 200 | inputdata = ["东区新街村", "同江市龙鑫堂大药店"] 201 | city_info = self.ner.guess_all(inputdata) 202 | assert u"district" == city_info.get("type") 203 | assert u"黑龙江省" == city_info.get("province") 204 | 205 | inputdata = ["新兴县河头镇河头街65号", "新兴县河头镇同源堂药店"] 206 | city_info = self.ner.guess_all(inputdata) 207 | assert u"district" == city_info.get("type") 208 | assert u"广东省" == city_info.get("province") 209 | 210 | inputdata = ["怀集怀城镇河南第二卫生站"] 211 | city_info = self.ner.guess_all(inputdata) 212 | assert u"district" == city_info.get("type") 213 | assert u"广东省" == city_info.get("province") 214 | 215 | 216 | if __name__ == '__main__': 217 | unittest.main() 218 | -------------------------------------------------------------------------------- /tests/test_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | from cdata.summary import * # noqa 9 | 10 | try: 11 | import unittest2 as unittest 12 | except ImportError: 13 | import unittest 14 | 15 | 16 | class SummaryTestCase(unittest.TestCase): 17 | def setUp(self): 18 | pass 19 | 20 | def test_misc(self): 21 | person = { 22 | "name": u"张三", 23 | "accomplishment": u"三好学生" 24 | } 25 | ret = summarize_entity_person(person) 26 | assert u"张三,主要成就:三好学生。" == ret 27 | 28 | person = { 29 | "name": u"张三", 30 | "accomplishment": u"三好学生", 31 | "artName": [u"张老三"] 32 | } 33 | ret = summarize_entity_person(person) 34 | assert u"张三,号张老三,主要成就:三好学生。" == ret 35 | 36 | person = { 37 | "name": u"张三", 38 | "accomplishment": u"三好学生", 39 | "artName": [] 40 | } 41 | ret = summarize_entity_person(person) 42 | assert u"张三,主要成就:三好学生。" == ret 43 | 44 | def test_real_data(self): 45 | person = { 46 | "description": u"黄健中,1941年12月29日出生于印度尼西亚泗水,国家一级导演、编剧、艺术指导。1979年,黄健中与张铮联合执导爱情片《小花》,该片获得第三届电影百花奖上获最佳故事片奖 。1982年,黄健中独立执导首部电影作品——爱情片《如意》。1985年,凭借家庭剧《良家妇女》获得第二十五届卡罗维·发利国际电影节主要奖[2-3] 。1990年,拍摄警匪剧《龙年警官》,该片获得第十四届大众电影百花奖最佳故事片奖。1991年,拍摄家庭剧《过年》,该片获得第十五届大众电影百花奖最佳故事片奖。1995年,执导剧情片《大鸿米店》[4-6] 。1998年,拍摄爱情片《红娘》,该片获得第二十二届大众电影百花奖最佳故事片奖[7-8] 。2001年,执导古装武侠剧《笑傲江湖》 。2003年,与佐藤纯弥联合执导家庭犯罪剧《世纪末的晚钟》[10-12] 。2005年,国家广播电影电视总局授予黄健中“优秀电影艺术家”称号 。2006年,执导古装历史剧《越王勾践》 。2009年,拍摄历史战争剧《大秦帝国之裂变》,该片获得第25届中国电视金鹰奖[14-17] 。2011年,执导古装剧《大风歌》[18-19] 。2013年,执导古装神话剧《蓬莱八仙》[20-22] 。", 47 | "birthPlace": u"印度尼西亚泗水", 48 | 
"name": u"黄健中", 49 | "image": u"http://c.hiphotos.baidu.com/baike/w%3D268%3Bg%3D0/sign=9ac8a3ed33adcbef01347900949449e0/aec379310a55b319a1ae185c41a98226cffc1747.jpg", 50 | "accomplishment": u"第4届东京国际电影节评委奖第11届中国电影金鸡奖最佳导演奖第12届中国电影金鸡奖最佳编剧奖", 51 | "birthDate": u"1941年12月29日", 52 | "keywords": [u"导演", u"娱乐人物", u"人物"], 53 | "nationality": u"中国", 54 | "alternateName": ["HuangJianzhong", "Huang Jianzhong"], 55 | "authorOf": u"过年、龙年警官、越王勾践、大风歌", 56 | "@id": u"d67f8dc6-3775-3e4a-9d67-84bb4007d6d1", 57 | "@type": ["Person", "Thing"], 58 | "occupation": u"导演、编剧、艺术指导," # Extra comma for punctuation testing 59 | } 60 | ret = summarize_entity_person(person) 61 | logging.info(ret) 62 | assert u"黄健中,1941年12月29日出生于印度尼西亚泗水,中国导演、编剧、艺术指导,主要作品:过年、龙年警官、越王勾践、大风歌。" == ret 63 | 64 | person = { 65 | "name": u"陈小群", 66 | "gender": u"女", 67 | "image": u"http://e.hiphotos.baidu.com/baike/w%3D268%3Bg%3D0/sign=3c89cd72acc379317d68812fd3ffd078/b90e7bec54e736d16b57837c98504fc2d5626979.jpg", 68 | "description": u"女,抒情女高音歌唱家,现任上海音乐学院声乐系教授、硕士生导师;先后担任文化部举办的国际声乐比赛全国选拔赛、中国音乐家协会举办的“金钟奖”全国声乐比赛、全国大学生艺术歌曲比赛等比赛评委。", 69 | "@type": ["Person", "Thing"], 70 | "ethnicGroup": u"汉族", 71 | "keywords": [u"音乐", u"行业人物", u"歌手", u"教育", u"娱乐人物", u"人物", u"书籍"], 72 | "nationality": u"中国", 73 | "@id": u"66548f8a-3f9e-37ca-afb1-e2e96fdb083b", 74 | "alumniOf": u"上海音乐学院", 75 | "occupation": u"教授" 76 | } 77 | ret = summarize_entity_person(person) 78 | assert u"陈小群,中国教授。" == ret 79 | 80 | # Test for bracket, unknown birth date, courtesy name 81 | person = { 82 | "@id": u"2d8d5ed9-108b-3621-86bd-6c67fbbf0896", 83 | "@type": u"Person,Thing", 84 | "accomplishment": u"袭龙城,收复河朔、河套地区,击败单于", 85 | "birthDate": u"不详", 86 | "birthPlace": u"河东平阳(今山西临汾市)", 87 | "courtesyName": u"仲卿", 88 | "deathDate": u"公元前106年(汉武帝元封五年)", 89 | "description": u"卫青,字仲卿,河东平阳人", 90 | "dynasty": u"西汉", 91 | "ethnicGroup": u"汉族", 92 | "image": "http://c.hiphotos.baidu.com/baike/w%3D268%3Bg%3D0/sign=dce9ce450f3387449cc5287a6934bec4/d53f8794a4c27d1ef8d6abd118d5ad6eddc43836.jpg", 93 | "name": u"卫青", 94 | "posthumousName": u"烈" 95 | } 96 | 97 | summary = u"卫青,字仲卿,西汉人,出生于河东平阳,主要成就:袭龙城,收复河朔、河套地区,击败单于。" 98 | assert summary == summarize_entity_person(person) 99 | 100 | person = { 101 | "name": u"陈小群", 102 | "gender": u"女", 103 | "image": u"http://e.hiphotos.baidu.com/baike/w%3D268%3Bg%3D0/sign=3c89cd72acc379317d68812fd3ffd078/b90e7bec54e736d16b57837c98504fc2d5626979.jpg", 104 | "description": u"女,抒情女高音歌唱家,现任上海音乐学院声乐系教授、硕士生导师;先后担任文化部举办的国际声乐比赛全国选拔赛、中国音乐家协会举办的“金钟奖”全国声乐比赛、全国大学生艺术歌曲比赛等比赛评委。", 105 | "@type": ["Person", "Thing"], 106 | "ethnicGroup": u"汉族", 107 | "keywords": [u"音乐", u"行业人物", u"歌手", u"教育", u"娱乐人物", u"人物", u"书籍"], 108 | "@id": u"66548f8a-3f9e-37ca-afb1-e2e96fdb083b", 109 | "alumniOf": u"上海音乐学院", 110 | "occupation": u"教授" 111 | } 112 | ret = summarize_entity_person(person) 113 | logging.info(ret) 114 | assert u"陈小群,教授。" == ret 115 | 116 | if __name__ == '__main__': 117 | unittest.main() 118 | -------------------------------------------------------------------------------- /tests/test_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | try: 9 | import unittest2 as unittest 10 | except ImportError: 11 | import unittest 12 | 13 | from cdata.core import file2abspath # noqa 14 | from cdata.table import * # noqa 15 | 16 | 17 | class TableTestCase(unittest.TestCase): 18 | def setUp(self): 19 | pass 20 | 21 
| def test_excel2json(self): 22 |         filename = "ex2.xls" 23 |         filename = file2abspath(filename, __file__) 24 | 25 |         if not os.path.exists(filename): 26 |             # create the test fixture excel file on first run 27 |             input_data = [{ 28 |                 "name": u"张三", 29 |                 u"年龄": 18 30 |             }, 31 |                 { 32 |                 "name": u"李四", 33 |                 "notes": u"this is li si", 34 |                 u"年龄": 18 35 |             }] 36 |             json2excel(input_data, ["name", u"年龄", "notes"], filename) 37 | 38 |         output_data = excel2json(filename) 39 |         assert len(output_data) == 2 40 |         assert len(output_data["data"]) == 1 41 |         assert len(output_data["data"].values()[0]) == 2 42 |         assert output_data["fields"].values()[0] == ["name", u"年龄", "notes"] 43 | 44 | 45 | if __name__ == '__main__': 46 |     unittest.main() 47 | -------------------------------------------------------------------------------- /tests/test_web.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | try: 9 |     import unittest2 as unittest 10 | except ImportError: 11 |     import unittest 12 | 13 | from cdata.web import url2domain  # noqa 14 | 15 | 16 | class WebTestCase(unittest.TestCase): 17 |     def setUp(self): 18 |         pass 19 | 20 |     def test_url2domain(self): 21 |         the_input = "http://www.sge.com.cn/sjzx/mrhqsj/540603" 22 |         the_output = url2domain(the_input) 23 |         assert the_output == "www.sge.com.cn", the_output 24 | 25 | 26 | if __name__ == '__main__': 27 |     unittest.main() 28 | -------------------------------------------------------------------------------- /tests/test_wikify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | import logging 7 | sys.path.insert(0, os.path.abspath('..')) 8 | 9 | try: 10 |     import unittest2 as unittest 11 | except ImportError: 12 |     import unittest 13 | 14 | from cdata.wikify import wikidata_search, wikidata_get  # noqa 15 | 16 | 17 | class WikifyTestCase(unittest.TestCase): 18 |     def setUp(self): 19 |         pass 20 | 21 |     def test_wikidata(self): 22 |         query = u"居里夫人" 23 |         ret = wikidata_search(query, lang="zh") 24 |         #logging.info(ret) 25 |         nodeid = ret["itemList"][0]["identifier"] 26 |         assert nodeid == "Q7186" 27 |         ret = wikidata_get(nodeid) 28 |         label_zh = ret["entities"][nodeid]["labels"]["zh"]["value"] 29 |         assert label_zh == u"玛丽·居里" 30 | 31 |         query = u"AutoDealer" 32 |         ret = wikidata_search(query) 33 |         logging.info(ret) 34 |         assert 0 == len(ret["itemList"]) 35 | 36 |         query = u"Campsite" 37 |         ret = wikidata_search(query) 38 |         logging.info(ret) 39 |         nodeid = ret["itemList"][0]["identifier"] 40 |         assert nodeid == "Q832778" 41 |         ret = wikidata_get(nodeid) 42 |         label_zh = ret["entities"][nodeid]["labels"]["zh"]["value"] 43 |         logging.info(label_zh) 44 |         assert label_zh == u"露營場" 45 | 46 |         query = "birthplace" 47 |         ret = wikidata_search(query, searchtype="property") 48 |         #logging.info(ret) 49 |         nodeid = ret["itemList"][0]["identifier"] 50 |         assert nodeid == "P19" 51 |         ret = wikidata_get(nodeid) 52 |         label_zh = ret["entities"][nodeid]["labels"]["zh"]["value"] 53 |         logging.info(label_zh) 54 |         assert label_zh == u"出生地" 55 | 56 | 57 | 58 | 59 | if __name__ == '__main__': 60 |     unittest.main() 61 | --------------------------------------------------------------------------------
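
Usage sketches (illustrative only; these are not files in the repository). They assume Python 2.7 with the cdata package importable, e.g. after `pip install cdata` or when run from a repo checkout.

Region guessing: RegionEntity.guess_all() takes a list of free-text strings (typically [address, shop name]) and returns the best-supported region entity or None, as exercised by task_guess_all() in cdata/region.py and by tests/test_region.py. The input strings below are taken from those tests.

    # -*- coding: utf-8 -*-
    # Sketch: guess province/city/district from address + name strings.
    import json
    from cdata.region import RegionEntity

    ner = RegionEntity()
    result = ner.guess_all([u"保定市长城北大街头台村2109号门脸",
                            u"保定市莲池区中昊翔启蒙大药房"])
    if result:
        # per tests/test_region.py: "type" and "province" are always present,
        # "city"/"district" only when resolved down to that level
        print json.dumps(result, ensure_ascii=False)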
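
Person summarization: summarize_entity_person() renders a cnschema Person dict into a one-sentence Chinese description and returns False when "name" is missing; the expected output below is asserted verbatim in tests/test_summary.py.

    # -*- coding: utf-8 -*-
    # Sketch: entity-to-text summary of a cnschema Person.
    from cdata.summary import summarize_entity_person

    person = {
        "name": u"张三",
        "artName": [u"张老三"],
        "accomplishment": u"三好学生",
    }
    # prints: 张三,号张老三,主要成就:三好学生。
    print summarize_entity_person(person)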
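
Excel round trip: json2excel() starts a new worksheet every page_size rows (the legacy .xls format caps a sheet at 65536 rows), and excel2json() returns per-sheet rows plus headers; the shapes below follow tests/test_table.py. The file name demo.xls is an arbitrary example, and non_empty_col=0 uses the row-skipping behavior described in the excel2json docstring.

    # -*- coding: utf-8 -*-
    # Sketch: JSON list -> .xls -> JSON round trip.
    from cdata.table import json2excel, excel2json

    rows = [{"name": u"张三", u"年龄": 18},
            {"name": u"李四", u"年龄": 18, "notes": u"this is li si"}]
    json2excel(rows, ["name", u"年龄", "notes"], "demo.xls")

    loaded = excel2json("demo.xls", non_empty_col=0)  # skip rows with empty "name"
    print loaded["fields"].values()[0]     # ["name", u"年龄", "notes"]
    print len(loaded["data"].values()[0])  # 2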
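
Wikification: wikidata_search() wraps the wbsearchentities API and returns {"query": ..., "itemList": [...]}; wikidata_get() fetches the full entity record. The Q7186 lookup mirrors tests/test_wikify.py and needs network access.

    # -*- coding: utf-8 -*-
    # Sketch: resolve a name to a Wikidata entity, then read its zh label.
    from cdata.wikify import wikidata_search, wikidata_get

    ret = wikidata_search(u"居里夫人", lang="zh")
    if ret["itemList"]:
        nodeid = ret["itemList"][0]["identifier"]   # "Q7186" for Marie Curie
        entity = wikidata_get(nodeid)
        print entity["entities"][nodeid]["labels"]["zh"]["value"]  # 玛丽·居里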
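
URL handling: url2domain() keeps only the host part of a URL, dropping scheme, userinfo, port, and path. tests/test_web.py covers the plain case; the userinfo/port URL below is an extra hypothetical illustration of the same behavior.

    # Sketch: extract the bare domain from URLs.
    from cdata.web import url2domain

    print url2domain("http://www.sge.com.cn/sjzx/mrhqsj/540603")  # www.sge.com.cn
    print url2domain("http://user@www.sge.com.cn:8080/sjzx")      # www.sge.com.cn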