├── .gitignore ├── .travis.yml ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── cdata ├── __init__.py ├── core.py ├── entity.py ├── misc.py ├── region.py ├── region_data.json ├── region_dict.txt ├── summary.py ├── table.py ├── web.py └── wikify.py ├── requirements.txt ├── setup.py └── tests ├── ex1.json ├── ex2.xls ├── ex3-region-test.xls ├── test_core.py ├── test_core_stat.jsonld ├── test_entity.py ├── test_region.py ├── test_summary.py ├── test_table.py ├── test_web.py └── test_wikify.py /.gitignore: -------------------------------------------------------------------------------- 1 | # customized skip 2 | local 3 | .DS_Store 4 | 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | env/ 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # dotenv 88 | .env 89 | 90 | # virtualenv 91 | .venv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | python: 4 | - "2.7" 5 | 6 | install: 7 | - pip install --upgrade . 8 | - pip install --upgrade -r requirements.txt 9 | - pip install coveralls 10 | 11 | script: python setup.py test 12 | 13 | after_success: 14 | coveralls 15 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. 
:changelog:
2 | 
3 | History
4 | -------
5 | 
6 | 0.1.9 (2017-09-07)
7 | ++++++++++++++++++
8 | * core.py: add stat_jsonld, a function to count triples in a JSON object
9 | 
10 | 0.1.8 (2017-08-16)
11 | ++++++++++++++++++
12 | * fixed summary.summarize_entity_person to handle the empty-nationality situation
13 | * changed core.json_get_first_item, adding defaultValue as empty string
14 | * add core.json_append, which appends an item to a list
15 | 
16 | 0.1.7 (2017-07-20)
17 | ++++++++++++++++++
18 | * bugfix summary.summarize_entity_person
19 | 
20 | 0.1.6 (2017-07-20)
21 | ++++++++++++++++++
22 | * add summary.summarize_entity_person function
23 | 
24 | 0.1.5 (2017-07-18)
25 | ++++++++++++++++++
26 | * bugfix normalize_region_name
27 | * pack region data with the code
28 | 
29 | 0.1.4 (2017-07-17)
30 | ++++++++++++++++++
31 | * add module wikify with wikidata_search, wikidata_get
32 | * update module core with json_dict_copy
33 | * update module entity with get_primary_entity
34 | * add one more district in region; add strict_mode for skipping exit() on error
35 | 
36 | 0.1.1 (2017-06-22)
37 | ++++++++++++++++++
38 | * add module entity with SimpleEntity.ner(text)
39 | * add module region with RegionEntity.guess_all([address, name])
40 | 
41 | 0.1.0 (2017-06-19)
42 | ++++++++++++++++++
43 | 
44 | * initial PyPI release
45 | * add modules json, table (excel), web for data manipulation
46 | * provide cli ui via misc.main_subtask
47 | * connect to travis CI
48 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by
13 |       the copyright owner that is granting the License.
14 | 
15 |       "Legal Entity" shall mean the union of the acting entity and all
16 |       other entities that control, are controlled by, or are under common
17 |       control with that entity. For the purposes of this definition,
18 |       "control" means (i) the power, direct or indirect, to cause the
19 |       direction or management of such entity, whether by contract or
20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 |       outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 |       "You" (or "Your") shall mean an individual or Legal Entity
24 |       exercising permissions granted by this License.
25 | 
26 |       "Source" form shall mean the preferred form for making modifications,
27 |       including but not limited to software source code, documentation
28 |       source, and configuration files.
29 | 
30 |       "Object" form shall mean any form resulting from mechanical
31 |       transformation or translation of a Source form, including but
32 |       not limited to compiled object code, generated documentation,
33 |       and conversions to other media types.
34 | 
35 |       "Work" shall mean the work of authorship, whether in Source or
36 |       Object form, made available under the License, as indicated by a
37 |       copyright notice that is included in or attached to the work
38 |       (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include cdata/region_data.json 3 | include cdata/region_dict.txt 4 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | cdata 2 | ------------- 3 | 4 | "see data", see data, handy snippets for conversion, cleaning and integration. 
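quick start
-------------
a minimal end-to-end sketch (assumes the package is installed, see install below; ``demo.jsonl`` is only an example file name):

.. code-block:: python

    import json
    from cdata.core import items2file, file2iter, stat

    items = [{"name": u"张三"}, {"name": u"张三"}, {"name": u"李四"}]
    items2file(items, "demo.jsonl")   # one canonical json object per line
    loaded = [json.loads(line) for line in file2iter("demo.jsonl")]
    stat(loaded, ["name"])            # logs counters such as "all" and "name_unique"
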
5 | 
6 | install
7 | -------------
8 | pip install cdata
9 | 
10 | 
11 | json data manipulation
12 | -------------
13 | 
14 | * json (and json stream) file IO, e.g. items2file(...)
15 | * json data access, e.g. json_get(...), any2utf8, json_dict_copy
16 | * json array statistics, e.g. stat(...)
17 | 
18 | .. code-block:: python
19 | 
20 |     from cdata.core import any2utf8
21 |     the_input = {"hello": u"世界"}
22 |     the_output = any2utf8(the_input)
23 |     logging.info((the_input, the_output))
24 | 
25 | .. code-block:: python
26 | 
27 |     property_list = [
28 |         { "name":"name", "alternateName": ["name","title"]},
29 |         { "name":"birthDate", "alternateName": ["dob","dateOfBirth"] },
30 |         { "name":"description" }
31 |     ]
32 |     json_object = {"dob":"2010-01-01","title":"John","interests":"data","description":"a person"}
33 |     ret = json_dict_copy(json_object, property_list)
34 | 
35 | 
36 | table data manipulation
37 | -------------
38 | 
39 | * json array to/from excel
40 | 
41 | .. code-block:: python
42 | 
43 |     import json
44 |     from cdata.table import excel2json, json2excel
45 |     filename = "test.xls"
46 |     items = [{"first":"hello", "last":"world" }]
47 |     json2excel(items, ["first","last"], filename)
48 |     ret = excel2json(filename)
49 |     print json.dumps(ret)
50 | 
51 | 
52 | 
53 | JSON data from reading a single-sheet excel file:
54 | 
55 | .. code-block:: json
56 | 
57 |     {
58 |         "fields": {
59 |             "00": [
60 |                 "name",
61 |                 "年龄",
62 |                 "notes"
63 |             ]
64 |         },
65 |         "data": {
66 |             "00": [
67 |                 {
68 |                     "notes": "",
69 |                     "年龄": 18.0,
70 |                     "name": "张三"
71 |                 },
72 |                 {
73 |                     "notes": "this is li si",
74 |                     "年龄": 18.0,
75 |                     "name": "李四"
76 |                 }
77 |             ]
78 |         }
79 |     }
80 | 
81 | web stuff
82 | -------------
83 | 
84 | * url domain extraction
85 | 
86 | entity manipulation
87 | -------------
88 | 
89 | * entity.SimpleEntity.ner()
90 | 
91 | .. code-block:: python
92 | 
93 |     from cdata.entity import SimpleEntity
94 |     entity_list = [{"@id":"1","name":u"张三"},{"@id":"2","name":u"李四"}]
95 |     ner = SimpleEntity(entity_list)
96 |     sentence = "张三给了李四一个苹果"
97 |     ret = ner.ner(sentence)
98 |     logging.info(json.dumps(ret, ensure_ascii=False, indent=4))
99 |     """
100 |     [{
101 |         "text": "张三",
102 |         "entities": [
103 |             {
104 |                 "@id": "1",
105 |                 "name": "张三"
106 |             }
107 |         ],
108 |         "index": 0
109 |     },
110 |     {
111 |         "text": "李四",
112 |         "entities": [
113 |             {
114 |                 "@id": "2",
115 |                 "name": "李四"
116 |             }
117 |         ],
118 |         "index": 4
119 |     }]
120 |     """
121 | 
122 | * region.RegionEntity.guess_all()
123 | 
124 | .. code-block:: python
125 | 
126 |     from cdata.region import RegionEntity
127 |     addresses = ["北京海淀区阜成路52号(定慧寺)", "北京大学肿瘤医院"]
128 | 
129 |     city_data = RegionEntity()
130 |     result = city_data.guess_all(addresses)
131 |     logging.info(json.dumps(result, ensure_ascii=False))
132 |     """
133 |     {"province": "北京市",
134 |      "city": "市辖区",
135 |      "name": "海淀区",
136 |      "district": "海淀区",
137 |      "cityid": "110108",
138 |      "type": "district"}
139 |     """
140 | 
141 | wikification
142 | -------------
143 | 
144 | * search wikidata to locate the matching entity, then look up properties such as its Chinese label and aliases: wikidata_search (item/property) and wikidata_get
145 | 
146 | .. 
code-block:: python
147 | 
148 |     query = u"居里夫人"
149 |     ret = wikidata_search(query, lang="zh")
150 |     logging.info(ret)
151 | 
152 |     nodeid = ret["itemList"][0]["identifier"]
153 |     ret = wikidata_get(nodeid)
154 |     label_zh = ret["entities"][nodeid]["labels"]["zh"]["value"]
155 |     logging.info(label_zh)
156 | 
157 | 
158 | misc
159 | -------------
160 | 
161 | * support simple cli functions using argparse
162 | 
163 | 
164 | notes
165 | -------------
166 | release packages using https://github.com/pypa/twine
167 | 
--------------------------------------------------------------------------------
/cdata/__init__.py:
--------------------------------------------------------------------------------
1 | def info():
2 |     return {
3 |         'version': 'v0.0.1',
4 |         'dataModified': '2017-06-19',
5 |         'author': 'Li Ding'}
6 | 
--------------------------------------------------------------------------------
/cdata/core.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # Author: Li Ding
4 | # JSON data manipulation
5 | 
6 | # base packages
7 | import os
8 | import sys
9 | import json
10 | import logging
11 | import codecs
12 | import hashlib
13 | import datetime
14 | import time
15 | import argparse
16 | import urlparse
17 | import re
18 | import collections
19 | 
20 | # global constants
21 | VERSION = 'v20170713'
22 | CONTEXTS = [os.path.basename(__file__), VERSION]
23 | 
24 | ####################################
25 | # file path
26 | 
27 | 
28 | def file2abspath(filename, this_file=__file__):
29 |     """
30 |     generate the absolute path for the given filename, relative to this_file's directory
31 |     """
32 |     return os.path.abspath(
33 |         os.path.join(os.path.dirname(os.path.abspath(this_file)), filename))
34 | 
35 | 
36 | ####################################
37 | # read from file
38 | 
39 | def file2json(filename, encoding='utf-8'):
40 |     """
41 |     load a json object from a file
42 |     """
43 |     with codecs.open(filename, "r", encoding=encoding) as f:
44 |         return json.load(f)
45 | 
46 | 
47 | def file2iter(filename, encoding='utf-8', comment_prefix="#",
48 |               skip_empty_line=True):
49 |     """
50 |     iterate the lines of a text/json-stream file, skipping empty lines and comments
51 |     """
52 | 
53 | 
54 |     with codecs.open(filename, encoding=encoding) as f:
55 |         for line in f:
56 |             line = line.strip()
57 |             # skip empty line
58 |             if skip_empty_line and len(line) == 0:
59 |                 continue
60 | 
61 |             # skip comment line
62 |             if comment_prefix and line.startswith(comment_prefix):
63 |                 continue
64 | 
65 |             yield line
66 | 
67 | 
68 | ####################################
69 | # write to file
70 | 
71 | def json2file(data, filename, encoding='utf-8'):
72 |     """
73 |     write json in canonical json format
74 |     """
75 |     with codecs.open(filename, "w", encoding=encoding) as f:
76 |         json.dump(data, f, ensure_ascii=False, indent=4, sort_keys=True)
77 | 
78 | 
79 | def lines2file(lines, filename, encoding='utf-8'):
80 |     """
81 |     write an iterable of lines (e.g. a json stream) to file, one line each
82 |     """
83 |     with codecs.open(filename, "w", encoding=encoding) as f:
84 |         for line in lines:
85 |             f.write(line)
86 |             f.write("\n")
87 | 
88 | 
89 | def items2file(items, filename, encoding='utf-8', modifier='w'):
90 |     """
91 |     write a json array to file, one canonical json object per line
92 |     """
93 |     with codecs.open(filename, modifier, encoding=encoding) as f:
94 |         for item in items:
95 |             f.write(u"{}\n".format(json.dumps(
96 |                 item, ensure_ascii=False, sort_keys=True)))
97 | 
98 | 
99 | ####################################
100 | # json data access
101 | 
102 | def json_get(json_object, property_path, 
default=None):
103 |     """
104 |     get the value at property_path from a json object; property_path is a list of fields, e.g. ["person", "father", "name"]
105 |     * return None if the path is invalid (an intermediate value is not a dict)
106 |     * return default if the path is valid but the final field is missing
107 |     """
108 |     temp = json_object
109 |     for field in property_path[:-1]:
110 |         if not isinstance(temp, dict):
111 |             return None
112 |         temp = temp.get(field, {})
113 |     if not isinstance(temp, dict):
114 |         return None
115 |     return temp.get(property_path[-1], default)
116 | 
117 | 
118 | def json_get_list(json_object, p):
119 |     v = json_object.get(p, [])
120 |     if isinstance(v, list):
121 |         return v
122 |     else:
123 |         return [v]
124 | 
125 | 
126 | def json_get_first_item(json_object, p, defaultValue=''):
127 |     # return defaultValue (an empty string by default) if the item does not exist
128 |     v = json_object.get(p, [])
129 |     if isinstance(v, list):
130 |         if len(v) > 0:
131 |             return v[0]
132 |         else:
133 |             return defaultValue
134 |     else:
135 |         return v
136 | 
137 | 
138 | def json_dict_copy(json_object, property_list, defaultValue=None):
139 |     """
140 |     property_list = [
141 |         { "name":"name", "alternateName": ["name","title"]},
142 |         { "name":"birthDate", "alternateName": ["dob","dateOfBirth"] },
143 |         { "name":"description" }
144 |     ]
145 |     """
146 |     ret = {}
147 |     for prop in property_list:
148 |         p_name = prop["name"]
149 |         for alias in prop.get("alternateName", []):
150 |             if json_object.get(alias) is not None:
151 |                 ret[p_name] = json_object.get(alias)
152 |                 break
153 |         if p_name not in ret:
154 |             if p_name in json_object:
155 |                 ret[p_name] = json_object[p_name]
156 |             elif defaultValue is not None:
157 |                 ret[p_name] = defaultValue
158 | 
159 |     return ret
160 | 
161 | def json_append(obj, p, v):  # append v to the list at obj[p]; no-op if obj[p] exists but is not a list
162 |     vlist = obj.get(p, [])
163 |     if not isinstance(vlist, list):
164 |         return
165 | 
166 |     if vlist:
167 |         vlist.append(v)
168 |     else:
169 |         obj[p] = [v]
170 | 
171 | ####################################
172 | # data conversion
173 | 
174 | 
175 | def any2utf8(data):
176 |     """
177 |     recursively rewrite json object values (unicode) into utf-8 encoded strings
178 |     """
179 |     if isinstance(data, dict):
180 |         ret = {}
181 |         for k, v in data.items():
182 |             k = any2utf8(k)
183 |             ret[k] = any2utf8(v)
184 |         return ret
185 |     elif isinstance(data, list):
186 |         return [any2utf8(x) for x in data]
187 |     elif isinstance(data, unicode):
188 |         return data.encode("utf-8")
189 |     elif type(data) in [str, basestring]:
190 |         return data
191 |     elif type(data) in [int, float]:
192 |         return data
193 |     else:
194 |         logging.error("unexpected {} {}".format(type(data), data))
195 |         return data
196 | 
197 | 
198 | def any2unicode(data):
199 |     """
200 |     recursively rewrite json object values (assumed utf-8) into unicode
201 |     """
202 |     if isinstance(data, dict):
203 |         ret = {}
204 |         for k, v in data.items():
205 |             k = any2unicode(k)
206 |             ret[k] = any2unicode(v)
207 |         return ret
208 |     elif isinstance(data, list):
209 |         return [any2unicode(x) for x in data]
210 |     elif isinstance(data, unicode):
211 |         return data
212 |     elif type(data) in [str, basestring]:
213 |         return data.decode("utf-8")
214 |     elif type(data) in [int, float]:
215 |         return data
216 |     else:
217 |         logging.error("unexpected {} {}".format(type(data), data))
218 |         return data
219 | 
220 | 
221 | def any2sha1(text):
222 |     """
223 |     convert a string into its sha1 hash. For a json object/array, first convert
224 |     it into a canonical json string. 
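    for example, the following holds because json objects are canonicalized
    with sort_keys before hashing:
        any2sha1({"a": 1, "b": 2}) == any2sha1({"b": 2, "a": 1})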
225 |     """
226 |     # canonicalize json object or json array
227 |     if type(text) in [dict, list]:
228 |         text = json.dumps(text, sort_keys=True)
229 | 
230 |     # encode unicode text as utf-8 bytes before hashing
231 |     if isinstance(text, unicode):
232 |         text = text.encode('utf-8')
233 | 
234 |     return hashlib.sha1(text).hexdigest()
235 | 
236 | 
237 | ####################################
238 | # file statistics
239 | 
240 | def stat(items, unique_fields, value_fields=[], printCounter=True):
241 |     counter = collections.Counter()
242 |     unique_counter = collections.defaultdict(list)
243 | 
244 |     for item in items:
245 |         counter["all"] += 1
246 |         for field in unique_fields:
247 |             if item.get(field):
248 |                 unique_counter[field].append(item[field])
249 |         for field in value_fields:
250 |             value = item.get(field)
251 |             if value is None:
252 |                 continue
253 |             elif type(value) in [float, int]:
254 |                 vx = "%1.0d" % value
255 |             else:
256 |                 vx = value
257 |             if len(vx) > 0:
258 |                 counter[u"{}_{}".format(field, value)] += 1
259 |     for field in unique_fields:
260 |         counter[u"{}_unique".format(field)] = len(set(unique_counter[field]))
261 |         counter[u"{}_nonempty".format(field)] = len(unique_counter[field])
262 | 
263 |     if printCounter:
264 |         logging.info(json.dumps(counter, ensure_ascii=False,
265 |                                 indent=4, sort_keys=True))
266 | 
267 |     return counter
268 | 
269 | def stat_jsonld(data, key=None, counter=None):
270 |     """
271 |     provide statistics for jsonld; right now it only counts triples
272 |     see also https://json-ld.org/playground/
273 |     note: the attributes @id and @context do not contribute any triple
274 |     """
275 |     if counter is None:
276 |         counter = collections.Counter()
277 | 
278 |     if isinstance(data, dict):
279 | 
280 |         for k, v in data.items():
281 |             stat_jsonld(v, k, counter)
282 |             counter[u"p_{}".format(k)] += 0
283 |         if key:
284 |             counter["triple"] += 1
285 |             counter[u"p_{}".format(key)] += 1
286 |     elif isinstance(data, list):
287 |         [stat_jsonld(x, key, counter) for x in data]
288 |         if key in ["tag"]:
289 |             for x in data:
290 |                 if isinstance(x, dict) and x.get("name"):
291 |                     counter[u"{}_{}".format(key, x["name"])] += 1
292 |                 elif type(x) in [basestring, unicode]:
293 |                     counter[u"{}_{}".format(key, x)] += 1
294 | 
295 |     else:
296 |         if key and key not in ["@id", "@context"]:
297 |             counter["triple"] += 1
298 |             counter[u"p_{}".format(key)] += 1
299 | 
300 |     return counter
301 | 
--------------------------------------------------------------------------------
/cdata/entity.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # Author: Li Ding
4 | 
5 | # utility stuff
6 | 
7 | # base packages
8 | import os
9 | import sys
10 | import json
11 | import logging
12 | import codecs
13 | import hashlib
14 | import datetime
15 | 
16 | import time
17 | import re
18 | import collections
19 | 
20 | import jieba
21 | from core import any2unicode, stat
22 | from misc import main_subtask
23 | 
24 | 
25 | class SimpleEntity():
26 |     def __init__(self, entity_list):
27 |         """
28 |         [{"@id":"1","name":"张三"},{"@id":"2","name":"李四"}]
29 |         all input text is assumed to be (or will be converted into) unicode
30 |         """
31 |         # init entity index
32 |         self.entities = collections.defaultdict(list)
33 |         entity_list_unicode = []
34 |         for entity in entity_list:
35 |             entity_list_unicode.append(any2unicode(entity))
36 | 
37 |         for entity in entity_list_unicode:
38 |             name = entity["name"]
39 |             self.entities[name].append(entity)
40 | 
41 |         for entity in entity_list_unicode:
42 |             for name in 
entity.get("alternateName", []): 43 | self.entities[name].append(entity) 44 | 45 | stat(entity_list_unicode, ["name"]) 46 | 47 | # init jieba 48 | self.tokenizer = jieba.Tokenizer() 49 | for name in self.entities: 50 | self.tokenizer.add_word(name) 51 | 52 | def ner(self, sentence): 53 | # normalize to unicode 54 | sentence = any2unicode(sentence) 55 | 56 | # split 57 | segments = self.tokenizer.cut(sentence, HMM=False) 58 | 59 | # generate output 60 | word_index = 0 61 | ret = [] 62 | for segment in segments: 63 | logging.debug(segment) 64 | 65 | matched_entities = self.entities.get(unicode(segment)) 66 | if matched_entities: 67 | temp = {"text": segment, 68 | "index": word_index, 69 | "entities": matched_entities} 70 | ret.append(temp) 71 | word_index += len(segment) 72 | return ret 73 | 74 | # 提取文本列表中主要的实体 75 | def get_primary_entity(self, text_list, threshold=0.24): 76 | if not text_list: 77 | return [] 78 | 79 | # 统计各个实体在每个文本中出现的频率 80 | counter_list = [] 81 | for sentence in text_list: 82 | ret = self.ner(sentence) 83 | if ret: 84 | counter = collections.Counter() 85 | length = len(ret) 86 | for entity in ret: 87 | counter[entity["text"]] += 1.0 / length 88 | counter_list.append(counter) 89 | 90 | # 各个文本中同一实体的频率相加,归一化处理 91 | sum_counter = collections.Counter() 92 | for counter in counter_list: 93 | for name in counter: 94 | sum_counter[name] += counter[name] / len(text_list) 95 | 96 | result_entity_list = [] 97 | sorted_counter = sum_counter.most_common() # 按照分数从大到小排序 98 | for name, score in sorted_counter: 99 | if score >= threshold: 100 | tmp = { 101 | "text": name, 102 | "score": score, 103 | "entity": self.entities[name] 104 | } 105 | result_entity_list.append(tmp) 106 | else: 107 | break 108 | return result_entity_list 109 | 110 | 111 | def task_ner_test(args=None): 112 | entity_list = [{"@id": "1", "name": "张三"}, {"@id": "2", "name": "李四"}] 113 | ner = SimpleEntity(entity_list) 114 | sentence = "张三给了李四一个苹果" 115 | ret = ner.ner(sentence) 116 | logging.info(json.dumps(ret, ensure_ascii=False, indent=4)) 117 | 118 | sentence = "张三丰给了李四一个苹果" 119 | ret = ner.ner(sentence) 120 | logging.info(json.dumps(ret, ensure_ascii=False, indent=4)) 121 | 122 | sentence_list = ["张三给了李四一个苹果","王五给了李四一个橘子"] 123 | primary_entity = ner.get_primary_entity(sentence_list) 124 | logging.info(json.dumps(primary_entity, ensure_ascii=False, indent=4)) 125 | 126 | 127 | if __name__ == "__main__": 128 | logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.DEBUG) # noqa 129 | 130 | main_subtask(__name__) 131 | 132 | """ 133 | python cdata/entity.py task_ner_test 134 | """ 135 | -------------------------------------------------------------------------------- /cdata/misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Li Ding 4 | 5 | # utility stuff 6 | 7 | # base packages 8 | import os 9 | import sys 10 | import json 11 | import logging 12 | import codecs 13 | import hashlib 14 | import datetime 15 | import logging 16 | import time 17 | import argparse 18 | import urlparse 19 | import re 20 | import collections 21 | 22 | 23 | #################################################### 24 | def main_subtask(module_name, method_prefixs=["task_"], optional_params={}): 25 | """ 26 | http://stackoverflow.com/questions/3217673/why-use-argparse-rather-than-optparse 27 | As of 2.7, optparse is deprecated, and will hopefully go away in the future 28 | """ 
29 | parser = argparse.ArgumentParser(description="") 30 | parser.add_argument('method_name', help='') 31 | for optional_param_key, optional_param_help in optional_params.items(): 32 | parser.add_argument(optional_param_key, 33 | required=False, 34 | help=optional_param_help) 35 | # parser.add_argument('--reset_cache', required=False, help='') 36 | args = parser.parse_args() 37 | 38 | for prefix in method_prefixs: 39 | if args.method_name.startswith(prefix): 40 | if prefix == "test_": 41 | # Remove all handlers associated with the root logger object. 42 | for handler in logging.root.handlers[:]: 43 | logging.root.removeHandler(handler) 44 | 45 | # Reconfigure logging again, this time with a file. 46 | logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.DEBUG) # noqa 47 | 48 | # http://stackoverflow.com/questions/17734618/dynamic-method-call-in-python-2-7-using-strings-of-method-names 49 | the_method = getattr(sys.modules[module_name], args.method_name) 50 | if the_method: 51 | the_method(args=vars(args)) 52 | 53 | logging.info("done") 54 | return 55 | else: 56 | break 57 | 58 | logging.info("unsupported") 59 | 60 | 61 | def task_subtask(args): 62 | print "called task_subtask" 63 | 64 | 65 | if __name__ == "__main__": 66 | logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.DEBUG) # noqa 67 | logging.getLogger("requests").setLevel(logging.WARNING) 68 | 69 | main_subtask(__name__) 70 | 71 | 72 | """ 73 | python misc.py task_subtask 74 | 75 | """ 76 | -------------------------------------------------------------------------------- /cdata/region.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Li Ding 4 | 5 | import os 6 | import sys 7 | import json 8 | import logging 9 | import codecs 10 | import hashlib 11 | import datetime 12 | import logging 13 | import time 14 | import argparse 15 | import urlparse 16 | import re 17 | import collections 18 | 19 | from core import * 20 | from table import * 21 | from misc import main_subtask 22 | import jieba 23 | 24 | LIST_NATIONAL = [ 25 | u'壮族', 26 | u'满族', 27 | u'回族', 28 | u'苗族', 29 | u'维吾尔族', 30 | u'土家族', 31 | u'彝族', 32 | u'蒙古族', 33 | u'藏族', 34 | u'布依族', 35 | u'侗族', 36 | u'瑶族', 37 | u'朝鲜族', 38 | u'白族', 39 | u'哈尼族', 40 | u'哈萨克族', 41 | u'黎族', 42 | u'傣族', 43 | u'畲族', 44 | u'傈僳族', 45 | u'仡佬族', 46 | u'东乡族', 47 | u'高山族', 48 | u'拉祜族', 49 | u'水族', 50 | u'佤族', 51 | u'纳西族', 52 | u'羌族', 53 | u'土族', 54 | u'仫佬族', 55 | u'锡伯族', 56 | u'柯尔克孜族', 57 | u'达斡尔族', 58 | u'景颇族', 59 | u'毛南族', 60 | u'撒拉族', 61 | u'布朗族', 62 | u'塔吉克族', 63 | u'阿昌族', 64 | u'普米族', 65 | u'鄂温克族', 66 | u'怒族', 67 | u'京族', 68 | u'基诺族', 69 | u'德昂族', 70 | u'保安族', 71 | u'俄罗斯族', 72 | u'裕固族', 73 | u'乌孜别克族', 74 | u'门巴族', 75 | u'鄂伦春族', 76 | u'独龙族', 77 | u'塔塔尔族', 78 | u'赫哲族', 79 | u'珞巴族', 80 | u'各族' 81 | ] 82 | 83 | PATTERN_NATIONAL = u'({})'.format(u'|'.join(LIST_NATIONAL)) 84 | PATTERN_NATIONAL2 = u'({})'.format(u'?|'.join( 85 | [x for x in LIST_NATIONAL if len(x) > 2])) 86 | 87 | SPECIAL_ADDRESS_NAME = [ 88 | # 县 89 | u"葵潭", 90 | u"靖海", 91 | u"隆江", 92 | u"城月", # ["城月镇建新路49号", "城月药店第二门市部"] 93 | u"黄略", # ["黄略镇南亭圩三角路", "黄略药店5南亭门市部"] 94 | u"杨柑", # ["杨柑镇豆坡大石牛", "杨柑药店大石牛门市部"] 95 | u"黄岭", # ["杨柑镇豆坡大石牛", "杨柑药店大石牛门市部"] 96 | u"神泉", # ["神泉镇新观路八号", "神泉汉龙药店"] 97 | 98 | # 区 99 | u"新林", # ["新林区翠岗镇", "新林区旭东药店"] 100 | u"加格达奇", # ["加格达奇区曙光康庄小区18#楼车库1-15号", "加格达奇一正中西药店"] 101 | # 102 | 
u"拱北", # ["拱北夏湾中珠新村二期6号商铺", "守仁药店"] 103 | u"守仁", 104 | u"平沙", # ["平沙所平沙前进分场", "平沙前进药店"] 105 | u"三灶", # ["三灶所珠海机场海澄市场", "海澄健恒药店"] 106 | u"海澄", # ["三灶所珠海机场海澄市场", "海澄健恒药店"] 107 | u"大亚湾", # ["大亚湾霞涌市场", "大亚湾霞涌方方药店"] 108 | u"乌塘", # ["乌塘圩", "城月药店乌塘第一门市部"] 109 | 110 | 111 | # bad case 112 | u"大参林", # ["", "大参林医药集团股份有限公司第六百零九分店"] 113 | u"龙归", # ["", "龙归利农药店"] 114 | u"光明托老", # ["光明托老中心综合楼商服0113号", "加格达奇区寅河大药房"] 115 | u"云管端互联网", # ["", "云管端互联网软件有限公司"] 116 | u"康美健康云服务有限公司", # ["", "康美健康云服务有限公司"] 117 | ] 118 | 119 | 120 | def is_special_address(xinput): 121 | if type(xinput) == list: 122 | for addr in xinput: 123 | if not is_special_address(addr): 124 | return False 125 | return True 126 | else: 127 | if not xinput: 128 | return True 129 | 130 | regex = ur"[服装店药店药房医院集团有限公司股份有限责任科技第一门市分店总店]+$" 131 | temp = re.sub(regex, "", xinput) 132 | if len(temp) <= 3 and not re.search(ur"[圩省市县镇]", temp): 133 | logging.warn(u"skip {} => {}".format(xinput, temp)) 134 | return True 135 | 136 | m = re.search(ur"^(.{2,6}[镇区])", xinput) 137 | if m: 138 | logging.debug(m.group(1)) 139 | return True 140 | 141 | for name in SPECIAL_ADDRESS_NAME: 142 | if xinput.startswith(name): 143 | return True 144 | 145 | return False 146 | 147 | 148 | def normalize_national(name): 149 | temp = name 150 | temp = re.sub(u'东乡族自治县', u'东乡县', temp) 151 | temp = re.sub(PATTERN_NATIONAL2, '', temp) 152 | temp = re.sub(PATTERN_NATIONAL, '', temp) 153 | 154 | if len(temp) == 1: 155 | return name 156 | else: 157 | return temp 158 | 159 | 160 | def normalize_misspell(name): 161 | name = name.replace(u"恵", u"惠") 162 | return name 163 | 164 | 165 | def normalize_province(name): 166 | name_norm = name 167 | 168 | name_norm = normalize_national(name_norm) 169 | name_norm = re.sub(ur'(自治区|特别行政区)', '', name_norm) 170 | 171 | if name_norm == u'内': 172 | name_norm = u'内蒙古' 173 | 174 | if name_norm == u'内蒙': 175 | name_norm = u'内蒙古' 176 | 177 | name_compact = name_norm 178 | if len(name_compact) > 2: 179 | name_compact = re.sub(ur'(省|市)$', '', name_compact) 180 | 181 | return [name_norm, name_compact] 182 | 183 | 184 | def normalize_city(name): 185 | name_norm = name 186 | 187 | name_norm = normalize_national(name_norm) 188 | name_norm = re.sub(ur'^(市辖区|自治区直辖县级行政区划|省直辖县级行政区划|自治旗|自治州|矿区|县|区)$', '', name_norm) # noqa 189 | name_norm = re.sub(ur'自治', '', name_norm) 190 | 191 | name_compact = name_norm 192 | if len(name_compact) > 2: 193 | name_compact = re.sub(ur'(州|地区|市)$', '', name_compact) 194 | 195 | ret = [name_norm, name_compact] 196 | if name == u"哈尔滨市": 197 | ret.append(u"哈尔傧") 198 | 199 | if re.search(ur"市$", name): 200 | ret.append(re.sub(ur"市$", ur"市区", name)) 201 | 202 | return ret 203 | 204 | 205 | def normalize_district(name): 206 | name_norm = name 207 | 208 | name_norm = normalize_national(name_norm) 209 | 210 | # logging.info(len(name_norm)) 211 | 212 | name_norm = re.sub(ur'(市辖区)', '', name_norm) 213 | 214 | if len(name_norm) > 3: 215 | name_norm = re.sub(ur'(自治|郊区|城区)', '', name_norm) 216 | 217 | name_compact = name_norm 218 | if len(name_compact) > 3: 219 | name_compact = re.sub(ur'(新区|林区|矿区)$', '', name_compact) 220 | if len(name_compact) > 2: 221 | name_compact = re.sub(ur'(区|县|市)$', '', name_compact) 222 | # if name.startswith(u"富拉尔基"): 223 | # logging.info( name_compact ) 224 | 225 | ret = [name_norm, name_compact] 226 | # if name == u"增城区": 227 | # ret.append(u"增城市") 228 | 229 | if name_norm == name and re.search(ur"区$", name): 230 | ret.append(name_norm.replace(u"区", u"县")) 231 | ret.append(name_norm.replace(u"区", u"市")) 232 | 
ret.append(re.sub(ur"区$", u"县", name)) 233 | ret.append(re.sub(ur"县$", u"区", name)) 234 | 235 | ret.append(normalize_misspell(name_norm)) 236 | 237 | return ret 238 | 239 | 240 | def normalize_address(address, province, city, district): 241 | assert address is not None 242 | 243 | if not type(address) == unicode: 244 | address = address.decode("utf-8") 245 | 246 | region_list = [] 247 | if province: 248 | if not type(province) == unicode: 249 | province = province.decode("utf-8").strip() 250 | region_list.append(province) 251 | region_list.extend(normalize_province(province)) 252 | 253 | if city: 254 | if not type(city) == unicode: 255 | city = city.decode("utf-8") 256 | region_list.append(city) 257 | region_list.extend(normalize_city(city)) 258 | 259 | if district: 260 | if not type(district) == unicode: 261 | district = district.decode("utf-8") 262 | region_list.append(district) 263 | region_list.extend(normalize_district(district)) 264 | 265 | ret = { 266 | "address": address, 267 | "addressNorm": address, 268 | "province": province, 269 | "city": city, 270 | "district": district, 271 | } 272 | 273 | region_list = sorted(list(set(region_list)), reverse=True) 274 | region_list.append(u"区") 275 | # logging.info(json.dumps(region_list, ensure_ascii=False)) 276 | # logging.info(json.dumps(ret,ensure_ascii=False)) 277 | 278 | if region_list: 279 | regex = u"^({})+".format(u"|".join(region_list)) 280 | # logging.info(regex) 281 | ret["addressNorm"] = re.sub(regex, "", ret["addressNorm"]).strip() 282 | 283 | return ret 284 | 285 | 286 | class RegionEntity(): 287 | def _get_list_province_unique(self, list_cityid): 288 | cancityidates = set() 289 | for cityid in list_cityid: 290 | cancityidates.add(self.data['items'][cityid]['province']) 291 | if len(cancityidates) == 1: 292 | [pnorm, pcompact] = normalize_province(list(cancityidates)[0]) 293 | # print pcompact 294 | return pcompact 295 | 296 | def __init__(self, strict_mode=True): 297 | data = file2json(file2abspath('region_data.json', __file__)) 298 | counter = collections.Counter() 299 | self.strict_mode = strict_mode 300 | 301 | self.data = { 302 | 'items': {}, # 原始数据,基于cityid(多种指代,可以市省,市,区县级别) 303 | 304 | # 基于别名的索引, NER使用 305 | 'province': {}, # 无重名 306 | 'city': {}, # 有重名 307 | 'district': {}, # 有重名 308 | 309 | 310 | 'alias': {}, # 别名索引 311 | 312 | 'lookup': collections.defaultdict(set), 313 | } 314 | 315 | # copy data 316 | for item in data: 317 | self.data['items'][item['cityid']] = item 318 | 319 | # process province 320 | map_province = collections.defaultdict(set) 321 | for item in data: 322 | p = item.get('province') 323 | c = item.get('city') 324 | d = item.get('district') 325 | if p and not c: 326 | # cityid 为省的ID 327 | item["type"] = "province" 328 | item["name"] = p 329 | map_province[p].add(item['cityid']) 330 | assert 34 == len(map_province), len(map_province) 331 | # logging.info(json.dumps(list(map_province.keys()), ensure_ascii=False)) 332 | 333 | for p in sorted(list(map_province)): 334 | alias_list = normalize_province(p) 335 | pnorm = alias_list[1] 336 | self.data['province'][p] = { 337 | 'province': self._get_list_province_unique(map_province[p]), 338 | 'cityid_list': list(map_province[p]), 339 | 'alias': [p] + alias_list} 340 | map_province[p] = pnorm 341 | if pnorm.startswith(u'安徽'): 342 | logging.info(json.dumps(alias_list)) 343 | # print json.dumps(list(set([p,pnorm,pnorm2])),ensure_ascii=False) 344 | 345 | # process city 346 | map_city = collections.defaultdict(set) 347 | for item in data: 348 | c = item.get('city') 
349 | d = item.get('district') 350 | if c in [u"市辖区", u"县", u"省直辖县级行政区划", u"自治区直辖县级行政区划"]: 351 | continue 352 | """ 353 | { 354 | "city": "市辖区", 355 | "cityid": "310105", 356 | "district": "长宁区", 357 | "province": "上海市" 358 | } 359 | 360 | { 361 | "city": "南通市", 362 | "cityid": "320601", 363 | "district": "市辖区", 364 | "province": "江苏省" 365 | }, 366 | """ 367 | 368 | if c and not d: 369 | item["type"] = "city" 370 | item["name"] = c 371 | map_city[c].add(item['cityid']) 372 | if len(map_city[c]) != 1: 373 | logging.error(json.dumps(item, ensure_ascii=False)) 374 | logging.error(len(map_city[c])) 375 | assert len(map_city[c]) == 1 376 | 377 | assert 333 == len(map_city), len(map_city) 378 | # logging.info(json.dumps(list(map_city.keys()), ensure_ascii=False)) 379 | 380 | for p in sorted(list(map_city)): 381 | alias_list = normalize_city(p) 382 | assert pnorm 383 | # print p, '-->',pnorm, '-->', pcompact 384 | self.data['city'][p] = { 385 | 'province': self._get_list_province_unique(map_city[p]), 386 | 'cityid_list': list(map_city[p]), 387 | 'alias': [p] + alias_list} 388 | assert len(map_city) == len(self.data['city']), len(self.data['city']) 389 | 390 | # process district 391 | map_district = collections.defaultdict(set) 392 | for item in data: 393 | d = item.get('district') 394 | if d in [u"市辖区"]: 395 | # check above 市辖区 is used both as value of city and district 396 | # simply drop them since they already defined in city level 397 | continue 398 | 399 | if d: 400 | item["type"] = "district" 401 | item["name"] = d 402 | map_district[d].add(item['cityid']) 403 | assert 2821 == len(map_district), len(map_district) 404 | 405 | for p in sorted(list(map_district)): 406 | alias_list = normalize_district(p) 407 | # print p, '-->',pnorm, '-->', pcompact 408 | cityid_list = list(map_district[p]) 409 | if len(cityid_list) > 1: 410 | # logging.info( len(cityid_list) ) 411 | # logging.info( p ) 412 | pass 413 | 414 | self.data['district'][p] = { 415 | 'province': self._get_list_province_unique(map_district[p]), 416 | 'cityid_list': cityid_list, 417 | 'alias': [p] + alias_list} 418 | 419 | # process duplicated name 别名索引 420 | for index in ['province', 'city', 'district']: 421 | for name, data in self.data[index].items(): 422 | for alias in set(data['alias']): 423 | # if alias.startswith(u"清"): 424 | # logging.info(alias) 425 | self.data['lookup'][alias].update(data['cityid_list']) 426 | 427 | for alias, alias_cityid_list in self.data['lookup'].items(): 428 | alias_cityid_list_unique = set(alias_cityid_list) 429 | if len(alias_cityid_list_unique) > 1: 430 | # logging.debug(u"{} {}".format(alias, len(alias_cityid_list_unique))) 431 | # print alias 432 | for code in alias_cityid_list_unique: 433 | # print json.dumps(self.data['items'][code], ensure_ascii=False) 434 | pass 435 | 436 | # 有唯一省的地点名, 歧义地点名不管 437 | for alias, alias_cityid_list in self.data['lookup'].items(): 438 | alias_cityid_list_unique = set(alias_cityid_list) 439 | province = self._get_list_province_unique(alias_cityid_list_unique) 440 | if province: 441 | self.data['alias'][alias] = province 442 | 443 | # with codecs.open(getTheFile('libcity_cn.new.json'),'w',encoding='utf-8') as f: 444 | # json.dump(self.data, f,ensure_ascii=False, indent=4) 445 | # 统计 446 | for index in self.data: 447 | counter[index] = len(self.data[index]) 448 | 449 | # validation 450 | for alias, entities in self.data['lookup'].items(): 451 | if len(alias) == 1: 452 | logging.error(json.dumps( 453 | entities, ensure_ascii=False, indent=4, sort_keys=True)) 454 | if 
self.strict_mode: 455 | exit() 456 | 457 | if alias in [u'自治']: 458 | logging.error(json.dumps( 459 | entities, ensure_ascii=False, indent=4, sort_keys=True)) 460 | if self.strict_mode: 461 | exit() 462 | 463 | if len(entities) > 1: 464 | counter["one-alias-many-entities"] += 1 465 | # logging.info(u"{}[{}] {}".format(alias, len(entities), u",".join([x["name"]+x["type"] for x in entities]))) 466 | 467 | # prepare for NER 468 | for index in ['province', 'city', 'district']: 469 | for name, data in self.data[index].items(): 470 | for alias in set(data['alias']): 471 | if re.search(ur"[省市县]$", alias): 472 | jieba.add_word(alias, 10000000) 473 | elif re.search(ur"[区]$", alias): 474 | jieba.add_word(alias, 1000000) 475 | else: 476 | jieba.add_word(alias, 100000) 477 | 478 | for suffix in u"路镇乡圩河区村": 479 | jieba.add_word(u"{}{}".format( 480 | alias, suffix), 1000000) 481 | 482 | names = file2iter(file2abspath('region_dict.txt', __file__)) 483 | for name in names: 484 | jieba.add_word(name.strip(), 1) 485 | 486 | # jieba.del_word(u"广州药业") 487 | 488 | def normalize_region_name(self, name, xtype): 489 | if not hasattr(self, "normalizeRegion_mapped"): 490 | setattr(self, "normalizeRegion_mapped", collections.Counter()) 491 | mapped = getattr(self, "normalizeRegion_mapped") 492 | 493 | if len(name) > 2: 494 | name = re.sub(u"[省市]+$", "", name) 495 | 496 | if name in [u"市辖区"]: 497 | return name 498 | 499 | if name in ["", u"省市"]: 500 | return "" 501 | 502 | # rewrite 503 | if name in [u"内蒙", u"蒙古"]: 504 | name = u"内蒙古" 505 | 506 | cityid_list = self.data["lookup"].get(name) 507 | if not cityid_list: 508 | logging.error("cannot find reigion name") 509 | logging.error(name) 510 | logging.error(xtype) 511 | if self.strict_mode: 512 | exit(0) 513 | 514 | matched = [] 515 | for cityid in cityid_list: 516 | item = self.data["items"][cityid] 517 | if item["type"] == xtype: 518 | matched.append(item) 519 | 520 | for item in matched: 521 | if item["name"] == name: 522 | return name 523 | 524 | for item in matched: 525 | if item["name"] != name: 526 | msg = u"normalized {} ->{}".format(name, item["name"]) 527 | if msg not in mapped: 528 | mapped[msg] += 1 529 | logging.info(msg) 530 | return item["name"] 531 | 532 | def guess_province(self, addresses): 533 | for address in addresses: 534 | if not address: 535 | continue 536 | 537 | if address.startswith(u"内蒙"): 538 | return u"内蒙古" 539 | 540 | for index in ['province', 'city']: 541 | for name in self.data[index]: 542 | for alias in set(self.data[index][name]['alias']): 543 | if address.startswith(alias): 544 | # print address, '-->', name, self.data[index][name]['province'] 545 | return self.data[index][name].get('province') 546 | 547 | for index in ['province', 'city']: 548 | for name in self.data[index]: 549 | for alias in set(self.data[index][name]['alias']): 550 | if re.search(ur'({})'.format(alias), address): 551 | # print address, '-->', name, self.data[index][name]['province'] 552 | return self.data[index][name].get('province') 553 | 554 | for alias in self.data['alias']: 555 | if address.startswith(alias): 556 | return self.data['alias'][alias] 557 | if re.search(ur'({})'.format(alias), address): 558 | return self.data['alias'][alias] 559 | 560 | print 'guess_province failed', json.dumps(addresses, ensure_ascii=False) 561 | return u"" 562 | 563 | def guess_all(self, addresses): 564 | 565 | # 解析实体 NER 566 | matched_alias = [] 567 | candidates_name_weight = collections.Counter() 568 | matched_alias_cityid_list = {} 569 | 570 | visited_seg = [] 571 | for 
address in addresses: 572 | if not type(address) == unicode: 573 | address = address.decode("utf-8") 574 | 575 | # skip shot name 576 | if len(address) < 3: 577 | continue 578 | 579 | # skip name without blacklist 580 | regex = ur"^[^省市县]{2,3}([庄村镇乡])" 581 | if re.search(regex, address): 582 | # logging.info(u"skip村镇乡 {}".format(address)) 583 | continue 584 | 585 | regex = ur"^[^省市县]{2,5}([街路巷弄组]|大道|花园|市场)" 586 | if re.search(regex, address): 587 | # logging.info(u"skip村镇乡 {}".format(address)) 588 | continue 589 | 590 | seg_list = list(jieba.cut(address, cut_all=False, HMM=False)) 591 | logging.debug("Full Mode: " + "/ ".join(seg_list)) 592 | 593 | # merge the first two seg if their combined into an alias 594 | if len(seg_list) > 1: 595 | # 清/ 新县/ 太和镇/ 滨江路/ 东/ 三/ 街/ 13/ 号/ 首层/ 5/ 号/ 铺 596 | # 恵/ 东县/ 大岭/ 镇/ 新园/ 路/ 145/ 号 597 | temp = u"{}{}".format(seg_list[0], seg_list[1]) 598 | # logging.info(temp) 599 | temp = normalize_misspell(temp) 600 | if self.data["lookup"].get(temp): 601 | logging.info(temp) 602 | temp_list = [temp] 603 | temp_list.extend(seg_list[2:]) 604 | seg_list = temp_list 605 | 606 | logging.debug("After Merge: " + "/ ".join(seg_list)) 607 | 608 | # 中山/ 市南区/ 寮/ 后/ 村/ 龙子/ 街/ 14/ 号 609 | if re.search(ur"^[省市县]", seg_list[1]): 610 | temp = u"{}{}".format(seg_list[0], seg_list[1][0]) 611 | # logging.info(temp) 612 | if self.data["lookup"].get(temp): 613 | # logging.info(temp) 614 | temp_list = [temp, seg_list[1][1:]] 615 | temp_list.extend(seg_list[2:]) 616 | seg_list = temp_list 617 | 618 | logging.debug("After Merge: " + "/ ".join(seg_list)) 619 | 620 | is_continuous_match = True 621 | for idx, seg in enumerate(seg_list): 622 | logging.debug(seg) 623 | if seg in visited_seg: 624 | continue 625 | else: 626 | visited_seg.append(seg) 627 | 628 | cityid_list = self.data["lookup"].get(seg) 629 | logging.debug(cityid_list) 630 | 631 | # if idx > 0: 632 | # skip name without whitelist 633 | # regex = ur"(.{2,6}(自治)?[省市县]|^.{2,6}(自治)?[省市县区])" 634 | # if not re.search(regex, address): 635 | # logging.info(u"skip省市县区 {}".format(address)) 636 | # break 637 | 638 | if not cityid_list and len(seg) > 2 and idx == 0: 639 | temp = re.sub(u"(市|县|经济特区).?$", "", seg) 640 | cityid_list = self.data["lookup"].get(temp) 641 | # logging.info(temp) 642 | 643 | if cityid_list: 644 | # logging.info(seg) 645 | matched_alias.append(seg) 646 | 647 | matched_alias_cityid_list[seg] = cityid_list 648 | weight_default = 1.0 / len(cityid_list) 649 | for cityid in cityid_list: 650 | 651 | name = self.data["items"][cityid]["name"] 652 | candidates_name_weight[name] += weight_default 653 | 654 | # dirty hack 2017-04-01 655 | # add seg one more time if the address starts with it 656 | # a very strong indicator 657 | if is_continuous_match: 658 | candidates_name_weight[name] += weight_default 659 | if idx == 0: 660 | logging.debug("idx0") 661 | candidates_name_weight[name] += 2 * weight_default 662 | if re.search(ur"[省市县]$", seg): 663 | logging.debug("省市县") 664 | candidates_name_weight[name] += 2 * \ 665 | weight_default 666 | elif re.search(ur"[区]$", seg): 667 | logging.debug("区") 668 | candidates_name_weight[name] += 1 * \ 669 | weight_default 670 | 671 | if seg == name: 672 | pass 673 | elif seg[-1] == name[-1]: # kind of matched by alias 674 | pass 675 | elif seg in name: # kind of matched by alias 676 | logging.debug("seg is part of name") 677 | candidates_name_weight[name] /= 2 678 | else: 679 | logging.debug("seg is different from name ") 680 | candidates_name_weight[name] /= 4 681 | 682 | else: 683 | 
is_continuous_match = False 684 | if not re.search(ur"([县州]|公司)", address): 685 | break 686 | 687 | # logging.info(json.dumps(matched_alias, ensure_ascii=False)) 688 | 689 | # select the best entity (most specific, most fit) 690 | # 统计支持率 691 | best_entity = None 692 | best_match_score = 0 693 | # logging.info(json.dumps(candidates_name_weight, ensure_ascii=False)) 694 | for seg in matched_alias: 695 | for city_id in matched_alias_cityid_list[seg]: 696 | entity = self.data["items"][city_id] 697 | # logging.info(json.dumps(entity.values(), ensure_ascii=False)) 698 | match_score = sum([w for x, w in candidates_name_weight.items() if x in entity.values()]) 699 | logging.debug(match_score) 700 | logging.debug(json.dumps(entity, ensure_ascii=False)) 701 | if match_score > best_match_score: 702 | best_entity = entity 703 | best_match_score = match_score 704 | logging.debug(json.dumps(best_entity, ensure_ascii=False)) 705 | 706 | # print 'guess_province failed', json.dumps(addresses, ensure_ascii=False) 707 | # if best_entity: 708 | # if len(addresses) == 2: 709 | # msg = u"\t".join(any2unicode([ addresses[1], 710 | # addresses[0], 711 | # best_entity["province"], 712 | # best_entity.get("city", u""), 713 | # best_entity.get("district", u""), 714 | # best_entity["type"] 715 | # ])) 716 | # print msg 717 | return best_entity 718 | 719 | 720 | def task_guess_all(args=None): 721 | city_data = RegionEntity() 722 | # confused 723 | addresses = ["太平区红河七街区711栋6单元1楼1号", "哈尔滨人民同泰医药连锁店宏伟分店"] 724 | # missed 725 | addresses = ["龙江路之路村集资楼1号楼4号门市", "合作区德仁堂药店"] 726 | addresses = ["", "北京同仁堂广州药业连锁有限公司农林店"] 727 | addresses = ["北京海淀区阜成路52号(定慧寺)", "北京大学肿瘤医院"] 728 | addresses = ["水东镇东阳北街50号", "水东镇长安药店(已迁入三角所)"] 729 | addresses = ["保定市长城北大街头台村2109号门脸", "保定市莲池区中昊翔启蒙大药房"] 730 | 731 | result = city_data.guess_all(addresses) 732 | if result: 733 | logging.info(json.dumps(result, ensure_ascii=False)) 734 | logging.info(render_result(result, addresses[1], addresses[0])) 735 | 736 | 737 | def render_result(result, name=None, address=None): 738 | data = [name, 739 | address, 740 | result["province"], 741 | result.get("city", u""), 742 | result.get("district", u""), 743 | result["type"]] 744 | data = [x for x in data if x] 745 | msg = u"\t".join(any2unicode(data)) 746 | return msg 747 | 748 | 749 | def task_guess_all_batch(args): 750 | ner = RegionEntity() 751 | filename = "../tests/ex3-region-test.xls" 752 | filename = file2abspath(filename) 753 | excel_data = excel2json(filename, non_empty_col=-1) 754 | sheet_data = excel_data["data"].values()[0] 755 | sheet_fields = excel_data["fields"].values()[0] 756 | test_results = [] 757 | 758 | for item in sheet_data: 759 | addresses = [item["address"], item["name"]] 760 | addresses = [x for x in addresses if x] 761 | result = ner.guess_all(addresses) 762 | msg = u"\n============\nexpect{}\nfound{}".format( 763 | json.dumps(item, ensure_ascii=False), 764 | json.dumps(result, ensure_ascii=False)) 765 | logging.info(msg) 766 | 767 | one_result = {} 768 | test_results.append(one_result) 769 | 770 | # new entry 771 | if item["type"] == "" and result: 772 | logging.warn(render_result(result)) 773 | 774 | match_errors = [] 775 | if not result: 776 | result_type = "none" 777 | else: 778 | result_type = result["type"] 779 | 780 | if item["type"] != result_type: 781 | match_errors.append("type") 782 | one_result["type_diff"] = "{}->{}".format(item["type"], result_type) 783 | 784 | if result: 785 | if item["province"] != result["province"]: 786 | match_errors.append("province") 787 | 788 | if 
item["type"] in ["city", "district"]: 789 |                 if item["city"] != result.get("city", ""): 790 |                     match_errors.append("city") 791 | 792 |             if item["type"] in ["district"]: 793 |                 if item["district"] != result.get("district", ""): 794 |                     match_errors.append("district") 795 | 796 |         one_result["match_error_count"] = len(match_errors) 797 |         one_result["result_type"] = result_type 798 | 799 |     logging.info("accuracy = {} (rows with zero match errors / all rows)".format( 800 |         1.0 * len([x for x in test_results if x["match_error_count"] == 0])/len(test_results))) 801 |     stat(test_results, [], ["match_error_count", "type_diff", "result_type"]) 802 | 803 | 804 | if __name__ == "__main__": 805 |     logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.INFO)  # noqa 806 | 807 |     main_subtask(__name__) 808 | 809 | """ 810 | python cdata/region.py task_guess_all_batch 811 | 812 | python cdata/region.py task_guess_all 813 | """ 814 | -------------------------------------------------------------------------------- /cdata/region_dict.txt: -------------------------------------------------------------------------------- 1 | #药业 2 | 北京同仁堂 3 | 国药控股国大药房 4 | 北京神州汽车租赁有限公司 5 | 盘锦阳光大药房医药连锁有限公司 6 | 太仓市三庆医药连锁有限公司 7 | 桂林市春和堂医药连锁有限责任公司 8 | 湖北春天大药房连锁有限公司 9 | #哈尔滨人民同泰医药连锁店 10 | 湖北春天大药房 11 | -------------------------------------------------------------------------------- /cdata/summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: 4 | # summarize a paragraph or an entity into a short text description 5 | 6 | import os 7 | import sys 8 | import json 9 | import logging 10 | import codecs 11 | import hashlib 12 | import datetime 13 | import logging 14 | import time 15 | import re 16 | import collections 17 | 18 | from misc import main_subtask 19 | from core import * 20 | from table import * 21 | 22 | def summarize_paragraph_person(text): 23 |     pass 24 | 25 | def summarize_entity_person(person): 26 |     """ 27 |     assume the person entity uses the cnschema Person vocabulary, http://cnschema.org/Person 28 |     """ 29 |     ret = [] 30 |     value = person.get("name") 31 |     if not value: 32 |         return False 33 |     ret.append(value) 34 | 35 |     prop = "courtesyName" 36 |     value = json_get_first_item(person, prop) 37 |     if value == u"不详": 38 |         value = "" 39 |     if value: 40 |         ret.append(u'字{}'.format(value)) 41 | 42 |     value = person.get("alternateName") 43 |     if value: 44 |         #ret.append(u'别名{}'.format(value)) 45 |         # disabled: buggy, alternateName may be a list 46 |         pass 47 | 48 |     prop = "artName" 49 |     value = json_get_first_item(person, prop) 50 |     if value: 51 |         ret.append(u'号{}'.format(value)) 52 | 53 |     value = person.get("dynasty") 54 |     if value: 55 |         ret.append(u'{}人'.format(value)) 56 | 57 |     prop = "ancestralHome" 58 |     value = json_get_first_item(person, prop) 59 |     if value: 60 |         ret.append(u'祖籍{}'.format(value)) 61 | 62 |     birth_date = person.get("birthDate", "") 63 |     birth_place = person.get("birthPlace", "") 64 | 65 |     # Special case for unknown birth date 66 |     if birth_date == u"不详": 67 |         birth_date = "" 68 | 69 |     if birth_place: 70 |         ret.append(u'{}出生于{}'.format(birth_date, birth_place)) 71 |     elif birth_date: 72 |         ret.append(u'{}出生'.format(birth_date)) 73 | 74 |     prop = "nationality" 75 |     nationality = json_get_first_item(person, prop) 76 |     prop = "occupation" 77 |     occupation = json_get_first_item(person, prop) 78 |     if occupation: 79 |         if nationality: 80 |             ret.append(u'{}{}'.format(nationality, occupation)) 81 |         else: 82 |
ret.append(u'{}'.format(occupation)) 83 | elif nationality: 84 | ret.append(u'{}人'.format(nationality)) 85 | 86 | prop = "authorOf" 87 | value = json_get_list(person, prop) 88 | if value: 89 | logging.info(value) 90 | value = u"、".join(value) 91 | ret.append(u'主要作品:{}'.format(value) ) 92 | 93 | prop = "accomplishment" 94 | value = json_get_list(person, prop) 95 | if value: 96 | value = u"、".join(value) 97 | if len(value) < 30: 98 | # Colon is handled by text reading software 99 | ret.append( u"主要成就:{}".format(value) ) 100 | 101 | ret = u",".join(ret) 102 | 103 | # Make all commas Chinese 104 | ret = ret.replace(u',', u',') 105 | ret = re.sub(u",+", u",", ret) # Removes repeat commas 106 | # Handles periods at end 107 | ret = re.sub(ur"[。,]+$", u"", ret) 108 | 109 | # Converts brackets to Chinese 110 | ret = ret.replace(u'(', u'(') 111 | ret = ret.replace(u')', u')') 112 | # Removes brackets and all contained info 113 | ret = re.sub(ur"([^)]*)", u"", ret) 114 | 115 | ret = u''.join([ret, u"。"]) 116 | 117 | return ret 118 | 119 | def task_summarize_entity_person(args): 120 | #print "called task_test_summarize_entity_person" 121 | person = { 122 | "name": u"张三", 123 | "accomplishment": u"三好学生" 124 | } 125 | ret = summarize_entity_person(person) 126 | logging.info(ret) 127 | 128 | def task_summarize_all_person(args): 129 | path2person = file2abspath('../local/person/person.json') 130 | 131 | result_person_list = [] 132 | for line in file2iter(path2person): 133 | person = json.loads(line) 134 | ret = summarize_entity_person(person) 135 | if ret: 136 | person["shortDescription"] = ret 137 | result_person_list.append(person) 138 | 139 | logging.info( "write to JSON and excel") 140 | 141 | KEYS = [u"@type",u"artName",u"ethnicGroup",u"student",u"courtesyName",u"religion",u"cnProfessionalTitle",u"occupation",u"jobTitle",u"sibling",u"weight",u"nationality",u"birthPlace",u"height",u"alumniOf",u"keywords", u"schoolsOfbuddhism",u"image",u"parent",u"children",u"accomplishment",u"academicDegree",u"dharmaName",u"deathDate",u"academicMajor",u"nobleTitle",u"posthumousName",u"familyName",u"memberOfPoliticalParty",u"award",u"description", u"shortDescription", u"placeOfBurial",u"cnEducationalAttainment",u"alternateName",u"pseudonym",u"templeName", u"birthDate",u"gender",u"worksFor",u"name",u"dynasty",u"earName",u"ancestralHome",u"birthName",u"studentOf",u"spouse",u"nobleFamily",u"authorOf",u"@id",u"colleague",u"fieldOfWork",u"mother",u"father"] 142 | 143 | out_path = "../local/person/" 144 | 145 | json2excel( 146 | result_person_list, KEYS, 147 | os.path.join(out_path, 'person_shortDescription.xls') 148 | ) 149 | 150 | items2file( 151 | result_person_list, 152 | os.path.join(out_path, 'person_shortDescription.json') 153 | ) 154 | 155 | if __name__ == "__main__": 156 | logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.DEBUG) # noqa 157 | logging.getLogger("requests").setLevel(logging.WARNING) 158 | 159 | main_subtask(__name__) 160 | 161 | """ 162 | python cdata/summary.py task_summarize_entity_person 163 | python cdata/summary.py task_summarize_all_person 164 | 165 | """ 166 | -------------------------------------------------------------------------------- /cdata/table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Li Ding 4 | # table/excel data manipulation 5 | 6 | import os 7 | import sys 8 | import json 9 | import logging 10 | import 
codecs 11 | import hashlib 12 | import datetime 13 | import logging 14 | import time 15 | import re 16 | import collections 17 | 18 | import xlwt 19 | import xlrd 20 | 21 | 22 | def json2excel(items, keys, filename, page_size=60000): 23 |     """ page_size stays below 65536 because we output the old excel .xls format, which caps a sheet at 65536 rows 24 |     """ 25 |     wb = xlwt.Workbook() 26 |     rowindex = 0 27 |     sheetindex = 0 28 |     for item in items: 29 |         if rowindex % page_size == 0: 30 |             sheetname = "%02d" % sheetindex 31 |             ws = wb.add_sheet(sheetname) 32 |             rowindex = 0 33 |             sheetindex += 1 34 | 35 |             colindex = 0 36 |             for key in keys: 37 |                 ws.write(rowindex, colindex, key) 38 |                 colindex += 1 39 |             rowindex += 1 40 | 41 |         colindex = 0 42 |         for key in keys: 43 |             v = item.get(key, "") 44 |             if type(v) == list: 45 |                 v = ','.join(v) 46 |             if type(v) == set: 47 |                 v = ','.join(v) 48 |             ws.write(rowindex, colindex, v) 49 |             colindex += 1 50 |         rowindex += 1 51 | 52 |     logging.debug(filename) 53 |     wb.save(filename) 54 | 55 | 56 | def excel2json(filename, non_empty_col=-1, file_contents=None): 57 |     """ 58 |     http://www.lexicon.net/sjmachin/xlrd.html 59 |     non_empty_col=-1 loads all rows; when set to a non-negative column index, 60 |     this function will skip rows having an empty cell in that column. 61 |     """ 62 | 63 |     if file_contents: 64 |         workbook = xlrd.open_workbook(file_contents=file_contents) 65 |     else: 66 |         workbook = xlrd.open_workbook(filename) 67 | 68 |     start_row = 0 69 |     ret = collections.defaultdict(list) 70 |     fields = {} 71 |     for name in workbook.sheet_names(): 72 |         sh = workbook.sheet_by_name(name) 73 |         headers = [] 74 |         for col in range(len(sh.row(start_row))): 75 |             headers.append(sh.cell(start_row, col).value) 76 | 77 |         logging.info(u"sheet={} rows={} cols={}".format( 78 |             name, sh.nrows, len(headers))) 79 |         logging.info(json.dumps(headers, ensure_ascii=False)) 80 | 81 |         fields[name] = headers 82 | 83 |         for row in range(start_row + 1, sh.nrows): 84 |             item = {} 85 |             rowdata = sh.row(row) 86 |             if len(rowdata) < len(headers): 87 |                 msg = "skip mismatched row {}".format( 88 |                     json.dumps([c.value for c in rowdata], ensure_ascii=False)) 89 |                 logging.warning(msg) 90 |                 continue 91 | 92 |             for col in range(len(headers)): 93 |                 value = sh.cell(row, col).value 94 |                 if isinstance(value, basestring): 95 |                     value = value.strip() 96 |                 item[headers[col]] = value 97 | 98 |             if non_empty_col >= 0 and not item[headers[non_empty_col]]: 99 |                 logging.debug("skip empty cell") 100 |                 continue 101 | 102 |             ret[name].append(item) 103 |         # stat 104 |         logging.info(u"loaded {} {} (non_empty_col={})".format( 105 |             filename, len(ret[name]), non_empty_col)) 106 |     return {'data': ret, 'fields': fields} 107 | -------------------------------------------------------------------------------- /cdata/web.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Li Ding 4 | 5 | # utility stuff 6 | 7 | import os 8 | import sys 9 | import json 10 | import logging 11 | import codecs 12 | import hashlib 13 | import datetime 14 | import logging 15 | import time 16 | import urlparse 17 | import re 18 | 19 | 20 | def url2domain(url): 21 |     """ extract the domain (host) from a url, dropping userinfo and port 22 |     """ 23 |     parsed_uri = urlparse.urlparse(url) 24 |     domain = '{uri.netloc}'.format(uri=parsed_uri) 25 |     domain = re.sub("^.+@", "", domain) 26 |     domain = re.sub(":.+$", "", domain) 27 |     return domain 28 | -------------------------------------------------------------------------------- /cdata/wikify.py:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Li Ding 4 | 5 | # wikification apis 6 | 7 | import os 8 | import sys 9 | import json 10 | import logging 11 | import datetime 12 | import logging 13 | import time 14 | import urllib 15 | import re 16 | 17 | import requests 18 | 19 | from misc import main_subtask 20 | from core import * 21 | 22 | 23 | def task_compare(args): 24 |     queries = [ 25 |         "autodealer", 26 |         "birthplace", 27 |         u"居里夫人", 28 |         u"爱因斯坦", 29 |     ] 30 |     for query in queries: 31 |         args = {"query": query} 32 |         logging.info(u"-----{}------".format(query)) 33 |         task_wikipedia_test(args) 34 | 35 |         task_wikidata_test(args) 36 | 37 | def task_wikipedia_test(args): 38 |     ret = wikipedia_search(args["query"]) 39 |     logging.info(json.dumps(ret, ensure_ascii=False, sort_keys=True, indent=4)) 40 |     # ret = wikipedia_search_slow(query) 41 |     # logging.info(json.dumps(ret, ensure_ascii=False, sort_keys=True, indent=4)) 42 | 43 | def task_wikidata_test(args): 44 |     ret = wikidata_search(args["query"]) 45 |     logging.info(json.dumps(ret, ensure_ascii=False, sort_keys=True, indent=4)) 46 |     if ret["itemList"]: 47 |         nodeid = ret["itemList"][0]["identifier"] 48 |         ret = wikidata_get(nodeid) 49 |         logging.info(json.dumps(ret["entities"][nodeid]["labels"]["zh"]["value"], ensure_ascii=False, sort_keys=True, indent=4)) 50 | 51 | def wikidata_get(identifier): 52 |     """ fetch the full entity record from wikidata, e.g. 53 |     https://www.wikidata.org/wiki/Special:EntityData/P248.json 54 |     """ 55 |     url = 'https://www.wikidata.org/wiki/Special:EntityData/{}.json'.format(identifier) 56 |     #logging.info(url) 57 |     return json.loads(requests.get(url).content) 58 | 59 | def wikidata_search(query, lang="zh", output_lang="en", searchtype="item", max_result=1): 60 |     """ 61 |     wikification: search wikidata entities for the given query 62 |     https://www.wikidata.org/w/api.php?action=help&modules=wbsearchentities 63 | 64 |     result format 65 |     { 66 |         searchinfo: { 67 |             search: "birthday" 68 |         }, 69 |         search: [ 70 |             { 71 |                 repository: "", 72 |                 id: "P3150", 73 |                 concepturi: "http://www.wikidata.org/entity/P3150", 74 |                 url: "//www.wikidata.org/wiki/Property:P3150", 75 |                 title: "Property:P3150", 76 |                 pageid: 28754653, 77 |                 datatype: "wikibase-item", 78 |                 label: "birthday", 79 |                 description: "item for day and month on which the subject was born. Used when full "date of birth" (P569) isn't known.", 80 |                 match: { 81 |                     type: "label", 82 |                     language: "en", 83 |                     text: "birthday" 84 |                 } 85 |             } 86 |     """ 87 |     query = any2unicode(query) 88 |     params = { 89 |         "action":"wbsearchentities", 90 |         "search": query, 91 |         "format":"json", 92 |         "language":lang, 93 |         "uselang":output_lang, 94 |         "type":searchtype 95 |     } 96 |     urlBase = "https://www.wikidata.org/w/api.php?"
97 | url = urlBase + urllib.urlencode(any2utf8(params)) 98 | #logging.info(url) 99 | r = requests.get(url) 100 | results = json.loads(r.content).get("search",[]) 101 | #logging.info(items) 102 | 103 | property_list = [ 104 | {"name":"name", "alternateName":["label"]}, 105 | {"name":"url", "alternateName":["concepturi"]}, 106 | {"name":"identifier", "alternateName":["id"]}, 107 | {"name":"description"}, 108 | ] 109 | items = [] 110 | ret = {"query": query, "itemList":items} 111 | for result in results[0:max_result]: 112 | #logging.info(result) 113 | item = json_dict_copy(result, property_list) 114 | items.append(item) 115 | return ret 116 | 117 | def wikipedia_search_slow(query, lang="en", max_result=1): 118 | import wikipedia 119 | #wikification 120 | query = any2unicode(query) 121 | items = [] 122 | ret = {"query":query, "itemList":items} 123 | wikipedia.set_lang(lang) 124 | wikiterm = wikipedia.search(query) 125 | #logging.info(wikiterm) 126 | for idx, term in enumerate(wikiterm[0:max_result]): 127 | wikipage = wikipedia.page(term) 128 | item = { 129 | "name": wikipage.title, 130 | "description": wikipedia.summary(term, sentences=1), 131 | "url": wikipage.url, 132 | } 133 | items.append(item) 134 | 135 | return ret 136 | 137 | def wikipedia_search(query, lang="en", max_result=1): 138 | """ 139 | https://www.mediawiki.org/wiki/API:Opensearch 140 | """ 141 | query = any2unicode(query) 142 | params = { 143 | "action":"opensearch", 144 | "search": query, 145 | "format":"json", 146 | #"formatversion":2, 147 | #"namespace":0, 148 | "suggest":"true", 149 | "limit": 10 150 | } 151 | urlBase = "https://{}.wikipedia.org/w/api.php?".format(lang) 152 | url = urlBase + urllib.urlencode(any2utf8(params)) 153 | #logging.info(url) 154 | r = requests.get(url) 155 | jsonData = json.loads(r.content) 156 | #logging.info(jsonData) 157 | 158 | items = [] 159 | ret = {"query":query, "itemList":items} 160 | for idx, label in enumerate(jsonData[1][0:max_result]): 161 | description = jsonData[2][idx] 162 | url = jsonData[3][idx] 163 | 164 | item = { 165 | "name": label, 166 | "description":description, 167 | "url": url, 168 | } 169 | items.append(item) 170 | 171 | return ret 172 | 173 | if __name__ == "__main__": 174 | logging.basicConfig(format='[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s', level=logging.INFO) 175 | logging.getLogger("requests").setLevel(logging.WARNING) 176 | 177 | optional_params = { 178 | '--query': 'query' 179 | } 180 | main_subtask(__name__, optional_params=optional_params) 181 | 182 | """ 183 | python cdata/wikify.py task_wikipedia_test --query="birth place" 184 | python cdata/wikify.py task_wikidata_test --query="birth place" 185 | python cdata/wikify.py task_wikidata_test --query="birthplace" 186 | python cdata/wikify.py task_wikidata_test --query=居里夫人 187 | 188 | python cdata/wikify.py task_compare 189 | 190 | """ 191 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jieba==0.38 2 | nose==1.3.7 3 | xlrd==1.0.0 4 | xlwt==1.2.0 5 | requests==2.18.1 6 | wikipedia==1.4.0 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | def readme(): 5 | with open('README.rst') as f: 6 | return f.read() 7 | 8 | 9 | setup(name='cdata', 10 | version='0.1.9', 11 | 
description='see data, handy snippets for conversion, and ETL.', 12 |       long_description=readme(), 13 |       classifiers=[ 14 |           'Development Status :: 3 - Alpha', 15 |           'License :: OSI Approved :: Apache Software License', 16 |           'Programming Language :: Python :: 2.7', 17 |           'Topic :: Text Processing', 18 |       ], 19 |       url='http://github.com/cnschema/cdata', 20 |       author='Li Ding', 21 |       author_email='lidingpku@gmail.com', 22 |       license='Apache 2.0', 23 |       packages=['cdata'], 24 |       install_requires=[ 25 |           'xlrd', 'xlwt', 'jieba', 'requests', 'wikipedia' 26 |       ], 27 |       package_data={'cdata': ['*.json', '*.txt']}, 28 |       test_suite='nose.collector', 29 |       tests_require=['nose'], 30 |       zip_safe=False) 31 | -------------------------------------------------------------------------------- /tests/ex1.json: -------------------------------------------------------------------------------- 1 | { 2 |     "name": "张三", 3 |     "nickname": "three", 4 |     "age": 28 5 | } 6 | -------------------------------------------------------------------------------- /tests/ex2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cnschema/cdata/893e2e1e27b61c8551c8b5f5f9bf05ec61490e23/tests/ex2.xls -------------------------------------------------------------------------------- /tests/ex3-region-test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cnschema/cdata/893e2e1e27b61c8551c8b5f5f9bf05ec61490e23/tests/ex3-region-test.xls -------------------------------------------------------------------------------- /tests/test_core.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | try: 9 |     import unittest2 as unittest 10 | except ImportError: 11 |     import unittest 12 | 13 | from cdata.core import *  # noqa 14 | 15 | 16 | class CoreTestCase(unittest.TestCase): 17 |     def setUp(self): 18 |         pass 19 | 20 |     def test_file2abspath(self): 21 |         tin = "test.json" 22 |         tout = file2abspath(tin, __file__) 23 |         logging.info(" {} => {}".format(tin, tout)) 24 |         assert tout.endswith(u"tests/" + tin), tout 25 | 26 |         tin = "../test.json" 27 |         tout = file2abspath(tin) 28 |         logging.info(" {} => {}".format(tin, tout)) 29 |         assert tout.endswith( 30 |             u"cdata/" + os.path.basename(tin)), tout 31 | 32 |     def test_file2json(self): 33 |         filename = "ex1.json" 34 |         filename = file2abspath(filename, __file__) 35 |         ret = file2json(filename) 36 |         assert len(ret) == 3 37 | 38 |     def test_file2iter(self): 39 |         filename = "ex1.json" 40 |         filename = file2abspath(filename, __file__) 41 |         str_iter = file2iter(filename) 42 |         assert len(list(str_iter)) == 5 43 | 44 |     def test_json_get(self): 45 | 46 |         json_data = {"a": {"b": 1}, "c": ["d"], "e": "f"} 47 |         assert type(json_get(json_data, ["a"])) == dict 48 |         assert json_get(json_data, ["k"]) is None 49 |         assert json_get(json_data, ["k"], 10) == 10 50 |         assert json_get(json_data, ["a", "b"], 10) == 1 51 |         assert json_get(json_data, ["a", "k"], 10) == 10 52 |         assert json_get(json_data, ["c", "d"], 10) is None 53 |         assert json_get(json_data, ["e", "k"], 10) is None 54 |         assert type(json_get(json_data, ["c"])) == list 55 | 56 |         json_data = { 57 |             "father": {"name": "john"}, 58 |             "birthPlace": "Beijing" 59 |         } 60 | 61 |         assert json_get(json_data, ["father", "name"]) == "john" 62 |         assert json_get(json_data, ["father", "image"],
default="n/a") == "n/a" 63 | assert json_get(json_data, ["father", "father"]) is None 64 | assert json_get(json_data, ["birthPlace"]) == "Beijing" 65 | assert json_get( 66 | json_data, ["birthPlace", "name"], default="n/a") is None 67 | 68 | def test_json_get_list(self): 69 | 70 | json_data = { 71 | "name": "john", 72 | "birthPlace": ["Beijing"] 73 | } 74 | assert json_get_list(json_data, "name") == ["john"] 75 | assert json_get_list(json_data, "birthPlace") == ["Beijing"] 76 | 77 | def test_json_get_first_item(self): 78 | 79 | json_data = { 80 | "name": "john", 81 | "birthPlace": ["Beijing"], 82 | "interests": [] 83 | } 84 | assert json_get_first_item(json_data, "name") == "john" 85 | assert json_get_first_item(json_data, "birthPlace") == "Beijing" 86 | assert json_get_first_item(json_data, "birthDate") == '' 87 | assert json_get_first_item(json_data, "interests", defaultValue=None) is None 88 | 89 | def test_json_append(self): 90 | 91 | json_data = { 92 | "name": "john", 93 | "birthPlace": ["Beijing"], 94 | "interests": [] 95 | } 96 | 97 | json_append(json_data, "name", "a") 98 | assert json_data["name"] == "john" 99 | 100 | json_append(json_data, "birthPlace", "a") 101 | assert json_data["birthPlace"] == ["Beijing","a"] 102 | 103 | json_append(json_data, "keywords", "a") 104 | assert json_data["keywords"] == ["a"] 105 | 106 | def test_any2utf8(self): 107 | tin = "你好世界" 108 | tout = any2utf8(tin) 109 | logging.info(" {} => {}".format(tin, tout)) 110 | 111 | tin = u"你好世界" 112 | tout = any2utf8(tin) 113 | logging.info((tin, tout)) 114 | 115 | tin = "hello world" 116 | tout = any2utf8(tin) 117 | logging.info((tin, tout)) 118 | 119 | tin = ["hello", "世界"] 120 | tout = any2utf8(tin) 121 | logging.info((tin, tout)) 122 | 123 | tin = {"hello": u"世界"} 124 | tout = any2utf8(tin) 125 | logging.info((tin, tout)) 126 | 127 | tin = {"hello": u"世界", "number": 90} 128 | tout = any2utf8(tin) 129 | logging.info((tin, tout)) 130 | 131 | def test_any2unicode(self): 132 | tin = "你好世界" 133 | tout = any2unicode(tin) 134 | logging.info((tin, tout)) 135 | 136 | tin = u"你好世界" 137 | tout = any2unicode(tin) 138 | logging.info((tin, tout)) 139 | 140 | tin = "hello world" 141 | tout = any2unicode(tin) 142 | logging.info((tin, tout)) 143 | 144 | tin = ["hello", "世界"] 145 | tout = any2unicode(tin) 146 | logging.info((tin, tout)) 147 | 148 | tin = {"hello": u"世界"} 149 | tout = any2unicode(tin) 150 | logging.info((tin, tout)) 151 | 152 | def test_any2sha1(self): 153 | tin = "你好世界" 154 | tout = any2sha1(tin) 155 | assert "dabaa5fe7c47fb21be902480a13013f16a1ab6eb" == tout, tout 156 | 157 | tin = u"你好世界" 158 | tout = any2sha1(tin) 159 | assert "dabaa5fe7c47fb21be902480a13013f16a1ab6eb" == tout, tout 160 | 161 | tin = "hello world" 162 | tout = any2sha1(tin) 163 | assert "2aae6c35c94fcfb415dbe95f408b9ce91ee846ed" == tout, tout 164 | 165 | tin = ["hello", "world"] 166 | tout = any2sha1(tin) 167 | assert "238d2b0d23b6b4fb22934792bec13448d12df3cf" == tout, tout 168 | 169 | tin = {"hello": "world"} 170 | tout = any2sha1(tin) 171 | assert "d3b09abe30cfe2edff4ee9e0a141c93bf5b3af87" == tout, tout 172 | 173 | def test_json_dict_copy(self): 174 | property_list = [ 175 | { "name":"name", "alternateName": ["name","title"]}, 176 | { "name":"birthDate", "alternateName": ["dob","dateOfBirth"] }, 177 | { "name":"description" } 178 | ] 179 | json_object = {"dob":"2010-01-01","title":"John","interests":"data","description":"a person"} 180 | ret = json_dict_copy(json_object, property_list) 181 | assert json_object["title"] == ret["name"] 182 
| assert json_object["dob"] == ret["birthDate"] 183 |         assert json_object["description"] == ret["description"] 184 |         assert ret.get("interests") is None 185 | 186 |     def test_statJsonld(self): 187 |         tin = "test_core_stat.jsonld" 188 |         tout = file2abspath(tin, __file__) 189 |         with open(tout) as f: 190 |             data = json.load(f) 191 |         ret = stat_jsonld(data) 192 |         print json.dumps(ret) 193 |         assert ret["triple"] == 29 194 |         assert ret[u"tag_抒情"] == 1 195 | 196 |     def test_stat(self): 197 |         data = [{u"名称": u"张三", u"年龄": u"13.0"}, {u"名称": u"李四", u"年龄": u"20"}] 198 |         ret = stat(data, [u"名称", u"年龄"], [u"名称", u"年龄"]) 199 | 200 | 201 | if __name__ == '__main__': 202 |     unittest.main() 203 | -------------------------------------------------------------------------------- /tests/test_core_stat.jsonld: -------------------------------------------------------------------------------- 1 | { 2 | "byArtist":[], 3 | "name":"暧昧", 4 | "keywords":["抒情"], 5 | "tag":[{"name":"抒情"},{"name":"愉快"}], 6 | "mergedFrom":[ 7 | { 8 | "alternateName":[], 9 | "name":"暧昧", 10 | "lyricist":[], 11 | "referenceUrl":"http://music.163.com/song?id=471385043", 12 | "@type":"MusicRecording", 13 | "durationInSeconds":312, 14 | "statedIn":"music.163.com", 15 | "shareCount":0, 16 | "inAlbum":{ 17 | "identifier":"35347475", 18 | "name":"暧昧", 19 | "datePublished": "", 20 | "image": "" 21 | }, 22 | "composer":[], 23 | "keywords":[], 24 | "byArtist":[ 25 | { 26 | "@id":"46a0e65c-bd69-327a-a9ca-93789ae6a473", 27 | "name": "薛之谦" 28 | } 29 | ], 30 | "identifier":"471385043", 31 | "@id":"022858de-f892-373a-bb0f-668c8e50d16f", 32 | "dateModified":"2017-05-15T15:54:44+08:00", 33 | "listenCount":0, 34 | "lyrics": "", 35 | "position": 0 36 | } 37 | ], 38 | "entityScore": 0, 39 | "lyrics": "", 40 | "keywords":[], 41 | "alternateName":[], 42 | "@id":"", 43 | "@context": "http://schema.org/", 44 | "@type":["MusicRecording","CreativeWork","Thing"] 45 | } 46 | -------------------------------------------------------------------------------- /tests/test_entity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | from cdata.entity import SimpleEntity  # noqa 9 | 10 | try: 11 |     import unittest2 as unittest 12 | except ImportError: 13 |     import unittest 14 | 15 | 16 | class EntityTestCase(unittest.TestCase): 17 |     def setUp(self): 18 |         pass 19 | 20 |     def test_ner_utf8(self): 21 |         entity_list = [{"@id": "1", "name": "张三"}, {"@id": "2", "name": "李四"}] 22 |         ner = SimpleEntity(entity_list) 23 |         sentence = "张三给了李四一个苹果" 24 |         ret = ner.ner(sentence) 25 |         assert len(ret) == 2 26 | 27 |     def test_ner(self): 28 |         entity_list = [{"@id": "1", "name": u"张三"}, 29 |                        {"@id": "2", "name": u"李四"}] 30 |         ner = SimpleEntity(entity_list) 31 |         sentence = u"张三给了李四一个苹果" 32 |         ret = ner.ner(sentence) 33 |         assert len(ret) == 2 34 | 35 |         # "张三丰" must not be segmented and matched as "张三" + "丰" 36 |         sentence = u"张三丰给了李四一个苹果" 37 |         ret = ner.ner(sentence) 38 |         assert len(ret) == 1 39 | 40 |     def test_get_primary_entity(self): 41 |         entity_list = [{"@id": "1", "name": u"张三"}, 42 |                        {"@id": "2", "name": u"李四"}] 43 |         ner = SimpleEntity(entity_list) 44 |         sentence_list = ["张三给了李四一个苹果", "王五给了李四一个橘子"] 45 |         # support ratio: 张三 0.75, 李四 0.25 46 |         primary_entity = ner.get_primary_entity(sentence_list, 0.4) 47 |         assert len(primary_entity) == 1 48 | 49 | if __name__ == '__main__': 50 |     unittest.main() 51 | --------------------------------------------------------------------------------
/tests/test_region.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | from cdata.region import RegionEntity  # noqa 9 | 10 | try: 11 |     import unittest2 as unittest 12 | except ImportError: 13 |     import unittest 14 | 15 | 16 | class EntityTestCase(unittest.TestCase): 17 |     def setUp(self): 18 |         self.ner = RegionEntity() 19 |         pass 20 | 21 |     def test_misc(self): 22 |         ret = self.ner.normalize_region_name(u"市辖区", "province") 23 |         assert ret == u"市辖区", u"[{}]".format(ret) 24 |         # TODO self.ner.normalize_region_name(u"安huisheng", "province") 25 |         assert u"安徽省" == self.ner.normalize_region_name(u"安徽", "province") 26 |         assert u"内蒙古自治区" == self.ner.normalize_region_name(u"内蒙古", "province") 27 |         assert u"哈尔滨市" == self.ner.normalize_region_name(u"哈尔滨", "city") 28 |         assert u"乌鲁木齐市" == self.ner.normalize_region_name(u"乌鲁木齐", "city") 29 |         assert u"道里区" == self.ner.normalize_region_name(u"道里区", "district") 30 |         assert u"海淀区" == self.ner.normalize_region_name(u"海淀区", "district") 31 |         assert u"海淀区" == self.ner.normalize_region_name(u"海淀", "district") 32 |         assert u"浦东新区" == self.ner.normalize_region_name(u"浦东", "district") 33 |         assert u"浦东新区" == self.ner.normalize_region_name(u"浦东新区", "district") 34 |         assert u"芒市" == self.ner.normalize_region_name(u"芒市", "district") 35 | 36 |         assert u"上海" == self.ner.guess_province([u"上海西红柿集团"]) 37 |         assert u"上海" == self.ner.guess_province([u"浦东新区软件园"]) 38 |         assert u"辽宁" == self.ner.guess_province([u"朝阳市软件园"]) 39 |         assert u"内蒙古" == self.ner.guess_province([u"内蒙古自治区乌兰察布市丰镇市新标路丰美北小区232号"]) 40 |         assert u"天津" == self.ner.guess_province([u"天津市食品药品监督管理局"]) 41 |         assert u"内蒙古" == self.ner.guess_province([u"内蒙乌兰察布市丰镇市新标路丰美北小区232号"]) 42 | 43 |     def test_guess_all(self): 44 | 45 |         city_info = self.ner.guess_all([u"内蒙古自治区乌兰察布市丰镇市新标路丰美北小区232号"]) 46 |         assert u"district" == city_info.get("type") 47 |         assert u"内蒙古自治区" == city_info.get("province") 48 |         assert u"乌兰察布市" == city_info.get("city") 49 |         assert u"丰镇市" == city_info.get("district") 50 | 51 |         city_info = self.ner.guess_all(["保定市长城北大街头台村2109号门脸", "保定市莲池区中昊翔启蒙大药房"]) 52 |         assert u"district" == city_info.get("type") 53 |         assert u"河北省" == city_info.get("province") 54 |         assert u"保定市" == city_info.get("city") 55 | 56 |         city_info = self.ner.guess_all([u"高州市平山木禾塘大塘村"]) 57 |         assert u"district" == city_info.get("type") 58 |         assert u"广东省" == city_info.get("province") 59 | 60 |         city_info = self.ner.guess_all([u"珠海市拱北新市花园16栋102铺"]) 61 |         assert u"city" == city_info.get("type") 62 |         assert u"广东省" == city_info.get("province") 63 | 64 |         city_info = self.ner.guess_all([u"南溪镇扬美刘大道中段老祠村道脚008号"]) 65 |         assert None is city_info 66 | 67 |         city_info = self.ner.guess_all(["新塘镇大敦村", "增城市新塘众生药店"]) 68 |         assert u"district" == city_info.get("type") 69 |         assert u"广东省" == city_info.get("province") 70 | 71 |         city_info = self.ner.guess_all(["曲江县马坝城南", "曲江县马坝镇金良兽药店"]) 72 |         assert u"district" == city_info.get("type") 73 |         assert u"广东省" == city_info.get("province") 74 | 75 |         city_info = self.ner.guess_all(["镇平路46号、48号", "汕头经济特区粤东药品公司镇平商店"]) 76 |         assert u"city" == city_info.get("type") 77 |         assert u"广东省" == city_info.get("province") 78 | 79 |         inputdata = ["遂溪县河头镇文明街12号", "遂溪县河头回春堂药店"] 80 |         city_info = self.ner.guess_all(inputdata) 81 |         assert u"district" == city_info.get("type") 82 |         assert u"广东省" == city_info.get("province") 83 | 84 |         inputdata = [u"延寿镇南东风路", u"旺旺兽药店(延寿县)"] 85 |         city_info =
self.ner.guess_all(inputdata) 86 | assert u"district" == city_info.get("type") 87 | assert u"黑龙江省" == city_info.get("province") 88 | 89 | inputdata = ["", "富拉尔基秀坤百货商店药品专柜"] 90 | city_info = self.ner.guess_all(inputdata) 91 | assert u"district" == city_info.get("type") 92 | assert u"黑龙江省" == city_info.get("province") 93 | 94 | inputdata = ["兴隆工商局家属楼", "巴彦县鑫丰兽药饲料商店"] 95 | city_info = self.ner.guess_all(inputdata) 96 | assert u"district" == city_info.get("type") 97 | assert u"黑龙江省" == city_info.get("province") 98 | 99 | inputdata = ["下城子镇中心街", "穆棱市下城子镇宋大夫兽药饲料店"] 100 | city_info = self.ner.guess_all(inputdata) 101 | assert u"district" == city_info.get("type") 102 | assert u"黑龙江省" == city_info.get("province") 103 | 104 | inputdata = ["加格达奇区前进路(红旗东风一号楼)", "加格达奇区温馨大药店"] 105 | city_info = self.ner.guess_all(inputdata) 106 | assert u"黑龙江省" == city_info.get("province") 107 | assert u"district" == city_info.get("type") 108 | assert u"加格达奇区" == city_info.get("name") 109 | 110 | inputdata = ["", "北京神州汽车租赁有限公司深圳雅园分公司"] 111 | city_info = self.ner.guess_all(inputdata) 112 | assert u"city" == city_info.get("type") 113 | assert u"广东省" == city_info.get("province") 114 | 115 | inputdata = ["横山横安路镇政府出租屋第一间", "廉江市横山济生堂药店"] 116 | city_info = self.ner.guess_all(inputdata) 117 | assert u"district" == city_info.get("type") 118 | assert u"广东省" == city_info.get("province") 119 | 120 | inputdata = ["水东镇东阳北街50号", "水东镇长安药店(已迁入三角所)"] 121 | city_info = self.ner.guess_all(inputdata) 122 | assert None is city_info 123 | 124 | inputdata = ["龙门县龙城林园街33号", "龙城新利药店"] 125 | city_info = self.ner.guess_all(inputdata) 126 | assert u"district" == city_info.get("type") 127 | assert u"广东省" == city_info.get("province") 128 | 129 | inputdata = ["黄石街道", "龙川县药材公司黄石药店"] 130 | city_info = self.ner.guess_all(inputdata) 131 | assert u"district" == city_info.get("type") 132 | assert u"广东省" == city_info.get("province") 133 | 134 | inputdata = ["", "盘锦阳光大药房医药连锁有限公司清远麦围店"] 135 | city_info = self.ner.guess_all(inputdata) 136 | assert u"city" == city_info.get("type") 137 | assert u"广东省" == city_info.get("province") 138 | 139 | inputdata = ["", "盘锦阳光大药房医药连锁有限公司清远市清城区城市广场店"] 140 | city_info = self.ner.guess_all(inputdata) 141 | assert u"district" == city_info.get("type") 142 | assert u"广东省" == city_info.get("province") 143 | 144 | inputdata = ["四会市东城区四会大道南时代商贸广场141号(首层)", "广州仁参医药连锁有限公司四会时代分店"] 145 | city_info = self.ner.guess_all(inputdata) 146 | assert u"district" == city_info.get("type") 147 | assert u"广东省" == city_info.get("province") 148 | 149 | inputdata = ["中山市南朗镇岭南小区", "中山市南朗镇启发农药化肥店"] 150 | city_info = self.ner.guess_all(inputdata) 151 | assert u"city" == city_info.get("type") 152 | assert u"广东省" == city_info.get("province") 153 | 154 | inputdata = ["信宜市镇隆圩解放街29号", "信宜市镇隆回春药店"] 155 | city_info = self.ner.guess_all(inputdata) 156 | assert u"district" == city_info.get("type") 157 | assert u"广东省" == city_info.get("province") 158 | 159 | inputdata = ["乳源县大桥镇乳阳林业局溪头河西区域避暑林庄温泉大饭店主楼一楼", "东阳光药零售连锁(东莞)有限公司南岭店"] 160 | city_info = self.ner.guess_all(inputdata) 161 | assert u"district" == city_info.get("type") 162 | assert u"广东省" == city_info.get("province") 163 | 164 | inputdata = ["从化市太平镇神岗木棉村永三社(龟塘)", "从化市太平民健药店"] 165 | city_info = self.ner.guess_all(inputdata) 166 | assert u"district" == city_info.get("type") 167 | assert u"广东省" == city_info.get("province") 168 | 169 | inputdata = ["荔城镇和平路33号首层", "增城市荔城育善堂药店"] 170 | city_info = self.ner.guess_all(inputdata) 171 | assert u"district" == city_info.get("type") 172 | assert u"广东省" == 
city_info.get("province") 173 | assert u"广州市" == city_info.get("city") 174 | 175 | inputdata = ["清新县太和镇滨江路东三街13号首层5号铺", "清新县太和安顺堂药店"] 176 | city_info = self.ner.guess_all(inputdata) 177 | assert u"district" == city_info.get("type") 178 | assert u"广东省" == city_info.get("province") 179 | 180 | inputdata = ["广州市番禺区小谷围街广州大学城外环西路230号广大商业中心A区首层1015", "桂林市春和堂医药连锁有限责任公司广州大学城分店"] 181 | city_info = self.ner.guess_all(inputdata) 182 | assert u"district" == city_info.get("type") 183 | assert u"广东省" == city_info.get("province") 184 | 185 | inputdata = ["深圳市龙岗区龙岗街道南联社区向银路与怡丰路交叉路口南龙综合楼首层之三", "太仓市三庆医药连锁有限公司深圳南联店"] 186 | city_info = self.ner.guess_all(inputdata) 187 | assert u"district" == city_info.get("type") 188 | assert u"广东省" == city_info.get("province") 189 | 190 | inputdata = ["中山市南区寮后村龙子街14号", "中山市南区仁德堂药店"] 191 | city_info = self.ner.guess_all(inputdata) 192 | assert u"city" == city_info.get("type") 193 | assert u"广东省" == city_info.get("province") 194 | 195 | inputdata = ["", "珠海嘉伦药业集团光彩大药房连锁有限公司红旗分店"] 196 | city_info = self.ner.guess_all(inputdata) 197 | assert u"city" == city_info.get("type") 198 | assert u"广东省" == city_info.get("province") 199 | 200 | inputdata = ["东区新街村", "同江市龙鑫堂大药店"] 201 | city_info = self.ner.guess_all(inputdata) 202 | assert u"district" == city_info.get("type") 203 | assert u"黑龙江省" == city_info.get("province") 204 | 205 | inputdata = ["新兴县河头镇河头街65号", "新兴县河头镇同源堂药店"] 206 | city_info = self.ner.guess_all(inputdata) 207 | assert u"district" == city_info.get("type") 208 | assert u"广东省" == city_info.get("province") 209 | 210 | inputdata = ["怀集怀城镇河南第二卫生站"] 211 | city_info = self.ner.guess_all(inputdata) 212 | assert u"district" == city_info.get("type") 213 | assert u"广东省" == city_info.get("province") 214 | 215 | 216 | if __name__ == '__main__': 217 | unittest.main() 218 | -------------------------------------------------------------------------------- /tests/test_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | from cdata.summary import * # noqa 9 | 10 | try: 11 | import unittest2 as unittest 12 | except ImportError: 13 | import unittest 14 | 15 | 16 | class SummaryTestCase(unittest.TestCase): 17 | def setUp(self): 18 | pass 19 | 20 | def test_misc(self): 21 | person = { 22 | "name": u"张三", 23 | "accomplishment": u"三好学生" 24 | } 25 | ret = summarize_entity_person(person) 26 | assert u"张三,主要成就:三好学生。" == ret 27 | 28 | person = { 29 | "name": u"张三", 30 | "accomplishment": u"三好学生", 31 | "artName": [u"张老三"] 32 | } 33 | ret = summarize_entity_person(person) 34 | assert u"张三,号张老三,主要成就:三好学生。" == ret 35 | 36 | person = { 37 | "name": u"张三", 38 | "accomplishment": u"三好学生", 39 | "artName": [] 40 | } 41 | ret = summarize_entity_person(person) 42 | assert u"张三,主要成就:三好学生。" == ret 43 | 44 | def test_real_data(self): 45 | person = { 46 | "description": u"黄健中,1941年12月29日出生于印度尼西亚泗水,国家一级导演、编剧、艺术指导。1979年,黄健中与张铮联合执导爱情片《小花》,该片获得第三届电影百花奖上获最佳故事片奖 。1982年,黄健中独立执导首部电影作品——爱情片《如意》。1985年,凭借家庭剧《良家妇女》获得第二十五届卡罗维·发利国际电影节主要奖[2-3] 。1990年,拍摄警匪剧《龙年警官》,该片获得第十四届大众电影百花奖最佳故事片奖。1991年,拍摄家庭剧《过年》,该片获得第十五届大众电影百花奖最佳故事片奖。1995年,执导剧情片《大鸿米店》[4-6] 。1998年,拍摄爱情片《红娘》,该片获得第二十二届大众电影百花奖最佳故事片奖[7-8] 。2001年,执导古装武侠剧《笑傲江湖》 。2003年,与佐藤纯弥联合执导家庭犯罪剧《世纪末的晚钟》[10-12] 。2005年,国家广播电影电视总局授予黄健中“优秀电影艺术家”称号 。2006年,执导古装历史剧《越王勾践》 。2009年,拍摄历史战争剧《大秦帝国之裂变》,该片获得第25届中国电视金鹰奖[14-17] 。2011年,执导古装剧《大风歌》[18-19] 。2013年,执导古装神话剧《蓬莱八仙》[20-22] 。", 47 | "birthPlace": u"印度尼西亚泗水", 48 | 
"name": u"黄健中", 49 | "image": u"http://c.hiphotos.baidu.com/baike/w%3D268%3Bg%3D0/sign=9ac8a3ed33adcbef01347900949449e0/aec379310a55b319a1ae185c41a98226cffc1747.jpg", 50 | "accomplishment": u"第4届东京国际电影节评委奖第11届中国电影金鸡奖最佳导演奖第12届中国电影金鸡奖最佳编剧奖", 51 | "birthDate": u"1941年12月29日", 52 | "keywords": [u"导演", u"娱乐人物", u"人物"], 53 | "nationality": u"中国", 54 | "alternateName": ["HuangJianzhong", "Huang Jianzhong"], 55 | "authorOf": u"过年、龙年警官、越王勾践、大风歌", 56 | "@id": u"d67f8dc6-3775-3e4a-9d67-84bb4007d6d1", 57 | "@type": ["Person", "Thing"], 58 | "occupation": u"导演、编剧、艺术指导," # Extra comma for punctuation testing 59 | } 60 | ret = summarize_entity_person(person) 61 | logging.info(ret) 62 | assert u"黄健中,1941年12月29日出生于印度尼西亚泗水,中国导演、编剧、艺术指导,主要作品:过年、龙年警官、越王勾践、大风歌。" == ret 63 | 64 | person = { 65 | "name": u"陈小群", 66 | "gender": u"女", 67 | "image": u"http://e.hiphotos.baidu.com/baike/w%3D268%3Bg%3D0/sign=3c89cd72acc379317d68812fd3ffd078/b90e7bec54e736d16b57837c98504fc2d5626979.jpg", 68 | "description": u"女,抒情女高音歌唱家,现任上海音乐学院声乐系教授、硕士生导师;先后担任文化部举办的国际声乐比赛全国选拔赛、中国音乐家协会举办的“金钟奖”全国声乐比赛、全国大学生艺术歌曲比赛等比赛评委。", 69 | "@type": ["Person", "Thing"], 70 | "ethnicGroup": u"汉族", 71 | "keywords": [u"音乐", u"行业人物", u"歌手", u"教育", u"娱乐人物", u"人物", u"书籍"], 72 | "nationality": u"中国", 73 | "@id": u"66548f8a-3f9e-37ca-afb1-e2e96fdb083b", 74 | "alumniOf": u"上海音乐学院", 75 | "occupation": u"教授" 76 | } 77 | ret = summarize_entity_person(person) 78 | assert u"陈小群,中国教授。" == ret 79 | 80 | # Test for bracket, unknown birth date, courtesy name 81 | person = { 82 | "@id": u"2d8d5ed9-108b-3621-86bd-6c67fbbf0896", 83 | "@type": u"Person,Thing", 84 | "accomplishment": u"袭龙城,收复河朔、河套地区,击败单于", 85 | "birthDate": u"不详", 86 | "birthPlace": u"河东平阳(今山西临汾市)", 87 | "courtesyName": u"仲卿", 88 | "deathDate": u"公元前106年(汉武帝元封五年)", 89 | "description": u"卫青,字仲卿,河东平阳人", 90 | "dynasty": u"西汉", 91 | "ethnicGroup": u"汉族", 92 | "image": "http://c.hiphotos.baidu.com/baike/w%3D268%3Bg%3D0/sign=dce9ce450f3387449cc5287a6934bec4/d53f8794a4c27d1ef8d6abd118d5ad6eddc43836.jpg", 93 | "name": u"卫青", 94 | "posthumousName": u"烈" 95 | } 96 | 97 | summary = u"卫青,字仲卿,西汉人,出生于河东平阳,主要成就:袭龙城,收复河朔、河套地区,击败单于。" 98 | assert summary == summarize_entity_person(person) 99 | 100 | person = { 101 | "name": u"陈小群", 102 | "gender": u"女", 103 | "image": u"http://e.hiphotos.baidu.com/baike/w%3D268%3Bg%3D0/sign=3c89cd72acc379317d68812fd3ffd078/b90e7bec54e736d16b57837c98504fc2d5626979.jpg", 104 | "description": u"女,抒情女高音歌唱家,现任上海音乐学院声乐系教授、硕士生导师;先后担任文化部举办的国际声乐比赛全国选拔赛、中国音乐家协会举办的“金钟奖”全国声乐比赛、全国大学生艺术歌曲比赛等比赛评委。", 105 | "@type": ["Person", "Thing"], 106 | "ethnicGroup": u"汉族", 107 | "keywords": [u"音乐", u"行业人物", u"歌手", u"教育", u"娱乐人物", u"人物", u"书籍"], 108 | "@id": u"66548f8a-3f9e-37ca-afb1-e2e96fdb083b", 109 | "alumniOf": u"上海音乐学院", 110 | "occupation": u"教授" 111 | } 112 | ret = summarize_entity_person(person) 113 | logging.info(ret) 114 | assert u"陈小群,教授。" == ret 115 | 116 | if __name__ == '__main__': 117 | unittest.main() 118 | -------------------------------------------------------------------------------- /tests/test_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | try: 9 | import unittest2 as unittest 10 | except ImportError: 11 | import unittest 12 | 13 | from cdata.core import file2abspath # noqa 14 | from cdata.table import * # noqa 15 | 16 | 17 | class TableTestCase(unittest.TestCase): 18 | def setUp(self): 19 | pass 20 | 21 
| def test_excel2json(self): 22 |         filename = "ex2.xls" 23 |         filename = file2abspath(filename, __file__) 24 | 25 |         if not os.path.exists(filename): 26 |             # create the test fixture excel file on first run 27 |             input_data = [{ 28 |                 "name": u"张三", 29 |                 u"年龄": 18 30 |             }, 31 |                 { 32 |                 "name": u"李四", 33 |                 "notes": u"this is li si", 34 |                 u"年龄": 18 35 |             }] 36 |             json2excel(input_data, ["name", u"年龄", "notes"], filename) 37 | 38 |         output_data = excel2json(filename) 39 |         assert len(output_data) == 2 40 |         assert len(output_data["data"]) == 1 41 |         assert len(output_data["data"].values()[0]) == 2 42 |         assert output_data["fields"].values()[0] == ["name", u"年龄", "notes"] 43 | 44 | 45 | if __name__ == '__main__': 46 |     unittest.main() 47 | -------------------------------------------------------------------------------- /tests/test_web.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | sys.path.insert(0, os.path.abspath('..')) 7 | 8 | try: 9 |     import unittest2 as unittest 10 | except ImportError: 11 |     import unittest 12 | 13 | from cdata.web import url2domain  # noqa 14 | 15 | 16 | class WebTestCase(unittest.TestCase): 17 |     def setUp(self): 18 |         pass 19 | 20 |     def test_url2domain(self): 21 |         the_input = "http://www.sge.com.cn/sjzx/mrhqsj/540603" 22 |         the_output = url2domain(the_input) 23 |         assert the_output == "www.sge.com.cn", the_output 24 | 25 | 26 | if __name__ == '__main__': 27 |     unittest.main() 28 | -------------------------------------------------------------------------------- /tests/test_wikify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Path hack 4 | import os 5 | import sys 6 | import logging 7 | sys.path.insert(0, os.path.abspath('..')) 8 | 9 | try: 10 |     import unittest2 as unittest 11 | except ImportError: 12 |     import unittest 13 | 14 | from cdata.wikify import wikidata_search, wikidata_get  # noqa 15 | 16 | 17 | class WikifyTestCase(unittest.TestCase): 18 |     def setUp(self): 19 |         pass 20 | 21 |     def test_wikidata(self): 22 |         query = u"居里夫人" 23 |         ret = wikidata_search(query, lang="zh") 24 |         #logging.info(ret) 25 |         nodeid = ret["itemList"][0]["identifier"] 26 |         assert nodeid == "Q7186" 27 |         ret = wikidata_get(nodeid) 28 |         label_zh = ret["entities"][nodeid]["labels"]["zh"]["value"] 29 |         assert label_zh == u"玛丽·居里" 30 | 31 |         query = u"AutoDealer" 32 |         ret = wikidata_search(query) 33 |         logging.info(ret) 34 |         assert 0 == len(ret["itemList"]) 35 | 36 |         query = u"Campsite" 37 |         ret = wikidata_search(query) 38 |         logging.info(ret) 39 |         nodeid = ret["itemList"][0]["identifier"] 40 |         assert nodeid == "Q832778" 41 |         ret = wikidata_get(nodeid) 42 |         label_zh = ret["entities"][nodeid]["labels"]["zh"]["value"] 43 |         logging.info(label_zh) 44 |         assert label_zh == u"露營場" 45 | 46 |         query = "birthplace" 47 |         ret = wikidata_search(query, searchtype="property") 48 |         #logging.info(ret) 49 |         nodeid = ret["itemList"][0]["identifier"] 50 |         assert nodeid == "P19" 51 |         ret = wikidata_get(nodeid) 52 |         label_zh = ret["entities"][nodeid]["labels"]["zh"]["value"] 53 |         logging.info(label_zh) 54 |         assert label_zh == u"出生地" 55 | 56 | 57 | 58 | 59 | if __name__ == '__main__': 60 |     unittest.main() 61 | --------------------------------------------------------------------------------
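
Usage sketches (illustrative only; these are not files in the repository). They assume Python 2.7 with the cdata package importable, e.g. after `pip install cdata` or when run from a repo checkout.

Region guessing: RegionEntity.guess_all() takes a list of free-text strings (typically [address, shop name]) and returns the best-supported region entity or None, as exercised by task_guess_all() in cdata/region.py and by tests/test_region.py. The input strings below are taken from those tests.

    # -*- coding: utf-8 -*-
    # Sketch: guess province/city/district from address + name strings.
    import json
    from cdata.region import RegionEntity

    ner = RegionEntity()
    result = ner.guess_all([u"保定市长城北大街头台村2109号门脸",
                            u"保定市莲池区中昊翔启蒙大药房"])
    if result:
        # per tests/test_region.py: "type" and "province" are always present,
        # "city"/"district" only when resolved down to that level
        print json.dumps(result, ensure_ascii=False)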
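
Person summarization: summarize_entity_person() renders a cnschema Person dict into a one-sentence Chinese description and returns False when "name" is missing; the expected output below is asserted verbatim in tests/test_summary.py.

    # -*- coding: utf-8 -*-
    # Sketch: entity-to-text summary of a cnschema Person.
    from cdata.summary import summarize_entity_person

    person = {
        "name": u"张三",
        "artName": [u"张老三"],
        "accomplishment": u"三好学生",
    }
    # prints: 张三,号张老三,主要成就:三好学生。
    print summarize_entity_person(person)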
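
Excel round trip: json2excel() starts a new worksheet every page_size rows (the legacy .xls format caps a sheet at 65536 rows), and excel2json() returns per-sheet rows plus headers; the shapes below follow tests/test_table.py. The file name demo.xls is an arbitrary example, and non_empty_col=0 uses the row-skipping behavior described in the excel2json docstring.

    # -*- coding: utf-8 -*-
    # Sketch: JSON list -> .xls -> JSON round trip.
    from cdata.table import json2excel, excel2json

    rows = [{"name": u"张三", u"年龄": 18},
            {"name": u"李四", u"年龄": 18, "notes": u"this is li si"}]
    json2excel(rows, ["name", u"年龄", "notes"], "demo.xls")

    loaded = excel2json("demo.xls", non_empty_col=0)  # skip rows with empty "name"
    print loaded["fields"].values()[0]     # ["name", u"年龄", "notes"]
    print len(loaded["data"].values()[0])  # 2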
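
Wikification: wikidata_search() wraps the wbsearchentities API and returns {"query": ..., "itemList": [...]}; wikidata_get() fetches the full entity record. The Q7186 lookup mirrors tests/test_wikify.py and needs network access.

    # -*- coding: utf-8 -*-
    # Sketch: resolve a name to a Wikidata entity, then read its zh label.
    from cdata.wikify import wikidata_search, wikidata_get

    ret = wikidata_search(u"居里夫人", lang="zh")
    if ret["itemList"]:
        nodeid = ret["itemList"][0]["identifier"]   # "Q7186" for Marie Curie
        entity = wikidata_get(nodeid)
        print entity["entities"][nodeid]["labels"]["zh"]["value"]  # 玛丽·居里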
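
URL handling: url2domain() keeps only the host part of a URL, dropping scheme, userinfo, port, and path. tests/test_web.py covers the plain case; the userinfo/port URL below is an extra hypothetical illustration of the same behavior.

    # Sketch: extract the bare domain from URLs.
    from cdata.web import url2domain

    print url2domain("http://www.sge.com.cn/sjzx/mrhqsj/540603")  # www.sge.com.cn
    print url2domain("http://user@www.sge.com.cn:8080/sjzx")      # www.sge.com.cn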