├── .gitignore ├── CHANGELOG.rst ├── LICENSE ├── README.rst ├── requirements.txt ├── setup.py ├── tagme └── __init__.py └── tests ├── __init__.py └── test_calls.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Eclipse and pydev 92 | /.project 93 | /.pydevproject 94 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 
| Changelog for tagme-python 3 | ========================== 4 | 5 | `0.1.3`_ (2017-04-06) 6 | ------------------- 7 | * Added support for Python 3 (thanks to `Aurélien Geron`_). 8 | 9 | `0.1.2`_ (2016-11-25) 10 | ------------------- 11 | * First release. 12 | 13 | .. _`Aurélien Geron`: https://github.com/ageron 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | tagme-python 3 | ============ 4 | 5 | Official TagMe API wrapper for Python. 6 | 7 | Installation and setup 8 | ---------------------- 9 | 10 | This library is hosted by PyPI. You can install it with: 11 | 12 | ``pip install tagme`` 13 | 14 | To access the TagMe API you have to register (for free!) at the D4Science platform and obtain an authorization *token*. 15 | 16 | - Register to the `D4Science TagMe VRE `_. 17 | - After login, click the *show* button on the left panel to get your authorization token. 18 | 19 | Using TagMe 20 | ----------- 21 | 22 | Before making any call to the web service, you will need to set the module-wise ``GCUBE_TOKEN`` variable. You can do so with: 23 | 24 | .. code-block:: python 25 | 26 | import tagme 27 | # Set the authorization token for subsequent calls. 
28 | tagme.GCUBE_TOKEN = "YOUR_GCUBE_TOKEN" 29 | 30 | As an alternative to setting the module-level variable, you can pass the token at each call with the optional ``gcube_token`` parameter. 31 | 32 | Annotation 33 | ---------- 34 | The annotation service lets you find entities mentioned in a text and link them to Wikipedia. 35 | This is the so-called Sa2KB problem. You can annotate a text with: 36 | 37 | .. code-block:: python 38 | 39 | lunch_annotations = tagme.annotate("My favourite meal is Mexican burritos.") 40 | 41 | # Print annotations with a score higher than 0.1 42 | for ann in lunch_annotations.get_annotations(0.1): 43 | print(ann) 44 | 45 | The ``annotate`` function accepts parameters to set the language (parameter ``lang``, which defaults to ``en``) and other options. 46 | See the code for more information. 47 | Each annotation is associated with a rho-score indicating the likelihood of the annotation being correct. In the example, we discard 48 | annotations with a score lower than 0.1. 49 | 50 | Mention finding 51 | --------------- 52 | 53 | The mention finding service lets you find what parts of text may be a mention of an entity, without linking them to any entity. 54 | 55 | .. code-block:: python 56 | 57 | tomatoes_mentions = tagme.mentions("I definitely like ice cream better than tomatoes.") 58 | 59 | for mention in tomatoes_mentions.mentions: 60 | print(mention) 61 | 62 | The ``mentions`` function accepts an optional language parameter ``lang`` which defaults to ``en``. 63 | 64 | Entity relatedness 65 | ------------------ 66 | 67 | Tagme also gives you the semantic relatedness among pairs of entities. Entities can be either specified as Wikipedia titles 68 | (like ``Barack Obama``) or as Wikipedia IDs (like ``534366``, the ID of the entity Barack Obama). 69 | The two functions for obtaining the relatedness among entities are ``relatedness_title`` (which accepts titles) and 70 | ``relatedness_wid`` (which accepts Wikipedia IDs). 
'''
Setup for Tagme API Wrapper.
'''

import codecs
from os import path
from setuptools import setup

# Directory containing this setup.py; used to locate README.rst regardless of
# the current working directory.
HERE = path.abspath(path.dirname(__file__))

# The README is reused verbatim as the long description of the package.
# codecs.open is used so the file is decoded as UTF-8 explicitly.
with codecs.open(path.join(HERE, 'README.rst'), encoding='utf-8') as f:
    LONG_DESCRIPTION = f.read()

setup(
    name='tagme',
    version='0.1.3',
    description='Official TagMe API wrapper for Python',
    long_description=LONG_DESCRIPTION,
    url='https://github.com/marcocor/tagme-python',
    author='Marco Cornolti',
    author_email='cornolti@di.unipi.it',
    license='Apache',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Intended Audience :: Information Technology',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: Apache Software License',
        'Operating System :: OS Independent',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: Text Processing :: Linguistic',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
    ],

    keywords='entity-linking nlp tagme api',

    # Only the single ``tagme`` package is distributed.
    packages=['tagme'],

    # Runtime dependencies of the tagme package.
    # NOTE(review): 'future' is listed here but not in requirements.txt, and
    # requirements.txt pins minimum versions while this list does not --
    # confirm which of the two is authoritative.
    install_requires=[
        'future',
        'python-dateutil',
        'requests',
        'six',
    ],

    # The 'test' extra currently declares no additional packages.
    extras_require={
        'test': [],
    },
)
3 | ''' 4 | 5 | from __future__ import absolute_import, division, print_function, unicode_literals 6 | 7 | import dateutil.parser 8 | import json 9 | import logging 10 | import requests 11 | import six 12 | 13 | from html.parser import HTMLParser 14 | 15 | __all__ = [ 16 | 'annotate', 'mentions', 'relatedness_wid', 'relatedness_title', 'Annotation', 17 | 'AnnotateResponse', 'Mention', 'MentionsResponse', 'Relatedness', 'RelatednessResponse', 18 | 'normalize_title', 'title_to_uri', 19 | ] 20 | 21 | __author__ = 'Marco Cornolti ' 22 | 23 | DEFAULT_TAG_API = "https://tagme.d4science.org/tagme/tag" 24 | DEFAULT_SPOT_API = "https://tagme.d4science.org/tagme/spot" 25 | DEFAULT_REL_API = "https://tagme.d4science.org/tagme/rel" 26 | DEFAULT_LANG = "en" 27 | DEFAULT_LONG_TEXT = 3 28 | WIKIPEDIA_URI_BASE = u"https://{}.wikipedia.org/wiki/{}" 29 | MAX_RELATEDNESS_PAIRS_PER_REQUEST = 100 30 | GCUBE_TOKEN = None 31 | HTML_PARSER = HTMLParser() 32 | 33 | class Annotation(object): 34 | ''' 35 | An annotation, i.e. a link of a part of text to an entity. 36 | ''' 37 | def __init__(self, ann_json): 38 | self.begin = int(ann_json.get("start")) 39 | self.end = int(ann_json.get("end")) 40 | self.entity_id = int(ann_json.get("id")) 41 | self.entity_title = ann_json.get("title") 42 | self.score = float(ann_json.get("rho")) 43 | self.mention = ann_json.get("spot") 44 | 45 | def __str__(self): 46 | return u"{} -> {} (score: {})".format(self.mention, self.entity_title, self.score) 47 | 48 | def uri(self, lang=DEFAULT_LANG): 49 | ''' 50 | Get the URI of this annotation entity. 51 | :param lang: the Wikipedia language. 52 | ''' 53 | return title_to_uri(self.entity_title, lang) 54 | 55 | 56 | class AnnotateResponse(object): 57 | ''' 58 | A response to a call to the annotation (/tag) service. It contains the list of annotations 59 | found. 
60 | ''' 61 | def __init__(self, json_content): 62 | self.annotations = [Annotation(ann_json) for ann_json in json_content["annotations"] if "title" in ann_json] 63 | self.time = int(json_content["time"]) 64 | self.lang = json_content["lang"] 65 | self.timestamp = dateutil.parser.parse(json_content["timestamp"]) 66 | self.original_json = json_content 67 | 68 | def get_annotations(self, min_rho=None): 69 | ''' 70 | Get the list of annotations found. 71 | :param min_rho: if set, only get entities with a rho-score (confidence) higher than this. 72 | ''' 73 | return (a for a in self.annotations if min_rho is None or a.score > min_rho) 74 | 75 | def __str__(self): 76 | return "{}msec, {} annotations".format(self.time, len(self.annotations)) 77 | 78 | 79 | class Mention(object): 80 | ''' 81 | A mention, i.e. a part of text that may mention an entity. 82 | ''' 83 | def __init__(self, mention_json): 84 | self.begin = int(mention_json.get("start")) 85 | self.end = int(mention_json.get("end")) 86 | self.linkprob = float(mention_json.get("lp")) 87 | self.mention = mention_json.get("spot") 88 | 89 | def __str__(self): 90 | return u"{} [{},{}] lp={}".format(self.mention, self.begin, self.end, self.linkprob) 91 | 92 | 93 | class MentionsResponse(object): 94 | ''' 95 | A response to a call to the mention finding (/spot) service. It contains the list of mentions 96 | found. 97 | ''' 98 | def __init__(self, json_content): 99 | self.mentions = [Mention(mention_json) for mention_json in json_content["spots"]] 100 | self.time = int(json_content["time"]) 101 | self.lang = json_content["lang"] 102 | self.timestamp = dateutil.parser.parse(json_content["timestamp"]) 103 | 104 | def get_mentions(self, min_lp=None): 105 | ''' 106 | Get the list of mentions found. 107 | :param min_lp: if set, only get mentions with a link probability higher than this. 
108 | ''' 109 | return (m for m in self.mentions if min_lp is None or m.linkprob > min_lp) 110 | 111 | def __str__(self): 112 | return "{}msec, {} mentions".format(self.time, len(self.mentions)) 113 | 114 | 115 | class Relatedness(object): 116 | ''' 117 | A relatedness, i.e. a real value between 0 and 1 indicating how semantically close two entities 118 | are. 119 | ''' 120 | def __init__(self, rel_json): 121 | self.title1, self.title2 = (wiki_title(t) for t in rel_json["couple"].split(" ")) 122 | self.rel = float(rel_json["rel"]) if "rel" in rel_json else None 123 | 124 | def as_pair(self): 125 | ''' 126 | Get this relatedness value as a pair (titles, rel), where rel is the relatedness value and 127 | titles is the pair of the two titles/Wikipedia IDs. 128 | ''' 129 | return ((self.title1, self.title2), self.rel) 130 | 131 | def __str__(self): 132 | return u"{}, {} rel={}".format(self.title1, self.title2, self.rel) 133 | 134 | 135 | class RelatednessResponse(object): 136 | ''' 137 | A response to a call to the relatedness (/rel) service. It contains the list of relatedness for 138 | each pair. 139 | ''' 140 | def __init__(self, json_contents): 141 | self.relatedness = [Relatedness(rel_json) 142 | for json_content in json_contents 143 | for rel_json in json_content["result"]] 144 | self.lang = json_contents[0]["lang"] 145 | self.timestamp = dateutil.parser.parse(json_contents[0]["timestamp"]) 146 | self.calls = len(json_contents) 147 | 148 | def __iter__(self): 149 | for rel in self.relatedness: 150 | yield rel.as_pair() 151 | 152 | def get_relatedness(self, i=0): 153 | ''' 154 | Get the relatedness of a pairs of entities. 155 | :param i: the index of an entity pair. The order is the same as the request. 
156 | ''' 157 | return self.relatedness[i].rel 158 | 159 | def __str__(self): 160 | return "{} relatedness pairs, {} calls".format(len(self.relatedness), self.calls) 161 | 162 | 163 | def normalize_title(title): 164 | ''' 165 | Normalize a title to Wikipedia format. E.g. "barack Obama" becomes "Barack_Obama" 166 | :param title: a title to normalize. 167 | ''' 168 | title = title.strip().replace(" ", "_") 169 | return title[0].upper() + title[1:] 170 | 171 | 172 | def wiki_title(title): 173 | ''' 174 | Given a normalized title, get the page title. E.g. "Barack_Obama" becomes "Barack Obama" 175 | :param title: a wikipedia title. 176 | ''' 177 | return HTML_PARSER.unescape(title.strip(" _").replace("_", " ")) 178 | 179 | 180 | def title_to_uri(entity_title, lang=DEFAULT_LANG): 181 | ''' 182 | Get the URI of the page describing a Wikipedia entity. 183 | :param entity_title: an entity title. 184 | :param lang: the Wikipedia language. 185 | ''' 186 | return WIKIPEDIA_URI_BASE.format(lang, normalize_title(entity_title)) 187 | 188 | 189 | def annotate(text, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_TAG_API, 190 | long_text=DEFAULT_LONG_TEXT): 191 | ''' 192 | Annotate a text, linking it to Wikipedia entities. 193 | :param text: the text to annotate. 194 | :param gcube_token: the authentication token provided by the D4Science infrastructure. 195 | :param lang: the Wikipedia language. 196 | :param api: the API endpoint. 197 | :param long_text: long_text parameter (see TagMe documentation). 198 | ''' 199 | payload = [("text", text.encode("utf-8")), 200 | ("long_text", long_text), 201 | ("lang", lang)] 202 | json_response = _issue_request(api, payload, gcube_token) 203 | return AnnotateResponse(json_response) if json_response else None 204 | 205 | 206 | def mentions(text, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_SPOT_API): 207 | ''' 208 | Find possible mentions in a text, do not link them to any entity. 209 | :param text: the text where to find mentions. 
210 | :param gcube_token: the authentication token provided by the D4Science infrastructure. 211 | :param lang: the Wikipedia language. 212 | :param api: the API endpoint. 213 | ''' 214 | payload = [("text", text.encode("utf-8")), 215 | ("lang", lang.encode("utf-8"))] 216 | json_response = _issue_request(api, payload, gcube_token) 217 | return MentionsResponse(json_response) if json_response else None 218 | 219 | 220 | def relatedness_wid(wid_pairs, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_REL_API): 221 | ''' 222 | Get the semantic relatedness among pairs of entities. Entities are indicated by their 223 | Wikipedia ID (an integer). 224 | :param wid_pairs: either one pair or a list of pairs of Wikipedia IDs. 225 | :param gcube_token: the authentication token provided by the D4Science infrastructure. 226 | :param lang: the Wikipedia language. 227 | :param api: the API endpoint. 228 | ''' 229 | return _relatedness("id", wid_pairs, gcube_token, lang, api) 230 | 231 | 232 | def relatedness_title(tt_pairs, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_REL_API): 233 | ''' 234 | Get the semantic relatedness among pairs of entities. Entities are indicated by their 235 | Wikipedia ID (an integer). 236 | :param tt_pairs: either one pair or a list of pairs of entity titles. 237 | :param gcube_token: the authentication token provided by the D4Science infrastructure. 238 | :param lang: the Wikipedia language. 239 | :param api: the API endpoint. 
240 | ''' 241 | return _relatedness("tt", tt_pairs, gcube_token, lang, api) 242 | 243 | 244 | def _relatedness(pairs_type, pairs, gcube_token, lang, api): 245 | if not isinstance(pairs[0], (list, tuple)): 246 | pairs = [pairs] 247 | 248 | if isinstance(pairs[0][0], six.binary_type): # str in python 2, bytes in python 3 249 | pairs = [(p[0].decode("utf-8"), p[1].decode("utf-8")) for p in pairs] 250 | 251 | if isinstance(pairs[0][0], six.text_type): # unicode in python 2, str in python 3 252 | pairs = [(normalize_title(p[0]), normalize_title(p[1])) for p in pairs] 253 | 254 | json_responses = [] 255 | for chunk in range(0, len(pairs), MAX_RELATEDNESS_PAIRS_PER_REQUEST): 256 | payload = [("lang", lang)] 257 | payload += ((pairs_type, u"{} {}".format(p[0], p[1])) 258 | for p in pairs[chunk:chunk + MAX_RELATEDNESS_PAIRS_PER_REQUEST]) 259 | json_responses.append(_issue_request(api, payload, gcube_token)) 260 | return RelatednessResponse(json_responses) if json_responses and json_responses[0] else None 261 | 262 | 263 | def _issue_request(api, payload, gcube_token): 264 | if not gcube_token: 265 | gcube_token = GCUBE_TOKEN 266 | if not gcube_token: 267 | raise RuntimeError("You must define GCUBE_TOKEN before calling this function or pass the " 268 | "gcube_token parameter.") 269 | 270 | payload.append(("gcube-token", gcube_token)) 271 | logging.debug("Calling %s", api) 272 | res = requests.post(api, data=payload) 273 | if res.status_code != 200: 274 | logging.warning("Tagme returned status code %d message:\n%s", res.status_code, res.content) 275 | return None 276 | res_content = res.content.decode("utf-8") if isinstance(res.content, six.binary_type) else res.content 277 | return json.loads(res_content) 278 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/marcocor/tagme-python/008a8a30419db938729111f0d7c5c6c1518f9797/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_calls.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals 2 | 3 | import sys 4 | import tagme 5 | 6 | SAMPLE_TEXT = "Obama visited uk" 7 | 8 | def main(): 9 | # Annotate a text. 10 | print("Annotating text: ", SAMPLE_TEXT) 11 | resp = tagme.annotate(SAMPLE_TEXT) 12 | print(resp) 13 | for ann in resp.annotations: 14 | print(ann) 15 | 16 | # Find mentions in a text. 17 | print("Finding mentions in text: ", SAMPLE_TEXT) 18 | resp = tagme.mentions(SAMPLE_TEXT) 19 | print(resp) 20 | for mention in resp.mentions: 21 | print(mention) 22 | 23 | # Find relatedness between one pair of entities, by title. 24 | resp = tagme.relatedness_title(["Barack_Obama", "Italy"]) 25 | print(resp) 26 | for rel in resp.relatedness: 27 | print(rel) 28 | 29 | # Find relatedness between pairs of entities, by title. 30 | resp = tagme.relatedness_title([("Barack_Obama", "Italy"), 31 | ("Italy", "Germany"), 32 | ("Italy", "BAD ENTITY NAME")]) 33 | print(resp) 34 | for rel in resp.relatedness: 35 | print(rel) 36 | 37 | # Access the relatedness response as a dictionary. 
38 | resp_dict = dict(resp) 39 | print("Relatedness between Italy and Germany: ", resp_dict[("Italy", "Germany")]) 40 | 41 | # Find relatedness between one pair of entities, by wikipedia id 42 | resp = tagme.relatedness_wid((31717, 534366)) 43 | print(resp) 44 | for rel in resp.relatedness: 45 | print(rel) 46 | 47 | # Find relatedness between pairs of entities, by wikipedia id 48 | resp = tagme.relatedness_wid([(534366, 534366 + a) for a in range (1010)]) 49 | print(resp) 50 | for rel in resp.relatedness: 51 | print(rel) 52 | 53 | if __name__ == "__main__": 54 | tagme.GCUBE_TOKEN = sys.argv[1] 55 | assert tagme.normalize_title(" barack Obama ") == "Barack_Obama" 56 | assert tagme.title_to_uri(" barack Obama ") == "https://en.wikipedia.org/wiki/Barack_Obama" 57 | assert tagme.wiki_title("Barack_Obama") == ("Barack Obama") 58 | main() 59 | --------------------------------------------------------------------------------