├── .gitignore
├── CHANGELOG.rst
├── LICENSE
├── README.rst
├── requirements.txt
├── setup.py
├── tagme
    └── __init__.py
└── tests
    ├── __init__.py
    └── test_calls.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | # Eclipse and pydev
92 | /.project
93 | /.pydevproject
94 | 


--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
 1 | ==========================
 2 | Changelog for tagme-python
 3 | ==========================
 4 | 
 5 | `0.1.3`_ (2017-04-06)
 6 | ---------------------
 7 | * Added support for Python 3 (thanks to `Aurélien Geron`_).
 8 | 
 9 | `0.1.2`_ (2016-11-25)
10 | ---------------------
11 | * First release.
12 | 
13 | .. _`Aurélien Geron`: https://github.com/ageron
14 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | ============
  2 | tagme-python
  3 | ============
  4 | 
  5 | Official TagMe API wrapper for Python.
  6 | 
  7 | Installation and setup
  8 | ----------------------
  9 | 
 10 | This library is hosted by PyPI. You can install it with:
 11 | 
 12 | ``pip install tagme``
 13 | 
 14 | To access the TagMe API you have to register (for free!) at the D4Science platform and obtain an authorization *token*.
 15 | 
 16 | - Register to the `D4Science TagMe VRE <https://services.d4science.org/group/tagme/>`_.
 17 | - After login, click the *show* button on the left panel to get your authorization token.
 18 | 
 19 | Using TagMe
 20 | -----------
 21 | 
 22 | Before making any call to the web service, you will need to set the module-level ``GCUBE_TOKEN`` variable. You can do so with:
 23 | 
 24 | .. code-block:: python
 25 | 
 26 |  import tagme
 27 |  # Set the authorization token for subsequent calls.
 28 |  tagme.GCUBE_TOKEN = "<Your token goes here>"
 29 | 
 30 | As an alternative to setting the module-level variable, you can pass the token at each call with the optional ``gcube_token`` parameter.
 31 | 
 32 | Annotation
 33 | ----------
 34 | The annotation service lets you find entities mentioned in a text and link them to Wikipedia.
 35 | This is the so-called Sa2KB problem. You can annotate a text with:
 36 | 
 37 | .. code-block:: python
 38 | 
 39 |  lunch_annotations = tagme.annotate("My favourite meal is Mexican burritos.")
 40 |  
 41 |  # Print annotations with a score higher than 0.1
 42 |  for ann in lunch_annotations.get_annotations(0.1):
 43 |      print(ann)
 44 | 
 45 | The ``annotate`` method accepts parameters to set the language (parameter ``lang``, that defaults to ``en``) and other stuff.
 46 | See the code for more information.
 47 | Annotations are associated with a rho-score indicating the likelihood of an annotation being correct. In the example, we discard
 48 | annotations with a score lower than 0.1.
 49 | 
 50 | Mention finding
 51 | ---------------
 52 | 
 53 | The mention finding service lets you find what parts of text may be a mention of an entity, without linking them to any entity.
 54 | 
 55 | .. code-block:: python
 56 | 
 57 |  tomatoes_mentions = tagme.mentions("I definitely like ice cream better than tomatoes.")
 58 | 
 59 |  for mention in tomatoes_mentions.mentions:
 60 |      print(mention)
 61 | 
 62 | The ``mentions`` method accepts an optional language parameter ``lang`` that defaults to ``en``.
 63 | 
 64 | Entity relatedness
 65 | ------------------
 66 | 
 67 | TagMe also gives you the semantic relatedness among pairs of entities. Entities can be either specified as Wikipedia titles
 68 | (like ``Barack Obama``) or as Wikipedia IDs (like ``534366``, the ID of the entity Barack Obama).
 69 | The two methods for obtaining the relatedness among entities are ``relatedness_title`` (that accepts titles) and
 70 | ``relatedness_wid`` (that accepts Wikipedia IDs). Both methods accept either a single pair of entities or a list of pairs.
 71 | You can submit a list of pairs of any size: the library splits it and queries the TagMe web service once for every 100 pairs.
 72 | If one entity does not exist, the result will be ``None``.
 73 | 
 74 | .. code-block:: python
 75 | 
 76 |  # Get relatedness between a pair of entities specified by title.
 77 |  rels = tagme.relatedness_title(("Barack Obama", "Italy"))
 78 |  print("Obama and Italy have a semantic relation of", rels.relatedness[0].rel)
 79 |  
 80 |  # Get relatedness between a pair of entities specified by Wikipedia ID.
 81 |  rels = tagme.relatedness_wid((31717, 534366))
 82 |  print("IDs 31717 and 534366 have a semantic relation of", rels.relatedness[0].rel)
 83 |  
 84 |  # Get relatedness between three pairs of entities specified by title.
 85 |  # The last entity does not exist, hence the value for that pair will be None.
 86 |  rels = tagme.relatedness_title([("Barack_Obama", "Italy"),
 87 |                                  ("Italy", "Germany"),
 88 |                                  ("Italy", "BAD ENTITY NAME")])
 89 |  for rel in rels.relatedness:
 90 |      print(rel)
 91 | 
 92 |  # You can also build a dictionary
 93 |  rels_dict = dict(rels)
 94 |  print(rels_dict[("Barack Obama", "Italy")])
 95 |  
 96 | Changelog
 97 | ---------
 98 | 
 99 | See the `Changelog`_.
100 | 
101 | .. _Changelog: CHANGELOG.rst
102 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | python-dateutil>=2.6.0
2 | requests>=2.13.0
3 | six>=1.10.0
4 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Setup for Tagme API Wrapper.
 3 | '''
 4 | 
 5 | import codecs
 6 | from os import path
 7 | from setuptools import setup
 8 | 
 9 | HERE = path.abspath(path.dirname(__file__))
10 | 
11 | with codecs.open(path.join(HERE, 'README.rst'), encoding='utf-8') as f:
12 |     LONG_DESCRIPTION = f.read()
13 | 
14 | setup(
15 |     name='tagme',
16 |     version='0.1.3',
17 |     description='Official TagMe API wrapper for Python',
18 |     long_description=LONG_DESCRIPTION,
19 |     url='https://github.com/marcocor/tagme-python',
20 |     author='Marco Cornolti',
21 |     author_email='cornolti@di.unipi.it',
22 |     license='Apache',
23 |     classifiers=[
24 |         'Development Status :: 4 - Beta',
25 |         'Intended Audience :: Developers',
26 |         'Intended Audience :: Information Technology',
27 |         'Intended Audience :: Science/Research',
28 |         'License :: OSI Approved :: Apache Software License',
29 |         'Operating System :: OS Independent',
30 |         'Topic :: Scientific/Engineering :: Information Analysis',
31 |         'Topic :: Software Development :: Libraries :: Python Modules',
32 |         'Topic :: Text Processing :: Linguistic',
33 |         'Programming Language :: Python :: 2',
34 |         'Programming Language :: Python :: 2.7',
35 |         'Programming Language :: Python :: 3',
36 |         'Programming Language :: Python :: 3.5',
37 |     ],
38 | 
39 |     keywords='entity-linking nlp tagme api',
40 | 
41 |     packages=['tagme'],
42 | 
43 |     install_requires=[
44 |         'future',
45 |         'python-dateutil',
46 |         'requests',
47 |         'six',
48 |     ],
49 | 
50 |     extras_require={
51 |         'test': [],
52 |     },
53 | )
54 | 


--------------------------------------------------------------------------------
/tagme/__init__.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | This module provides a wrapper for the TagMe API.
  3 | '''
  4 | 
  5 | from __future__ import absolute_import, division, print_function, unicode_literals
  6 | 
  7 | import dateutil.parser
  8 | import json
  9 | import logging
 10 | import requests
 11 | import six
 12 | 
 13 | from html.parser import HTMLParser
 14 | 
 15 | __all__ = [
 16 |     'annotate', 'mentions', 'relatedness_wid', 'relatedness_title', 'Annotation',
 17 |     'AnnotateResponse', 'Mention', 'MentionsResponse', 'Relatedness', 'RelatednessResponse',
 18 |     'normalize_title', 'title_to_uri',
 19 |     ]
 20 | 
 21 | __author__ = 'Marco Cornolti <cornolti@di.unipi.it>'
 22 | 
 23 | DEFAULT_TAG_API = "https://tagme.d4science.org/tagme/tag"
 24 | DEFAULT_SPOT_API = "https://tagme.d4science.org/tagme/spot"
 25 | DEFAULT_REL_API = "https://tagme.d4science.org/tagme/rel"
 26 | DEFAULT_LANG = "en"
 27 | DEFAULT_LONG_TEXT = 3
 28 | WIKIPEDIA_URI_BASE = u"https://{}.wikipedia.org/wiki/{}"
 29 | MAX_RELATEDNESS_PAIRS_PER_REQUEST = 100
 30 | GCUBE_TOKEN = None
 31 | HTML_PARSER = HTMLParser()
 32 | 
 33 | class Annotation(object):
 34 |     '''
 35 |     An annotation, i.e. a link of a part of text to an entity.
 36 |     '''
 37 |     def __init__(self, ann_json):
 38 |         self.begin = int(ann_json.get("start"))
 39 |         self.end = int(ann_json.get("end"))
 40 |         self.entity_id = int(ann_json.get("id"))
 41 |         self.entity_title = ann_json.get("title")
 42 |         self.score = float(ann_json.get("rho"))
 43 |         self.mention = ann_json.get("spot")
 44 | 
 45 |     def __str__(self):
 46 |         return u"{} -> {} (score: {})".format(self.mention, self.entity_title, self.score)
 47 | 
 48 |     def uri(self, lang=DEFAULT_LANG):
 49 |         '''
 50 |         Get the URI of this annotation entity.
 51 |         :param lang: the Wikipedia language.
 52 |         '''
 53 |         return title_to_uri(self.entity_title, lang)
 54 | 
 55 | 
 56 | class AnnotateResponse(object):
 57 |     '''
 58 |     A response to a call to the annotation (/tag) service. It contains the list of annotations
 59 |     found.
 60 |     '''
 61 |     def __init__(self, json_content):
 62 |         self.annotations = [Annotation(ann_json) for ann_json in json_content["annotations"] if "title" in ann_json]
 63 |         self.time = int(json_content["time"])
 64 |         self.lang = json_content["lang"]
 65 |         self.timestamp = dateutil.parser.parse(json_content["timestamp"])
 66 |         self.original_json = json_content
 67 | 
 68 |     def get_annotations(self, min_rho=None):
 69 |         '''
 70 |         Get the list of annotations found.
 71 |         :param min_rho: if set, only get entities with a rho-score (confidence) higher than this.
 72 |         '''
 73 |         return (a for a in self.annotations if min_rho is None or a.score > min_rho)
 74 | 
 75 |     def __str__(self):
 76 |         return "{}msec, {} annotations".format(self.time, len(self.annotations))
 77 | 
 78 | 
 79 | class Mention(object):
 80 |     '''
 81 |     A mention, i.e. a part of text that may mention an entity.
 82 |     '''
 83 |     def __init__(self, mention_json):
 84 |         self.begin = int(mention_json.get("start"))
 85 |         self.end = int(mention_json.get("end"))
 86 |         self.linkprob = float(mention_json.get("lp"))
 87 |         self.mention = mention_json.get("spot")
 88 | 
 89 |     def __str__(self):
 90 |         return u"{} [{},{}] lp={}".format(self.mention, self.begin, self.end, self.linkprob)
 91 | 
 92 | 
 93 | class MentionsResponse(object):
 94 |     '''
 95 |     A response to a call to the mention finding (/spot) service. It contains the list of mentions
 96 |     found.
 97 |     '''
 98 |     def __init__(self, json_content):
 99 |         self.mentions = [Mention(mention_json) for mention_json in json_content["spots"]]
100 |         self.time = int(json_content["time"])
101 |         self.lang = json_content["lang"]
102 |         self.timestamp = dateutil.parser.parse(json_content["timestamp"])
103 | 
104 |     def get_mentions(self, min_lp=None):
105 |         '''
106 |         Get the list of mentions found.
107 |         :param min_lp: if set, only get mentions with a link probability higher than this.
108 |         '''
109 |         return (m for m in self.mentions if min_lp is None or m.linkprob > min_lp)
110 | 
111 |     def __str__(self):
112 |         return "{}msec, {} mentions".format(self.time, len(self.mentions))
113 | 
114 | 
class Relatedness(object):
    '''
    The semantic relatedness of a pair of entities. The value is None when the response item
    carries no "rel" field.
    '''
    def __init__(self, rel_json):
        '''
        :param rel_json: one item of the "result" array of a /rel response, providing the
        key "couple" (two space-separated entities) and, optionally, "rel".
        '''
        titles = [wiki_title(part) for part in rel_json["couple"].split(" ")]
        self.title1, self.title2 = titles
        if "rel" in rel_json:
            self.rel = float(rel_json["rel"])
        else:
            self.rel = None

    def as_pair(self):
        '''
        Get this relatedness as a pair (titles, rel), where titles is the pair of the two
        titles/Wikipedia IDs and rel is the relatedness value.
        '''
        titles = (self.title1, self.title2)
        return (titles, self.rel)

    def __str__(self):
        template = u"{}, {} rel={}"
        return template.format(self.title1, self.title2, self.rel)
133 | 
134 | 
class RelatednessResponse(object):
    '''
    The outcome of one or more calls to the relatedness (/rel) service: the relatedness of
    every pair, gathered across all the responses.
    '''
    def __init__(self, json_contents):
        '''
        :param json_contents: the list of decoded JSON bodies, one per call issued.
        '''
        collected = []
        for json_content in json_contents:
            collected.extend(Relatedness(rel_json) for rel_json in json_content["result"])
        self.relatedness = collected
        # Language and timestamp are taken from the first response only.
        first_response = json_contents[0]
        self.lang = first_response["lang"]
        self.timestamp = dateutil.parser.parse(first_response["timestamp"])
        self.calls = len(json_contents)

    def __iter__(self):
        # Yielding (titles, rel) pairs lets callers build a dictionary with dict(response).
        for item in self.relatedness:
            yield item.as_pair()

    def get_relatedness(self, i=0):
        '''
        Get the relatedness value of one pair of entities.
        :param i: the index of the pair in the list of results.
        '''
        item = self.relatedness[i]
        return item.rel

    def __str__(self):
        pair_count = len(self.relatedness)
        return "{} relatedness pairs, {} calls".format(pair_count, self.calls)
161 | 
162 | 
def normalize_title(title):
    '''
    Normalize a title to Wikipedia format. E.g. "barack Obama" becomes "Barack_Obama".
    Surrounding whitespace is removed, inner spaces become underscores and the first
    character is upper-cased. An empty or whitespace-only title yields an empty string.
    :param title: a title to normalize.
    :return: the normalized title.
    '''
    title = title.strip().replace(" ", "_")
    # Slicing (rather than indexing) keeps an empty title from raising IndexError.
    return title[:1].upper() + title[1:]
170 | 
171 | 
def wiki_title(title):
    '''
    Given a normalized title, get the page title. E.g. "Barack_Obama" becomes "Barack Obama".
    Leading/trailing spaces and underscores are dropped, inner underscores become spaces and
    HTML character references (e.g. "&amp;") are replaced by the characters they stand for.
    :param title: a wikipedia title.
    :return: the page title.
    '''
    # HTMLParser.unescape() was removed in Python 3.9. html.unescape() is its replacement
    # (available since Python 3.4); fall back to the parser method where it is missing.
    try:
        from html import unescape
    except ImportError:
        unescape = HTML_PARSER.unescape
    return unescape(title.strip(" _").replace("_", " "))
178 | 
179 | 
def title_to_uri(entity_title, lang=DEFAULT_LANG):
    '''
    Build the URI of the Wikipedia page of an entity. The title is normalized first.
    :param entity_title: an entity title.
    :param lang: the Wikipedia language.
    '''
    page_name = normalize_title(entity_title)
    return WIKIPEDIA_URI_BASE.format(lang, page_name)
187 | 
188 | 
def annotate(text, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_TAG_API,
             long_text=DEFAULT_LONG_TEXT):
    '''
    Find the entities mentioned in a text and link them to Wikipedia.
    :param text: the text to annotate.
    :param gcube_token: the authentication token provided by the D4Science infrastructure.
    :param lang: the Wikipedia language.
    :param api: the API endpoint.
    :param long_text: long_text parameter (see TagMe documentation).
    :return: an AnnotateResponse, or None if the request produced no response content.
    '''
    encoded_text = text.encode("utf-8")
    payload = [
        ("text", encoded_text),
        ("long_text", long_text),
        ("lang", lang),
    ]
    json_response = _issue_request(api, payload, gcube_token)
    if not json_response:
        return None
    return AnnotateResponse(json_response)
204 | 
205 | 
def mentions(text, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_SPOT_API):
    '''
    Find the parts of a text that may mention an entity, without linking them to any entity.
    :param text: the text where to find mentions.
    :param gcube_token: the authentication token provided by the D4Science infrastructure.
    :param lang: the Wikipedia language.
    :param api: the API endpoint.
    :return: a MentionsResponse, or None if the request produced no response content.
    '''
    encoded_text = text.encode("utf-8")
    encoded_lang = lang.encode("utf-8")
    payload = [
        ("text", encoded_text),
        ("lang", encoded_lang),
    ]
    json_response = _issue_request(api, payload, gcube_token)
    if not json_response:
        return None
    return MentionsResponse(json_response)
218 | 
219 | 
def relatedness_wid(wid_pairs, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_REL_API):
    '''
    Compute the semantic relatedness of pairs of entities, each entity being identified by
    its Wikipedia ID (an integer).
    :param wid_pairs: either one pair or a list of pairs of Wikipedia IDs.
    :param gcube_token: the authentication token provided by the D4Science infrastructure.
    :param lang: the Wikipedia language.
    :param api: the API endpoint.
    '''
    # "id" is the name of the request parameter that carries each pair.
    return _relatedness(pairs_type="id", pairs=wid_pairs, gcube_token=gcube_token,
                        lang=lang, api=api)
230 | 
231 | 
def relatedness_title(tt_pairs, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_REL_API):
    '''
    Compute the semantic relatedness of pairs of entities, each entity being identified by
    its Wikipedia title.
    :param tt_pairs: either one pair or a list of pairs of entity titles.
    :param gcube_token: the authentication token provided by the D4Science infrastructure.
    :param lang: the Wikipedia language.
    :param api: the API endpoint.
    '''
    # "tt" is the name of the request parameter that carries each pair.
    return _relatedness(pairs_type="tt", pairs=tt_pairs, gcube_token=gcube_token,
                        lang=lang, api=api)
242 | 
243 | 
244 | def _relatedness(pairs_type, pairs, gcube_token, lang, api):
245 |     if not isinstance(pairs[0], (list, tuple)):
246 |         pairs = [pairs]
247 | 
248 |     if isinstance(pairs[0][0], six.binary_type):  # str in python 2, bytes in python 3
249 |         pairs = [(p[0].decode("utf-8"), p[1].decode("utf-8")) for p in pairs]
250 | 
251 |     if isinstance(pairs[0][0], six.text_type):  # unicode in python 2, str in python 3
252 |         pairs = [(normalize_title(p[0]), normalize_title(p[1])) for p in pairs]
253 | 
254 |     json_responses = []
255 |     for chunk in range(0, len(pairs), MAX_RELATEDNESS_PAIRS_PER_REQUEST):
256 |         payload = [("lang", lang)]
257 |         payload += ((pairs_type, u"{} {}".format(p[0], p[1]))
258 |                     for p in pairs[chunk:chunk + MAX_RELATEDNESS_PAIRS_PER_REQUEST])
259 |         json_responses.append(_issue_request(api, payload, gcube_token))
260 |     return RelatednessResponse(json_responses) if json_responses and json_responses[0] else None
261 | 
262 | 
def _issue_request(api, payload, gcube_token):
    '''
    POST a request to an API endpoint and return its decoded JSON response.
    :param api: the URL of the API endpoint.
    :param payload: list of (name, value) form fields to send; it is left unmodified.
    :param gcube_token: the authentication token; when empty or None the module-level
        GCUBE_TOKEN is used instead.
    :return: the parsed JSON body, or None if the service answered with a status code other
        than 200 (the status and body are logged as a warning).
    :raises RuntimeError: if neither gcube_token nor GCUBE_TOKEN provides a token.
    '''
    token = gcube_token or GCUBE_TOKEN
    if not token:
        raise RuntimeError("You must define GCUBE_TOKEN before calling this function or pass the "
                           "gcube_token parameter.")

    # Build a new list instead of appending to the caller's one, so the token neither leaks
    # into nor accumulates in a payload the caller may reuse.
    data = list(payload) + [("gcube-token", token)]
    logging.debug("Calling %s", api)
    res = requests.post(api, data=data)
    if res.status_code != 200:
        logging.warning("Tagme returned status code %d message:\n%s", res.status_code, res.content)
        return None
    # json.loads is given text: decode the body first if it came back as bytes.
    res_content = res.content.decode("utf-8") if isinstance(res.content, six.binary_type) else res.content
    return json.loads(res_content)
278 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marcocor/tagme-python/008a8a30419db938729111f0d7c5c6c1518f9797/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_calls.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import, division, print_function, unicode_literals
 2 | 
 3 | import sys
 4 | import tagme
 5 | 
 6 | SAMPLE_TEXT = "Obama visited uk"
 7 | 
 8 | def main():
 9 |     # Annotate a text.
10 |     print("Annotating text: ", SAMPLE_TEXT)
11 |     resp = tagme.annotate(SAMPLE_TEXT)
12 |     print(resp)
13 |     for ann in resp.annotations:
14 |         print(ann)
15 | 
16 |     # Find mentions in a text.
17 |     print("Finding mentions in text: ", SAMPLE_TEXT)
18 |     resp = tagme.mentions(SAMPLE_TEXT)
19 |     print(resp)
20 |     for mention in resp.mentions:
21 |         print(mention)
22 | 
23 |     # Find relatedness between one pair of entities, by title.
24 |     resp = tagme.relatedness_title(["Barack_Obama", "Italy"])
25 |     print(resp)
26 |     for rel in resp.relatedness:
27 |         print(rel)
28 | 
29 |     # Find relatedness between pairs of entities, by title.
30 |     resp = tagme.relatedness_title([("Barack_Obama", "Italy"),
31 |                                 ("Italy", "Germany"),
32 |                                 ("Italy", "BAD ENTITY NAME")])
33 |     print(resp)
34 |     for rel in resp.relatedness:
35 |         print(rel)
36 | 
37 |     # Access the relatedness response as a dictionary.
38 |     resp_dict = dict(resp)
39 |     print("Relatedness between Italy and Germany: ", resp_dict[("Italy", "Germany")])
40 | 
41 |     # Find relatedness between one pair of entities, by wikipedia id
42 |     resp = tagme.relatedness_wid((31717, 534366))
43 |     print(resp)
44 |     for rel in resp.relatedness:
45 |         print(rel)
46 | 
47 |     # Find relatedness between pairs of entities, by wikipedia id
48 |     resp = tagme.relatedness_wid([(534366, 534366 + a) for a in range (1010)])
49 |     print(resp)
50 |     for rel in resp.relatedness:
51 |         print(rel)
52 | 
if __name__ == "__main__":
    # The authentication token is read from the first command-line argument.
    tagme.GCUBE_TOKEN = sys.argv[1]

    # Check the title helpers before running the calls in main().
    raw_title = " barack Obama  "
    assert tagme.normalize_title(raw_title) == "Barack_Obama"
    assert tagme.title_to_uri(raw_title) == "https://en.wikipedia.org/wiki/Barack_Obama"
    assert tagme.wiki_title("Barack_Obama") == "Barack Obama"
    main()
59 | 


--------------------------------------------------------------------------------