├── .gitignore
├── CHANGELOG.rst
├── LICENSE
├── README.rst
├── requirements.txt
├── setup.py
├── tagme
└── __init__.py
└── tests
├── __init__.py
└── test_calls.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
91 | # Eclipse and pydev
92 | /.project
93 | /.pydevproject
94 |
--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
1 | ==========================
2 | Changelog for tagme-python
3 | ==========================
4 |
5 | `0.1.3`_ (2017-04-06)
6 | ---------------------
7 | * Added support for Python 3 (thanks to `Aurélien Geron`_).
8 |
9 | `0.1.2`_ (2016-11-25)
10 | ---------------------
11 | * First release.
12 |
13 | .. _`Aurélien Geron`: https://github.com/ageron
14 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ============
2 | tagme-python
3 | ============
4 |
5 | Official TagMe API wrapper for Python.
6 |
7 | Installation and setup
8 | ----------------------
9 |
10 | This library is hosted by PyPI. You can install it with:
11 |
12 | ``pip install tagme``
13 |
14 | To access the TagMe API you have to register (for free!) at the D4Science platform and obtain an authorization *token*.
15 |
16 | - Register at the `D4Science TagMe VRE <https://services.d4science.org/web/tagme/>`_.
17 | - After login, click the *show* button on the left panel to get your authorization token.
18 |
19 | Using TagMe
20 | -----------
21 |
22 | Before making any call to the web service, you will need to set the module-wise ``GCUBE_TOKEN`` variable. You can do so with:
23 |
24 | .. code-block:: python
25 |
26 | import tagme
27 | # Set the authorization token for subsequent calls.
28 | tagme.GCUBE_TOKEN = "<Your Token goes here>"
29 |
30 | As an alternative to setting the module-wise variable, you can pass the token at each call with the optional ``gcube_token`` parameter.
31 |
32 | Annotation
33 | ----------
34 | The annotation service lets you find entities mentioned in a text and link them to Wikipedia.
35 | This is the so-called Sa2KB problem. You can annotate a text with:
36 |
37 | .. code-block:: python
38 |
39 | lunch_annotations = tagme.annotate("My favourite meal is Mexican burritos.")
40 |
41 | # Print annotations with a score higher than 0.1
42 | for ann in lunch_annotations.get_annotations(0.1):
43 | print(ann)
44 |
45 | The ``annotate`` method accepts parameters to set the language (parameter ``lang``, that defaults to ``en``) and other stuff.
46 | See the code for more information.
47 | Annotations are associated with a rho-score indicating the likelihood of an annotation being correct. In the example, we discard
48 | annotations with a score lower than 0.1.
49 |
50 | Mention finding
51 | ---------------
52 |
53 | The mention finding service lets you find what parts of text may be a mention of an entity, without linking them to any entity.
54 |
55 | .. code-block:: python
56 |
57 | tomatoes_mentions = tagme.mentions("I definitely like ice cream better than tomatoes.")
58 |
59 | for mention in tomatoes_mentions.mentions:
60 | print(mention)
61 |
62 | The ``mentions`` method accepts an optional language parameter ``lang`` that defaults to ``en``.
63 |
64 | Entity relatedness
65 | ------------------
66 |
67 | Tagme also gives you the semantic relatedness among pairs of entities. Entities can be either specified as Wikipedia titles
68 | (like ``Barack Obama``) or as Wikipedia IDs (like ``534366``, the ID of the entity Barack Obama).
69 | The two methods for obtaining the relatedness among entities are ``relatedness_title`` (that accepts titles) and
70 | ``relatedness_wid`` (that accepts Wikipedia IDs). Both methods accept either a single pair of entities or a list of pairs.
71 | You can submit a list of pairs of any size; the library issues one query to the TagMe web service for every 100 pairs.
72 | If one entity does not exist, the result will be ``None``.
73 |
74 | .. code-block:: python
75 |
76 | # Get relatedness between a pair of entities specified by title.
77 | rels = tagme.relatedness_title(("Barack Obama", "Italy"))
78 | print("Obama and Italy have a semantic relation of {}".format(rels.relatedness[0].rel))
79 |
80 | # Get relatedness between a pair of entities specified by Wikipedia ID.
81 | rels = tagme.relatedness_wid((31717, 534366))
82 | print("IDs 31717 and 534366 have a semantic relation of {}".format(rels.relatedness[0].rel))
83 |
84 | # Get relatedness between three pairs of entities specified by title.
85 | # The last entity does not exist, hence the value for that pair will be None.
86 | rels = tagme.relatedness_title([("Barack_Obama", "Italy"),
87 | ("Italy", "Germany"),
88 | ("Italy", "BAD ENTITY NAME")])
89 | for rel in rels.relatedness:
90 | print(rel)
91 |
92 | # You can also build a dictionary
93 | rels_dict = dict(rels)
94 | print(rels_dict[("Barack Obama", "Italy")])
95 |
96 | Changelog
97 | ---------
98 |
99 | See the `Changelog`_.
100 |
101 | .. _Changelog: CHANGELOG.rst
102 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | python-dateutil>=2.6.0
2 | requests>=2.13.0
3 | six>=1.10.0
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | '''
2 | Setup for Tagme API Wrapper.
3 | '''
4 |
5 | import codecs
6 | from os import path
7 | from setuptools import setup
8 |
# Absolute path of the directory that contains this setup script.
HERE = path.abspath(path.dirname(__file__))

# The README is read as UTF-8 and reused verbatim as the long description.
with codecs.open(path.join(HERE, 'README.rst'), encoding='utf-8') as f:
    LONG_DESCRIPTION = f.read()

# Distribution metadata and dependencies for the ``tagme`` package.
setup(
    name='tagme',
    version='0.1.3',
    description='Official TagMe API wrapper for Python',
    long_description=LONG_DESCRIPTION,
    url='https://github.com/marcocor/tagme-python',
    author='Marco Cornolti',
    author_email='cornolti@di.unipi.it',
    license='Apache',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Intended Audience :: Information Technology',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: Apache Software License',
        'Operating System :: OS Independent',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: Text Processing :: Linguistic',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
    ],

    keywords='entity-linking nlp tagme api',

    # Only the single top-level ``tagme`` package is shipped.
    packages=['tagme'],

    # Runtime dependencies, left unpinned here.
    install_requires=[
        'future',
        'python-dateutil',
        'requests',
        'six',
    ],

    # The ``test`` extra is declared but currently pulls in nothing.
    extras_require={
        'test': [],
    },
)
54 |
--------------------------------------------------------------------------------
/tagme/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | This module provides a wrapper for the TagMe API.
3 | '''
4 |
5 | from __future__ import absolute_import, division, print_function, unicode_literals
6 |
7 | import dateutil.parser
8 | import json
9 | import logging
10 | import requests
11 | import six
12 |
13 | from html.parser import HTMLParser
14 |
15 | __all__ = [
16 | 'annotate', 'mentions', 'relatedness_wid', 'relatedness_title', 'Annotation',
17 | 'AnnotateResponse', 'Mention', 'MentionsResponse', 'Relatedness', 'RelatednessResponse',
18 | 'normalize_title', 'title_to_uri',
19 | ]
20 |
21 | __author__ = 'Marco Cornolti '
22 |
23 | DEFAULT_TAG_API = "https://tagme.d4science.org/tagme/tag"
24 | DEFAULT_SPOT_API = "https://tagme.d4science.org/tagme/spot"
25 | DEFAULT_REL_API = "https://tagme.d4science.org/tagme/rel"
26 | DEFAULT_LANG = "en"
27 | DEFAULT_LONG_TEXT = 3
28 | WIKIPEDIA_URI_BASE = u"https://{}.wikipedia.org/wiki/{}"
29 | MAX_RELATEDNESS_PAIRS_PER_REQUEST = 100
30 | GCUBE_TOKEN = None
31 | HTML_PARSER = HTMLParser()
32 |
class Annotation(object):
    '''
    One link between a span of the annotated text and an entity.

    Instances are built from a single element of the "annotations" array of a
    /tag response.
    '''
    def __init__(self, ann_json):
        field = ann_json.get
        # Span boundaries, read from the "start" and "end" fields.
        self.begin = int(field("start"))
        self.end = int(field("end"))
        # The linked entity: numeric id and title.
        self.entity_id = int(field("id"))
        self.entity_title = field("title")
        # The "rho" field is exposed as the annotation score.
        self.score = float(field("rho"))
        # The piece of text ("spot") that was linked.
        self.mention = field("spot")

    def __str__(self):
        shown = (self.mention, self.entity_title, self.score)
        return u"{} -> {} (score: {})".format(*shown)

    def uri(self, lang=DEFAULT_LANG):
        '''
        Build the Wikipedia URI of the entity this annotation points to.
        :param lang: the Wikipedia language.
        '''
        title = self.entity_title
        return title_to_uri(title, lang)
54 |
55 |
class AnnotateResponse(object):
    '''
    The outcome of a call to the annotation (/tag) service: the annotations found plus the
    response metadata (processing time, language, timestamp) and the raw JSON.
    '''
    def __init__(self, json_content):
        found = []
        for ann_json in json_content["annotations"]:
            # Entries without a "title" are skipped.
            if "title" in ann_json:
                found.append(Annotation(ann_json))
        self.annotations = found
        self.time = int(json_content["time"])
        self.lang = json_content["lang"]
        self.timestamp = dateutil.parser.parse(json_content["timestamp"])
        # The unmodified decoded response is kept for callers that need more fields.
        self.original_json = json_content

    def get_annotations(self, min_rho=None):
        '''
        Iterate over the annotations found.
        :param min_rho: if set, only yield annotations whose rho-score (confidence) is strictly
        higher than this.
        '''
        return (ann for ann in self.annotations
                if min_rho is None or ann.score > min_rho)

    def __str__(self):
        count = len(self.annotations)
        return "{}msec, {} annotations".format(self.time, count)
77 |
78 |
class Mention(object):
    '''
    A span of text that may refer to an entity, without being linked to one.
    '''
    def __init__(self, mention_json):
        field = mention_json.get
        # Span boundaries, read from the "start" and "end" fields.
        self.begin = int(field("start"))
        self.end = int(field("end"))
        # The "lp" field is exposed as the link probability.
        self.linkprob = float(field("lp"))
        self.mention = field("spot")

    def __str__(self):
        shown = (self.mention, self.begin, self.end, self.linkprob)
        return u"{} [{},{}] lp={}".format(*shown)
91 |
92 |
class MentionsResponse(object):
    '''
    The outcome of a call to the mention finding (/spot) service: the mentions found plus the
    response metadata (processing time, language, timestamp).
    '''
    def __init__(self, json_content):
        found = []
        for mention_json in json_content["spots"]:
            found.append(Mention(mention_json))
        self.mentions = found
        self.time = int(json_content["time"])
        self.lang = json_content["lang"]
        self.timestamp = dateutil.parser.parse(json_content["timestamp"])

    def get_mentions(self, min_lp=None):
        '''
        Iterate over the mentions found.
        :param min_lp: if set, only yield mentions whose link probability is strictly higher
        than this.
        '''
        return (spot for spot in self.mentions
                if min_lp is None or spot.linkprob > min_lp)

    def __str__(self):
        count = len(self.mentions)
        return "{}msec, {} mentions".format(self.time, count)
113 |
114 |
class Relatedness(object):
    '''
    The semantic relatedness of one pair of entities: a real value between 0 and 1, or None
    when the response carries no "rel" value for the pair.
    '''
    def __init__(self, rel_json):
        # "couple" holds the two entities separated by a single space.
        titles = [wiki_title(part) for part in rel_json["couple"].split(" ")]
        self.title1, self.title2 = titles
        if "rel" in rel_json:
            self.rel = float(rel_json["rel"])
        else:
            self.rel = None

    def as_pair(self):
        '''
        Return this relatedness as (titles, rel), where titles is the pair of the two
        titles/Wikipedia IDs and rel is the relatedness value.
        '''
        titles = (self.title1, self.title2)
        return (titles, self.rel)

    def __str__(self):
        shown = (self.title1, self.title2, self.rel)
        return u"{}, {} rel={}".format(*shown)
133 |
134 |
class RelatednessResponse(object):
    '''
    The outcome of one or more calls to the relatedness (/rel) service, merged into a single
    list with one Relatedness per requested pair.
    '''
    def __init__(self, json_contents):
        collected = []
        for json_content in json_contents:
            for rel_json in json_content["result"]:
                collected.append(Relatedness(rel_json))
        self.relatedness = collected
        # Language and timestamp are taken from the first response only.
        first = json_contents[0]
        self.lang = first["lang"]
        self.timestamp = dateutil.parser.parse(first["timestamp"])
        self.calls = len(json_contents)

    def __iter__(self):
        # Yielding (titles, rel) pairs lets callers write dict(response).
        for item in self.relatedness:
            yield item.as_pair()

    def get_relatedness(self, i=0):
        '''
        Return the relatedness value of one pair of entities.
        :param i: the index of an entity pair. The order is the same as the request.
        '''
        chosen = self.relatedness[i]
        return chosen.rel

    def __str__(self):
        count = len(self.relatedness)
        return "{} relatedness pairs, {} calls".format(count, self.calls)
161 |
162 |
def normalize_title(title):
    '''
    Normalize a title to Wikipedia format. E.g. "barack Obama" becomes "Barack_Obama".

    Surrounding whitespace is removed, inner spaces become underscores and the first character
    is upper-cased. An empty or whitespace-only title yields an empty string.
    :param title: a title to normalize.
    '''
    title = title.strip().replace(" ", "_")
    # Slicing instead of indexing keeps an empty title from raising IndexError.
    return title[:1].upper() + title[1:]
170 |
171 |
def wiki_title(title):
    '''
    Given a normalized title, get the page title. E.g. "Barack_Obama" becomes "Barack Obama".

    Leading/trailing spaces and underscores are removed, the remaining underscores become
    spaces and HTML character references (e.g. "&amp;") are decoded.
    :param title: a wikipedia title.
    '''
    try:
        # HTMLParser.unescape was removed in Python 3.9; html.unescape is its replacement.
        from html import unescape
    except ImportError:
        # Interpreters whose ``html`` package has no ``unescape`` keep the legacy parser method.
        unescape = HTML_PARSER.unescape
    return unescape(title.strip(" _").replace("_", " "))
178 |
179 |
def title_to_uri(entity_title, lang=DEFAULT_LANG):
    '''
    Build the URI of the Wikipedia page describing an entity.
    :param entity_title: an entity title; it is normalized before being placed in the URI.
    :param lang: the Wikipedia language.
    '''
    page_name = normalize_title(entity_title)
    return WIKIPEDIA_URI_BASE.format(lang, page_name)
187 |
188 |
def annotate(text, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_TAG_API,
             long_text=DEFAULT_LONG_TEXT):
    '''
    Annotate a text, linking its mentions to Wikipedia entities.

    Returns an AnnotateResponse, or None when the request yields no usable response.
    :param text: the text to annotate.
    :param gcube_token: the authentication token provided by the D4Science infrastructure.
    :param lang: the Wikipedia language.
    :param api: the API endpoint.
    :param long_text: long_text parameter (see TagMe documentation).
    '''
    encoded_text = text.encode("utf-8")
    request_fields = [
        ("text", encoded_text),
        ("long_text", long_text),
        ("lang", lang),
    ]
    reply = _issue_request(api, request_fields, gcube_token)
    if not reply:
        return None
    return AnnotateResponse(reply)
204 |
205 |
def mentions(text, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_SPOT_API):
    '''
    Find the possible mentions in a text, without linking them to any entity.

    Returns a MentionsResponse, or None when the request yields no usable response.
    :param text: the text where to find mentions.
    :param gcube_token: the authentication token provided by the D4Science infrastructure.
    :param lang: the Wikipedia language.
    :param api: the API endpoint.
    '''
    encoded_text = text.encode("utf-8")
    encoded_lang = lang.encode("utf-8")
    request_fields = [
        ("text", encoded_text),
        ("lang", encoded_lang),
    ]
    reply = _issue_request(api, request_fields, gcube_token)
    if not reply:
        return None
    return MentionsResponse(reply)
218 |
219 |
def relatedness_wid(wid_pairs, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_REL_API):
    '''
    Get the semantic relatedness among pairs of entities. Entities are indicated by their
    Wikipedia ID (an integer).

    This is a thin wrapper that delegates to _relatedness with the "id" pair type and returns
    whatever it returns.
    :param wid_pairs: either one pair or a list of pairs of Wikipedia IDs.
    :param gcube_token: the authentication token provided by the D4Science infrastructure.
    :param lang: the Wikipedia language.
    :param api: the API endpoint.
    '''
    return _relatedness("id", wid_pairs, gcube_token, lang, api)
230 |
231 |
def relatedness_title(tt_pairs, gcube_token=None, lang=DEFAULT_LANG, api=DEFAULT_REL_API):
    '''
    Get the semantic relatedness among pairs of entities. Entities are indicated by their
    Wikipedia title (a string).

    This is a thin wrapper that delegates to _relatedness with the "tt" pair type and returns
    whatever it returns.
    :param tt_pairs: either one pair or a list of pairs of entity titles.
    :param gcube_token: the authentication token provided by the D4Science infrastructure.
    :param lang: the Wikipedia language.
    :param api: the API endpoint.
    '''
    return _relatedness("tt", tt_pairs, gcube_token, lang, api)
242 |
243 |
def _relatedness(pairs_type, pairs, gcube_token, lang, api):
    '''
    Query the relatedness service for one pair or a list of pairs of entities.

    Pairs are sent in chunks of at most MAX_RELATEDNESS_PAIRS_PER_REQUEST per request.
    :param pairs_type: name of the request field carrying each pair ("id" or "tt").
    :param pairs: either one pair or a list of pairs of entities.
    :param gcube_token: the authentication token provided by the D4Science infrastructure.
    :param lang: the Wikipedia language.
    :param api: the API endpoint.
    :return: a RelatednessResponse, or None if ``pairs`` is empty or any request fails.
    '''
    # Nothing to ask for: avoid indexing into an empty sequence below.
    if not pairs:
        return None

    # A single pair is accepted as a convenience and wrapped into a one-element list.
    if not isinstance(pairs[0], (list, tuple)):
        pairs = [pairs]

    # The type of the first entity decides how the whole list is treated.
    if isinstance(pairs[0][0], six.binary_type):  # str in python 2, bytes in python 3
        pairs = [(p[0].decode("utf-8"), p[1].decode("utf-8")) for p in pairs]

    if isinstance(pairs[0][0], six.text_type):  # unicode in python 2, str in python 3
        pairs = [(normalize_title(p[0]), normalize_title(p[1])) for p in pairs]

    json_responses = []
    for chunk in range(0, len(pairs), MAX_RELATEDNESS_PAIRS_PER_REQUEST):
        payload = [("lang", lang)]
        payload += ((pairs_type, u"{} {}".format(p[0], p[1]))
                    for p in pairs[chunk:chunk + MAX_RELATEDNESS_PAIRS_PER_REQUEST])
        json_response = _issue_request(api, payload, gcube_token)
        # A failed chunk would leave a hole in the results and break the index/order
        # correspondence with the request, so the whole call is reported as failed.
        if not json_response:
            return None
        json_responses.append(json_response)
    return RelatednessResponse(json_responses)
261 |
262 |
def _issue_request(api, payload, gcube_token):
    '''
    POST ``payload`` to the ``api`` endpoint and return the decoded JSON reply.

    :param api: the API endpoint.
    :param payload: list of (name, value) form fields; the token field is appended to it.
    :param gcube_token: the authentication token; the module-wise GCUBE_TOKEN is used when
    this is not set.
    :return: the parsed JSON content, or None when the status code is not 200.
    :raises RuntimeError: if no token is available.
    '''
    token = gcube_token or GCUBE_TOKEN
    if not token:
        raise RuntimeError("You must define GCUBE_TOKEN before calling this function or pass the "
                           "gcube_token parameter.")

    # The token travels as one more form field; note this extends the list passed in.
    payload.append(("gcube-token", token))
    logging.debug("Calling %s", api)
    response = requests.post(api, data=payload)
    if response.status_code != 200:
        logging.warning("Tagme returned status code %d message:\n%s",
                        response.status_code, response.content)
        return None

    body = response.content
    if isinstance(body, six.binary_type):
        body = body.decode("utf-8")
    return json.loads(body)
278 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/marcocor/tagme-python/008a8a30419db938729111f0d7c5c6c1518f9797/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_calls.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals
2 |
3 | import sys
4 | import tagme
5 |
6 | SAMPLE_TEXT = "Obama visited uk"
7 |
def main():
    '''
    Call each public tagme service once and print the results.

    The calls go through the tagme module, so tagme.GCUBE_TOKEN must be set beforehand.
    Nothing is asserted here: the responses are only printed.
    '''
    # Annotate a text.
    print("Annotating text: ", SAMPLE_TEXT)
    resp = tagme.annotate(SAMPLE_TEXT)
    print(resp)
    for ann in resp.annotations:
        print(ann)

    # Find mentions in a text.
    print("Finding mentions in text: ", SAMPLE_TEXT)
    resp = tagme.mentions(SAMPLE_TEXT)
    print(resp)
    for mention in resp.mentions:
        print(mention)

    # Find relatedness between one pair of entities, by title.
    resp = tagme.relatedness_title(["Barack_Obama", "Italy"])
    print(resp)
    for rel in resp.relatedness:
        print(rel)

    # Find relatedness between pairs of entities, by title.
    resp = tagme.relatedness_title([("Barack_Obama", "Italy"),
                                    ("Italy", "Germany"),
                                    ("Italy", "BAD ENTITY NAME")])
    print(resp)
    for rel in resp.relatedness:
        print(rel)

    # Access the relatedness response as a dictionary.
    resp_dict = dict(resp)
    print("Relatedness between Italy and Germany: ", resp_dict[("Italy", "Germany")])

    # Find relatedness between one pair of entities, by wikipedia id
    resp = tagme.relatedness_wid((31717, 534366))
    print(resp)
    for rel in resp.relatedness:
        print(rel)

    # Find relatedness between pairs of entities, by wikipedia id
    # 1010 pairs is more than one request's worth, so this exercises chunking.
    resp = tagme.relatedness_wid([(534366, 534366 + a) for a in range (1010)])
    print(resp)
    for rel in resp.relatedness:
        print(rel)

if __name__ == "__main__":
    # The authorization token is taken from the first command-line argument.
    tagme.GCUBE_TOKEN = sys.argv[1]
    # Offline checks of the title helpers, run before any network call.
    assert tagme.normalize_title(" barack Obama ") == "Barack_Obama"
    assert tagme.title_to_uri(" barack Obama ") == "https://en.wikipedia.org/wiki/Barack_Obama"
    assert tagme.wiki_title("Barack_Obama") == ("Barack Obama")
    main()
59 |
--------------------------------------------------------------------------------