├── LICENSE ├── README.md ├── README.rst ├── setup.py └── textrazor.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Toby Crayston 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TextRazor Python SDK 2 | ==================== 3 | 4 | Python SDK for the TextRazor Text Analytics API. 5 | 6 | TextRazor offers state-of-the-art natural language processing tools through a simple API, allowing you to build semantic technology into your applications in minutes. 7 | 8 | Hundreds of applications rely on TextRazor to understand unstructured text across a range of verticals, with use cases including social media monitoring, enterprise search, recommendation systems and ad targeting. 9 | 10 | Getting Started 11 | =============== 12 | 13 | - Get a free API key from [https://www.textrazor.com](https://www.textrazor.com). 14 | 15 | - Install the TextRazor Python SDK 16 | 17 | ```bash 18 | pip install textrazor 19 | ``` 20 | 21 | - Create an instance of the TextRazor object and start analyzing your text. 22 | 23 | ```python 24 | from textrazor import TextRazor 25 | 26 | client = TextRazor(YOUR_API_KEY_HERE, extractors=["entities"]) 27 | response = client.analyze("Barclays misled shareholders and the public about one of the biggest investments in the bank's history, a BBC Panorama investigation has found.") 28 | 29 | for entity in response.entities(): 30 | print(entity) 31 | ``` 32 | 33 | For full API documentation visit [https://www.textrazor.com/docs/python](https://www.textrazor.com/docs/python). 34 | 35 | If you have any questions please get in touch at support@textrazor.com 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | TextRazor Python SDK 2 | ==================== 3 | 4 | Python SDK for the TextRazor Text Analytics API. 5 | 6 | TextRazor offers state-of-the-art natural language processing tools 7 | through a simple API, allowing you to build semantic technology into 8 | your applications in minutes. 
9 | 10 | Hundreds of applications rely on TextRazor to understand unstructured 11 | text across a range of verticals, with use cases including social media 12 | monitoring, enterprise search, recommendation systems and ad targeting. 13 | 14 | Getting Started 15 | =============== 16 | 17 | - Get a free API key from https://www.textrazor.com. 18 | 19 | - Install the TextRazor Python SDK 20 | 21 | .. code:: bash 22 | 23 | pip install textrazor 24 | 25 | - Create an instance of the TextRazor object and start analyzing your 26 | text. 27 | 28 | .. code:: python 29 | 30 | from textrazor import TextRazor 31 | 32 | client = TextRazor(YOUR_API_KEY_HERE, extractors=["entities"]) 33 | response = client.analyze("Barclays misled shareholders and the public about one of the biggest investments in the bank's history, a BBC Panorama investigation has found.") 34 | 35 | for entity in response.entities(): 36 | print(entity) 37 | 38 | For full API documentation visit https://www.textrazor.com/docs/python 39 | 40 | If you have any questions please get in touch at support@textrazor.com 41 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name='textrazor', 7 | version='1.4.1', 8 | description='Official Python SDK for TextRazor (https://textrazor.com).', 9 | long_description=open('README.rst').read(), 10 | author='TextRazor Ltd.', 11 | author_email='toby@textrazor.com', 12 | url='https://textrazor.com/', 13 | license='MIT', 14 | py_modules=['textrazor'], 15 | classifiers=[ 16 | 'License :: OSI Approved :: MIT License', 17 | 'Operating System :: OS Independent', 18 | 'Programming Language :: Python', 19 | 'Programming Language :: Python :: 2.6', 20 | 'Programming Language :: Python :: 2.7', 21 | 'Programming Language :: Python :: 3', 22 | 'Topic :: Software Development' 23 | ] 24 | ) 25 | -------------------------------------------------------------------------------- /textrazor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2023 TextRazor, https://www.textrazor.com/ 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining 5 | a copy of this software and associated documentation files (the "Software"), 6 | to deal in the Software without restriction, including without limitation 7 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | and/or sell copies of the Software, and to permit persons to whom the Software 9 | is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | 
22 | """
23 | 
24 | try:
25 |     from urllib2 import Request, urlopen, HTTPError
26 |     from urllib import urlencode
27 | except ImportError:
28 |     from urllib.request import Request, urlopen
29 |     from urllib.parse import urlencode
30 |     from urllib.error import HTTPError
31 | 
32 | import warnings
33 | 
34 | try:
35 |     import simplejson as json
36 | except ImportError:
37 |     import json
38 | 
39 | try:
40 |     import cStringIO.StringIO as IOStream
41 | except ImportError:
42 |     try:
43 |         import StringIO.StringIO as IOStream
44 |     except ImportError:
45 |         from io import BytesIO as IOStream
46 | 
47 | import gzip
48 | import zlib
49 | 
50 | # These options don't usually change much within a user's app, so for
51 | # convenience we allow global defaults to be set for connection options.
52 | 
53 | api_key = None
54 | do_compression = True
55 | do_encryption = True
56 | 
57 | # Endpoints aren't usually changed by an end user, but it's helpful to
58 | # have them as an option for debugging purposes.
59 | 
60 | _SECURE_TEXTRAZOR_ENDPOINT = "https://api.textrazor.com/"
61 | _TEXTRAZOR_ENDPOINT = "http://api.textrazor.com/"
62 | 
63 | 
64 | def _chunks(l, n):
65 |     n = max(1, n)
66 |     return (l[i:i + n] for i in range(0, len(l), n))
67 | 
68 | 
69 | class proxy_response_json(object):
70 |     """ Helper class to provide a transparent proxy for Python properties
71 |     with easy access to an underlying json document. This is to avoid unnecessary
72 |     copying of the response, while explicitly exposing the expected response fields
73 |     and documentation."""
74 | 
75 |     def __init__(self, attr_name, default=None, doc=None):
76 |         self.attr_name = attr_name
77 |         self.default = default
78 | 
79 |         if doc:
80 |             self.__doc__ = doc
81 | 
82 |     def __get__(self, instance, owner=None):
83 |         return instance.json.get(self.attr_name, self.default)
84 | 
85 |     def __set__(self, instance, value):
86 |         instance.json[self.attr_name] = value
87 | 
88 | 
89 | class proxy_member(object):
90 |     """ Slightly redundant given the property decorator, but saves some space
91 |     and makes non-json property access consistent with the above.
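    For illustration only, a minimal sketch of how both descriptor helpers
    read through to plain instance state (proxy_response_json pulls values
    out of the instance's json dict; the Demo class here is hypothetical):

    >>> class Demo(object):
    ...     score = proxy_response_json("score", 0.0, "Hypothetical field.")
    ...     def __init__(self, json):
    ...         self.json = json
    >>> Demo({"score": 0.5}).score
    0.5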
""" 92 | 93 | def __init__(self, attr_name, doc=None): 94 | self.attr_name = attr_name 95 | 96 | if doc: 97 | self.__doc__ = doc 98 | 99 | def __get__(self, instance, owner=None): 100 | return getattr(instance, self.attr_name) 101 | 102 | def _generate_str(instance, banned_properties=[]): 103 | out = ["TextRazor", type(instance).__name__] 104 | 105 | try: 106 | out.extend(["with id:", repr(instance.id), "\n"]) 107 | except AttributeError: 108 | out.extend([":\n", ]) 109 | 110 | for prop in dir(instance): 111 | if not prop.startswith("_") and prop != "id" and prop not in banned_properties: 112 | out.extend([prop, ":", repr(getattr(instance, prop)), "\n"]) 113 | 114 | return " ".join(out) 115 | 116 | class TextRazorConnection(object): 117 | 118 | def __init__(self, local_api_key=None, local_do_compression=None, local_do_encryption=None): 119 | global api_key, do_compression, do_encryption, _TEXTRAZOR_ENDPOINT, _SECURE_TEXTRAZOR_ENDPOINT 120 | 121 | self.api_key = local_api_key 122 | self.do_compression = local_do_compression 123 | self.do_encryption = local_do_encryption 124 | 125 | self.endpoint = _TEXTRAZOR_ENDPOINT 126 | self.secure_endpoint = _SECURE_TEXTRAZOR_ENDPOINT 127 | 128 | if self.api_key is None: 129 | self.api_key = api_key 130 | if self.do_compression is None: 131 | self.do_compression = do_compression 132 | if self.do_encryption is None: 133 | self.do_encryption = do_encryption 134 | 135 | def set_api_key(self, api_key): 136 | """Sets the TextRazor API key, required for all requests.""" 137 | self.api_key = api_key 138 | 139 | def set_do_compression(self, do_compression): 140 | """When True, request gzipped responses from TextRazor. When expecting a large response this can 141 | significantly reduce bandwidth. Defaults to True.""" 142 | self.do_compression = do_compression 143 | 144 | def set_do_encryption(self, do_encryption): 145 | """When True, all communication to TextRazor will be sent over SSL, when handling sensitive 146 | or private information this should be set to True. Defaults to False.""" 147 | self.do_encryption = do_encryption 148 | 149 | def set_endpoint(self, endpoint): 150 | self.endpoint = endpoint 151 | 152 | def set_secure_endpoint(self, endpoint): 153 | self.secure_endpoint = endpoint 154 | 155 | def _build_request_headers(self, do_request_compression=False): 156 | request_headers = { 157 | 'X-TextRazor-Key': self.api_key 158 | } 159 | 160 | if self.do_compression: 161 | request_headers['Accept-Encoding'] = 'gzip' 162 | 163 | if do_request_compression: 164 | request_headers['Content-Encoding'] = 'gzip' 165 | 166 | return request_headers 167 | 168 | def do_request(self, path, post_data=None, content_type=None, method="GET"): 169 | # Where compression is enabled, TextRazor supports compression of both request and response bodys. 170 | # Request compression can result in a significant decrease in processing time, especially for 171 | # larger documents. 
172 | do_request_compression = False 173 | 174 | encoded_post_data = None 175 | if post_data: 176 | encoded_post_data = post_data.encode("utf-8") 177 | 178 | # Don't do request compression for small/empty bodies 179 | do_request_compression = self.do_compression and encoded_post_data and len(encoded_post_data) > 50 180 | 181 | request_headers = self._build_request_headers(do_request_compression) 182 | 183 | if content_type: 184 | request_headers['Content-Type'] = content_type 185 | 186 | if self.do_encryption: 187 | endpoint = self.secure_endpoint 188 | else: 189 | endpoint = self.endpoint 190 | 191 | url = "".join([endpoint, path]) 192 | 193 | if do_request_compression: 194 | encoded_post_data = zlib.compress(encoded_post_data) 195 | 196 | request = Request(url, headers=request_headers, data=encoded_post_data) 197 | 198 | request.get_method = lambda: method 199 | 200 | try: 201 | response = urlopen(request) 202 | except HTTPError as e: 203 | raise TextRazorAnalysisException("TextRazor returned HTTP Code %d: %s" % (e.code, e.read())) 204 | 205 | if response.info().get('Content-Encoding') == 'gzip': 206 | buf = IOStream(response.read()) 207 | response = gzip.GzipFile(fileobj=buf) 208 | 209 | response_text = response.read().decode("utf-8") 210 | return json.loads(response_text) 211 | 212 | 213 | class TextRazorAnalysisException(Exception): 214 | pass 215 | 216 | 217 | class Topic(object): 218 | """Represents a single abstract topic extracted from the input text. 219 | 220 | Requires the "topics" extractor to be added to the TextRazor request. 221 | """ 222 | 223 | def __init__(self, topic_json, link_index): 224 | self.json = topic_json 225 | 226 | for callback, arg in link_index.get(("topic", self.id), []): 227 | callback(arg, self) 228 | 229 | id = proxy_response_json("id", None, """The unique id of this Topic within the result set.""") 230 | 231 | label = proxy_response_json("label", None, """The label of this Topic.""") 232 | 233 | wikipedia_link = proxy_response_json("wikiLink", None, """A link to Wikipedia for this topic, or None if this Topic couldn't be linked to a Wikipedia page.""") 234 | 235 | wikidata_id = proxy_response_json("wikidataId", None, """A link to the Wikidata ID for this topic, or None if this Topic couldn't be linked to a Wikipedia page.""") 236 | 237 | score = proxy_response_json("score", None, """The contextual relevance of this Topic to your document.""") 238 | 239 | def __str__(self): 240 | return _generate_str(self) 241 | 242 | def __repr__(self): 243 | return "TextRazor Topic %s with label %s" % (str(self.id), str(self.label)) 244 | 245 | 246 | class Entity(object): 247 | """Represents a single "Named Entity" extracted from the input text. 248 | 249 | Requires the "entities" extractor to be added to the TextRazor request. 
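    A minimal usage sketch (assumes a client configured with the
    "entities" extractor, as in the README example):

    >>> response = client.analyze("Barclays misled shareholders.")
    >>> for entity in response.entities():
    ...     print(entity.id, entity.relevance_score)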
250 |     """
251 | 
252 |     def __init__(self, entity_json, link_index):
253 |         self.json = entity_json
254 |         self._matched_words = []
255 | 
256 |         for callback, arg in link_index.get(("entity", self.document_id), []):
257 |             callback(arg, self)
258 | 
259 |         for position in self.matched_positions:
260 |             try:
261 |                 link_index[("word", position)].append((self._register_link, None))
262 |             except KeyError:
263 |                 link_index[("word", position)] = [(self._register_link, None)]
264 | 
265 |     def _register_link(self, dummy, word):
266 |         self._matched_words.append(word)
267 |         word._add_entity(self)
268 | 
269 |     custom_entity_id = proxy_response_json("customEntityId", "", """
270 |         The custom entity DictionaryEntry id that matched this Entity,
271 |         if this entity was matched in a custom dictionary.""")
272 | 
273 |     document_id = proxy_response_json("id", None)
274 | 
275 |     id = proxy_response_json("entityId", None, "The disambiguated Wikipedia ID for this entity, or None if this entity could not be disambiguated.")
276 | 
277 |     english_id = proxy_response_json("entityEnglishId", None, "The disambiguated entityId in the English Wikipedia, where a link between the localized and English IDs could be found. None if either the entity could not be linked, or no language link exists.")
278 | 
279 |     freebase_id = proxy_response_json("freebaseId", None, "The disambiguated Freebase ID for this entity, or None if either this entity could not be disambiguated, or has no Freebase link.")
280 | 
281 |     wikidata_id = proxy_response_json("wikidataId", None, "The disambiguated Wikidata QID for this entity, or None if either this entity could not be disambiguated, or has no Wikidata link.")
282 | 
283 |     wikipedia_link = proxy_response_json("wikiLink", None, "Link to Wikipedia for this entity, or None if either this entity could not be disambiguated or a Wikipedia link doesn't exist.")
284 | 
285 |     matched_text = proxy_response_json("matchedText", None, "The source text string that matched this entity.")
286 | 
287 |     starting_position = proxy_response_json("startingPos", None, "The character offset in the unicode source text that marks the start of this entity.")
288 | 
289 |     ending_position = proxy_response_json("endingPos", None, "The character offset in the unicode source text that marks the end of this entity.")
290 | 
291 |     matched_positions = proxy_response_json("matchingTokens", [], "List of the token positions in the current sentence that make up this entity.")
292 | 
293 |     freebase_types = proxy_response_json("freebaseTypes", [], "List of Freebase types for this entity, or an empty list if there are none.")
294 | 
295 |     dbpedia_types = proxy_response_json("type", [], "List of DBpedia types for this entity, or an empty list if there are none.")
296 | 
297 |     relevance_score = proxy_response_json("relevanceScore", None, """The relevance this entity has to the source text. This is a float on a scale of 0 to 1, with 1 being the most relevant.
298 |         Relevance is computed using a number of contextual clues found in the entity context and facts in the TextRazor knowledgebase.""")
299 | 
300 |     confidence_score = proxy_response_json("confidenceScore", None, """
301 |         The confidence that TextRazor is correct that this is a valid entity. TextRazor uses an ever-increasing
302 |         number of signals to help spot valid entities, all of which contribute to this score.
These include the contextual 303 | agreement between the words in the source text and our knowledgebase, agreement between other entities in the text, 304 | agreement between the expected entity type and context, and prior probabilities of having seen this entity across Wikipedia 305 | and other web datasets. The score ranges from 0.5 to 10, with 10 representing the highest confidence that this is 306 | a valid entity.""") 307 | 308 | data = proxy_response_json("data", {}, """Dictionary containing enriched data found for this entity. 309 | This is either as a result of an enrichment query, or as uploaded as part of a custom Entity Dictionary.""") 310 | 311 | crunchbase_id = proxy_response_json("crunchbaseId", None, "The disambiguated Crunchbase ID for this entity. None if either the entity could not be linked, or the entity was not a Company type.") 312 | 313 | lei = proxy_response_json("lei", None, "The disambiguated Legal Entity Identifier for this entity. None if either the entity could not be linked, or the entity was not a Company type.") 314 | 315 | figi = proxy_response_json("figi", None, "The disambiguated Open FIGI for this entity. None if either the entity could not be linked, or the entity was not a Company type.") 316 | 317 | permid = proxy_response_json("permid", None, "The disambiguated Thomson Reuters Open PermID for this entity. None if either the entity could not be linked, or the entity was not a Company type.") 318 | 319 | @property 320 | def matched_words(self): 321 | """Returns a list of :class:`Word` that make up this entity.""" 322 | return self._matched_words 323 | 324 | def __repr__(self): 325 | return "TextRazor Entity %s at positions %s" % (self.id.encode("utf-8"), str(self.matched_positions)) 326 | 327 | def __str__(self): 328 | return _generate_str(self) 329 | 330 | class Entailment(object): 331 | """Represents a single "entailment" derived from the source text. 332 | 333 | Requires the "entailments" extractor to be added to the TextRazor request. 
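    A minimal usage sketch (client and text are assumed to be a configured
    TextRazor client, with the "entailments" extractor, and an input string):

    >>> for entailment in client.analyze(text).entailments():
    ...     print(entailment.entailed_word, entailment.score)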
334 | """ 335 | 336 | def __init__(self, entailment_json, link_index): 337 | self.json = entailment_json 338 | self._matched_words = [] 339 | 340 | for callback, arg in link_index.get(("entailment", self.id), []): 341 | callback(arg, self) 342 | 343 | for position in self.matched_positions: 344 | try: 345 | link_index[("word", position)].append((self._register_link, None)) 346 | except KeyError: 347 | link_index[("word", position)] = [(self._register_link, None)] 348 | 349 | def _register_link(self, dummy, word): 350 | self._matched_words.append(word) 351 | word._add_entailment(self) 352 | 353 | id = proxy_response_json("id", None, "The unique id of this Entailment within the result set.") 354 | 355 | matched_positions = proxy_response_json("wordPositions", [], "The token positions in the current sentence that generated this entailment.") 356 | 357 | prior_score = proxy_response_json("priorScore", None, "The score of this entailment independent of the context it is used in this sentence.") 358 | 359 | context_score = proxy_response_json("contextScore", None, "Score of this entailment given the source word's usage in its sentence and the entailed word's usage in our knowledgebase") 360 | 361 | score = proxy_response_json("score", None, "TextRazor's overall confidence that this is a valid entailment, a combination of the prior and context score") 362 | 363 | @property 364 | def matched_words(self): 365 | """The :class:`Word` in the current sentence that generated this entailment.""" 366 | return self._matched_words 367 | 368 | @property 369 | def entailed_word(self): 370 | """The word string that is entailed by the source words.""" 371 | entailed_tree = self.json.get("entailedTree") 372 | if entailed_tree: 373 | return entailed_tree.get("word") 374 | 375 | def __repr__(self): 376 | return "TextRazor Entailment:\"%s\" at positions %s" % (str(self.entailed_word), str(self.matched_positions)) 377 | 378 | def __str__(self): 379 | return _generate_str(self) 380 | 381 | 382 | class RelationParam(object): 383 | """Represents a Param to a specific :class:`Relation`. 384 | 385 | Requires the "relations" extractor to be added to the TextRazor request.""" 386 | 387 | def __init__(self, param_json, relation_parent, link_index): 388 | self.json = param_json 389 | self._relation_parent = relation_parent 390 | self._param_words = [] 391 | 392 | for position in self.param_positions: 393 | try: 394 | link_index[("word", position)].append((self._register_link, None)) 395 | except KeyError: 396 | link_index[("word", position)] = [(self._register_link, None)] 397 | 398 | def _register_link(self, dummy, word): 399 | self._param_words.append(word) 400 | word._add_relation_param(self) 401 | 402 | @property 403 | def relation_parent(self): 404 | """Returns the :class:`Relation` that owns this param.""" 405 | return self._relation_parent 406 | 407 | relation = proxy_response_json("relation", None, """ 408 | The relation of this param to the predicate. 
409 | Possible values: SUBJECT, OBJECT, OTHER""") 410 | 411 | param_positions = proxy_response_json("wordPositions", [], "List of the positions of the words in this param within their sentence.") 412 | 413 | @property 414 | def param_words(self): 415 | """Returns a list of all the :class:`Word` that make up this param.""" 416 | return self._param_words 417 | 418 | def entities(self): 419 | """Returns a generator of all :class:`Entity` mentioned in this param.""" 420 | seen = set() 421 | for word in self.param_words: 422 | for entity in word.entities: 423 | if entity not in seen: 424 | seen.add(entity) 425 | yield entity 426 | 427 | def __repr__(self): 428 | return "TextRazor RelationParam:\"%s\" at positions %s" % (str(self.relation), str(self.param_words)) 429 | 430 | def __str__(self): 431 | return _generate_str(self) 432 | 433 | 434 | class NounPhrase(object): 435 | """Represents a multi-word phrase extracted from a sentence. 436 | 437 | Requires the "relations" extractor to be added to the TextRazor request.""" 438 | 439 | def __init__(self, noun_phrase_json, link_index): 440 | self.json = noun_phrase_json 441 | self._words = [] 442 | 443 | for callback, arg in link_index.get(("nounPhrase", self.id), []): 444 | callback(arg, self) 445 | 446 | for position in self.word_positions: 447 | try: 448 | link_index[("word", position)].append((self._register_link, None)) 449 | except KeyError: 450 | link_index[("word", position)] = [(self._register_link, None)] 451 | 452 | def _register_link(self, dummy, word): 453 | self._words.append(word) 454 | word._add_noun_phrase(self) 455 | 456 | id = proxy_response_json("id", None, "The unique id of this NounPhrase within the result set.") 457 | 458 | word_positions = proxy_response_json("wordPositions", None, "List of the positions of the words in this phrase.") 459 | 460 | @property 461 | def words(self): 462 | """Returns a list of :class:`Word` that make up this phrase.""" 463 | return self._words 464 | 465 | def __repr__(self): 466 | return "TextRazor NounPhrase at positions %s" % (str(self.words)) 467 | 468 | def __str__(self): 469 | return _generate_str(self, banned_properties=["word_positions", ]) 470 | 471 | class Property(object): 472 | """Represents a property relation extracted from raw text. A property implies an "is-a" or "has-a" relationship 473 | between the predicate (or focus) and its property. 474 | 475 | Requires the "relations" extractor to be added to the TextRazor request. 
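    A minimal usage sketch (client and text are assumed to be a configured
    TextRazor client, with the "relations" extractor, and an input string):

    >>> for prop in client.analyze(text).properties():
    ...     print([w.token for w in prop.predicate_words],
    ...           [w.token for w in prop.property_words])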
476 |     """
477 | 
478 |     def __init__(self, property_json, link_index):
479 |         self.json = property_json
480 |         self._predicate_words = []
481 |         self._property_words = []
482 | 
483 |         for callback, arg in link_index.get(("property", self.id), []):
484 |             callback(arg, self)
485 | 
486 |         for position in self.predicate_positions:
487 |             try:
488 |                 link_index[("word", position)].append((self._register_link, True))
489 |             except KeyError:
490 |                 link_index[("word", position)] = [(self._register_link, True)]
491 | 
492 |         for position in self.property_positions:
493 |             try:
494 |                 link_index[("word", position)].append((self._register_link, False))
495 |             except KeyError:
496 |                 link_index[("word", position)] = [(self._register_link, False)]
497 | 
498 |     def _register_link(self, is_predicate, word):
499 |         if is_predicate:
500 |             self._predicate_words.append(word)
501 |             word._add_property_predicate(self)
502 |         else:
503 |             self._property_words.append(word)
504 |             word._add_property_properties(self)
505 | 
506 |     id = proxy_response_json("id", None, "The unique id of this Property within the result set.")
507 | 
508 |     predicate_positions = proxy_response_json("wordPositions", [], "List of the positions of the words in the predicate (or focus) of this property.")
509 | 
510 |     predicate_words = proxy_member("_predicate_words", "List of :class:`Word` that make up the predicate (or focus) of this property.")
511 | 
512 |     property_positions = proxy_response_json("propertyPositions", [], "List of the positions of the words that modify the predicate of this property.")
513 | 
514 |     property_words = proxy_member("_property_words", "List of :class:`Word` that modify the predicate of this property.")
515 | 
516 |     def __repr__(self):
517 |         return "TextRazor Property at positions %s" % (str(self.predicate_positions))
518 | 
519 |     def __str__(self):
520 |         return _generate_str(self, banned_properties=["predicate_positions", ])
521 | 
522 | class Relation(object):
523 |     """Represents a grammatical relation between words. Typically owns a number of
524 |     :class:`RelationParam`, representing the SUBJECT and OBJECT of the relation.
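    A minimal usage sketch (client and text are assumed to be a configured
    TextRazor client, with the "relations" extractor, and an input string):

    >>> for relation in client.analyze(text).relations():
    ...     for param in relation.params:
    ...         print(param.relation, [w.token for w in param.param_words])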
525 | 526 | Requires the "relations" extractor to be added to the TextRazor request.""" 527 | 528 | def __init__(self, relation_json, link_index): 529 | self.json = relation_json 530 | 531 | self._params = [RelationParam(param, self, link_index) for param in relation_json["params"]] 532 | self._predicate_words = [] 533 | 534 | for callback, arg in link_index.get(("relation", self.id), []): 535 | callback(arg, self) 536 | 537 | for position in self.predicate_positions: 538 | try: 539 | link_index[("word", position)].append((self._register_link, None)) 540 | except KeyError: 541 | link_index[("word", position)] = [(self._register_link, None)] 542 | 543 | def _register_link(self, dummy, word): 544 | self._predicate_words.append(word) 545 | word._add_relation(self) 546 | 547 | id = proxy_response_json("id", None, "The unique id of this Relation within the result set.") 548 | 549 | predicate_positions = proxy_response_json("wordPositions", [], "List of the positions of the predicate words in this relation.") 550 | 551 | predicate_words = proxy_member("_predicate_words", "List of the positions of the predicate words in this relation.") 552 | 553 | params = proxy_member("_params", "List of the TextRazor RelationParam that are part of this relation.") 554 | 555 | def __repr__(self): 556 | return "TextRazor Relation at positions %s" % (str(self.predicate_words)) 557 | 558 | def __str__(self): 559 | return _generate_str(self, banned_properties=["predicate_positions", ]) 560 | 561 | class Word(object): 562 | """Represents a single Word (token) extracted by TextRazor. 563 | 564 | Requires the "words" extractor to be added to the TextRazor request.""" 565 | 566 | def __init__(self, response_word, link_index): 567 | self.json = response_word 568 | 569 | self._parent = None 570 | self._children = [] 571 | 572 | self._entities = [] 573 | self._entailments = [] 574 | self._relations = [] 575 | self._relation_params = [] 576 | self._property_predicates = [] 577 | self._property_properties = [] 578 | self._noun_phrases = [] 579 | 580 | for callback, arg in link_index.get(("word", self.position), []): 581 | callback(arg, self) 582 | 583 | def _add_child(self, child): 584 | self._children.append(child) 585 | 586 | def _set_parent(self, parent): 587 | self._parent = parent 588 | parent._add_child(self) 589 | 590 | def _add_entity(self, entity): 591 | self._entities.append(entity) 592 | 593 | def _add_entailment(self, entailment): 594 | self._entailments.append(entailment) 595 | 596 | def _add_relation(self, relation): 597 | self._relations.append(relation) 598 | 599 | def _add_relation_param(self, relation_param): 600 | self._relation_params.append(relation_param) 601 | 602 | def _add_property_predicate(self, property): 603 | self._property_predicates.append(property) 604 | 605 | def _add_property_properties(self, property): 606 | self._property_properties.append(property) 607 | 608 | def _add_noun_phrase(self, noun_phrase): 609 | self._noun_phrases.append(noun_phrase) 610 | 611 | parent_position = proxy_response_json("parentPosition", None, """ 612 | The position of the grammatical parent of this Word, or None if this Word is either at the root 613 | of the sentence or the "dependency-trees" extractor was not requested.""") 614 | 615 | parent = proxy_member("_parent", """ 616 | Link to the TextRazor Word that is parent of this Word, or None if this word is either at the root 617 | of the sentence or the "dependency-trees" extractor was not requested.""") 618 | 619 | relation_to_parent = 
proxy_response_json("relationToParent", None, """
620 |         Returns the grammatical relation between this word and its parent, or None if this Word is either at the root
621 |         of the sentence or the "dependency-trees" extractor was not requested.
622 | 
623 |         TextRazor parses into the Stanford uncollapsed dependencies, as detailed at:
624 | 
625 |         http://nlp.stanford.edu/software/dependencies_manual.pdf""")
626 | 
627 |     children = proxy_member("_children", """
628 |         List of TextRazor words that make up the children of this word. Returns an empty list
629 |         for leaf words, or if the "dependency-trees" extractor was not requested.""")
630 | 
631 |     position = proxy_response_json("position", None, "The position of this word in its sentence.")
632 | 
633 |     stem = proxy_response_json("stem", None, "The stem of this word.")
634 | 
635 |     lemma = proxy_response_json("lemma", None, "The morphological root of this word, see http://en.wikipedia.org/wiki/Lemma_(morphology) for details.")
636 | 
637 |     token = proxy_response_json("token", None, "The raw token string that matched this word in the source text.")
638 | 
639 |     part_of_speech = proxy_response_json("partOfSpeech", None, """
640 |         The Part of Speech that applies to this word. We use the Penn treebank tagset,
641 |         as detailed here:
642 | 
643 |         http://www.comp.leeds.ac.uk/ccalas/tagsets/upenn.html""")
644 | 
645 |     input_start_offset = proxy_response_json("startingPos", None, """
646 |         The start offset in the input text for this token. Note that this offset applies to the
647 |         original Unicode string passed in to the API; TextRazor treats multi-byte UTF-8 characters as a single position.""")
648 | 
649 |     input_end_offset = proxy_response_json("endingPos", None, """
650 |         The end offset in the input text for this token. Note that this offset applies to the
651 |         original Unicode string passed in to the API; TextRazor treats multi-byte UTF-8 characters as a single position.""")
652 | 
653 |     entailments = proxy_member("_entailments", "List of :class:`Entailment` that this word entails.")
654 | 
655 |     entities = proxy_member("_entities", "List of :class:`Entity` that this word is a part of.")
656 | 
657 |     relations = proxy_member("_relations", "List of :class:`Relation` that this word is a predicate of.")
658 | 
659 |     relation_params = proxy_member("_relation_params", "List of :class:`RelationParam` that this word is a member of.")
660 | 
661 |     property_properties = proxy_member("_property_properties", "List of :class:`Property` that this word is a property member of.")
662 | 
663 |     property_predicates = proxy_member("_property_predicates", "List of :class:`Property` that this word is a predicate (or focus) member of.")
664 | 
665 |     noun_phrases = proxy_member("_noun_phrases", "List of :class:`NounPhrase` that this word is a member of.")
666 | 
667 |     senses = proxy_response_json("senses", [], "List of {'sense', 'score'} dictionaries representing scores of each WordNet sense that this word may be a part of.")
668 | 
669 |     spelling_suggestions = proxy_response_json("spellingSuggestions", [], "List of {'suggestion', 'score'} dictionaries representing scores of each spelling suggestion that might replace this word.
This property requires the \"spelling\" extractor to be sent with your request.") 670 | 671 | def __repr__(self): 672 | return "TextRazor Word:\"%s\" at position %s" % ((self.token).encode("utf-8"), str(self.position)) 673 | 674 | def __str__(self): 675 | return _generate_str(self) 676 | 677 | class Sentence(object): 678 | """Represents a single sentence extracted by TextRazor.""" 679 | 680 | def __init__(self, sentence_json, link_index): 681 | if "words" in sentence_json: 682 | self._words = [Word(word_json, link_index) for word_json in sentence_json["words"]] 683 | else: 684 | self._words = [] 685 | 686 | self._add_links(link_index) 687 | 688 | def _add_links(self, link_index): 689 | if not self._words: 690 | return 691 | 692 | self._root_word = None 693 | 694 | # Add links between the parent/children of the dependency tree in this sentence. 695 | 696 | word_positions = {} 697 | for word in self._words: 698 | word_positions[word.position] = word 699 | 700 | for word in self._words: 701 | parent_position = word.parent_position 702 | if parent_position is not None and parent_position >= 0: 703 | word._set_parent(word_positions[parent_position]) 704 | elif word.part_of_speech not in ("$", "``", "''", "(", ")", ",", "--", ".", ":"): 705 | # Punctuation does not get attached to any parent, any non punctuation part of speech 706 | # must be the root word. 707 | self._root_word = word 708 | 709 | root_word = proxy_member("_root_word", """The root word of this sentence if "dependency-trees" extractor was requested""") 710 | 711 | words = proxy_member("_words", """List of all the :class:`Word` in this sentence""") 712 | 713 | 714 | class CustomAnnotation(object): 715 | 716 | def __init__(self, annotation_json, link_index): 717 | self.json = annotation_json 718 | 719 | for key_value in annotation_json.get("contents", []): 720 | for link in key_value.get("links", []): 721 | try: 722 | link_index[(link["annotationName"], link["linkedId"])].append((self._register_link, link)) 723 | except Exception: 724 | link_index[(link["annotationName"], link["linkedId"])] = [(self._register_link, link)] 725 | 726 | def _register_link(self, link, annotation): 727 | link["linked"] = annotation 728 | 729 | new_custom_annotation_list = [] 730 | try: 731 | new_custom_annotation_list = getattr(annotation, self.name()) 732 | except Exception: 733 | pass 734 | new_custom_annotation_list.append(self) 735 | setattr(annotation, self.name(), new_custom_annotation_list) 736 | 737 | def name(self): 738 | return self.json["name"] 739 | 740 | def __getattr__(self, attr): 741 | exists = False 742 | for key_value in self.json["contents"]: 743 | if "key" in key_value and key_value["key"] == attr: 744 | exists = True 745 | for link in key_value.get("links", []): 746 | try: 747 | yield link["linked"] 748 | except Exception: 749 | yield link 750 | for int_value in key_value.get("intValue", []): 751 | yield int_value 752 | for float_value in key_value.get("floatValue", []): 753 | yield float_value 754 | for str_value in key_value.get("stringValue", []): 755 | yield str_value 756 | for bytes_value in key_value.get("bytesValue", []): 757 | yield bytes_value 758 | 759 | if not exists: 760 | raise AttributeError("%r annotation has no attribute %r" % (self.name(), attr)) 761 | 762 | def __repr__(self): 763 | return "TextRazor CustomAnnotation:\"%s\"" % (self.json["name"]) 764 | 765 | def __str__(self): 766 | out = ["TextRazor CustomAnnotation:", str(self.json["name"]), "\n"] 767 | 768 | for key_value in self.json["contents"]: 769 | try: 770 
| out.append("Param %s:" % key_value["key"]) 771 | except Exception: 772 | out.append("Param (unlabelled):") 773 | out.append("\n") 774 | for link in self.__getattr__(key_value["key"]): 775 | out.append(repr(link)) 776 | out.append("\n") 777 | 778 | return " ".join(out) 779 | 780 | 781 | class TextRazorResponse(object): 782 | """Represents a processed response from TextRazor.""" 783 | 784 | def __init__(self, response_json): 785 | self.json = response_json 786 | 787 | self._sentences = [] 788 | self._custom_annotations = [] 789 | self._topics = [] 790 | self._coarse_topics = [] 791 | self._entities = [] 792 | self._entailments = [] 793 | self._relations = [] 794 | self._properties = [] 795 | self._noun_phrases = [] 796 | self._categories = [] 797 | 798 | link_index = {} 799 | 800 | if "response" in self.json: 801 | # There's a bit of magic here. Each annotation registers a callback with the ids and types of annotation 802 | # that it is linked to. When the linked annotation is later parsed it adds the link via the callback. 803 | # This means that annotations must be added in order of the dependency between them. 804 | 805 | if "customAnnotations" in self.json["response"]: 806 | self._custom_annotations = [CustomAnnotation(json, link_index) for json in self.json["response"]["customAnnotations"]] 807 | 808 | if "topics" in self.json["response"]: 809 | self._topics = [Topic(topic_json, link_index) for topic_json in self.json["response"]["topics"]] 810 | 811 | if "coarseTopics" in self.json["response"]: 812 | self._coarse_topics = [Topic(topic_json, link_index) for topic_json in self.json["response"]["coarseTopics"]] 813 | 814 | if "entities" in self.json["response"]: 815 | self._entities = [Entity(entity_json, link_index) for entity_json in self.json["response"]["entities"]] 816 | 817 | if "entailments" in self.json["response"]: 818 | self._entailments = [Entailment(entailment_json, link_index) for entailment_json in self.json["response"]["entailments"]] 819 | 820 | if "relations" in self.json["response"]: 821 | self._relations = [Relation(relation_json, link_index) for relation_json in self.json["response"]["relations"]] 822 | 823 | if "properties" in self.json["response"]: 824 | self._properties = [Property(property_json, link_index) for property_json in self.json["response"]["properties"]] 825 | 826 | if "nounPhrases" in self.json["response"]: 827 | self._noun_phrases = [NounPhrase(phrase_json, link_index) for phrase_json in self.json["response"]["nounPhrases"]] 828 | 829 | if "sentences" in self.json["response"]: 830 | self._sentences = [Sentence(sentence_json, link_index) for sentence_json in self.json["response"]["sentences"]] 831 | 832 | if "categories" in self.json["response"]: 833 | self._categories = [ScoredCategory(category_json) for category_json in self.json["response"]["categories"]] 834 | 835 | @property 836 | def raw_text(self): 837 | """"When the set_cleanup_return_raw option is enabled, contains the input text before any cleanup.""" 838 | return self.json["response"].get("rawText", "") 839 | 840 | @property 841 | def cleaned_text(self): 842 | """"When the set_cleanup_return_cleaned option is enabled, contains the input text after any cleanup/article extraction.""" 843 | return self.json["response"].get("cleanedText", "") 844 | 845 | @property 846 | def language(self): 847 | """"The ISO-639-2 language used to analyze this document, either explicitly provided as the languageOverride, or as detected by the language detector.""" 848 | return 
self.json["response"].get("language", "") 849 | 850 | @property 851 | def custom_annotation_output(self): 852 | """"Any output generated while running the embedded Prolog engine on your rules.""" 853 | return self.json["response"].get("customAnnotationOutput", "") 854 | 855 | ok = proxy_response_json("ok", False, """ 856 | True if TextRazor successfully analyzed your document, False if there was some error. 857 | More detailed information about the error is available in the :meth:`error` property. 858 | """) 859 | 860 | error = proxy_response_json("error", "", """ 861 | Descriptive error message of any problems that may have occurred during analysis, 862 | or an empty string if there was no error. 863 | """) 864 | 865 | message = proxy_response_json("message", "", """ 866 | Any warning or informational messages returned from the server. 867 | """) 868 | 869 | def coarse_topics(self): 870 | """Returns a list of all the coarse :class:`Topic` in the response. """ 871 | return self._coarse_topics 872 | 873 | def topics(self): 874 | """Returns a list of all the :class:`Topic` in the response. """ 875 | return self._topics 876 | 877 | def entities(self): 878 | """Returns a list of all the :class:`Entity` across all sentences in the response.""" 879 | return self._entities 880 | 881 | def words(self): 882 | """Returns a generator of all :class:`Word` across all sentences in the response.""" 883 | for sentence in self._sentences: 884 | for word in sentence.words: 885 | yield word 886 | 887 | def entailments(self): 888 | """Returns a list of all :class:`Entailment` across all sentences in the response.""" 889 | return self._entailments 890 | 891 | def relations(self): 892 | """Returns a list of all :class:`Relation` across all sentences in the response.""" 893 | return self._relations 894 | 895 | def properties(self): 896 | """Returns a list of all :class:`Property` across all sentences in the response.""" 897 | return self._properties 898 | 899 | def noun_phrases(self): 900 | """Returns a list of all the :class:`NounPhrase` across all sentences in the response.""" 901 | return self._noun_phrases 902 | 903 | def sentences(self): 904 | """Returns a list of all :class:`Sentence` in the response.""" 905 | return self._sentences 906 | 907 | def categories(self): 908 | """List of all :class:`ScoredCategory` in the response.""" 909 | return self._categories 910 | 911 | def matching_rules(self): 912 | """Returns a list of rule names that matched this document.""" 913 | return [custom_annotation.name() for custom_annotation in self._custom_annotations] 914 | 915 | def summary(self): 916 | return """Request processed in: %s seconds. Num Sentences:%s""" % ( 917 | self.json["time"], len(self.json["response"]["sentences"]) 918 | ) 919 | 920 | def __getattr__(self, attr): 921 | exists = False 922 | for custom_annotation in self._custom_annotations: 923 | if custom_annotation.name() == attr: 924 | exists = True 925 | yield custom_annotation 926 | 927 | if not exists: 928 | raise AttributeError("TextRazor response has no annotation %r" % attr) 929 | 930 | 931 | class AllDictionaryEntriesResponse(object): 932 | 933 | def __init__(self, json): 934 | self.json = json 935 | 936 | self.entries = [DictionaryEntry(dictionary_json) for dictionary_json in json.get("entries", [])] 937 | 938 | total = proxy_response_json("total", 0, """ 939 | The total number of DictionaryEntry in this Dictionary. 940 | """) 941 | 942 | limit = proxy_response_json("limit", 0, """ 943 | The maximium number of DictionaryEntry to be returned. 
944 |     """)
945 | 
946 |     offset = proxy_response_json("offset", 0, """
947 |     Offset into the full list of DictionaryEntry that this result set started from.
948 |     """)
949 | 
950 | 
951 | class DictionaryManager(TextRazorConnection):
952 | 
953 |     path = "entities/"
954 | 
955 |     def __init__(self, api_key=None):
956 |         super(DictionaryManager, self).__init__(api_key)
957 | 
958 |     def create_dictionary(self, dictionary_properties):
959 |         """ Creates a new dictionary using the properties provided in the dictionary_properties dict.
960 |         See the properties of class Dictionary for valid options.
961 | 
962 |         >>> import textrazor
963 |         >>> dictionary_manager = textrazor.DictionaryManager("YOUR_API_KEY_HERE")
964 |         >>>
965 |         >>> dictionary_manager.create_dictionary({"id":"UNIQUE_ID"})
966 |         """
967 | 
968 |         new_dictionary = Dictionary({})
969 | 
970 |         for key, value in dictionary_properties.items():
971 |             if not hasattr(new_dictionary, key):
972 |                 valid_options = ",".join(name for name, obj in Dictionary.__dict__.items() if isinstance(obj, proxy_response_json))
973 | 
974 |                 raise TextRazorAnalysisException("Cannot create dictionary, unexpected param: %s. Supported params: %s" % (key, valid_options))
975 | 
976 |             setattr(new_dictionary, key, value)
977 | 
978 |         # Check for the existence of a dictionary ID, without that
979 |         # we can't generate a URL and the server will return an unhelpful message.
980 |         if not new_dictionary.id:
981 |             raise TextRazorAnalysisException("Cannot create dictionary, dictionary id not provided.")
982 | 
983 |         dictionary_path = "".join([self.path, new_dictionary.id])
984 | 
985 |         self.do_request(dictionary_path, json.dumps(new_dictionary.json), method="PUT")
986 | 
987 |         # The server may have added some optional fields so we want to force the user to "get" the new dictionary.
988 |         return self.get_dictionary(new_dictionary.id)
989 | 
990 |     def all_dictionaries(self):
991 |         """ Returns a list of all Dictionary in your account.
992 | 
993 |         >>> for dictionary in dictionary_manager.all_dictionaries():
994 |         ...     print(dictionary.id)
995 |         """
996 | 
997 |         response = self.do_request(self.path)
998 | 
999 |         if "ok" in response and not response["ok"]:
1000 |             raise TextRazorAnalysisException("TextRazor was unable to retrieve all dictionaries. Error: %s" % str(response))
1001 | 
1002 |         if "dictionaries" in response:
1003 |             return [Dictionary(dictionary_json) for dictionary_json in response["dictionaries"]]
1004 | 
1005 |         return []
1006 | 
1007 |     def get_dictionary(self, id):
1008 |         """ Returns a Dictionary object by id.
1009 | 
1010 |         >>> print(dictionary_manager.get_dictionary("UNIQUE_ID").language)
1011 |         """
1012 |         dictionary_path = "".join([self.path, id])
1013 |         response = self.do_request(dictionary_path, method="GET")
1014 | 
1015 |         if "ok" in response and not response["ok"]:
1016 |             raise TextRazorAnalysisException("TextRazor was unable to retrieve dictionary with id: %s. Error: %s" % (id, str(response)))
1017 | 
1018 |         return Dictionary(response["response"])
1019 | 
1020 |     def delete_dictionary(self, id):
1021 |         """ Deletes a dictionary and all its entries by id.
1022 | 
1023 |         >>> dictionary_manager.delete_dictionary("UNIQUE_ID")
1024 |         """
1025 |         dictionary_path = "".join([self.path, id])
1026 |         response = self.do_request(dictionary_path, method="DELETE")
1027 | 
1028 |         if "ok" in response and not response["ok"]:
1029 |             raise TextRazorAnalysisException("Unable to delete dictionary with ID:%s.
Error: %s" % (id, str(response))) 1030 | 1031 | def all_entries(self, dictionary_id, limit=None, offset=None): 1032 | """ Returns a AllDictionaryEntriesResponse containing all DictionaryEntry for dictionary with id dictionary_id, along with paging information. 1033 | 1034 | Larger dictionaries can be too large to download all at once. Where possible it is recommended that you use 1035 | limit and offset paramaters to control the TextRazor response, rather than filtering client side. 1036 | 1037 | >>> entry_response = dictionary_manager.all_entries("UNIQUE_ID", limit=10, offset=10) 1038 | >>> for entry in entry_response.entries: 1039 | >>> print entry.text 1040 | """ 1041 | 1042 | params = {} 1043 | if limit: 1044 | params['limit'] = limit 1045 | if offset: 1046 | params['offset'] = offset 1047 | 1048 | all_path = "".join([self.path, dictionary_id, "/_all?", urlencode(params)]) 1049 | 1050 | response = self.do_request(all_path, method="GET") 1051 | 1052 | if "ok" in response and not response["ok"]: 1053 | raise TextRazorAnalysisException("TextRazor was unable to retrieve dictionary entries with dictionary id: %s, Error: %s" % (dictionary_id, str(response))) 1054 | 1055 | return AllDictionaryEntriesResponse(response["response"]) 1056 | 1057 | def add_entries(self, dictionary_id, entities): 1058 | """ Adds entries to a dictionary with id dictionary_id. 1059 | 1060 | Entries must be a List of dicts corresponding to properties of the new DictionaryEntry objects. 1061 | At a minimum this would be [{'text':'test text to match'}]. 1062 | 1063 | >>> dictionary_manager.add_entries("UNIQUE_ID", [{'text':'test text to match'}, {'text':'more text to match', 'id':'UNIQUE_ENTRY_ID'}]) 1064 | """ 1065 | dictionary_path = "".join([self.path, dictionary_id, "/"]) 1066 | all_entries = [] 1067 | 1068 | for entity in entities: 1069 | new_entry = DictionaryEntry({}) 1070 | 1071 | for key, value in entity.items(): 1072 | if not hasattr(new_entry, key): 1073 | valid_options = ",".join(name for name, obj in DictionaryEntry.__dict__.items() if isinstance(obj, proxy_response_json)) 1074 | 1075 | raise TextRazorAnalysisException("Cannot create dictionary entry, unexpected param: %s. Supported params: %s" % (key, valid_options)) 1076 | 1077 | setattr(new_entry, key, value) 1078 | 1079 | all_entries.append(new_entry.json) 1080 | 1081 | # For performance reasons TextRazor expects a maximum of 20000 dictionary entries at a time, 1082 | # we transparently batch them up here. 1083 | 1084 | for batch in _chunks(all_entries, 20000): 1085 | response = self.do_request(dictionary_path, json.dumps(batch), method="POST") 1086 | 1087 | if "ok" in response and not response["ok"]: 1088 | raise TextRazorAnalysisException("Unable to add entries to dictionary with ID:%s. Error: %s" % (dictionary_id, str(response))) 1089 | 1090 | def delete_entry(self, dictionary_id, entry_id): 1091 | """Deletes a specific DictionaryEntry by dictionary id and entry id. 1092 | 1093 | For performance reasons it's always faster to perform major changes 1094 | to dictionaries by deleting and recreating the whole dictionary rather than removing 1095 | many individual entries. 
1096 | 
1097 |         >>> dictionary_manager.delete_entry('UNIQUE_ID', 'UNIQUE_ENTRY_ID')
1098 |         """
1099 | 
1100 |         dictionary_path = "".join([self.path, dictionary_id, "/", entry_id])
1101 | 
1102 |         response = self.do_request(dictionary_path, method="DELETE")
1103 | 
1104 |         if "ok" in response and not response["ok"]:
1105 |             raise TextRazorAnalysisException("TextRazor was unable to delete dictionary entry with dictionary id: %s, entry id: %s. Error: %s" % (dictionary_id, entry_id, str(response)))
1106 | 
1107 |     def get_entry(self, dictionary_id, entry_id):
1108 |         """ Retrieves a specific DictionaryEntry by dictionary id and entry id.
1109 | 
1110 |         >>> print(dictionary_manager.get_entry('UNIQUE_ID', 'UNIQUE_ENTRY_ID').text)
1111 |         """
1112 | 
1113 |         dictionary_path = "".join([self.path, dictionary_id, "/", entry_id])
1114 | 
1115 |         response = self.do_request(dictionary_path, method="GET")
1116 | 
1117 |         if "ok" in response and not response["ok"]:
1118 |             raise TextRazorAnalysisException("TextRazor was unable to retrieve dictionary entry with dictionary id: %s, entry id: %s. Error: %s" % (dictionary_id, entry_id, str(response)))
1119 | 
1120 |         return DictionaryEntry(response["response"])
1121 | 
1122 | 
1123 | class DictionaryEntry(object):
1124 | 
1125 |     def __init__(self, json):
1126 |         self.json = json
1127 | 
1128 |     id = proxy_response_json("id", "", """
1129 |     Unique ID for this entry, used to identify and manipulate specific entries.
1130 | 
1131 |     Defaults to an automatically generated unique id.
1132 |     """)
1133 | 
1134 |     text = proxy_response_json("text", "", """
1135 |     Unicode string representing the text to match to this DictionaryEntry.
1136 |     """)
1137 | 
1138 |     data = proxy_response_json("data", {}, """
1139 |     A dictionary mapping string keys to lists of string data values.
1140 |     TextRazor will return this dictionary to you as part of the Entity 'data' property whenever it matches this entry.
1141 |     This is useful for adding application-specific metadata to each entry.
1142 | 
1143 |     >>> {'type':['people', 'person', 'politician']}
1144 |     """)
1145 | 
1146 | 
1147 | class Dictionary(object):
1148 | 
1149 |     def __init__(self, json):
1150 |         self.json = json
1151 | 
1152 |     match_type = proxy_response_json("matchType", "", """
1153 |     Controls any pre-processing done on your dictionary before matching.
1154 | 
1155 |     Valid options are:
1156 |         stem - Words are split and "stemmed" before matching, resulting in a more relaxed match.
1157 |                This is an easy way to match plurals - love, loved, loves will all match the same dictionary entry.
1158 |                This implicitly sets "case_insensitive" to True.
1159 | 
1160 |         token - Words are split and matched literally.
1161 | 
1162 |     Defaults to 'token'.""")
1163 | 
1164 |     case_insensitive = proxy_response_json("caseInsensitive", False, """
1165 |     When True, this dictionary will match both uppercase and lowercase characters.
1166 |     """)
1167 | 
1168 |     id = proxy_response_json("id", "", """
1169 |     The unique identifier for this dictionary.
1170 |     """)
1171 | 
1172 |     language = proxy_response_json("language", "", """
1173 |     When set to an ISO-639-2 language code, this dictionary will only match documents of the corresponding language.
1174 | 
1175 |     When set to 'any', this dictionary will match any document.
1176 | 
1177 |     Defaults to 'any'.
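    For example (illustrative only), an English-only dictionary could be created with:

    >>> dictionary_manager.create_dictionary({"id": "UNIQUE_ID", "language": "eng"})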
1178 |     """)
1179 | 
1180 | 
1181 | class AllCategoriesResponse(object):
1182 | 
1183 |     def __init__(self, json):
1184 |         self.json = json
1185 |         self.categories = [Category(category_json) for category_json in json.get("categories", [])]
1186 | 
1187 |     total = proxy_response_json("total", 0, """
1188 |     The total number of Category in this Classifier.
1189 |     """)
1190 | 
1191 |     limit = proxy_response_json("limit", 0, """
1192 |     The maximum number of Category to be returned.
1193 |     """)
1194 | 
1195 |     offset = proxy_response_json("offset", 0, """
1196 |     Offset into the full list of Category that this result set started from.
1197 |     """)
1198 | 
1199 | 
1200 | class ScoredCategory(object):
1201 | 
1202 |     def __init__(self, json):
1203 |         self.json = json
1204 | 
1205 |     classifier_id = proxy_response_json("classifierId", "", """
1206 |     The unique identifier for the classifier that matched this ScoredCategory.
1207 |     """)
1208 | 
1209 |     category_id = proxy_response_json("categoryId", "", """
1210 |     The unique identifier of this category.
1211 |     """)
1212 | 
1213 |     label = proxy_response_json("label", "", """
1214 |     The human-readable label for this category.
1215 |     """)
1216 | 
1217 |     score = proxy_response_json("score", 0, """
1218 |     The score TextRazor has assigned to this category, between 0 and 1.
1219 |     """)
1220 | 
1221 | 
1222 | class Category(object):
1223 |     path = "categories/"
1224 | 
1225 |     def __init__(self, json):
1226 |         self.json = json
1227 | 
1228 |     query = proxy_response_json("query", "", """The query used to define this category.""")
1229 | 
1230 |     category_id = proxy_response_json("categoryId", "", """The unique ID for this category within its classifier.""")
1231 | 
1232 |     label = proxy_response_json("label", "", """The human-readable label for this category. This is an optional field.""")
1233 | 
1234 | 
1235 | class ClassifierManager(TextRazorConnection):
1236 | 
1237 |     path = "categories/"
1238 | 
1239 |     def __init__(self, api_key=None):
1240 |         super(ClassifierManager, self).__init__(api_key)
1241 | 
1242 |     def delete_classifier(self, classifier_id):
1243 |         """ Deletes a Classifier and all its Categories by id. """
1244 |         classifier_path = "".join([self.path, classifier_id])
1245 |         self.do_request(classifier_path, method="DELETE")
1246 | 
1247 |     def create_classifier(self, classifier_id, categories):
1248 |         """ Creates a new classifier using the provided list of Category.
1249 | 
1250 |         See the properties of class Category for valid options. """
1251 | 
1252 |         classifier_path = "".join([self.path, classifier_id])
1253 | 
1254 |         all_categories = []
1255 | 
1256 |         for category in categories:
1257 |             new_category = Category({})
1258 | 
1259 |             for key, value in category.items():
1260 |                 if not hasattr(new_category, key):
1261 |                     valid_options = ",".join(name for name, obj in Category.__dict__.items() if isinstance(obj, proxy_response_json))
1262 | 
1263 |                     raise TextRazorAnalysisException("Cannot create category, unexpected param: %s. Supported params: %s" % (key, valid_options))
1264 | 
1265 |                 setattr(new_category, key, value)
1266 | 
1267 |             all_categories.append(new_category.json)
1268 | 
1269 |         self.do_request(classifier_path, json.dumps(all_categories), content_type="application/json", method="PUT")
1270 | 
1271 |     def create_classifier_with_csv(self, classifier_id, categories_csv):
1272 |         """ Uploads the string contents of a CSV file containing new categories to be added to the classifier with id classifier_id.
1273 |         Any existing classifier with this ID will be replaced.
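        A minimal usage sketch ("categories.csv" is a hypothetical local file):

        >>> with open("categories.csv") as csv_file:
        ...     classifier_manager.create_classifier_with_csv("UNIQUE_CLASSIFIER_ID", csv_file.read())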
""" 1274 | 1275 | classifier_path = "".join([self.path, classifier_id]) 1276 | self.do_request(classifier_path, categories_csv, content_type="application/csv", method="PUT") 1277 | 1278 | def all_categories(self, classifier_id, limit=None, offset=None): 1279 | """ Returns a AllCategoriesResponse containing all Categories for classifier with id classifier_id, along with paging information. 1280 | 1281 | Larger classifiers can be too large to download all at once. Where possible it is recommended that you use 1282 | limit and offset paramaters to control the TextRazor response, rather than filtering client side. 1283 | 1284 | >>> category_response = classifier_manager.all_entries("UNIQUE_CLASSIFIER_ID", limit=10, offset=10) 1285 | >>> for category in category_response.categories: 1286 | >>> print category.text 1287 | """ 1288 | 1289 | params = {} 1290 | if limit: 1291 | params['limit'] = limit 1292 | if offset: 1293 | params['offset'] = offset 1294 | 1295 | all_path = "".join([self.path, classifier_id, "/_all?", urlencode(params)]) 1296 | 1297 | response = self.do_request(all_path, method="GET") 1298 | 1299 | if "ok" in response and not response["ok"]: 1300 | raise TextRazorAnalysisException("TextRazor was unable to retrieve categories for classifier id: %s, Error: %s" % (classifier_id, str(response))) 1301 | 1302 | return AllCategoriesResponse(response["response"]) 1303 | 1304 | def delete_category(self, classifier_id, category_id): 1305 | """ Deletes a Category by ID. """ 1306 | category_path = "".join([self.path, classifier_id, "/", category_id]) 1307 | self.do_request(category_path, method="DELETE") 1308 | 1309 | def get_category(self, classifier_id, category_id): 1310 | """ Returns a Category by ID. """ 1311 | category_path = "".join([self.path, classifier_id, "/", category_id]) 1312 | 1313 | response = self.do_request(category_path, method="GET") 1314 | 1315 | if "ok" in response and not response["ok"]: 1316 | raise TextRazorAnalysisException("TextRazor was unable to retrieve category for classifier id: %s, Error: %s" % (classifier_id, str(response))) 1317 | 1318 | return Category(response["response"]) 1319 | 1320 | class Account(object): 1321 | 1322 | def __init__(self, json): 1323 | self.json = json 1324 | 1325 | plan = proxy_response_json("plan", "", """ 1326 | The ID of your current subscription plan. 1327 | """) 1328 | 1329 | concurrent_request_limit = proxy_response_json("concurrentRequestLimit", 0, """ 1330 | The maximum number of requests your account can make at the same time. 1331 | """) 1332 | 1333 | concurrent_requests_used = proxy_response_json("concurrentRequestsUsed", 0, """ 1334 | The number of requests currently being processed by your account. 1335 | """) 1336 | 1337 | plan_daily_included_requests = proxy_response_json("planDailyRequestsIncluded", 0, """ 1338 | The daily number of requests included with your subscription plan. 1339 | """) 1340 | 1341 | requests_used_today = proxy_response_json("requestsUsedToday", 0, """ 1342 | The total number of requests that have been made today. 1343 | """) 1344 | 1345 | class AccountManager(TextRazorConnection): 1346 | 1347 | path = "account/" 1348 | 1349 | def __init__(self, api_key=None): 1350 | super(AccountManager, self).__init__(api_key) 1351 | 1352 | def get_account(self): 1353 | """ Retrieves the Account settings and realtime usage statistics for your account. 1354 | 1355 | This call does not count towards your daily request or concurrency limits. 
1356 | 
1357 |         >>> import textrazor
1358 |         >>> textrazor.api_key = "YOUR_API_KEY_HERE"
1359 |         >>>
1360 |         >>> account_manager = textrazor.AccountManager()
1361 |         >>>
1362 |         >>> print(account_manager.get_account().requests_used_today)
1363 |         """
1364 | 
1365 |         response = self.do_request(self.path, method="GET")
1366 | 
1367 |         if "ok" in response and not response["ok"]:
1368 |             raise TextRazorAnalysisException("TextRazor was unable to retrieve your account details. Error: %s" % str(response))
1369 | 
1370 |         return Account(response["response"])
1371 | 
1372 | 
1373 | class TextRazor(TextRazorConnection):
1374 |     """
1375 |     The main TextRazor client. To process your text, create a :class:`TextRazor` instance with your API key
1376 |     and set the extractors you need to process the text. Calls to :meth:`analyze` and :meth:`analyze_url` will then process raw text or URLs,
1377 |     returning a :class:`TextRazorResponse` on success.
1378 | 
1379 |     This class is threadsafe once initialized with the request options. You should create a new instance for each request
1380 |     if you are likely to be changing the request options in a multithreaded environment.
1381 | 
1382 |     Below is an entity extraction example from the tutorial; you can find more examples at http://www.textrazor.com/tutorials.
1383 | 
1384 |     >>> import textrazor
1385 |     >>>
1386 |     >>> client = textrazor.TextRazor("API_KEY_GOES_HERE", extractors=["entities"])
1387 |     >>> client.set_cleanup_mode("cleanHTML")
1388 |     >>>
1389 |     >>> response = client.analyze_url("http://www.bbc.co.uk/news/uk-politics-18640916")
1390 |     >>>
1391 |     >>> entities = list(response.entities())
1392 |     >>> entities.sort(key=lambda x: x.relevance_score, reverse=True)
1393 |     >>>
1394 |     >>> seen = set()
1395 |     >>> for entity in entities:
1396 |     >>>     if entity.id not in seen:
1397 |     >>>         print(entity.id, entity.relevance_score, entity.confidence_score, entity.freebase_types)
1398 |     >>>         seen.add(entity.id)
1399 |     """
1400 | 
1401 |     def __init__(self, api_key=None, extractors=None, do_compression=None, do_encryption=None):
1402 |         super(TextRazor, self).__init__(api_key, do_compression, do_encryption)
1403 | 
1404 |         self.extractors = extractors if extractors is not None else []
1405 |         self.cleanup_html = False
1406 |         self.cleanup_mode = None
1407 |         self.cleanup_return_cleaned = None
1408 |         self.cleanup_return_raw = None
1409 |         self.cleanup_use_metadata = None
1410 |         self.download_user_agent = None
1411 |         self.rules = ""
1412 |         self.language_override = None
1413 |         self.enrichment_queries = []
1414 |         self.dbpedia_type_filters = []
1415 |         self.freebase_type_filters = []
1416 |         self.allow_overlap = None
1417 |         self.entity_dictionaries = []
1418 |         self.classifiers = []
1419 |         self.classifier_max_categories = None
1420 | 
1421 |     def set_extractors(self, extractors):
1422 |         """Sets a list of "extractors", which extract various information from your text.
1423 |         Only select the extractors that are explicitly required by your application for optimal performance.
1424 |         Any extractor that doesn't match one of the predefined options below will be assumed to be a custom Prolog extractor.
1425 | 
1426 |         Valid options are: words, phrases, entities, dependency-trees, relations, entailments. """
1427 |         self.extractors = extractors
1428 | 
1429 |     def set_rules(self, rules):
1430 |         """Sets a string containing Prolog logic. All rules matching an extractor name listed in the request will be evaluated,
1431 |         and all matching param combinations included in the response. """
""" 1432 | self.rules = rules 1433 | 1434 | def set_enrichment_queries(self, enrichment_queries): 1435 | """Set a list of "Enrichment Queries", used to enrich the entity response with structured linked data. 1436 | The syntax for these queries is documented at https://www.textrazor.com/enrichment """ 1437 | self.enrichment_queries = enrichment_queries 1438 | 1439 | def set_language_override(self, language_override): 1440 | """When set to a ISO-639-2 language code, force TextRazor to analyze content with this language. 1441 | If not set TextRazor will use the automatically identified language. 1442 | """ 1443 | self.language_override = language_override 1444 | 1445 | def set_do_cleanup_HTML(self, cleanup_html): 1446 | """When True, input text is treated as raw HTML and will be cleaned of tags, comments, scripts, 1447 | and boilerplate content removed. When this option is enabled, the cleaned_text property is returned 1448 | with the text content, providing access to the raw filtered text. When enabled, position offsets returned 1449 | in individual words apply to the clean text, not the provided HTML.""" 1450 | 1451 | warnings.warn("set_do_cleanup_HTML has been deprecated. Please see set_cleanup_mode for a more flexible cleanup option.", DeprecationWarning) 1452 | 1453 | self.cleanup_html = cleanup_html 1454 | 1455 | def set_cleanup_mode(self, cleanup_mode): 1456 | """Controls the preprocessing cleanup mode that TextRazor will apply to your content before analysis. 1457 | For all options aside from "raw" any position offsets returned will apply to the final cleaned text, 1458 | not the raw HTML. If the cleaned text is required please see the :meth:`set_cleanup_return_cleaned' option. 1459 | 1460 | Valid options are: 1461 | raw - Content is analyzed "as-is", with no preprocessing. 1462 | cleanHTML - Boilerplate HTML is removed prior to analysis, including tags, comments, menus, leaving only the 1463 | body of the article. 1464 | stripTags - All Tags are removed from the document prior to analysis. This will remove all HTML, XML tags, but 1465 | the content of headings, menus will remain. This is a good option for analysis of HTML pages that aren't 1466 | long form documents. 1467 | 1468 | Defaults to "raw" for analyze requests, and "cleanHTML" for analyze_url requests. 1469 | """ 1470 | self.cleanup_mode = cleanup_mode 1471 | 1472 | def set_cleanup_return_cleaned(self, return_cleaned): 1473 | """When return_cleaned is True, the TextRazor response will contain the cleaned_text property. To save bandwidth, only set this to 1474 | True if you need it in your application. Defaults to False.""" 1475 | self.cleanup_return_cleaned = return_cleaned 1476 | 1477 | def set_cleanup_return_raw(self, return_raw): 1478 | """When return_raw is True, the TextRazor response will contain the raw_text property, the original text TextRazor received or downloaded 1479 | before cleaning. To save bandwidth, only set this to True if you need it in your application. Defaults to False.""" 1480 | self.cleanup_return_raw = return_raw 1481 | 1482 | def set_cleanup_use_metadata(self, use_metadata): 1483 | """When use_metadata is True, TextRazor will use metadata extracted from your document to help in the disambiguation/extraction 1484 | process. This include HTML titles and metadata, and can significantly improve results for shorter documents without much other 1485 | content. 1486 | 1487 | This option has no effect when cleanup_mode is 'raw'. 
1488 | """ 1489 | self.cleanup_use_metadata = use_metadata 1490 | 1491 | def set_download_user_agent(self, user_agent): 1492 | """Sets the User-Agent header to be used when downloading URLs through analyze_url. This should be a descriptive string identifying 1493 | your application, or an end user's browser user agent if you are performing live requests from a given user. 1494 | 1495 | Defaults to "TextRazor Downloader (https://www.textrazor.com)" 1496 | """ 1497 | self.download_user_agent = user_agent 1498 | 1499 | def set_entity_dictionaries(self, entity_dictionaries): 1500 | """Sets a list of the custom entity dictionaries to match against your content. Each item should be a string ID 1501 | corresponding to dictionaries you have previously configured through the textrazor.Dictionary interface.""" 1502 | self.entity_dictionaries = entity_dictionaries 1503 | 1504 | def set_entity_allow_overlap(self, allow_overlap): 1505 | """When allow_overlap is True, entities in the response may overlap. When False, the "best" entity 1506 | is found such that none overlap. Defaults to True. """ 1507 | self.allow_overlap = allow_overlap 1508 | 1509 | def set_entity_dbpedia_type_filters(self, filters): 1510 | """Set a list of DBPedia types to filter entity extraction on. All returned entities must 1511 | match at least one of these types.""" 1512 | self.dbpedia_type_filters = filters 1513 | 1514 | def set_entity_freebase_type_filters(self, filters): 1515 | """Set a list of Freebase types to filter entity extraction on. All returned entities must 1516 | match at least one of these types.""" 1517 | self.freebase_type_filters = filters 1518 | 1519 | def set_classifiers(self, classifiers): 1520 | """Sets a list of classifiers to evaluate against your document. Each entry should be a string ID corresponding to either one of TextRazor's default classifiers, or one you have previously configured through the ClassifierManager interface. 1521 | 1522 | Valid Options are: 1523 | textrazor_iab Score against the Internet Advertising Bureau QAG segments - approximately 400 high level categories arranged into two tiers. 1524 | textrazor_newscodes Score against the IPTC newscodes - approximately 1400 high level categories organized into a three level tree. 
1525 | custom classifier name Score against a custom classifier, previously created through the Classifier Manager interface.""" 1526 | self.classifiers = classifiers 1527 | 1528 | def set_classifier_max_categories(self, max_categories): 1529 | """Sets the maximum number of matching categories to retrieve from the TextRazor.""" 1530 | self.classifier_max_categories = max_categories 1531 | 1532 | def _add_optional_param(self, post_data, param, value): 1533 | if value is not None: 1534 | post_data.append((param, value)) 1535 | 1536 | def _build_post_data(self): 1537 | post_data = [("rules", self.rules), 1538 | ("extractors", ",".join(self.extractors)), 1539 | ("cleanupHTML", self.cleanup_html), 1540 | ("classifiers", ",".join(self.classifiers))] 1541 | 1542 | for dictionary in self.entity_dictionaries: 1543 | post_data.append(("entities.dictionaries", dictionary)) 1544 | 1545 | for filter in self.dbpedia_type_filters: 1546 | post_data.append(("entities.filterDbpediaTypes", filter)) 1547 | 1548 | for filter in self.freebase_type_filters: 1549 | post_data.append(("entities.filterFreebaseTypes", filter)) 1550 | 1551 | for query in self.enrichment_queries: 1552 | post_data.append(("entities.enrichmentQueries", query)) 1553 | 1554 | self._add_optional_param(post_data, "entities.allowOverlap", self.allow_overlap) 1555 | self._add_optional_param(post_data, "languageOverride", self.language_override) 1556 | self._add_optional_param(post_data, "cleanup.mode", self.cleanup_mode) 1557 | self._add_optional_param(post_data, "cleanup.returnCleaned", self.cleanup_return_cleaned) 1558 | self._add_optional_param(post_data, "cleanup.returnRaw", self.cleanup_return_raw) 1559 | self._add_optional_param(post_data, "cleanup.useMetadata", self.cleanup_use_metadata) 1560 | self._add_optional_param(post_data, "download.userAgent", self.download_user_agent) 1561 | self._add_optional_param(post_data, "classifier.maxCategories", self.classifier_max_categories) 1562 | 1563 | return post_data 1564 | 1565 | def analyze_url(self, url): 1566 | """Calls the TextRazor API with the provided url. 1567 | 1568 | TextRazor will first download the contents of this URL, and then process the resulting text. 1569 | 1570 | TextRazor will only attempt to analyze text documents. Any invalid UTF-8 characters will be replaced with a space character and ignored. 1571 | TextRazor limits the total download size to approximately 1M. Any larger documents will be truncated to that size, and a warning 1572 | will be returned in the response. 1573 | 1574 | By default, TextRazor will clean all HTML prior to processing. For more control of the cleanup process, 1575 | see the :meth:`set_cleanup_mode' option. 1576 | 1577 | Returns a :class:`TextRazorResponse` with the parsed data on success. 1578 | Raises a :class:`TextRazorAnalysisException` on failure. """ 1579 | 1580 | post_data = self._build_post_data() 1581 | post_data.append(("url", url.encode("utf-8"))) 1582 | 1583 | return TextRazorResponse(self.do_request("", urlencode(post_data), method="POST")) 1584 | 1585 | def analyze(self, text): 1586 | """Calls the TextRazor API with the provided unicode text. 1587 | 1588 | Returns a :class:`TextRazorResponse` with the parsed data on success. 1589 | Raises a :class:`TextRazorAnalysisException` on failure. 
""" 1590 | 1591 | post_data = self._build_post_data() 1592 | post_data.append(("text", text.encode("utf-8"))) 1593 | 1594 | return TextRazorResponse(self.do_request("", urlencode(post_data), method="POST")) 1595 | --------------------------------------------------------------------------------