├── LICENSE ├── README.md ├── README.rst ├── setup.py └── textrazor.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Toby Crayston 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TextRazor Python SDK 2 | ==================== 3 | 4 | Python SDK for the TextRazor Text Analytics API. 5 | 6 | TextRazor offers state-of-the-art natural language processing tools through a simple API, allowing you to build semantic technology into your applications in minutes. 7 | 8 | Hundreds of applications rely on TextRazor to understand unstructured text across a range of verticals, with use cases including social media monitoring, enterprise search, recommendation systems and ad targeting. 9 | 10 | Getting Started 11 | =============== 12 | 13 | - Get a free API key from [https://www.textrazor.com](https://www.textrazor.com). 14 | 15 | - Install the TextRazor Python SDK 16 | 17 | ```bash 18 | pip install textrazor 19 | ``` 20 | 21 | - Create an instance of the TextRazor object and start analyzing your text. 22 | 23 | ```python 24 | from textrazor import TextRazor 25 | 26 | client = TextRazor(YOUR_API_KEY_HERE, extractors=["entities"]) 27 | response = client.analyze("Barclays misled shareholders and the public about one of the biggest investments in the bank's history, a BBC Panorama investigation has found.") 28 | 29 | for entity in response.entities(): 30 | print(entity) 31 | ``` 32 | 33 | For full API documentation visit [https://www.textrazor.com/docs/python](https://www.textrazor.com/docs/python). 34 | 35 | If you have any questions please get in touch at support@textrazor.com 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | TextRazor Python SDK 2 | ==================== 3 | 4 | Python SDK for the TextRazor Text Analytics API. 5 | 6 | TextRazor offers state-of-the-art natural language processing tools 7 | through a simple API, allowing you to build semantic technology into 8 | your applications in minutes. 
9 | 10 | Hundreds of applications rely on TextRazor to understand unstructured 11 | text across a range of verticals, with use cases including social media 12 | monitoring, enterprise search, recommendation systems and ad targeting. 13 | 14 | Getting Started 15 | =============== 16 | 17 | - Get a free API key from https://www.textrazor.com. 18 | 19 | - Install the TextRazor Python SDK 20 | 21 | .. code:: bash 22 | 23 | pip install textrazor 24 | 25 | - Create an instance of the TextRazor object and start analyzing your 26 | text. 27 | 28 | .. code:: python 29 | 30 | from textrazor import TextRazor 31 | 32 | client = TextRazor(YOUR_API_KEY_HERE, extractors=["entities"]) 33 | response = client.analyze("Barclays misled shareholders and the public about one of the biggest investments in the bank's history, a BBC Panorama investigation has found.") 34 | 35 | for entity in response.entities(): 36 | print(entity) 37 | 38 | For full API documentation visit https://www.textrazor.com/docs/python 39 | 40 | If you have any questions please get in touch at support@textrazor.com 41 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name='textrazor', 7 | version='1.4.1', 8 | description='Official Python SDK for TextRazor (https://textrazor.com).', 9 | long_description=open('README.rst').read(), 10 | author='TextRazor Ltd.', 11 | author_email='toby@textrazor.com', 12 | url='https://textrazor.com/', 13 | license='MIT', 14 | py_modules=['textrazor'], 15 | classifiers=[ 16 | 'License :: OSI Approved :: MIT License', 17 | 'Operating System :: OS Independent', 18 | 'Programming Language :: Python', 19 | 'Programming Language :: Python :: 2.6', 20 | 'Programming Language :: Python :: 2.7', 21 | 'Programming Language :: Python :: 3', 22 | 'Topic :: Software Development' 23 | ] 24 | ) 25 | -------------------------------------------------------------------------------- /textrazor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2023 TextRazor, https://www.textrazor.com/ 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining 5 | a copy of this software and associated documentation files (the "Software"), 6 | to deal in the Software without restriction, including without limitation 7 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | and/or sell copies of the Software, and to permit persons to whom the Software 9 | is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | 
22 | """
23 | 
24 | try:
25 |     from urllib2 import Request, urlopen, HTTPError
26 |     from urllib import urlencode
27 | except ImportError:
28 |     from urllib.request import Request, urlopen
29 |     from urllib.parse import urlencode
30 |     from urllib.error import HTTPError
31 | 
32 | import warnings
33 | 
34 | try:
35 |     import simplejson as json
36 | except ImportError:
37 |     import json
38 | 
39 | try:
40 |     import cStringIO.StringIO as IOStream
41 | except ImportError:
42 |     try:
43 |         import StringIO.StringIO as IOStream
44 |     except ImportError:
45 |         from io import BytesIO as IOStream
46 | 
47 | import gzip
48 | import zlib
49 | 
50 | # These options don't usually change much within a user's app, so for
51 | # convenience we allow global defaults to be set for connection options.
52 | 
53 | api_key = None
54 | do_compression = True
55 | do_encryption = True
56 | 
57 | # Endpoints aren't usually changed by an end user, but it's helpful to
58 | # have them as an option for debugging purposes.
59 | 
60 | _SECURE_TEXTRAZOR_ENDPOINT = "https://api.textrazor.com/"
61 | _TEXTRAZOR_ENDPOINT = "http://api.textrazor.com/"
62 | 
63 | 
64 | def _chunks(l, n):
65 |     n = max(1, n)
66 |     return (l[i:i + n] for i in range(0, len(l), n))
67 | 
68 | 
69 | class proxy_response_json(object):
70 |     """ Helper class to provide a transparent proxy for Python properties
71 |     with easy access to an underlying json document. This is to avoid unnecessary
72 |     copying of the response, while explicitly exposing the expected response fields
73 |     and documentation."""
74 | 
75 |     def __init__(self, attr_name, default=None, doc=None):
76 |         self.attr_name = attr_name
77 |         self.default = default
78 | 
79 |         if doc:
80 |             self.__doc__ = doc
81 | 
82 |     def __get__(self, instance, owner=None):
83 |         return instance.json.get(self.attr_name, self.default)
84 | 
85 |     def __set__(self, instance, value):
86 |         instance.json[self.attr_name] = value
87 | 
88 | 
89 | class proxy_member(object):
90 |     """ Slightly redundant given the property decorator, but saves some space
91 |     and makes non-json property access consistent with the above.
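    For illustration only, a minimal sketch of how both descriptor helpers
    read through to plain instance state (proxy_response_json pulls values
    out of the instance's json dict; the Demo class here is hypothetical):

    >>> class Demo(object):
    ...     score = proxy_response_json("score", 0.0, "Hypothetical field.")
    ...     def __init__(self, json):
    ...         self.json = json
    >>> Demo({"score": 0.5}).score
    0.5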
""" 92 | 93 | def __init__(self, attr_name, doc=None): 94 | self.attr_name = attr_name 95 | 96 | if doc: 97 | self.__doc__ = doc 98 | 99 | def __get__(self, instance, owner=None): 100 | return getattr(instance, self.attr_name) 101 | 102 | def _generate_str(instance, banned_properties=[]): 103 | out = ["TextRazor", type(instance).__name__] 104 | 105 | try: 106 | out.extend(["with id:", repr(instance.id), "\n"]) 107 | except AttributeError: 108 | out.extend([":\n", ]) 109 | 110 | for prop in dir(instance): 111 | if not prop.startswith("_") and prop != "id" and prop not in banned_properties: 112 | out.extend([prop, ":", repr(getattr(instance, prop)), "\n"]) 113 | 114 | return " ".join(out) 115 | 116 | class TextRazorConnection(object): 117 | 118 | def __init__(self, local_api_key=None, local_do_compression=None, local_do_encryption=None): 119 | global api_key, do_compression, do_encryption, _TEXTRAZOR_ENDPOINT, _SECURE_TEXTRAZOR_ENDPOINT 120 | 121 | self.api_key = local_api_key 122 | self.do_compression = local_do_compression 123 | self.do_encryption = local_do_encryption 124 | 125 | self.endpoint = _TEXTRAZOR_ENDPOINT 126 | self.secure_endpoint = _SECURE_TEXTRAZOR_ENDPOINT 127 | 128 | if self.api_key is None: 129 | self.api_key = api_key 130 | if self.do_compression is None: 131 | self.do_compression = do_compression 132 | if self.do_encryption is None: 133 | self.do_encryption = do_encryption 134 | 135 | def set_api_key(self, api_key): 136 | """Sets the TextRazor API key, required for all requests.""" 137 | self.api_key = api_key 138 | 139 | def set_do_compression(self, do_compression): 140 | """When True, request gzipped responses from TextRazor. When expecting a large response this can 141 | significantly reduce bandwidth. Defaults to True.""" 142 | self.do_compression = do_compression 143 | 144 | def set_do_encryption(self, do_encryption): 145 | """When True, all communication to TextRazor will be sent over SSL, when handling sensitive 146 | or private information this should be set to True. Defaults to False.""" 147 | self.do_encryption = do_encryption 148 | 149 | def set_endpoint(self, endpoint): 150 | self.endpoint = endpoint 151 | 152 | def set_secure_endpoint(self, endpoint): 153 | self.secure_endpoint = endpoint 154 | 155 | def _build_request_headers(self, do_request_compression=False): 156 | request_headers = { 157 | 'X-TextRazor-Key': self.api_key 158 | } 159 | 160 | if self.do_compression: 161 | request_headers['Accept-Encoding'] = 'gzip' 162 | 163 | if do_request_compression: 164 | request_headers['Content-Encoding'] = 'gzip' 165 | 166 | return request_headers 167 | 168 | def do_request(self, path, post_data=None, content_type=None, method="GET"): 169 | # Where compression is enabled, TextRazor supports compression of both request and response bodys. 170 | # Request compression can result in a significant decrease in processing time, especially for 171 | # larger documents. 
172 | do_request_compression = False 173 | 174 | encoded_post_data = None 175 | if post_data: 176 | encoded_post_data = post_data.encode("utf-8") 177 | 178 | # Don't do request compression for small/empty bodies 179 | do_request_compression = self.do_compression and encoded_post_data and len(encoded_post_data) > 50 180 | 181 | request_headers = self._build_request_headers(do_request_compression) 182 | 183 | if content_type: 184 | request_headers['Content-Type'] = content_type 185 | 186 | if self.do_encryption: 187 | endpoint = self.secure_endpoint 188 | else: 189 | endpoint = self.endpoint 190 | 191 | url = "".join([endpoint, path]) 192 | 193 | if do_request_compression: 194 | encoded_post_data = zlib.compress(encoded_post_data) 195 | 196 | request = Request(url, headers=request_headers, data=encoded_post_data) 197 | 198 | request.get_method = lambda: method 199 | 200 | try: 201 | response = urlopen(request) 202 | except HTTPError as e: 203 | raise TextRazorAnalysisException("TextRazor returned HTTP Code %d: %s" % (e.code, e.read())) 204 | 205 | if response.info().get('Content-Encoding') == 'gzip': 206 | buf = IOStream(response.read()) 207 | response = gzip.GzipFile(fileobj=buf) 208 | 209 | response_text = response.read().decode("utf-8") 210 | return json.loads(response_text) 211 | 212 | 213 | class TextRazorAnalysisException(Exception): 214 | pass 215 | 216 | 217 | class Topic(object): 218 | """Represents a single abstract topic extracted from the input text. 219 | 220 | Requires the "topics" extractor to be added to the TextRazor request. 221 | """ 222 | 223 | def __init__(self, topic_json, link_index): 224 | self.json = topic_json 225 | 226 | for callback, arg in link_index.get(("topic", self.id), []): 227 | callback(arg, self) 228 | 229 | id = proxy_response_json("id", None, """The unique id of this Topic within the result set.""") 230 | 231 | label = proxy_response_json("label", None, """The label of this Topic.""") 232 | 233 | wikipedia_link = proxy_response_json("wikiLink", None, """A link to Wikipedia for this topic, or None if this Topic couldn't be linked to a Wikipedia page.""") 234 | 235 | wikidata_id = proxy_response_json("wikidataId", None, """A link to the Wikidata ID for this topic, or None if this Topic couldn't be linked to a Wikipedia page.""") 236 | 237 | score = proxy_response_json("score", None, """The contextual relevance of this Topic to your document.""") 238 | 239 | def __str__(self): 240 | return _generate_str(self) 241 | 242 | def __repr__(self): 243 | return "TextRazor Topic %s with label %s" % (str(self.id), str(self.label)) 244 | 245 | 246 | class Entity(object): 247 | """Represents a single "Named Entity" extracted from the input text. 248 | 249 | Requires the "entities" extractor to be added to the TextRazor request. 
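    A minimal usage sketch (assumes a client configured with the
    "entities" extractor, as in the README example):

    >>> response = client.analyze("Barclays misled shareholders.")
    >>> for entity in response.entities():
    ...     print(entity.id, entity.relevance_score)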
250 |     """
251 | 
252 |     def __init__(self, entity_json, link_index):
253 |         self.json = entity_json
254 |         self._matched_words = []
255 | 
256 |         for callback, arg in link_index.get(("entity", self.document_id), []):
257 |             callback(arg, self)
258 | 
259 |         for position in self.matched_positions:
260 |             try:
261 |                 link_index[("word", position)].append((self._register_link, None))
262 |             except KeyError:
263 |                 link_index[("word", position)] = [(self._register_link, None)]
264 | 
265 |     def _register_link(self, dummy, word):
266 |         self._matched_words.append(word)
267 |         word._add_entity(self)
268 | 
269 |     custom_entity_id = proxy_response_json("customEntityId", "", """
270 |         The custom entity DictionaryEntry id that matched this Entity,
271 |         if this entity was matched in a custom dictionary.""")
272 | 
273 |     document_id = proxy_response_json("id", None)
274 | 
275 |     id = proxy_response_json("entityId", None, "The disambiguated Wikipedia ID for this entity, or None if this entity could not be disambiguated.")
276 | 
277 |     english_id = proxy_response_json("entityEnglishId", None, "The disambiguated entityId in the English Wikipedia, where a link between the localized and English IDs could be found. None if either the entity could not be linked, or no language link exists.")
278 | 
279 |     freebase_id = proxy_response_json("freebaseId", None, "The disambiguated Freebase ID for this entity, or None if either this entity could not be disambiguated, or has no Freebase link.")
280 | 
281 |     wikidata_id = proxy_response_json("wikidataId", None, "The disambiguated Wikidata QID for this entity, or None if either this entity could not be disambiguated, or has no Wikidata link.")
282 | 
283 |     wikipedia_link = proxy_response_json("wikiLink", None, "Link to Wikipedia for this entity, or None if either this entity could not be disambiguated or a Wikipedia link doesn't exist.")
284 | 
285 |     matched_text = proxy_response_json("matchedText", None, "The source text string that matched this entity.")
286 | 
287 |     starting_position = proxy_response_json("startingPos", None, "The character offset in the unicode source text that marks the start of this entity.")
288 | 
289 |     ending_position = proxy_response_json("endingPos", None, "The character offset in the unicode source text that marks the end of this entity.")
290 | 
291 |     matched_positions = proxy_response_json("matchingTokens", [], "List of the token positions in the current sentence that make up this entity.")
292 | 
293 |     freebase_types = proxy_response_json("freebaseTypes", [], "List of Freebase types for this entity, or an empty list if there are none.")
294 | 
295 |     dbpedia_types = proxy_response_json("type", [], "List of DBpedia types for this entity, or an empty list if there are none.")
296 | 
297 |     relevance_score = proxy_response_json("relevanceScore", None, """The relevance this entity has to the source text. This is a float on a scale of 0 to 1, with 1 being the most relevant.
298 |         Relevance is computed using a number of contextual clues found in the entity context and facts in the TextRazor knowledgebase.""")
299 | 
300 |     confidence_score = proxy_response_json("confidenceScore", None, """
301 |         The confidence that TextRazor is correct that this is a valid entity. TextRazor uses an ever-increasing
302 |         number of signals to help spot valid entities, all of which contribute to this score.
These include the contextual 303 | agreement between the words in the source text and our knowledgebase, agreement between other entities in the text, 304 | agreement between the expected entity type and context, and prior probabilities of having seen this entity across Wikipedia 305 | and other web datasets. The score ranges from 0.5 to 10, with 10 representing the highest confidence that this is 306 | a valid entity.""") 307 | 308 | data = proxy_response_json("data", {}, """Dictionary containing enriched data found for this entity. 309 | This is either as a result of an enrichment query, or as uploaded as part of a custom Entity Dictionary.""") 310 | 311 | crunchbase_id = proxy_response_json("crunchbaseId", None, "The disambiguated Crunchbase ID for this entity. None if either the entity could not be linked, or the entity was not a Company type.") 312 | 313 | lei = proxy_response_json("lei", None, "The disambiguated Legal Entity Identifier for this entity. None if either the entity could not be linked, or the entity was not a Company type.") 314 | 315 | figi = proxy_response_json("figi", None, "The disambiguated Open FIGI for this entity. None if either the entity could not be linked, or the entity was not a Company type.") 316 | 317 | permid = proxy_response_json("permid", None, "The disambiguated Thomson Reuters Open PermID for this entity. None if either the entity could not be linked, or the entity was not a Company type.") 318 | 319 | @property 320 | def matched_words(self): 321 | """Returns a list of :class:`Word` that make up this entity.""" 322 | return self._matched_words 323 | 324 | def __repr__(self): 325 | return "TextRazor Entity %s at positions %s" % (self.id.encode("utf-8"), str(self.matched_positions)) 326 | 327 | def __str__(self): 328 | return _generate_str(self) 329 | 330 | class Entailment(object): 331 | """Represents a single "entailment" derived from the source text. 332 | 333 | Requires the "entailments" extractor to be added to the TextRazor request. 
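    A minimal usage sketch (client and text are assumed to be a configured
    TextRazor client, with the "entailments" extractor, and an input string):

    >>> for entailment in client.analyze(text).entailments():
    ...     print(entailment.entailed_word, entailment.score)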
334 | """ 335 | 336 | def __init__(self, entailment_json, link_index): 337 | self.json = entailment_json 338 | self._matched_words = [] 339 | 340 | for callback, arg in link_index.get(("entailment", self.id), []): 341 | callback(arg, self) 342 | 343 | for position in self.matched_positions: 344 | try: 345 | link_index[("word", position)].append((self._register_link, None)) 346 | except KeyError: 347 | link_index[("word", position)] = [(self._register_link, None)] 348 | 349 | def _register_link(self, dummy, word): 350 | self._matched_words.append(word) 351 | word._add_entailment(self) 352 | 353 | id = proxy_response_json("id", None, "The unique id of this Entailment within the result set.") 354 | 355 | matched_positions = proxy_response_json("wordPositions", [], "The token positions in the current sentence that generated this entailment.") 356 | 357 | prior_score = proxy_response_json("priorScore", None, "The score of this entailment independent of the context it is used in this sentence.") 358 | 359 | context_score = proxy_response_json("contextScore", None, "Score of this entailment given the source word's usage in its sentence and the entailed word's usage in our knowledgebase") 360 | 361 | score = proxy_response_json("score", None, "TextRazor's overall confidence that this is a valid entailment, a combination of the prior and context score") 362 | 363 | @property 364 | def matched_words(self): 365 | """The :class:`Word` in the current sentence that generated this entailment.""" 366 | return self._matched_words 367 | 368 | @property 369 | def entailed_word(self): 370 | """The word string that is entailed by the source words.""" 371 | entailed_tree = self.json.get("entailedTree") 372 | if entailed_tree: 373 | return entailed_tree.get("word") 374 | 375 | def __repr__(self): 376 | return "TextRazor Entailment:\"%s\" at positions %s" % (str(self.entailed_word), str(self.matched_positions)) 377 | 378 | def __str__(self): 379 | return _generate_str(self) 380 | 381 | 382 | class RelationParam(object): 383 | """Represents a Param to a specific :class:`Relation`. 384 | 385 | Requires the "relations" extractor to be added to the TextRazor request.""" 386 | 387 | def __init__(self, param_json, relation_parent, link_index): 388 | self.json = param_json 389 | self._relation_parent = relation_parent 390 | self._param_words = [] 391 | 392 | for position in self.param_positions: 393 | try: 394 | link_index[("word", position)].append((self._register_link, None)) 395 | except KeyError: 396 | link_index[("word", position)] = [(self._register_link, None)] 397 | 398 | def _register_link(self, dummy, word): 399 | self._param_words.append(word) 400 | word._add_relation_param(self) 401 | 402 | @property 403 | def relation_parent(self): 404 | """Returns the :class:`Relation` that owns this param.""" 405 | return self._relation_parent 406 | 407 | relation = proxy_response_json("relation", None, """ 408 | The relation of this param to the predicate. 
409 | Possible values: SUBJECT, OBJECT, OTHER""") 410 | 411 | param_positions = proxy_response_json("wordPositions", [], "List of the positions of the words in this param within their sentence.") 412 | 413 | @property 414 | def param_words(self): 415 | """Returns a list of all the :class:`Word` that make up this param.""" 416 | return self._param_words 417 | 418 | def entities(self): 419 | """Returns a generator of all :class:`Entity` mentioned in this param.""" 420 | seen = set() 421 | for word in self.param_words: 422 | for entity in word.entities: 423 | if entity not in seen: 424 | seen.add(entity) 425 | yield entity 426 | 427 | def __repr__(self): 428 | return "TextRazor RelationParam:\"%s\" at positions %s" % (str(self.relation), str(self.param_words)) 429 | 430 | def __str__(self): 431 | return _generate_str(self) 432 | 433 | 434 | class NounPhrase(object): 435 | """Represents a multi-word phrase extracted from a sentence. 436 | 437 | Requires the "relations" extractor to be added to the TextRazor request.""" 438 | 439 | def __init__(self, noun_phrase_json, link_index): 440 | self.json = noun_phrase_json 441 | self._words = [] 442 | 443 | for callback, arg in link_index.get(("nounPhrase", self.id), []): 444 | callback(arg, self) 445 | 446 | for position in self.word_positions: 447 | try: 448 | link_index[("word", position)].append((self._register_link, None)) 449 | except KeyError: 450 | link_index[("word", position)] = [(self._register_link, None)] 451 | 452 | def _register_link(self, dummy, word): 453 | self._words.append(word) 454 | word._add_noun_phrase(self) 455 | 456 | id = proxy_response_json("id", None, "The unique id of this NounPhrase within the result set.") 457 | 458 | word_positions = proxy_response_json("wordPositions", None, "List of the positions of the words in this phrase.") 459 | 460 | @property 461 | def words(self): 462 | """Returns a list of :class:`Word` that make up this phrase.""" 463 | return self._words 464 | 465 | def __repr__(self): 466 | return "TextRazor NounPhrase at positions %s" % (str(self.words)) 467 | 468 | def __str__(self): 469 | return _generate_str(self, banned_properties=["word_positions", ]) 470 | 471 | class Property(object): 472 | """Represents a property relation extracted from raw text. A property implies an "is-a" or "has-a" relationship 473 | between the predicate (or focus) and its property. 474 | 475 | Requires the "relations" extractor to be added to the TextRazor request. 
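    A minimal usage sketch (client and text are assumed to be a configured
    TextRazor client, with the "relations" extractor, and an input string):

    >>> for prop in client.analyze(text).properties():
    ...     print([w.token for w in prop.predicate_words],
    ...           [w.token for w in prop.property_words])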
476 |     """
477 | 
478 |     def __init__(self, property_json, link_index):
479 |         self.json = property_json
480 |         self._predicate_words = []
481 |         self._property_words = []
482 | 
483 |         for callback, arg in link_index.get(("property", self.id), []):
484 |             callback(arg, self)
485 | 
486 |         for position in self.predicate_positions:
487 |             try:
488 |                 link_index[("word", position)].append((self._register_link, True))
489 |             except KeyError:
490 |                 link_index[("word", position)] = [(self._register_link, True)]
491 | 
492 |         for position in self.property_positions:
493 |             try:
494 |                 link_index[("word", position)].append((self._register_link, False))
495 |             except KeyError:
496 |                 link_index[("word", position)] = [(self._register_link, False)]
497 | 
498 |     def _register_link(self, is_predicate, word):
499 |         if is_predicate:
500 |             self._predicate_words.append(word)
501 |             word._add_property_predicate(self)
502 |         else:
503 |             self._property_words.append(word)
504 |             word._add_property_properties(self)
505 | 
506 |     id = proxy_response_json("id", None, "The unique id of this Property within the result set.")
507 | 
508 |     predicate_positions = proxy_response_json("wordPositions", [], "List of the positions of the words in the predicate (or focus) of this property.")
509 | 
510 |     predicate_words = proxy_member("_predicate_words", "List of :class:`Word` that make up the predicate (or focus) of this property.")
511 | 
512 |     property_positions = proxy_response_json("propertyPositions", [], "List of the positions of the words that modify the predicate of this property.")
513 | 
514 |     property_words = proxy_member("_property_words", "List of :class:`Word` that modify the predicate of this property.")
515 | 
516 |     def __repr__(self):
517 |         return "TextRazor Property at positions %s" % (str(self.predicate_positions))
518 | 
519 |     def __str__(self):
520 |         return _generate_str(self, banned_properties=["predicate_positions", ])
521 | 
522 | class Relation(object):
523 |     """Represents a grammatical relation between words. Typically owns a number of
524 |     :class:`RelationParam`, representing the SUBJECT and OBJECT of the relation.
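    A minimal usage sketch (client and text are assumed to be a configured
    TextRazor client, with the "relations" extractor, and an input string):

    >>> for relation in client.analyze(text).relations():
    ...     for param in relation.params:
    ...         print(param.relation, [w.token for w in param.param_words])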
525 | 526 | Requires the "relations" extractor to be added to the TextRazor request.""" 527 | 528 | def __init__(self, relation_json, link_index): 529 | self.json = relation_json 530 | 531 | self._params = [RelationParam(param, self, link_index) for param in relation_json["params"]] 532 | self._predicate_words = [] 533 | 534 | for callback, arg in link_index.get(("relation", self.id), []): 535 | callback(arg, self) 536 | 537 | for position in self.predicate_positions: 538 | try: 539 | link_index[("word", position)].append((self._register_link, None)) 540 | except KeyError: 541 | link_index[("word", position)] = [(self._register_link, None)] 542 | 543 | def _register_link(self, dummy, word): 544 | self._predicate_words.append(word) 545 | word._add_relation(self) 546 | 547 | id = proxy_response_json("id", None, "The unique id of this Relation within the result set.") 548 | 549 | predicate_positions = proxy_response_json("wordPositions", [], "List of the positions of the predicate words in this relation.") 550 | 551 | predicate_words = proxy_member("_predicate_words", "List of the positions of the predicate words in this relation.") 552 | 553 | params = proxy_member("_params", "List of the TextRazor RelationParam that are part of this relation.") 554 | 555 | def __repr__(self): 556 | return "TextRazor Relation at positions %s" % (str(self.predicate_words)) 557 | 558 | def __str__(self): 559 | return _generate_str(self, banned_properties=["predicate_positions", ]) 560 | 561 | class Word(object): 562 | """Represents a single Word (token) extracted by TextRazor. 563 | 564 | Requires the "words" extractor to be added to the TextRazor request.""" 565 | 566 | def __init__(self, response_word, link_index): 567 | self.json = response_word 568 | 569 | self._parent = None 570 | self._children = [] 571 | 572 | self._entities = [] 573 | self._entailments = [] 574 | self._relations = [] 575 | self._relation_params = [] 576 | self._property_predicates = [] 577 | self._property_properties = [] 578 | self._noun_phrases = [] 579 | 580 | for callback, arg in link_index.get(("word", self.position), []): 581 | callback(arg, self) 582 | 583 | def _add_child(self, child): 584 | self._children.append(child) 585 | 586 | def _set_parent(self, parent): 587 | self._parent = parent 588 | parent._add_child(self) 589 | 590 | def _add_entity(self, entity): 591 | self._entities.append(entity) 592 | 593 | def _add_entailment(self, entailment): 594 | self._entailments.append(entailment) 595 | 596 | def _add_relation(self, relation): 597 | self._relations.append(relation) 598 | 599 | def _add_relation_param(self, relation_param): 600 | self._relation_params.append(relation_param) 601 | 602 | def _add_property_predicate(self, property): 603 | self._property_predicates.append(property) 604 | 605 | def _add_property_properties(self, property): 606 | self._property_properties.append(property) 607 | 608 | def _add_noun_phrase(self, noun_phrase): 609 | self._noun_phrases.append(noun_phrase) 610 | 611 | parent_position = proxy_response_json("parentPosition", None, """ 612 | The position of the grammatical parent of this Word, or None if this Word is either at the root 613 | of the sentence or the "dependency-trees" extractor was not requested.""") 614 | 615 | parent = proxy_member("_parent", """ 616 | Link to the TextRazor Word that is parent of this Word, or None if this word is either at the root 617 | of the sentence or the "dependency-trees" extractor was not requested.""") 618 | 619 | relation_to_parent = 
proxy_response_json("relationToParent", None, """
620 |         Returns the grammatical relation between this word and its parent, or None if this Word is either at the root
621 |         of the sentence or the "dependency-trees" extractor was not requested.
622 | 
623 |         TextRazor parses into the Stanford uncollapsed dependencies, as detailed at:
624 | 
625 |         http://nlp.stanford.edu/software/dependencies_manual.pdf""")
626 | 
627 |     children = proxy_member("_children", """
628 |         List of TextRazor words that make up the children of this word. Returns an empty list
629 |         for leaf words, or if the "dependency-trees" extractor was not requested.""")
630 | 
631 |     position = proxy_response_json("position", None, "The position of this word in its sentence.")
632 | 
633 |     stem = proxy_response_json("stem", None, "The stem of this word.")
634 | 
635 |     lemma = proxy_response_json("lemma", None, "The morphological root of this word, see http://en.wikipedia.org/wiki/Lemma_(morphology) for details.")
636 | 
637 |     token = proxy_response_json("token", None, "The raw token string that matched this word in the source text.")
638 | 
639 |     part_of_speech = proxy_response_json("partOfSpeech", None, """
640 |         The Part of Speech that applies to this word. We use the Penn treebank tagset,
641 |         as detailed here:
642 | 
643 |         http://www.comp.leeds.ac.uk/ccalas/tagsets/upenn.html""")
644 | 
645 |     input_start_offset = proxy_response_json("startingPos", None, """
646 |         The start offset in the input text for this token. Note that this offset applies to the
647 |         original Unicode string passed in to the API; TextRazor treats multi-byte UTF-8 characters as a single position.""")
648 | 
649 |     input_end_offset = proxy_response_json("endingPos", None, """
650 |         The end offset in the input text for this token. Note that this offset applies to the
651 |         original Unicode string passed in to the API; TextRazor treats multi-byte UTF-8 characters as a single position.""")
652 | 
653 |     entailments = proxy_member("_entailments", "List of :class:`Entailment` that this word entails.")
654 | 
655 |     entities = proxy_member("_entities", "List of :class:`Entity` that this word is a part of.")
656 | 
657 |     relations = proxy_member("_relations", "List of :class:`Relation` that this word is a predicate of.")
658 | 
659 |     relation_params = proxy_member("_relation_params", "List of :class:`RelationParam` that this word is a member of.")
660 | 
661 |     property_properties = proxy_member("_property_properties", "List of :class:`Property` that this word is a property member of.")
662 | 
663 |     property_predicates = proxy_member("_property_predicates", "List of :class:`Property` that this word is a predicate (or focus) member of.")
664 | 
665 |     noun_phrases = proxy_member("_noun_phrases", "List of :class:`NounPhrase` that this word is a member of.")
666 | 
667 |     senses = proxy_response_json("senses", [], "List of {'sense', 'score'} dictionaries representing scores of each WordNet sense that this word may be a part of.")
668 | 
669 |     spelling_suggestions = proxy_response_json("spellingSuggestions", [], "List of {'suggestion', 'score'} dictionaries representing scores of each spelling suggestion that might replace this word.
This property requires the \"spelling\" extractor to be sent with your request.") 670 | 671 | def __repr__(self): 672 | return "TextRazor Word:\"%s\" at position %s" % ((self.token).encode("utf-8"), str(self.position)) 673 | 674 | def __str__(self): 675 | return _generate_str(self) 676 | 677 | class Sentence(object): 678 | """Represents a single sentence extracted by TextRazor.""" 679 | 680 | def __init__(self, sentence_json, link_index): 681 | if "words" in sentence_json: 682 | self._words = [Word(word_json, link_index) for word_json in sentence_json["words"]] 683 | else: 684 | self._words = [] 685 | 686 | self._add_links(link_index) 687 | 688 | def _add_links(self, link_index): 689 | if not self._words: 690 | return 691 | 692 | self._root_word = None 693 | 694 | # Add links between the parent/children of the dependency tree in this sentence. 695 | 696 | word_positions = {} 697 | for word in self._words: 698 | word_positions[word.position] = word 699 | 700 | for word in self._words: 701 | parent_position = word.parent_position 702 | if parent_position is not None and parent_position >= 0: 703 | word._set_parent(word_positions[parent_position]) 704 | elif word.part_of_speech not in ("$", "``", "''", "(", ")", ",", "--", ".", ":"): 705 | # Punctuation does not get attached to any parent, any non punctuation part of speech 706 | # must be the root word. 707 | self._root_word = word 708 | 709 | root_word = proxy_member("_root_word", """The root word of this sentence if "dependency-trees" extractor was requested""") 710 | 711 | words = proxy_member("_words", """List of all the :class:`Word` in this sentence""") 712 | 713 | 714 | class CustomAnnotation(object): 715 | 716 | def __init__(self, annotation_json, link_index): 717 | self.json = annotation_json 718 | 719 | for key_value in annotation_json.get("contents", []): 720 | for link in key_value.get("links", []): 721 | try: 722 | link_index[(link["annotationName"], link["linkedId"])].append((self._register_link, link)) 723 | except Exception: 724 | link_index[(link["annotationName"], link["linkedId"])] = [(self._register_link, link)] 725 | 726 | def _register_link(self, link, annotation): 727 | link["linked"] = annotation 728 | 729 | new_custom_annotation_list = [] 730 | try: 731 | new_custom_annotation_list = getattr(annotation, self.name()) 732 | except Exception: 733 | pass 734 | new_custom_annotation_list.append(self) 735 | setattr(annotation, self.name(), new_custom_annotation_list) 736 | 737 | def name(self): 738 | return self.json["name"] 739 | 740 | def __getattr__(self, attr): 741 | exists = False 742 | for key_value in self.json["contents"]: 743 | if "key" in key_value and key_value["key"] == attr: 744 | exists = True 745 | for link in key_value.get("links", []): 746 | try: 747 | yield link["linked"] 748 | except Exception: 749 | yield link 750 | for int_value in key_value.get("intValue", []): 751 | yield int_value 752 | for float_value in key_value.get("floatValue", []): 753 | yield float_value 754 | for str_value in key_value.get("stringValue", []): 755 | yield str_value 756 | for bytes_value in key_value.get("bytesValue", []): 757 | yield bytes_value 758 | 759 | if not exists: 760 | raise AttributeError("%r annotation has no attribute %r" % (self.name(), attr)) 761 | 762 | def __repr__(self): 763 | return "TextRazor CustomAnnotation:\"%s\"" % (self.json["name"]) 764 | 765 | def __str__(self): 766 | out = ["TextRazor CustomAnnotation:", str(self.json["name"]), "\n"] 767 | 768 | for key_value in self.json["contents"]: 769 | try: 770 
| out.append("Param %s:" % key_value["key"]) 771 | except Exception: 772 | out.append("Param (unlabelled):") 773 | out.append("\n") 774 | for link in self.__getattr__(key_value["key"]): 775 | out.append(repr(link)) 776 | out.append("\n") 777 | 778 | return " ".join(out) 779 | 780 | 781 | class TextRazorResponse(object): 782 | """Represents a processed response from TextRazor.""" 783 | 784 | def __init__(self, response_json): 785 | self.json = response_json 786 | 787 | self._sentences = [] 788 | self._custom_annotations = [] 789 | self._topics = [] 790 | self._coarse_topics = [] 791 | self._entities = [] 792 | self._entailments = [] 793 | self._relations = [] 794 | self._properties = [] 795 | self._noun_phrases = [] 796 | self._categories = [] 797 | 798 | link_index = {} 799 | 800 | if "response" in self.json: 801 | # There's a bit of magic here. Each annotation registers a callback with the ids and types of annotation 802 | # that it is linked to. When the linked annotation is later parsed it adds the link via the callback. 803 | # This means that annotations must be added in order of the dependency between them. 804 | 805 | if "customAnnotations" in self.json["response"]: 806 | self._custom_annotations = [CustomAnnotation(json, link_index) for json in self.json["response"]["customAnnotations"]] 807 | 808 | if "topics" in self.json["response"]: 809 | self._topics = [Topic(topic_json, link_index) for topic_json in self.json["response"]["topics"]] 810 | 811 | if "coarseTopics" in self.json["response"]: 812 | self._coarse_topics = [Topic(topic_json, link_index) for topic_json in self.json["response"]["coarseTopics"]] 813 | 814 | if "entities" in self.json["response"]: 815 | self._entities = [Entity(entity_json, link_index) for entity_json in self.json["response"]["entities"]] 816 | 817 | if "entailments" in self.json["response"]: 818 | self._entailments = [Entailment(entailment_json, link_index) for entailment_json in self.json["response"]["entailments"]] 819 | 820 | if "relations" in self.json["response"]: 821 | self._relations = [Relation(relation_json, link_index) for relation_json in self.json["response"]["relations"]] 822 | 823 | if "properties" in self.json["response"]: 824 | self._properties = [Property(property_json, link_index) for property_json in self.json["response"]["properties"]] 825 | 826 | if "nounPhrases" in self.json["response"]: 827 | self._noun_phrases = [NounPhrase(phrase_json, link_index) for phrase_json in self.json["response"]["nounPhrases"]] 828 | 829 | if "sentences" in self.json["response"]: 830 | self._sentences = [Sentence(sentence_json, link_index) for sentence_json in self.json["response"]["sentences"]] 831 | 832 | if "categories" in self.json["response"]: 833 | self._categories = [ScoredCategory(category_json) for category_json in self.json["response"]["categories"]] 834 | 835 | @property 836 | def raw_text(self): 837 | """"When the set_cleanup_return_raw option is enabled, contains the input text before any cleanup.""" 838 | return self.json["response"].get("rawText", "") 839 | 840 | @property 841 | def cleaned_text(self): 842 | """"When the set_cleanup_return_cleaned option is enabled, contains the input text after any cleanup/article extraction.""" 843 | return self.json["response"].get("cleanedText", "") 844 | 845 | @property 846 | def language(self): 847 | """"The ISO-639-2 language used to analyze this document, either explicitly provided as the languageOverride, or as detected by the language detector.""" 848 | return 
self.json["response"].get("language", "") 849 | 850 | @property 851 | def custom_annotation_output(self): 852 | """"Any output generated while running the embedded Prolog engine on your rules.""" 853 | return self.json["response"].get("customAnnotationOutput", "") 854 | 855 | ok = proxy_response_json("ok", False, """ 856 | True if TextRazor successfully analyzed your document, False if there was some error. 857 | More detailed information about the error is available in the :meth:`error` property. 858 | """) 859 | 860 | error = proxy_response_json("error", "", """ 861 | Descriptive error message of any problems that may have occurred during analysis, 862 | or an empty string if there was no error. 863 | """) 864 | 865 | message = proxy_response_json("message", "", """ 866 | Any warning or informational messages returned from the server. 867 | """) 868 | 869 | def coarse_topics(self): 870 | """Returns a list of all the coarse :class:`Topic` in the response. """ 871 | return self._coarse_topics 872 | 873 | def topics(self): 874 | """Returns a list of all the :class:`Topic` in the response. """ 875 | return self._topics 876 | 877 | def entities(self): 878 | """Returns a list of all the :class:`Entity` across all sentences in the response.""" 879 | return self._entities 880 | 881 | def words(self): 882 | """Returns a generator of all :class:`Word` across all sentences in the response.""" 883 | for sentence in self._sentences: 884 | for word in sentence.words: 885 | yield word 886 | 887 | def entailments(self): 888 | """Returns a list of all :class:`Entailment` across all sentences in the response.""" 889 | return self._entailments 890 | 891 | def relations(self): 892 | """Returns a list of all :class:`Relation` across all sentences in the response.""" 893 | return self._relations 894 | 895 | def properties(self): 896 | """Returns a list of all :class:`Property` across all sentences in the response.""" 897 | return self._properties 898 | 899 | def noun_phrases(self): 900 | """Returns a list of all the :class:`NounPhrase` across all sentences in the response.""" 901 | return self._noun_phrases 902 | 903 | def sentences(self): 904 | """Returns a list of all :class:`Sentence` in the response.""" 905 | return self._sentences 906 | 907 | def categories(self): 908 | """List of all :class:`ScoredCategory` in the response.""" 909 | return self._categories 910 | 911 | def matching_rules(self): 912 | """Returns a list of rule names that matched this document.""" 913 | return [custom_annotation.name() for custom_annotation in self._custom_annotations] 914 | 915 | def summary(self): 916 | return """Request processed in: %s seconds. Num Sentences:%s""" % ( 917 | self.json["time"], len(self.json["response"]["sentences"]) 918 | ) 919 | 920 | def __getattr__(self, attr): 921 | exists = False 922 | for custom_annotation in self._custom_annotations: 923 | if custom_annotation.name() == attr: 924 | exists = True 925 | yield custom_annotation 926 | 927 | if not exists: 928 | raise AttributeError("TextRazor response has no annotation %r" % attr) 929 | 930 | 931 | class AllDictionaryEntriesResponse(object): 932 | 933 | def __init__(self, json): 934 | self.json = json 935 | 936 | self.entries = [DictionaryEntry(dictionary_json) for dictionary_json in json.get("entries", [])] 937 | 938 | total = proxy_response_json("total", 0, """ 939 | The total number of DictionaryEntry in this Dictionary. 940 | """) 941 | 942 | limit = proxy_response_json("limit", 0, """ 943 | The maximium number of DictionaryEntry to be returned. 
944 |     """)
945 | 
946 |     offset = proxy_response_json("offset", 0, """
947 |     Offset into the full list of DictionaryEntry that this result set started from.
948 |     """)
949 | 
950 | 
951 | class DictionaryManager(TextRazorConnection):
952 | 
953 |     path = "entities/"
954 | 
955 |     def __init__(self, api_key=None):
956 |         super(DictionaryManager, self).__init__(api_key)
957 | 
958 |     def create_dictionary(self, dictionary_properties):
959 |         """ Creates a new dictionary using the properties provided in the dictionary_properties dict.
960 |         See the properties of class Dictionary for valid options.
961 | 
962 |         >>> import textrazor
963 |         >>> dictionary_manager = textrazor.DictionaryManager("YOUR_API_KEY_HERE")
964 |         >>>
965 |         >>> dictionary_manager.create_dictionary({"id":"UNIQUE_ID"})
966 |         """
967 | 
968 |         new_dictionary = Dictionary({})
969 | 
970 |         for key, value in dictionary_properties.items():
971 |             if not hasattr(new_dictionary, key):
972 |                 valid_options = ",".join(name for name, obj in Dictionary.__dict__.items() if isinstance(obj, proxy_response_json))
973 | 
974 |                 raise TextRazorAnalysisException("Cannot create dictionary, unexpected param: %s. Supported params: %s" % (key, valid_options))
975 | 
976 |             setattr(new_dictionary, key, value)
977 | 
978 |         # Check for the existence of a dictionary ID, without that
979 |         # we can't generate a URL and the server will return an unhelpful message.
980 |         if not new_dictionary.id:
981 |             raise TextRazorAnalysisException("Cannot create dictionary, dictionary id not provided.")
982 | 
983 |         dictionary_path = "".join([self.path, new_dictionary.id])
984 | 
985 |         self.do_request(dictionary_path, json.dumps(new_dictionary.json), method="PUT")
986 | 
987 |         # The server may have added some optional fields so we want to force the user to "get" the new dictionary.
988 |         return self.get_dictionary(new_dictionary.id)
989 | 
990 |     def all_dictionaries(self):
991 |         """ Returns a list of all Dictionary in your account.
992 | 
993 |         >>> for dictionary in dictionary_manager.all_dictionaries():
994 |         ...     print(dictionary.id)
995 |         """
996 | 
997 |         response = self.do_request(self.path)
998 | 
999 |         if "ok" in response and not response["ok"]:
1000 |             raise TextRazorAnalysisException("TextRazor was unable to retrieve all dictionaries. Error: %s" % str(response))
1001 | 
1002 |         if "dictionaries" in response:
1003 |             return [Dictionary(dictionary_json) for dictionary_json in response["dictionaries"]]
1004 | 
1005 |         return []
1006 | 
1007 |     def get_dictionary(self, id):
1008 |         """ Returns a Dictionary object by id.
1009 | 
1010 |         >>> print(dictionary_manager.get_dictionary("UNIQUE_ID").language)
1011 |         """
1012 |         dictionary_path = "".join([self.path, id])
1013 |         response = self.do_request(dictionary_path, method="GET")
1014 | 
1015 |         if "ok" in response and not response["ok"]:
1016 |             raise TextRazorAnalysisException("TextRazor was unable to retrieve dictionary with id: %s. Error: %s" % (id, str(response)))
1017 | 
1018 |         return Dictionary(response["response"])
1019 | 
1020 |     def delete_dictionary(self, id):
1021 |         """ Deletes a dictionary and all its entries by id.
1022 | 
1023 |         >>> dictionary_manager.delete_dictionary("UNIQUE_ID")
1024 |         """
1025 |         dictionary_path = "".join([self.path, id])
1026 |         response = self.do_request(dictionary_path, method="DELETE")
1027 | 
1028 |         if "ok" in response and not response["ok"]:
1029 |             raise TextRazorAnalysisException("Unable to delete dictionary with ID:%s.
Error: %s" % (id, str(response))) 1030 | 1031 | def all_entries(self, dictionary_id, limit=None, offset=None): 1032 | """ Returns a AllDictionaryEntriesResponse containing all DictionaryEntry for dictionary with id dictionary_id, along with paging information. 1033 | 1034 | Larger dictionaries can be too large to download all at once. Where possible it is recommended that you use 1035 | limit and offset paramaters to control the TextRazor response, rather than filtering client side. 1036 | 1037 | >>> entry_response = dictionary_manager.all_entries("UNIQUE_ID", limit=10, offset=10) 1038 | >>> for entry in entry_response.entries: 1039 | >>> print entry.text 1040 | """ 1041 | 1042 | params = {} 1043 | if limit: 1044 | params['limit'] = limit 1045 | if offset: 1046 | params['offset'] = offset 1047 | 1048 | all_path = "".join([self.path, dictionary_id, "/_all?", urlencode(params)]) 1049 | 1050 | response = self.do_request(all_path, method="GET") 1051 | 1052 | if "ok" in response and not response["ok"]: 1053 | raise TextRazorAnalysisException("TextRazor was unable to retrieve dictionary entries with dictionary id: %s, Error: %s" % (dictionary_id, str(response))) 1054 | 1055 | return AllDictionaryEntriesResponse(response["response"]) 1056 | 1057 | def add_entries(self, dictionary_id, entities): 1058 | """ Adds entries to a dictionary with id dictionary_id. 1059 | 1060 | Entries must be a List of dicts corresponding to properties of the new DictionaryEntry objects. 1061 | At a minimum this would be [{'text':'test text to match'}]. 1062 | 1063 | >>> dictionary_manager.add_entries("UNIQUE_ID", [{'text':'test text to match'}, {'text':'more text to match', 'id':'UNIQUE_ENTRY_ID'}]) 1064 | """ 1065 | dictionary_path = "".join([self.path, dictionary_id, "/"]) 1066 | all_entries = [] 1067 | 1068 | for entity in entities: 1069 | new_entry = DictionaryEntry({}) 1070 | 1071 | for key, value in entity.items(): 1072 | if not hasattr(new_entry, key): 1073 | valid_options = ",".join(name for name, obj in DictionaryEntry.__dict__.items() if isinstance(obj, proxy_response_json)) 1074 | 1075 | raise TextRazorAnalysisException("Cannot create dictionary entry, unexpected param: %s. Supported params: %s" % (key, valid_options)) 1076 | 1077 | setattr(new_entry, key, value) 1078 | 1079 | all_entries.append(new_entry.json) 1080 | 1081 | # For performance reasons TextRazor expects a maximum of 20000 dictionary entries at a time, 1082 | # we transparently batch them up here. 1083 | 1084 | for batch in _chunks(all_entries, 20000): 1085 | response = self.do_request(dictionary_path, json.dumps(batch), method="POST") 1086 | 1087 | if "ok" in response and not response["ok"]: 1088 | raise TextRazorAnalysisException("Unable to add entries to dictionary with ID:%s. Error: %s" % (dictionary_id, str(response))) 1089 | 1090 | def delete_entry(self, dictionary_id, entry_id): 1091 | """Deletes a specific DictionaryEntry by dictionary id and entry id. 1092 | 1093 | For performance reasons it's always faster to perform major changes 1094 | to dictionaries by deleting and recreating the whole dictionary rather than removing 1095 | many individual entries. 
1096 | 
1097 |         >>> dictionary_manager.delete_entry('UNIQUE_ID', 'UNIQUE_ENTRY_ID')
1098 |         """
1099 | 
1100 |         dictionary_path = "".join([self.path, dictionary_id, "/", entry_id])
1101 | 
1102 |         response = self.do_request(dictionary_path, method="DELETE")
1103 | 
1104 |         if "ok" in response and not response["ok"]:
1105 |             raise TextRazorAnalysisException("TextRazor was unable to delete dictionary entry with dictionary id: %s, entry id: %s. Error: %s" % (dictionary_id, entry_id, str(response)))
1106 | 
1107 |     def get_entry(self, dictionary_id, entry_id):
1108 |         """ Retrieves a specific DictionaryEntry by dictionary id and entry id.
1109 | 
1110 |         >>> print(dictionary_manager.get_entry('UNIQUE_ID', 'UNIQUE_ENTRY_ID').text)
1111 |         """
1112 | 
1113 |         dictionary_path = "".join([self.path, dictionary_id, "/", entry_id])
1114 | 
1115 |         response = self.do_request(dictionary_path, method="GET")
1116 | 
1117 |         if "ok" in response and not response["ok"]:
1118 |             raise TextRazorAnalysisException("TextRazor was unable to retrieve dictionary entry with dictionary id: %s, entry id: %s. Error: %s" % (dictionary_id, entry_id, str(response)))
1119 | 
1120 |         return DictionaryEntry(response["response"])
1121 | 
1122 | 
1123 | class DictionaryEntry(object):
1124 | 
1125 |     def __init__(self, json):
1126 |         self.json = json
1127 | 
1128 |     id = proxy_response_json("id", "", """
1129 |     Unique ID for this entry, used to identify and manipulate specific entries.
1130 | 
1131 |     Defaults to an automatically generated unique id.
1132 |     """)
1133 | 
1134 |     text = proxy_response_json("text", "", """
1135 |     Unicode string representing the text to match to this DictionaryEntry.
1136 |     """)
1137 | 
1138 |     data = proxy_response_json("data", {}, """
1139 |     A dictionary mapping string keys to lists of string data values.
1140 |     TextRazor will return this dictionary to you as part of the Entity 'data' property whenever it matches this entry.
1141 |     This is useful for adding application-specific metadata to each entry.
1142 | 
1143 |     >>> {'type':['people', 'person', 'politician']}
1144 |     """)
1145 | 
1146 | 
1147 | class Dictionary(object):
1148 | 
1149 |     def __init__(self, json):
1150 |         self.json = json
1151 | 
1152 |     match_type = proxy_response_json("matchType", "", """
1153 |     Controls any pre-processing done on your dictionary before matching.
1154 | 
1155 |     Valid options are:
1156 |         stem - Words are split and "stemmed" before matching, resulting in a more relaxed match.
1157 |                This is an easy way to match plurals - love, loved, loves will all match the same dictionary entry.
1158 |                This implicitly sets "case_insensitive" to True.
1159 | 
1160 |         token - Words are split and matched literally.
1161 | 
1162 |     Defaults to 'token'.""")
1163 | 
1164 |     case_insensitive = proxy_response_json("caseInsensitive", False, """
1165 |     When True, this dictionary will match both uppercase and lowercase characters.
1166 |     """)
1167 | 
1168 |     id = proxy_response_json("id", "", """
1169 |     The unique identifier for this dictionary.
1170 |     """)
1171 | 
1172 |     language = proxy_response_json("language", "", """
1173 |     When set to an ISO-639-2 language code, this dictionary will only match documents of the corresponding language.
1174 | 
1175 |     When set to 'any', this dictionary will match any document.
1176 | 
1177 |     Defaults to 'any'.
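    For example (illustrative only), an English-only dictionary could be created with:

    >>> dictionary_manager.create_dictionary({"id": "UNIQUE_ID", "language": "eng"})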
1178 |     """)
1179 | 
1180 | 
1181 | class AllCategoriesResponse(object):
1182 | 
1183 |     def __init__(self, json):
1184 |         self.json = json
1185 |         self.categories = [Category(category_json) for category_json in json.get("categories", [])]
1186 | 
1187 |     total = proxy_response_json("total", 0, """
1188 |     The total number of Category in this Classifier.
1189 |     """)
1190 | 
1191 |     limit = proxy_response_json("limit", 0, """
1192 |     The maximum number of Category to be returned.
1193 |     """)
1194 | 
1195 |     offset = proxy_response_json("offset", 0, """
1196 |     Offset into the full list of Category that this result set started from.
1197 |     """)
1198 | 
1199 | 
1200 | class ScoredCategory(object):
1201 | 
1202 |     def __init__(self, json):
1203 |         self.json = json
1204 | 
1205 |     classifier_id = proxy_response_json("classifierId", "", """
1206 |     The unique identifier for the classifier that matched this ScoredCategory.
1207 |     """)
1208 | 
1209 |     category_id = proxy_response_json("categoryId", "", """
1210 |     The unique identifier of this category.
1211 |     """)
1212 | 
1213 |     label = proxy_response_json("label", "", """
1214 |     The human-readable label for this category.
1215 |     """)
1216 | 
1217 |     score = proxy_response_json("score", 0, """
1218 |     The score TextRazor has assigned to this category, between 0 and 1.
1219 |     """)
1220 | 
1221 | 
1222 | class Category(object):
1223 |     path = "categories/"
1224 | 
1225 |     def __init__(self, json):
1226 |         self.json = json
1227 | 
1228 |     query = proxy_response_json("query", "", """The query used to define this category.""")
1229 | 
1230 |     category_id = proxy_response_json("categoryId", "", """The unique ID for this category within its classifier.""")
1231 | 
1232 |     label = proxy_response_json("label", "", """The human-readable label for this category. This is an optional field.""")
1233 | 
1234 | 
1235 | class ClassifierManager(TextRazorConnection):
1236 | 
1237 |     path = "categories/"
1238 | 
1239 |     def __init__(self, api_key=None):
1240 |         super(ClassifierManager, self).__init__(api_key)
1241 | 
1242 |     def delete_classifier(self, classifier_id):
1243 |         """ Deletes a Classifier and all its Categories by id. """
1244 |         classifier_path = "".join([self.path, classifier_id])
1245 |         self.do_request(classifier_path, method="DELETE")
1246 | 
1247 |     def create_classifier(self, classifier_id, categories):
1248 |         """ Creates a new classifier using the provided list of Category.
1249 | 
1250 |         See the properties of class Category for valid options. """
1251 | 
1252 |         classifier_path = "".join([self.path, classifier_id])
1253 | 
1254 |         all_categories = []
1255 | 
1256 |         for category in categories:
1257 |             new_category = Category({})
1258 | 
1259 |             for key, value in category.items():
1260 |                 if not hasattr(new_category, key):
1261 |                     valid_options = ",".join(name for name, obj in Category.__dict__.items() if isinstance(obj, proxy_response_json))
1262 | 
1263 |                     raise TextRazorAnalysisException("Cannot create category, unexpected param: %s. Supported params: %s" % (key, valid_options))
1264 | 
1265 |                 setattr(new_category, key, value)
1266 | 
1267 |             all_categories.append(new_category.json)
1268 | 
1269 |         self.do_request(classifier_path, json.dumps(all_categories), content_type="application/json", method="PUT")
1270 | 
1271 |     def create_classifier_with_csv(self, classifier_id, categories_csv):
1272 |         """ Uploads the string contents of a CSV file containing new categories to be added to the classifier with id classifier_id.
1273 |         Any existing classifier with this ID will be replaced.
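        A minimal usage sketch ("categories.csv" is a hypothetical local file):

        >>> with open("categories.csv") as csv_file:
        ...     classifier_manager.create_classifier_with_csv("UNIQUE_CLASSIFIER_ID", csv_file.read())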
""" 1274 | 1275 | classifier_path = "".join([self.path, classifier_id]) 1276 | self.do_request(classifier_path, categories_csv, content_type="application/csv", method="PUT") 1277 | 1278 | def all_categories(self, classifier_id, limit=None, offset=None): 1279 | """ Returns a AllCategoriesResponse containing all Categories for classifier with id classifier_id, along with paging information. 1280 | 1281 | Larger classifiers can be too large to download all at once. Where possible it is recommended that you use 1282 | limit and offset paramaters to control the TextRazor response, rather than filtering client side. 1283 | 1284 | >>> category_response = classifier_manager.all_entries("UNIQUE_CLASSIFIER_ID", limit=10, offset=10) 1285 | >>> for category in category_response.categories: 1286 | >>> print category.text 1287 | """ 1288 | 1289 | params = {} 1290 | if limit: 1291 | params['limit'] = limit 1292 | if offset: 1293 | params['offset'] = offset 1294 | 1295 | all_path = "".join([self.path, classifier_id, "/_all?", urlencode(params)]) 1296 | 1297 | response = self.do_request(all_path, method="GET") 1298 | 1299 | if "ok" in response and not response["ok"]: 1300 | raise TextRazorAnalysisException("TextRazor was unable to retrieve categories for classifier id: %s, Error: %s" % (classifier_id, str(response))) 1301 | 1302 | return AllCategoriesResponse(response["response"]) 1303 | 1304 | def delete_category(self, classifier_id, category_id): 1305 | """ Deletes a Category by ID. """ 1306 | category_path = "".join([self.path, classifier_id, "/", category_id]) 1307 | self.do_request(category_path, method="DELETE") 1308 | 1309 | def get_category(self, classifier_id, category_id): 1310 | """ Returns a Category by ID. """ 1311 | category_path = "".join([self.path, classifier_id, "/", category_id]) 1312 | 1313 | response = self.do_request(category_path, method="GET") 1314 | 1315 | if "ok" in response and not response["ok"]: 1316 | raise TextRazorAnalysisException("TextRazor was unable to retrieve category for classifier id: %s, Error: %s" % (classifier_id, str(response))) 1317 | 1318 | return Category(response["response"]) 1319 | 1320 | class Account(object): 1321 | 1322 | def __init__(self, json): 1323 | self.json = json 1324 | 1325 | plan = proxy_response_json("plan", "", """ 1326 | The ID of your current subscription plan. 1327 | """) 1328 | 1329 | concurrent_request_limit = proxy_response_json("concurrentRequestLimit", 0, """ 1330 | The maximum number of requests your account can make at the same time. 1331 | """) 1332 | 1333 | concurrent_requests_used = proxy_response_json("concurrentRequestsUsed", 0, """ 1334 | The number of requests currently being processed by your account. 1335 | """) 1336 | 1337 | plan_daily_included_requests = proxy_response_json("planDailyRequestsIncluded", 0, """ 1338 | The daily number of requests included with your subscription plan. 1339 | """) 1340 | 1341 | requests_used_today = proxy_response_json("requestsUsedToday", 0, """ 1342 | The total number of requests that have been made today. 1343 | """) 1344 | 1345 | class AccountManager(TextRazorConnection): 1346 | 1347 | path = "account/" 1348 | 1349 | def __init__(self, api_key=None): 1350 | super(AccountManager, self).__init__(api_key) 1351 | 1352 | def get_account(self): 1353 | """ Retrieves the Account settings and realtime usage statistics for your account. 1354 | 1355 | This call does not count towards your daily request or concurrency limits. 
1356 | 
1357 |         >>> import textrazor
1358 |         >>> textrazor.api_key = "YOUR_API_KEY_HERE"
1359 |         >>>
1360 |         >>> account_manager = textrazor.AccountManager()
1361 |         >>>
1362 |         >>> print(account_manager.get_account().requests_used_today)
1363 |         """
1364 | 
1365 |         response = self.do_request(self.path, method="GET")
1366 | 
1367 |         if "ok" in response and not response["ok"]:
1368 |             raise TextRazorAnalysisException("TextRazor was unable to retrieve your account details. Error: %s" % str(response))
1369 | 
1370 |         return Account(response["response"])
1371 | 
1372 | 
1373 | class TextRazor(TextRazorConnection):
1374 |     """
1375 |     The main TextRazor client. To process your text, create a :class:`TextRazor` instance with your API key
1376 |     and set the extractors you need to process the text. Calls to :meth:`analyze` and :meth:`analyze_url` will then process raw text or URLs,
1377 |     returning a :class:`TextRazorResponse` on success.
1378 | 
1379 |     This class is threadsafe once initialized with the request options. You should create a new instance for each request
1380 |     if you are likely to be changing the request options in a multithreaded environment.
1381 | 
1382 |     Below is an entity extraction example from the tutorial; you can find more examples at http://www.textrazor.com/tutorials.
1383 | 
1384 |     >>> import textrazor
1385 |     >>>
1386 |     >>> client = textrazor.TextRazor("API_KEY_GOES_HERE", extractors=["entities"])
1387 |     >>> client.set_cleanup_mode("cleanHTML")
1388 |     >>>
1389 |     >>> response = client.analyze_url("http://www.bbc.co.uk/news/uk-politics-18640916")
1390 |     >>>
1391 |     >>> entities = list(response.entities())
1392 |     >>> entities.sort(key=lambda x: x.relevance_score, reverse=True)
1393 |     >>>
1394 |     >>> seen = set()
1395 |     >>> for entity in entities:
1396 |     >>>     if entity.id not in seen:
1397 |     >>>         print(entity.id, entity.relevance_score, entity.confidence_score, entity.freebase_types)
1398 |     >>>         seen.add(entity.id)
1399 |     """
1400 | 
1401 |     def __init__(self, api_key=None, extractors=None, do_compression=None, do_encryption=None):
1402 |         super(TextRazor, self).__init__(api_key, do_compression, do_encryption)
1403 | 
1404 |         self.extractors = extractors if extractors is not None else []
1405 |         self.cleanup_html = False
1406 |         self.cleanup_mode = None
1407 |         self.cleanup_return_cleaned = None
1408 |         self.cleanup_return_raw = None
1409 |         self.cleanup_use_metadata = None
1410 |         self.download_user_agent = None
1411 |         self.rules = ""
1412 |         self.language_override = None
1413 |         self.enrichment_queries = []
1414 |         self.dbpedia_type_filters = []
1415 |         self.freebase_type_filters = []
1416 |         self.allow_overlap = None
1417 |         self.entity_dictionaries = []
1418 |         self.classifiers = []
1419 |         self.classifier_max_categories = None
1420 | 
1421 |     def set_extractors(self, extractors):
1422 |         """Sets a list of "extractors", which extract various information from your text.
1423 |         Only select the extractors that are explicitly required by your application for optimal performance.
1424 |         Any extractor that doesn't match one of the predefined options below will be assumed to be a custom Prolog extractor.
1425 | 
1426 |         Valid options are: words, phrases, entities, dependency-trees, relations, entailments. """
1427 |         self.extractors = extractors
1428 | 
1429 |     def set_rules(self, rules):
1430 |         """Sets a string containing Prolog logic. All rules matching an extractor name listed in the request will be evaluated,
1431 |         and all matching param combinations included in the response. """
""" 1432 | self.rules = rules 1433 | 1434 | def set_enrichment_queries(self, enrichment_queries): 1435 | """Set a list of "Enrichment Queries", used to enrich the entity response with structured linked data. 1436 | The syntax for these queries is documented at https://www.textrazor.com/enrichment """ 1437 | self.enrichment_queries = enrichment_queries 1438 | 1439 | def set_language_override(self, language_override): 1440 | """When set to a ISO-639-2 language code, force TextRazor to analyze content with this language. 1441 | If not set TextRazor will use the automatically identified language. 1442 | """ 1443 | self.language_override = language_override 1444 | 1445 | def set_do_cleanup_HTML(self, cleanup_html): 1446 | """When True, input text is treated as raw HTML and will be cleaned of tags, comments, scripts, 1447 | and boilerplate content removed. When this option is enabled, the cleaned_text property is returned 1448 | with the text content, providing access to the raw filtered text. When enabled, position offsets returned 1449 | in individual words apply to the clean text, not the provided HTML.""" 1450 | 1451 | warnings.warn("set_do_cleanup_HTML has been deprecated. Please see set_cleanup_mode for a more flexible cleanup option.", DeprecationWarning) 1452 | 1453 | self.cleanup_html = cleanup_html 1454 | 1455 | def set_cleanup_mode(self, cleanup_mode): 1456 | """Controls the preprocessing cleanup mode that TextRazor will apply to your content before analysis. 1457 | For all options aside from "raw" any position offsets returned will apply to the final cleaned text, 1458 | not the raw HTML. If the cleaned text is required please see the :meth:`set_cleanup_return_cleaned' option. 1459 | 1460 | Valid options are: 1461 | raw - Content is analyzed "as-is", with no preprocessing. 1462 | cleanHTML - Boilerplate HTML is removed prior to analysis, including tags, comments, menus, leaving only the 1463 | body of the article. 1464 | stripTags - All Tags are removed from the document prior to analysis. This will remove all HTML, XML tags, but 1465 | the content of headings, menus will remain. This is a good option for analysis of HTML pages that aren't 1466 | long form documents. 1467 | 1468 | Defaults to "raw" for analyze requests, and "cleanHTML" for analyze_url requests. 1469 | """ 1470 | self.cleanup_mode = cleanup_mode 1471 | 1472 | def set_cleanup_return_cleaned(self, return_cleaned): 1473 | """When return_cleaned is True, the TextRazor response will contain the cleaned_text property. To save bandwidth, only set this to 1474 | True if you need it in your application. Defaults to False.""" 1475 | self.cleanup_return_cleaned = return_cleaned 1476 | 1477 | def set_cleanup_return_raw(self, return_raw): 1478 | """When return_raw is True, the TextRazor response will contain the raw_text property, the original text TextRazor received or downloaded 1479 | before cleaning. To save bandwidth, only set this to True if you need it in your application. Defaults to False.""" 1480 | self.cleanup_return_raw = return_raw 1481 | 1482 | def set_cleanup_use_metadata(self, use_metadata): 1483 | """When use_metadata is True, TextRazor will use metadata extracted from your document to help in the disambiguation/extraction 1484 | process. This include HTML titles and metadata, and can significantly improve results for shorter documents without much other 1485 | content. 1486 | 1487 | This option has no effect when cleanup_mode is 'raw'. 
1488 | """ 1489 | self.cleanup_use_metadata = use_metadata 1490 | 1491 | def set_download_user_agent(self, user_agent): 1492 | """Sets the User-Agent header to be used when downloading URLs through analyze_url. This should be a descriptive string identifying 1493 | your application, or an end user's browser user agent if you are performing live requests from a given user. 1494 | 1495 | Defaults to "TextRazor Downloader (https://www.textrazor.com)" 1496 | """ 1497 | self.download_user_agent = user_agent 1498 | 1499 | def set_entity_dictionaries(self, entity_dictionaries): 1500 | """Sets a list of the custom entity dictionaries to match against your content. Each item should be a string ID 1501 | corresponding to dictionaries you have previously configured through the textrazor.Dictionary interface.""" 1502 | self.entity_dictionaries = entity_dictionaries 1503 | 1504 | def set_entity_allow_overlap(self, allow_overlap): 1505 | """When allow_overlap is True, entities in the response may overlap. When False, the "best" entity 1506 | is found such that none overlap. Defaults to True. """ 1507 | self.allow_overlap = allow_overlap 1508 | 1509 | def set_entity_dbpedia_type_filters(self, filters): 1510 | """Set a list of DBPedia types to filter entity extraction on. All returned entities must 1511 | match at least one of these types.""" 1512 | self.dbpedia_type_filters = filters 1513 | 1514 | def set_entity_freebase_type_filters(self, filters): 1515 | """Set a list of Freebase types to filter entity extraction on. All returned entities must 1516 | match at least one of these types.""" 1517 | self.freebase_type_filters = filters 1518 | 1519 | def set_classifiers(self, classifiers): 1520 | """Sets a list of classifiers to evaluate against your document. Each entry should be a string ID corresponding to either one of TextRazor's default classifiers, or one you have previously configured through the ClassifierManager interface. 1521 | 1522 | Valid Options are: 1523 | textrazor_iab Score against the Internet Advertising Bureau QAG segments - approximately 400 high level categories arranged into two tiers. 1524 | textrazor_newscodes Score against the IPTC newscodes - approximately 1400 high level categories organized into a three level tree. 
1525 | custom classifier name Score against a custom classifier, previously created through the Classifier Manager interface.""" 1526 | self.classifiers = classifiers 1527 | 1528 | def set_classifier_max_categories(self, max_categories): 1529 | """Sets the maximum number of matching categories to retrieve from the TextRazor.""" 1530 | self.classifier_max_categories = max_categories 1531 | 1532 | def _add_optional_param(self, post_data, param, value): 1533 | if value is not None: 1534 | post_data.append((param, value)) 1535 | 1536 | def _build_post_data(self): 1537 | post_data = [("rules", self.rules), 1538 | ("extractors", ",".join(self.extractors)), 1539 | ("cleanupHTML", self.cleanup_html), 1540 | ("classifiers", ",".join(self.classifiers))] 1541 | 1542 | for dictionary in self.entity_dictionaries: 1543 | post_data.append(("entities.dictionaries", dictionary)) 1544 | 1545 | for filter in self.dbpedia_type_filters: 1546 | post_data.append(("entities.filterDbpediaTypes", filter)) 1547 | 1548 | for filter in self.freebase_type_filters: 1549 | post_data.append(("entities.filterFreebaseTypes", filter)) 1550 | 1551 | for query in self.enrichment_queries: 1552 | post_data.append(("entities.enrichmentQueries", query)) 1553 | 1554 | self._add_optional_param(post_data, "entities.allowOverlap", self.allow_overlap) 1555 | self._add_optional_param(post_data, "languageOverride", self.language_override) 1556 | self._add_optional_param(post_data, "cleanup.mode", self.cleanup_mode) 1557 | self._add_optional_param(post_data, "cleanup.returnCleaned", self.cleanup_return_cleaned) 1558 | self._add_optional_param(post_data, "cleanup.returnRaw", self.cleanup_return_raw) 1559 | self._add_optional_param(post_data, "cleanup.useMetadata", self.cleanup_use_metadata) 1560 | self._add_optional_param(post_data, "download.userAgent", self.download_user_agent) 1561 | self._add_optional_param(post_data, "classifier.maxCategories", self.classifier_max_categories) 1562 | 1563 | return post_data 1564 | 1565 | def analyze_url(self, url): 1566 | """Calls the TextRazor API with the provided url. 1567 | 1568 | TextRazor will first download the contents of this URL, and then process the resulting text. 1569 | 1570 | TextRazor will only attempt to analyze text documents. Any invalid UTF-8 characters will be replaced with a space character and ignored. 1571 | TextRazor limits the total download size to approximately 1M. Any larger documents will be truncated to that size, and a warning 1572 | will be returned in the response. 1573 | 1574 | By default, TextRazor will clean all HTML prior to processing. For more control of the cleanup process, 1575 | see the :meth:`set_cleanup_mode' option. 1576 | 1577 | Returns a :class:`TextRazorResponse` with the parsed data on success. 1578 | Raises a :class:`TextRazorAnalysisException` on failure. """ 1579 | 1580 | post_data = self._build_post_data() 1581 | post_data.append(("url", url.encode("utf-8"))) 1582 | 1583 | return TextRazorResponse(self.do_request("", urlencode(post_data), method="POST")) 1584 | 1585 | def analyze(self, text): 1586 | """Calls the TextRazor API with the provided unicode text. 1587 | 1588 | Returns a :class:`TextRazorResponse` with the parsed data on success. 1589 | Raises a :class:`TextRazorAnalysisException` on failure. 
""" 1590 | 1591 | post_data = self._build_post_data() 1592 | post_data.append(("text", text.encode("utf-8"))) 1593 | 1594 | return TextRazorResponse(self.do_request("", urlencode(post_data), method="POST")) 1595 | --------------------------------------------------------------------------------