2 |
3 |
4 | Foobar Man
5 |
6 |
--------------------------------------------------------------------------------
/calais/tests/data/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Foobar Man
5 |
6 |
--------------------------------------------------------------------------------
/calais/rdf/client.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | RDF Extension for pycalais.
4 |
5 | The original RDF extension was written by Mark Soper for Jordan Dimov's
6 | original Python interface to the OpenCalais API.
7 | """
8 | from calais.base.client import Calais
9 | from calais.rdf.response import RDFCalaisResponse
10 |
11 |
class RDFCalais(Calais):
    """
    Calais client variant that asks for ``xml/rdf`` output and hands the
    reply to ``RDFCalaisResponse``.
    """
    # Processing directives for this client; the output format is RDF/XML.
    processing_directives = dict(
        contentType='TEXT/RAW',
        outputFormat='xml/rdf',
        reltagBaseURL=None,
        calculateRelevanceScore='true',
        enableMetadataType=None,
        discardMetadata=None,
        omitOutputtingOriginalText='true',
    )

    def analyze(self, content, content_type='TEXT/RAW', external_id=None):
        """
        Delegate to ``Calais.analyze`` with ``RDFCalaisResponse`` as the
        response class and return whatever the base implementation returns.
        """
        base = super(RDFCalais, self)
        return base.analyze(content, content_type, external_id,
                            response_cls=RDFCalaisResponse)
25 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | from setuptools import setup
4 | from setuptools import find_packages
5 |
6 |
# Trove classifiers passed to ``setup()`` below.
classifiers = [
    "Intended Audience :: Developers",
    "Programming Language :: Python",
    "Operating System :: OS Independent",
    "Topic :: Software Development :: Libraries",
    "Environment :: Web Environment",
    "License :: OSI Approved :: BSD License",
    "Development Status :: 5 - Production/Stable",
]

requires = ["rdflib==3.0", "rdfextras==0.1", ]
# This might not be the best idea, but I did not encounter any bug
# while testing with both libraries.
# simplejson is only required when the stdlib ``json`` module is missing.
try:
    import json
except ImportError:
    requires.append('simplejson>=2.0')

# Read the long description through a context manager so the file handle
# is closed deterministically instead of being left to the garbage collector.
with open('README.rst') as readme:
    long_description = readme.read()


setup(name='pycalais',
      # Importing the package here means ``calais`` must be importable
      # at build time.
      version=__import__('calais').__version__,
      license='BSD',
      url='https://github.com/newsgrape/pycalais',
      packages=find_packages(),
      description='Python interface to the OpenCalais REST API',
      long_description=long_description,
      keywords="opencalais calais rdf",
      classifiers=classifiers,
      install_requires=requires,
      )
37 |
--------------------------------------------------------------------------------
/calais/exceptions.py:
--------------------------------------------------------------------------------
1 | """
2 | Exceptions related to OpenCalais.
3 | """
class CalaisError(Exception):
    """Base exception for errors returned by OpenCalais."""
9 |
10 |
class MaxQpsExceeded(CalaisError):
    """Raised when the maximum number of requests per second was reached."""
16 |
17 |
class BusyCalais(CalaisError):
    """Raised when OpenCalais reports that it is busy at the moment."""
23 |
24 |
class LanguageUnsupported(CalaisError):
    """
    Raised when OpenCalais does not support the language of the content.

    According to the original author, this may also happen when "unusual"
    content, such as scores or lots of tabular data, is sent to OpenCalais.
    """
33 |
34 |
class MaxLenExceeded(CalaisError):
    """
    Raised when more content was sent than OpenCalais accepts.

    The original author could not find a true limit: some sources state
    100 000 characters, others say around 20-40 kByte.
    """
43 |
44 |
class GatewayTimeout(CalaisError):
    """Raised when the OpenCalais gateway timed out."""
50 |
--------------------------------------------------------------------------------
/calais/rdf/response.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | CalaisResponse queryable through SPARQL.
4 |
5 | The original RDF extension was written by Mark Soper for Jordan Dimov's
6 | original Python interface to the OpenCalais API.
7 | """
8 | from StringIO import StringIO
9 |
10 | from rdflib import ConjunctiveGraph as Graph
11 | from rdflib import Namespace
12 | from rdflib import plugin
13 | from rdflib import query
14 |
15 | from calais.base.response import CalaisResponse
16 |
17 |
# Register RDFLib SPARQL support: each entry is the plugin kind followed by
# the module path and class name of the rdfextras implementation.
_SPARQL_PLUGINS = (
    (query.Processor, 'rdfextras.sparql.processor', 'Processor'),
    (query.Result, 'rdfextras.sparql.query', 'SPARQLQueryResult'),
)
for _plugin_args in _SPARQL_PLUGINS:
    plugin.register('sparql', *_plugin_args)
23 |
24 | # SPARQL Queries
# Query selecting the document categories from a response graph.
# 'fields' names the selected variables in SELECT order; 'SPARQL' holds the
# query text. Each PREFIX declaration needs its <IRI>, otherwise the query
# is not valid SPARQL; ``cp`` is the OpenCalais predicate namespace.
CATEGORY_QUERY = {
    'fields': ['docId', 'category', 'categoryName', 'score'],
    'SPARQL': """
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX cp: <http://s.opencalais.com/1/pred/>

        SELECT ?docId ?category ?categoryName ?score
        WHERE { ?doc cp:docId ?docId .
                ?doc cp:category ?category .
                ?doc cp:categoryName ?categoryName .
                ?doc cp:score ?score . }
        """,
}
36 |
# Query selecting entities together with their relevance and, optionally,
# the resource each entity resolves to.
# 'fields' presumably labels the selected columns in SELECT order (the
# labels differ from the variable names) -- verify against callers.
# Each PREFIX declaration needs its <IRI>, otherwise the query is not valid
# SPARQL; ``cp`` is the OpenCalais predicate namespace.
# NOTE(review): ?res_score is selected but never bound in the WHERE clause,
# so that column looks like it is always unbound -- confirm.
ENTITY_QUERY = {
    'fields': ['entityId', 'name', 'type', 'relevance',
               'resolves_to_uri', 'resolves_to_type',
               'resolves_to_name', 'resolves_to_score'],
    'SPARQL': """
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX cp: <http://s.opencalais.com/1/pred/>

        SELECT ?entity ?name ?type ?relevance
               ?res_uri ?res_type ?res_name ?res_score
        WHERE {
            ?entity cp:name ?name .
            ?entity rdf:type ?type .
            ?rel_uri cp:subject ?entity .
            ?rel_uri cp:relevance ?relevance .
            OPTIONAL { ?res_uri cp:subject ?entity .
                       ?res_uri rdf:type ?res_type .
                       ?res_uri cp:name ?res_name . }
        }
        """,
}
56 |
57 |
58 |
class RDFCalaisResponse(CalaisResponse):
    """
    RDFCalaisResponse creates a graph from the received output from OpenCalais
    and makes it queryable through SPARQL.

    Attributes set by ``__init__``:

    * ``graph`` -- the RDFLib graph parsed from the raw response.
    * ``categories`` -- result rows of ``CATEGORY_QUERY``.
    * ``entities`` -- result rows of ``ENTITY_QUERY``.
    """
    def __init__(self, raw_result):
        """
        Parse ``raw_result`` into a graph and run the predefined queries.

        ``CalaisResponse.__init__`` is not called, so the JSON parsing done
        there does not run; only ``_detect_fails`` is reused.
        """
        # NOTE(review): _detect_fails treats any response without a '{' as
        # an error message -- confirm that RDF/XML output always passes
        # this check.
        self._detect_fails(raw_result)

        self.graph = Graph()
        # NOTE(review): the decode/encode round trip presumably rejects
        # input that is not valid UTF-8 before it reaches the parser --
        # confirm.
        utf8_bytes = raw_result.decode('utf-8').encode('utf-8')
        self.graph.parse(StringIO(utf8_bytes))

        # Materialise the query results so they can be iterated repeatedly.
        self.categories = list(self.graph.query(CATEGORY_QUERY['SPARQL']))
        self.entities = list(self.graph.query(ENTITY_QUERY['SPARQL']))
75 |
--------------------------------------------------------------------------------
/calais/base/response.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | The CalaisResponse object as should be returned by a default
4 | ``Calais.analyze(...)`` call.
5 | """
6 | try: # Python <2.6 needs the simplejson module.
7 | import simplejson as json
8 | except ImportError:
9 | import json
10 |
11 | from calais import exceptions
12 |
13 |
class CalaisResponse(object):
    """
    Encapsulates a parsed Calais response and provides pythonic access
    to the data.

    Attributes set by ``__init__``:

    * ``raw_response`` -- the decoded JSON document.
    * ``info`` / ``meta`` -- the ``doc.info`` and ``doc.meta`` sub-objects.
    * one dict per ``_typeGroup`` found in the response (pluralised, e.g.
      ``entities``), mapping the item's URI key to the item.
    """
    def __init__(self, raw_result):
        """
        Parse ``raw_result`` (a JSON string) and populate the attributes.

        Raises a ``CalaisError`` subclass when ``raw_result`` looks like an
        OpenCalais error message instead of a JSON document.
        """
        # Usually OpenCalais returns a valid JSON object, therefore
        # it is pretty safe to assume a { should be in the response.
        # A try/except around json.loads was considered by the original
        # author, but it is hard to tell a JSON error apart from an
        # OpenCalais error message.
        self._detect_fails(raw_result)

        self.raw_response = json.loads(raw_result)

        doc = self.raw_response['doc']
        self.info = doc['info']
        self.meta = doc['meta']

        # ``items()`` works on Python 2 and 3; ``iteritems()`` does not
        # exist on Python 3.
        for key, value in self.raw_response.items():
            try:
                if not key.startswith('http://'):
                    continue

                attr_name = value['_typeGroup']
                # pluralise the attribute name
                if not attr_name.endswith('s'):
                    attr_name += 's'

                if not hasattr(self, attr_name):
                    setattr(self, attr_name, {})
                getattr(self, attr_name)[key] = value
            except AttributeError:
                # FIXME: Looks like the key was not an URI, ignore for now.
                continue

    def _detect_fails(self, resp):
        """
        Detect any failures in the given raw response.

        Returns ``None`` when ``resp`` contains a ``{`` (taken as a sign of
        a JSON document). Otherwise raises the ``CalaisError`` subclass
        matching the error text, or a plain ``CalaisError`` quoting the
        response when no known phrase matches.
        """
        if '{' in resp:
            return

        lowercase = resp.lower()
        if 'qps' in lowercase:
            raise exceptions.MaxQpsExceeded('You reached your queries per '
                                            'second limit.')
        elif 'busy' in lowercase:
            raise exceptions.BusyCalais('OpenCalais is too busy.')
        elif 'supported languages' in lowercase:
            # The adjacent literals are concatenated verbatim, so each
            # fragment carries its own separating space.
            raise exceptions.LanguageUnsupported("The content's language is "
                                                 'not supported by OpenCalais '
                                                 'yet.')
        elif 'text length' in lowercase:
            raise exceptions.MaxLenExceeded('Content too long for OpenCalais.')
        elif 'gateway timeout' in lowercase:
            raise exceptions.GatewayTimeout('Gateway timed out.')
        else:
            raise exceptions.CalaisError('OpenCalais returned the following '
                                         'error: "%s"' % resp)

    def __contains__(self, item):
        """Return True when the response has an attribute named ``item``."""
        return hasattr(self, item)
76 |
--------------------------------------------------------------------------------
/calais/tests/test_response.py:
--------------------------------------------------------------------------------
1 | from nose.tools import eq_, ok_, raises
2 |
3 | from calais.base.response import CalaisResponse
4 | from calais import exceptions
5 |
6 |
7 | RAW_RESPONSE = '{"http://d.opencalais.com/pershash-1/6192d572-838c-3be4-8724-93fb0fca25d7": {"_typeReference": "http://s.opencalais.com/1/type/em/e/Person", "_type": "Person", "name": "Winston Churchill", "commonname": "Winston Churchill", "_typeGroup": "entities", "instances": [{"detection": "[]Winston Churchill[ was an optimist, by all]", "length": 17, "exact": "Winston Churchill", "suffix": " was an optimist, by all", "offset": 0}], "relevance": 0.857, "nationality": "N/A", "persontype": "N/A"}, "doc": {"info": {"docId": "http://d.opencalais.com/dochash-1/bf01a89a-8854-3db0-a9e0-17ce98a28016", "docDate": "2011-11-25 06:43:05.146", "allowSearch": "false", "docTitle": "", "submitter": "1.0", "allowDistribution": "false", "document": "", "calaisRequestID": "5b42083c-818b-04f4-133d-ac023fc298cf", "id": "http://id.opencalais.com/UfegThDnEiLVEjxPuKA4WQ", "externalMetadata": " "}, "meta": {"submitterCode": "73a204cb-98e2-2823-14ea-0197eba97bb8", "contentType": "TEXT/RAW", "language": "InputTextTooShort", "emVer": "7.1.1103.5", "messages": [], "processingVer": "CalaisJob01", "submitionDate": "2011-11-25 06:43:05.084", "signature": "digestalg-1|S7tippuJEhLeLFJ2IAm/ah368FA=|RMXX7xaA53pBD/LXRtCS5Rt8fmhn5NwdfJ9Ql8lO0iyvc6MU9YDeaA==", "langIdVer": "DefaultLangId"}}}'
8 | BUSY_RESPONSE = 'Calais Backend-Server is Busy. Please try again later."'
# NOTE(review): this literal was split across several lines, which is not a
# valid single-quoted string; any markup that originally surrounded the
# text appears to have been lost -- restore it from the upstream fixture if
# available.
QPS_RESPONSE = '403 Developer Over Qps'
10 | LANGUAGE_FAIL_RESPONSE = 'Calais continues to expand its list of supported languages, but does not yet support your submitted content.'
11 | MAX_LENGTH_RESPONSE = 'Text length has exceeded the allowed size .'
12 | GATEWAY_TIMEOUT_RESPONSE = '