├── .gitignore ├── calais ├── base │ ├── __init__.py │ ├── response.py │ └── client.py ├── rdf │ ├── __init__.py │ ├── client.py │ └── response.py ├── tests │ ├── __init__.py │ ├── data │ │ ├── foobar.baz │ │ ├── test.xml │ │ ├── index.htm │ │ └── index.html │ ├── test_response.py │ └── test_client.py ├── __init__.py └── exceptions.py ├── setup.py └── README.rst /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /calais/base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /calais/rdf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /calais/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /calais/tests/data/foobar.baz: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /calais/tests/data/test.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /calais/__init__.py: -------------------------------------------------------------------------------- 1 | __version_info__ = (1, 1, 4) 2 | __version__ = '.'.join(map(str, __version_info__)) 3 | -------------------------------------------------------------------------------- /calais/tests/data/index.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Foobar Man 5 | 6 | -------------------------------------------------------------------------------- /calais/tests/data/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Foobar Man 5 | 6 | -------------------------------------------------------------------------------- /calais/rdf/client.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | RDF Extension for pycalais. 4 | 5 | The original RDF extension was written by Mark Soper for Jordan Dimov's 6 | original Python interface to the OpenCalais API. 7 | """ 8 | from calais.base.client import Calais 9 | from calais.rdf.response import RDFCalaisResponse 10 | 11 | 12 | class RDFCalais(Calais): 13 | processing_directives = {'contentType': 'TEXT/RAW', 14 | 'outputFormat': 'xml/rdf', 15 | 'reltagBaseURL': None, 16 | 'calculateRelevanceScore': 'true', 17 | 'enableMetadataType': None, 18 | 'discardMetadata': None, 19 | 'omitOutputtingOriginalText': 'true'} 20 | 21 | def analyze(self, content, content_type='TEXT/RAW', external_id=None): 22 | return super(RDFCalais, self).analyze(content, content_type, 23 | external_id, 24 | response_cls=RDFCalaisResponse) 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | from setuptools import setup 4 | from setuptools import find_packages 5 | 6 | 7 | classifiers = [ 8 | "Intended Audience :: Developers", 9 | "Programming Language :: Python", 10 | "Operating System :: OS Independent", 11 | "Topic :: Software Development :: Libraries", 12 | "Environment :: Web Environment", 13 | "License :: OSI Approved :: BSD License", 14 | "Development Status :: 5 - Production/Stable", 15 | ] 16 | 17 | requires = ["rdflib==3.0", "rdfextras==0.1", ] 18 | # This might not be the best idea, but I did not encounter any bug 19 | # while testing with both libraries. 20 | try: 21 | import json 22 | except ImportError: 23 | requires.append('simplejson>=2.0') 24 | 25 | 26 | setup(name='pycalais', 27 | version=__import__('calais').__version__, 28 | license='BSD', 29 | url='https://github.com/newsgrape/pycalais', 30 | packages=find_packages(), 31 | description='Python interface to the OpenCalais REST API', 32 | long_description=open('README.rst').read(), 33 | keywords="opencalais calais rdf", 34 | classifiers=classifiers, 35 | install_requires=requires, 36 | ) 37 | -------------------------------------------------------------------------------- /calais/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Exceptions related to OpenCalais. 3 | """ 4 | class CalaisError(Exception): 5 | """ 6 | Base exception for errors returned by OpenCalais. 7 | """ 8 | pass 9 | 10 | 11 | class MaxQpsExceeded(CalaisError): 12 | """ 13 | Exception raised when the maximum requests per seconds have been reached. 14 | """ 15 | pass 16 | 17 | 18 | class BusyCalais(CalaisError): 19 | """ 20 | Exception raised when OpenCalais tells us that it is busy at the moment. 21 | """ 22 | pass 23 | 24 | 25 | class LanguageUnsupported(CalaisError): 26 | """ 27 | Exception raised when the content language is not supported by OpenCalais. 28 | 29 | I've found out that this may also happen when you send some "unusual" 30 | content, like scores or lots of tabular data, to OpenCalais. 31 | """ 32 | pass 33 | 34 | 35 | class MaxLenExceeded(CalaisError): 36 | """ 37 | Exception raised when too much content was tried to send to OpenCalais. 38 | 39 | I could not find a true limit. Some state that it is 100 000 characters, 40 | others say around 20-40kByte. 41 | """ 42 | pass 43 | 44 | 45 | class GatewayTimeout(CalaisError): 46 | """ 47 | Exception raised when OpenCalais' Gateway timed out. 48 | """ 49 | pass 50 | -------------------------------------------------------------------------------- /calais/rdf/response.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | CalaisResponse queryable through SPARQL. 4 | 5 | The original RDF extension was written by Mark Soper for Jordan Dimov's 6 | original Python interface to the OpenCalais API. 7 | """ 8 | from StringIO import StringIO 9 | 10 | from rdflib import ConjunctiveGraph as Graph 11 | from rdflib import Namespace 12 | from rdflib import plugin 13 | from rdflib import query 14 | 15 | from calais.base.response import CalaisResponse 16 | 17 | 18 | # Register RDFLib SPARQL support. 19 | plugin.register('sparql', query.Processor, 20 | 'rdfextras.sparql.processor', 'Processor') 21 | plugin.register('sparql', query.Result, 22 | 'rdfextras.sparql.query', 'SPARQLQueryResult') 23 | 24 | # SPARQL Queries 25 | CATEGORY_QUERY = {'fields': ['docId', 'category', 'categoryName', 'score'], 26 | 'SPARQL': """ 27 | PREFIX rdf: 28 | PREFIX cp: 29 | 30 | SELECT ?docId ?category ?categoryName ?score 31 | WHERE { ?doc cp:docId ?docId . 32 | ?doc cp:category ?category . 33 | ?doc cp:categoryName ?categoryName . 34 | ?doc cp:score ?score . } 35 | """, } 36 | 37 | ENTITY_QUERY = {'fields': ['entityId', 'name', 'type', 'relevance', 38 | 'resolves_to_uri', 'resolves_to_type', 39 | 'resolves_to_name', 'resolves_to_score'], 40 | 'SPARQL': """ 41 | PREFIX rdf: 42 | PREFIX cp: 43 | 44 | SELECT ?entity ?name ?type ?relevance 45 | ?res_uri ?res_type ?res_name ?res_score 46 | WHERE { 47 | ?entity cp:name ?name . 48 | ?entity rdf:type ?type . 49 | ?rel_uri cp:subject ?entity . 50 | ?rel_uri cp:relevance ?relevance . 51 | OPTIONAL { ?res_uri cp:subject ?entity . 52 | ?res_uri rdf:type ?res_type . 53 | ?res_uri cp:name ?res_name . } 54 | } 55 | """, } 56 | 57 | 58 | 59 | class RDFCalaisResponse(CalaisResponse): 60 | """ 61 | RDFCalaisResponse creates a graph from the received output from OpenCalais 62 | and makes it queryable through SPARQL. 63 | """ 64 | def __init__(self, raw_result): 65 | self._detect_fails(raw_result) 66 | rdf = Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') 67 | c = Namespace('http://s.opencalais.com/1/pred/') 68 | g = Graph() 69 | self.graph = g 70 | 71 | g.parse(StringIO(raw_result.decode('utf-8').encode('utf-8'))) 72 | 73 | self.categories = [row for row in g.query(CATEGORY_QUERY['SPARQL'])] 74 | self.entities = [row for row in g.query(ENTITY_QUERY['SPARQL'])] 75 | -------------------------------------------------------------------------------- /calais/base/response.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | The CalaisResponse object as should be returned by a default 4 | ``Calais.analyze(...)`` call. 5 | """ 6 | try: # Python <2.6 needs the simplejson module. 7 | import simplejson as json 8 | except ImportError: 9 | import json 10 | 11 | from calais import exceptions 12 | 13 | 14 | class CalaisResponse(object): 15 | """ 16 | Encapsulates a parsed Calais response and provides pythonic access 17 | to the data. 18 | """ 19 | def __init__(self, raw_result): 20 | # Usually OpenCalais returns a valid JSON object, therefore 21 | # it is pretty safe to assume a { should be in the response. 22 | # I would have put that in a try/except on json.loads, however 23 | # it is pretty hard to differ between a json exception and an 24 | # OpenCalais error message? 25 | self._detect_fails(raw_result) 26 | 27 | self.raw_response = json.loads(raw_result) 28 | 29 | self.info = self.raw_response['doc']['info'] 30 | self.meta = self.raw_response['doc']['meta'] 31 | 32 | for key, value in self.raw_response.iteritems(): 33 | try: 34 | if key.startswith('http://'): 35 | attr_name = value['_typeGroup'] 36 | # pluralise the attribute name 37 | if not attr_name.endswith('s'): 38 | attr_name += 's' 39 | 40 | if not hasattr(self, attr_name): 41 | setattr(self, attr_name, {}) 42 | getattr(self, attr_name)[key] = value 43 | except AttributeError: 44 | # FIXME: Looks like the key was not an URI, ignore for now. 45 | continue 46 | 47 | def _detect_fails(self, resp): 48 | """ 49 | Detect any failures in the given raw response. 50 | """ 51 | if '{' in resp: 52 | return 53 | 54 | lowercase = resp.lower() 55 | if 'qps' in lowercase: 56 | raise exceptions.MaxQpsExceeded('You reached your queries per ' 57 | 'second limit.') 58 | elif 'busy' in lowercase: 59 | raise exceptions.BusyCalais('OpenCalais is too busy.') 60 | elif 'supported languages' in lowercase: 61 | raise exceptions.LanguageUnsupported("The content's language is" 62 | 'not supported by OpenCalais' 63 | 'yet.') 64 | elif 'text length' in lowercase: 65 | raise exceptions.MaxLenExceeded('Content too long for OpenCalais.') 66 | elif 'gateway timeout' in lowercase: 67 | raise exceptions.GatewayTimeout('Gateway timed out.') 68 | else: 69 | raise exceptions.CalaisError('OpenCalais returned the following ' 70 | 'error: "%s"' % resp) 71 | 72 | def __contains__(self, item): 73 | if hasattr(self, item): 74 | return True 75 | return False 76 | -------------------------------------------------------------------------------- /calais/tests/test_response.py: -------------------------------------------------------------------------------- 1 | from nose.tools import eq_, ok_, raises 2 | 3 | from calais.base.response import CalaisResponse 4 | from calais import exceptions 5 | 6 | 7 | RAW_RESPONSE = '{"http://d.opencalais.com/pershash-1/6192d572-838c-3be4-8724-93fb0fca25d7": {"_typeReference": "http://s.opencalais.com/1/type/em/e/Person", "_type": "Person", "name": "Winston Churchill", "commonname": "Winston Churchill", "_typeGroup": "entities", "instances": [{"detection": "[]Winston Churchill[ was an optimist, by all]", "length": 17, "exact": "Winston Churchill", "suffix": " was an optimist, by all", "offset": 0}], "relevance": 0.857, "nationality": "N/A", "persontype": "N/A"}, "doc": {"info": {"docId": "http://d.opencalais.com/dochash-1/bf01a89a-8854-3db0-a9e0-17ce98a28016", "docDate": "2011-11-25 06:43:05.146", "allowSearch": "false", "docTitle": "", "submitter": "1.0", "allowDistribution": "false", "document": "", "calaisRequestID": "5b42083c-818b-04f4-133d-ac023fc298cf", "id": "http://id.opencalais.com/UfegThDnEiLVEjxPuKA4WQ", "externalMetadata": " "}, "meta": {"submitterCode": "73a204cb-98e2-2823-14ea-0197eba97bb8", "contentType": "TEXT/RAW", "language": "InputTextTooShort", "emVer": "7.1.1103.5", "messages": [], "processingVer": "CalaisJob01", "submitionDate": "2011-11-25 06:43:05.084", "signature": "digestalg-1|S7tippuJEhLeLFJ2IAm/ah368FA=|RMXX7xaA53pBD/LXRtCS5Rt8fmhn5NwdfJ9Ql8lO0iyvc6MU9YDeaA==", "langIdVer": "DefaultLangId"}}}' 8 | BUSY_RESPONSE = 'Calais Backend-Server is Busy. Please try again later."' 9 | QPS_RESPONSE = '

403 Developer Over Qps

' 10 | LANGUAGE_FAIL_RESPONSE = 'Calais continues to expand its list of supported languages, but does not yet support your submitted content.' 11 | MAX_LENGTH_RESPONSE = 'Text length has exceeded the allowed size .' 12 | GATEWAY_TIMEOUT_RESPONSE = '

504 Gateway Timeout

' 13 | 14 | 15 | def test_info(): 16 | r = CalaisResponse(RAW_RESPONSE) 17 | eq_(r.info['id'], 'http://id.opencalais.com/UfegThDnEiLVEjxPuKA4WQ') 18 | 19 | 20 | def test_meta(): 21 | r = CalaisResponse(RAW_RESPONSE) 22 | eq_(r.meta['submitionDate'], '2011-11-25 06:43:05.084') 23 | 24 | 25 | def test_attrs(): 26 | r = CalaisResponse(RAW_RESPONSE) 27 | ok_('entities' in r) 28 | ok_('socialTags' not in r) 29 | 30 | 31 | @raises(exceptions.MaxQpsExceeded) 32 | def test_maxqps(): 33 | return CalaisResponse(QPS_RESPONSE) 34 | 35 | 36 | @raises(exceptions.BusyCalais) 37 | def test_busy(): 38 | return CalaisResponse(BUSY_RESPONSE) 39 | 40 | 41 | @raises(exceptions.LanguageUnsupported) 42 | def test_lang(): 43 | return CalaisResponse(LANGUAGE_FAIL_RESPONSE) 44 | 45 | 46 | @raises(exceptions.MaxLenExceeded) 47 | def test_len(): 48 | return CalaisResponse(MAX_LENGTH_RESPONSE) 49 | 50 | 51 | @raises(exceptions.GatewayTimeout) 52 | def test_gateway(): 53 | return CalaisResponse(GATEWAY_TIMEOUT_RESPONSE) 54 | 55 | 56 | @raises(exceptions.CalaisError) 57 | def test_general_fail(): 58 | return CalaisResponse('oh noe this is missing curly braces!') 59 | -------------------------------------------------------------------------------- /calais/tests/test_client.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import unittest 4 | 5 | from nose.tools import eq_, ok_, raises 6 | 7 | from calais.base import client 8 | 9 | 10 | # The good monkeypatching 11 | class DummyRequest(object): 12 | def read(self): 13 | return 'foobar' 14 | 15 | def dummy_urlopen(*args, **kwargs): 16 | return DummyRequest() 17 | client.urllib.urlopen = dummy_urlopen 18 | 19 | 20 | class DummyCalaisResponse(object): 21 | def __init__(self, *args, **kwargs): 22 | pass 23 | 24 | # I feel dirty when using isinstance() 25 | def is_dummy(self): 26 | return True 27 | 28 | 29 | class BaseCalaisTest(unittest.TestCase): 30 | def setUp(self): 31 | self.c = client.Calais('asdf') 32 | 33 | 34 | class CalaisTest(BaseCalaisTest): 35 | def testStripper(self): 36 | stripped = self.c.preprocess_html("""

TestWinFail

37 | Opium for the people. 38 | 39 | 40 | """) 41 | eq_(stripped, ('

TestWinFail

Opium for the ' 42 | 'people. ')) 43 | 44 | def testRandomID(self): 45 | """ 46 | Test Random ID Generation. 47 | 48 | Note that this class fails if the sample 49 | (string.latters + string.chars) has less than 10 items. 50 | 51 | Hereby I warn you: do not tinker with your string module or this 52 | will fail fataly. 53 | """ 54 | eq_(len(self.c.get_random_id()), 10) 55 | 56 | def testContentHash(self): 57 | """ 58 | Let's check that the hash of a string stayed the same. 59 | """ 60 | eq_(self.c.get_content_id('newsgrape'), 61 | '10e9a5f599b467d22b86d6fb9c762d0d4df37abe') 62 | 63 | def testLazyResponseClass(self): 64 | self.c.rest_POST = lambda x: True 65 | ok_(self.c.analyze('asd', response_cls=DummyCalaisResponse).is_dummy()) 66 | 67 | 68 | class MimeTypeTest(BaseCalaisTest): 69 | """ 70 | Make sure MimeType and therefore ContentType detection works. 71 | """ 72 | def setUp(self): 73 | super(MimeTypeTest, self).setUp() 74 | self.c.analyze = self.analyze_stub 75 | 76 | def analyze_stub(self, *args, **kwargs): 77 | """ 78 | Mocking of ``analyze()`` function to check if ``analyze_file()`` 79 | worked as expected. 80 | """ 81 | external_id = kwargs['external_id'] 82 | content_type = kwargs['content_type'] 83 | 84 | if external_id.endswith('.xml'): 85 | eq_(content_type, 'TEXT/XML') 86 | elif external_id.endswith(('.html', '.htm',)): 87 | eq_(content_type, 'TEXT/HTML') 88 | # make sure stripping worked. 89 | content = args[0] 90 | ok_('\n' not in content) 91 | ok_('||', 32 | re.IGNORECASE) 33 | 34 | 35 | class AppURLopener(urllib.FancyURLopener): 36 | # Lie shamelessly to every website. 37 | version = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:9.0) ' 38 | 'Gecko/20100101 Firefox/9.0') 39 | urllib._urlopener = AppURLopener() 40 | 41 | 42 | class Calais(object): 43 | """ 44 | Python class that knows how to talk to the OpenCalais API. 45 | 46 | Use the ``analyze()`` and ``analyze_url()`` methods, which return 47 | ``CalaisResponse`` objects. 48 | """ 49 | api_key = None 50 | processing_directives = {"contentType": "TEXT/RAW", 51 | "outputFormat": "application/json", 52 | "reltagBaseURL": None, 53 | "calculateRelevanceScore": True, 54 | "enableMetadataType": "SocialTags", 55 | "discardMetadata": None, 56 | "omitOutputtingOriginalText": True, } 57 | user_directives = {"allowDistribution": False, 58 | "allowSearch": False, 59 | "externalID": None, } 60 | external_metadata = {} 61 | 62 | def __init__(self, api_key, proc_directs=None, user_directs=None, 63 | ext_metadata=None, submitter='pycalais-v%s' % __version__): 64 | # Basically you could simply overwrite the directives of an instance, 65 | # but this feels much better when you already know what you want. 66 | if proc_directs is not None: 67 | self.processing_directives.update(proc_directs) 68 | if user_directs is not None: 69 | self.user_directives.update(user_directs) 70 | if ext_metadata is not None: 71 | self.external_metadata.update(ext_metadata) 72 | 73 | self.api_key = api_key 74 | self.user_directives["submitter"] = submitter 75 | 76 | def _directives_to_XML(self, dictionary): 77 | xml_template = 'c:%s="%s"' 78 | attrs = [] 79 | for key, value in dictionary.iteritems(): 80 | if value is None: 81 | continue 82 | 83 | if value == True: 84 | value = "true" 85 | elif value == False: 86 | value = "false" 87 | 88 | attrs.append(xml_template % (key, value)) 89 | return ' '.join(attrs) 90 | 91 | def _get_params_XML(self): 92 | return PARAMS_XML % ( 93 | self._directives_to_XML(self.processing_directives), 94 | self._directives_to_XML(self.user_directives), 95 | self._directives_to_XML(self.external_metadata)) 96 | 97 | def rest_POST(self, content): 98 | # Convert non-ascii characters into their XML entity counterparts. 99 | try: 100 | content = content.decode('utf-8').encode('ascii', 101 | 'xmlcharrefreplace') 102 | except (UnicodeDecodeError, UnicodeEncodeError): 103 | content = content.encode('ascii', 'xmlcharrefreplace') 104 | 105 | params = urllib.urlencode( 106 | {'licenseID': self.api_key, 107 | 'content': content, 108 | 'paramsXML': self._get_params_XML(), 109 | }) 110 | headers = {'Content-type': 'application/x-www-form-urlencoded'} 111 | conn = httplib.HTTPConnection('api.opencalais.com:80') 112 | conn.request('POST', '/enlighten/rest/', params, headers) 113 | response = conn.getresponse() 114 | data = response.read() 115 | conn.close() 116 | return data 117 | 118 | def get_random_id(self): 119 | """ 120 | Creates a random 10-character ID for your submission. 121 | 122 | Don't get confused, this method is not directly used here, 123 | however the user may use it as external_id for ``analyze()``. 124 | """ 125 | chars = string.letters + string.digits 126 | return ''.join(random.sample(chars, 10)) 127 | 128 | def get_content_id(self, text): 129 | """ 130 | Creates a SHA1 hash of the text of your submission. 131 | 132 | Don't get confused, this method is not directly used here, 133 | however the user may use it as external_id for ``analyze()``. 134 | """ 135 | h = hashlib.sha1() 136 | h.update(text) 137 | return h.hexdigest() 138 | 139 | def preprocess_html(self, html): 140 | html = html.replace('\n', '') 141 | html = SCRIPT_STYLE_RE.sub('', html) 142 | return html 143 | 144 | def analyze(self, content, content_type='TEXT/RAW', external_id=None, 145 | response_cls=CalaisResponse): 146 | if not (content and len(content.strip())): 147 | return None 148 | 149 | self.processing_directives['contentType'] = content_type 150 | 151 | if external_id is not None: 152 | self.user_directives['externalID'] = urllib.quote(external_id) 153 | 154 | return response_cls(self.rest_POST(content)) 155 | 156 | def analyze_url(self, url): 157 | request = urllib.urlopen(url) 158 | html = self.preprocess_html(request.read()) 159 | return self.analyze(html, content_type='TEXT/HTML', external_id=url) 160 | 161 | def analyze_file(self, filename): 162 | filetype = mimetypes.guess_type(filename)[0] 163 | if filetype is None: 164 | raise ValueError('Can not determine file type for "%s"' % filename) 165 | 166 | # Let's hope this does not leave file descriptors open. 167 | content = open(filename).read() 168 | content_type = '' 169 | if filetype == 'text/plain': 170 | content_type = 'TEXT/RAW' 171 | elif filetype == 'application/xml': 172 | content_type = 'TEXT/XML' 173 | elif filetype == 'text/html': 174 | content_type = filetype.upper() 175 | content = self.preprocess_html(content) 176 | else: 177 | raise ValueError('Only plaintext, HTML or XML files are ' 178 | 'currently supported.') 179 | 180 | return self.analyze(content, content_type=content_type, 181 | external_id=filename) 182 | --------------------------------------------------------------------------------