├── scrapy_corenlp
│   ├── __init__.py
│   └── middlewares.py
├── requirements.txt
├── setup.py
├── LICENSE.txt
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/scrapy_corenlp/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
nltk==3.4.5
Scrapy==1.2.1

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup


setup(
    name='scrapy-corenlp',
    version='0.2.0',
    description='Scrapy spider middleware :: Stanford CoreNLP Named Entity Recognition',
    url='https://github.com/vu3jej/scrapy-corenlp',
    author='Jithesh E J',
    author_email='mail@jithesh.net',
    license='BSD-2-Clause',
    packages=['scrapy_corenlp'],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Topic :: Text Processing :: Linguistic',
    ]
)

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
BSD 2-Clause License

Copyright (c) 2016,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# scrapy-corenlp

[![PyPI](https://img.shields.io/pypi/v/scrapy-corenlp.svg?style=flat-square)]()
[![PyPI](https://img.shields.io/pypi/pyversions/scrapy-corenlp.svg?style=flat-square)]()

A [Scrapy][scrapy] spider middleware that performs Named Entity Recognition (NER) on scraped item fields using Stanford CoreNLP.

## Settings

| Option                          | Value                                                            | Example Value |
|---------------------------------|------------------------------------------------------------------|---------------|
| `STANFORD_NER_ENABLED`          | boolean                                                          | `True` |
| `STANFORD_NER_CLASSIFIER`       | absolute path to a `CRFClassifier` model                         | `'/home/jithesh/stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz'` |
| `STANFORD_NER_JAR`              | absolute path to the `stanford-ner.jar` file                     | `'/home/jithesh/stanford-ner-2015-12-09/stanford-ner.jar'` |
| `STANFORD_NER_FIELD_TO_PROCESS` | an item text field, or list of item text fields, to classify     | `['title', 'description']` |
| `STANFORD_NER_FIELD_OUTPUT`     | item field to store the recognised entities in                   | `'result'` |

In your `settings.py` file, add the settings described above.
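A complete configuration might look like the following sketch; the filesystem paths are placeholders for wherever you unpacked the Stanford NER distribution:

```python
# Illustrative settings.py snippet -- the paths below are placeholders.
STANFORD_NER_ENABLED = True
STANFORD_NER_CLASSIFIER = '/path/to/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz'
STANFORD_NER_JAR = '/path/to/stanford-ner/stanford-ner.jar'
STANFORD_NER_FIELD_TO_PROCESS = ['title', 'description']
STANFORD_NER_FIELD_OUTPUT = 'result'
```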
Then enable the middleware by adding `CoreNLP` to your `SPIDER_MIDDLEWARES`, e.g.

```python
SPIDER_MIDDLEWARES = {
    'scrapy_corenlp.middlewares.CoreNLP': 543,
}
```

An example value of the `STANFORD_NER_FIELD_OUTPUT` field (`result` above) after the entities have been recognised:

```json
{
  "result": {
    "DATE": ["1963", "2009", "1979", "1663", "1982"],
    "ORGANIZATION": ["Royal Society", "US National Academy of Science", "University of California", "Home Home About Stephen The Computer Stephen", "the University of Cambridge", "Sally Tsui Wong-Avery Director of Research", "Theoretical Physics", "Leiden University", "Baby Universe", "Department of Applied Mathematics", "Cambridge Lectures Publications Books Images Films", "Briefer History of Time", "ESA", "NASA", "Brief History of Time", "CBE", "Caius College", "The Universe"],
    "PERSON": ["P. Oesch", "Einstein", "D. Magee", "Stephen Hawking", "George", "Annie", "Isaac Newton", "G. Illingworth", "Dennis Stanton Avery", "R. Bouwens"],
    "LOCATION": ["London", "Santa Cruz", "Einstein", "Cambridge", "Gonville"]
  }
}
```
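For context, a minimal spider that yields items carrying the fields configured above might look like this sketch; the spider name, start URL, and CSS selectors are purely illustrative:

```python
# Illustrative spider -- name, URL, and selectors are placeholders.
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            # The middleware reads 'title' and 'description' (per
            # STANFORD_NER_FIELD_TO_PROCESS) and stores the recognised
            # entities under 'result' (STANFORD_NER_FIELD_OUTPUT).
            yield {
                'title': quote.css('span.text::text').extract_first(),
                'description': quote.css('small.author::text').extract_first(),
            }
```

Each item the spider yields passes through the middleware on its way out, gaining a `result` field that maps entity types (`PERSON`, `LOCATION`, `ORGANIZATION`, `DATE`, ...) to the entities recognised in the processed fields.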
[scrapy]: https://scrapy.org/

--------------------------------------------------------------------------------
/scrapy_corenlp/middlewares.py:
--------------------------------------------------------------------------------
from collections import defaultdict
from itertools import groupby
from operator import itemgetter
from nltk.tokenize import StanfordTokenizer
from nltk.tag.stanford import StanfordNERTagger
from six import string_types

from scrapy import Item
from scrapy.exceptions import NotConfigured


class CoreNLP(object):

    def __init__(self, classifier, jar_file, field_to_process, output_field):
        self.classifier = classifier
        self.jar_file = jar_file
        self.field_to_process = field_to_process
        self.output_field = output_field
        self.tokenizer = StanfordTokenizer(path_to_jar=self.jar_file).tokenize
        # Build the tagger once here rather than once per scraped item.
        self.tagger = StanfordNERTagger(
            model_filename=self.classifier,
            path_to_jar=self.jar_file
        )

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        if (not settings.get('STANFORD_NER_ENABLED') or
                not settings.get('STANFORD_NER_CLASSIFIER') or
                not settings.get('STANFORD_NER_JAR') or
                not settings.get('STANFORD_NER_FIELD_TO_PROCESS') or
                not settings.get('STANFORD_NER_FIELD_OUTPUT')):
            raise NotConfigured

        classifier = settings.get('STANFORD_NER_CLASSIFIER')
        jar_file = settings.get('STANFORD_NER_JAR')
        field_to_process = settings.get('STANFORD_NER_FIELD_TO_PROCESS')
        output_field = settings.get('STANFORD_NER_FIELD_OUTPUT')

        return cls(classifier, jar_file, field_to_process, output_field)

    @staticmethod
    def accumulate(list_of_tuples):
        """Turn (token, entity_type) pairs into {entity_type: {entity, ...}},
        joining runs of consecutive tokens back into multi-word entities."""
        tokens, entities = zip(*list_of_tuples)
        recognised = defaultdict(set)
        duplicates = defaultdict(list)

        # Map each entity type to the token indices it was assigned to.
        for i, item in enumerate(entities):
            duplicates[item].append(i)

        for key, value in duplicates.items():
            # Consecutive indices share the same (position - index)
            # difference, so groupby splits each run into one entity.
            for _, g in groupby(enumerate(value), lambda x: x[0] - x[1]):
                indices = list(map(itemgetter(1), g))
                recognised[key].add(
                    ' '.join(tokens[index] for index in indices)
                )
        # 'O' marks tokens outside any named entity; drop it.
        recognised.pop('O', None)

        return dict(recognised)

    def process_spider_output(self, response, result, spider):
        for element in result:
            if isinstance(element, (Item, dict)):
                if isinstance(self.field_to_process, list):
                    text = ' '.join(
                        [element[field] for field in self.field_to_process]
                    )
                elif isinstance(self.field_to_process, string_types):
                    text = element[self.field_to_process]
                else:
                    # Nothing usable to process: pass the item through
                    # unchanged. Without this `continue`, the code below
                    # would run with an undefined (or stale) `text` and
                    # yield the element twice.
                    yield element
                    continue

                token_entity_pairs = self.tagger.tag(
                    tokens=self.tokenizer(s=text)
                )
                accumulated = self.accumulate(token_entity_pairs)
                element.setdefault(self.output_field, accumulated)
                yield element
            else:
                yield element

--------------------------------------------------------------------------------