├── scrapy_corenlp
│   ├── __init__.py
│   └── middlewares.py
├── requirements.txt
├── setup.py
├── LICENSE.txt
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/scrapy_corenlp/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
nltk==3.4.5
Scrapy==1.2.1

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup


setup(
    name='scrapy-corenlp',
    version='0.2.0',
    description='Scrapy spider middleware :: Stanford CoreNLP Named Entity Recognition',
    url='https://github.com/vu3jej/scrapy-corenlp',
    author='Jithesh E J',
    author_email='mail@jithesh.net',
    license='BSD-2-Clause',
    packages=['scrapy_corenlp'],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Topic :: Text Processing :: Linguistic',
    ]
)

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
BSD 2-Clause License

Copyright (c) 2016,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# scrapy-corenlp

[![PyPI](https://img.shields.io/pypi/v/scrapy-corenlp.svg?style=flat-square)]()
[![PyPI](https://img.shields.io/pypi/pyversions/scrapy-corenlp.svg?style=flat-square)]()

A [Scrapy][scrapy] spider middleware that performs Named Entity Recognition (NER) on scraped item fields using Stanford CoreNLP.

## Settings

| Option                          | Value                                                            | Example Value |
|---------------------------------|------------------------------------------------------------------|---------------|
| `STANFORD_NER_ENABLED`          | boolean                                                          | `True` |
| `STANFORD_NER_CLASSIFIER`       | absolute path to a `CRFClassifier` model                         | `'/home/jithesh/stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz'` |
| `STANFORD_NER_JAR`              | absolute path to the `stanford-ner.jar` file                     | `'/home/jithesh/stanford-ner-2015-12-09/stanford-ner.jar'` |
| `STANFORD_NER_FIELD_TO_PROCESS` | an item text field, or list of item text fields, to classify     | `['title', 'description']` |
| `STANFORD_NER_FIELD_OUTPUT`     | item field to store the recognised entities in                   | `'result'` |

In your `settings.py` file, add the settings described above.
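A complete configuration might look like the following sketch; the filesystem paths are placeholders for wherever you unpacked the Stanford NER distribution:

```python
# Illustrative settings.py snippet -- the paths below are placeholders.
STANFORD_NER_ENABLED = True
STANFORD_NER_CLASSIFIER = '/path/to/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz'
STANFORD_NER_JAR = '/path/to/stanford-ner/stanford-ner.jar'
STANFORD_NER_FIELD_TO_PROCESS = ['title', 'description']
STANFORD_NER_FIELD_OUTPUT = 'result'
```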
Then enable the middleware by adding `CoreNLP` to your `SPIDER_MIDDLEWARES`, e.g.

```python
SPIDER_MIDDLEWARES = {
    'scrapy_corenlp.middlewares.CoreNLP': 543,
}
```

An example value of the `STANFORD_NER_FIELD_OUTPUT` field (`result` above) after the entities have been recognised:

```json
{
  "result": {
    "DATE": ["1963", "2009", "1979", "1663", "1982"],
    "ORGANIZATION": ["Royal Society", "US National Academy of Science", "University of California", "Home Home About Stephen The Computer Stephen", "the University of Cambridge", "Sally Tsui Wong-Avery Director of Research", "Theoretical Physics", "Leiden University", "Baby Universe", "Department of Applied Mathematics", "Cambridge Lectures Publications Books Images Films", "Briefer History of Time", "ESA", "NASA", "Brief History of Time", "CBE", "Caius College", "The Universe"],
    "PERSON": ["P. Oesch", "Einstein", "D. Magee", "Stephen Hawking", "George", "Annie", "Isaac Newton", "G. Illingworth", "Dennis Stanton Avery", "R. Bouwens"],
    "LOCATION": ["London", "Santa Cruz", "Einstein", "Cambridge", "Gonville"]
  }
}
```
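For context, a minimal spider that yields items carrying the fields configured above might look like this sketch; the spider name, start URL, and CSS selectors are purely illustrative:

```python
# Illustrative spider -- name, URL, and selectors are placeholders.
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            # The middleware reads 'title' and 'description' (per
            # STANFORD_NER_FIELD_TO_PROCESS) and stores the recognised
            # entities under 'result' (STANFORD_NER_FIELD_OUTPUT).
            yield {
                'title': quote.css('span.text::text').extract_first(),
                'description': quote.css('small.author::text').extract_first(),
            }
```

Each item the spider yields passes through the middleware on its way out, gaining a `result` field that maps entity types (`PERSON`, `LOCATION`, `ORGANIZATION`, `DATE`, ...) to the entities recognised in the processed fields.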
[scrapy]: https://scrapy.org/

--------------------------------------------------------------------------------
/scrapy_corenlp/middlewares.py:
--------------------------------------------------------------------------------
from collections import defaultdict
from itertools import groupby
from operator import itemgetter
from nltk.tokenize import StanfordTokenizer
from nltk.tag.stanford import StanfordNERTagger
from six import string_types

from scrapy import Item
from scrapy.exceptions import NotConfigured


class CoreNLP(object):

    def __init__(self, classifier, jar_file, field_to_process, output_field):
        self.classifier = classifier
        self.jar_file = jar_file
        self.field_to_process = field_to_process
        self.output_field = output_field
        self.tokenizer = StanfordTokenizer(path_to_jar=self.jar_file).tokenize
        # Build the tagger once here rather than once per scraped item.
        self.tagger = StanfordNERTagger(
            model_filename=self.classifier,
            path_to_jar=self.jar_file
        )

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        if (not settings.get('STANFORD_NER_ENABLED') or
                not settings.get('STANFORD_NER_CLASSIFIER') or
                not settings.get('STANFORD_NER_JAR') or
                not settings.get('STANFORD_NER_FIELD_TO_PROCESS') or
                not settings.get('STANFORD_NER_FIELD_OUTPUT')):
            raise NotConfigured

        classifier = settings.get('STANFORD_NER_CLASSIFIER')
        jar_file = settings.get('STANFORD_NER_JAR')
        field_to_process = settings.get('STANFORD_NER_FIELD_TO_PROCESS')
        output_field = settings.get('STANFORD_NER_FIELD_OUTPUT')

        return cls(classifier, jar_file, field_to_process, output_field)

    @staticmethod
    def accumulate(list_of_tuples):
        """Turn (token, entity_type) pairs into {entity_type: {entity, ...}},
        joining runs of consecutive tokens back into multi-word entities."""
        tokens, entities = zip(*list_of_tuples)
        recognised = defaultdict(set)
        duplicates = defaultdict(list)

        # Map each entity type to the token indices it was assigned to.
        for i, item in enumerate(entities):
            duplicates[item].append(i)

        for key, value in duplicates.items():
            # Consecutive indices share the same (position - index)
            # difference, so groupby splits each run into one entity.
            for _, g in groupby(enumerate(value), lambda x: x[0] - x[1]):
                indices = list(map(itemgetter(1), g))
                recognised[key].add(
                    ' '.join(tokens[index] for index in indices)
                )
        # 'O' marks tokens outside any named entity; drop it.
        recognised.pop('O', None)

        return dict(recognised)

    def process_spider_output(self, response, result, spider):
        for element in result:
            if isinstance(element, (Item, dict)):
                if isinstance(self.field_to_process, list):
                    text = ' '.join(
                        [element[field] for field in self.field_to_process]
                    )
                elif isinstance(self.field_to_process, string_types):
                    text = element[self.field_to_process]
                else:
                    # Nothing usable to process: pass the item through
                    # unchanged. Without this `continue`, the code below
                    # would run with an undefined (or stale) `text` and
                    # yield the element twice.
                    yield element
                    continue

                token_entity_pairs = self.tagger.tag(
                    tokens=self.tokenizer(s=text)
                )
                accumulated = self.accumulate(token_entity_pairs)
                element.setdefault(self.output_field, accumulated)
                yield element
            else:
                yield element

--------------------------------------------------------------------------------