├── .babelrc ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── api └── server.py ├── dist └── index.js ├── package-lock.json ├── package.json ├── requirements.txt ├── src ├── __mocks__ │ └── language.js ├── index.js ├── language.js ├── tokens.js └── util.js └── tests ├── doc.test.js ├── language.test.js ├── span.test.js ├── test_api.py ├── token.test.js └── util.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | ["@babel/preset-env", { 4 | "modules": false, 5 | "targets": { "node": "current"} 6 | }] 7 | ], 8 | "plugins": [ 9 | "add-module-exports" 10 | ], 11 | "env": { 12 | "test": { 13 | "presets": [ 14 | ["@babel/preset-env", { 15 | "targets": { "node": "current"} 16 | }] 17 | ] 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | 3 | # JavaScript 4 | node_modules/ 5 | *.tgz 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # Environments 95 | .env 96 | .venv 97 | env/ 98 | venv/ 99 | ENV/ 100 | env.bak/ 101 | venv.bak/ 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | .dmypy.json 116 | dmypy.json 117 | 118 | # Pyre type checker 119 | .pyre/ 120 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | matrix: 2 | include: 3 | - language: python 4 | python: 3.6 5 | before_script: 6 | - pip install pytest 7 | install: 8 | - pip install -r requirements.txt 9 | script: 10 | - python -m pytest tests 11 | cache: pip 12 | - language: node_js 13 | node_js: 14 | - "8" 15 | install: 16 | - npm install 17 | script: 18 | - npm run test 19 | cache: npm 20 | notifications: 21 | email: false 22 | -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Ines Montani 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # spaCy JS 4 | 5 | [![travis](https://img.shields.io/travis/ines/spacy-js/master.svg?style=flat-square&logo=travis)](https://travis-ci.org/ines/spacy-js) 6 | [![npm](https://img.shields.io/npm/v/spacy.svg?style=flat-square)](https://www.npmjs.com/package/spacy) 7 | [![GitHub](https://img.shields.io/github/release/ines/spacy-js/all.svg?style=flat-square)](https://github.com/ines/spacy-js) 8 | [![unpkg](https://img.shields.io/badge/unpkg-dist/index.js-brightgreen.svg?style=flat-square)](https://unpkg.com/spacy) 9 | 10 | 11 | JavaScript interface for accessing linguistic annotations provided by 12 | [spaCy](https://spacy.io). This project is mostly experimental and was 13 | developed for fun to play around with different ways of mimicking spaCy's 14 | Python API. 15 | 16 | The results will still be computed in Python and made available via a REST API. 17 | The JavaScript API resembles spaCy's Python API as closely as possible (with 18 | a few exceptions, as the values are all pre-computed and it's tricky to express 19 | complex recursive relationships). 20 | 21 | ```javascript 22 | const spacy = require('spacy'); 23 | 24 | (async function() { 25 | const nlp = spacy.load('en_core_web_sm'); 26 | const doc = await nlp('This is a text about Facebook.'); 27 | for (let ent of doc.ents) { 28 | console.log(ent.text, ent.label); 29 | } 30 | for (let token of doc) { 31 | console.log(token.text, token.pos, token.head.text); 32 | } 33 | })(); 34 | ``` 35 | 36 | ## ⌛️ Installation 37 | 38 | ### Installing the JavaScript library 39 | 40 | You can install the JavaScript package via npm: 41 | 42 | ```bash 43 | npm install spacy 44 | ``` 45 | 46 | ### Setting up the Python server 47 | 48 | First, clone this repo and install the requirements. If you've installed the 49 | package via npm, you can also use the `api/server.py` and `requirements.txt` in 50 | your `./node_modules/spacy` directory. It's recommended to use a virtual 51 | environment. 52 | 53 | ```bash 54 | pip install -r requirements.txt 55 | ``` 56 | 57 | You can then run the REST API. 
By default, this will serve the API via 58 | `0.0.0.0:8080`: 59 | 60 | ```bash 61 | python api/server.py 62 | ``` 63 | 64 | If you like, you can install more [models](https://spacy.io/models) and specify 65 | a comma-separated list of models to load as the first argument when you run 66 | the server. All models need to be installed in the same environment. 67 | 68 | ```bash 69 | python api/server.py en_core_web_sm,de_core_news_sm 70 | ``` 71 | 72 | | Argument | Type | Description | Default | 73 | | --- | --- | --- | --- | 74 | | `models` | positional (str) | Comma-separated list of models to load and make available. | `en_core_web_sm` | 75 | | `--host`, `-ho` | option (str) | Host to serve the API. | `0.0.0.0` | 76 | | `--port`, `-p` | option (int) | Port to serve the API. | `8080` | 77 | 78 | ## 🎛 API 79 | 80 | ### `spacy.load` 81 | 82 | "Load" a spaCy model. This method mostly exists for consistency with the Python 83 | API. It sets up the REST API and `nlp` object, but doesn't actually load 84 | anything, since the models are already available via the REST API. 85 | 86 | ```javascript 87 | const nlp = spacy.load('en_core_web_sm'); 88 | ``` 89 | 90 | | Argument | Type | Description | 91 | | --- | --- | --- | 92 | | `model` | String | Name of model to load, e.g. `'en_core_web_sm'`. Needs to be available via the REST API. | 93 | | `api` | String | Alternative URL of the REST API. Defaults to `http://localhost:8080`. | 94 | | **RETURNS** | [`Language`](src/language.js) | The `nlp` object. | 95 | 96 | ### `nlp` async 97 | 98 | The `nlp` object created by `spacy.load` can be called on a string of text 99 | and makes a request to the REST API. The easiest way to use it is to wrap the 100 | call in an `async` function and use `await`: 101 | 102 | ```javascript 103 | (async function() { 104 | const nlp = spacy.load('en_core_web_sm'); 105 | const doc = await nlp('This is a text.'); 106 | })(); 107 | ``` 108 | 109 | | Argument | Type | Description | 110 | | --- | --- | --- | 111 | | `text` | String | The text to process. | 112 | | **RETURNS** | [`Doc`](src/tokens.js) | The processed `Doc`. | 113 | 114 | ### `Doc` 115 | 116 | Just like [in the original API](https://spacy.io/api/doc), the `Doc` object can 117 | be constructed with an array of `words` and `spaces`. It also takes an 118 | additional `attrs` object, which corresponds to the JSON-serialized linguistic 119 | annotations created in [`doc2json` in `api/server.py`](api/server.py). 120 | 121 | The `Doc` behaves just like the regular spaCy `Doc` – you can iterate over its 122 | tokens, index into individual tokens, access the `Doc` attributes and properties, 123 | and also use native JavaScript methods like `map` and `slice` (since there's no 124 | real way to make Python's slice notation like `doc[2:4]` work). An example of `map` and `slice` follows the construction table below. 125 | 126 | #### Construction 127 | 128 | ```javascript 129 | import { Doc } from 'spacy'; 130 | 131 | const words = ['Hello', 'world', '!']; 132 | const spaces = [true, false, false]; 133 | const doc = new Doc(words, spaces); 134 | console.log(doc.text); // 'Hello world!' 135 | ``` 136 | 137 | | Argument | Type | Description | 138 | | --- | --- | --- | 139 | | `words` | Array | The individual token texts. | 140 | | `spaces` | Array | Whether the token at this position is followed by a space or not. | 141 | | `attrs` | Object | JSON-serialized attributes, see [`doc2json`](api/server.py). | 142 | | **RETURNS** | [`Doc`](src/tokens.js) | The newly constructed `Doc`. |
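#### Using `map` and `slice`

Both methods are available on any `Doc` – a minimal sketch (the exact token output depends on the loaded model; this assumes the standard English pipeline):

```javascript
(async function() {
    const nlp = spacy.load('en_core_web_sm');
    const doc = await nlp('This is a text about Facebook.');
    // map applies a function to every token and collects the results
    const texts = doc.map(token => token.text);
    // ['This', 'is', 'a', 'text', 'about', 'Facebook', '.']
    // slice returns a Span – the equivalent of doc[2:4] in Python
    const span = doc.slice(2, 4);
    console.log(span.text); // 'a text'
})();
```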
143 | 144 | #### Symbol iterator and token indexing 145 | 146 | ```javascript 147 | (async function() { 148 | const nlp = spacy.load('en_core_web_sm'); 149 | const doc = await nlp('Hello world'); 150 | 151 | for (let token of doc) { 152 | console.log(token.text); 153 | } 154 | // Hello 155 | // world 156 | 157 | const token1 = doc[0]; 158 | console.log(token1.text); 159 | // Hello 160 | })(); 161 | ``` 162 | 163 | #### Properties and Attributes 164 | 165 | | Name | Type | Description | 166 | | --- | --- | --- | 167 | | `text` | String | The `Doc` text. | 168 | | `length` | Number | The number of tokens in the `Doc`. | 169 | | `ents` | Array | A list of [`Span`](src/tokens.js) objects, describing the named entities in the `Doc`. | 170 | | `sents` | Array | A list of [`Span`](src/tokens.js) objects, describing the sentences in the `Doc`. | 171 | | `nounChunks` | Array | A list of [`Span`](src/tokens.js) objects, describing the base noun phrases in the `Doc`. | 172 | | `cats` | Object | The document categories predicted by the text classifier, if available in the model. | 173 | | `isTagged` | Boolean | Whether the part-of-speech tagger has been applied to the `Doc`. | 174 | | `isParsed` | Boolean | Whether the dependency parser has been applied to the `Doc`. | 175 | | `isSentenced` | Boolean | Whether the sentence boundary detector has been applied to the `Doc`. | 176 | 177 | ### `Span` 178 | 179 | A `Span` object is a slice of a `Doc` and consists of one or more tokens. Just 180 | like [in the original API](https://spacy.io/api/span), it can be constructed 181 | from a `Doc`, a start and end index and an optional label, or by slicing a `Doc` (see the example below the table). 182 | 183 | #### Construction 184 | 185 | ```javascript 186 | import { Doc, Span } from 'spacy'; 187 | 188 | const doc = new Doc(['Hello', 'world', '!'], [true, false, false]); 189 | const span = new Span(doc, 1, 3); 190 | console.log(span.text); // 'world!' 191 | ``` 192 | 193 | | Argument | Type | Description | 194 | | --- | --- | --- | 195 | | `doc` | `Doc` | The reference document. | 196 | | `start` | Number | The start token index. | 197 | | `end` | Number | The end token index. This is *exclusive*, i.e. "up to token X". | 198 | | `label` | String | Optional label. | 199 | | **RETURNS** | [`Span`](src/tokens.js) | The newly constructed `Span`. |
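Alternatively, a `Span` can be sliced directly off a `Doc` via `Doc.slice`. A short sketch – it assumes the loaded model tags "Facebook" as an `ORG` entity, in which case the `label` is looked up from `doc.ents`:

```javascript
(async function() {
    const nlp = spacy.load('en_core_web_sm');
    const doc = await nlp('This is a sentence about Facebook.');
    const span = doc.slice(5, 6); // like doc[5:6] in Python
    console.log(span.text);  // 'Facebook'
    console.log(span.label); // 'ORG'
})();
```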
200 | 201 | #### Properties and Attributes 202 | 203 | | Name | Type | Description | 204 | | --- | --- | --- | 205 | | `text` | String | The `Span` text. | 206 | | `length` | Number | The number of tokens in the `Span`. | 207 | | `doc` | `Doc` | The parent `Doc`. | 208 | | `start` | Number | The `Span`'s start index in the parent document. | 209 | | `end` | Number | The `Span`'s end index in the parent document. | 210 | | `label` | String | The `Span`'s label, if available. | 211 | 212 | ### `Token` 213 | 214 | For token attributes that exist as string and ID versions (e.g. `Token.pos` vs. 215 | `Token.pos_`), only the string versions are exposed. 216 | 217 | #### Usage Examples 218 | 219 | ```javascript 220 | (async function() { 221 | const nlp = spacy.load('en_core_web_sm'); 222 | const doc = await nlp('Hello world'); 223 | 224 | for (let token of doc) { 225 | console.log(token.text, token.pos, token.isLower); 226 | } 227 | // Hello INTJ false 228 | // world NOUN true 229 | })(); 230 | ``` 231 | 232 | #### Properties and Attributes 233 | 234 | | Name | Type | Description | 235 | | --- | --- | --- | 236 | | `text` | String | The token text. | 237 | | `whitespace` | String | Whitespace character following the token, if available. | 238 | | `textWithWs` | String | Token text with trailing whitespace. | 239 | | `length` | Number | The length of the token text. | 240 | | `orth` | Number | ID of the token text. | 241 | | `doc` | `Doc` | The parent `Doc`. | 242 | | `head` | `Token` | The syntactic parent, or "governor", of this token. | 243 | | `i` | Number | Index of the token in the parent document. | 244 | | `entType` | String | The token's named entity type. | 245 | | `entIob` | String | IOB code of the token's named entity tag. | 246 | | `lemma` | String | The token's lemma, i.e. the base form. | 247 | | `norm` | String | The normalised form of the token. | 248 | | `lower` | String | The lowercase form of the token. | 249 | | `shape` | String | Transform of the token's string, to show orthographic features. For example, "Xxxx" or "dd". | 250 | | `prefix` | String | A length-N substring from the start of the token. Defaults to `N=1`. | 251 | | `suffix` | String | Length-N substring from the end of the token. Defaults to `N=3`. | 252 | | `pos` | String | The token's coarse-grained part-of-speech tag. | 253 | | `tag` | String | The token's fine-grained part-of-speech tag. | 254 | | `isAlpha` | Boolean | Does the token consist of alphabetic characters? | 255 | | `isAscii` | Boolean | Does the token consist of ASCII characters? | 256 | | `isDigit` | Boolean | Does the token consist of digits? | 257 | | `isLower` | Boolean | Is the token lowercase? | 258 | | `isUpper` | Boolean | Is the token uppercase? | 259 | | `isTitle` | Boolean | Is the token titlecase? | 260 | | `isPunct` | Boolean | Is the token punctuation? | 261 | | `isLeftPunct` | Boolean | Is the token left punctuation? | 262 | | `isRightPunct` | Boolean | Is the token right punctuation? | 263 | | `isSpace` | Boolean | Is the token a whitespace character? | 264 | | `isBracket` | Boolean | Is the token a bracket? | 265 | | `isCurrency` | Boolean | Is the token a currency symbol? | 266 | | `likeUrl` | Boolean | Does the token resemble a URL? | 267 | | `likeNum` | Boolean | Does the token resemble a number? | 268 | | `likeEmail` | Boolean | Does the token resemble an email address? | 269 | | `isOov` | Boolean | Is the token out-of-vocabulary? | 270 | | `isStop` | Boolean | Is the token a stop word? | 271 | | `isSentStart` | Boolean | Does the token start a sentence? | 272 | 273 | ## 🔔 Run Tests 274 | 275 | ### Python 276 | 277 | First, make sure you have `pytest` and all dependencies installed. You can then 278 | run the tests by pointing `pytest` to [`/tests`](/tests): 279 | 280 | ```bash 281 | python -m pytest tests 282 | ``` 283 | 284 | ### JavaScript 285 | 286 | This project uses [Jest](https://jestjs.io) for testing. Make sure you have 287 | all dependencies and development dependencies installed. You can then run: 288 | 289 | ```bash 290 | npm run test 291 | ``` 292 | 293 | To allow testing the code without a REST API providing the data, the test suite 294 | currently uses a [mock of the `Language` class](src/__mocks__), which returns 295 | static data located in [`tests/util.js`](tests/util.js).
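In practice, a test activates the mock via `jest.mock` and then works with the static `Doc` – a condensed sketch of [`tests/doc.test.js`](tests/doc.test.js):

```javascript
import spacy from '../src'
import { Doc } from '../src/tokens'
import { text } from './util'

jest.mock('../src/language') // activates src/__mocks__/language.js

const nlp = spacy.load('en_core_web_sm')

test('returns the mocked Doc', async () => {
    const doc = await nlp(text)
    expect(doc).toBeInstanceOf(Doc)
    expect(doc.text).toBe(text)
})
```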
296 | 297 | ## ✅ Ideas and Todos 298 | 299 | - [ ] Improve JavaScript tests. 300 | - [ ] Experiment with Node.js bindings to make Python integration easier. To be fair, running a separate API in an environment controlled by the user and *not* hiding it away a few levels deep is often much easier. But maybe there are some modern Node tricks that this project could benefit from. 301 | -------------------------------------------------------------------------------- /api/server.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | import hug 5 | from hug_middleware_cors import CORSMiddleware 6 | import waitress 7 | import spacy 8 | import plac 9 | 10 | 11 | MODELS = {} 12 | 13 | 14 | @plac.annotations( 15 | models=("Comma-separated list of spaCy models", "positional", None, str), 16 | host=("Host to serve API", "option", "ho", str), 17 | port=("Port to serve API", "option", "p", int), 18 | ) 19 | def main(models=None, host="0.0.0.0", port=8080): 20 | if not models: 21 | models = ["en_core_web_sm"] 22 | else: 23 | models = [m.strip() for m in models.split(",")] 24 | for model in models: 25 | load_model(model) 26 | # Serving Hug API 27 | app = hug.API(__name__) 28 | app.http.add_middleware(CORSMiddleware(app)) 29 | waitress.serve(__hug_wsgi__, host=host, port=port) 30 | 31 | 32 | def load_model(model): 33 | print("Loading model '{}'...".format(model)) 34 | MODELS[model] = spacy.load(model) 35 | 36 | 37 | def doc2json(doc: spacy.tokens.Doc, model: str): 38 | json_doc = { 39 | "text": doc.text, 40 | "text_with_ws": doc.text_with_ws, 41 | "cats": doc.cats, 42 | "is_tagged": doc.is_tagged, 43 | "is_parsed": doc.is_parsed, 44 | "is_nered": doc.is_nered, 45 | "is_sentenced": doc.is_sentenced, 46 | } 47 | ents = [ 48 | {"start": ent.start, "end": ent.end, "label": ent.label_} for ent in doc.ents 49 | ] 50 | if doc.is_sentenced: 51 | sents = [{"start": sent.start, "end": sent.end} for sent in doc.sents] 52 | else: 53 | sents = [] 54 | if doc.is_tagged and doc.is_parsed: 55 | noun_chunks = [ 56 | {"start": chunk.start, "end": chunk.end} for chunk in doc.noun_chunks 57 | ] 58 | else: 59 | noun_chunks = [] 60 | tokens = [ 61 | { 62 | "text": token.text, 63 | "text_with_ws": token.text_with_ws, 64 | "whitespace": token.whitespace_, 65 | "orth": token.orth, 66 | "i": token.i, 67 | "ent_type": token.ent_type_, 68 | "ent_iob": token.ent_iob_, 69 | "lemma": token.lemma_, 70 | "norm": token.norm_, 71 | "lower": token.lower_, 72 | "shape": token.shape_, 73 | "prefix": token.prefix_, 74 | "suffix": token.suffix_, 75 | "pos": token.pos_, 76 | "tag": token.tag_, 77 | "dep": token.dep_, 78 | "is_alpha": token.is_alpha, 79 | "is_ascii": token.is_ascii, 80 | "is_digit": token.is_digit, 81 | "is_lower": token.is_lower, 82 | "is_upper": token.is_upper, 83 | "is_title": token.is_title, 84 | "is_punct": token.is_punct, 85 | "is_left_punct": token.is_left_punct, 86 | "is_right_punct": token.is_right_punct, 87 | "is_space": token.is_space, 88 | "is_bracket": token.is_bracket, 89 | "is_currency": token.is_currency, 90 | "like_url": token.like_url, 91 | "like_num": token.like_num, 92 | "like_email": token.like_email, 93 | "is_oov": token.is_oov, 94 | "is_stop": token.is_stop, 95 | "is_sent_start": token.is_sent_start, 96 | "head": token.head.i, 97 | } 98 | for token in doc 99 | ] 100 | return { 101 | "model": model, 102 | "doc": json_doc, 103 | "ents": ents, 104 | "sents": sents, 105 | "noun_chunks": noun_chunks, 106 | "tokens": tokens, 107 | } 108 | 109 | 110 | @hug.post("/parse") 111 | def parse(model: str, text: str): 112 | nlp = MODELS[model] 113 | doc = nlp(text) 114 | return doc2json(doc, model) 115 | 116 | 117 | @hug.post("/similarity") 118 | def similarity(model: str, text1: str, text2: str): 119 | # Doc objects can be created on the fly here, because the similarity result is the same either way 120 | nlp = MODELS[model] 121 |
doc1 = nlp(text1) 122 | doc2 = nlp(text2) 123 | return {"similarity": doc1.similarity(doc2)} 124 | 125 | 126 | if __name__ == "__main__": 127 | plac.call(main) 128 | -------------------------------------------------------------------------------- /dist/index.js: -------------------------------------------------------------------------------- 1 | parcelRequire=function(e,r,n,t){var i="function"==typeof parcelRequire&&parcelRequire,o="function"==typeof require&&require;function u(n,t){if(!r[n]){if(!e[n]){var f="function"==typeof parcelRequire&&parcelRequire;if(!t&&f)return f(n,!0);if(i)return i(n,!0);if(o&&"string"==typeof n)return o(n);var c=new Error("Cannot find module '"+n+"'");throw c.code="MODULE_NOT_FOUND",c}p.resolve=function(r){return e[n][1][r]||r},p.cache={};var l=r[n]=new u.Module(n);e[n][0].call(l.exports,p,l,l.exports,this)}return r[n].exports;function p(e){return u(p.resolve(e))}}u.isParcelRequire=!0,u.Module=function(e){this.id=e,this.bundle=u,this.exports={}},u.modules=e,u.cache=r,u.parent=i,u.register=function(r,n){e[r]=[function(e,r){r.exports=n},{}]};for(var f=0;fnew e(this,t,s[i],this._tokens[i]));for(let e=0;enew i(this,t,s,e))}get sents(){return this._sents.map(({start:t,end:s})=>new i(this,t,s))}get nounChunks(){return this._chunks.map(({start:t,end:s})=>new i(this,t,s))}*[Symbol.iterator](){let t=0;for(;void 0!==this.tokens[t];)yield this.tokens[t],++t}toString(){return this.text}map(t){let s=[];for(let i of this)s.push(t(i));return s}slice(t,s){return new i(this,t,s)}async similarity(s){return await(0,t.getSimilarity)(this._api,this._model,this.text,s.text)}}exports.Doc=s;class i{constructor(t,s,i,e){this.doc=t,this.start=s,this.end=i,this._label=e,this.tokens=[...this.doc].slice(this.start,this.end);for(let h=0;he),spaces:o.tokens.map(({whitespace:e})=>Boolean(e)),attrs:Object.assign({},o,{api:a})}}}exports.default=s; 7 | },{"./tokens":"nJFl","./util":"Y/Oq"}],"Focm":[function(require,module,exports) { 8 | "use strict";Object.defineProperty(exports,"__esModule",{value:!0}),Object.defineProperty(exports,"Doc",{enumerable:!0,get:function(){return r.Doc}}),Object.defineProperty(exports,"Token",{enumerable:!0,get:function(){return r.Token}}),Object.defineProperty(exports,"Span",{enumerable:!0,get:function(){return r.Span}}),exports.default=void 0;var e=t(require("./language")),r=require("./tokens");function t(e){return e&&e.__esModule?e:{default:e}}var n={load:function(r,t){return new e.default(r,t)}};exports.default=n; 9 | },{"./language":"hk5u","./tokens":"nJFl"}]},{},["Focm"], "spacy") -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "spacy", 3 | "version": "0.0.4", 4 | "description": "JavaScript API for spaCy with Python REST API", 5 | "main": "dist/index.js", 6 | "scripts": { 7 | "test": "jest", 8 | "build": "parcel build src/index.js --target node --no-source-maps --global spacy", 9 | "dev": "parcel src/index.js --target node --no-source-maps --global spacy", 10 | "package": "npm run build && npm pack" 11 | }, 12 | "author": "Ines Montani", 13 | "license": "MIT", 14 | "homepage": "https://github.com/ines/spacy-js#readme", 15 | "repository": { 16 | "type": "git", 17 | "url": "https://github.com/ines/spacy-js.git" 18 | }, 19 | "keywords": [ 20 | "spacy", 21 | "nlp", 22 | "natural language processing", 23 | "machine learning", 24 | "artificial intelligence" 25 | ], 26 | "dependencies": { 27 | "node-fetch": "^2.1.2" 28 | }, 
29 | "devDependencies": { 30 | "@babel/cli": "^7.1.2", 31 | "@babel/core": "^7.1.2", 32 | "@babel/preset-env": "^7.1.0", 33 | "babel-core": "^7.0.0-bridge.0", 34 | "babel-plugin-add-module-exports": "^1.0.0", 35 | "jest": "^23.6.0", 36 | "parcel-bundler": "^1.10.3", 37 | "regenerator-runtime": "^0.12.1" 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | hug>=2.4.0,<3.0.0 2 | hug-middleware-cors>=1.0.0,<2.0.0 3 | spacy>=2.1.0,<2.2.0 4 | waitress>=1.0.2,<2.0.0 5 | plac>=0.9.6,<1.0.0 6 | 7 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0 8 | -------------------------------------------------------------------------------- /src/__mocks__/language.js: -------------------------------------------------------------------------------- 1 | import { Doc } from '../tokens'; 2 | import { words, spaces, attrs } from '../../tests/util'; 3 | 4 | export default class Language { 5 | constructor(model, api) { 6 | return async function(text) { 7 | return new Doc(words, spaces, attrs); 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | import Language from './language' 2 | export { Doc, Token, Span } from './tokens' 3 | 4 | export default { 5 | load: function(model, api) { 6 | return new Language(model, api); 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /src/language.js: -------------------------------------------------------------------------------- 1 | import { Doc } from './tokens'; 2 | import { makeRequest } from './util'; 3 | 4 | export default class Language { 5 | constructor(model, api = 'http://localhost:8080') { 6 | const self = this; 7 | return async function(text) { 8 | const { words, spaces, attrs } = await self.makeDoc(model, text, api); 9 | return new Doc(words, spaces, attrs); 10 | } 11 | } 12 | 13 | async makeDoc(model, text, api) { 14 | const json = await makeRequest(api, 'parse', { model, text }) 15 | const words = json.tokens.map(({ text }) => text); 16 | const spaces = json.tokens.map(({ whitespace }) => Boolean(whitespace)); 17 | return { words, spaces, attrs: Object.assign({}, json, { api }) } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/tokens.js: -------------------------------------------------------------------------------- 1 | import { getSimilarity } from './util' 2 | 3 | export class Doc { 4 | constructor(words, spaces, attrs = {}) { 5 | this._doc = attrs.doc || {} 6 | this._tokens = attrs.tokens || [] 7 | this._ents = attrs.ents || [] 8 | this._sents = attrs.sents || [] 9 | this._chunks = attrs.noun_chunks || [] 10 | this._model = attrs.model 11 | this._api = attrs.api 12 | this.tokens = words.map((word, i) => new Token(this, word, spaces[i], this._tokens[i])) 13 | for (let i = 0; i < this.tokens.length; i++) { 14 | this[i] = this.tokens[i] 15 | } 16 | this.cats = this._doc.cats 17 | this.isTagged = this._doc.is_tagged 18 | this.isParsed = this._doc.is_parsed 19 | this.isNered = this._doc.is_nered 20 | this.isSentenced = this._doc.is_sentenced 21 | } 22 | 23 | inspect() { 24 | return this.text 25 | } 26 | 27 | get text() { 28 | let text = '' 29 | for (let token of this.tokens) { 30 | 
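// rebuild the text by joining each token's text with its trailing whitespace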
text += token.textWithWs 31 | } 32 | return text 33 | } 34 | 35 | get length() { 36 | return this.tokens.length 37 | } 38 | 39 | get ents() { 40 | return this._ents.map(({ start, end, label }) => new Span(this, start, end, label)) 41 | } 42 | 43 | get sents() { 44 | return this._sents.map(({ start, end }) => new Span(this, start, end)) 45 | } 46 | 47 | get nounChunks() { 48 | return this._chunks.map(({ start, end }) => new Span(this, start, end)) 49 | } 50 | 51 | *[Symbol.iterator]() { 52 | let i = 0 53 | while (this.tokens[i] !== undefined) { 54 | yield this.tokens[i] 55 | ++i 56 | } 57 | } 58 | 59 | toString() { 60 | return this.text 61 | } 62 | 63 | map(func) { 64 | let tokens = [] 65 | for (let token of this) { 66 | tokens.push(func(token)) 67 | } 68 | return tokens 69 | } 70 | 71 | slice(start, end) { 72 | return new Span(this, start, end) 73 | } 74 | 75 | async similarity(obj) { 76 | return await getSimilarity(this._api, this._model, this.text, obj.text) 77 | } 78 | } 79 | 80 | export class Span { 81 | constructor(doc, start, end, label) { 82 | this.doc = doc 83 | this.start = start 84 | this.end = end 85 | this._label = label 86 | this.tokens = [...this.doc].slice(this.start, this.end) 87 | for (let i = 0; i < this.tokens.length; i++) { 88 | this[i] = this.tokens[i] 89 | } 90 | } 91 | 92 | get text() { 93 | let text = '' 94 | for (let token of this.tokens) { 95 | text += token.textWithWs 96 | } 97 | return text.trim() 98 | } 99 | 100 | get length() { 101 | return this.tokens.length 102 | } 103 | 104 | get label() { 105 | if (this._label) { 106 | return this._label 107 | } 108 | // Manually check if span is an entity 109 | for (let ent of this.doc.ents) { 110 | if (ent.start === this.start && ent.end === this.end) { 111 | return ent.label 112 | } 113 | } 114 | } 115 | 116 | *[Symbol.iterator]() { 117 | let i = 0 118 | while (this.tokens[i] !== undefined) { 119 | yield this.tokens[i] 120 | ++i 121 | } 122 | } 123 | 124 | slice(start, end) { 125 | return new Span(this, start, end) 126 | } 127 | 128 | toString() { 129 | return this.text 130 | } 131 | 132 | inspect() { 133 | return this.text 134 | } 135 | 136 | async similarity(obj) { 137 | return await getSimilarity(this.doc._api, this.doc._model, this.text, obj.text) 138 | } 139 | } 140 | 141 | export class Token { 142 | constructor(doc, word, space, attrs = {}) { 143 | this.doc = doc 144 | this.whitespace = space ?
' ' : '' 145 | this.text = word 146 | this.textWithWs = this.text + this.whitespace 147 | this.orth = attrs.orth 148 | this.i = attrs.i 149 | this.entType = attrs.ent_type 150 | this.entIob = attrs.ent_iob 151 | this.lemma = attrs.lemma 152 | this.norm = attrs.norm 153 | this.lower = attrs.lower 154 | this.shape = attrs.shape; this.prefix = attrs.prefix 155 | this.suffix = attrs.suffix 156 | this.pos = attrs.pos 157 | this.tag = attrs.tag 158 | this.dep = attrs.dep 159 | this.isAlpha = attrs.is_alpha 160 | this.isAscii = attrs.is_ascii 161 | this.isDigit = attrs.is_digit 162 | this.isLower = attrs.is_lower 163 | this.isUpper = attrs.is_upper 164 | this.isTitle = attrs.is_title 165 | this.isPunct = attrs.is_punct 166 | this.isLeftPunct = attrs.is_left_punct 167 | this.isRightPunct = attrs.is_right_punct 168 | this.isSpace = attrs.is_space 169 | this.isBracket = attrs.is_bracket 170 | this.isCurrency = attrs.is_currency 171 | this.likeUrl = attrs.like_url 172 | this.likeNum = attrs.like_num 173 | this.likeEmail = attrs.like_email 174 | this.isOov = attrs.is_oov 175 | this.isStop = attrs.is_stop 176 | this.isSentStart = attrs.is_sent_start 177 | 178 | this._head = attrs.head 179 | } 180 | 181 | get length() { 182 | return this.text.length 183 | } 184 | 185 | get head() { 186 | return this.doc[this._head] 187 | } 188 | 189 | toString() { 190 | return this.text 191 | } 192 | 193 | inspect() { 194 | return this.text 195 | } 196 | 197 | async similarity(obj) { 198 | return await getSimilarity(this.doc._api, this.doc._model, this.text, obj.text) 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /src/util.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch'; 2 | import url from 'url'; 3 | 4 | export async function makeRequest(api, endpoint, opts, method = 'POST') { 5 | const headers = { 'Accept': 'application/json', 'Content-Type': 'application/json' }; 6 | const credentials = 'same-origin'; 7 | const body = JSON.stringify(opts); 8 | const apiUrl = url.resolve(api, endpoint); 9 | try { 10 | const res = await fetch(apiUrl, { method, headers, credentials, body }); 11 | return await res.json(); 12 | } 13 | catch(err) { 14 | console.error(`Error fetching data from API: ${apiUrl}`, err) 15 | } 16 | } 17 | 18 | export async function getSimilarity(api, model, text1, text2) { 19 | const json = await makeRequest(api, '/similarity', { model, text1, text2 }); 20 | return json.similarity; 21 | } 22 | -------------------------------------------------------------------------------- /tests/doc.test.js: -------------------------------------------------------------------------------- 1 | import spacy from '../src' 2 | import { Doc, Token, Span } from '../src/tokens' 3 | import { text, words, spaces, attrs } from './util' 4 | 5 | jest.mock('../src/language') 6 | 7 | const nlp = spacy.load('en_core_web_sm') 8 | 9 | test('allows manual construction', () => { 10 | const doc = new Doc(words, spaces, attrs) 11 | expect(doc).toBeInstanceOf(Doc) 12 | }) 13 | 14 | test('has Doc attributes', async () => { 15 | const doc = await nlp(text) 16 | expect(doc.text).toBe(text) 17 | expect(doc.toString()).toBe(text) 18 | expect(doc.length).toBe(10) 19 | expect(doc.cats).toEqual({}) 20 | expect(doc.isTagged).toBe(true) 21 | expect(doc.isParsed).toBe(true) 22 | expect(doc.isSentenced).toBe(true) 23 | }) 24 | 25 | test('allows token indexing', async () => { 26 | const doc = await nlp(text) 27 | for (let i = 0; i <
doc.length; i++) { 28 | expect(doc[i]).toBeInstanceOf(Token) 29 | } 30 | expect(doc[doc.length + 1]).toBeUndefined() 31 | }) 32 | 33 | test('allows token iteration', async () => { 34 | const doc = await nlp(text) 35 | for (let token of doc) { 36 | expect(token).toBeInstanceOf(Token) 37 | } 38 | }) 39 | 40 | test('has named entities (doc.ents)', async () => { 41 | const doc = await nlp(text) 42 | expect(doc.ents).toBeInstanceOf(Array) 43 | expect(doc.ents).toEqual(expect.arrayContaining([expect.any(Span)])) 44 | expect(doc.ents.length).toBe(1) 45 | const entity = doc.ents[0] 46 | expect(entity).toBeInstanceOf(Span) 47 | expect(entity.text).toBe('Facebook') 48 | expect(entity.start).toBe(8) 49 | expect(entity.end).toBe(9) 50 | expect(entity.label).toBe('ORG') 51 | }) 52 | 53 | test('has sentences (doc.sents)', async () => { 54 | const doc = await nlp(text) 55 | expect(doc.sents).toBeInstanceOf(Array) 56 | expect(doc.sents).toEqual(expect.arrayContaining([expect.any(Span)])) 57 | expect(doc.sents.length).toBe(2) 58 | const sentence = doc.sents[0] 59 | expect(sentence).toBeInstanceOf(Span) 60 | expect(sentence.text).toBe('Hello world!') 61 | expect(sentence.start).toBe(0) 62 | expect(sentence.end).toBe(3) 63 | }) 64 | 65 | test('has noun chunks (doc.noun_chunks)', async () => { 66 | const doc = await nlp(text) 67 | expect(doc.nounChunks).toBeInstanceOf(Array) 68 | expect(doc.nounChunks).toEqual(expect.arrayContaining([expect.any(Span)])) 69 | expect(doc.nounChunks.length).toBe(3) 70 | const chunk = doc.nounChunks[0] 71 | expect(chunk).toBeInstanceOf(Span) 72 | expect(chunk.text).toBe('Hello world') 73 | expect(chunk.start).toBe(0) 74 | expect(chunk.end).toBe(2) 75 | }) 76 | -------------------------------------------------------------------------------- /tests/language.test.js: -------------------------------------------------------------------------------- 1 | import spacy from '../src' 2 | 3 | jest.mock('../src/language') 4 | 5 | test('creates new nlp object', () => { 6 | const nlp = spacy.load('en_core_web_sm') 7 | expect(nlp).toEqual(expect.any(Function)) 8 | }) 9 | -------------------------------------------------------------------------------- /tests/span.test.js: -------------------------------------------------------------------------------- 1 | import spacy from '../src' 2 | import { Doc, Token, Span } from '../src/tokens' 3 | import { text, words, spaces, attrs } from './util' 4 | 5 | jest.mock('../src/language') 6 | 7 | const nlp = spacy.load('en_core_web_sm') 8 | 9 | test('allows manual construction', async () => { 10 | const doc = await nlp(text) 11 | const span = new Span(doc, 6, 9) 12 | expect(span).toBeInstanceOf(Span) 13 | expect(span.text).toBe('sentence about Facebook') 14 | }) 15 | 16 | test('allows being sliced off Doc', async () => { 17 | const doc = await nlp(text) 18 | const span = doc.slice(6, 9) 19 | expect(span).toBeInstanceOf(Span) 20 | expect(span.text).toBe('sentence about Facebook') 21 | }) 22 | 23 | test('has Span attributes', async () => { 24 | const doc = await nlp(text) 25 | const span = doc.slice(6, 9) 26 | expect(span.toString()).toBe('sentence about Facebook') 27 | expect(span.length).toBe(3) 28 | expect(span.start).toBe(6) 29 | expect(span.end).toBe(9) 30 | expect(span.label).toBeUndefined() 31 | }) 32 | 33 | test('has parent Doc', async () => { 34 | const doc = await nlp(text) 35 | const span = doc.slice(6, 9) 36 | expect(span.doc).toBeInstanceOf(Doc) 37 | expect(span.doc).toBe(doc) 38 | }) 39 | 40 | test('has entity label', async () => { 41 | const doc 
= await nlp(text) 42 | const span = doc.slice(8, 9) 43 | expect(span.toString()).toBe('Facebook') 44 | expect(span.label).toBe('ORG') 45 | }) 46 | 47 | test('allows token indexing', async () => { 48 | const doc = await nlp(text) 49 | const span = doc.slice(6, 9) 50 | for (let i = 0; i < span.length; i++) { 51 | expect(span[i]).toBeInstanceOf(Token) 52 | } 53 | expect(span[span.length + 1]).toBeUndefined() 54 | }) 55 | 56 | test('allows token iteration', async () => { 57 | const doc = await nlp(text) 58 | const span = doc.slice(6, 9) 59 | for (let token of span) { 60 | expect(token).toBeInstanceOf(Token) 61 | } 62 | }) 63 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | import pytest 5 | import spacy 6 | import json 7 | 8 | from api.server import parse, doc2json, load_model 9 | 10 | 11 | @pytest.fixture(scope="session") 12 | def model(): 13 | return "en_core_web_sm" 14 | 15 | 16 | @pytest.fixture(scope="session") 17 | def text(): 18 | return "This is a sentence about Facebook. This is another one." 19 | 20 | 21 | @pytest.fixture(scope="session") 22 | def nlp(model): 23 | return spacy.load(model) 24 | 25 | 26 | @pytest.fixture(scope="session") 27 | def doc(nlp, text): 28 | return nlp(text) 29 | 30 | 31 | def test_server_parse(model, text, doc): 32 | load_model(model) 33 | json_doc = parse(model, text) 34 | direct_json_doc = doc2json(doc, model) 35 | assert json.dumps(json_doc, sort_keys=True) == json.dumps( 36 | direct_json_doc, sort_keys=True 37 | ) 38 | 39 | 40 | def test_doc2json_doc_tokens(doc, model): 41 | data = doc2json(doc, model) 42 | assert data["model"] == model 43 | assert data["doc"]["text"] == doc.text 44 | assert data["doc"]["text_with_ws"] == doc.text_with_ws 45 | assert data["doc"]["is_tagged"] 46 | assert data["doc"]["is_parsed"] 47 | assert data["doc"]["is_sentenced"] 48 | assert len(data["tokens"]) == len(doc) 49 | assert data["tokens"][0]["text"] == doc[0].text 50 | assert data["tokens"][0]["head"] == doc[0].head.i 51 | 52 | 53 | def test_doc2json_doc_ents(doc, model): 54 | data = doc2json(doc, model) 55 | ents = list(doc.ents) 56 | assert "ents" in data 57 | assert len(data["ents"]) == len(ents) 58 | assert len(data["ents"]) >= 1 59 | assert data["ents"][0]["start"] == ents[0].start 60 | assert data["ents"][0]["end"] == ents[0].end 61 | assert data["ents"][0]["label"] == ents[0].label_ 62 | 63 | 64 | def test_doc2json_doc_sents(doc, model): 65 | data = doc2json(doc, model) 66 | sents = list(doc.sents) 67 | assert "sents" in data 68 | assert len(data["sents"]) == len(sents) 69 | assert len(data["sents"]) >= 1 70 | assert data["sents"][0]["start"] == sents[0].start 71 | assert data["sents"][0]["end"] == sents[0].end 72 | 73 | 74 | def test_doc2json_doc_noun_chunks(doc, model): 75 | data = doc2json(doc, model) 76 | chunks = list(doc.noun_chunks) 77 | assert "noun_chunks" in data 78 | assert len(data["noun_chunks"]) == len(chunks) 79 | assert len(data["noun_chunks"]) >= 1 80 | assert data["noun_chunks"][0]["start"] == chunks[0].start 81 | assert data["noun_chunks"][0]["end"] == chunks[0].end 82 | -------------------------------------------------------------------------------- /tests/token.test.js: -------------------------------------------------------------------------------- 1 | import spacy from '../src' 2 | import { Doc, Token } from '../src/tokens' 3 | import { text, 
words, spaces, attrs } from './util' 4 | 5 | jest.mock('../src/language') 6 | 7 | const nlp = spacy.load('en_core_web_sm') 8 | 9 | test('allows manual construction', async () => { 10 | const doc = await nlp(text) 11 | const token = new Token(doc, words[7], spaces[7], attrs.tokens[7]) 12 | expect(token).toBeInstanceOf(Token) 13 | expect(token.text).toBe('about') 14 | }) 15 | 16 | test('allows indexing from Doc', async () => { 17 | const doc = await nlp(text) 18 | const token = doc[7] 19 | expect(token.text).toBe('about') 20 | }) 21 | 22 | test('has Token attributes', async () => { 23 | const doc = await nlp(text) 24 | const token = doc[7] 25 | expect(token.length).toBe(5) 26 | expect(token.toString()).toBe('about') 27 | expect(token.text).toBe('about') 28 | expect(token.textWithWs).toBe('about ') 29 | expect(token.whitespace).toBe(' ') 30 | expect(token.orth).toBe(942632335873952620) 31 | expect(token.i).toBe(7) 32 | expect(token.entType).toBe('') 33 | expect(token.entIob).toBe('O') 34 | expect(token.lemma).toBe('about') 35 | expect(token.norm).toBe('about') 36 | expect(token.lower).toBe('about') 37 | expect(token.shape).toBe('xxxx') 38 | expect(token.prefix).toBe('a') 39 | expect(token.suffix).toBe('out') 40 | expect(token.pos).toBe('ADP') 41 | expect(token.tag).toBe('IN') 42 | expect(token.dep).toBe('prep') 43 | expect(token.isAlpha).toBe(true) 44 | expect(token.isAscii).toBe(true) 45 | expect(token.isDigit).toBe(false) 46 | expect(token.isLower).toBe(true) 47 | expect(token.isUpper).toBe(false) 48 | expect(token.isTitle).toBe(false) 49 | expect(token.isPunct).toBe(false) 50 | expect(token.isLeftPunct).toBe(false) 51 | expect(token.isRightPunct).toBe(false) 52 | expect(token.isSpace).toBe(false) 53 | expect(token.isBracket).toBe(false) 54 | expect(token.isCurrency).toBe(false) 55 | expect(token.likeUrl).toBe(false) 56 | expect(token.likeNum).toBe(false) 57 | expect(token.likeEmail).toBe(false) 58 | expect(token.isOov).toBe(true) 59 | expect(token.isStop).toBe(true) 60 | expect(token.isSentStart).toBe(null) 61 | }) 62 | 63 | test('has parent Doc', async () => { 64 | const doc = await nlp(text) 65 | const token = doc[7] 66 | expect(token.doc).toBeInstanceOf(Doc) 67 | expect(token.doc).toBe(doc) 68 | }) 69 | 70 | test('has head', async () => { 71 | const doc = await nlp(text) 72 | const head = doc[7].head 73 | expect(head).toBeInstanceOf(Token) 74 | expect(head.i).toBe(6) 75 | expect(head.text).toBe('sentence') 76 | }) 77 | -------------------------------------------------------------------------------- /tests/util.js: -------------------------------------------------------------------------------- 1 | export const text = 'Hello world! This is a sentence about Facebook.' 2 | export const words = [ 3 | 'Hello', 4 | 'world', 5 | '!', 6 | 'This', 7 | 'is', 8 | 'a', 9 | 'sentence', 10 | 'about', 11 | 'Facebook', 12 | '.' 13 | ] 14 | export const spaces = [true, false, true, true, true, true, true, true, false, false] 15 | export const attrs = { 16 | model: 'en_core_web_sm', 17 | doc: { 18 | text: 'Hello world! This is a sentence about Facebook.', 19 | text_with_ws: 'Hello world! 
This is a sentence about Facebook.', 20 | cats: {}, 21 | is_tagged: true, 22 | is_parsed: true, 23 | is_sentenced: true 24 | }, 25 | ents: [ 26 | { 27 | start: 8, 28 | end: 9, 29 | label: 'ORG' 30 | } 31 | ], 32 | sents: [ 33 | { 34 | start: 0, 35 | end: 3 36 | }, 37 | { 38 | start: 3, 39 | end: 10 40 | } 41 | ], 42 | noun_chunks: [ 43 | { 44 | start: 0, 45 | end: 2 46 | }, 47 | { 48 | start: 5, 49 | end: 7 50 | }, 51 | { 52 | start: 8, 53 | end: 9 54 | } 55 | ], 56 | tokens: [ 57 | { 58 | text: 'Hello', 59 | text_with_ws: 'Hello ', 60 | whitespace: ' ', 61 | orth: 15777305708150031551, 62 | i: 0, 63 | ent_type: '', 64 | ent_iob: 'O', 65 | lemma: 'hello', 66 | norm: 'hello', 67 | lower: 'hello', 68 | shape: 'Xxxxx', 69 | prefix: 'H', 70 | suffix: 'llo', 71 | pos: 'INTJ', 72 | tag: 'UH', 73 | dep: 'intj', 74 | is_alpha: true, 75 | is_ascii: true, 76 | is_digit: false, 77 | is_lower: false, 78 | is_upper: false, 79 | is_title: true, 80 | is_punct: false, 81 | is_left_punct: false, 82 | is_right_punct: false, 83 | is_space: false, 84 | is_bracket: false, 85 | is_currency: false, 86 | like_url: false, 87 | like_num: false, 88 | like_email: false, 89 | is_oov: true, 90 | is_stop: false, 91 | is_sent_start: null, 92 | head: 1 93 | }, 94 | { 95 | text: 'world', 96 | text_with_ws: 'world', 97 | whitespace: '', 98 | orth: 1703489418272052182, 99 | i: 1, 100 | ent_type: '', 101 | ent_iob: 'O', 102 | lemma: 'world', 103 | norm: 'world', 104 | lower: 'world', 105 | shape: 'xxxx', 106 | prefix: 'w', 107 | suffix: 'rld', 108 | pos: 'NOUN', 109 | tag: 'NN', 110 | dep: 'ROOT', 111 | is_alpha: true, 112 | is_ascii: true, 113 | is_digit: false, 114 | is_lower: true, 115 | is_upper: false, 116 | is_title: false, 117 | is_punct: false, 118 | is_left_punct: false, 119 | is_right_punct: false, 120 | is_space: false, 121 | is_bracket: false, 122 | is_currency: false, 123 | like_url: false, 124 | like_num: false, 125 | like_email: false, 126 | is_oov: true, 127 | is_stop: false, 128 | is_sent_start: null, 129 | head: 1 130 | }, 131 | { 132 | text: '!', 133 | text_with_ws: '! 
', 134 | whitespace: ' ', 135 | orth: 17494803046312582752, 136 | i: 2, 137 | ent_type: '', 138 | ent_iob: 'O', 139 | lemma: '!', 140 | norm: '!', 141 | lower: '!', 142 | shape: '!', 143 | prefix: '!', 144 | suffix: '!', 145 | pos: 'PUNCT', 146 | tag: '.', 147 | dep: 'punct', 148 | is_alpha: false, 149 | is_ascii: true, 150 | is_digit: false, 151 | is_lower: false, 152 | is_upper: false, 153 | is_title: false, 154 | is_punct: true, 155 | is_left_punct: false, 156 | is_right_punct: false, 157 | is_space: false, 158 | is_bracket: false, 159 | is_currency: false, 160 | like_url: false, 161 | like_num: false, 162 | like_email: false, 163 | is_oov: true, 164 | is_stop: false, 165 | is_sent_start: null, 166 | head: 1 167 | }, 168 | { 169 | text: 'This', 170 | text_with_ws: 'This ', 171 | whitespace: ' ', 172 | orth: 12943039165150086467, 173 | i: 3, 174 | ent_type: '', 175 | ent_iob: 'O', 176 | lemma: 'this', 177 | norm: 'this', 178 | lower: 'this', 179 | shape: 'Xxxx', 180 | prefix: 'T', 181 | suffix: 'his', 182 | pos: 'DET', 183 | tag: 'DT', 184 | dep: 'nsubj', 185 | is_alpha: true, 186 | is_ascii: true, 187 | is_digit: false, 188 | is_lower: false, 189 | is_upper: false, 190 | is_title: true, 191 | is_punct: false, 192 | is_left_punct: false, 193 | is_right_punct: false, 194 | is_space: false, 195 | is_bracket: false, 196 | is_currency: false, 197 | like_url: false, 198 | like_num: false, 199 | like_email: false, 200 | is_oov: true, 201 | is_stop: false, 202 | is_sent_start: true, 203 | head: 4 204 | }, 205 | { 206 | text: 'is', 207 | text_with_ws: 'is ', 208 | whitespace: ' ', 209 | orth: 3411606890003347522, 210 | i: 4, 211 | ent_type: '', 212 | ent_iob: 'O', 213 | lemma: 'be', 214 | norm: 'is', 215 | lower: 'is', 216 | shape: 'xx', 217 | prefix: 'i', 218 | suffix: 'is', 219 | pos: 'VERB', 220 | tag: 'VBZ', 221 | dep: 'ROOT', 222 | is_alpha: true, 223 | is_ascii: true, 224 | is_digit: false, 225 | is_lower: true, 226 | is_upper: false, 227 | is_title: false, 228 | is_punct: false, 229 | is_left_punct: false, 230 | is_right_punct: false, 231 | is_space: false, 232 | is_bracket: false, 233 | is_currency: false, 234 | like_url: false, 235 | like_num: false, 236 | like_email: false, 237 | is_oov: true, 238 | is_stop: true, 239 | is_sent_start: null, 240 | head: 4 241 | }, 242 | { 243 | text: 'a', 244 | text_with_ws: 'a ', 245 | whitespace: ' ', 246 | orth: 11901859001352538922, 247 | i: 5, 248 | ent_type: '', 249 | ent_iob: 'O', 250 | lemma: 'a', 251 | norm: 'gonna', 252 | lower: 'a', 253 | shape: 'x', 254 | prefix: 'a', 255 | suffix: 'a', 256 | pos: 'DET', 257 | tag: 'DT', 258 | dep: 'det', 259 | is_alpha: true, 260 | is_ascii: true, 261 | is_digit: false, 262 | is_lower: true, 263 | is_upper: false, 264 | is_title: false, 265 | is_punct: false, 266 | is_left_punct: false, 267 | is_right_punct: false, 268 | is_space: false, 269 | is_bracket: false, 270 | is_currency: false, 271 | like_url: false, 272 | like_num: false, 273 | like_email: false, 274 | is_oov: true, 275 | is_stop: true, 276 | is_sent_start: null, 277 | head: 6 278 | }, 279 | { 280 | text: 'sentence', 281 | text_with_ws: 'sentence ', 282 | whitespace: ' ', 283 | orth: 18108853898452662235, 284 | i: 6, 285 | ent_type: '', 286 | ent_iob: 'O', 287 | lemma: 'sentence', 288 | norm: 'sentence', 289 | lower: 'sentence', 290 | shape: 'xxxx', 291 | prefix: 's', 292 | suffix: 'nce', 293 | pos: 'NOUN', 294 | tag: 'NN', 295 | dep: 'attr', 296 | is_alpha: true, 297 | is_ascii: true, 298 | is_digit: false, 299 | is_lower: true, 300 | is_upper: 
false, 301 | is_title: false, 302 | is_punct: false, 303 | is_left_punct: false, 304 | is_right_punct: false, 305 | is_space: false, 306 | is_bracket: false, 307 | is_currency: false, 308 | like_url: false, 309 | like_num: false, 310 | like_email: false, 311 | is_oov: true, 312 | is_stop: false, 313 | is_sent_start: null, 314 | head: 4 315 | }, 316 | { 317 | text: 'about', 318 | text_with_ws: 'about ', 319 | whitespace: ' ', 320 | orth: 942632335873952620, 321 | i: 7, 322 | ent_type: '', 323 | ent_iob: 'O', 324 | lemma: 'about', 325 | norm: 'about', 326 | lower: 'about', 327 | shape: 'xxxx', 328 | prefix: 'a', 329 | suffix: 'out', 330 | pos: 'ADP', 331 | tag: 'IN', 332 | dep: 'prep', 333 | is_alpha: true, 334 | is_ascii: true, 335 | is_digit: false, 336 | is_lower: true, 337 | is_upper: false, 338 | is_title: false, 339 | is_punct: false, 340 | is_left_punct: false, 341 | is_right_punct: false, 342 | is_space: false, 343 | is_bracket: false, 344 | is_currency: false, 345 | like_url: false, 346 | like_num: false, 347 | like_email: false, 348 | is_oov: true, 349 | is_stop: true, 350 | is_sent_start: null, 351 | head: 6 352 | }, 353 | { 354 | text: 'Facebook', 355 | text_with_ws: 'Facebook', 356 | whitespace: '', 357 | orth: 8081970590932371665, 358 | i: 8, 359 | ent_type: 'ORG', 360 | ent_iob: 'B', 361 | lemma: 'facebook', 362 | norm: 'facebook', 363 | lower: 'facebook', 364 | shape: 'Xxxxx', 365 | prefix: 'F', 366 | suffix: 'ook', 367 | pos: 'PROPN', 368 | tag: 'NNP', 369 | dep: 'pobj', 370 | is_alpha: true, 371 | is_ascii: true, 372 | is_digit: false, 373 | is_lower: false, 374 | is_upper: false, 375 | is_title: true, 376 | is_punct: false, 377 | is_left_punct: false, 378 | is_right_punct: false, 379 | is_space: false, 380 | is_bracket: false, 381 | is_currency: false, 382 | like_url: false, 383 | like_num: false, 384 | like_email: false, 385 | is_oov: true, 386 | is_stop: false, 387 | is_sent_start: null, 388 | head: 7 389 | }, 390 | { 391 | text: '.', 392 | text_with_ws: '.', 393 | whitespace: '', 394 | orth: 12646065887601541794, 395 | i: 9, 396 | ent_type: '', 397 | ent_iob: 'O', 398 | lemma: '.', 399 | norm: '.', 400 | lower: '.', 401 | shape: '.', 402 | prefix: '.', 403 | suffix: '.', 404 | pos: 'PUNCT', 405 | tag: '.', 406 | dep: 'punct', 407 | is_alpha: false, 408 | is_ascii: true, 409 | is_digit: false, 410 | is_lower: false, 411 | is_upper: false, 412 | is_title: false, 413 | is_punct: true, 414 | is_left_punct: false, 415 | is_right_punct: false, 416 | is_space: false, 417 | is_bracket: false, 418 | is_currency: false, 419 | like_url: false, 420 | like_num: false, 421 | like_email: false, 422 | is_oov: true, 423 | is_stop: false, 424 | is_sent_start: null, 425 | head: 4 426 | } 427 | ] 428 | } 429 | --------------------------------------------------------------------------------