├── .babelrc ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── api └── server.py ├── dist └── index.js ├── package-lock.json ├── package.json ├── requirements.txt ├── src ├── __mocks__ │ └── language.js ├── index.js ├── language.js ├── tokens.js └── util.js └── tests ├── doc.test.js ├── language.test.js ├── span.test.js ├── test_api.py ├── token.test.js └── util.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | ["@babel/preset-env", { 4 | "modules": false, 5 | "targets": { "node": "current"} 6 | }] 7 | ], 8 | "plugins": [ 9 | "add-module-exports" 10 | ], 11 | "env": { 12 | "test": { 13 | "presets": [ 14 | ["@babel/preset-env", { 15 | "targets": { "node": "current"} 16 | }] 17 | ] 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | 3 | # JavaScript 4 | node_modules/ 5 | *.tgz 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # Environments 95 | .env 96 | .venv 97 | env/ 98 | venv/ 99 | ENV/ 100 | env.bak/ 101 | venv.bak/ 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | .dmypy.json 116 | dmypy.json 117 | 118 | # Pyre type checker 119 | .pyre/ 120 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | matrix: 2 | include: 3 | - language: python 4 | python: 3.6 5 | before_script: 6 | - pip install pytest 7 | install: 8 | - pip install -r requirements.txt 9 | script: 10 | - python -m pytest tests 11 | cache: pip 12 | - language: node_js 13 | node_js: 14 | - "8" 15 | install: 16 | - npm install 17 | script: 18 | - npm run test 19 | cache: npm 20 | notifications: 21 | email: false 22 | -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Ines Montani 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # spaCy JS 4 | 5 | [![travis](https://img.shields.io/travis/ines/spacy-js/master.svg?style=flat-square&logo=travis)](https://travis-ci.org/ines/spacy-js) 6 | [![npm](https://img.shields.io/npm/v/spacy.svg?style=flat-square)](https://www.npmjs.com/package/spacy) 7 | [![GitHub](https://img.shields.io/github/release/ines/spacy-js/all.svg?style=flat-square)](https://github.com/ines/spacy-js) 8 | [![unpkg](https://img.shields.io/badge/unpkg-dist/index.js-brightgreen.svg?style=flat-square)](https://unpkg.com/spacy) 9 | 10 | 11 | JavaScript interface for accessing linguistic annotations provided by 12 | [spaCy](https://spacy.io). This project is mostly experimental and was 13 | developed for fun to play around with different ways of mimicking spaCy's 14 | Python API. 15 | 16 | The results will still be computed in Python and made available via a REST API. 17 | The JavaScript API resembles spaCy's Python API as closely as possible (with 18 | a few exceptions, as the values are all pre-computed and it's tricky to express 19 | complex recursive relationships). 20 | 21 | ```javascript 22 | const spacy = require('spacy'); 23 | 24 | (async function() { 25 | const nlp = spacy.load('en_core_web_sm'); 26 | const doc = await nlp('This is a text about Facebook.'); 27 | for (let ent of doc.ents) { 28 | console.log(ent.text, ent.label); 29 | } 30 | for (let token of doc) { 31 | console.log(token.text, token.pos, token.head.text); 32 | } 33 | })(); 34 | ``` 35 | 36 | ## ⌛️ Installation 37 | 38 | ### Installing the JavaScript library 39 | 40 | You can install the JavaScript package via npm: 41 | 42 | ```bash 43 | npm install spacy 44 | ``` 45 | 46 | ### Setting up the Python server 47 | 48 | First, clone this repo and install the requirements. If you've installed the 49 | package via npm, you can also use the `api/server.py` and `requirements.txt` in 50 | your `./node_modules/spacy` directory. It's recommended to use a virtual 51 | environment. 52 | 53 | ```bash 54 | pip install -r requirements.txt 55 | ``` 56 | 57 | You can then run the REST API. 
By default, this will serve the API via 58 | `0.0.0.0:8080`: 59 | 60 | ```bash 61 | python api/server.py 62 | ``` 63 | 64 | If you like, you can install more [models](https://spacy.io/models) and specify 65 | a comma-separated list of models to load as the first argument when you run 66 | the server. All models need to be installed in the same environment. 67 | 68 | ```bash 69 | python api/server.py en_core_web_sm,de_core_news_sm 70 | ``` 71 | 72 | | Argument | Type | Description | Default | 73 | | --- | --- | --- | --- | 74 | | `models` | positional (str) | Comma-separated list of models to load and make available. | `en_core_web_sm` | 75 | | `--host`, `-ho` | option (str) | Host to serve the API. | `0.0.0.0` | 76 | | `--port`, `-p` | option (int) | Port to serve the API. | `8080` | 77 | 78 | ## 🎛 API 79 | 80 | ### `spacy.load` 81 | 82 | "Load" a spaCy model. This method mostly exists for consistency with the Python 83 | API. It sets up the REST API and `nlp` object, but doesn't actually load 84 | anything, since the models are already available via the REST API. 85 | 86 | ```javascript 87 | const nlp = spacy.load('en_core_web_sm'); 88 | ``` 89 | 90 | | Argument | Type | Description | 91 | | --- | --- | --- | 92 | | `model` | String | Name of model to load, e.g. `'en_core_web_sm'`. Needs to be available via the REST API. | 93 | | `api` | String | Alternative URL of the REST API. Defaults to `http://localhost:8080`. | 94 | | **RETURNS** | [`Language`](src/language.js) | The `nlp` object. | 95 | 96 | ### `nlp` async 97 | 98 | The `nlp` object created by `spacy.load` can be called on a string of text 99 | and makes a request to the REST API. The easiest way to use it is to wrap the 100 | call in an `async` function and use `await`: 101 | 102 | ```javascript 103 | (async function() { 104 | const nlp = spacy.load('en_core_web_sm'); 105 | const doc = await nlp('This is a text.'); 106 | })(); 107 | ``` 108 | 109 | | Argument | Type | Description | 110 | | --- | --- | --- | 111 | | `text` | String | The text to process. | 112 | | **RETURNS** | [`Doc`](src/tokens.js) | The processed `Doc`. | 113 | 114 | ### `Doc` 115 | 116 | Just like [in the original API](https://spacy.io/api/doc), the `Doc` object can 117 | be constructed with an array of `words` and `spaces`. It also takes an 118 | additional `attrs` object, which corresponds to the JSON-serialized linguistic 119 | annotations created in [`doc2json` in `api/server.py`](api/server.py). 120 | 121 | The `Doc` behaves just like the regular spaCy `Doc` – you can iterate over its 122 | tokens, index into individual tokens, access the `Doc` attributes and properties, 123 | and also use native JavaScript methods like `map` and `slice` (since there's no 124 | real way to make Python's slice notation like `doc[2:4]` work). An example of `map` and `slice` follows the construction table below. 125 | 126 | #### Construction 127 | 128 | ```javascript 129 | import { Doc } from 'spacy'; 130 | 131 | const words = ['Hello', 'world', '!']; 132 | const spaces = [true, false, false]; 133 | const doc = new Doc(words, spaces); 134 | console.log(doc.text); // 'Hello world!' 135 | ``` 136 | 137 | | Argument | Type | Description | 138 | | --- | --- | --- | 139 | | `words` | Array | The individual token texts. | 140 | | `spaces` | Array | Whether the token at this position is followed by a space or not. | 141 | | `attrs` | Object | JSON-serialized attributes, see [`doc2json`](api/server.py). | 142 | | **RETURNS** | [`Doc`](src/tokens.js) | The newly constructed `Doc`. |
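#### Using `map` and `slice`

Both methods are available on any `Doc` – a minimal sketch (the exact token output depends on the loaded model; this assumes the standard English pipeline):

```javascript
(async function() {
    const nlp = spacy.load('en_core_web_sm');
    const doc = await nlp('This is a text about Facebook.');
    // map applies a function to every token and collects the results
    const texts = doc.map(token => token.text);
    // ['This', 'is', 'a', 'text', 'about', 'Facebook', '.']
    // slice returns a Span – the equivalent of doc[2:4] in Python
    const span = doc.slice(2, 4);
    console.log(span.text); // 'a text'
})();
```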
143 | 144 | #### Symbol iterator and token indexing 145 | 146 | ```javascript 147 | (async function() { 148 | const nlp = spacy.load('en_core_web_sm'); 149 | const doc = await nlp('Hello world'); 150 | 151 | for (let token of doc) { 152 | console.log(token.text); 153 | } 154 | // Hello 155 | // world 156 | 157 | const token1 = doc[0]; 158 | console.log(token1.text); 159 | // Hello 160 | })(); 161 | ``` 162 | 163 | #### Properties and Attributes 164 | 165 | | Name | Type | Description | 166 | | --- | --- | --- | 167 | | `text` | String | The `Doc` text. | 168 | | `length` | Number | The number of tokens in the `Doc`. | 169 | | `ents` | Array | A list of [`Span`](src/tokens.js) objects, describing the named entities in the `Doc`. | 170 | | `sents` | Array | A list of [`Span`](src/tokens.js) objects, describing the sentences in the `Doc`. | 171 | | `nounChunks` | Array | A list of [`Span`](src/tokens.js) objects, describing the base noun phrases in the `Doc`. | 172 | | `cats` | Object | The document categories predicted by the text classifier, if available in the model. | 173 | | `isTagged` | Boolean | Whether the part-of-speech tagger has been applied to the `Doc`. | 174 | | `isParsed` | Boolean | Whether the dependency parser has been applied to the `Doc`. | 175 | | `isSentenced` | Boolean | Whether the sentence boundary detector has been applied to the `Doc`. | 176 | 177 | ### `Span` 178 | 179 | A `Span` object is a slice of a `Doc` and consists of one or more tokens. Just 180 | like [in the original API](https://spacy.io/api/span), it can be constructed 181 | from a `Doc`, a start and end index and an optional label, or by slicing a `Doc` (see the example below the table). 182 | 183 | #### Construction 184 | 185 | ```javascript 186 | import { Doc, Span } from 'spacy'; 187 | 188 | const doc = new Doc(['Hello', 'world', '!'], [true, false, false]); 189 | const span = new Span(doc, 1, 3); 190 | console.log(span.text); // 'world!' 191 | ``` 192 | 193 | | Argument | Type | Description | 194 | | --- | --- | --- | 195 | | `doc` | `Doc` | The reference document. | 196 | | `start` | Number | The start token index. | 197 | | `end` | Number | The end token index. This is *exclusive*, i.e. "up to token X". | 198 | | `label` | String | Optional label. | 199 | | **RETURNS** | [`Span`](src/tokens.js) | The newly constructed `Span`. |
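Alternatively, a `Span` can be sliced directly off a `Doc` via `Doc.slice`. A short sketch – it assumes the loaded model tags "Facebook" as an `ORG` entity, in which case the `label` is looked up from `doc.ents`:

```javascript
(async function() {
    const nlp = spacy.load('en_core_web_sm');
    const doc = await nlp('This is a sentence about Facebook.');
    const span = doc.slice(5, 6); // like doc[5:6] in Python
    console.log(span.text);  // 'Facebook'
    console.log(span.label); // 'ORG'
})();
```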
200 | 201 | #### Properties and Attributes 202 | 203 | | Name | Type | Description | 204 | | --- | --- | --- | 205 | | `text` | String | The `Span` text. | 206 | | `length` | Number | The number of tokens in the `Span`. | 207 | | `doc` | `Doc` | The parent `Doc`. | 208 | | `start` | Number | The `Span`'s start index in the parent document. | 209 | | `end` | Number | The `Span`'s end index in the parent document. | 210 | | `label` | String | The `Span`'s label, if available. | 211 | 212 | ### `Token` 213 | 214 | For token attributes that exist as string and ID versions (e.g. `Token.pos` vs. 215 | `Token.pos_`), only the string versions are exposed. 216 | 217 | #### Usage Examples 218 | 219 | ```javascript 220 | (async function() { 221 | const nlp = spacy.load('en_core_web_sm'); 222 | const doc = await nlp('Hello world'); 223 | 224 | for (let token of doc) { 225 | console.log(token.text, token.pos, token.isLower); 226 | } 227 | // Hello INTJ false 228 | // world NOUN true 229 | })(); 230 | ``` 231 | 232 | #### Properties and Attributes 233 | 234 | | Name | Type | Description | 235 | | --- | --- | --- | 236 | | `text` | String | The token text. | 237 | | `whitespace` | String | Whitespace character following the token, if available. | 238 | | `textWithWs` | String | Token text with trailing whitespace. | 239 | | `length` | Number | The length of the token text. | 240 | | `orth` | Number | ID of the token text. | 241 | | `doc` | `Doc` | The parent `Doc`. | 242 | | `head` | `Token` | The syntactic parent, or "governor", of this token. | 243 | | `i` | Number | Index of the token in the parent document. | 244 | | `entType` | String | The token's named entity type. | 245 | | `entIob` | String | IOB code of the token's named entity tag. | 246 | | `lemma` | String | The token's lemma, i.e. the base form. | 247 | | `norm` | String | The normalised form of the token. | 248 | | `lower` | String | The lowercase form of the token. | 249 | | `shape` | String | Transform of the token's string, to show orthographic features. For example, "Xxxx" or "dd". | 250 | | `prefix` | String | A length-N substring from the start of the token. Defaults to `N=1`. | 251 | | `suffix` | String | Length-N substring from the end of the token. Defaults to `N=3`. | 252 | | `pos` | String | The token's coarse-grained part-of-speech tag. | 253 | | `tag` | String | The token's fine-grained part-of-speech tag. | 254 | | `isAlpha` | Boolean | Does the token consist of alphabetic characters? | 255 | | `isAscii` | Boolean | Does the token consist of ASCII characters? | 256 | | `isDigit` | Boolean | Does the token consist of digits? | 257 | | `isLower` | Boolean | Is the token lowercase? | 258 | | `isUpper` | Boolean | Is the token uppercase? | 259 | | `isTitle` | Boolean | Is the token titlecase? | 260 | | `isPunct` | Boolean | Is the token punctuation? | 261 | | `isLeftPunct` | Boolean | Is the token left punctuation? | 262 | | `isRightPunct` | Boolean | Is the token right punctuation? | 263 | | `isSpace` | Boolean | Is the token a whitespace character? | 264 | | `isBracket` | Boolean | Is the token a bracket? | 265 | | `isCurrency` | Boolean | Is the token a currency symbol? | 266 | | `likeUrl` | Boolean | Does the token resemble a URL? | 267 | | `likeNum` | Boolean | Does the token resemble a number? | 268 | | `likeEmail` | Boolean | Does the token resemble an email address? | 269 | | `isOov` | Boolean | Is the token out-of-vocabulary? | 270 | | `isStop` | Boolean | Is the token a stop word? | 271 | | `isSentStart` | Boolean | Does the token start a sentence? | 272 | 273 | ## 🔔 Run Tests 274 | 275 | ### Python 276 | 277 | First, make sure you have `pytest` and all dependencies installed. You can then 278 | run the tests by pointing `pytest` to [`/tests`](/tests): 279 | 280 | ```bash 281 | python -m pytest tests 282 | ``` 283 | 284 | ### JavaScript 285 | 286 | This project uses [Jest](https://jestjs.io) for testing. Make sure you have 287 | all dependencies and development dependencies installed. You can then run: 288 | 289 | ```bash 290 | npm run test 291 | ``` 292 | 293 | To allow testing the code without a REST API providing the data, the test suite 294 | currently uses a [mock of the `Language` class](src/__mocks__), which returns 295 | static data located in [`tests/util.js`](tests/util.js).
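In practice, a test activates the mock via `jest.mock` and then works with the static `Doc` – a condensed sketch of [`tests/doc.test.js`](tests/doc.test.js):

```javascript
import spacy from '../src'
import { Doc } from '../src/tokens'
import { text } from './util'

jest.mock('../src/language') // activates src/__mocks__/language.js

const nlp = spacy.load('en_core_web_sm')

test('returns the mocked Doc', async () => {
    const doc = await nlp(text)
    expect(doc).toBeInstanceOf(Doc)
    expect(doc.text).toBe(text)
})
```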
296 | 297 | ## ✅ Ideas and Todos 298 | 299 | - [ ] Improve JavaScript tests. 300 | - [ ] Experiment with Node.js bindings to make Python integration easier. To be fair, running a separate API in an environment controlled by the user and *not* hiding it away a few levels deep is often much easier. But maybe there are some modern Node tricks that this project could benefit from. 301 | -------------------------------------------------------------------------------- /api/server.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | import hug 5 | from hug_middleware_cors import CORSMiddleware 6 | import waitress 7 | import spacy 8 | import plac 9 | 10 | 11 | MODELS = {} 12 | 13 | 14 | @plac.annotations( 15 | models=("Comma-separated list of spaCy models", "positional", None, str), 16 | host=("Host to serve API", "option", "ho", str), 17 | port=("Port to serve API", "option", "p", int), 18 | ) 19 | def main(models=None, host="0.0.0.0", port=8080): 20 | if not models: 21 | models = ["en_core_web_sm"] 22 | else: 23 | models = [m.strip() for m in models.split(",")] 24 | for model in models: 25 | load_model(model) 26 | # Serving Hug API 27 | app = hug.API(__name__) 28 | app.http.add_middleware(CORSMiddleware(app)) 29 | waitress.serve(__hug_wsgi__, host=host, port=port) 30 | 31 | 32 | def load_model(model): 33 | print("Loading model '{}'...".format(model)) 34 | MODELS[model] = spacy.load(model) 35 | 36 | 37 | def doc2json(doc: spacy.tokens.Doc, model: str): 38 | json_doc = { 39 | "text": doc.text, 40 | "text_with_ws": doc.text_with_ws, 41 | "cats": doc.cats, 42 | "is_tagged": doc.is_tagged, 43 | "is_parsed": doc.is_parsed, 44 | "is_nered": doc.is_nered, 45 | "is_sentenced": doc.is_sentenced, 46 | } 47 | ents = [ 48 | {"start": ent.start, "end": ent.end, "label": ent.label_} for ent in doc.ents 49 | ] 50 | if doc.is_sentenced: 51 | sents = [{"start": sent.start, "end": sent.end} for sent in doc.sents] 52 | else: 53 | sents = [] 54 | if doc.is_tagged and doc.is_parsed: 55 | noun_chunks = [ 56 | {"start": chunk.start, "end": chunk.end} for chunk in doc.noun_chunks 57 | ] 58 | else: 59 | noun_chunks = [] 60 | tokens = [ 61 | { 62 | "text": token.text, 63 | "text_with_ws": token.text_with_ws, 64 | "whitespace": token.whitespace_, 65 | "orth": token.orth, 66 | "i": token.i, 67 | "ent_type": token.ent_type_, 68 | "ent_iob": token.ent_iob_, 69 | "lemma": token.lemma_, 70 | "norm": token.norm_, 71 | "lower": token.lower_, 72 | "shape": token.shape_, 73 | "prefix": token.prefix_, 74 | "suffix": token.suffix_, 75 | "pos": token.pos_, 76 | "tag": token.tag_, 77 | "dep": token.dep_, 78 | "is_alpha": token.is_alpha, 79 | "is_ascii": token.is_ascii, 80 | "is_digit": token.is_digit, 81 | "is_lower": token.is_lower, 82 | "is_upper": token.is_upper, 83 | "is_title": token.is_title, 84 | "is_punct": token.is_punct, 85 | "is_left_punct": token.is_left_punct, 86 | "is_right_punct": token.is_right_punct, 87 | "is_space": token.is_space, 88 | "is_bracket": token.is_bracket, 89 | "is_currency": token.is_currency, 90 | "like_url": token.like_url, 91 | "like_num": token.like_num, 92 | "like_email": token.like_email, 93 | "is_oov": token.is_oov, 94 | "is_stop": token.is_stop, 95 | "is_sent_start": token.is_sent_start, 96 | "head": token.head.i, 97 | } 98 | for token in doc 99 | ] 100 | return { 101 | "model": model, 102 | "doc": json_doc, 103 | "ents": ents, 104 | "sents": sents, 105 | "noun_chunks": noun_chunks, 106 | "tokens": tokens, 107 | } 108 | 109 | 110 | @hug.post("/parse") 111 | def parse(model: str, text: str): 112 | nlp = MODELS[model] 113 | doc = nlp(text) 114 | return doc2json(doc, model) 115 | 116 | 117 | @hug.post("/similarity") 118 | def similarity(model: str, text1: str, text2: str): 119 | # Doc objects can be created on the fly here, because the similarity result is the same either way 120 | nlp = MODELS[model] 121 |
doc1 = nlp(text1) 122 | doc2 = nlp(text2) 123 | return {"similarity": doc1.similarity(doc2)} 124 | 125 | 126 | if __name__ == "__main__": 127 | plac.call(main) 128 | -------------------------------------------------------------------------------- /dist/index.js: -------------------------------------------------------------------------------- 1 | parcelRequire=function(e,r,n,t){var i="function"==typeof parcelRequire&&parcelRequire,o="function"==typeof require&&require;function u(n,t){if(!r[n]){if(!e[n]){var f="function"==typeof parcelRequire&&parcelRequire;if(!t&&f)return f(n,!0);if(i)return i(n,!0);if(o&&"string"==typeof n)return o(n);var c=new Error("Cannot find module '"+n+"'");throw c.code="MODULE_NOT_FOUND",c}p.resolve=function(r){return e[n][1][r]||r},p.cache={};var l=r[n]=new u.Module(n);e[n][0].call(l.exports,p,l,l.exports,this)}return r[n].exports;function p(e){return u(p.resolve(e))}}u.isParcelRequire=!0,u.Module=function(e){this.id=e,this.bundle=u,this.exports={}},u.modules=e,u.cache=r,u.parent=i,u.register=function(r,n){e[r]=[function(e,r){r.exports=n},{}]};for(var f=0;fnew e(this,t,s[i],this._tokens[i]));for(let e=0;enew i(this,t,s,e))}get sents(){return this._sents.map(({start:t,end:s})=>new i(this,t,s))}get nounChunks(){return this._chunks.map(({start:t,end:s})=>new i(this,t,s))}*[Symbol.iterator](){let t=0;for(;void 0!==this.tokens[t];)yield this.tokens[t],++t}toString(){return this.text}map(t){let s=[];for(let i of this)s.push(t(i));return s}slice(t,s){return new i(this,t,s)}async similarity(s){return await(0,t.getSimilarity)(this._api,this._model,this.text,s.text)}}exports.Doc=s;class i{constructor(t,s,i,e){this.doc=t,this.start=s,this.end=i,this._label=e,this.tokens=[...this.doc].slice(this.start,this.end);for(let h=0;he),spaces:o.tokens.map(({whitespace:e})=>Boolean(e)),attrs:Object.assign({},o,{api:a})}}}exports.default=s; 7 | },{"./tokens":"nJFl","./util":"Y/Oq"}],"Focm":[function(require,module,exports) { 8 | "use strict";Object.defineProperty(exports,"__esModule",{value:!0}),Object.defineProperty(exports,"Doc",{enumerable:!0,get:function(){return r.Doc}}),Object.defineProperty(exports,"Token",{enumerable:!0,get:function(){return r.Token}}),Object.defineProperty(exports,"Span",{enumerable:!0,get:function(){return r.Span}}),exports.default=void 0;var e=t(require("./language")),r=require("./tokens");function t(e){return e&&e.__esModule?e:{default:e}}var n={load:function(r,t){return new e.default(r,t)}};exports.default=n; 9 | },{"./language":"hk5u","./tokens":"nJFl"}]},{},["Focm"], "spacy") -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "spacy", 3 | "version": "0.0.4", 4 | "description": "JavaScript API for spaCy with Python REST API", 5 | "main": "dist/index.js", 6 | "scripts": { 7 | "test": "jest", 8 | "build": "parcel build src/index.js --target node --no-source-maps --global spacy", 9 | "dev": "parcel src/index.js --target node --no-source-maps --global spacy", 10 | "package": "npm run build && npm pack" 11 | }, 12 | "author": "Ines Montani", 13 | "license": "MIT", 14 | "homepage": "https://github.com/ines/spacy-js#readme", 15 | "repository": { 16 | "type": "git", 17 | "url": "https://github.com/ines/spacy-js.git" 18 | }, 19 | "keywords": [ 20 | "spacy", 21 | "nlp", 22 | "natural language processing", 23 | "machine learning", 24 | "artificial intelligence" 25 | ], 26 | "dependencies": { 27 | "node-fetch": "^2.1.2" 28 | }, 
29 | "devDependencies": { 30 | "@babel/cli": "^7.1.2", 31 | "@babel/core": "^7.1.2", 32 | "@babel/preset-env": "^7.1.0", 33 | "babel-core": "^7.0.0-bridge.0", 34 | "babel-plugin-add-module-exports": "^1.0.0", 35 | "jest": "^23.6.0", 36 | "parcel-bundler": "^1.10.3", 37 | "regenerator-runtime": "^0.12.1" 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | hug>=2.4.0,<3.0.0 2 | hug-middleware-cors>=1.0.0,<2.0.0 3 | spacy>=2.1.0,<2.2.0 4 | waitress>=1.0.2,<2.0.0 5 | plac>=0.9.6,<1.0.0 6 | 7 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0 8 | -------------------------------------------------------------------------------- /src/__mocks__/language.js: -------------------------------------------------------------------------------- 1 | import { Doc } from '../tokens'; 2 | import { words, spaces, attrs } from '../../tests/util'; 3 | 4 | export default class Language { 5 | constructor(model, api) { 6 | return async function(text) { 7 | return new Doc(words, spaces, attrs); 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | import Language from './language' 2 | export { Doc, Token, Span } from './tokens' 3 | 4 | export default { 5 | load: function(model, api) { 6 | return new Language(model, api); 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /src/language.js: -------------------------------------------------------------------------------- 1 | import { Doc } from './tokens'; 2 | import { makeRequest } from './util'; 3 | 4 | export default class Language { 5 | constructor(model, api = 'http://localhost:8080') { 6 | const self = this; 7 | return async function(text) { 8 | const { words, spaces, attrs } = await self.makeDoc(model, text, api); 9 | return new Doc(words, spaces, attrs); 10 | } 11 | } 12 | 13 | async makeDoc(model, text, api) { 14 | const json = await makeRequest(api, 'parse', { model, text }) 15 | const words = json.tokens.map(({ text }) => text); 16 | const spaces = json.tokens.map(({ whitespace }) => Boolean(whitespace)); 17 | return { words, spaces, attrs: Object.assign({}, json, { api }) } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/tokens.js: -------------------------------------------------------------------------------- 1 | import { getSimilarity } from './util' 2 | 3 | export class Doc { 4 | constructor(words, spaces, attrs = {}) { 5 | this._doc = attrs.doc || {} 6 | this._tokens = attrs.tokens || [] 7 | this._ents = attrs.ents || [] 8 | this._sents = attrs.sents || [] 9 | this._chunks = attrs.noun_chunks || [] 10 | this._model = attrs.model 11 | this._api = attrs.api 12 | this.tokens = words.map((word, i) => new Token(this, word, spaces[i], this._tokens[i])) 13 | for (let i = 0; i < this.tokens.length; i++) { 14 | this[i] = this.tokens[i] 15 | } 16 | this.cats = this._doc.cats 17 | this.isTagged = this._doc.is_tagged 18 | this.isParsed = this._doc.is_parsed 19 | this.isNered = this._doc.is_nered 20 | this.isSentenced = this._doc.is_sentenced 21 | } 22 | 23 | inspect() { 24 | return this.text 25 | } 26 | 27 | get text() { 28 | let text = '' 29 | for (let token of this.tokens) { 30 | 
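// rebuild the text by joining each token's text with its trailing whitespace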
text += token.textWithWs 31 | } 32 | return text 33 | } 34 | 35 | get length() { 36 | return this.tokens.length 37 | } 38 | 39 | get ents() { 40 | return this._ents.map(({ start, end, label }) => new Span(this, start, end, label)) 41 | } 42 | 43 | get sents() { 44 | return this._sents.map(({ start, end }) => new Span(this, start, end)) 45 | } 46 | 47 | get nounChunks() { 48 | return this._chunks.map(({ start, end }) => new Span(this, start, end)) 49 | } 50 | 51 | *[Symbol.iterator]() { 52 | let i = 0 53 | while (this.tokens[i] !== undefined) { 54 | yield this.tokens[i] 55 | ++i 56 | } 57 | } 58 | 59 | toString() { 60 | return this.text 61 | } 62 | 63 | map(func) { 64 | let tokens = [] 65 | for (let token of this) { 66 | tokens.push(func(token)) 67 | } 68 | return tokens 69 | } 70 | 71 | slice(start, end) { 72 | return new Span(this, start, end) 73 | } 74 | 75 | async similarity(obj) { 76 | return await getSimilarity(this._api, this._model, this.text, obj.text) 77 | } 78 | } 79 | 80 | export class Span { 81 | constructor(doc, start, end, label) { 82 | this.doc = doc 83 | this.start = start 84 | this.end = end 85 | this._label = label 86 | this.tokens = [...this.doc].slice(this.start, this.end) 87 | for (let i = 0; i < this.tokens.length; i++) { 88 | this[i] = this.tokens[i] 89 | } 90 | } 91 | 92 | get text() { 93 | let text = '' 94 | for (let token of this.tokens) { 95 | text += token.textWithWs 96 | } 97 | return text.trim() 98 | } 99 | 100 | get length() { 101 | return this.tokens.length 102 | } 103 | 104 | get label() { 105 | if (this._label) { 106 | return this._label 107 | } 108 | // Manually check if span is an entity 109 | for (let ent of this.doc.ents) { 110 | if (ent.start === this.start && ent.end === this.end) { 111 | return ent.label 112 | } 113 | } 114 | } 115 | 116 | *[Symbol.iterator]() { 117 | let i = 0 118 | while (this.tokens[i] !== undefined) { 119 | yield this.tokens[i] 120 | ++i 121 | } 122 | } 123 | 124 | slice(start, end) { 125 | return new Span(this, start, end) 126 | } 127 | 128 | toString() { 129 | return this.text 130 | } 131 | 132 | inspect() { 133 | return this.text 134 | } 135 | 136 | async similarity(obj) { 137 | return await getSimilarity(this.doc._api, this.doc._model, this.text, obj.text) 138 | } 139 | } 140 | 141 | export class Token { 142 | constructor(doc, word, space, attrs = {}) { 143 | this.doc = doc 144 | this.whitespace = space ?
' ' : '' 145 | this.text = word 146 | this.textWithWs = this.text + this.whitespace 147 | this.orth = attrs.orth 148 | this.i = attrs.i 149 | this.entType = attrs.ent_type 150 | this.entIob = attrs.ent_iob 151 | this.lemma = attrs.lemma 152 | this.norm = attrs.norm 153 | this.lower = attrs.lower 154 | this.shape = attrs.shape; this.prefix = attrs.prefix 155 | this.suffix = attrs.suffix 156 | this.pos = attrs.pos 157 | this.tag = attrs.tag 158 | this.dep = attrs.dep 159 | this.isAlpha = attrs.is_alpha 160 | this.isAscii = attrs.is_ascii 161 | this.isDigit = attrs.is_digit 162 | this.isLower = attrs.is_lower 163 | this.isUpper = attrs.is_upper 164 | this.isTitle = attrs.is_title 165 | this.isPunct = attrs.is_punct 166 | this.isLeftPunct = attrs.is_left_punct 167 | this.isRightPunct = attrs.is_right_punct 168 | this.isSpace = attrs.is_space 169 | this.isBracket = attrs.is_bracket 170 | this.isCurrency = attrs.is_currency 171 | this.likeUrl = attrs.like_url 172 | this.likeNum = attrs.like_num 173 | this.likeEmail = attrs.like_email 174 | this.isOov = attrs.is_oov 175 | this.isStop = attrs.is_stop 176 | this.isSentStart = attrs.is_sent_start 177 | 178 | this._head = attrs.head 179 | } 180 | 181 | get length() { 182 | return this.text.length 183 | } 184 | 185 | get head() { 186 | return this.doc[this._head] 187 | } 188 | 189 | toString() { 190 | return this.text 191 | } 192 | 193 | inspect() { 194 | return this.text 195 | } 196 | 197 | async similarity(obj) { 198 | return await getSimilarity(this.doc._api, this.doc._model, this.text, obj.text) 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /src/util.js: -------------------------------------------------------------------------------- 1 | import fetch from 'node-fetch'; 2 | import url from 'url'; 3 | 4 | export async function makeRequest(api, endpoint, opts, method = 'POST') { 5 | const headers = { 'Accept': 'application/json', 'Content-Type': 'application/json' }; 6 | const credentials = 'same-origin'; 7 | const body = JSON.stringify(opts); 8 | const apiUrl = url.resolve(api, endpoint); 9 | try { 10 | const res = await fetch(apiUrl, { method, headers, credentials, body }); 11 | return await res.json(); 12 | } 13 | catch(err) { 14 | console.error(`Error fetching data from API: ${apiUrl}`, err) 15 | } 16 | } 17 | 18 | export async function getSimilarity(api, model, text1, text2) { 19 | const json = await makeRequest(api, '/similarity', { model, text1, text2 }); 20 | return json.similarity; 21 | } 22 | -------------------------------------------------------------------------------- /tests/doc.test.js: -------------------------------------------------------------------------------- 1 | import spacy from '../src' 2 | import { Doc, Token, Span } from '../src/tokens' 3 | import { text, words, spaces, attrs } from './util' 4 | 5 | jest.mock('../src/language') 6 | 7 | const nlp = spacy.load('en_core_web_sm') 8 | 9 | test('allows manual construction', () => { 10 | const doc = new Doc(words, spaces, attrs) 11 | expect(doc).toBeInstanceOf(Doc) 12 | }) 13 | 14 | test('has Doc attributes', async () => { 15 | const doc = await nlp(text) 16 | expect(doc.text).toBe(text) 17 | expect(doc.toString()).toBe(text) 18 | expect(doc.length).toBe(10) 19 | expect(doc.cats).toEqual({}) 20 | expect(doc.isTagged).toBe(true) 21 | expect(doc.isParsed).toBe(true) 22 | expect(doc.isSentenced).toBe(true) 23 | }) 24 | 25 | test('allows token indexing', async () => { 26 | const doc = await nlp(text) 27 | for (let i = 0; i <
doc.length; i++) { 28 | expect(doc[i]).toBeInstanceOf(Token) 29 | } 30 | expect(doc[doc.length + 1]).toBeUndefined() 31 | }) 32 | 33 | test('allows token iteration', async () => { 34 | const doc = await nlp(text) 35 | for (let token of doc) { 36 | expect(token).toBeInstanceOf(Token) 37 | } 38 | }) 39 | 40 | test('has named entities (doc.ents)', async () => { 41 | const doc = await nlp(text) 42 | expect(doc.ents).toBeInstanceOf(Array) 43 | expect(doc.ents).toEqual(expect.arrayContaining([expect.any(Span)])) 44 | expect(doc.ents.length).toBe(1) 45 | const entity = doc.ents[0] 46 | expect(entity).toBeInstanceOf(Span) 47 | expect(entity.text).toBe('Facebook') 48 | expect(entity.start).toBe(8) 49 | expect(entity.end).toBe(9) 50 | expect(entity.label).toBe('ORG') 51 | }) 52 | 53 | test('has sentences (doc.sents)', async () => { 54 | const doc = await nlp(text) 55 | expect(doc.sents).toBeInstanceOf(Array) 56 | expect(doc.sents).toEqual(expect.arrayContaining([expect.any(Span)])) 57 | expect(doc.sents.length).toBe(2) 58 | const sentence = doc.sents[0] 59 | expect(sentence).toBeInstanceOf(Span) 60 | expect(sentence.text).toBe('Hello world!') 61 | expect(sentence.start).toBe(0) 62 | expect(sentence.end).toBe(3) 63 | }) 64 | 65 | test('has noun chunks (doc.noun_chunks)', async () => { 66 | const doc = await nlp(text) 67 | expect(doc.nounChunks).toBeInstanceOf(Array) 68 | expect(doc.nounChunks).toEqual(expect.arrayContaining([expect.any(Span)])) 69 | expect(doc.nounChunks.length).toBe(3) 70 | const chunk = doc.nounChunks[0] 71 | expect(chunk).toBeInstanceOf(Span) 72 | expect(chunk.text).toBe('Hello world') 73 | expect(chunk.start).toBe(0) 74 | expect(chunk.end).toBe(2) 75 | }) 76 | -------------------------------------------------------------------------------- /tests/language.test.js: -------------------------------------------------------------------------------- 1 | import spacy from '../src' 2 | 3 | jest.mock('../src/language') 4 | 5 | test('creates new nlp object', () => { 6 | const nlp = spacy.load('en_core_web_sm') 7 | expect(nlp).toEqual(expect.any(Function)) 8 | }) 9 | -------------------------------------------------------------------------------- /tests/span.test.js: -------------------------------------------------------------------------------- 1 | import spacy from '../src' 2 | import { Doc, Token, Span } from '../src/tokens' 3 | import { text, words, spaces, attrs } from './util' 4 | 5 | jest.mock('../src/language') 6 | 7 | const nlp = spacy.load('en_core_web_sm') 8 | 9 | test('allows manual construction', async () => { 10 | const doc = await nlp(text) 11 | const span = new Span(doc, 6, 9) 12 | expect(span).toBeInstanceOf(Span) 13 | expect(span.text).toBe('sentence about Facebook') 14 | }) 15 | 16 | test('allows being sliced off Doc', async () => { 17 | const doc = await nlp(text) 18 | const span = doc.slice(6, 9) 19 | expect(span).toBeInstanceOf(Span) 20 | expect(span.text).toBe('sentence about Facebook') 21 | }) 22 | 23 | test('has Span attributes', async () => { 24 | const doc = await nlp(text) 25 | const span = doc.slice(6, 9) 26 | expect(span.toString()).toBe('sentence about Facebook') 27 | expect(span.length).toBe(3) 28 | expect(span.start).toBe(6) 29 | expect(span.end).toBe(9) 30 | expect(span.label).toBeUndefined() 31 | }) 32 | 33 | test('has parent Doc', async () => { 34 | const doc = await nlp(text) 35 | const span = doc.slice(6, 9) 36 | expect(span.doc).toBeInstanceOf(Doc) 37 | expect(span.doc).toBe(doc) 38 | }) 39 | 40 | test('has entity label', async () => { 41 | const doc 
= await nlp(text) 42 | const span = doc.slice(8, 9) 43 | expect(span.toString()).toBe('Facebook') 44 | expect(span.label).toBe('ORG') 45 | }) 46 | 47 | test('allows token indexing', async () => { 48 | const doc = await nlp(text) 49 | const span = doc.slice(6, 9) 50 | for (let i = 0; i < span.length; i++) { 51 | expect(span[i]).toBeInstanceOf(Token) 52 | } 53 | expect(span[span.length + 1]).toBeUndefined() 54 | }) 55 | 56 | test('allows token iteration', async () => { 57 | const doc = await nlp(text) 58 | const span = doc.slice(6, 9) 59 | for (let token of span) { 60 | expect(token).toBeInstanceOf(Token) 61 | } 62 | }) 63 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | import pytest 5 | import spacy 6 | import json 7 | 8 | from api.server import parse, doc2json, load_model 9 | 10 | 11 | @pytest.fixture(scope="session") 12 | def model(): 13 | return "en_core_web_sm" 14 | 15 | 16 | @pytest.fixture(scope="session") 17 | def text(): 18 | return "This is a sentence about Facebook. This is another one." 19 | 20 | 21 | @pytest.fixture(scope="session") 22 | def nlp(model): 23 | return spacy.load(model) 24 | 25 | 26 | @pytest.fixture(scope="session") 27 | def doc(nlp, text): 28 | return nlp(text) 29 | 30 | 31 | def test_server_parse(model, text, doc): 32 | load_model(model) 33 | json_doc = parse(model, text) 34 | direct_json_doc = doc2json(doc, model) 35 | assert json.dumps(json_doc, sort_keys=True) == json.dumps( 36 | direct_json_doc, sort_keys=True 37 | ) 38 | 39 | 40 | def test_doc2json_doc_tokens(doc, model): 41 | data = doc2json(doc, model) 42 | assert data["model"] == model 43 | assert data["doc"]["text"] == doc.text 44 | assert data["doc"]["text_with_ws"] == doc.text_with_ws 45 | assert data["doc"]["is_tagged"] 46 | assert data["doc"]["is_parsed"] 47 | assert data["doc"]["is_sentenced"] 48 | assert len(data["tokens"]) == len(doc) 49 | assert data["tokens"][0]["text"] == doc[0].text 50 | assert data["tokens"][0]["head"] == doc[0].head.i 51 | 52 | 53 | def test_doc2json_doc_ents(doc, model): 54 | data = doc2json(doc, model) 55 | ents = list(doc.ents) 56 | assert "ents" in data 57 | assert len(data["ents"]) == len(ents) 58 | assert len(data["ents"]) >= 1 59 | assert data["ents"][0]["start"] == ents[0].start 60 | assert data["ents"][0]["end"] == ents[0].end 61 | assert data["ents"][0]["label"] == ents[0].label_ 62 | 63 | 64 | def test_doc2json_doc_sents(doc, model): 65 | data = doc2json(doc, model) 66 | sents = list(doc.sents) 67 | assert "sents" in data 68 | assert len(data["sents"]) == len(sents) 69 | assert len(data["sents"]) >= 1 70 | assert data["sents"][0]["start"] == sents[0].start 71 | assert data["sents"][0]["end"] == sents[0].end 72 | 73 | 74 | def test_doc2json_doc_noun_chunks(doc, model): 75 | data = doc2json(doc, model) 76 | chunks = list(doc.noun_chunks) 77 | assert "noun_chunks" in data 78 | assert len(data["noun_chunks"]) == len(chunks) 79 | assert len(data["noun_chunks"]) >= 1 80 | assert data["noun_chunks"][0]["start"] == chunks[0].start 81 | assert data["noun_chunks"][0]["end"] == chunks[0].end 82 | -------------------------------------------------------------------------------- /tests/token.test.js: -------------------------------------------------------------------------------- 1 | import spacy from '../src' 2 | import { Doc, Token } from '../src/tokens' 3 | import { text, 
words, spaces, attrs } from './util' 4 | 5 | jest.mock('../src/language') 6 | 7 | const nlp = spacy.load('en_core_web_sm') 8 | 9 | test('allows manual construction', async () => { 10 | const doc = await nlp(text) 11 | const token = new Token(doc, words[7], spaces[7], attrs.tokens[7]) 12 | expect(token).toBeInstanceOf(Token) 13 | expect(token.text).toBe('about') 14 | }) 15 | 16 | test('allows indexing from Doc', async () => { 17 | const doc = await nlp(text) 18 | const token = doc[7] 19 | expect(token.text).toBe('about') 20 | }) 21 | 22 | test('has Token attributes', async () => { 23 | const doc = await nlp(text) 24 | const token = doc[7] 25 | expect(token.length).toBe(5) 26 | expect(token.toString()).toBe('about') 27 | expect(token.text).toBe('about') 28 | expect(token.textWithWs).toBe('about ') 29 | expect(token.whitespace).toBe(' ') 30 | expect(token.orth).toBe(942632335873952620) 31 | expect(token.i).toBe(7) 32 | expect(token.entType).toBe('') 33 | expect(token.entIob).toBe('O') 34 | expect(token.lemma).toBe('about') 35 | expect(token.norm).toBe('about') 36 | expect(token.lower).toBe('about') 37 | expect(token.shape).toBe('xxxx') 38 | expect(token.prefix).toBe('a') 39 | expect(token.suffix).toBe('out') 40 | expect(token.pos).toBe('ADP') 41 | expect(token.tag).toBe('IN') 42 | expect(token.dep).toBe('prep') 43 | expect(token.isAlpha).toBe(true) 44 | expect(token.isAscii).toBe(true) 45 | expect(token.isDigit).toBe(false) 46 | expect(token.isLower).toBe(true) 47 | expect(token.isUpper).toBe(false) 48 | expect(token.isTitle).toBe(false) 49 | expect(token.isPunct).toBe(false) 50 | expect(token.isLeftPunct).toBe(false) 51 | expect(token.isRightPunct).toBe(false) 52 | expect(token.isSpace).toBe(false) 53 | expect(token.isBracket).toBe(false) 54 | expect(token.isCurrency).toBe(false) 55 | expect(token.likeUrl).toBe(false) 56 | expect(token.likeNum).toBe(false) 57 | expect(token.likeEmail).toBe(false) 58 | expect(token.isOov).toBe(true) 59 | expect(token.isStop).toBe(true) 60 | expect(token.isSentStart).toBe(null) 61 | }) 62 | 63 | test('has parent Doc', async () => { 64 | const doc = await nlp(text) 65 | const token = doc[7] 66 | expect(token.doc).toBeInstanceOf(Doc) 67 | expect(token.doc).toBe(doc) 68 | }) 69 | 70 | test('has head', async () => { 71 | const doc = await nlp(text) 72 | const head = doc[7].head 73 | expect(head).toBeInstanceOf(Token) 74 | expect(head.i).toBe(6) 75 | expect(head.text).toBe('sentence') 76 | }) 77 | -------------------------------------------------------------------------------- /tests/util.js: -------------------------------------------------------------------------------- 1 | export const text = 'Hello world! This is a sentence about Facebook.' 2 | export const words = [ 3 | 'Hello', 4 | 'world', 5 | '!', 6 | 'This', 7 | 'is', 8 | 'a', 9 | 'sentence', 10 | 'about', 11 | 'Facebook', 12 | '.' 13 | ] 14 | export const spaces = [true, false, true, true, true, true, true, true, false, false] 15 | export const attrs = { 16 | model: 'en_core_web_sm', 17 | doc: { 18 | text: 'Hello world! This is a sentence about Facebook.', 19 | text_with_ws: 'Hello world! 
This is a sentence about Facebook.', 20 | cats: {}, 21 | is_tagged: true, 22 | is_parsed: true, 23 | is_sentenced: true 24 | }, 25 | ents: [ 26 | { 27 | start: 8, 28 | end: 9, 29 | label: 'ORG' 30 | } 31 | ], 32 | sents: [ 33 | { 34 | start: 0, 35 | end: 3 36 | }, 37 | { 38 | start: 3, 39 | end: 10 40 | } 41 | ], 42 | noun_chunks: [ 43 | { 44 | start: 0, 45 | end: 2 46 | }, 47 | { 48 | start: 5, 49 | end: 7 50 | }, 51 | { 52 | start: 8, 53 | end: 9 54 | } 55 | ], 56 | tokens: [ 57 | { 58 | text: 'Hello', 59 | text_with_ws: 'Hello ', 60 | whitespace: ' ', 61 | orth: 15777305708150031551, 62 | i: 0, 63 | ent_type: '', 64 | ent_iob: 'O', 65 | lemma: 'hello', 66 | norm: 'hello', 67 | lower: 'hello', 68 | shape: 'Xxxxx', 69 | prefix: 'H', 70 | suffix: 'llo', 71 | pos: 'INTJ', 72 | tag: 'UH', 73 | dep: 'intj', 74 | is_alpha: true, 75 | is_ascii: true, 76 | is_digit: false, 77 | is_lower: false, 78 | is_upper: false, 79 | is_title: true, 80 | is_punct: false, 81 | is_left_punct: false, 82 | is_right_punct: false, 83 | is_space: false, 84 | is_bracket: false, 85 | is_currency: false, 86 | like_url: false, 87 | like_num: false, 88 | like_email: false, 89 | is_oov: true, 90 | is_stop: false, 91 | is_sent_start: null, 92 | head: 1 93 | }, 94 | { 95 | text: 'world', 96 | text_with_ws: 'world', 97 | whitespace: '', 98 | orth: 1703489418272052182, 99 | i: 1, 100 | ent_type: '', 101 | ent_iob: 'O', 102 | lemma: 'world', 103 | norm: 'world', 104 | lower: 'world', 105 | shape: 'xxxx', 106 | prefix: 'w', 107 | suffix: 'rld', 108 | pos: 'NOUN', 109 | tag: 'NN', 110 | dep: 'ROOT', 111 | is_alpha: true, 112 | is_ascii: true, 113 | is_digit: false, 114 | is_lower: true, 115 | is_upper: false, 116 | is_title: false, 117 | is_punct: false, 118 | is_left_punct: false, 119 | is_right_punct: false, 120 | is_space: false, 121 | is_bracket: false, 122 | is_currency: false, 123 | like_url: false, 124 | like_num: false, 125 | like_email: false, 126 | is_oov: true, 127 | is_stop: false, 128 | is_sent_start: null, 129 | head: 1 130 | }, 131 | { 132 | text: '!', 133 | text_with_ws: '! 
', 134 | whitespace: ' ', 135 | orth: 17494803046312582752, 136 | i: 2, 137 | ent_type: '', 138 | ent_iob: 'O', 139 | lemma: '!', 140 | norm: '!', 141 | lower: '!', 142 | shape: '!', 143 | prefix: '!', 144 | suffix: '!', 145 | pos: 'PUNCT', 146 | tag: '.', 147 | dep: 'punct', 148 | is_alpha: false, 149 | is_ascii: true, 150 | is_digit: false, 151 | is_lower: false, 152 | is_upper: false, 153 | is_title: false, 154 | is_punct: true, 155 | is_left_punct: false, 156 | is_right_punct: false, 157 | is_space: false, 158 | is_bracket: false, 159 | is_currency: false, 160 | like_url: false, 161 | like_num: false, 162 | like_email: false, 163 | is_oov: true, 164 | is_stop: false, 165 | is_sent_start: null, 166 | head: 1 167 | }, 168 | { 169 | text: 'This', 170 | text_with_ws: 'This ', 171 | whitespace: ' ', 172 | orth: 12943039165150086467, 173 | i: 3, 174 | ent_type: '', 175 | ent_iob: 'O', 176 | lemma: 'this', 177 | norm: 'this', 178 | lower: 'this', 179 | shape: 'Xxxx', 180 | prefix: 'T', 181 | suffix: 'his', 182 | pos: 'DET', 183 | tag: 'DT', 184 | dep: 'nsubj', 185 | is_alpha: true, 186 | is_ascii: true, 187 | is_digit: false, 188 | is_lower: false, 189 | is_upper: false, 190 | is_title: true, 191 | is_punct: false, 192 | is_left_punct: false, 193 | is_right_punct: false, 194 | is_space: false, 195 | is_bracket: false, 196 | is_currency: false, 197 | like_url: false, 198 | like_num: false, 199 | like_email: false, 200 | is_oov: true, 201 | is_stop: false, 202 | is_sent_start: true, 203 | head: 4 204 | }, 205 | { 206 | text: 'is', 207 | text_with_ws: 'is ', 208 | whitespace: ' ', 209 | orth: 3411606890003347522, 210 | i: 4, 211 | ent_type: '', 212 | ent_iob: 'O', 213 | lemma: 'be', 214 | norm: 'is', 215 | lower: 'is', 216 | shape: 'xx', 217 | prefix: 'i', 218 | suffix: 'is', 219 | pos: 'VERB', 220 | tag: 'VBZ', 221 | dep: 'ROOT', 222 | is_alpha: true, 223 | is_ascii: true, 224 | is_digit: false, 225 | is_lower: true, 226 | is_upper: false, 227 | is_title: false, 228 | is_punct: false, 229 | is_left_punct: false, 230 | is_right_punct: false, 231 | is_space: false, 232 | is_bracket: false, 233 | is_currency: false, 234 | like_url: false, 235 | like_num: false, 236 | like_email: false, 237 | is_oov: true, 238 | is_stop: true, 239 | is_sent_start: null, 240 | head: 4 241 | }, 242 | { 243 | text: 'a', 244 | text_with_ws: 'a ', 245 | whitespace: ' ', 246 | orth: 11901859001352538922, 247 | i: 5, 248 | ent_type: '', 249 | ent_iob: 'O', 250 | lemma: 'a', 251 | norm: 'gonna', 252 | lower: 'a', 253 | shape: 'x', 254 | prefix: 'a', 255 | suffix: 'a', 256 | pos: 'DET', 257 | tag: 'DT', 258 | dep: 'det', 259 | is_alpha: true, 260 | is_ascii: true, 261 | is_digit: false, 262 | is_lower: true, 263 | is_upper: false, 264 | is_title: false, 265 | is_punct: false, 266 | is_left_punct: false, 267 | is_right_punct: false, 268 | is_space: false, 269 | is_bracket: false, 270 | is_currency: false, 271 | like_url: false, 272 | like_num: false, 273 | like_email: false, 274 | is_oov: true, 275 | is_stop: true, 276 | is_sent_start: null, 277 | head: 6 278 | }, 279 | { 280 | text: 'sentence', 281 | text_with_ws: 'sentence ', 282 | whitespace: ' ', 283 | orth: 18108853898452662235, 284 | i: 6, 285 | ent_type: '', 286 | ent_iob: 'O', 287 | lemma: 'sentence', 288 | norm: 'sentence', 289 | lower: 'sentence', 290 | shape: 'xxxx', 291 | prefix: 's', 292 | suffix: 'nce', 293 | pos: 'NOUN', 294 | tag: 'NN', 295 | dep: 'attr', 296 | is_alpha: true, 297 | is_ascii: true, 298 | is_digit: false, 299 | is_lower: true, 300 | is_upper: 
false, 301 | is_title: false, 302 | is_punct: false, 303 | is_left_punct: false, 304 | is_right_punct: false, 305 | is_space: false, 306 | is_bracket: false, 307 | is_currency: false, 308 | like_url: false, 309 | like_num: false, 310 | like_email: false, 311 | is_oov: true, 312 | is_stop: false, 313 | is_sent_start: null, 314 | head: 4 315 | }, 316 | { 317 | text: 'about', 318 | text_with_ws: 'about ', 319 | whitespace: ' ', 320 | orth: 942632335873952620, 321 | i: 7, 322 | ent_type: '', 323 | ent_iob: 'O', 324 | lemma: 'about', 325 | norm: 'about', 326 | lower: 'about', 327 | shape: 'xxxx', 328 | prefix: 'a', 329 | suffix: 'out', 330 | pos: 'ADP', 331 | tag: 'IN', 332 | dep: 'prep', 333 | is_alpha: true, 334 | is_ascii: true, 335 | is_digit: false, 336 | is_lower: true, 337 | is_upper: false, 338 | is_title: false, 339 | is_punct: false, 340 | is_left_punct: false, 341 | is_right_punct: false, 342 | is_space: false, 343 | is_bracket: false, 344 | is_currency: false, 345 | like_url: false, 346 | like_num: false, 347 | like_email: false, 348 | is_oov: true, 349 | is_stop: true, 350 | is_sent_start: null, 351 | head: 6 352 | }, 353 | { 354 | text: 'Facebook', 355 | text_with_ws: 'Facebook', 356 | whitespace: '', 357 | orth: 8081970590932371665, 358 | i: 8, 359 | ent_type: 'ORG', 360 | ent_iob: 'B', 361 | lemma: 'facebook', 362 | norm: 'facebook', 363 | lower: 'facebook', 364 | shape: 'Xxxxx', 365 | prefix: 'F', 366 | suffix: 'ook', 367 | pos: 'PROPN', 368 | tag: 'NNP', 369 | dep: 'pobj', 370 | is_alpha: true, 371 | is_ascii: true, 372 | is_digit: false, 373 | is_lower: false, 374 | is_upper: false, 375 | is_title: true, 376 | is_punct: false, 377 | is_left_punct: false, 378 | is_right_punct: false, 379 | is_space: false, 380 | is_bracket: false, 381 | is_currency: false, 382 | like_url: false, 383 | like_num: false, 384 | like_email: false, 385 | is_oov: true, 386 | is_stop: false, 387 | is_sent_start: null, 388 | head: 7 389 | }, 390 | { 391 | text: '.', 392 | text_with_ws: '.', 393 | whitespace: '', 394 | orth: 12646065887601541794, 395 | i: 9, 396 | ent_type: '', 397 | ent_iob: 'O', 398 | lemma: '.', 399 | norm: '.', 400 | lower: '.', 401 | shape: '.', 402 | prefix: '.', 403 | suffix: '.', 404 | pos: 'PUNCT', 405 | tag: '.', 406 | dep: 'punct', 407 | is_alpha: false, 408 | is_ascii: true, 409 | is_digit: false, 410 | is_lower: false, 411 | is_upper: false, 412 | is_title: false, 413 | is_punct: true, 414 | is_left_punct: false, 415 | is_right_punct: false, 416 | is_space: false, 417 | is_bracket: false, 418 | is_currency: false, 419 | like_url: false, 420 | like_num: false, 421 | like_email: false, 422 | is_oov: true, 423 | is_stop: false, 424 | is_sent_start: null, 425 | head: 4 426 | } 427 | ] 428 | } 429 | --------------------------------------------------------------------------------