├── .babelrc
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── api
│   └── server.py
├── dist
│   └── index.js
├── package-lock.json
├── package.json
├── requirements.txt
├── src
│   ├── __mocks__
│   │   └── language.js
│   ├── index.js
│   ├── language.js
│   ├── tokens.js
│   └── util.js
└── tests
    ├── doc.test.js
    ├── language.test.js
    ├── span.test.js
    ├── test_api.py
    ├── token.test.js
    └── util.js
/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 | "presets": [
3 | ["@babel/preset-env", {
4 | "modules": false,
5 | "targets": { "node": "current"}
6 | }]
7 | ],
8 | "plugins": [
9 | "add-module-exports"
10 | ],
11 | "env": {
12 | "test": {
13 | "presets": [
14 | ["@babel/preset-env", {
15 | "targets": { "node": "current"}
16 | }]
17 | ]
18 | }
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 |
3 | # JavaScript
4 | node_modules/
5 | *.tgz
6 |
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 |
12 | # C extensions
13 | *.so
14 |
15 | # Distribution / packaging
16 | .Python
17 | build/
18 | develop-eggs/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # celery beat schedule file
89 | celerybeat-schedule
90 |
91 | # SageMath parsed files
92 | *.sage.py
93 |
94 | # Environments
95 | .env
96 | .venv
97 | env/
98 | venv/
99 | ENV/
100 | env.bak/
101 | venv.bak/
102 |
103 | # Spyder project settings
104 | .spyderproject
105 | .spyproject
106 |
107 | # Rope project settings
108 | .ropeproject
109 |
110 | # mkdocs documentation
111 | /site
112 |
113 | # mypy
114 | .mypy_cache/
115 | .dmypy.json
116 | dmypy.json
117 |
118 | # Pyre type checker
119 | .pyre/
120 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | matrix:
2 | include:
3 | - language: python
4 | python: 3.6
5 | before_script:
6 | - pip install pytest
7 | install:
8 | - pip install -r requirements.txt
9 | script:
10 | - python -m pytest tests
11 | cache: pip
12 | - language: node_js
13 | node_js:
14 | - "8"
15 | install:
16 | - npm install
17 | script:
18 | - npm run test
19 | cache: npm
20 | notifications:
21 | email: false
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Ines Montani
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # spaCy JS
4 |
5 | [](https://travis-ci.org/ines/spacy-js)
6 | [](https://www.npmjs.com/package/spacy)
7 | [](https://github.com/ines/spacy-js)
8 | [](https://unpkg.com/spacy)
9 |
10 |
11 | JavaScript interface for accessing linguistic annotations provided by
12 | [spaCy](https://spacy.io). This project is mostly experimental and was
13 | developed for fun to play around with different ways of mimicking spaCy's
14 | Python API.
15 |
16 | The results will still be computed in Python and made available via a REST API.
17 | The JavaScript API resembles spaCy's Python API as closely as possible (with
18 | a few exceptions, as the values are all pre-computed and it's tricky to express
19 | complex recursive relationships).
20 |
21 | ```javascript
22 | const spacy = require('spacy');
23 |
24 | (async function() {
25 | const nlp = spacy.load('en_core_web_sm');
26 | const doc = await nlp('This is a text about Facebook.');
27 | for (let ent of doc.ents) {
28 | console.log(ent.text, ent.label);
29 | }
30 | for (let token of doc) {
31 | console.log(token.text, token.pos, token.head.text);
32 | }
33 | })();
34 | ```
35 |
36 | ## ⌛️ Installation
37 |
38 | ### Installing the JavaScript library
39 |
40 | You can install the JavaScript package via npm:
41 |
42 | ```bash
43 | npm install spacy
44 | ```
45 |
46 | ### Setting up the Python server
47 |
48 | First, clone this repo and install the requirements. If you've installed the
49 | package via npm, you can also use the `api/server.py` and `requirements.txt` in
50 | your `./node_modules/spacy` directory. It's recommended to use a virtual
51 | environment.
52 |
53 | ```bash
54 | pip install -r requirements.txt
55 | ```
56 |
57 | You can then run the REST API. By default, this will serve the API via
58 | `0.0.0.0:8080`:
59 |
60 | ```bash
61 | python api/server.py
62 | ```
63 |
64 | If you like, you can install more [models](https://spacy.io/models) and specify
65 | a comma-separated list of models to load as the first argument when you run
66 | the server. All models need to be installed in the same environment.
67 |
68 | ```bash
69 | python api/server.py en_core_web_sm,de_core_news_sm
70 | ```
71 |
72 | | Argument | Type | Description | Default |
73 | | --- | --- | --- | --- |
74 | | `models` | positional (str) | Comma-separated list of models to load and make available. | `en_core_web_sm` |
75 | | `--host`, `-ho` | option (str) | Host to serve the API. | `0.0.0.0` |
76 | | `--port`, `-p` | option (int) | Port to serve the API. | `8080` |
77 |
78 | ## 🎛 API
79 |
80 | ### `spacy.load`
81 |
82 | "Load" a spaCy model. This method mostly exists for consistency with the Python
83 | API. It sets up the REST API and `nlp` object, but doesn't actually load
84 | anything, since the models are already available via the REST API.
85 |
86 | ```javascript
87 | const nlp = spacy.load('en_core_web_sm');
88 | ```
89 |
90 | | Argument | Type | Description |
91 | | --- | --- | --- |
92 | | `model` | String | Name of model to load, e.g. `'en_core_web_sm'`. Needs to be available via the REST API. |
93 | | `api` | String | Alternative URL of the REST API. Defaults to `http://localhost:8080`. |
94 | | **RETURNS** | [`Language`](src/language.js) | The `nlp` object. |
95 |
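If the server is running somewhere other than the default address, you can pass its URL as the second argument (a sketch; the host and port here are hypothetical):

```javascript
const nlp = spacy.load('en_core_web_sm', 'http://127.0.0.1:9000');
```
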
96 | ### `nlp` async
97 |
98 | The `nlp` object created by `spacy.load` can be called on a string of text
99 | and makes a request to the REST API. The easiest way to use it is to wrap the
100 | call in an `async` function and use `await`:
101 |
102 | ```javascript
103 | (async function() {
104 | const nlp = spacy.load('en_core_web_sm');
105 | const doc = await nlp('This is a text.');
106 | })();
107 | ```
108 |
109 | | Argument | Type | Description |
110 | | --- | --- | --- |
111 | | `text` | String | The text to process. |
112 | | **RETURNS** | [`Doc`](src/tokens.js) | The processed `Doc`. |
113 |
114 | ### `Doc`
115 |
116 | Just like [in the original API](https://spacy.io/api/doc), the `Doc` object can
117 | be constructed with an array of `words` and `spaces`. It also takes an
118 | additional `attrs` object, which corresponds to the JSON-serialized linguistic
119 | annotations created in [`doc2json` in `api/server.py`](api/server.py).
120 |
121 | The `Doc` behaves just like the regular spaCy `Doc` – you can iterate over its
122 | tokens, index into individual tokens, access the `Doc` attributes and properties
123 | and also use native JavaScript methods like `map` and `slice` (since there's no
124 | real way to make Python's slice notation like `doc[2:4]` work).
125 |
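For example, `map` and `slice` can stand in for Python list comprehensions and slice notation (a sketch, assuming `doc` was created as shown above):

```javascript
const posTags = doc.map(token => token.pos);  // array of coarse-grained tags
const span = doc.slice(2, 4);                 // Span covering tokens 2 and 3
```
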
126 | #### Construction
127 |
128 | ```javascript
129 | import { Doc } from 'spacy';
130 |
131 | const words = ['Hello', 'world', '!'];
132 | const spaces = [true, false, false];
133 | const doc = new Doc(words, spaces)
134 | console.log(doc.text) // 'Hello world!'
135 | ```
136 |
137 | | Argument | Type | Description |
138 | | --- | --- | --- |
139 | | `words` | Array | The individual token texts. |
140 | | `spaces` | Array | Whether the token at this position is followed by a space or not. |
141 | | `attrs` | Object | JSON-serialized attributes, see [`doc2json`](api/server.py). |
142 | | **RETURNS** | [`Doc`](src/tokens.js) | The newly constructed `Doc`. |
143 |
144 | #### Symbol iterator and token indexing
145 |
146 | ```javascript
147 | (async function() {
148 | const nlp = spacy.load('en_core_web_sm');
149 | const doc = await nlp('Hello world');
150 |
151 | for (let token of doc) {
152 | console.log(token.text);
153 | }
154 | // Hello
155 | // world
156 |
157 | const token1 = doc[0];
158 | console.log(token1.text);
159 | // Hello
160 | })();
161 | ```
162 |
163 | #### Properties and Attributes
164 |
165 | | Name | Type | Description |
166 | | --- | --- | --- |
167 | | `text` | String | The `Doc` text. |
168 | | `length` | Number | The number of tokens in the `Doc`. |
169 | | `ents` | Array | A list of [`Span`](src/tokens.js) objects, describing the named entities in the `Doc`. |
170 | | `sents` | Array | A list of [`Span`](src/tokens.js) objects, describing the sentences in the `Doc`. |
171 | | `nounChunks` | Array | A list of [`Span`](src/tokens.js) objects, describing the base noun phrases in the `Doc`. |
172 | | `cats` | Object | The document categories predicted by the text classifier, if available in the model. |
173 | | `isTagged` | Boolean | Whether the part-of-speech tagger has been applied to the `Doc`. |
174 | | `isParsed` | Boolean | Whether the dependency parser has been applied to the `Doc`. |
175 | | `isSentenced` | Boolean | Whether the sentence boundary detector has been applied to the `Doc`. |
176 |
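For example, the sentence and noun chunk spans can be read off a processed `Doc` like this (a sketch, assuming the loaded model includes a parser):

```javascript
const sentenceTexts = doc.sents.map(sent => sent.text);
const chunkTexts = doc.nounChunks.map(chunk => chunk.text);
```
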
177 | ### `Span`
178 |
179 | A `Span` object is a slice of a `Doc` and consists of one or more tokens. Just
180 | like [in the original API](https://spacy.io/api/span), it can be constructed
181 | from a `Doc`, a start and end index and an optional label, or by slicing a `Doc`.
182 |
183 | #### Construction
184 |
185 | ```javascript
186 | import { Doc, Span } from 'spacy';
187 |
188 | const doc = new Doc(['Hello', 'world', '!'], [true, false, false]);
189 | const span = new Span(doc, 1, 3);
190 | console.log(span.text) // 'world!'
191 | ```
192 |
193 | | Argument | Type | Description |
194 | | --- | --- | --- |
195 | | `doc` | `Doc` | The reference document. |
196 | | `start` | Number | The start token index. |
197 | | `end` | Number | The end token index. This is *exclusive*, i.e. "up to token X". |
198 | | `label` | String | Optional label. |
199 | | **RETURNS** | [`Span`](src/tokens.js) | The newly constructed `Span`. |
200 |
201 | #### Properties and Attributes
202 |
203 | | Name | Type | Description |
204 | | --- | --- | --- |
205 | | `text` | String | The `Span` text. |
206 | | `length` | Number | The number of tokens in the `Span`. |
207 | | `doc` | `Doc` | The parent `Doc`. |
208 | | `start` | Number | The `Span`'s start index in the parent document. |
209 | | `end` | Number | The `Span`'s end index in the parent document. |
210 | | `label` | String | The `Span`'s label, if available. |
211 |
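If a span sliced off a processed `Doc` lines up with a named entity, its `label` falls back to the entity label (a sketch; the indices below are hypothetical and depend on the text):

```javascript
const span = doc.slice(8, 9);       // hypothetical indices of an entity like "Facebook"
console.log(span.text, span.label); // e.g. 'Facebook ORG' if the span matches a named entity
```
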
212 | ### `Token`
213 |
214 | For token attributes that exist as string and ID versions (e.g. `Token.pos` vs.
215 | `Token.pos_`), only the string versions are exposed.
216 |
217 | #### Usage Examples
218 |
219 | ```javascript
220 | (async function() {
221 | const nlp = spacy.load('en_core_web_sm');
222 | const doc = await nlp('Hello world');
223 |
224 | for (let token of doc) {
225 | console.log(token.text, token.pos, token.isLower);
226 | }
227 | // Hello INTJ false
228 | // world NOUN true
229 | })();
230 | ```
231 |
232 | #### Properties and Attributes
233 |
234 | | Name | Type | Description |
235 | | --- | --- | --- |
236 | | `text` | String | The token text. |
237 | | `whitespace` | String | Whitespace character following the token, if available. |
238 | | `textWithWs` | String | Token text with trailing whitespace. |
239 | | `length` | Number | The length of the token text. |
240 | | `orth` | Number | ID of the token text. |
241 | | `doc` | `Doc` | The parent `Doc`. |
242 | | `head` | `Token` | The syntactic parent, or "governor", of this token. |
243 | | `i` | Number | Index of the token in the parent document. |
244 | | `entType` | String | The token's named entity type. |
245 | | `entIob` | String | IOB code of the token's named entity tag. |
246 | | `lemma` | String | The token's lemma, i.e. the base form. |
247 | | `norm` | String | The normalised form of the token. |
248 | | `lower` | String | The lowercase form of the token. |
249 | | `shape` | String | Transform of the token's string, to show orthographic features. For example, "Xxxx" or "dd". |
250 | | `prefix` | String | A length-N substring from the start of the token. Defaults to `N=1`. |
251 | | `suffix` | String | Length-N substring from the end of the token. Defaults to `N=3`. |
252 | | `pos` | String | The token's coarse-grained part-of-speech tag. |
253 | | `tag` | String | The token's fine-grained part-of-speech tag. |
254 | | `isAlpha` | Boolean | Does the token consist of alphabetic characters? |
255 | | `isAscii` | Boolean | Does the token consist of ASCII characters? |
256 | | `isDigit` | Boolean | Does the token consist of digits? |
257 | | `isLower` | Boolean | Is the token lowercase? |
258 | | `isUpper` | Boolean | Is the token uppercase? |
259 | | `isTitle` | Boolean | Is the token titlecase? |
260 | | `isPunct` | Boolean | Is the token punctuation? |
261 | | `isLeftPunct` | Boolean | Is the token left punctuation? |
262 | | `isRightPunct` | Boolean | Is the token right punctuation? |
263 | | `isSpace` | Boolean | Is the token a whitespace character? |
264 | | `isBracket` | Boolean | Is the token a bracket? |
265 | | `isCurrency` | Boolean | Is the token a currency symbol? |
266 | | `likeUrl` | Boolean | Does the token resemble a URL? |
267 | | `likeNum` | Boolean | Does the token resemble a number? |
268 | | `likeEmail` | Boolean | Does the token resemble an email address? |
269 | | `isOov` | Boolean | Is the token out-of-vocabulary? |
270 | | `isStop` | Boolean | Is the token a stop word? |
271 | | `isSentStart` | Boolean | Does the token start a sentence? |
272 |
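`Doc`, `Span` and `Token` also expose an async `similarity` method that queries the server's `/similarity` endpoint (a sketch, assuming the served model ships with word vectors):

```javascript
(async function() {
    const nlp = spacy.load('en_core_web_sm');
    const doc1 = await nlp('I like cats');
    const doc2 = await nlp('I like dogs');
    console.log(await doc1.similarity(doc2)); // similarity score computed by the server
})();
```
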
273 | ## 🔔 Run Tests
274 |
275 | ### Python
276 |
277 | First, make sure you have `pytest` and all dependencies installed. You can then
278 | run the tests by pointing `pytest` to [`/tests`](/tests):
279 |
280 | ```bash
281 | python -m pytest tests
282 | ```
283 |
284 | ### JavaScript
285 |
286 | This project uses [Jest](https://jestjs.io) for testing. Make sure you have
287 | all dependencies and development dependencies installed. You can then run:
288 |
289 | ```bash
290 | npm run test
291 | ```
292 |
293 | To allow testing the code without a REST API providing the data, the test suite
294 | currently uses a [mock of the `Language` class](src/__mocks__), which returns
295 | static data located in [`tests/util.js`](tests/util.js).
296 |
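In a Jest test this looks roughly like the following (a condensed sketch based on [`tests/doc.test.js`](tests/doc.test.js)):

```javascript
import spacy from '../src'
import { text } from './util'

jest.mock('../src/language')  // substitutes src/__mocks__/language.js

const nlp = spacy.load('en_core_web_sm')

test('has Doc attributes', async () => {
    const doc = await nlp(text)
    expect(doc.text).toBe(text)
})
```
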
297 | ## ✅ Ideas and Todos
298 |
299 | - [ ] Improve JavaScript tests.
300 | - [ ] Experiment with NodeJS bindings to make Python integration easier. To be fair, running a separate API in an environment controlled by the user and *not* hiding it a few levels deep is often much easier. But maybe there are some modern Node tricks that this project could benefit from.
301 |
--------------------------------------------------------------------------------
/api/server.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | from __future__ import unicode_literals
3 |
4 | import hug
5 | from hug_middleware_cors import CORSMiddleware
6 | import waitress
7 | import spacy
8 | import plac
9 |
10 |
11 | MODELS = {}
12 |
13 |
14 | @plac.annotations(
15 | models=("Comma-separated list of spaCy models", "positional", None, str),
16 | host=("Host to serve API", "option", "ho", str),
17 | port=("Port to serve API", "option", "p", int),
18 | )
19 | def main(models=None, host="0.0.0.0", port=8080):
20 | if not models:
21 | models = ["en_core_web_sm"]
22 | else:
23 | models = [m.strip() for m in models.split(",")]
24 | for model in models:
25 | load_model(model)
26 | # Serving Hug API
27 | app = hug.API(__name__)
28 | app.http.add_middleware(CORSMiddleware(app))
29 |     waitress.serve(__hug_wsgi__, host=host, port=port)
30 |
31 |
32 | def load_model(model):
33 | print("Loading model '{}'...".format(model))
34 | MODELS[model] = spacy.load(model)
35 |
36 |
37 | def doc2json(doc: spacy.tokens.Doc, model: str):
38 | json_doc = {
39 | "text": doc.text,
40 | "text_with_ws": doc.text_with_ws,
41 | "cats": doc.cats,
42 | "is_tagged": doc.is_tagged,
43 | "is_parsed": doc.is_parsed,
44 | "is_nered": doc.is_nered,
45 | "is_sentenced": doc.is_sentenced,
46 | }
47 | ents = [
48 | {"start": ent.start, "end": ent.end, "label": ent.label_} for ent in doc.ents
49 | ]
50 | if doc.is_sentenced:
51 | sents = [{"start": sent.start, "end": sent.end} for sent in doc.sents]
52 | else:
53 | sents = []
54 | if doc.is_tagged and doc.is_parsed:
55 | noun_chunks = [
56 | {"start": chunk.start, "end": chunk.end} for chunk in doc.noun_chunks
57 | ]
58 | else:
59 | noun_chunks = []
60 | tokens = [
61 | {
62 | "text": token.text,
63 | "text_with_ws": token.text_with_ws,
64 | "whitespace": token.whitespace_,
65 | "orth": token.orth,
66 | "i": token.i,
67 | "ent_type": token.ent_type_,
68 | "ent_iob": token.ent_iob_,
69 | "lemma": token.lemma_,
70 | "norm": token.norm_,
71 | "lower": token.lower_,
72 | "shape": token.shape_,
73 | "prefix": token.prefix_,
74 | "suffix": token.suffix_,
75 | "pos": token.pos_,
76 | "tag": token.tag_,
77 | "dep": token.dep_,
78 | "is_alpha": token.is_alpha,
79 | "is_ascii": token.is_ascii,
80 | "is_digit": token.is_digit,
81 | "is_lower": token.is_lower,
82 | "is_upper": token.is_upper,
83 | "is_title": token.is_title,
84 | "is_punct": token.is_punct,
85 | "is_left_punct": token.is_left_punct,
86 | "is_right_punct": token.is_right_punct,
87 | "is_space": token.is_space,
88 | "is_bracket": token.is_bracket,
89 | "is_currency": token.is_currency,
90 | "like_url": token.like_url,
91 | "like_num": token.like_num,
92 | "like_email": token.like_email,
93 | "is_oov": token.is_oov,
94 | "is_stop": token.is_stop,
95 | "is_sent_start": token.is_sent_start,
96 | "head": token.head.i,
97 | }
98 | for token in doc
99 | ]
100 | return {
101 | "model": model,
102 | "doc": json_doc,
103 | "ents": ents,
104 | "sents": sents,
105 | "noun_chunks": noun_chunks,
106 | "tokens": tokens,
107 | }
108 |
109 |
110 | @hug.post("/parse")
111 | def parse(model: str, text: str):
112 | nlp = MODELS[model]
113 | doc = nlp(text)
114 | return doc2json(doc, model)
115 |
116 |
117 | @hug.post("/similarity")
118 | def similarity(model: str, text1: str, text2: str):
119 | # We can always create Doc objects here, because the result is the same
120 | nlp = MODELS[model]
121 | doc1 = nlp(text1)
122 | doc2 = nlp(text2)
123 | return {"similarity": doc1.similarity(doc2)}
124 |
125 |
126 | if __name__ == "__main__":
127 | plac.call(main)
128 |
--------------------------------------------------------------------------------
/dist/index.js:
--------------------------------------------------------------------------------
1 | parcelRequire=function(e,r,n,t){var i="function"==typeof parcelRequire&&parcelRequire,o="function"==typeof require&&require;function u(n,t){if(!r[n]){if(!e[n]){var f="function"==typeof parcelRequire&&parcelRequire;if(!t&&f)return f(n,!0);if(i)return i(n,!0);if(o&&"string"==typeof n)return o(n);var c=new Error("Cannot find module '"+n+"'");throw c.code="MODULE_NOT_FOUND",c}p.resolve=function(r){return e[n][1][r]||r},p.cache={};var l=r[n]=new u.Module(n);e[n][0].call(l.exports,p,l,l.exports,this)}return r[n].exports;function p(e){return u(p.resolve(e))}}u.isParcelRequire=!0,u.Module=function(e){this.id=e,this.bundle=u,this.exports={}},u.modules=e,u.cache=r,u.parent=i,u.register=function(r,n){e[r]=[function(e,r){r.exports=n},{}]};for(var f=0;fnew e(this,t,s[i],this._tokens[i]));for(let e=0;enew i(this,t,s,e))}get sents(){return this._sents.map(({start:t,end:s})=>new i(this,t,s))}get nounChunks(){return this._chunks.map(({start:t,end:s})=>new i(this,t,s))}*[Symbol.iterator](){let t=0;for(;void 0!==this.tokens[t];)yield this.tokens[t],++t}toString(){return this.text}map(t){let s=[];for(let i of this)s.push(t(i));return s}slice(t,s){return new i(this,t,s)}async similarity(s){return await(0,t.getSimilarity)(this._api,this._model,this.text,s.text)}}exports.Doc=s;class i{constructor(t,s,i,e){this.doc=t,this.start=s,this.end=i,this._label=e,this.tokens=[...this.doc].slice(this.start,this.end);for(let h=0;he),spaces:o.tokens.map(({whitespace:e})=>Boolean(e)),attrs:Object.assign({},o,{api:a})}}}exports.default=s;
7 | },{"./tokens":"nJFl","./util":"Y/Oq"}],"Focm":[function(require,module,exports) {
8 | "use strict";Object.defineProperty(exports,"__esModule",{value:!0}),Object.defineProperty(exports,"Doc",{enumerable:!0,get:function(){return r.Doc}}),Object.defineProperty(exports,"Token",{enumerable:!0,get:function(){return r.Token}}),Object.defineProperty(exports,"Span",{enumerable:!0,get:function(){return r.Span}}),exports.default=void 0;var e=t(require("./language")),r=require("./tokens");function t(e){return e&&e.__esModule?e:{default:e}}var n={load:function(r,t){return new e.default(r,t)}};exports.default=n;
9 | },{"./language":"hk5u","./tokens":"nJFl"}]},{},["Focm"], "spacy")
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "spacy",
3 | "version": "0.0.4",
4 | "description": "JavaScript API for spaCy with Python REST API",
5 | "main": "dist/index.js",
6 | "scripts": {
7 | "test": "jest",
8 | "build": "parcel build src/index.js --target node --no-source-maps --global spacy",
9 | "dev": "parcel src/index.js --target node --no-source-maps --global spacy",
10 | "package": "npm run build && npm pack"
11 | },
12 | "author": "Ines Montani",
13 | "license": "MIT",
14 | "homepage": "https://github.com/ines/spacy-js#readme",
15 | "repository": {
16 | "type": "git",
17 | "url": "https://github.com/ines/spacy-js.git"
18 | },
19 | "keywords": [
20 | "spacy",
21 | "nlp",
22 | "natural language processing",
23 | "machine learning",
24 | "artificial intelligence"
25 | ],
26 | "dependencies": {
27 | "node-fetch": "^2.1.2"
28 | },
29 | "devDependencies": {
30 | "@babel/cli": "^7.1.2",
31 | "@babel/core": "^7.1.2",
32 | "@babel/preset-env": "^7.1.0",
33 | "babel-core": "^7.0.0-bridge.0",
34 | "babel-plugin-add-module-exports": "^1.0.0",
35 | "jest": "^23.6.0",
36 | "parcel-bundler": "^1.10.3",
37 | "regenerator-runtime": "^0.12.1"
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | hug>=2.4.0,<3.0.0
2 | hug-middleware-cors>=1.0.0,<2.0.0
3 | spacy>=2.1.0,<2.2.0
4 | waitress>=1.0.2,<2.0.0
5 | plac>=0.9.6,<1.0.0
6 |
7 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0
8 |
--------------------------------------------------------------------------------
/src/__mocks__/language.js:
--------------------------------------------------------------------------------
1 | import { Doc } from '../tokens';
2 | import { words, spaces, attrs } from '../../tests/util';
3 |
4 | export default class Language {
5 | constructor(model, api) {
6 | return async function(text) {
7 | return new Doc(words, spaces, attrs);
8 | }
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
1 | import Language from './language'
2 | export { Doc, Token, Span } from './tokens'
3 |
4 | export default {
5 | load: function(model, api) {
6 | return new Language(model, api);
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
/src/language.js:
--------------------------------------------------------------------------------
1 | import { Doc } from './tokens';
2 | import { makeRequest } from './util';
3 |
4 | export default class Language {
5 | constructor(model, api = 'http://localhost:8080') {
6 | const self = this;
7 | return async function(text) {
8 | const { words, spaces, attrs } = await self.makeDoc(model, text, api);
9 | return new Doc(words, spaces, attrs);
10 | }
11 | }
12 |
13 | async makeDoc(model, text, api) {
14 | const json = await makeRequest(api, 'parse', { model, text })
15 | const words = json.tokens.map(({ text }) => text);
16 | const spaces = json.tokens.map(({ whitespace }) => Boolean(whitespace));
17 | return { words, spaces, attrs: Object.assign({}, json, { api }) }
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/tokens.js:
--------------------------------------------------------------------------------
1 | import { getSimilarity } from './util'
2 |
3 | export class Doc {
4 | constructor(words, spaces, attrs = {}) {
5 | this._doc = attrs.doc || {}
6 | this._tokens = attrs.tokens || []
7 | this._ents = attrs.ents || []
8 | this._sents = attrs.sents || []
9 | this._chunks = attrs.noun_chunks || []
10 | this._model = attrs.model
11 | this._api = attrs.api
12 | this.tokens = words.map((word, i) => new Token(this, word, spaces[i], this._tokens[i]))
13 | for (let i = 0; i < this.tokens.length; i++) {
14 | this[i] = this.tokens[i]
15 | }
16 | this.cats = this._doc.cats
17 | this.isTagged = this._doc.is_tagged
18 | this.isParsed = this._doc.is_parsed
19 | this.isNered = this._doc.is_nered
20 | this.isSentenced = this._doc.is_sentenced
21 | }
22 |
23 | inspect() {
24 | return this.text
25 | }
26 |
27 | get text() {
28 | let text = ''
29 | for (let token of this.tokens) {
30 | text += token.textWithWs
31 | }
32 | return text
33 | }
34 |
35 | get length() {
36 | return this.tokens.length
37 | }
38 |
39 | get ents() {
40 | return this._ents.map(({ start, end, label }) => new Span(this, start, end, label))
41 | }
42 |
43 | get sents() {
44 | return this._sents.map(({ start, end }) => new Span(this, start, end))
45 | }
46 |
47 | get nounChunks() {
48 | return this._chunks.map(({ start, end }) => new Span(this, start, end))
49 | }
50 |
51 | *[Symbol.iterator]() {
52 | let i = 0
53 | while (this.tokens[i] !== undefined) {
54 | yield this.tokens[i]
55 | ++i
56 | }
57 | }
58 |
59 | toString() {
60 | return this.text
61 | }
62 |
63 | map(func) {
64 | let tokens = []
65 | for (let token of this) {
66 | tokens.push(func(token))
67 | }
68 | return tokens
69 | }
70 |
71 | slice(start, end) {
72 | return new Span(this, start, end)
73 | }
74 |
75 | async similarity(obj) {
76 | return await getSimilarity(this._api, this._model, this.text, obj.text)
77 | }
78 | }
79 |
80 | export class Span {
81 | constructor(doc, start, end, label) {
82 | this.doc = doc
83 | this.start = start
84 | this.end = end
85 | this._label = label
86 | this.tokens = [...this.doc].slice(this.start, this.end)
87 | for (let i = 0; i < this.tokens.length; i++) {
88 |             this[i] = this.tokens[i]
89 | }
90 | }
91 |
92 | get text() {
93 | let text = ''
94 | for (let token of this.tokens) {
95 | text += token.textWithWs
96 | }
97 | return text.trim()
98 | }
99 |
100 | get length() {
101 | return this.tokens.length
102 | }
103 |
104 | get label() {
105 | if (this._label) {
106 | return this._label
107 | }
108 | // Manually check if span is an entity
109 | for (let ent of this.doc.ents) {
110 |             if (ent.start === this.start && ent.end === this.end) {
111 | return ent.label
112 | }
113 | }
114 | }
115 |
116 | *[Symbol.iterator]() {
117 | let i = 0
118 | while (this.tokens[i] !== undefined) {
119 | yield this.tokens[i]
120 | ++i
121 | }
122 | }
123 |
124 | slice(start, end) {
125 | return new Span(this, start, end)
126 | }
127 |
128 | toString() {
129 | return this.text
130 | }
131 |
132 | inspect() {
133 | return this.text
134 | }
135 |
136 | async similarity(obj) {
137 | return await getSimilarity(this.doc._api, this.doc._model, this.text, obj.text)
138 | }
139 | }
140 |
141 | export class Token {
142 | constructor(doc, word, space, attrs = {}) {
143 | this.doc = doc
144 | this.whitespace = space ? ' ' : ''
145 | this.text = word
146 | this.textWithWs = this.text + this.whitespace
147 | this.orth = attrs.orth
148 | this.i = attrs.i
149 | this.entType = attrs.ent_type
150 | this.entIob = attrs.ent_iob
151 | this.lemma = attrs.lemma
152 | this.norm = attrs.norm
153 | this.lower = attrs.lower
154 |         this.shape = attrs.shape; this.prefix = attrs.prefix
155 | this.suffix = attrs.suffix
156 | this.pos = attrs.pos
157 | this.tag = attrs.tag
158 | this.dep = attrs.dep
159 | this.isAlpha = attrs.is_alpha
160 | this.isAscii = attrs.is_ascii
161 | this.isDigit = attrs.is_digit
162 | this.isLower = attrs.is_lower
163 | this.isUpper = attrs.is_upper
164 | this.isTitle = attrs.is_title
165 | this.isPunct = attrs.is_punct
166 | this.isLeftPunct = attrs.is_left_punct
167 | this.isRightPunct = attrs.is_right_punct
168 | this.isSpace = attrs.is_space
169 | this.isBracket = attrs.is_bracket
170 | this.isCurrency = attrs.is_currency
171 | this.likeUrl = attrs.like_url
172 | this.likeNum = attrs.like_num
173 | this.likeEmail = attrs.like_email
174 | this.isOov = attrs.is_oov
175 | this.isStop = attrs.is_stop
176 | this.isSentStart = attrs.is_sent_start
177 |
178 | this._head = attrs.head
179 | }
180 |
181 | get length() {
182 | return this.text.length
183 | }
184 |
185 | get head() {
186 | return this.doc[this._head]
187 | }
188 |
189 | toString() {
190 | return this.text
191 | }
192 |
193 | inspect() {
194 | return this.text
195 | }
196 |
197 | async similarity(obj) {
198 | return await getSimilarity(this.doc._api, this.doc._model, this.text, obj.text)
199 | }
200 | }
201 |
--------------------------------------------------------------------------------
/src/util.js:
--------------------------------------------------------------------------------
1 | import fetch from 'node-fetch';
2 | import url from 'url';
3 |
4 | export async function makeRequest(api, endpoint, opts, method = 'POST') {
5 | const headers = { 'Accept': 'application/json', 'Content-Type': 'application/json' };
6 | const credentials = 'same-origin';
7 | const body = JSON.stringify(opts);
8 | const apiUrl = url.resolve(api, endpoint);
9 | try {
10 | const res = await fetch(apiUrl, { method, headers, credentials, body });
11 | return await res.json();
12 | }
13 | catch(err) {
14 | console.log(`Error fetching data from API: ${api}`)
15 | }
16 | }
17 |
18 | export async function getSimilarity(api, model, text1, text2) {
19 | const json = await makeRequest(api, '/similarity', { model, text1, text2 });
20 | return json.similarity;
21 | }
22 |
--------------------------------------------------------------------------------
/tests/doc.test.js:
--------------------------------------------------------------------------------
1 | import spacy from '../src'
2 | import { Doc, Token, Span } from '../src/tokens'
3 | import { text, words, spaces, attrs } from './util'
4 |
5 | jest.mock('../src/language')
6 |
7 | const nlp = spacy.load('en_core_web_sm')
8 |
9 | test('allows manual construction', () => {
10 | const doc = new Doc(words, spaces, attrs)
11 | expect(doc).toBeInstanceOf(Doc)
12 | })
13 |
14 | test('has Doc attributes', async () => {
15 | const doc = await nlp(text)
16 | expect(doc.text).toBe(text)
17 | expect(doc.toString()).toBe(text)
18 | expect(doc.length).toBe(10)
19 | expect(doc.cats).toEqual({})
20 | expect(doc.isTagged).toBe(true)
21 | expect(doc.isParsed).toBe(true)
22 | expect(doc.isSentenced).toBe(true)
23 | })
24 |
25 | test('allows token indexing', async () => {
26 | const doc = await nlp(text)
27 | for (let i = 0; i < doc.length; i++) {
28 | expect(doc[i]).toBeInstanceOf(Token)
29 | }
30 | expect(doc[doc.length + 1]).toBeUndefined()
31 | })
32 |
33 | test('allows token iteration', async () => {
34 | const doc = await nlp(text)
35 | for (let token of doc) {
36 | expect(token).toBeInstanceOf(Token)
37 | }
38 | })
39 |
40 | test('has named entities (doc.ents)', async () => {
41 | const doc = await nlp(text)
42 | expect(doc.ents).toBeInstanceOf(Array)
43 | expect(doc.ents).toEqual(expect.arrayContaining([expect.any(Span)]))
44 | expect(doc.ents.length).toBe(1)
45 | const entity = doc.ents[0]
46 | expect(entity).toBeInstanceOf(Span)
47 | expect(entity.text).toBe('Facebook')
48 | expect(entity.start).toBe(8)
49 | expect(entity.end).toBe(9)
50 | expect(entity.label).toBe('ORG')
51 | })
52 |
53 | test('has sentences (doc.sents)', async () => {
54 | const doc = await nlp(text)
55 | expect(doc.sents).toBeInstanceOf(Array)
56 | expect(doc.sents).toEqual(expect.arrayContaining([expect.any(Span)]))
57 | expect(doc.sents.length).toBe(2)
58 | const sentence = doc.sents[0]
59 | expect(sentence).toBeInstanceOf(Span)
60 | expect(sentence.text).toBe('Hello world!')
61 | expect(sentence.start).toBe(0)
62 | expect(sentence.end).toBe(3)
63 | })
64 |
65 | test('has noun chunks (doc.noun_chunks)', async () => {
66 | const doc = await nlp(text)
67 | expect(doc.nounChunks).toBeInstanceOf(Array)
68 | expect(doc.nounChunks).toEqual(expect.arrayContaining([expect.any(Span)]))
69 | expect(doc.nounChunks.length).toBe(3)
70 | const chunk = doc.nounChunks[0]
71 | expect(chunk).toBeInstanceOf(Span)
72 | expect(chunk.text).toBe('Hello world')
73 | expect(chunk.start).toBe(0)
74 | expect(chunk.end).toBe(2)
75 | })
76 |
--------------------------------------------------------------------------------
/tests/language.test.js:
--------------------------------------------------------------------------------
1 | import spacy from '../src'
2 |
3 | jest.mock('../src/language')
4 |
5 | test('creates new nlp object', () => {
6 | const nlp = spacy.load('en_core_web_sm')
7 | expect(nlp).toEqual(expect.any(Function))
8 | })
9 |
--------------------------------------------------------------------------------
/tests/span.test.js:
--------------------------------------------------------------------------------
1 | import spacy from '../src'
2 | import { Doc, Token, Span } from '../src/tokens'
3 | import { text, words, spaces, attrs } from './util'
4 |
5 | jest.mock('../src/language')
6 |
7 | const nlp = spacy.load('en_core_web_sm')
8 |
9 | test('allows manual construction', async () => {
10 | const doc = await nlp(text)
11 | const span = new Span(doc, 6, 9)
12 | expect(span).toBeInstanceOf(Span)
13 | expect(span.text).toBe('sentence about Facebook')
14 | })
15 |
16 | test('allows being sliced off Doc', async () => {
17 | const doc = await nlp(text)
18 | const span = doc.slice(6, 9)
19 | expect(span).toBeInstanceOf(Span)
20 | expect(span.text).toBe('sentence about Facebook')
21 | })
22 |
23 | test('has Span attributes', async () => {
24 | const doc = await nlp(text)
25 | const span = doc.slice(6, 9)
26 | expect(span.toString()).toBe('sentence about Facebook')
27 | expect(span.length).toBe(3)
28 | expect(span.start).toBe(6)
29 | expect(span.end).toBe(9)
30 | expect(span.label).toBeUndefined()
31 | })
32 |
33 | test('has parent Doc', async () => {
34 | const doc = await nlp(text)
35 | const span = doc.slice(6, 9)
36 | expect(span.doc).toBeInstanceOf(Doc)
37 | expect(span.doc).toBe(doc)
38 | })
39 |
40 | test('has entity label', async () => {
41 | const doc = await nlp(text)
42 | const span = doc.slice(8, 9)
43 | expect(span.toString()).toBe('Facebook')
44 | expect(span.label).toBe('ORG')
45 | })
46 |
47 | test('allows token indexing', async () => {
48 | const doc = await nlp(text)
49 | const span = doc.slice(6, 9)
50 | for (let i = 0; i < span.length; i++) {
51 | expect(span[i]).toBeInstanceOf(Token)
52 | }
53 | expect(span[span.length + 1]).toBeUndefined()
54 | })
55 |
56 | test('allows token iteration', async () => {
57 | const doc = await nlp(text)
58 | const span = doc.slice(6, 9)
59 | for (let token of span) {
60 | expect(token).toBeInstanceOf(Token)
61 | }
62 | })
63 |
--------------------------------------------------------------------------------
/tests/test_api.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | from __future__ import unicode_literals
3 |
4 | import pytest
5 | import spacy
6 | import json
7 |
8 | from api.server import parse, doc2json, load_model
9 |
10 |
11 | @pytest.fixture(scope="session")
12 | def model():
13 | return "en_core_web_sm"
14 |
15 |
16 | @pytest.fixture(scope="session")
17 | def text():
18 | return "This is a sentence about Facebook. This is another one."
19 |
20 |
21 | @pytest.fixture(scope="session")
22 | def nlp(model):
23 | return spacy.load(model)
24 |
25 |
26 | @pytest.fixture(scope="session")
27 | def doc(nlp, text):
28 | return nlp(text)
29 |
30 |
31 | def test_server_parse(model, text, doc):
32 | load_model(model)
33 | json_doc = parse(model, text)
34 | direct_json_doc = doc2json(doc, model)
35 | assert json.dumps(json_doc, sort_keys=True) == json.dumps(
36 | direct_json_doc, sort_keys=True
37 | )
38 |
39 |
40 | def test_doc2json_doc_tokens(doc, model):
41 | data = doc2json(doc, model)
42 | assert data["model"] == model
43 | assert data["doc"]["text"] == doc.text
44 | assert data["doc"]["text_with_ws"] == doc.text_with_ws
45 | assert data["doc"]["is_tagged"]
46 | assert data["doc"]["is_parsed"]
47 | assert data["doc"]["is_sentenced"]
48 | assert len(data["tokens"]) == len(doc)
49 | assert data["tokens"][0]["text"] == doc[0].text
50 | assert data["tokens"][0]["head"] == doc[0].head.i
51 |
52 |
53 | def test_doc2json_doc_ents(doc, model):
54 | data = doc2json(doc, model)
55 | ents = list(doc.ents)
56 | assert "ents" in data
57 | assert len(data["ents"]) == len(ents)
58 | assert len(data["ents"]) >= 1
59 | assert data["ents"][0]["start"] == ents[0].start
60 | assert data["ents"][0]["end"] == ents[0].end
61 | assert data["ents"][0]["label"] == ents[0].label_
62 |
63 |
64 | def test_doc2json_doc_sents(doc, model):
65 | data = doc2json(doc, model)
66 | sents = list(doc.sents)
67 | assert "sents" in data
68 | assert len(data["sents"]) == len(sents)
69 | assert len(data["sents"]) >= 1
70 | assert data["sents"][0]["start"] == sents[0].start
71 | assert data["sents"][0]["end"] == sents[0].end
72 |
73 |
74 | def test_doc2json_doc_noun_chunks(doc, model):
75 | data = doc2json(doc, model)
76 | chunks = list(doc.noun_chunks)
77 | assert "noun_chunks" in data
78 | assert len(data["noun_chunks"]) == len(chunks)
79 | assert len(data["noun_chunks"]) >= 1
80 | assert data["noun_chunks"][0]["start"] == chunks[0].start
81 | assert data["noun_chunks"][0]["end"] == chunks[0].end
82 |
--------------------------------------------------------------------------------
/tests/token.test.js:
--------------------------------------------------------------------------------
1 | import spacy from '../src'
2 | import { Doc, Token } from '../src/tokens'
3 | import { text, words, spaces, attrs } from './util'
4 |
5 | jest.mock('../src/language')
6 |
7 | const nlp = spacy.load('en_core_web_sm')
8 |
9 | test('allows manual construction', async () => {
10 | const doc = await nlp(text)
11 | const token = new Token(doc, words[7], spaces[7], attrs.tokens[7])
12 | expect(token).toBeInstanceOf(Token)
13 | expect(token.text).toBe('about')
14 | })
15 |
16 | test('allows indexing from Doc', async () => {
17 | const doc = await nlp(text)
18 | const token = doc[7]
19 | expect(token.text).toBe('about')
20 | })
21 |
22 | test('has Token attributes', async () => {
23 | const doc = await nlp(text)
24 | const token = doc[7]
25 | expect(token.length).toBe(5)
26 | expect(token.toString()).toBe('about')
27 | expect(token.text).toBe('about')
28 | expect(token.textWithWs).toBe('about ')
29 | expect(token.whitespace).toBe(' ')
30 | expect(token.orth).toBe(942632335873952620)
31 | expect(token.i).toBe(7)
32 | expect(token.entType).toBe('')
33 | expect(token.entIob).toBe('O')
34 | expect(token.lemma).toBe('about')
35 | expect(token.norm).toBe('about')
36 | expect(token.lower).toBe('about')
37 | expect(token.shape).toBe('xxxx')
38 | expect(token.prefix).toBe('a')
39 | expect(token.suffix).toBe('out')
40 | expect(token.pos).toBe('ADP')
41 | expect(token.tag).toBe('IN')
42 | expect(token.dep).toBe('prep')
43 | expect(token.isAlpha).toBe(true)
44 | expect(token.isAscii).toBe(true)
45 | expect(token.isDigit).toBe(false)
46 | expect(token.isLower).toBe(true)
47 | expect(token.isUpper).toBe(false)
48 | expect(token.isTitle).toBe(false)
49 | expect(token.isPunct).toBe(false)
50 | expect(token.isLeftPunct).toBe(false)
51 | expect(token.isRightPunct).toBe(false)
52 | expect(token.isSpace).toBe(false)
53 | expect(token.isBracket).toBe(false)
54 | expect(token.isCurrency).toBe(false)
55 | expect(token.likeUrl).toBe(false)
56 | expect(token.likeNum).toBe(false)
57 | expect(token.likeEmail).toBe(false)
58 | expect(token.isOov).toBe(true)
59 | expect(token.isStop).toBe(true)
60 | expect(token.isSentStart).toBe(null)
61 | })
62 |
63 | test('has parent Doc', async () => {
64 | const doc = await nlp(text)
65 | const token = doc[7]
66 | expect(token.doc).toBeInstanceOf(Doc)
67 | expect(token.doc).toBe(doc)
68 | })
69 |
70 | test('has head', async () => {
71 | const doc = await nlp(text)
72 | const head = doc[7].head
73 | expect(head).toBeInstanceOf(Token)
74 | expect(head.i).toBe(6)
75 | expect(head.text).toBe('sentence')
76 | })
77 |
--------------------------------------------------------------------------------
/tests/util.js:
--------------------------------------------------------------------------------
1 | export const text = 'Hello world! This is a sentence about Facebook.'
2 | export const words = [
3 | 'Hello',
4 | 'world',
5 | '!',
6 | 'This',
7 | 'is',
8 | 'a',
9 | 'sentence',
10 | 'about',
11 | 'Facebook',
12 | '.'
13 | ]
14 | export const spaces = [true, false, true, true, true, true, true, true, false, false]
15 | export const attrs = {
16 | model: 'en_core_web_sm',
17 | doc: {
18 | text: 'Hello world! This is a sentence about Facebook.',
19 | text_with_ws: 'Hello world! This is a sentence about Facebook.',
20 | cats: {},
21 | is_tagged: true,
22 | is_parsed: true,
23 | is_sentenced: true
24 | },
25 | ents: [
26 | {
27 | start: 8,
28 | end: 9,
29 | label: 'ORG'
30 | }
31 | ],
32 | sents: [
33 | {
34 | start: 0,
35 | end: 3
36 | },
37 | {
38 | start: 3,
39 | end: 10
40 | }
41 | ],
42 | noun_chunks: [
43 | {
44 | start: 0,
45 | end: 2
46 | },
47 | {
48 | start: 5,
49 | end: 7
50 | },
51 | {
52 | start: 8,
53 | end: 9
54 | }
55 | ],
56 | tokens: [
57 | {
58 | text: 'Hello',
59 | text_with_ws: 'Hello ',
60 | whitespace: ' ',
61 | orth: 15777305708150031551,
62 | i: 0,
63 | ent_type: '',
64 | ent_iob: 'O',
65 | lemma: 'hello',
66 | norm: 'hello',
67 | lower: 'hello',
68 | shape: 'Xxxxx',
69 | prefix: 'H',
70 | suffix: 'llo',
71 | pos: 'INTJ',
72 | tag: 'UH',
73 | dep: 'intj',
74 | is_alpha: true,
75 | is_ascii: true,
76 | is_digit: false,
77 | is_lower: false,
78 | is_upper: false,
79 | is_title: true,
80 | is_punct: false,
81 | is_left_punct: false,
82 | is_right_punct: false,
83 | is_space: false,
84 | is_bracket: false,
85 | is_currency: false,
86 | like_url: false,
87 | like_num: false,
88 | like_email: false,
89 | is_oov: true,
90 | is_stop: false,
91 | is_sent_start: null,
92 | head: 1
93 | },
94 | {
95 | text: 'world',
96 | text_with_ws: 'world',
97 | whitespace: '',
98 | orth: 1703489418272052182,
99 | i: 1,
100 | ent_type: '',
101 | ent_iob: 'O',
102 | lemma: 'world',
103 | norm: 'world',
104 | lower: 'world',
105 | shape: 'xxxx',
106 | prefix: 'w',
107 | suffix: 'rld',
108 | pos: 'NOUN',
109 | tag: 'NN',
110 | dep: 'ROOT',
111 | is_alpha: true,
112 | is_ascii: true,
113 | is_digit: false,
114 | is_lower: true,
115 | is_upper: false,
116 | is_title: false,
117 | is_punct: false,
118 | is_left_punct: false,
119 | is_right_punct: false,
120 | is_space: false,
121 | is_bracket: false,
122 | is_currency: false,
123 | like_url: false,
124 | like_num: false,
125 | like_email: false,
126 | is_oov: true,
127 | is_stop: false,
128 | is_sent_start: null,
129 | head: 1
130 | },
131 | {
132 | text: '!',
133 | text_with_ws: '! ',
134 | whitespace: ' ',
135 | orth: 17494803046312582752,
136 | i: 2,
137 | ent_type: '',
138 | ent_iob: 'O',
139 | lemma: '!',
140 | norm: '!',
141 | lower: '!',
142 | shape: '!',
143 | prefix: '!',
144 | suffix: '!',
145 | pos: 'PUNCT',
146 | tag: '.',
147 | dep: 'punct',
148 | is_alpha: false,
149 | is_ascii: true,
150 | is_digit: false,
151 | is_lower: false,
152 | is_upper: false,
153 | is_title: false,
154 | is_punct: true,
155 | is_left_punct: false,
156 | is_right_punct: false,
157 | is_space: false,
158 | is_bracket: false,
159 | is_currency: false,
160 | like_url: false,
161 | like_num: false,
162 | like_email: false,
163 | is_oov: true,
164 | is_stop: false,
165 | is_sent_start: null,
166 | head: 1
167 | },
168 | {
169 | text: 'This',
170 | text_with_ws: 'This ',
171 | whitespace: ' ',
172 | orth: 12943039165150086467,
173 | i: 3,
174 | ent_type: '',
175 | ent_iob: 'O',
176 | lemma: 'this',
177 | norm: 'this',
178 | lower: 'this',
179 | shape: 'Xxxx',
180 | prefix: 'T',
181 | suffix: 'his',
182 | pos: 'DET',
183 | tag: 'DT',
184 | dep: 'nsubj',
185 | is_alpha: true,
186 | is_ascii: true,
187 | is_digit: false,
188 | is_lower: false,
189 | is_upper: false,
190 | is_title: true,
191 | is_punct: false,
192 | is_left_punct: false,
193 | is_right_punct: false,
194 | is_space: false,
195 | is_bracket: false,
196 | is_currency: false,
197 | like_url: false,
198 | like_num: false,
199 | like_email: false,
200 | is_oov: true,
201 | is_stop: false,
202 | is_sent_start: true,
203 | head: 4
204 | },
205 | {
206 | text: 'is',
207 | text_with_ws: 'is ',
208 | whitespace: ' ',
209 | orth: 3411606890003347522,
210 | i: 4,
211 | ent_type: '',
212 | ent_iob: 'O',
213 | lemma: 'be',
214 | norm: 'is',
215 | lower: 'is',
216 | shape: 'xx',
217 | prefix: 'i',
218 | suffix: 'is',
219 | pos: 'VERB',
220 | tag: 'VBZ',
221 | dep: 'ROOT',
222 | is_alpha: true,
223 | is_ascii: true,
224 | is_digit: false,
225 | is_lower: true,
226 | is_upper: false,
227 | is_title: false,
228 | is_punct: false,
229 | is_left_punct: false,
230 | is_right_punct: false,
231 | is_space: false,
232 | is_bracket: false,
233 | is_currency: false,
234 | like_url: false,
235 | like_num: false,
236 | like_email: false,
237 | is_oov: true,
238 | is_stop: true,
239 | is_sent_start: null,
240 | head: 4
241 | },
242 | {
243 | text: 'a',
244 | text_with_ws: 'a ',
245 | whitespace: ' ',
246 | orth: 11901859001352538922,
247 | i: 5,
248 | ent_type: '',
249 | ent_iob: 'O',
250 | lemma: 'a',
251 | norm: 'gonna',
252 | lower: 'a',
253 | shape: 'x',
254 | prefix: 'a',
255 | suffix: 'a',
256 | pos: 'DET',
257 | tag: 'DT',
258 | dep: 'det',
259 | is_alpha: true,
260 | is_ascii: true,
261 | is_digit: false,
262 | is_lower: true,
263 | is_upper: false,
264 | is_title: false,
265 | is_punct: false,
266 | is_left_punct: false,
267 | is_right_punct: false,
268 | is_space: false,
269 | is_bracket: false,
270 | is_currency: false,
271 | like_url: false,
272 | like_num: false,
273 | like_email: false,
274 | is_oov: true,
275 | is_stop: true,
276 | is_sent_start: null,
277 | head: 6
278 | },
279 | {
280 | text: 'sentence',
281 | text_with_ws: 'sentence ',
282 | whitespace: ' ',
283 | orth: 18108853898452662235,
284 | i: 6,
285 | ent_type: '',
286 | ent_iob: 'O',
287 | lemma: 'sentence',
288 | norm: 'sentence',
289 | lower: 'sentence',
290 | shape: 'xxxx',
291 | prefix: 's',
292 | suffix: 'nce',
293 | pos: 'NOUN',
294 | tag: 'NN',
295 | dep: 'attr',
296 | is_alpha: true,
297 | is_ascii: true,
298 | is_digit: false,
299 | is_lower: true,
300 | is_upper: false,
301 | is_title: false,
302 | is_punct: false,
303 | is_left_punct: false,
304 | is_right_punct: false,
305 | is_space: false,
306 | is_bracket: false,
307 | is_currency: false,
308 | like_url: false,
309 | like_num: false,
310 | like_email: false,
311 | is_oov: true,
312 | is_stop: false,
313 | is_sent_start: null,
314 | head: 4
315 | },
316 | {
317 | text: 'about',
318 | text_with_ws: 'about ',
319 | whitespace: ' ',
320 | orth: 942632335873952620,
321 | i: 7,
322 | ent_type: '',
323 | ent_iob: 'O',
324 | lemma: 'about',
325 | norm: 'about',
326 | lower: 'about',
327 | shape: 'xxxx',
328 | prefix: 'a',
329 | suffix: 'out',
330 | pos: 'ADP',
331 | tag: 'IN',
332 | dep: 'prep',
333 | is_alpha: true,
334 | is_ascii: true,
335 | is_digit: false,
336 | is_lower: true,
337 | is_upper: false,
338 | is_title: false,
339 | is_punct: false,
340 | is_left_punct: false,
341 | is_right_punct: false,
342 | is_space: false,
343 | is_bracket: false,
344 | is_currency: false,
345 | like_url: false,
346 | like_num: false,
347 | like_email: false,
348 | is_oov: true,
349 | is_stop: true,
350 | is_sent_start: null,
351 | head: 6
352 | },
353 | {
354 | text: 'Facebook',
355 | text_with_ws: 'Facebook',
356 | whitespace: '',
357 | orth: 8081970590932371665,
358 | i: 8,
359 | ent_type: 'ORG',
360 | ent_iob: 'B',
361 | lemma: 'facebook',
362 | norm: 'facebook',
363 | lower: 'facebook',
364 | shape: 'Xxxxx',
365 | prefix: 'F',
366 | suffix: 'ook',
367 | pos: 'PROPN',
368 | tag: 'NNP',
369 | dep: 'pobj',
370 | is_alpha: true,
371 | is_ascii: true,
372 | is_digit: false,
373 | is_lower: false,
374 | is_upper: false,
375 | is_title: true,
376 | is_punct: false,
377 | is_left_punct: false,
378 | is_right_punct: false,
379 | is_space: false,
380 | is_bracket: false,
381 | is_currency: false,
382 | like_url: false,
383 | like_num: false,
384 | like_email: false,
385 | is_oov: true,
386 | is_stop: false,
387 | is_sent_start: null,
388 | head: 7
389 | },
390 | {
391 | text: '.',
392 | text_with_ws: '.',
393 | whitespace: '',
394 | orth: 12646065887601541794,
395 | i: 9,
396 | ent_type: '',
397 | ent_iob: 'O',
398 | lemma: '.',
399 | norm: '.',
400 | lower: '.',
401 | shape: '.',
402 | prefix: '.',
403 | suffix: '.',
404 | pos: 'PUNCT',
405 | tag: '.',
406 | dep: 'punct',
407 | is_alpha: false,
408 | is_ascii: true,
409 | is_digit: false,
410 | is_lower: false,
411 | is_upper: false,
412 | is_title: false,
413 | is_punct: true,
414 | is_left_punct: false,
415 | is_right_punct: false,
416 | is_space: false,
417 | is_bracket: false,
418 | is_currency: false,
419 | like_url: false,
420 | like_num: false,
421 | like_email: false,
422 | is_oov: true,
423 | is_stop: false,
424 | is_sent_start: null,
425 | head: 4
426 | }
427 | ]
428 | }
429 |
--------------------------------------------------------------------------------