├── requirements.txt
├── app.py
├── LICENSE
├── .gitignore
├── query.py
├── README.md
└── schema.py

/requirements.txt:
--------------------------------------------------------------------------------
spacy>=2.0.12,<2.1.0
flask>=1.0.2,<1.1.0
flask-graphql>=2.0.0,<2.1.0
graphene>=2.1.3,<2.2.0

https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
from flask import Flask
from flask_graphql import GraphQLView
from graphene import Schema
import os

from query import Query


schema = Schema(query=Query)
# Serve the GraphQL endpoint at "/", with the interactive GraphiQL explorer enabled
view_func = GraphQLView.as_view('graphql', schema=schema, graphiql=True)
app = Flask(__name__)
app.add_url_rule('/', view_func=view_func)


if __name__ == '__main__':
    host = os.environ.get('SPACY_HOST', '0.0.0.0')
    # Environment variables come in as strings, so cast the port explicitly
    port = int(os.environ.get('SPACY_PORT', 8080))
    app.run(host=host, port=port)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Ines Montani

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
cythonize.dat
*.cpp
.pytest_cache
.python-version
.vscode

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
.env/
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints
--------------------------------------------------------------------------------
/query.py:
--------------------------------------------------------------------------------
import spacy
from graphene import ObjectType, Field, String

from schema import NLP, Doc, Token, Span, Cat, Meta


# In-memory model cache: each model is loaded once per process and then reused
MODELS = {}


def get_model(name: str):
    if name not in MODELS:
        MODELS[name] = spacy.load(name)
    return MODELS[name]


def get_token(token: spacy.tokens.Token) -> Token:
    return Token(
        text=token.text,
        text_with_ws=token.text_with_ws,
        orth=token.orth,
        i=token.i,
        idx=token.idx,
        head_i=token.head.i,
        lower=token.lower,
        lower_=token.lower_,
        shape=token.shape,
        shape_=token.shape_,
        lemma=token.lemma,
        lemma_=token.lemma_,
        norm=token.norm,
        norm_=token.norm_,
        pos=token.pos,
        pos_=token.pos_,
        tag=token.tag,
        tag_=token.tag_,
        dep=token.dep,
        dep_=token.dep_,
        ent_type=token.ent_type,
        ent_type_=token.ent_type_,
        ent_iob=token.ent_iob,
        ent_iob_=token.ent_iob_,
        is_alpha=token.is_alpha,
        is_ascii=token.is_ascii,
        is_digit=token.is_digit,
        is_lower=token.is_lower,
        is_upper=token.is_upper,
        is_title=token.is_title,
        is_punct=token.is_punct,
        is_left_punct=token.is_left_punct,
        is_right_punct=token.is_right_punct,
        is_space=token.is_space,
        is_bracket=token.is_bracket,
        is_quote=token.is_quote,
        is_stop=token.is_stop,
        like_num=token.like_num,
        like_url=token.like_url,
        like_email=token.like_email
    )


def get_span(span: spacy.tokens.Span) -> Span:
    return Span(
        text=span.text,
        text_with_ws=span.text_with_ws,
        start=span.start,
        end=span.end,
        start_char=span.start_char,
        end_char=span.end_char,
        label=span.label,
        label_=span.label_
    )


def get_cat(label: str, score: float) -> Cat:
    return Cat(label=label, score=score)


def get_meta(meta: dict) -> Meta:
    return Meta(
        lang=meta.get('lang'),
        name=meta.get('name'),
        license=meta.get('license'),
        author=meta.get('author'),
        url=meta.get('url'),
        email=meta.get('email'),
        description=meta.get('description'),
        pipeline=meta.get('pipeline'),
        sources=meta.get('sources')
    )


def get_doc(doc: spacy.tokens.Doc) -> Doc:
    tokens = [get_token(token) for token in doc]
    ents = [get_span(ent) for ent in doc.ents]
    sents = [get_span(sent) for sent in doc.sents]
    cats = [get_cat(label, score) for label, score in doc.cats.items()]
    return Doc(
        text=doc.text,
        text_with_ws=doc.text_with_ws,
        tokens=tokens,
        ents=ents,
        sents=sents,
        cats=cats
    )


class Query(ObjectType):
    nlp = Field(NLP,
                text=String(required=True, description="The text to process"),
                model=String(required=True, description="The name of the model to use"),
                description="The nlp object used to process a text"
                )

    def resolve_nlp(self, info, text, model):
        _nlp = get_model(model)
        meta = get_meta(_nlp.meta)
        doc = get_doc(_nlp(text))
        return NLP(doc=doc, meta=meta)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# spacy-graphql

A very simple and experimental app that lets you query [spaCy](https://spacy.io)'s linguistic annotations using [GraphQL](https://graphql.org). It's my first ever experiment with GraphQL, so it's probably not as elegant as it could be.

The API currently supports most [token attributes](https://spacy.io/api/token#attributes), named entities, sentences and text categories (if available as `doc.cats`, i.e. if you added a text classifier to a model). The `meta` field will return the model's meta data. Models are only loaded once and kept in memory.

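For example, if the loaded model has a text classifier in its pipeline, you can request the predicted categories directly. A minimal sketch; `my_textcat_model` is a hypothetical model name standing in for whatever text-classification model you have installed:

```graphql
{
  nlp(text: "This is a text about technology.", model: "my_textcat_model") {
    doc {
      cats {
        label
        score
      }
    }
  }
}
```
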
It currently doesn't do anything particularly clever, so regardless of your query, the full model pipeline will always be applied, even if you only need the token texts. Similarly, if you only request entities, the document will still be tagged and parsed.

## Installation & Usage

To try it out, clone this repo and install the dependencies. By default, the [`en_core_web_sm` model](https://spacy.io/models/en#en_core_web_sm) is installed as part of the requirements. Note that the API requires **Python 3.5 or higher**.

```bash
git clone https://github.com/ines/spacy-graphql
cd spacy-graphql
pip install -r requirements.txt
# optional: install more spaCy models
```

Executing [`app.py`](app.py) will start the server:

```bash
python app.py
```

You can use the `SPACY_HOST` and `SPACY_PORT` environment variables to change the host and port. By default, the API is served on [`localhost:8080`](http://localhost:8080).

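For example, to serve the API on a different host and port for a single run (the values below are just for illustration):

```bash
SPACY_HOST=127.0.0.1 SPACY_PORT=5000 python app.py
```
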
description="The nlp object used to process a text" 112 | ) 113 | 114 | def resolve_nlp(self, info, text, model): 115 | _nlp = get_model(model) 116 | meta = get_meta(_nlp.meta) 117 | doc = get_doc(_nlp(text)) 118 | return NLP(doc=doc, meta=meta) 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spacy-graphql 2 | 3 | A very simple and experimental app that lets you query [spaCy](https://spacy.io)'s linguistic annotations using [GraphQL](https://graphql.org). It's my first ever experiment with GraphQL, so it's probably not as elegant as it could be. 4 | 5 | The API currently supports most [token attributes](https://spacy.io/api/token#attributes), named entities, sentences and text categories (if available as `doc.cats`, i.e. if you added a text classifier to a model). The `meta` field will return the model meta data. Models are only loaded once and kept in memory. 6 | 7 | It currently doesn't do anything particularly clever, so regardless of your query, the full model pipeline will always be applied, even if you only need the token texts. Similarly, if you only request entities, the document will still be tagged and parsed. 8 | 9 | ## Installation & Usage 10 | 11 | To try it out, clone this repo and install the dependencies. By default, the [`en_core_web_sm` model](https://spacy.io/models/en#en_core_web_sm) will be preinstalled. Note that the API requires **Python 3.5 or higher**. 12 | 13 | ```bash 14 | git clone https://github.com/ines/spacy-graphql 15 | cd spacy-graphql 16 | pip install -r requirements.txt 17 | # optional: install more spaCy models 18 | ``` 19 | 20 | Executing the [`app.py`](app.py) will start the server: 21 | 22 | ```bash 23 | python app.py 24 | ``` 25 | 26 | You can use the `SPACY_HOST` and `SPACY_PORT` environment variables to change the host and port. By default, the API is served on [`localhost:8080`](http://localhost:8080). 27 | 28 | If you navigate to the URL in your browser, you can explore the API interactively using [GraphiQL](https://github.com/graphql/graphiql). It also shows the complete documentation for the available fields. 29 | 30 | ![GraphiQL](https://user-images.githubusercontent.com/13643239/43527763-9dd554c0-95a7-11e8-9c73-c60f6f546d7f.png) 31 | 32 | ### Example query 33 | 34 | Both the `text` and `model` argument are required. The value of `model` is passed to `spacy.load`, so you'll be able to load any model that's installed in the same environment out-of-the-box. 35 | 36 | ```graphql 37 | { 38 | nlp(text: "Zuckerberg is the CEO of Facebook.", model: "en_core_web_sm") { 39 | meta { 40 | lang 41 | description 42 | } 43 | doc { 44 | text 45 | tokens { 46 | text 47 | pos_ 48 | } 49 | ents { 50 | text 51 | label_ 52 | } 53 | } 54 | } 55 | } 56 | ``` 57 | 58 | ### Example Response 59 | 60 | ```json 61 | { 62 | "data": { 63 | "nlp": { 64 | "meta": { 65 | "lang": "en", 66 | "description": "English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities." 
```graphql
{
  nlp(text: "Zuckerberg is the CEO of Facebook.", model: "en_core_web_sm") {
    meta {
      lang
      description
    }
    doc {
      text
      tokens {
        text
        pos_
      }
      ents {
        text
        label_
      }
    }
  }
}
```

### Example response

```json
{
  "data": {
    "nlp": {
      "meta": {
        "lang": "en",
        "description": "English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities."
      },
      "doc": {
        "text": "Zuckerberg is the CEO of Facebook.",
        "tokens": [
          {
            "text": "Zuckerberg",
            "pos_": "PROPN"
          },
          {
            "text": "is",
            "pos_": "VERB"
          },
          {
            "text": "the",
            "pos_": "DET"
          },
          {
            "text": "CEO",
            "pos_": "NOUN"
          },
          {
            "text": "of",
            "pos_": "ADP"
          },
          {
            "text": "Facebook",
            "pos_": "PROPN"
          },
          {
            "text": ".",
            "pos_": "PUNCT"
          }
        ],
        "ents": [
          {
            "text": "Zuckerberg",
            "label_": "PERSON"
          },
          {
            "text": "Facebook",
            "label_": "ORG"
          }
        ]
      }
    }
  }
}
```

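You don't have to go through GraphiQL, either; the endpoint answers regular HTTP requests. A minimal sketch using `curl`, assuming `flask-graphql`'s default handling of a POST request with a JSON body containing a `query` field:

```bash
curl 'http://localhost:8080/' \
  -H 'Content-Type: application/json' \
  -d '{"query": "{ nlp(text: \"Hello world\", model: \"en_core_web_sm\") { doc { tokens { text } } } }"}'
```
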
--------------------------------------------------------------------------------
/schema.py:
--------------------------------------------------------------------------------
from graphene import ObjectType, Field, List, String, Boolean, Int, Float


class Token(ObjectType):
    """An individual token — a word, punctuation symbol, whitespace, etc."""

    text = String(description="Verbatim text")
    text_with_ws = String(description="Text with trailing space, if present")
    orth = Int(description="ID of the verbatim text content")
    i = Int(description="Index of the token within the parent Doc")
    idx = Int(description="Character offset of the token within parent Doc")
    head_i = Int(description="Index of the token's head")
    lower = Int(description="Lowercase form")
    lower_ = String(description="Lowercase form")
    shape = Int(description="Transform of token text, to show orthographic features")
    shape_ = String(description="Transform of token text, to show orthographic features")
    lemma = Int(description="Base form of the token")
    lemma_ = String(description="Base form of the token")
    norm = Int(description="Normalized form of the token")
    norm_ = String(description="Normalized form of the token")
    pos = Int(description="Coarse-grained part-of-speech tag")
    pos_ = String(description="Coarse-grained part-of-speech tag")
    tag = Int(description="Fine-grained part-of-speech tag")
    tag_ = String(description="Fine-grained part-of-speech tag")
    dep = Int(description="Dependency label")
    dep_ = String(description="Dependency label")
    ent_type = Int(description="Named entity type")
    ent_type_ = String(description="Named entity type")
    ent_iob = Int(description="IOB code of named entity tag")
    ent_iob_ = String(description="IOB code of named entity tag")
    is_alpha = Boolean(description="Does the token consist of alphabetic characters?")
    is_ascii = Boolean(description="Does the token consist of ASCII characters?")
    is_digit = Boolean(description="Does the token consist of digits?")
    is_lower = Boolean(description="Is the token lowercase?")
    is_upper = Boolean(description="Is the token uppercase?")
    is_title = Boolean(description="Is the token titlecase?")
    is_punct = Boolean(description="Is the token punctuation?")
    is_left_punct = Boolean(description="Is the token left punctuation?")
    is_right_punct = Boolean(description="Is the token right punctuation?")
    is_space = Boolean(description="Does the token consist of whitespace characters?")
    is_bracket = Boolean(description="Is the token a bracket?")
    is_quote = Boolean(description="Is the token a quotation mark?")
    is_stop = Boolean(description="Is the token a stop word?")
    like_num = Boolean(description="Does the token resemble a number?")
    like_url = Boolean(description="Does the token resemble a URL?")
    like_email = Boolean(description="Does the token resemble an email address?")


class Span(ObjectType):
    """A slice from a Doc object"""

    text = String(description="Verbatim text")
    text_with_ws = String(description="Text with trailing space, if present")
    start = Int(description="The token offset for the start of the span")
    end = Int(description="The token offset for the end of the span (exclusive)")
    start_char = Int(description="The character offset for the start of the span")
    end_char = Int(description="The character offset for the end of the span (exclusive)")
    label = Int(description="The span's label")
    label_ = String(description="The span's label")


class Cat(ObjectType):
    """A text category predicted by the text classifier"""

    label = String(description="The name of the category")
    score = Float(description="The score predicted for the category")


class Doc(ObjectType):
    """A sequence of Token objects and a container for accessing linguistic
    annotations."""

    text = String(description="Verbatim text")
    text_with_ws = String(description="Text with trailing space, if present")
    tokens = List(Token, description="The tokens in the document")
    ents = List(Span, description="The named entities in the document")
    sents = List(Span, description="The sentences in the document")
    cats = List(Cat, description="The text classification categories, if available")


class Meta(ObjectType):
    """The current model's meta information."""

    lang = String(description="Model language")
    name = String(description="Model name")
    license = String(description="Model license")
    author = String(description="Model author")
    url = String(description="Model author URL")
    email = String(description="Model author email")
    description = String(description="Model description")
    pipeline = List(String, description="Names of model pipeline components")
    sources = List(String, description="Training data sources")


class NLP(ObjectType):
    """Container for processing results and meta information."""

    doc = Field(Doc, description="The processed document")
    meta = Field(Meta, description="The current model's meta information")
--------------------------------------------------------------------------------