├── requirements.txt
├── app.py
├── LICENSE
├── .gitignore
├── query.py
├── README.md
└── schema.py

/requirements.txt:
--------------------------------------------------------------------------------
spacy>=2.0.12,<2.1.0
flask>=1.0.2,<1.1.0
flask-graphql>=2.0.0,<2.1.0
graphene>=2.1.3,<2.2.0

https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
from flask import Flask
from flask_graphql import GraphQLView
from graphene import Schema
import os

from query import Query


schema = Schema(query=Query)
# Serve the GraphQL endpoint at "/", with the interactive GraphiQL explorer enabled
view_func = GraphQLView.as_view('graphql', schema=schema, graphiql=True)
app = Flask(__name__)
app.add_url_rule('/', view_func=view_func)


if __name__ == '__main__':
    host = os.environ.get('SPACY_HOST', '0.0.0.0')
    # Environment variables come in as strings, so cast the port explicitly
    port = int(os.environ.get('SPACY_PORT', 8080))
    app.run(host=host, port=port)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Ines Montani

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
cythonize.dat
*.cpp
.pytest_cache
.python-version
.vscode

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
.env/
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints
--------------------------------------------------------------------------------
/query.py:
--------------------------------------------------------------------------------
import spacy
from graphene import ObjectType, Field, String

from schema import NLP, Doc, Token, Span, Cat, Meta


# In-memory model cache: each model is loaded once per process and then reused
MODELS = {}


def get_model(name: str):
    if name not in MODELS:
        MODELS[name] = spacy.load(name)
    return MODELS[name]


def get_token(token: spacy.tokens.Token) -> Token:
    return Token(
        text=token.text,
        text_with_ws=token.text_with_ws,
        orth=token.orth,
        i=token.i,
        idx=token.idx,
        head_i=token.head.i,
        lower=token.lower,
        lower_=token.lower_,
        shape=token.shape,
        shape_=token.shape_,
        lemma=token.lemma,
        lemma_=token.lemma_,
        norm=token.norm,
        norm_=token.norm_,
        pos=token.pos,
        pos_=token.pos_,
        tag=token.tag,
        tag_=token.tag_,
        dep=token.dep,
        dep_=token.dep_,
        ent_type=token.ent_type,
        ent_type_=token.ent_type_,
        ent_iob=token.ent_iob,
        ent_iob_=token.ent_iob_,
        is_alpha=token.is_alpha,
        is_ascii=token.is_ascii,
        is_digit=token.is_digit,
        is_lower=token.is_lower,
        is_upper=token.is_upper,
        is_title=token.is_title,
        is_punct=token.is_punct,
        is_left_punct=token.is_left_punct,
        is_right_punct=token.is_right_punct,
        is_space=token.is_space,
        is_bracket=token.is_bracket,
        is_quote=token.is_quote,
        is_stop=token.is_stop,
        like_num=token.like_num,
        like_url=token.like_url,
        like_email=token.like_email
    )


def get_span(span: spacy.tokens.Span) -> Span:
    return Span(
        text=span.text,
        text_with_ws=span.text_with_ws,
        start=span.start,
        end=span.end,
        start_char=span.start_char,
        end_char=span.end_char,
        label=span.label,
        label_=span.label_
    )


def get_cat(label: str, score: float) -> Cat:
    return Cat(label=label, score=score)


def get_meta(meta: dict) -> Meta:
    return Meta(
        lang=meta.get('lang'),
        name=meta.get('name'),
        license=meta.get('license'),
        author=meta.get('author'),
        url=meta.get('url'),
        email=meta.get('email'),
        description=meta.get('description'),
        pipeline=meta.get('pipeline'),
        sources=meta.get('sources')
    )


def get_doc(doc: spacy.tokens.Doc) -> Doc:
    tokens = [get_token(token) for token in doc]
    ents = [get_span(ent) for ent in doc.ents]
    sents = [get_span(sent) for sent in doc.sents]
    cats = [get_cat(label, score) for label, score in doc.cats.items()]
    return Doc(
        text=doc.text,
        text_with_ws=doc.text_with_ws,
        tokens=tokens,
        ents=ents,
        sents=sents,
        cats=cats
    )


class Query(ObjectType):
    nlp = Field(NLP,
                text=String(required=True, description="The text to process"),
                model=String(required=True, description="The name of the model to use"),
                description="The nlp object used to process a text"
                )

    def resolve_nlp(self, info, text, model):
        _nlp = get_model(model)
        meta = get_meta(_nlp.meta)
        doc = get_doc(_nlp(text))
        return NLP(doc=doc, meta=meta)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# spacy-graphql

A very simple and experimental app that lets you query [spaCy](https://spacy.io)'s linguistic annotations using [GraphQL](https://graphql.org). It's my first ever experiment with GraphQL, so it's probably not as elegant as it could be.

The API currently supports most [token attributes](https://spacy.io/api/token#attributes), named entities, sentences and text categories (if available as `doc.cats`, i.e. if you added a text classifier to a model). The `meta` field will return the model's meta data. Models are only loaded once and kept in memory.

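For example, if the loaded model has a text classifier in its pipeline, you can request the predicted categories directly. A minimal sketch; `my_textcat_model` is a hypothetical model name standing in for whatever text-classification model you have installed:

```graphql
{
  nlp(text: "This is a text about technology.", model: "my_textcat_model") {
    doc {
      cats {
        label
        score
      }
    }
  }
}
```
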
It currently doesn't do anything particularly clever, so regardless of your query, the full model pipeline will always be applied, even if you only need the token texts. Similarly, if you only request entities, the document will still be tagged and parsed.

## Installation & Usage

To try it out, clone this repo and install the dependencies. By default, the [`en_core_web_sm` model](https://spacy.io/models/en#en_core_web_sm) is installed as part of the requirements. Note that the API requires **Python 3.5 or higher**.

```bash
git clone https://github.com/ines/spacy-graphql
cd spacy-graphql
pip install -r requirements.txt
# optional: install more spaCy models
```

Executing [`app.py`](app.py) will start the server:

```bash
python app.py
```

You can use the `SPACY_HOST` and `SPACY_PORT` environment variables to change the host and port. By default, the API is served on [`localhost:8080`](http://localhost:8080).

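For example, to serve the API on a different host and port for a single run (the values below are just for illustration):

```bash
SPACY_HOST=127.0.0.1 SPACY_PORT=5000 python app.py
```
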
description="The nlp object used to process a text" 112 | ) 113 | 114 | def resolve_nlp(self, info, text, model): 115 | _nlp = get_model(model) 116 | meta = get_meta(_nlp.meta) 117 | doc = get_doc(_nlp(text)) 118 | return NLP(doc=doc, meta=meta) 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spacy-graphql 2 | 3 | A very simple and experimental app that lets you query [spaCy](https://spacy.io)'s linguistic annotations using [GraphQL](https://graphql.org). It's my first ever experiment with GraphQL, so it's probably not as elegant as it could be. 4 | 5 | The API currently supports most [token attributes](https://spacy.io/api/token#attributes), named entities, sentences and text categories (if available as `doc.cats`, i.e. if you added a text classifier to a model). The `meta` field will return the model meta data. Models are only loaded once and kept in memory. 6 | 7 | It currently doesn't do anything particularly clever, so regardless of your query, the full model pipeline will always be applied, even if you only need the token texts. Similarly, if you only request entities, the document will still be tagged and parsed. 8 | 9 | ## Installation & Usage 10 | 11 | To try it out, clone this repo and install the dependencies. By default, the [`en_core_web_sm` model](https://spacy.io/models/en#en_core_web_sm) will be preinstalled. Note that the API requires **Python 3.5 or higher**. 12 | 13 | ```bash 14 | git clone https://github.com/ines/spacy-graphql 15 | cd spacy-graphql 16 | pip install -r requirements.txt 17 | # optional: install more spaCy models 18 | ``` 19 | 20 | Executing the [`app.py`](app.py) will start the server: 21 | 22 | ```bash 23 | python app.py 24 | ``` 25 | 26 | You can use the `SPACY_HOST` and `SPACY_PORT` environment variables to change the host and port. By default, the API is served on [`localhost:8080`](http://localhost:8080). 27 | 28 | If you navigate to the URL in your browser, you can explore the API interactively using [GraphiQL](https://github.com/graphql/graphiql). It also shows the complete documentation for the available fields. 29 | 30 | ![GraphiQL](https://user-images.githubusercontent.com/13643239/43527763-9dd554c0-95a7-11e8-9c73-c60f6f546d7f.png) 31 | 32 | ### Example query 33 | 34 | Both the `text` and `model` argument are required. The value of `model` is passed to `spacy.load`, so you'll be able to load any model that's installed in the same environment out-of-the-box. 35 | 36 | ```graphql 37 | { 38 | nlp(text: "Zuckerberg is the CEO of Facebook.", model: "en_core_web_sm") { 39 | meta { 40 | lang 41 | description 42 | } 43 | doc { 44 | text 45 | tokens { 46 | text 47 | pos_ 48 | } 49 | ents { 50 | text 51 | label_ 52 | } 53 | } 54 | } 55 | } 56 | ``` 57 | 58 | ### Example Response 59 | 60 | ```json 61 | { 62 | "data": { 63 | "nlp": { 64 | "meta": { 65 | "lang": "en", 66 | "description": "English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities." 
```graphql
{
  nlp(text: "Zuckerberg is the CEO of Facebook.", model: "en_core_web_sm") {
    meta {
      lang
      description
    }
    doc {
      text
      tokens {
        text
        pos_
      }
      ents {
        text
        label_
      }
    }
  }
}
```

### Example response

```json
{
  "data": {
    "nlp": {
      "meta": {
        "lang": "en",
        "description": "English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities."
      },
      "doc": {
        "text": "Zuckerberg is the CEO of Facebook.",
        "tokens": [
          {
            "text": "Zuckerberg",
            "pos_": "PROPN"
          },
          {
            "text": "is",
            "pos_": "VERB"
          },
          {
            "text": "the",
            "pos_": "DET"
          },
          {
            "text": "CEO",
            "pos_": "NOUN"
          },
          {
            "text": "of",
            "pos_": "ADP"
          },
          {
            "text": "Facebook",
            "pos_": "PROPN"
          },
          {
            "text": ".",
            "pos_": "PUNCT"
          }
        ],
        "ents": [
          {
            "text": "Zuckerberg",
            "label_": "PERSON"
          },
          {
            "text": "Facebook",
            "label_": "ORG"
          }
        ]
      }
    }
  }
}
```

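You don't have to go through GraphiQL, either; the endpoint answers regular HTTP requests. A minimal sketch using `curl`, assuming `flask-graphql`'s default handling of a POST request with a JSON body containing a `query` field:

```bash
curl 'http://localhost:8080/' \
  -H 'Content-Type: application/json' \
  -d '{"query": "{ nlp(text: \"Hello world\", model: \"en_core_web_sm\") { doc { tokens { text } } } }"}'
```
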
--------------------------------------------------------------------------------
/schema.py:
--------------------------------------------------------------------------------
from graphene import ObjectType, Field, List, String, Boolean, Int, Float


class Token(ObjectType):
    """An individual token — a word, punctuation symbol, whitespace, etc."""

    text = String(description="Verbatim text")
    text_with_ws = String(description="Text with trailing space, if present")
    orth = Int(description="ID of the verbatim text content")
    i = Int(description="Index of the token within the parent Doc")
    idx = Int(description="Character offset of the token within parent Doc")
    head_i = Int(description="Index of the token's head")
    lower = Int(description="Lowercase form")
    lower_ = String(description="Lowercase form")
    shape = Int(description="Transform of token text, to show orthographic features")
    shape_ = String(description="Transform of token text, to show orthographic features")
    lemma = Int(description="Base form of the token")
    lemma_ = String(description="Base form of the token")
    norm = Int(description="Normalized form of the token")
    norm_ = String(description="Normalized form of the token")
    pos = Int(description="Coarse-grained part-of-speech tag")
    pos_ = String(description="Coarse-grained part-of-speech tag")
    tag = Int(description="Fine-grained part-of-speech tag")
    tag_ = String(description="Fine-grained part-of-speech tag")
    dep = Int(description="Dependency label")
    dep_ = String(description="Dependency label")
    ent_type = Int(description="Named entity type")
    ent_type_ = String(description="Named entity type")
    ent_iob = Int(description="IOB code of named entity tag")
    ent_iob_ = String(description="IOB code of named entity tag")
    is_alpha = Boolean(description="Does the token consist of alphabetic characters?")
    is_ascii = Boolean(description="Does the token consist of ASCII characters?")
    is_digit = Boolean(description="Does the token consist of digits?")
    is_lower = Boolean(description="Is the token lowercase?")
    is_upper = Boolean(description="Is the token uppercase?")
    is_title = Boolean(description="Is the token titlecase?")
    is_punct = Boolean(description="Is the token punctuation?")
    is_left_punct = Boolean(description="Is the token left punctuation?")
    is_right_punct = Boolean(description="Is the token right punctuation?")
    is_space = Boolean(description="Does the token consist of whitespace characters?")
    is_bracket = Boolean(description="Is the token a bracket?")
    is_quote = Boolean(description="Is the token a quotation mark?")
    is_stop = Boolean(description="Is the token a stop word?")
    like_num = Boolean(description="Does the token resemble a number?")
    like_url = Boolean(description="Does the token resemble a URL?")
    like_email = Boolean(description="Does the token resemble an email address?")


class Span(ObjectType):
    """A slice from a Doc object"""

    text = String(description="Verbatim text")
    text_with_ws = String(description="Text with trailing space, if present")
    start = Int(description="The token offset for the start of the span")
    end = Int(description="The token offset for the end of the span (exclusive)")
    start_char = Int(description="The character offset for the start of the span")
    end_char = Int(description="The character offset for the end of the span (exclusive)")
    label = Int(description="The span's label")
    label_ = String(description="The span's label")


class Cat(ObjectType):
    """A text category predicted by the text classifier"""

    label = String(description="The name of the category")
    score = Float(description="The score predicted for the category")


class Doc(ObjectType):
    """A sequence of Token objects and a container for accessing linguistic
    annotations."""

    text = String(description="Verbatim text")
    text_with_ws = String(description="Text with trailing space, if present")
    tokens = List(Token, description="The tokens in the document")
    ents = List(Span, description="The named entities in the document")
    sents = List(Span, description="The sentences in the document")
    cats = List(Cat, description="The text classification categories, if available")


class Meta(ObjectType):
    """The current model's meta information."""

    lang = String(description="Model language")
    name = String(description="Model name")
    license = String(description="Model license")
    author = String(description="Model author")
    url = String(description="Model author URL")
    email = String(description="Model author email")
    description = String(description="Model description")
    pipeline = List(String, description="Names of model pipeline components")
    sources = List(String, description="Training data sources")


class NLP(ObjectType):
    """Container for processing results and meta information."""

    doc = Field(Doc, description="The processed document")
    meta = Field(Meta, description="The current model's meta information")
--------------------------------------------------------------------------------