├── .gitignore ├── LICENSE ├── README.md ├── base_config.cfg ├── config.cfg ├── create_data.py └── test_input.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.spacy 6 | output/ 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Philipp Sodmann 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spacy3Textcat 2 | This repository is the implementation for my SpacyV3 Text Categorizer Tutorial on medium: https://medium.com/@psodmann/building-a-text-classifier-with-spacy-3-0-dd16e9979a 3 | 4 | ### How to Run the Example 5 | you can clone it and run ```python create_data.py``` to create train and test data ```python -m spacy train config.cfg --output ./output``` to train a classifier and play with the model with ```python test_input.py``` 6 | 7 | ### Requirements: 8 | spacy-nightly 9 | ml-datasets 10 | -------------------------------------------------------------------------------- /base_config.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "data/train.spacy" 3 | dev = "data/valid.spacy" 4 | 5 | [system] 6 | gpu_allocator = null 7 | 8 | 9 | [nlp] 10 | lang = "en" 11 | pipeline = ["tok2vec","textcat"] 12 | tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"} 13 | 14 | [components] 15 | 16 | [components.tok2vec] 17 | factory = "tok2vec" 18 | 19 | [components.tok2vec.model] 20 | @architectures = "spacy.Tok2Vec.v1" 21 | 22 | [components.tok2vec.model.embed] 23 | @architectures = "spacy.MultiHashEmbed.v1" 24 | width = ${components.tok2vec.model.encode.width} 25 | attrs = ["ORTH", "SHAPE"] 26 | rows = [5000, 2500] 27 | include_static_vectors = false 28 | 29 | [components.tok2vec.model.encode] 30 | @architectures = "spacy.MaxoutWindowEncoder.v1" 31 | width = 96 32 | depth = 4 33 | window_size = 1 34 | maxout_pieces = 3 35 | 36 | 37 | [components.textcat] 38 | factory = "textcat" 39 | 40 | [components.textcat.model] 41 | @architectures = "spacy.TextCatBOW.v1" 42 | exclusive_classes = false 43 | ngram_size = 1 44 | no_output_layer = false 45 | 46 | [corpora] 47 | 48 | [corpora.train] 49 | @readers = "spacy.Corpus.v1" 50 | path = ${paths.train} 51 | max_length = 2000 52 | 53 | [corpora.dev] 54 | @readers = "spacy.Corpus.v1" 55 | path = ${paths.dev} 56 | max_length = 0 57 | 58 | [training] 59 | dev_corpus = "corpora.dev" 60 | train_corpus = "corpora.train" 61 | 62 | [training.optimizer] 63 | @optimizers = "Adam.v1" 64 | 65 | 66 | [training.batcher] 67 | @batchers = "spacy.batch_by_words.v1" 68 | discard_oversize = false 69 | tolerance = 0.2 70 | 71 | [training.batcher.size] 72 | @schedules = "compounding.v1" 73 | start = 100 74 | stop = 1000 75 | compound = 1.001 76 | 77 | [initialize] 78 | vectors = null 79 | -------------------------------------------------------------------------------- /config.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | train = "data/train.spacy" 3 | dev = "data/valid.spacy" 4 | vectors = null 5 | init_tok2vec = null 6 | 7 | [system] 8 | gpu_allocator = null 9 | seed = 0 10 | 11 | [nlp] 12 | lang = "en" 13 | pipeline = ["tok2vec","textcat"] 14 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} 15 | disabled = [] 16 | before_creation = null 17 | after_creation = null 18 | after_pipeline_creation = null 19 | 20 | [components] 21 | 22 | [components.textcat] 23 | factory = "textcat" 24 | threshold = 0.5 25 | 26 | [components.textcat.model] 27 | @architectures = "spacy.TextCatBOW.v1" 28 | exclusive_classes = false 29 | ngram_size = 1 30 | no_output_layer = false 31 | nO = null 32 | 33 | [components.tok2vec] 34 | factory = "tok2vec" 35 | 36 | [components.tok2vec.model] 37 | @architectures = "spacy.Tok2Vec.v1" 38 | 39 | [components.tok2vec.model.embed] 40 | @architectures = "spacy.MultiHashEmbed.v1" 41 | width = ${components.tok2vec.model.encode.width} 42 | attrs = ["ORTH","SHAPE"] 43 | rows = [5000,2500] 44 | include_static_vectors = false 45 | 46 | [components.tok2vec.model.encode] 47 | @architectures = "spacy.MaxoutWindowEncoder.v1" 48 | width = 96 49 | depth = 4 50 | window_size = 1 51 | maxout_pieces = 3 52 | 53 | [corpora] 54 | 55 | [corpora.dev] 56 | @readers = "spacy.Corpus.v1" 57 | path = ${paths.dev} 58 | max_length = 0 59 | gold_preproc = false 60 | limit = 0 61 | augmenter = null 62 | 63 | [corpora.train] 64 | @readers = "spacy.Corpus.v1" 65 | path = ${paths.train} 66 | max_length = 2000 67 | gold_preproc = false 68 | limit = 0 69 | augmenter = null 70 | 71 | [training] 72 | dev_corpus = "corpora.dev" 73 | train_corpus = "corpora.train" 74 | seed = ${system.seed} 75 | gpu_allocator = ${system.gpu_allocator} 76 | dropout = 0.1 77 | accumulate_gradient = 1 78 | patience = 1600 79 | max_epochs = 0 80 | max_steps = 20000 81 | eval_frequency = 200 82 | frozen_components = [] 83 | before_to_disk = null 84 | 85 | [training.batcher] 86 | @batchers = "spacy.batch_by_words.v1" 87 | discard_oversize = false 88 | tolerance = 0.2 89 | get_length = null 90 | 91 | [training.batcher.size] 92 | @schedules = "compounding.v1" 93 | start = 100 94 | stop = 1000 95 | compound = 1.001 96 | t = 0.0 97 | 98 | [training.logger] 99 | @loggers = "spacy.ConsoleLogger.v1" 100 | progress_bar = false 101 | 102 | [training.optimizer] 103 | @optimizers = "Adam.v1" 104 | beta1 = 0.9 105 | beta2 = 0.999 106 | L2_is_weight_decay = true 107 | L2 = 0.01 108 | grad_clip = 1.0 109 | use_averages = false 110 | eps = 0.00000001 111 | learn_rate = 0.001 112 | 113 | [training.score_weights] 114 | cats_score_desc = null 115 | cats_micro_p = null 116 | cats_micro_r = null 117 | cats_micro_f = null 118 | cats_macro_p = null 119 | cats_macro_r = null 120 | cats_macro_f = null 121 | cats_macro_auc = null 122 | cats_f_per_type = null 123 | cats_macro_auc_per_type = null 124 | cats_score = 1.0 125 | 126 | [pretraining] 127 | 128 | [initialize] 129 | vectors = null 130 | init_tok2vec = ${paths.init_tok2vec} 131 | vocab_data = null 132 | lookups = null 133 | 134 | [initialize.components] 135 | 136 | [initialize.tokenizer] -------------------------------------------------------------------------------- /create_data.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | # tqdm is a great progress bar for python 4 | # tqdm.auto automatically selects a text based progress for the console 5 | # and html based output in jupyter notebooks 6 | from tqdm.auto import tqdm 7 | 8 | # DocBin is spacys new way to store Docs in a binary format for training later 9 | from spacy.tokens import DocBin 10 | 11 | # We want to classify movie reviews as positive or negative 12 | from ml_datasets import imdb 13 | 14 | # load movie reviews as a tuple (text, label) 15 | train_data, valid_data = imdb() 16 | 17 | # load a medium sized english language model in spacy 18 | nlp = spacy.load("en_core_web_md") 19 | 20 | # we are so far only interested in the first 5000 reviews 21 | # this will keep the training time short. 22 | # In practice take as much data as you can get. 23 | num_texts = 5000 24 | 25 | 26 | def make_docs(data): 27 | """ 28 | this will take a list of texts and labels and transform them in spacy documents 29 | 30 | texts: List(str) 31 | labels: List(labels) 32 | 33 | returns: List(spacy.Doc.doc) 34 | """ 35 | 36 | docs = [] 37 | 38 | # nlp.pipe([texts]) is way faster than running nlp(text) for each text 39 | # as_tuples allows us to pass in a tuple, the first one is treated as text 40 | # the second one will get returned as it is. 41 | 42 | for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)): 43 | 44 | # we need to set the (text)cat(egory) for each document 45 | doc.cats["positive"] = label 46 | 47 | # put them into a nice list 48 | docs.append(doc) 49 | 50 | return docs 51 | 52 | 53 | # we are so far only interested in the first 5000 reviews 54 | # this will keep the training time short. 55 | # In practice take as much data as you can get. 56 | # you can always reduce it to make the script even faster. 57 | num_texts = 5000 58 | 59 | 60 | # first we need to transform all the training data 61 | train_docs = make_docs(train_data[:num_texts]) 62 | # then we save it in a binary file to disc 63 | doc_bin = DocBin(docs=train_docs) 64 | doc_bin.to_disk("./data/train.spacy") 65 | 66 | # repeat for validation data 67 | valid_docs = make_docs(valid_data[:num_texts]) 68 | doc_bin = DocBin(docs=valid_docs) 69 | doc_bin.to_disk("./data/valid.spacy") -------------------------------------------------------------------------------- /test_input.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | # load thebest model from training 3 | nlp = spacy.load("output/model-best") 4 | text = "" 5 | print("type : 'quit' to exit") 6 | # predict the sentiment until someone writes quit 7 | while text != "quit": 8 | text = input("Please enter example input: ") 9 | doc = nlp(text) 10 | if doc.cats['positive'] >.5: 11 | print(f"the sentiment is positive") 12 | else: 13 | print(f"the sentiment is negative") --------------------------------------------------------------------------------