├── .gitignore
├── LICENSE
├── README.md
├── base_config.cfg
├── config.cfg
├── create_data.py
└── test_input.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | *.spacy
  6 | output/
  7 | # C extensions
  8 | *.so
  9 | 
 10 | # Distribution / packaging
 11 | .Python
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | pip-wheel-metadata/
 25 | share/python-wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | MANIFEST
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .nox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | *.py,cover
 52 | .hypothesis/
 53 | .pytest_cache/
 54 | 
 55 | # Translations
 56 | *.mo
 57 | *.pot
 58 | 
 59 | # Django stuff:
 60 | *.log
 61 | local_settings.py
 62 | db.sqlite3
 63 | db.sqlite3-journal
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | 
 75 | # PyBuilder
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | .python-version
 87 | 
 88 | # pipenv
 89 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 90 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 91 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 92 | #   install all needed dependencies.
 93 | #Pipfile.lock
 94 | 
 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 96 | __pypackages__/
 97 | 
 98 | # Celery stuff
 99 | celerybeat-schedule
100 | celerybeat.pid
101 | 
102 | # SageMath parsed files
103 | *.sage.py
104 | 
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 | 
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 | 
118 | # Rope project settings
119 | .ropeproject
120 | 
121 | # mkdocs documentation
122 | /site
123 | 
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 | 
129 | # Pyre type checker
130 | .pyre/
131 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Philipp Sodmann
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Spacy3Textcat
 2 | This repository is the implementation for my SpacyV3 Text Categorizer Tutorial on medium: https://medium.com/@psodmann/building-a-text-classifier-with-spacy-3-0-dd16e9979a
 3 | 
 4 | ### How to Run the Example
 5 | you can clone it and run ```python create_data.py``` to create train and test data ```python -m spacy train config.cfg --output ./output``` to train a classifier and play with the model with ```python test_input.py```
 6 | 
 7 | ### Requirements:
 8 | spacy-nightly  
 9 | ml-datasets
10 | 


--------------------------------------------------------------------------------
/base_config.cfg:
--------------------------------------------------------------------------------
 1 | [paths]
 2 | train = "data/train.spacy"
 3 | dev = "data/valid.spacy"
 4 | 
 5 | [system]
 6 | gpu_allocator = null
 7 | 
 8 | 
 9 | [nlp]
10 | lang = "en"
11 | pipeline = ["tok2vec","textcat"]
12 | tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
13 | 
14 | [components]
15 | 
16 | [components.tok2vec]
17 | factory = "tok2vec"
18 | 
19 | [components.tok2vec.model]
20 | @architectures = "spacy.Tok2Vec.v1"
21 | 
22 | [components.tok2vec.model.embed]
23 | @architectures = "spacy.MultiHashEmbed.v1"
24 | width = ${components.tok2vec.model.encode.width}
25 | attrs = ["ORTH", "SHAPE"]
26 | rows = [5000, 2500]
27 | include_static_vectors = false
28 | 
29 | [components.tok2vec.model.encode]
30 | @architectures = "spacy.MaxoutWindowEncoder.v1"
31 | width = 96
32 | depth = 4
33 | window_size = 1
34 | maxout_pieces = 3
35 | 
36 | 
37 | [components.textcat]
38 | factory = "textcat"
39 | 
40 | [components.textcat.model]
41 | @architectures = "spacy.TextCatBOW.v1"
42 | exclusive_classes = false
43 | ngram_size = 1
44 | no_output_layer = false
45 | 
46 | [corpora]
47 | 
48 | [corpora.train]
49 | @readers = "spacy.Corpus.v1"
50 | path = ${paths.train}
51 | max_length = 2000
52 | 
53 | [corpora.dev]
54 | @readers = "spacy.Corpus.v1"
55 | path = ${paths.dev}
56 | max_length = 0
57 | 
58 | [training]
59 | dev_corpus = "corpora.dev"
60 | train_corpus = "corpora.train"
61 | 
62 | [training.optimizer]
63 | @optimizers = "Adam.v1"
64 | 
65 | 
66 | [training.batcher]
67 | @batchers = "spacy.batch_by_words.v1"
68 | discard_oversize = false
69 | tolerance = 0.2
70 | 
71 | [training.batcher.size]
72 | @schedules = "compounding.v1"
73 | start = 100
74 | stop = 1000
75 | compound = 1.001
76 | 
77 | [initialize]
78 | vectors = null
79 | 


--------------------------------------------------------------------------------
/config.cfg:
--------------------------------------------------------------------------------
  1 | [paths]
  2 | train = "data/train.spacy"
  3 | dev = "data/valid.spacy"
  4 | vectors = null
  5 | init_tok2vec = null
  6 | 
  7 | [system]
  8 | gpu_allocator = null
  9 | seed = 0
 10 | 
 11 | [nlp]
 12 | lang = "en"
 13 | pipeline = ["tok2vec","textcat"]
 14 | tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
 15 | disabled = []
 16 | before_creation = null
 17 | after_creation = null
 18 | after_pipeline_creation = null
 19 | 
 20 | [components]
 21 | 
 22 | [components.textcat]
 23 | factory = "textcat"
 24 | threshold = 0.5
 25 | 
 26 | [components.textcat.model]
 27 | @architectures = "spacy.TextCatBOW.v1"
 28 | exclusive_classes = false
 29 | ngram_size = 1
 30 | no_output_layer = false
 31 | nO = null
 32 | 
 33 | [components.tok2vec]
 34 | factory = "tok2vec"
 35 | 
 36 | [components.tok2vec.model]
 37 | @architectures = "spacy.Tok2Vec.v1"
 38 | 
 39 | [components.tok2vec.model.embed]
 40 | @architectures = "spacy.MultiHashEmbed.v1"
 41 | width = ${components.tok2vec.model.encode.width}
 42 | attrs = ["ORTH","SHAPE"]
 43 | rows = [5000,2500]
 44 | include_static_vectors = false
 45 | 
 46 | [components.tok2vec.model.encode]
 47 | @architectures = "spacy.MaxoutWindowEncoder.v1"
 48 | width = 96
 49 | depth = 4
 50 | window_size = 1
 51 | maxout_pieces = 3
 52 | 
 53 | [corpora]
 54 | 
 55 | [corpora.dev]
 56 | @readers = "spacy.Corpus.v1"
 57 | path = ${paths.dev}
 58 | max_length = 0
 59 | gold_preproc = false
 60 | limit = 0
 61 | augmenter = null
 62 | 
 63 | [corpora.train]
 64 | @readers = "spacy.Corpus.v1"
 65 | path = ${paths.train}
 66 | max_length = 2000
 67 | gold_preproc = false
 68 | limit = 0
 69 | augmenter = null
 70 | 
 71 | [training]
 72 | dev_corpus = "corpora.dev"
 73 | train_corpus = "corpora.train"
 74 | seed = ${system.seed}
 75 | gpu_allocator = ${system.gpu_allocator}
 76 | dropout = 0.1
 77 | accumulate_gradient = 1
 78 | patience = 1600
 79 | max_epochs = 0
 80 | max_steps = 20000
 81 | eval_frequency = 200
 82 | frozen_components = []
 83 | before_to_disk = null
 84 | 
 85 | [training.batcher]
 86 | @batchers = "spacy.batch_by_words.v1"
 87 | discard_oversize = false
 88 | tolerance = 0.2
 89 | get_length = null
 90 | 
 91 | [training.batcher.size]
 92 | @schedules = "compounding.v1"
 93 | start = 100
 94 | stop = 1000
 95 | compound = 1.001
 96 | t = 0.0
 97 | 
 98 | [training.logger]
 99 | @loggers = "spacy.ConsoleLogger.v1"
100 | progress_bar = false
101 | 
102 | [training.optimizer]
103 | @optimizers = "Adam.v1"
104 | beta1 = 0.9
105 | beta2 = 0.999
106 | L2_is_weight_decay = true
107 | L2 = 0.01
108 | grad_clip = 1.0
109 | use_averages = false
110 | eps = 0.00000001
111 | learn_rate = 0.001
112 | 
113 | [training.score_weights]
114 | cats_score_desc = null
115 | cats_micro_p = null
116 | cats_micro_r = null
117 | cats_micro_f = null
118 | cats_macro_p = null
119 | cats_macro_r = null
120 | cats_macro_f = null
121 | cats_macro_auc = null
122 | cats_f_per_type = null
123 | cats_macro_auc_per_type = null
124 | cats_score = 1.0
125 | 
126 | [pretraining]
127 | 
128 | [initialize]
129 | vectors = null
130 | init_tok2vec = ${paths.init_tok2vec}
131 | vocab_data = null
132 | lookups = null
133 | 
134 | [initialize.components]
135 | 
136 | [initialize.tokenizer]


--------------------------------------------------------------------------------
/create_data.py:
--------------------------------------------------------------------------------
 1 | import spacy
 2 | 
 3 | # tqdm is a great progress bar for python
 4 | # tqdm.auto automatically selects a text based progress for the console 
 5 | # and html based output in jupyter notebooks
 6 | from tqdm.auto import tqdm
 7 | 
 8 | # DocBin is spacys new way to store Docs in a binary format for training later
 9 | from spacy.tokens import DocBin
10 | 
11 | # We want to classify movie reviews as positive or negative
12 | from ml_datasets import imdb
13 | 
14 | # load movie reviews as a tuple (text, label)
15 | train_data, valid_data = imdb()
16 | 
17 | # load a medium sized english language model in spacy
18 | nlp = spacy.load("en_core_web_md")
19 | 
20 | # we are so far only interested in the first 5000 reviews
21 | # this will keep the training time short.
22 | # In practice take as much data as you can get.
23 | num_texts = 5000
24 | 
25 | 
26 | def make_docs(data):
27 |     """
28 |     this will take a list of texts and labels and transform them in spacy documents
29 |     
30 |     texts: List(str)
31 |     labels: List(labels)
32 |     
33 |     returns: List(spacy.Doc.doc)
34 |     """
35 |     
36 |     docs = []
37 | 
38 |     # nlp.pipe([texts]) is way faster than running nlp(text) for each text
39 |     # as_tuples allows us to pass in a tuple, the first one is treated as text
40 |     # the second one will get returned as it is.
41 |     
42 |     for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
43 |         
44 |         # we need to set the (text)cat(egory) for each document
45 |         doc.cats["positive"] = label
46 |         
47 |         # put them into a nice list
48 |         docs.append(doc)
49 |     
50 |     return docs
51 | 
52 | 
53 | # we are so far only interested in the first 5000 reviews
54 | # this will keep the training time short.
55 | # In practice take as much data as you can get.
56 | # you can always reduce it to make the script even faster.
57 | num_texts = 5000
58 | 
59 | 
60 | # first we need to transform all the training data
61 | train_docs = make_docs(train_data[:num_texts])
62 | # then we save it in a binary file to disc
63 | doc_bin = DocBin(docs=train_docs)
64 | doc_bin.to_disk("./data/train.spacy")
65 | 
66 | # repeat for validation data
67 | valid_docs = make_docs(valid_data[:num_texts])
68 | doc_bin = DocBin(docs=valid_docs)
69 | doc_bin.to_disk("./data/valid.spacy")


--------------------------------------------------------------------------------
/test_input.py:
--------------------------------------------------------------------------------
 1 | import spacy
 2 | # load thebest model from training
 3 | nlp = spacy.load("output/model-best")
 4 | text = ""
 5 | print("type : 'quit' to exit")
 6 | # predict the sentiment until someone writes quit
 7 | while text != "quit":
 8 |     text = input("Please enter example input: ")
 9 |     doc = nlp(text)
10 |     if doc.cats['positive'] >.5:
11 |         print(f"the sentiment is positive")
12 |     else:
13 |         print(f"the sentiment is negative")


--------------------------------------------------------------------------------