├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── apparel ├── __init__.py ├── build.py ├── classify.py ├── config.py └── features.py ├── bin └── apparel-classify.py ├── conf └── apparel-example.yaml ├── docs ├── Makefile ├── conf.py ├── img │ └── plaid.jpg ├── index.rst └── make.bat ├── fixtures ├── .gitkeep ├── info-2015-05-02.json └── model-2015-05-02.pickle ├── requirements.txt ├── setup.py └── tests └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # Virtualenv 57 | venv 58 | 59 | # OS X 60 | .DS_Store 61 | 62 | # Fixtures 63 | *.csv 64 | *.zip 65 | 66 | # Configuration 67 | conf/apparel.yaml 68 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - '2.7' 5 | 6 | install: pip install -r requirements.txt 7 | 8 | script: make test 9 | 10 | notifications: 11 | email: 12 | recipients: 13 | - benjamin.bengfort@georgetown.edu 14 | - allen.leis@georgetown.edu 15 | - anthony.ojeda@georgetown.edu 16 | on_success: change 17 | on_failure: always 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Georgetown Data Analytics (CCPE) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Shell to use with Make 2 | SHELL := /bin/sh 3 | 4 | # Set important Paths 5 | PROJECT := apparel 6 | LOCALPATH := $(CURDIR)/$(PROJECT) 7 | PYTHONPATH := $(LOCALPATH)/ 8 | PYTHON_BIN := $(VIRTUAL_ENV)/bin 9 | 10 | # Export targets not associated with files 11 | .PHONY: test coverage bootstrap pip virtualenv clean virtual_env_set 12 | 13 | # Clean build files 14 | clean: 15 | find . 
-name "*.pyc" -print0 | xargs -0 rm -rf 16 | -rm -rf htmlcov 17 | -rm -rf .coverage 18 | -rm -rf build 19 | -rm -rf dist 20 | -rm -rf $(PROJECT).egg-info 21 | 22 | # Targets for Coruscate testing 23 | test: 24 | $(PYTHON_BIN)/nosetests -v tests 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Product Classifier 2 | 3 | [![Build Status](https://travis-ci.org/georgetown-analytics/product-classifier.svg)](https://travis-ci.org/georgetown-analytics/product-classifier) [![Stories in Ready](https://badge.waffle.io/georgetown-analytics/product-classifier.png?label=ready&title=Ready)](https://waffle.io/georgetown-analytics/product-classifier) 4 | 5 | **Classify products into categories by their name with NLTK** 6 | 7 | [![Too Much Plaid](docs/img/plaid.jpg)](docs/img/plaid.jpg) 8 | 9 | ## Introduction 10 | 11 | In order to reason about the similarity of products, particularly in the genre of apparel, a useful first step is to classify the products according to some hierarchical categorization scheme. These categories serve to group semantic features that can be used for ontological analysis later on as well as providing an initial framework for both functional and physical attributes (e.g. tops have sleeves, and coats are for use in cold weather). Unfortunately, no one standardized scheme exists for the genre (unlike movies) and we cannot rely on the ingestion of structured data from a variety of sources (e.g. affiliate data or web crawl data). Instead some algorithmic approach is required. 12 | 13 | Luckily, we do have a significant amount of information about products that we can acquire reliably via ingestion from any source. In particular, the product name and description are usually available and seem to be enough for a human annotator to decide on a category for the product. 
Additionally, we may have fields relating to the size, color, or even keywords used for quick search – all data features that may be used to enhance or develop a statistical classification approach using machine learning methodologies. This type of information lends itself immediately to probabilistic language model-based classification methods, which we will explore in this project. 14 | 15 | Statistical learning algorithms are characterized by some underlying probability model whose output is not just a class label, but also a probability that the instance belongs to that class. In this project, we will focus in particular on supervised machine learning algorithms, which statisticians typically call classification, vs. unsupervised machine learning, which is usually called clustering. In order to perform supervised classification, we require some annotated corpus of correct answers with which to train our model. This annotated corpus is used to create some hypothetical probability distribution across a feature set, which is then used to predict future outcomes. The classes themselves are a set of predetermined labels that we can then use to broadly generalize an instance, given its unique set of features. 16 | 17 | ## Methodology 18 | 19 | The schema of most apparel data notably includes several string fields that we have identified as extremely useful in characterizing a product, particularly for human annotators. In particular, the name of the product is extremely descriptive in apparel because it is a distinguishing marketing feature that encodes unique attributes of a product for consumers. Consider the difference between an apparel product name and a movie title: “Obey ‘Anchors’ Shawl Cardigan Burgundy X-Large” vs. “Gone with the Wind”. Movie titles attempt to be short, memorable, and unique in order to convey some artistic meaning to the reader. However, these titles do not contain enough information to classify them by genre. 
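The contrast is easy to see by tokenizing both titles the way a bag-of-words featurizer would: lowercased, with punctuation and stopwords removed. A minimal sketch in plain Python (a tiny hand-rolled stoplist stands in for NLTK's stopword corpus, and lemmatization is omitted):

```python
import re
import string

# Tiny stoplist standing in for NLTK's English stopword corpus (assumption:
# the real featurizer also lemmatizes each token, which is omitted here).
STOPLIST = {"the", "with", "a", "an", "and", "of"}

def tokenize(text):
    """Lowercase, split into word/punctuation runs, drop stopwords and punctuation."""
    tokens = re.findall(r"\w+|[^\w\s]+", text.lower())
    return [t for t in tokens if t not in STOPLIST and t not in string.punctuation]

print(tokenize("Obey 'Anchors' Shawl Cardigan Burgundy X-Large"))
# ['obey', 'anchors', 'shawl', 'cardigan', 'burgundy', 'x', 'large']
print(tokenize("Gone with the Wind"))
# ['gone', 'wind']
```

The product name yields discriminative tokens like `cardigan` and `burgundy`, while the movie title reduces to tokens that say nothing about its genre.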
20 | 21 | Product names, on the other hand, do tend to capture classification information, possibly even including size and color details, which can aid in the classification. Consider that shoe sizes are numbers from 4-13, whereas a shirt size can be described as S, M, L, or XL. These tokens will assist in differentiating the vector space, although not completely: dresses seem to be sized similarly to shoes (although you can have a size 0 dress but not a size 0 shoe), and pants can be described either by the waist and inseam or, as in the case of athletic pants, by the same S, M, L, or XL tokens. 22 | 23 | Other useful text fields include the description of the product (a short, usually grammatical passage with even more fine-grained detail) and search keywords, for example: “Men, Shoes, Athletic”. Other data points could include the merchant name, extracted size and color characterizations, and potentially even the price or sale price. However, for the purposes of this article we focus on the name, description, and keywords fields. 24 | 25 | Another consideration in the evaluation of a classifier is its performance given a minimum amount of information, and we would like to ensure that we use the most lightweight model possible. Whereas the product name will always be available, descriptions or keywords might be limited depending on the data source. We would like to achieve a model capable of highly accurate classifications using, if possible, the name only. 26 | 27 | ## Quick Start 28 | 29 | In order to use this classifier, take the following steps: 30 | 31 | 1. Clone the repository (assuming you haven't already) 32 | 33 | ```bash 34 | $ git clone https://github.com/georgetown-analytics/product-classifier.git 35 | ``` 36 | 37 | 2. Move into the working directory with `cd`, then install the requirements with pip. 38 | 39 | ```bash 40 | $ cd product-classifier 41 | $ pip install -r requirements.txt 42 | ``` 43 | 44 | 3. 
Create a copy of `apparel-example.yaml` in the `conf` directory named `apparel.yaml`: 45 | 46 | ```bash 47 | $ cp conf/apparel-example.yaml conf/apparel.yaml 48 | ``` 49 | 50 | 4. Modify the configuration to point to the model in the fixtures directory (currently `fixtures/model-2015-05-02.pickle`) by adding or editing the following line in the configuration: 51 | 52 | ```yaml 53 | model: fixtures/model-2015-05-02.pickle 54 | ``` 55 | 56 | 5. At this point you should be able to use the classifier: 57 | 58 | ```bash 59 | $ bin/apparel-classify.py classify --explain "North Face Fleece Jacket" 60 | ``` 61 | 62 | The `apparel-classify` utility also builds models, so if you have a training set as a CSV with the columns "category" and "name" (and optionally "description" and "keywords"), you can then build your own model to test! 63 | 64 | ## Notes 65 | 66 | This project utilizes NLTK and a Maximum Entropy model to build a classifier which can then be used as a data product in production. The data set used to train the classifier is proprietary; however, a pickle containing the parameterization of the model is compressed in the `fixtures` folder. In the future, we will acquire a public data set to use and expand upon this project. 
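When building your own model from a CSV, each row is turned into a `(features, label)` pair, the input format that NLTK's `MaxentClassifier.train` expects. A rough sketch of that transformation over an in-memory corpus (the hand-rolled `featurize` here is a simplified stand-in for the NLTK-based featurizer in `apparel/features.py`, which also strips stopwords and lemmatizes):

```python
import csv
import io

# A two-row stand-in for the training CSV described above (assumption: a real
# corpus is much larger and is read from disk).
CORPUS = """category,name,keywords
coats,North Face Fleece Jacket,"men,outerwear"
shoes,Nike Air Max Running Shoe,"men,athletic"
"""

def featurize(name, keywords=None):
    """Bag-of-words features: token -> True, with keywords marked distinctly."""
    features = {token.lower(): True for token in name.split()}
    if keywords:
        for kw in keywords.split(","):
            features["KEYWORD(%s)" % kw.strip()] = True
    return features

featureset = []
for row in csv.DictReader(io.StringIO(CORPUS)):
    label = row.pop("category")          # the annotated answer
    featureset.append((featurize(**row), label))

# featureset is now [(feats, label), ...], ready for MaxentClassifier.train
print(featureset[0])
```

The resulting feature dictionaries are sparse and binary, which is exactly the shape the maximum entropy trainer consumes.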
67 | 68 | ### Attribution 69 | 70 | The image used in this README, [Too Much Plaid](https://flic.kr/p/7GjgQx) by [Ewan Munro](https://www.flickr.com/photos/55935853@N00/) is licensed under [CC BY-SA 2.0](https://creativecommons.org/licenses/by-sa/2.0/) 71 | -------------------------------------------------------------------------------- /apparel/__init__.py: -------------------------------------------------------------------------------- 1 | # apparel 2 | # Classifier libraries for Apparel 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Thu Feb 05 20:24:45 2015 -0500 6 | # 7 | # Copyright (C) 2014 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: __init__.py [] benjamin@bengfort.com $ 11 | 12 | """ 13 | Classifier libraries for Apparel 14 | """ 15 | 16 | ########################################################################## 17 | ## Module Methods 18 | ########################################################################## 19 | 20 | __version__ = (1,0,0) 21 | 22 | def get_version(): 23 | """ 24 | Returns a string of the version 25 | """ 26 | return ".".join(["%i" % i for i in __version__]) 27 | -------------------------------------------------------------------------------- /apparel/build.py: -------------------------------------------------------------------------------- 1 | # apparel.build 2 | # Builds classifier models and saves them as pickles 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Thu Feb 05 21:11:27 2015 -0500 6 | # 7 | # Copyright (C) 2014 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: build.py [] benjamin@bengfort.com $ 11 | 12 | """ 13 | Builds classifier models and saves them as pickles 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | import os 21 | import time 22 | import json 23 | import pickle 24 | import random 25 | import 
apparel 26 | import unicodecsv as csv 27 | 28 | from datetime import datetime 29 | from apparel.config import settings 30 | from apparel.features import ProductFeatures 31 | from nltk.classify.util import accuracy 32 | from nltk.classify import MaxentClassifier 33 | 34 | ########################################################################## 35 | ## Module Constants 36 | ########################################################################## 37 | 38 | DATE_FORMAT = "%a %b %d %H:%M:%S %Y" 39 | 40 | ########################################################################## 41 | ## Model Builder 42 | ########################################################################## 43 | 44 | class ClassifierBuilder(object): 45 | """ 46 | Creates a classifier model using MaximumEntropy and saves it as a 47 | pickle to disk. This class also writes out extra information to disk 48 | to ensure that the model can be identified in the future. 49 | """ 50 | 51 | def __init__(self, corpus=None, **kwargs): 52 | self.corpus = corpus or settings.corpus 53 | self.validate = kwargs.pop('validate', True) # Perform cross validation 54 | self.outpath = kwargs.pop('outpath', '.') # Where to write out the data 55 | 56 | # Compute info and model paths 57 | self.model_path, self.info_path = self.get_output_paths() 58 | 59 | # Other required properties 60 | self.accuracy = None # Accuracy of the model 61 | self.started = None # Start timestamp of the build 62 | self.finished = None # Finish timestamp of the build 63 | self.buildtime = None # Time (seconds) of complete build 64 | self.feattime = None # Time (seconds) to get features 65 | self.traintime = None # Time (seconds) to train the model 66 | self.validtime = None # Time (seconds) to run the validation 67 | 68 | # Create a featurizer 69 | self.featurizer = ProductFeatures() 70 | 71 | # Cache the features on the model 72 | self._featureset = None 73 | 74 | def featureset(self): 75 | """ 76 | Opens the corpus path, reads the data and 
constructs features to 77 | pass to the classifier. (The featureset is cached after the first call.) 78 | 79 | Returns a dictionary of features and the label as follows: 80 | 81 | [(feats, label) for row in corpus] 82 | 83 | This is the expected format for the MaxentClassifier. 84 | """ 85 | 86 | if self._featureset is None: 87 | 88 | # Time how long it takes to extract features 89 | start = time.time() 90 | 91 | self._featureset = [] 92 | with open(self.corpus, 'r') as f: 93 | reader = csv.DictReader(f) 94 | for row in reader: 95 | label = row.pop('category') 96 | feats = self.featurizer.featurize(**row) 97 | self._featureset.append((feats, label)) 98 | 99 | # Record feature extraction time 100 | self.feattime = time.time() - start 101 | 102 | return self._featureset 103 | 104 | def train(self, featureset=None): 105 | """ 106 | Trains the maximum entropy classifier and returns it. If a 107 | featureset is specified it trains on that, otherwise it trains on 108 | the model's featureset. 109 | 110 | Pass in a featureset during cross validation. 111 | Returns the classifier and the training time. 112 | """ 113 | featureset = featureset or self.featureset() 114 | 115 | # Time how long it takes to train 116 | start = time.time() 117 | 118 | classifier = MaxentClassifier.train(featureset, 119 | algorithm='megam', trace=1, gaussian_prior_sigma=1) 120 | 121 | delta = time.time() - start 122 | return classifier, delta 123 | 124 | def build(self): 125 | """ 126 | Builds the model and writes to the outpath (which should be a 127 | directory). Two files are written: 128 | 129 | - the pickle of the model 130 | - a JSON file of associated data 131 | 132 | Note, if a file already exists at the outpath, this will raise an 133 | exception (don't want to overwrite a model by accident!) 
134 | """ 135 | 136 | # Record the start time 137 | self.started = datetime.now() 138 | start = time.time() 139 | 140 | # Extract the features and train the model 141 | classifier, self.traintime = self.train() 142 | 143 | # Write the classifier to disk 144 | with open(self.model_path, 'w') as f: 145 | pickle.dump(classifier, f, pickle.HIGHEST_PROTOCOL) 146 | 147 | # Begin accuracy validation 148 | if self.validate: 149 | self.cross_validate() 150 | 151 | # Record the finish time 152 | self.finished = datetime.now() 153 | self.buildtime = time.time() - start 154 | 155 | # Write the information to disk 156 | self.write_details() 157 | 158 | def cross_validate(self): 159 | """ 160 | Performs cross validation by training the model on 90% of the 161 | corpus then checking the accuracy on the remaining 10%. 162 | """ 163 | start = time.time() 164 | 165 | feats = self.featureset() 166 | offset = len(feats) / 10 167 | random.shuffle(feats) 168 | 169 | train = feats[:offset] 170 | test = feats[offset:] 171 | 172 | classifier, _ = self.train(train) 173 | self.accuracy = accuracy(classifier, test) 174 | 175 | self.validtime = time.time() - start 176 | 177 | def get_output_paths(self): 178 | """ 179 | Returns two paths - the pickle path and the information yaml path. 180 | Ensures those paths don't exist and wont' be overwritten. 181 | """ 182 | 183 | today = datetime.now().strftime('%Y-%d-%m') 184 | mname = os.path.join(self.outpath, "model-%s.pickle" % today) 185 | iname = os.path.join(self.outpath, "info-%s.json" % today) 186 | 187 | for name in (mname, iname): 188 | if os.path.exists(name): 189 | raise Exception("Can't overwrite file at '%s'!" % name) 190 | 191 | return mname, iname 192 | 193 | def write_details(self): 194 | """ 195 | Writes the details of the classifier to a YAML file. 
196 | """ 197 | 198 | details = { 199 | 'version': apparel.get_version(), 200 | 'started': self.started.strftime(DATE_FORMAT), 201 | 'finished': self.finished.strftime(DATE_FORMAT), 202 | 'accuracy': self.accuracy, 203 | 'validated': self.validate, 204 | 'corpus': self.corpus, 205 | 'paths': { 206 | 'model': self.model_path, 207 | 'info': self.info_path, 208 | }, 209 | 'classes': { 210 | 'classifier': MaxentClassifier.__name__, 211 | 'features': ProductFeatures.__name__, 212 | }, 213 | 'timer': { 214 | 'build': self.buildtime, 215 | 'features': self.feattime, 216 | 'validation': self.validtime, 217 | 'training': self.traintime, 218 | } 219 | } 220 | 221 | with open(self.info_path, 'w') as f: 222 | json.dump(details, f, indent=4) 223 | 224 | if __name__ == '__main__': 225 | builder = ClassifierBuilder() 226 | print builder.build() 227 | -------------------------------------------------------------------------------- /apparel/classify.py: -------------------------------------------------------------------------------- 1 | # apparel.classify 2 | # Classifier package - utilize built model to perform classifications 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Thu Feb 05 21:43:21 2015 -0500 6 | # 7 | # Copyright (C) 2014 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: classify.py [] benjamin@bengfort.com $ 11 | 12 | """ 13 | Classifier package - utilize built model to perform classifications 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | import pickle 21 | 22 | from operator import itemgetter 23 | from apparel.config import settings 24 | from apparel.features import ProductFeatures 25 | 26 | ########################################################################## 27 | ## Simple Classifier 28 | ########################################################################## 29 | 30 | 
class ApparelClassifier(object): 31 | """ 32 | Performs classification of products using a classifier that is loaded 33 | via a pickle at runtime. This classifier can be of any type, but we 34 | expect the Maximum Entropy classifier trained from a CSV corpus. 35 | """ 36 | 37 | def __init__(self, model=None): 38 | """ 39 | Pass in the path of the pickle classifier object. 40 | """ 41 | 42 | ## Get the default model from the settings if it isn't passed in 43 | model = model or settings.model 44 | 45 | ## Load the model from the pickle 46 | with open(model, 'rb') as pkl: 47 | self._classifier = pickle.load(pkl) 48 | 49 | ## Create a featurizer to use 50 | self.featurizer = ProductFeatures() 51 | 52 | def classify(self, name, description=None, keywords=None): 53 | """ 54 | Classifies the text using the internal classifier. Returns a 55 | probability distribution of the labels associated with the text. 56 | """ 57 | features = self.featurizer.featurize(name, description, keywords) 58 | probdist = self._classifier.prob_classify(features) 59 | labels = [(label, probdist.prob(label)) 60 | for label in probdist.samples() 61 | if probdist.prob(label) > 0.01] 62 | return sorted(labels, key=itemgetter(1), reverse=True) 63 | 64 | def explain(self, name, description=None, keywords=None): 65 | """ 66 | Wrapper for classifier.explain - prints out (no way to capture the 67 | string output, unfortunately) the features contributing to the 68 | chosen classifier. 69 | """ 70 | features = self.featurizer.featurize(name, description, keywords) 71 | self._classifier.explain(features) 72 | 73 | def labels(self): 74 | """ 75 | Wrapper for classifier.labels - returns a list of the labels. 
76 | """ 77 | return self._classifier.labels() 78 | 79 | if __name__ == '__main__': 80 | classifier = ApparelClassifier() 81 | classifier.explain("GUESS Handbag, Isla Large Satchel") 82 | -------------------------------------------------------------------------------- /apparel/config.py: -------------------------------------------------------------------------------- 1 | # apparel.config 2 | # Uses confire to get meaningful configurations from a yaml file 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Fri Sep 19 11:14:33 2014 -0400 6 | # 7 | # Copyright (C) 2014 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: config.py [] benjamin@bengfort.com $ 11 | 12 | """ 13 | Uses confire to get meaningful configurations from a yaml file 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | import os 21 | import confire 22 | 23 | ########################################################################## 24 | ## Configuration 25 | ########################################################################## 26 | 27 | class ApparelConfiguration(confire.Configuration): 28 | """ 29 | Meaningful defaults and required configurations. 
30 | 31 | debug: the app will print or log debug statements 32 | testing: the app will not overwrite important resources 33 | corpus: the location of the corpus on disk 34 | model: the location of the pickled model on disk 35 | """ 36 | 37 | CONF_PATHS = [ 38 | "/etc/apparel.yaml", # System configuration 39 | os.path.expanduser("~/.apparel.yaml"), # User specific config 40 | os.path.abspath("conf/apparel.yaml"), # Local configuration 41 | ] 42 | 43 | debug = True 44 | testing = True 45 | corpus = None 46 | model = None 47 | 48 | 49 | ## Load settings immediately for import 50 | settings = ApparelConfiguration.load() 51 | 52 | if __name__ == '__main__': 53 | print settings 54 | -------------------------------------------------------------------------------- /apparel/features.py: -------------------------------------------------------------------------------- 1 | # apparel.features 2 | # Extracts the features from text for classification and building 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Thu Feb 05 21:15:41 2015 -0500 6 | # 7 | # Copyright (C) 2014 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: features.py [] benjamin@bengfort.com $ 11 | 12 | """ 13 | Extracts the features from text for classification and building 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | import string 21 | 22 | from nltk.corpus import stopwords 23 | from nltk import wordpunct_tokenize 24 | from nltk.stem.wordnet import WordNetLemmatizer 25 | 26 | ########################################################################## 27 | ## Featurize Class 28 | ########################################################################## 29 | 30 | class ProductFeatures(object): 31 | """ 32 | This class manages the extraction of text features from a product 33 | document that might contain a name, a description, and 
keywords. It 34 | ensures that stopwords and punctuation are excluded, and that all tokens 35 | are normalized to lower case and to their lemma class (thus reducing 36 | the feature space for better classification). 37 | 38 | The reason this is a class is that data needs to be stored to do 39 | the work of featurization - e.g. loading stopwords and punctuation. 40 | """ 41 | 42 | def __init__(self, stoplist=None, punct=None, lemmatizer=None): 43 | # Load stopwords, punctuation, and lemmatizer 44 | # This takes a bit of work, so we only want to do it once! 45 | self.stopwords = set(stoplist or stopwords.words('english')) 46 | self.punctuation = punct or string.punctuation 47 | self.lemmatizer = lemmatizer or WordNetLemmatizer() 48 | 49 | def tokenize(self, text): 50 | """ 51 | Yields individual tokens from the text utilizing NLTK's built-in 52 | tokenizer (far better than splitting on spaces). It also 53 | removes any stopwords and punctuation from the text, and 54 | ensures that every token is normalized. 55 | 56 | For now, token = word as in bag of words (the feature we're using). 57 | """ 58 | for token in wordpunct_tokenize(text): 59 | token = self.normalize(token) 60 | if token in self.punctuation: continue 61 | if token in self.stopwords: continue 62 | yield token 63 | 64 | def normalize(self, word): 65 | """ 66 | Ensures words are in the same class (lemma) as well as lowercase. 67 | """ 68 | word = word.lower() 69 | return self.lemmatizer.lemmatize(word) 70 | 71 | def featurize(self, name, description=None, keywords=None): 72 | """ 73 | Returns a dictionary of features to use with the Maximum Entropy 74 | classifier. In this case we're using a "bag of words" approach. 
75 | """ 76 | 77 | # Get the bag of words from the name 78 | tokens = set(self.tokenize(name)) 79 | 80 | # Add the bag of words from the description (union) 81 | if description is not None: 82 | tokens = tokens | set(self.tokenize(description)) 83 | 84 | # Get the bag of keywords 85 | keywords = set(self.tokenize(keywords)) if keywords else set([]) 86 | 87 | # Create the features 88 | features = {} 89 | for token in tokens: 90 | features[token] = True 91 | for keyword in keywords: 92 | features["KEYWORD(%s)" % keyword] = True 93 | 94 | return features 95 | 96 | ########################################################################## 97 | ## Development testing 98 | ########################################################################## 99 | 100 | if __name__ == '__main__': 101 | print ProductFeatures().featurize("The Women's EQ Medium Travel Bag from DAKINE. Though it may be small, that does not mean it cannot accomplish great things. The efficient 51 liter interior provides enough space for a week's worth . . .") 102 | -------------------------------------------------------------------------------- /bin/apparel-classify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # apparel-classify 3 | # Command line script to execute classification commands 4 | # 5 | # Author: Benjamin Bengfort 6 | # Created: timestamp 7 | # 8 | # Copyright (C) 2014 Bengfort.com 9 | # For license information, see LICENSE.txt 10 | # 11 | # ID: apparel-classify.py [] benjamin@bengfort.com $ 12 | 13 | """ 14 | Command line script to execute classification commands. 
15 | 16 | There are two primary commands: 17 | 18 | - build (builds the model) 19 | - classify (classifies the input text) 20 | 21 | These commands are dependent on configurations found in conf/apparel.yaml 22 | """ 23 | 24 | ########################################################################## 25 | ## Imports 26 | ########################################################################## 27 | 28 | import os 29 | import sys 30 | import argparse 31 | 32 | ## Helper to add apparel to Python Path for development 33 | sys.path.append(os.path.join(os.path.dirname(__file__), "..")) 34 | 35 | import apparel 36 | 37 | from apparel.config import settings 38 | from apparel.build import ClassifierBuilder 39 | from apparel.classify import ApparelClassifier 40 | 41 | ########################################################################## 42 | ## Command Constants 43 | ########################################################################## 44 | 45 | DESCRIPTION = "An administrative utility for classification" 46 | EPILOG = "Build and use classifiers all from one easy command" 47 | VERSION = apparel.get_version() 48 | 49 | ########################################################################## 50 | ## Administrative Commands 51 | ########################################################################## 52 | 53 | def classify(args): 54 | """ 55 | Classifies text using a prebuilt model. 
56 | """ 57 | output = [] 58 | classifier = ApparelClassifier(args.model) 59 | 60 | for text in args.text: 61 | output.append('"%s" is classified as:' % text) 62 | for cls in classifier.classify(text): 63 | output.append(" %s (%0.4f)" % cls) 64 | output.append("") 65 | 66 | if args.explain: 67 | for text in args.text: 68 | classifier.explain(text) 69 | print "\n\n" 70 | 71 | return "\n".join(output) 72 | 73 | def build(args): 74 | """ 75 | Build a classifier model and write to a pickle 76 | """ 77 | builder = ClassifierBuilder(corpus=args.corpus, outpath=args.outpath) 78 | builder.build() 79 | return "Build Complete!" 80 | 81 | ########################################################################## 82 | ## Main method 83 | ########################################################################## 84 | 85 | if __name__ == '__main__': 86 | 87 | # Construct the main ArgumentParser 88 | parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG, version=VERSION) 89 | subparsers = parser.add_subparsers(title='commands', description='Administrative commands') 90 | 91 | # Classify Command 92 | classify_parser = subparsers.add_parser('classify', help='Classify text using a prebuilt model') 93 | classify_parser.add_argument('text', nargs='+', help='Text to classify, surrounded by quotes') 94 | classify_parser.add_argument('--explain', default=False, action='store_true', help='Print out an explanation of the classification') 95 | classify_parser.add_argument('--model', default=settings.get('model'), metavar='PATH', help='Specify the path to the pickled classifier') 96 | classify_parser.set_defaults(func=classify) 97 | 98 | # Build Command 99 | build_parser = subparsers.add_parser('build', help='Build a classifier model and write to a pickle') 100 | build_parser.add_argument('--corpus', default=settings.get('corpus'), type=str, help='Location of the CSV corpus to train from.') 101 | build_parser.add_argument('-o', '--outpath', metavar='PATH', type=str, 
help="Where to write the pickle to.", default='fixtures/') 102 | build_parser.set_defaults(func=build) 103 | 104 | # Handle input from the command line 105 | args = parser.parse_args() # Parse the arguments 106 | try: 107 | msg = args.func(args) # Call the default function 108 | parser.exit(0, msg+"\n") # Exit cleanly with message 109 | except Exception as e: 110 | parser.error(str(e)) # Exit with error 111 | -------------------------------------------------------------------------------- /conf/apparel-example.yaml: -------------------------------------------------------------------------------- 1 | # Configuration file for the Apparel application 2 | 3 | debug: true 4 | testing: false 5 | corpus: /path/to/corpus.csv 6 | model: /path/to/model.pickle 7 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make <target>' where <target> is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/ProductClassifier.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/ProductClassifier.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/ProductClassifier" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/ProductClassifier" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 
112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 
163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Product Classifier documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Feb 5 20:26:19 2015. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 
31 | extensions = [ 32 | 'sphinx.ext.pngmath', 33 | ] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ['_templates'] 37 | 38 | # The suffix of source filenames. 39 | source_suffix = '.rst' 40 | 41 | # The encoding of source files. 42 | #source_encoding = 'utf-8-sig' 43 | 44 | # The master toctree document. 45 | master_doc = 'index' 46 | 47 | # General information about the project. 48 | project = u'Product Classifier' 49 | copyright = u'2015, Georgetown Data Analytics' 50 | 51 | # The version info for the project you're documenting, acts as replacement for 52 | # |version| and |release|, also used in various other places throughout the 53 | # built documents. 54 | # 55 | # The short X.Y version. 56 | version = '1.0' 57 | # The full version, including alpha/beta/rc tags. 58 | release = '1.0' 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | #language = None 63 | 64 | # There are two options for replacing |today|: either, you set today to some 65 | # non-false value, then it is used: 66 | #today = '' 67 | # Else, today_fmt is used as the format for a strftime call. 68 | #today_fmt = '%B %d, %Y' 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | exclude_patterns = ['_build'] 73 | 74 | # The reST default role (used for this markup: `text`) to use for all 75 | # documents. 76 | #default_role = None 77 | 78 | # If true, '()' will be appended to :func: etc. cross-reference text. 79 | #add_function_parentheses = True 80 | 81 | # If true, the current module name will be prepended to all description 82 | # unit titles (such as .. function::). 83 | #add_module_names = True 84 | 85 | # If true, sectionauthor and moduleauthor directives will be shown in the 86 | # output. They are ignored by default. 
87 | #show_authors = False 88 | 89 | # The name of the Pygments (syntax highlighting) style to use. 90 | pygments_style = 'sphinx' 91 | 92 | # A list of ignored prefixes for module index sorting. 93 | #modindex_common_prefix = [] 94 | 95 | # If true, keep warnings as "system message" paragraphs in the built documents. 96 | #keep_warnings = False 97 | 98 | 99 | # -- Options for HTML output ---------------------------------------------- 100 | 101 | # The theme to use for HTML and HTML Help pages. See the documentation for 102 | # a list of builtin themes. 103 | html_theme = 'default' 104 | 105 | # Theme options are theme-specific and customize the look and feel of a theme 106 | # further. For a list of options available for each theme, see the 107 | # documentation. 108 | #html_theme_options = {} 109 | 110 | # Add any paths that contain custom themes here, relative to this directory. 111 | #html_theme_path = [] 112 | 113 | # The name for this set of Sphinx documents. If None, it defaults to 114 | # "<project> v<release> documentation". 115 | #html_title = None 116 | 117 | # A shorter title for the navigation bar. Default is the same as html_title. 118 | #html_short_title = None 119 | 120 | # The name of an image file (relative to this directory) to place at the top 121 | # of the sidebar. 122 | #html_logo = None 123 | 124 | # The name of an image file (within the static path) to use as favicon of the 125 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 126 | # pixels large. 127 | #html_favicon = None 128 | 129 | # Add any paths that contain custom static files (such as style sheets) here, 130 | # relative to this directory. They are copied after the builtin static files, 131 | # so a file named "default.css" will overwrite the builtin "default.css". 132 | html_static_path = ['_static'] 133 | 134 | # Add any extra paths that contain custom files (such as robots.txt or 135 | # .htaccess) here, relative to this directory.
These files are copied 136 | # directly to the root of the documentation. 137 | #html_extra_path = [] 138 | 139 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 140 | # using the given strftime format. 141 | #html_last_updated_fmt = '%b %d, %Y' 142 | 143 | # If true, SmartyPants will be used to convert quotes and dashes to 144 | # typographically correct entities. 145 | #html_use_smartypants = True 146 | 147 | # Custom sidebar templates, maps document names to template names. 148 | #html_sidebars = {} 149 | 150 | # Additional templates that should be rendered to pages, maps page names to 151 | # template names. 152 | #html_additional_pages = {} 153 | 154 | # If false, no module index is generated. 155 | #html_domain_indices = True 156 | 157 | # If false, no index is generated. 158 | #html_use_index = True 159 | 160 | # If true, the index is split into individual pages for each letter. 161 | #html_split_index = False 162 | 163 | # If true, links to the reST sources are added to the pages. 164 | #html_show_sourcelink = True 165 | 166 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 167 | #html_show_sphinx = True 168 | 169 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 170 | #html_show_copyright = True 171 | 172 | # If true, an OpenSearch description file will be output, and all pages will 173 | # contain a <link> tag referring to it. The value of this option must be the 174 | # base URL from which the finished HTML is served. 175 | #html_use_opensearch = '' 176 | 177 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 178 | #html_file_suffix = None 179 | 180 | # Output file base name for HTML help builder. 181 | htmlhelp_basename = 'ProductClassifierdoc' 182 | 183 | 184 | # -- Options for LaTeX output --------------------------------------------- 185 | 186 | latex_elements = { 187 | # The paper size ('letterpaper' or 'a4paper').
188 | #'papersize': 'letterpaper', 189 | 190 | # The font size ('10pt', '11pt' or '12pt'). 191 | #'pointsize': '10pt', 192 | 193 | # Additional stuff for the LaTeX preamble. 194 | #'preamble': '', 195 | } 196 | 197 | # Grouping the document tree into LaTeX files. List of tuples 198 | # (source start file, target name, title, 199 | # author, documentclass [howto, manual, or own class]). 200 | latex_documents = [ 201 | ('index', 'ProductClassifier.tex', u'Product Classifier Documentation', 202 | u'Georgetown Data Analytics', 'manual'), 203 | ] 204 | 205 | # The name of an image file (relative to this directory) to place at the top of 206 | # the title page. 207 | #latex_logo = None 208 | 209 | # For "manual" documents, if this is true, then toplevel headings are parts, 210 | # not chapters. 211 | #latex_use_parts = False 212 | 213 | # If true, show page references after internal links. 214 | #latex_show_pagerefs = False 215 | 216 | # If true, show URL addresses after external links. 217 | #latex_show_urls = False 218 | 219 | # Documents to append as an appendix to all manuals. 220 | #latex_appendices = [] 221 | 222 | # If false, no module index is generated. 223 | #latex_domain_indices = True 224 | 225 | 226 | # -- Options for manual page output --------------------------------------- 227 | 228 | # One entry per manual page. List of tuples 229 | # (source start file, name, description, authors, manual section). 230 | man_pages = [ 231 | ('index', 'productclassifier', u'Product Classifier Documentation', 232 | [u'Georgetown Data Analytics'], 1) 233 | ] 234 | 235 | # If true, show URL addresses after external links. 236 | #man_show_urls = False 237 | 238 | 239 | # -- Options for Texinfo output ------------------------------------------- 240 | 241 | # Grouping the document tree into Texinfo files. 
List of tuples 242 | # (source start file, target name, title, author, 243 | # dir menu entry, description, category) 244 | texinfo_documents = [ 245 | ('index', 'ProductClassifier', u'Product Classifier Documentation', 246 | u'Georgetown Data Analytics', 'ProductClassifier', 'One line description of project.', 247 | 'Miscellaneous'), 248 | ] 249 | 250 | # Documents to append as an appendix to all manuals. 251 | #texinfo_appendices = [] 252 | 253 | # If false, no module index is generated. 254 | #texinfo_domain_indices = True 255 | 256 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 257 | #texinfo_show_urls = 'footnote' 258 | 259 | # If true, do not generate a @detailmenu in the "Top" node's menu. 260 | #texinfo_no_detailmenu = False 261 | -------------------------------------------------------------------------------- /docs/img/plaid.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgetown-analytics/product-classifier/85f2b9a164679ada1aa6b3b28d862fbf96f7de2c/docs/img/plaid.jpg -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Product Classifier documentation master file, created by 2 | sphinx-quickstart on Thu Feb 5 20:26:19 2015. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Product Classifier's documentation! 7 | ============================================== 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | 23 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^<target^>` where ^<target^> is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo.
doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 
107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\ProductClassifier.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\ProductClassifier.qhc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished.
The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. 
The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /fixtures/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgetown-analytics/product-classifier/85f2b9a164679ada1aa6b3b28d862fbf96f7de2c/fixtures/.gitkeep -------------------------------------------------------------------------------- /fixtures/info-2015-05-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "classes": { 3 | "classifier": "MaxentClassifier", 4 | "features": "ProductFeatures" 5 | }, 6 | "paths": { 7 | "info": "fixtures/info-2015-05-02.json", 8 | "model": "fixtures/model-2015-05-02.pickle" 9 | }, 10 | "finished": "Thu Feb 05 23:23:02 2015", 11 | "version": "1.0.0", 12 | "started": "Thu Feb 05 23:11:12 2015", 13 | "corpus": "fixtures/products.csv", 14 | "validated": true, 15 | "timer": { 16 | "training": 608.286406993866, 17 | "validation": 56.40983510017395, 18 | "build": 710.0041649341583, 19 | "features": 44.22180891036987 20 | }, 21 | "accuracy": 0.9461648558974692 22 | } 23 | -------------------------------------------------------------------------------- /fixtures/model-2015-05-02.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgetown-analytics/product-classifier/85f2b9a164679ada1aa6b3b28d862fbf96f7de2c/fixtures/model-2015-05-02.pickle -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==3.11 2 | confire==0.2.0 3 | nltk==3.0.1 4 | nose==1.3.4 5 | numpy==1.9.1 6 | python-dateutil==2.4.0 7 | six==1.9.0 8 | unicodecsv==0.9.4 9 | wsgiref==0.1.2 10 | -------------------------------------------------------------------------------- 
/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | try: 4 | from setuptools import setup 5 | from setuptools import find_packages 6 | except ImportError: 7 | raise ImportError("Could not import \"setuptools\". Please install the setuptools package.") 8 | 9 | 10 | packages = find_packages(where=".", 11 | exclude=('tests', 'bin', 'docs', 'fixtures', 'conf')) 12 | 13 | requires = [] 14 | 15 | with open('requirements.txt', 'r') as reqfile: 16 | for line in reqfile: 17 | requires.append(line.strip()) 18 | 19 | classifiers = ( 20 | 'Intended Audience :: Developers', 21 | 'License :: OSI Approved :: MIT License', 22 | 'Natural Language :: English', 23 | 'Operating System :: MacOS :: MacOS X', 24 | 'Operating System :: POSIX :: Linux', 25 | 'Programming Language :: Python :: 2.7', 26 | ) 27 | 28 | config = { 29 | "name": "Product Classifier", 30 | "version": "1.0", 31 | "description": "Classify products into categories by their name with NLTK", 32 | "author": "Benjamin Bengfort", 33 | "author_email": "benjamin.bengfort@georgetown.edu", 34 | "url": "https://github.com/georgetown-analytics/product-classifier", 35 | "packages": packages, 36 | "install_requires": requires, 37 | "classifiers": classifiers, 38 | "zip_safe": False, 39 | "scripts": ['bin/apparel-classify.py',], 40 | } 41 | 42 | setup(**config) 43 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # tests 2 | # Tests for the Apparel package 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Wed Apr 23 08:56:12 2014 -0400 6 | # 7 | # Copyright (C) 2014 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: __init__.py [] benjamin@bengfort.com $ 11 | 12 | """ 13 | Tests for the Apparel package 14 | """ 15 | 16 | ##########################################################################
17 | ## Imports 18 | ########################################################################## 19 | 20 | import unittest 21 | 22 | ########################################################################## 23 | ## Initialization Test Case 24 | ########################################################################## 25 | 26 | class InitializationTests(unittest.TestCase): 27 | 28 | def test_initialization(self): 29 | """ 30 | Assert the world is sane by checking a fact, 2+2=4 31 | """ 32 | self.assertEqual(2+2, 4) 33 | 34 | def test_import(self): 35 | """ 36 | We're able to import the apparel library 37 | """ 38 | try: 39 | import apparel 40 | except ImportError: 41 | self.fail("Was unable to import the Apparel library!") 42 | --------------------------------------------------------------------------------