├── .gitignore ├── .pylint ├── .travis.yml ├── AUTHORS.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── docs ├── .deploy_heroku │ ├── Gemfile │ ├── Gemfile.lock │ ├── config.ru │ └── public │ │ └── 404.html ├── .gitignore ├── Makefile ├── README.md ├── README_sphinx_deployment.md ├── _static │ └── logo.png ├── _templates │ ├── side-primary.html │ └── side-secondary.html ├── _themes │ ├── .gitignore │ ├── LICENSE │ ├── flask_theme_support.py │ ├── kr │ │ ├── layout.html │ │ ├── relations.html │ │ ├── static │ │ │ ├── flasky.css_t │ │ │ └── small_flask.css │ │ └── theme.conf │ └── kr_small │ │ ├── layout.html │ │ ├── static │ │ └── flasky.css_t │ │ └── theme.conf ├── build_script_docs.py ├── cli.rst ├── conf.py ├── contributing.rst ├── index.rst ├── install.rst ├── quickstart.rst ├── requirements.txt ├── rsync_exclude ├── scripts │ ├── download.rst │ ├── filterlengths.rst │ ├── filterpunc.rst │ ├── filterwords.rst │ ├── nonewlines.rst │ ├── showstops.rst │ ├── text2ngrams.rst │ ├── text2punc.rst │ ├── text2sentences.rst │ ├── text2words.rst │ ├── texts2json.rst │ ├── tokens2counts.rst │ ├── tokens2json.rst │ ├── tokens2lower.rst │ ├── tokens2pos.rst │ ├── tokens2stem.rst │ ├── tokens2text.rst │ ├── tokens2topbigrams.rst │ ├── tokens2upper.rst │ ├── transliterate.rst │ ├── words2bigrams.rst │ └── words2ngrams.rst └── sphinx_deployment.mk ├── environment.yml ├── requirements.txt ├── setup.cfg ├── setup.py ├── test_data ├── alice.txt ├── alice_doc.json ├── alice_short.txt ├── alice_words.txt ├── docs.json ├── international.transliterate.txt ├── international.txt ├── pride_and_prejudice.txt ├── pride_words.txt └── word_tokens.txt ├── tests ├── __init__.py ├── test_coerce.py ├── test_filter.py ├── test_tokenize.py ├── test_transform.py ├── test_utils.py ├── transliterate.py └── utils.py └── textkit ├── __init__.py ├── cli.py ├── coerce.py ├── data └── stopwords │ ├── README.md │ ├── danish.txt │ ├── dutch.txt │ ├── english.txt │ ├── finnish.txt │ ├── french.txt │ ├── german.txt │ ├── hungarian.txt │ ├── italian.txt │ ├── norwegian.txt │ ├── portuguese.txt │ ├── russian.txt │ ├── spanish.txt │ ├── swedish.txt │ └── turkish.txt ├── download.py ├── filter ├── __init__.py ├── filter_lengths.py ├── filter_punc.py └── filter_words.py ├── package ├── __init__.py ├── texts_to_json.py ├── tokens_to_json.py └── tokens_to_text.py ├── tokenize ├── __init__.py ├── bigrams.py ├── ngrams.py ├── punc.py ├── sentences.py └── words.py ├── transform ├── __init__.py ├── newlines.py ├── tokens_to_counts.py ├── tokens_to_lower.py ├── tokens_to_pos.py ├── tokens_to_stem.py ├── tokens_to_top_bigrams.py ├── tokens_to_upper.py └── transliterate.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | .DS_Store 4 | build 5 | textkit.egg-info 6 | dist 7 | *.swo 8 | .coverage 9 | .tox 10 | .cache 11 | docs/_build 12 | 13 | # Readme build 14 | README.html 15 | -------------------------------------------------------------------------------- /.pylint: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Add files or directories to the blacklist. They should be base names, not 11 | # paths. 12 | ignore=data,docs,test_data 13 | 14 | # Pickle collected data for later comparisons. 
15 | persistent=yes 16 | 17 | # List of plugins (as comma separated values of python modules names) to load, 18 | # usually to register additional checkers. 19 | load-plugins= 20 | 21 | # Use multiple processes to speed up Pylint. 22 | jobs=1 23 | 24 | # Allow loading of arbitrary C extensions. Extensions are imported into the 25 | # active Python interpreter and may run arbitrary code. 26 | unsafe-load-any-extension=no 27 | 28 | # A comma-separated list of package or module names from where C extensions may 29 | # be loaded. Extensions are loading into the active Python interpreter and may 30 | # run arbitrary code 31 | extension-pkg-whitelist= 32 | 33 | # Allow optimization of some AST trees. This will activate a peephole AST 34 | # optimizer, which will apply various small optimizations. For instance, it can 35 | # be used to obtain the result of joining multiple strings with the addition 36 | # operator. Joining a lot of strings can lead to a maximum recursion error in 37 | # Pylint and this flag can prevent that. It has one side effect, the resulting 38 | # AST will be different than the one from reality. 39 | optimize-ast=no 40 | 41 | 42 | [MESSAGES CONTROL] 43 | 44 | # Only show warnings with the listed confidence levels. Leave empty to show 45 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 46 | confidence= 47 | 48 | # Enable the message, report, category or checker with the given id(s). You can 49 | # either give multiple identifier separated by comma (,) or put this option 50 | # multiple time (only on the command line, not in the configuration file where 51 | # it should appear only once). See also the "--disable" option for examples. 52 | #enable= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once).You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use"--disable=all --enable=classes 62 | # --disable=W" 63 | disable=getslice-method,unicode-builtin,useless-suppression,old-raise-syntax,cmp-builtin,import-star-module-level,unpacking-in-except,standarderror-builtin,cmp-method,suppressed-message,parameter-unpacking,round-builtin,file-builtin,old-division,raw_input-builtin,print-statement,range-builtin-not-iterating,oct-method,reload-builtin,execfile-builtin,zip-builtin-not-iterating,long-builtin,using-cmp-argument,next-method-called,apply-builtin,backtick,no-absolute-import,dict-iter-method,old-octal-literal,coerce-method,long-suffix,raising-string,filter-builtin-not-iterating,delslice-method,indexing-exception,hex-method,setslice-method,dict-view-method,buffer-builtin,coerce-builtin,unichr-builtin,input-builtin,old-ne-operator,xrange-builtin,map-builtin-not-iterating,reduce-builtin,basestring-builtin,nonzero-method,metaclass-assignment,intern-builtin,missing-docstring,expression-not-assigned,bad-continuation,too-many-arguments,bad-builtin 64 | 65 | [REPORTS] 66 | 67 | # Set the output format. Available formats are text, parseable, colorized, msvs 68 | # (visual studio) and html. 
You can also give a reporter class, eg 69 | # mypackage.mymodule.MyReporterClass. 70 | output-format=text 71 | 72 | # Put messages in a separate file for each module / package specified on the 73 | # command line instead of printing them on stdout. Reports (if any) will be 74 | # written in a file name "pylint_global.[txt|html]". 75 | files-output=no 76 | 77 | # Tells whether to display a full report or only the messages 78 | reports=yes 79 | 80 | # Python expression which should return a note less than 10 (10 is the highest 81 | # note). You have access to the variables errors warning, statement which 82 | # respectively contain the number of errors / warnings messages and the total 83 | # number of statements analyzed. This is used by the global evaluation report 84 | # (RP0004). 85 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 86 | 87 | # Template used to display messages. This is a python new-style format string 88 | # used to format the message information. See doc for all details 89 | #msg-template= 90 | 91 | 92 | [BASIC] 93 | 94 | # Include a hint for the correct naming format with invalid-name 95 | include-naming-hint=yes 96 | 97 | # Regular expression matching correct class names 98 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 99 | 100 | # Naming hint for class names 101 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 102 | 103 | # Regular expression matching correct argument names 104 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 105 | 106 | # Naming hint for argument names 107 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 108 | 109 | # Regular expression matching correct class attribute names 110 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 111 | 112 | # Naming hint for class attribute names 113 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 114 | 115 | # Regular expression matching correct inline iteration names 116 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 117 | 118 | # Naming hint for inline iteration names 119 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 120 | 121 | # Regular expression matching correct attribute names 122 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 123 | 124 | # Naming hint for attribute names 125 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 126 | 127 | # Regular expression matching correct constant names 128 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 129 | 130 | # Naming hint for constant names 131 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 132 | 133 | # Regular expression matching correct variable names 134 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 135 | 136 | # Naming hint for variable names 137 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 138 | 139 | # Regular expression matching correct function names 140 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 141 | 142 | # Naming hint for function names 143 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 144 | 145 | # Regular expression matching correct method names 146 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 147 | 148 | # Naming hint for method names 149 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 150 | 151 | # Regular expression matching correct module names 152 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 153 | 154 | # Naming hint for module names 155 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 156 | 157 | # Regular expression which should only match function or class names that do 158 | # not require a docstring. 159 | no-docstring-rgx=^_ 160 | 161 | # Minimum line length for functions/classes that require docstrings, shorter 162 | # ones are exempt. 
163 | docstring-min-length=-1 164 | 165 | 166 | [ELIF] 167 | 168 | # Maximum number of nested blocks for function / method body 169 | max-nested-blocks=5 170 | 171 | [FORMAT] 172 | 173 | # Maximum number of characters on a single line. 174 | max-line-length=80 175 | 176 | # Regexp for a line that is allowed to be longer than the limit. 177 | ignore-long-lines=^\s*(# )??$ 178 | 179 | # Allow the body of an if to be on the same line as the test if there is no 180 | # else. 181 | single-line-if-stmt=no 182 | 183 | # List of optional constructs for which whitespace checking is disabled. `dict- 184 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 185 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 186 | # `empty-line` allows space-only lines. 187 | no-space-check=trailing-comma,dict-separator 188 | 189 | # Maximum number of lines in a module 190 | max-module-lines=1000 191 | 192 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 193 | # tab). 194 | indent-string=' ' 195 | 196 | # Number of spaces of indent required inside a hanging or continued line. 197 | indent-after-paren=4 198 | 199 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 200 | expected-line-ending-format= 201 | 202 | 203 | [LOGGING] 204 | 205 | # Logging modules to check that the string format arguments are in logging 206 | # function parameter format 207 | logging-modules=logging 208 | 209 | 210 | [MISCELLANEOUS] 211 | 212 | # List of note tags to take in consideration, separated by a comma. 213 | notes=FIXME,XXX,TODO 214 | 215 | [SIMILARITIES] 216 | 217 | # Minimum lines number of a similarity. 218 | min-similarity-lines=4 219 | 220 | # Ignore comments when computing similarities. 221 | ignore-comments=yes 222 | 223 | # Ignore docstrings when computing similarities. 224 | ignore-docstrings=yes 225 | 226 | # Ignore imports when computing similarities. 227 | ignore-imports=no 228 | 229 | 230 | [SPELLING] 231 | 232 | # Spelling dictionary name. Available dictionaries: none. To make it working 233 | # install python-enchant package. 234 | spelling-dict= 235 | 236 | # List of comma separated words that should not be checked. 237 | spelling-ignore-words= 238 | 239 | # A path to a file that contains private dictionary; one word per line. 240 | spelling-private-dict-file= 241 | 242 | # Tells whether to store unknown words to indicated private dictionary in 243 | # --spelling-private-dict-file option instead of raising a message. 244 | spelling-store-unknown-words=no 245 | 246 | 247 | [TYPECHECK] 248 | 249 | # Tells whether missing members accessed in mixin class should be ignored. A 250 | # mixin class is detected if its name ends with "mixin" (case insensitive). 251 | ignore-mixin-members=yes 252 | 253 | # List of module names for which member attributes should not be checked 254 | # (useful for modules/projects where namespaces are manipulated during runtime 255 | # and thus existing member attributes cannot be deduced by static analysis. It 256 | # supports qualified module names, as well as Unix pattern matching. 257 | ignored-modules= 258 | 259 | # List of classes names for which member attributes should not be checked 260 | # (useful for classes with attributes dynamically set). This supports can work 261 | # with qualified names. 262 | ignored-classes= 263 | 264 | # List of members which are set dynamically and missed by pylint inference 265 | # system, and so shouldn't trigger E1101 when accessed. 
Python regular 266 | # expressions are accepted. 267 | generated-members= 268 | 269 | 270 | [VARIABLES] 271 | 272 | # Tells whether we should check for unused import in __init__ files. 273 | init-import=no 274 | 275 | # A regular expression matching the name of dummy variables (i.e. expectedly 276 | # not used). 277 | dummy-variables-rgx=_$|dummy 278 | 279 | # List of additional names supposed to be defined in builtins. Remember that 280 | # you should avoid to define new builtins when possible. 281 | additional-builtins= 282 | 283 | # List of strings which can identify a callback function by name. A callback 284 | # name must start or end with one of those strings. 285 | callbacks=cb_,_cb 286 | 287 | 288 | [CLASSES] 289 | 290 | # List of method names used to declare (i.e. assign) instance attributes. 291 | defining-attr-methods=__init__,__new__,setUp 292 | 293 | # List of valid names for the first argument in a class method. 294 | valid-classmethod-first-arg=cls 295 | 296 | # List of valid names for the first argument in a metaclass class method. 297 | valid-metaclass-classmethod-first-arg=mcs 298 | 299 | # List of member names, which should be excluded from the protected access 300 | # warning. 301 | exclude-protected=_asdict,_fields,_replace,_source,_make 302 | 303 | 304 | [DESIGN] 305 | 306 | # Maximum number of arguments for function / method 307 | max-args=5 308 | 309 | # Argument names that match this expression will be ignored. Default to name 310 | # with leading underscore 311 | ignored-argument-names=_.* 312 | 313 | # Maximum number of locals for function / method body 314 | max-locals=15 315 | 316 | # Maximum number of return / yield for function / method body 317 | max-returns=6 318 | 319 | # Maximum number of branch for function / method body 320 | max-branches=12 321 | 322 | # Maximum number of statements in function / method body 323 | max-statements=50 324 | 325 | # Maximum number of parents for a class (see R0901). 326 | max-parents=7 327 | 328 | # Maximum number of attributes for a class (see R0902). 329 | max-attributes=7 330 | 331 | # Minimum number of public methods for a class (see R0903). 332 | min-public-methods=2 333 | 334 | # Maximum number of public methods for a class (see R0904). 335 | max-public-methods=20 336 | 337 | # Maximum number of boolean expressions in a if statement 338 | max-bool-expr=5 339 | 340 | 341 | [IMPORTS] 342 | 343 | # Deprecated modules which should not be used, separated by a comma 344 | deprecated-modules=optparse 345 | 346 | # Create a graph of every (i.e. internal and external) dependencies in the 347 | # given file (report RP0402 must not be disabled) 348 | import-graph= 349 | 350 | # Create a graph of external dependencies in the given file (report RP0402 must 351 | # not be disabled) 352 | ext-import-graph= 353 | 354 | # Create a graph of internal dependencies in the given file (report RP0402 must 355 | # not be disabled) 356 | int-import-graph= 357 | 358 | 359 | [EXCEPTIONS] 360 | 361 | # Exceptions that will emit a warning when being caught. Defaults to 362 | # "Exception" 363 | overgeneral-exceptions=Exception 364 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | before_script: 3 | - git config --global user.name "Teracy" 4 | - git config --global user.email "your-friends@teracy.com" 5 | - export REPO_URL_GITHUB="https://$GH_TOKEN@github.com/$GH_REPO.git" 6 | - . 
./.travis/setup.sh 7 | - echo $DEPLOY_HTML_DIR 8 | after_script: 9 | - sudo pip install -r docs/requirements.txt --use-mirrors 10 | - cd docs 11 | - make setup_gh_pages 12 | - make generate 13 | - make deploy 14 | env: 15 | global: 16 | - GH_REPO="teracyhq/sphinx-deployment" 17 | notifications: 18 | slack: 19 | secure: RhiJy6C19bKjM8fY5oHFSBAyHgRRDr+ZiYhK5QwlitpQmSCsABLsVGWpIrD+aA0nPKs6khtmNFUoEsXtzol65CsvV2m70SQbVK3OCbOwXd6xRSLw8hb1p+SOnpqu/VvZ89iayVl7J9Y8WpJQlUfobMXZgequED7mSRo9+bdBB4E= 20 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ******* 2 | Authors 3 | ******* 4 | 5 | Contributors (in random order) 6 | ============================ 7 | 8 | - Lynn Cherny `@arnicas `_ 9 | - Yannick Assogba `@tafsiri `_ 10 | - Jim Vallandingham `@vlandham `_ 11 | - Irene Ros `@ireneros `_ 12 | - Alfred Lee `@justalfred `_ 13 | - Jenn Schiffer`@jennschiffer `_ 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 LTV 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include textkit/data/**/* 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | test: 3 | python -m pytest tests 4 | 5 | lint: 6 | pylint --rcfile .pylint textkit/*/**.py 7 | 8 | install_dev: 9 | pip install --editable . 
10 | 11 | package: 12 | python setup.py egg_info 13 | python setup.py sdist 14 | python setup.py bdist_wheel --universal 15 | 16 | publish: clean package 17 | twine upload dist/* 18 | 19 | clean: 20 | rm -rf build 21 | rm -rf textkit.egg-info 22 | rm -rf dist 23 | 24 | readme: 25 | # This requires `pip install rst2html5` 26 | # we should document this - but we shouldn't need it as a requirement 27 | # or dependency of textkit 28 | rst2html.py README.rst > README.html 29 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | textkit 2 | ======= 3 | 4 | Simple text analysis from the command line. 5 | 6 | Homepage: `http://learntextvis.github.io/textkit/ `_ 7 | 8 | About 9 | ----- 10 | 11 | `textkit` is a series of small, Unix-style tools that provide a suite of capabilities for 12 | dealing with text as data. 13 | 14 | Think of textkit as basic natural language processing capabilities - from the command line. 15 | 16 | textkit Features 17 | ---------------- 18 | 19 | Here are some of the cool things you can do with textkit. 20 | 21 | Convert a document to a set of word tokens and remove all punctuation from the tokens: 22 | 23 | .. code-block:: bash 24 | 25 | textkit text2words input.txt | textkit filterpunc 26 | 27 | Count the most-used words in a text: 28 | 29 | .. code-block:: bash 30 | 31 | textkit text2words alice.txt | textkit count --limit 20 32 | 33 | Do the same, but with punctuation removed: 34 | 35 | .. code-block:: bash 36 | 37 | textkit text2words alice.txt | textkit filterpunc | textkit count --limit 20 38 | 39 | Installation 40 | ------------ 41 | :: 42 | 43 | $ pip install -U textkit 44 | $ textkit --help 45 | 46 | 47 | Dev install 48 | ----------- 49 | 50 | To test locally, clone the repo: 51 | 52 | :: 53 | 54 | git clone git@github.com:learntextvis/textkit.git 55 | 56 | 57 | Create a local virtual environment or `conda` environment. 58 | 59 | Here is how I created my local `conda` environment for installing and testing textkit: 60 | 61 | :: 62 | 63 | conda create --name textkit nltk 64 | 65 | source activate textkit 66 | 67 | Then I went into the `textkit` directory to install its requirements: 68 | 69 | :: 70 | 71 | cd textkit 72 | 73 | pip install -r requirements.txt 74 | 75 | Finally, I installed the local version of textkit using the `--editable` flag: 76 | 77 | :: 78 | 79 | pip install --editable . 80 | 81 | Examples 82 | -------- 83 | 84 | See more examples at the `Quickstart guide`_. 85 | 86 |
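One more sketch, chaining the tokenizer with the stopword filter before counting. This assumes the `filterwords` command drops common stopwords, as its name and its script docs (`docs/scripts/filterwords.rst`) suggest; the exact options may differ from what is shown here:

.. code-block:: bash

    textkit text2words alice.txt | textkit filterwords | textkit count --limit 20

..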
_`Quickstart guide`: http://learntextvis.github.io/textkit/quickstart.html 87 | 88 | 89 | Requirements 90 | ------------ 91 | 92 | - Python >= 2.6 or >= 3.3 93 | 94 | Project Links 95 | ------------- 96 | 97 | - Docs: http://learntextvis.github.io/textkit/ 98 | - PyPI: https://pypi.python.org/pypi/textkit 99 | - Issues: https://github.com/learntextvis/textkit/issues 100 | -------------------------------------------------------------------------------- /docs/.deploy_heroku/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | ruby '1.9.3' 3 | 4 | gem 'sinatra', '~> 1.4.2' 5 | -------------------------------------------------------------------------------- /docs/.deploy_heroku/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | rack (1.5.2) 5 | rack-protection (1.5.1) 6 | rack 7 | sinatra (1.4.4) 8 | rack (~> 1.4) 9 | rack-protection (~> 1.4) 10 | tilt (~> 1.3, >= 1.3.4) 11 | tilt (1.4.1) 12 | 13 | PLATFORMS 14 | ruby 15 | 16 | DEPENDENCIES 17 | sinatra (~> 1.4.2) 18 | -------------------------------------------------------------------------------- /docs/.deploy_heroku/config.ru: -------------------------------------------------------------------------------- 1 | require 'bundler/setup' 2 | require 'sinatra/base' 3 | 4 | # The project root directory 5 | $root = ::File.dirname(__FILE__) 6 | 7 | class SinatraStaticServer < Sinatra::Base 8 | 9 | get(/.+/) do 10 | send_sinatra_file(request.path) {404} 11 | end 12 | 13 | not_found do 14 | send_file(File.join(File.dirname(__FILE__), 'public', '404.html'), {:status => 404}) 15 | end 16 | 17 | def send_sinatra_file(path, &missing_file_block) 18 | file_path = File.join(File.dirname(__FILE__), 'public', path) 19 | file_path = File.join(file_path, 'index.html') unless file_path =~ /\.[a-z]+$/i 20 | File.exist?(file_path) ? send_file(file_path) : missing_file_block.call 21 | end 22 | 23 | end 24 | 25 | run SinatraStaticServer 26 | -------------------------------------------------------------------------------- /docs/.deploy_heroku/public/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 404 - Page Not Found 4 | 5 | 6 |

Error: 404 - Page Not Found

7 | 8 | 9 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | # default ignored by sphinx-deployment 2 | _deploy 3 | _deploy_heroku 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/textkit.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/textkit.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/textkit" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/textkit" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 
157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | 179 | include sphinx_deployment.mk 180 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # textkit documentation 2 | 3 | ## Installing prerequisites 4 | 5 | The doc building system has additional requirements past the base `textkit` install. 6 | 7 | **Note:** All commands expect you are in the `docs` sub-directory of the project. 8 | So, `cd` to that directory first: 9 | 10 | ``` 11 | cd docs 12 | ``` 13 | 14 | You can install these packages using: 15 | 16 | ``` 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Building Docs 21 | 22 | This should be done via `make`: 23 | 24 | ``` 25 | make html 26 | ``` 27 | 28 | ## Viewing Built Docs 29 | 30 | The built documentation should now be in the `_build/html` sub-directory 31 | 32 | You can view them by starting a webserver in that directory. 33 | 34 | If you are on python 3, this should do that: 35 | 36 | ``` 37 | cd _build/html/; python -m http.server 38 | ``` 39 | 40 | If you are on python 2, try: 41 | 42 | ``` 43 | cd _build/html/; python -m SimpleHTTPServer 44 | ``` 45 | 46 | This should allow you to visit [http://localhost:8000/](http://localhost:8000/) and 47 | see your updated docs. 48 | 49 | ## Deploying to gh-pages 50 | 51 | We are using the [Sphinx Deployment](https://github.com/teracyhq/sphinx-deployment) tooling 52 | to automate deploying documentation to the `gh-pages` branch of this repo. 53 | 54 | This keeps the documentation in the same location as the code, which is great for discoverability. 55 | 56 | It also provides documentation at a consistent url: [http://learntextvis.github.io/textkit/](http://learntextvis.github.io/textkit/). 57 | 58 | I believe Sphinx Deployment is all setup to allow for anyone with write capabilities to the repo 59 | (owners) to update and push documentation with the following commands: 60 | 61 | ``` 62 | make html 63 | make deploy 64 | ``` 65 | 66 | The default deployment is to `gh-pages` which is what we want. 67 | The configuration of the Sphinx Deployment process is specified in `sphinx_deployment.mk` - so check there for all the configuration parameters to ensure everything is set as we want it. 68 | 69 | For more info on the options, `README_sphinx_deployment.md` comes from the 70 | -------------------------------------------------------------------------------- /docs/README_sphinx_deployment.md: -------------------------------------------------------------------------------- 1 | sphinx-deployment 2 | ================= 3 | 4 | Automatic setup and deployment for [sphinx][] docs. 
5 | 6 | This project is intended to be used to deploy [sphinx][] project on: 7 | 8 | - [Github Pages](https://help.github.com/categories/20/articles) 9 | - [Rsync](http://en.wikipedia.org/wiki/Rsync) 10 | - PaaS services: [heroku](http://heroku.com/), etc. 11 | 12 | Usage 13 | ----- 14 | 15 | **1. `$ make generate`** 16 | 17 | For generating contents, alias for `$ make html` 18 | 19 | **2. `$ make deploy`** 20 | 21 | For short-cut deployment, it could be `$ make deploy_gh_pages`, `$ make deploy_rsync` or 22 | `$ make deploy_heroku` basing on the configuration of `DEPLOY_DEFAULT`. 23 | 24 | **3. `$ make gen_deploy`** 25 | 26 | For short-cut generation and deployment: `$ make generate` and then `$ make deploy`. 27 | 28 | **4. `$ make setup_gh_pages`** 29 | 30 | For the first time only to create `$(DEPLOY_DIR)` to track `$(DEPLOY_BRANCH)`. This is used for 31 | github pages deployment. 32 | 33 | **5. `$ make setup_heroku`** 34 | 35 | For the first time only to create `$(DEPLOY_DIR_HEROKU` to track the Heroku repo's master branch. 36 | This is used for heroku deployment. 37 | 38 | **6. `$ make deploy_gh_pages`** 39 | 40 | For deploying with github pages only. 41 | 42 | **7. `$ make deploy_rsync`** 43 | 44 | For deploying with rsync only. 45 | 46 | **8. `$ make deploy_heroku`** 47 | 48 | For deploying with heroku only. 49 | 50 | 51 | Installation 52 | ------------ 53 | 54 | **1. Bash script** 55 | 56 | Just run this bash script from your root git repository project and it's enough. 57 | 58 | You need to specify the `` to your sphinx docs directory: 59 | 60 | ``` bash 61 | $ cd 62 | $ wget https://raw.github.com/teracyhq/sphinx-deployment/master/scripts/spxd.sh && chmod +x ./spxd.sh && ./spxd.sh -p 63 | ``` 64 | 65 | For example: 66 | 67 | ``` bash 68 | $ cd my_project 69 | $ wget https://raw.github.com/teracyhq/sphinx-deployment/master/scripts/spxd.sh && chmod +x ./spxd.sh && ./spxd.sh -p ./docs 70 | ``` 71 | 72 | **2. Manual** 73 | 74 | a. You need to copy these following files to your [sphinx][] directory: 75 | 76 | - `docs/requirements` 77 | - `docs/sphinx_deployment.mk` 78 | - `docs/rsync_exclude` 79 | - `docs/.deploy_heroku/*` 80 | - `docs/.gitignore` 81 | 82 | b. Include `sphinx_deployment.mk` to your `Makefile`: 83 | 84 | - Add the content below to your `Makefile`: 85 | 86 | ``` 87 | include sphinx_deployment.mk 88 | ``` 89 | 90 | - Or do with commands on terminal: 91 | 92 | ``` bash 93 | echo '' >> Makefile 94 | echo 'include sphinx_deployment.mk' >> Makefile 95 | ``` 96 | 97 | 98 | c.. To build with `travis-ci`, you need to copy these following files to your root project directory: 99 | 100 | - `.travis.yml` 101 | - `.travis/setup.sh` 102 | 103 | 104 | Configuration 105 | ------------- 106 | 107 | You need to configure these following deployment configurations following your project settings on 108 | `sphinx_deployment.mk` file. 
109 | 110 | ``` Makefile 111 | # Deployment configurations from sphinx_deployment project 112 | 113 | # default deployment when $ make deploy 114 | # deploy_gh_pages : to $ make deploy_gh_pages 115 | # deploy_rsync : to $ make deploy_rsync 116 | # deploy_heroku : to $ make deploy_heroku 117 | # deploy_gh_pages deploy_rsync deploy_heroku : to $ make deploy_gh_pages then $ make deploy_rsync 118 | # and then $ make deploy_heroku 119 | # default value: deploy_gh_pages 120 | ifndef DEPLOY_DEFAULT 121 | DEPLOY_DEFAULT = deploy_gh_pages 122 | endif 123 | 124 | # The deployment directory to be deployed 125 | ifndef DEPLOY_DIR 126 | DEPLOY_DIR = _deploy 127 | endif 128 | 129 | # The heroku deployment directory to be deployed 130 | # we must create this separated dir to avoid any conflict with _deploy (rsync and gh_pages) 131 | ifndef DEPLOY_DIR_HEROKU 132 | DEPLOY_DIR_HEROKU = _deploy_heroku 133 | endif 134 | 135 | # Copy contents from $(BUILDDIR) to $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) directory 136 | ifndef DEPLOY_HTML_DIR 137 | DEPLOY_HTML_DIR = docs 138 | endif 139 | 140 | 141 | ## -- Rsync Deploy config -- ## 142 | # Be sure your public key is listed in your server's ~/.ssh/authorized_keys file 143 | ifndef SSH_USER 144 | SSH_USER = user@domain.com 145 | endif 146 | 147 | ifndef SSH_PORT 148 | SSH_PORT = 22 149 | endif 150 | 151 | ifndef DOCUMENT_ROOT 152 | DOCUMENT_ROOT = ~/website.com/ 153 | endif 154 | 155 | #If you choose to delete on sync, rsync will create a 1:1 match 156 | ifndef RSYNC_DELETE 157 | RSYNC_DELETE = false 158 | endif 159 | 160 | # Any extra arguments to pass to rsync 161 | ifndef RSYNC_ARGS 162 | RSYNC_ARGS = 163 | endif 164 | 165 | ## -- Github Pages Deploy config -- ## 166 | 167 | # Configure the right deployment branch 168 | ifndef DEPLOY_BRANCH_GITHUB 169 | DEPLOY_BRANCH_GITHUB = gh-pages 170 | endif 171 | 172 | #if REPO_URL_GITHUB was NOT defined by travis-ci 173 | ifndef REPO_URL_GITHUB 174 | # Configure your right github project repo 175 | # REPO_URL = git@github.com:teracy-official/sphinx-deployment.git 176 | endif 177 | 178 | ## -- Heroku Deployment Config -- ## 179 | 180 | ifndef REPO_URL_HEROKU 181 | # Configure your right heroku repo 182 | # REPO_URL_HEROKU = git@heroku.com:spxd.git 183 | endif 184 | 185 | 186 | ## end deployment configuration, don't edit anything below this line ## 187 | ####################################################################### 188 | ``` 189 | 190 | Continuous Integration Build 191 | ---------------------------- 192 | 193 | **1. `travis-ci`** 194 | 195 | Move `.travis.yml` file to your root repository project, and configure it following its 196 | instruction there. There is a supported `.travis/setup.sh` to export variables for `Makefile` 197 | depending on the being-built branch. 198 | 199 | To configure secure token for `travis-ci`, please read the similar step described at 200 | http://blog.teracy.com/2013/08/03/how-to-start-blogging-easily-with-octopress-and-teracy-dev/ 201 | 202 | 203 | **2. `jenkins`** 204 | 205 | //TODO 206 | 207 | 208 | Authors and contributors 209 | ------------------------ 210 | 211 | - Hoat Le: http://github.com/hoatle 212 | 213 | - Many thanks to http://octopress.org/docs/deploying/ for inspiration. 214 | 215 | License 216 | ------- 217 | 218 | BSD License 219 | 220 | ``` 221 | Copyright (c) Teracy, Inc. and individual contributors. 222 | All rights reserved. 
223 | 224 | Redistribution and use in source and binary forms, with or without modification, 225 | are permitted provided that the following conditions are met: 226 | 227 | 1. Redistributions of source code must retain the above copyright notice, 228 | this list of conditions and the following disclaimer. 229 | 230 | 2. Redistributions in binary form must reproduce the above copyright 231 | notice, this list of conditions and the following disclaimer in the 232 | documentation and/or other materials provided with the distribution. 233 | 234 | 3. Neither the name of Teracy, Inc. nor the names of its contributors may be used 235 | to endorse or promote products derived from this software without 236 | specific prior written permission. 237 | 238 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 239 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 240 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 241 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 242 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 243 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 244 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 245 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 246 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 247 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 248 | 249 | ``` 250 | 251 | [sphinx]: http://sphinx-doc.org 252 | -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/_templates/side-primary.html: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 |

7 | Command line tools for text processing and analysis. 8 |

9 | -------------------------------------------------------------------------------- /docs/_templates/side-secondary.html: -------------------------------------------------------------------------------- 1 | 4 | 5 |

6 | Command line tools for text processing and analysis. 7 |

8 | -------------------------------------------------------------------------------- /docs/_themes/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /docs/_themes/LICENSE: -------------------------------------------------------------------------------- 1 | Modifications: 2 | 3 | Copyright (c) 2010 Kenneth Reitz. 4 | 5 | 6 | Original Project: 7 | 8 | Copyright (c) 2010 by Armin Ronacher. 9 | 10 | 11 | Some rights reserved. 12 | 13 | Redistribution and use in source and binary forms of the theme, with or 14 | without modification, are permitted provided that the following conditions 15 | are met: 16 | 17 | * Redistributions of source code must retain the above copyright 18 | notice, this list of conditions and the following disclaimer. 19 | 20 | * Redistributions in binary form must reproduce the above 21 | copyright notice, this list of conditions and the following 22 | disclaimer in the documentation and/or other materials provided 23 | with the distribution. 24 | 25 | * The names of the contributors may not be used to endorse or 26 | promote products derived from this software without specific 27 | prior written permission. 28 | 29 | We kindly ask you to only use these themes in an unmodified manner just 30 | for Flask and Flask-related products, not for unrelated projects. If you 31 | like the visual style and want to use it for your own projects, please 32 | consider making some larger changes to the themes (such as changing 33 | font faces, sizes, colors or margins). 34 | 35 | THIS THEME IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 36 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 37 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 38 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 39 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 40 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 41 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 42 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 43 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 44 | ARISING IN ANY WAY OUT OF THE USE OF THIS THEME, EVEN IF ADVISED OF THE 45 | POSSIBILITY OF SUCH DAMAGE. 46 | -------------------------------------------------------------------------------- /docs/_themes/flask_theme_support.py: -------------------------------------------------------------------------------- 1 | # flasky extensions. 
flasky pygments style based on tango style 2 | from pygments.style import Style 3 | from pygments.token import Keyword, Name, Comment, String, Error, \ 4 | Number, Operator, Generic, Whitespace, Punctuation, Other, Literal 5 | 6 | 7 | class FlaskyStyle(Style): 8 | background_color = "#f8f8f8" 9 | default_style = "" 10 | 11 | styles = { 12 | # No corresponding class for the following: 13 | #Text: "", # class: '' 14 | Whitespace: "underline #f8f8f8", # class: 'w' 15 | Error: "#a40000 border:#ef2929", # class: 'err' 16 | Other: "#000000", # class 'x' 17 | 18 | Comment: "italic #8f5902", # class: 'c' 19 | Comment.Preproc: "noitalic", # class: 'cp' 20 | 21 | Keyword: "bold #004461", # class: 'k' 22 | Keyword.Constant: "bold #004461", # class: 'kc' 23 | Keyword.Declaration: "bold #004461", # class: 'kd' 24 | Keyword.Namespace: "bold #004461", # class: 'kn' 25 | Keyword.Pseudo: "bold #004461", # class: 'kp' 26 | Keyword.Reserved: "bold #004461", # class: 'kr' 27 | Keyword.Type: "bold #004461", # class: 'kt' 28 | 29 | Operator: "#582800", # class: 'o' 30 | Operator.Word: "bold #004461", # class: 'ow' - like keywords 31 | 32 | Punctuation: "bold #000000", # class: 'p' 33 | 34 | # because special names such as Name.Class, Name.Function, etc. 35 | # are not recognized as such later in the parsing, we choose them 36 | # to look the same as ordinary variables. 37 | Name: "#000000", # class: 'n' 38 | Name.Attribute: "#c4a000", # class: 'na' - to be revised 39 | Name.Builtin: "#004461", # class: 'nb' 40 | Name.Builtin.Pseudo: "#3465a4", # class: 'bp' 41 | Name.Class: "#000000", # class: 'nc' - to be revised 42 | Name.Constant: "#000000", # class: 'no' - to be revised 43 | Name.Decorator: "#888", # class: 'nd' - to be revised 44 | Name.Entity: "#ce5c00", # class: 'ni' 45 | Name.Exception: "bold #cc0000", # class: 'ne' 46 | Name.Function: "#000000", # class: 'nf' 47 | Name.Property: "#000000", # class: 'py' 48 | Name.Label: "#f57900", # class: 'nl' 49 | Name.Namespace: "#000000", # class: 'nn' - to be revised 50 | Name.Other: "#000000", # class: 'nx' 51 | Name.Tag: "bold #004461", # class: 'nt' - like a keyword 52 | Name.Variable: "#000000", # class: 'nv' - to be revised 53 | Name.Variable.Class: "#000000", # class: 'vc' - to be revised 54 | Name.Variable.Global: "#000000", # class: 'vg' - to be revised 55 | Name.Variable.Instance: "#000000", # class: 'vi' - to be revised 56 | 57 | Number: "#990000", # class: 'm' 58 | 59 | Literal: "#000000", # class: 'l' 60 | Literal.Date: "#000000", # class: 'ld' 61 | 62 | String: "#4e9a06", # class: 's' 63 | String.Backtick: "#4e9a06", # class: 'sb' 64 | String.Char: "#4e9a06", # class: 'sc' 65 | String.Doc: "italic #8f5902", # class: 'sd' - like a comment 66 | String.Double: "#4e9a06", # class: 's2' 67 | String.Escape: "#4e9a06", # class: 'se' 68 | String.Heredoc: "#4e9a06", # class: 'sh' 69 | String.Interpol: "#4e9a06", # class: 'si' 70 | String.Other: "#4e9a06", # class: 'sx' 71 | String.Regex: "#4e9a06", # class: 'sr' 72 | String.Single: "#4e9a06", # class: 's1' 73 | String.Symbol: "#4e9a06", # class: 'ss' 74 | 75 | Generic: "#000000", # class: 'g' 76 | Generic.Deleted: "#a40000", # class: 'gd' 77 | Generic.Emph: "italic #000000", # class: 'ge' 78 | Generic.Error: "#ef2929", # class: 'gr' 79 | Generic.Heading: "bold #000080", # class: 'gh' 80 | Generic.Inserted: "#00A000", # class: 'gi' 81 | Generic.Output: "#888", # class: 'go' 82 | Generic.Prompt: "#745334", # class: 'gp' 83 | Generic.Strong: "bold #000000", # class: 'gs' 84 | Generic.Subheading: "bold #800080", # 
class: 'gu' 85 | Generic.Traceback: "bold #a40000", # class: 'gt' 86 | } 87 | -------------------------------------------------------------------------------- /docs/_themes/kr/layout.html: -------------------------------------------------------------------------------- 1 | {%- extends "basic/layout.html" %} 2 | {%- block extrahead %} 3 | {{ super() }} 4 | {% if theme_touch_icon %} 5 | 6 | {% endif %} 7 | 8 | {% endblock %} 9 | {%- block relbar2 %}{% endblock %} 10 | {%- block footer %} 11 | 14 | 15 | Fork me on GitHub 16 | 17 | 18 | {%- endblock %} 19 | -------------------------------------------------------------------------------- /docs/_themes/kr/relations.html: -------------------------------------------------------------------------------- 1 |

Related Topics

2 | 20 | -------------------------------------------------------------------------------- /docs/_themes/kr/static/flasky.css_t: -------------------------------------------------------------------------------- 1 | /* 2 | * flasky.css_t 3 | * ~~~~~~~~~~~~ 4 | * 5 | * :copyright: Copyright 2010 by Armin Ronacher. Modifications by Kenneth Reitz. 6 | * :license: Flask Design License, see LICENSE for details. 7 | */ 8 | 9 | {% set page_width = '940px' %} 10 | {% set sidebar_width = '220px' %} 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: 'goudy old style', 'minion pro', 'bell mt', Georgia, 'Hiragino Mincho Pro'; 18 | font-size: 17px; 19 | background-color: white; 20 | color: #000; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.document { 26 | width: {{ page_width }}; 27 | margin: 30px auto 0 auto; 28 | } 29 | 30 | div.documentwrapper { 31 | float: left; 32 | width: 100%; 33 | } 34 | 35 | div.bodywrapper { 36 | margin: 0 0 0 {{ sidebar_width }}; 37 | } 38 | 39 | div.sphinxsidebar { 40 | width: {{ sidebar_width }}; 41 | } 42 | 43 | hr { 44 | border: 1px solid #B1B4B6; 45 | } 46 | 47 | div.body { 48 | background-color: #ffffff; 49 | color: #3E4349; 50 | padding: 0 30px 0 30px; 51 | } 52 | 53 | img.floatingflask { 54 | padding: 0 0 10px 10px; 55 | float: right; 56 | } 57 | 58 | div.footer { 59 | width: {{ page_width }}; 60 | margin: 20px auto 30px auto; 61 | font-size: 14px; 62 | color: #888; 63 | text-align: right; 64 | } 65 | 66 | div.footer a { 67 | color: #888; 68 | } 69 | 70 | div.related { 71 | display: none; 72 | } 73 | 74 | div.sphinxsidebar a { 75 | color: #444; 76 | text-decoration: none; 77 | border-bottom: 1px dotted #999; 78 | } 79 | 80 | div.sphinxsidebar a:hover { 81 | border-bottom: 1px solid #999; 82 | } 83 | 84 | div.sphinxsidebar { 85 | font-size: 14px; 86 | line-height: 1.5; 87 | } 88 | 89 | div.sphinxsidebarwrapper { 90 | padding: 18px 10px; 91 | } 92 | 93 | div.sphinxsidebarwrapper p.logo { 94 | padding: 0; 95 | margin: -10px 0 0 -20px; 96 | text-align: center; 97 | } 98 | 99 | div.sphinxsidebar h3, 100 | div.sphinxsidebar h4 { 101 | font-family: 'Garamond', 'Georgia', serif; 102 | color: #444; 103 | font-size: 24px; 104 | font-weight: normal; 105 | margin: 0 0 5px 0; 106 | padding: 0; 107 | } 108 | 109 | div.sphinxsidebar h4 { 110 | font-size: 20px; 111 | } 112 | 113 | div.sphinxsidebar h3 a { 114 | color: #444; 115 | } 116 | 117 | div.sphinxsidebar p.logo a, 118 | div.sphinxsidebar h3 a, 119 | div.sphinxsidebar p.logo a:hover, 120 | div.sphinxsidebar h3 a:hover { 121 | border: none; 122 | } 123 | 124 | div.sphinxsidebar p { 125 | color: #555; 126 | margin: 10px 0; 127 | } 128 | 129 | div.sphinxsidebar ul { 130 | margin: 10px 0; 131 | padding: 0; 132 | color: #000; 133 | } 134 | 135 | div.sphinxsidebar input { 136 | border: 1px solid #ccc; 137 | font-family: 'Georgia', serif; 138 | font-size: 1em; 139 | } 140 | 141 | /* -- body styles ----------------------------------------------------------- */ 142 | 143 | a { 144 | color: #004B6B; 145 | text-decoration: underline; 146 | } 147 | 148 | a:hover { 149 | color: #6D4100; 150 | text-decoration: underline; 151 | } 152 | 153 | div.body h1, 154 | div.body h2, 155 | div.body h3, 156 | div.body h4, 157 | div.body h5, 158 | div.body h6 { 159 | font-family: 'Garamond', 'Georgia', serif; 160 | font-weight: normal; 161 | margin: 30px 0px 10px 0px; 162 | padding: 0; 163 | } 164 | 165 | div.body h1 { margin-top: 0; 
padding-top: 0; font-size: 240%; } 166 | div.body h2 { font-size: 180%; } 167 | div.body h3 { font-size: 150%; } 168 | div.body h4 { font-size: 130%; } 169 | div.body h5 { font-size: 100%; } 170 | div.body h6 { font-size: 100%; } 171 | 172 | a.headerlink { 173 | color: #ddd; 174 | padding: 0 4px; 175 | text-decoration: none; 176 | } 177 | 178 | a.headerlink:hover { 179 | color: #444; 180 | background: #eaeaea; 181 | } 182 | 183 | div.body p, div.body dd, div.body li { 184 | line-height: 1.4em; 185 | } 186 | 187 | div.admonition { 188 | background: #fafafa; 189 | margin: 20px -30px; 190 | padding: 10px 30px; 191 | border-top: 1px solid #ccc; 192 | border-bottom: 1px solid #ccc; 193 | } 194 | 195 | div.admonition tt.xref, div.admonition a tt { 196 | border-bottom: 1px solid #fafafa; 197 | } 198 | 199 | dd div.admonition { 200 | margin-left: -60px; 201 | padding-left: 60px; 202 | } 203 | 204 | div.admonition p.admonition-title { 205 | font-family: 'Garamond', 'Georgia', serif; 206 | font-weight: normal; 207 | font-size: 24px; 208 | margin: 0 0 10px 0; 209 | padding: 0; 210 | line-height: 1; 211 | } 212 | 213 | div.admonition p.last { 214 | margin-bottom: 0; 215 | } 216 | 217 | div.highlight { 218 | background-color: white; 219 | } 220 | 221 | dt:target, .highlight { 222 | background: #FAF3E8; 223 | } 224 | 225 | div.note { 226 | background-color: #eee; 227 | border: 1px solid #ccc; 228 | } 229 | 230 | div.seealso { 231 | background-color: #ffc; 232 | border: 1px solid #ff6; 233 | } 234 | 235 | div.topic { 236 | background-color: #eee; 237 | } 238 | 239 | p.admonition-title { 240 | display: inline; 241 | } 242 | 243 | p.admonition-title:after { 244 | content: ":"; 245 | } 246 | 247 | pre, tt { 248 | font-family: 'Consolas', 'Menlo', 'Deja Vu Sans Mono', 'Bitstream Vera Sans Mono', monospace; 249 | font-size: 0.9em; 250 | } 251 | 252 | img.screenshot { 253 | } 254 | 255 | tt.descname, tt.descclassname { 256 | font-size: 0.95em; 257 | } 258 | 259 | tt.descname { 260 | padding-right: 0.08em; 261 | } 262 | 263 | img.screenshot { 264 | -moz-box-shadow: 2px 2px 4px #eee; 265 | -webkit-box-shadow: 2px 2px 4px #eee; 266 | box-shadow: 2px 2px 4px #eee; 267 | } 268 | 269 | table.docutils { 270 | border: 1px solid #888; 271 | -moz-box-shadow: 2px 2px 4px #eee; 272 | -webkit-box-shadow: 2px 2px 4px #eee; 273 | box-shadow: 2px 2px 4px #eee; 274 | } 275 | 276 | table.docutils td, table.docutils th { 277 | border: 1px solid #888; 278 | padding: 0.25em 0.7em; 279 | } 280 | 281 | table.field-list, table.footnote { 282 | border: none; 283 | -moz-box-shadow: none; 284 | -webkit-box-shadow: none; 285 | box-shadow: none; 286 | } 287 | 288 | table.footnote { 289 | margin: 15px 0; 290 | width: 100%; 291 | border: 1px solid #eee; 292 | background: #fdfdfd; 293 | font-size: 0.9em; 294 | } 295 | 296 | table.footnote + table.footnote { 297 | margin-top: -15px; 298 | border-top: none; 299 | } 300 | 301 | table.field-list th { 302 | padding: 0 0.8em 0 0; 303 | } 304 | 305 | table.field-list td { 306 | padding: 0; 307 | } 308 | 309 | table.footnote td.label { 310 | width: 0px; 311 | padding: 0.3em 0 0.3em 0.5em; 312 | } 313 | 314 | table.footnote td { 315 | padding: 0.3em 0.5em; 316 | } 317 | 318 | dl { 319 | margin: 0; 320 | padding: 0; 321 | } 322 | 323 | dl dd { 324 | margin-left: 30px; 325 | } 326 | 327 | blockquote { 328 | margin: 0 0 0 30px; 329 | padding: 0; 330 | } 331 | 332 | ul, ol { 333 | margin: 10px 0 10px 30px; 334 | padding: 0; 335 | } 336 | 337 | pre { 338 | background: #eee; 339 | padding: 7px 30px; 340 | 
margin: 15px -30px; 341 | line-height: 1.3em; 342 | } 343 | 344 | dl pre, blockquote pre, li pre { 345 | margin-left: -60px; 346 | padding-left: 60px; 347 | } 348 | 349 | dl dl pre { 350 | margin-left: -90px; 351 | padding-left: 90px; 352 | } 353 | 354 | tt { 355 | background-color: #ecf0f3; 356 | color: #222; 357 | /* padding: 1px 2px; */ 358 | } 359 | 360 | tt.xref, a tt { 361 | background-color: #FBFBFB; 362 | border-bottom: 1px solid white; 363 | } 364 | 365 | a.reference { 366 | text-decoration: none; 367 | border-bottom: 1px dotted #004B6B; 368 | } 369 | 370 | a.reference:hover { 371 | border-bottom: 1px solid #6D4100; 372 | } 373 | 374 | a.footnote-reference { 375 | text-decoration: none; 376 | font-size: 0.7em; 377 | vertical-align: top; 378 | border-bottom: 1px dotted #004B6B; 379 | } 380 | 381 | a.footnote-reference:hover { 382 | border-bottom: 1px solid #6D4100; 383 | } 384 | 385 | a:hover tt { 386 | background: #EEE; 387 | } 388 | 389 | 390 | @media screen and (max-width: 870px) { 391 | 392 | div.sphinxsidebar { 393 | display: none; 394 | } 395 | 396 | div.document { 397 | width: 100%; 398 | 399 | } 400 | 401 | div.documentwrapper { 402 | margin-left: 0; 403 | margin-top: 0; 404 | margin-right: 0; 405 | margin-bottom: 0; 406 | } 407 | 408 | div.bodywrapper { 409 | margin-top: 0; 410 | margin-right: 0; 411 | margin-bottom: 0; 412 | margin-left: 0; 413 | } 414 | 415 | ul { 416 | margin-left: 0; 417 | } 418 | 419 | .document { 420 | width: auto; 421 | } 422 | 423 | .footer { 424 | width: auto; 425 | } 426 | 427 | .bodywrapper { 428 | margin: 0; 429 | } 430 | 431 | .footer { 432 | width: auto; 433 | } 434 | 435 | .github { 436 | display: none; 437 | } 438 | 439 | 440 | 441 | } 442 | 443 | 444 | 445 | @media screen and (max-width: 875px) { 446 | 447 | body { 448 | margin: 0; 449 | padding: 20px 30px; 450 | } 451 | 452 | div.documentwrapper { 453 | float: none; 454 | background: white; 455 | } 456 | 457 | div.sphinxsidebar { 458 | display: block; 459 | float: none; 460 | width: 102.5%; 461 | margin: 50px -30px -20px -30px; 462 | padding: 10px 20px; 463 | background: #333; 464 | color: white; 465 | } 466 | 467 | div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p, 468 | div.sphinxsidebar h3 a { 469 | color: white; 470 | } 471 | 472 | div.sphinxsidebar a { 473 | color: #aaa; 474 | } 475 | 476 | div.sphinxsidebar p.logo { 477 | display: none; 478 | } 479 | 480 | div.document { 481 | width: 100%; 482 | margin: 0; 483 | } 484 | 485 | div.related { 486 | display: block; 487 | margin: 0; 488 | padding: 10px 0 20px 0; 489 | } 490 | 491 | div.related ul, 492 | div.related ul li { 493 | margin: 0; 494 | padding: 0; 495 | } 496 | 497 | div.footer { 498 | display: none; 499 | } 500 | 501 | div.bodywrapper { 502 | margin: 0; 503 | } 504 | 505 | div.body { 506 | min-height: 0; 507 | padding: 0; 508 | } 509 | 510 | .rtd_doc_footer { 511 | display: none; 512 | } 513 | 514 | .document { 515 | width: auto; 516 | } 517 | 518 | .footer { 519 | width: auto; 520 | } 521 | 522 | .footer { 523 | width: auto; 524 | } 525 | 526 | .github { 527 | display: none; 528 | } 529 | } 530 | 531 | 532 | /* misc. 
*/ 533 | 534 | .revsys-inline { 535 | display: none!important; 536 | } 537 | 538 | div.sphinxsidebar a.flattr-button { 539 | text-decoration: none; 540 | border-bottom: none; 541 | } -------------------------------------------------------------------------------- /docs/_themes/kr/static/small_flask.css: -------------------------------------------------------------------------------- 1 | /* 2 | * small_flask.css_t 3 | * ~~~~~~~~~~~~~~~~~ 4 | * 5 | * :copyright: Copyright 2010 by Armin Ronacher. 6 | * :license: Flask Design License, see LICENSE for details. 7 | */ 8 | 9 | body { 10 | margin: 0; 11 | padding: 20px 30px; 12 | } 13 | 14 | div.documentwrapper { 15 | float: none; 16 | background: white; 17 | } 18 | 19 | div.sphinxsidebar { 20 | display: block; 21 | float: none; 22 | width: 102.5%; 23 | margin: 50px -30px -20px -30px; 24 | padding: 10px 20px; 25 | background: #333; 26 | color: white; 27 | } 28 | 29 | div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p, 30 | div.sphinxsidebar h3 a { 31 | color: white; 32 | } 33 | 34 | div.sphinxsidebar a { 35 | color: #aaa; 36 | } 37 | 38 | div.sphinxsidebar p.logo { 39 | display: none; 40 | } 41 | 42 | div.document { 43 | width: 100%; 44 | margin: 0; 45 | } 46 | 47 | div.related { 48 | display: block; 49 | margin: 0; 50 | padding: 10px 0 20px 0; 51 | } 52 | 53 | div.related ul, 54 | div.related ul li { 55 | margin: 0; 56 | padding: 0; 57 | } 58 | 59 | div.footer { 60 | display: none; 61 | } 62 | 63 | div.bodywrapper { 64 | margin: 0; 65 | } 66 | 67 | div.body { 68 | min-height: 0; 69 | padding: 0; 70 | } 71 | 72 | .rtd_doc_footer { 73 | display: none; 74 | } 75 | 76 | .document { 77 | width: auto; 78 | } 79 | 80 | .footer { 81 | width: auto; 82 | } 83 | 84 | .footer { 85 | width: auto; 86 | } 87 | 88 | .github { 89 | display: none; 90 | } 91 | 92 | img { 93 | border: 0px 0px; 94 | } -------------------------------------------------------------------------------- /docs/_themes/kr/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = basic 3 | stylesheet = flasky.css 4 | pygments_style = flask_theme_support.FlaskyStyle 5 | 6 | [options] 7 | touch_icon = 8 | -------------------------------------------------------------------------------- /docs/_themes/kr_small/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "basic/layout.html" %} 2 | {% block header %} 3 | {{ super() }} 4 | {% if pagename == 'index' %} 5 |
6 | {% endif %} 7 | {% endblock %} 8 | {% block footer %} 9 | {% if pagename == 'index' %} 10 |
11 | {% endif %} 12 | {% endblock %} 13 | {# do not display relbars #} 14 | {% block relbar1 %}{% endblock %} 15 | {% block relbar2 %} 16 | {% if theme_github_fork %} 17 | Fork me on GitHub 19 | {% endif %} 20 | {% endblock %} 21 | {% block sidebar1 %}{% endblock %} 22 | {% block sidebar2 %}{% endblock %} 23 | -------------------------------------------------------------------------------- /docs/_themes/kr_small/static/flasky.css_t: -------------------------------------------------------------------------------- 1 | /* 2 | * flasky.css_t 3 | * ~~~~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- flasky theme based on nature theme. 6 | * 7 | * :copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: 'Georgia', serif; 18 | font-size: 17px; 19 | color: #000; 20 | background: white; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.documentwrapper { 26 | float: left; 27 | width: 100%; 28 | } 29 | 30 | div.bodywrapper { 31 | margin: 40px auto 0 auto; 32 | width: 700px; 33 | } 34 | 35 | hr { 36 | border: 1px solid #B1B4B6; 37 | } 38 | 39 | div.body { 40 | background-color: #ffffff; 41 | color: #3E4349; 42 | padding: 0 30px 30px 30px; 43 | } 44 | 45 | img.floatingflask { 46 | padding: 0 0 10px 10px; 47 | float: right; 48 | } 49 | 50 | div.footer { 51 | text-align: right; 52 | color: #888; 53 | padding: 10px; 54 | font-size: 14px; 55 | width: 650px; 56 | margin: 0 auto 40px auto; 57 | } 58 | 59 | div.footer a { 60 | color: #888; 61 | text-decoration: underline; 62 | } 63 | 64 | div.related { 65 | line-height: 32px; 66 | color: #888; 67 | } 68 | 69 | div.related ul { 70 | padding: 0 0 0 10px; 71 | } 72 | 73 | div.related a { 74 | color: #444; 75 | } 76 | 77 | /* -- body styles ----------------------------------------------------------- */ 78 | 79 | a { 80 | color: #004B6B; 81 | text-decoration: underline; 82 | } 83 | 84 | a:hover { 85 | color: #6D4100; 86 | text-decoration: underline; 87 | } 88 | 89 | div.body { 90 | padding-bottom: 40px; /* saved for footer */ 91 | } 92 | 93 | div.body h1, 94 | div.body h2, 95 | div.body h3, 96 | div.body h4, 97 | div.body h5, 98 | div.body h6 { 99 | font-family: 'Garamond', 'Georgia', serif; 100 | font-weight: normal; 101 | margin: 30px 0px 10px 0px; 102 | padding: 0; 103 | } 104 | 105 | {% if theme_index_logo %} 106 | div.indexwrapper h1 { 107 | text-indent: -999999px; 108 | background: url({{ theme_index_logo }}) no-repeat center center; 109 | height: {{ theme_index_logo_height }}; 110 | } 111 | {% endif %} 112 | 113 | div.body h2 { font-size: 180%; } 114 | div.body h3 { font-size: 150%; } 115 | div.body h4 { font-size: 130%; } 116 | div.body h5 { font-size: 100%; } 117 | div.body h6 { font-size: 100%; } 118 | 119 | a.headerlink { 120 | color: white; 121 | padding: 0 4px; 122 | text-decoration: none; 123 | } 124 | 125 | a.headerlink:hover { 126 | color: #444; 127 | background: #eaeaea; 128 | } 129 | 130 | div.body p, div.body dd, div.body li { 131 | line-height: 1.4em; 132 | } 133 | 134 | div.admonition { 135 | background: #fafafa; 136 | margin: 20px -30px; 137 | padding: 10px 30px; 138 | border-top: 1px solid #ccc; 139 | border-bottom: 1px solid #ccc; 140 | } 141 | 142 | div.admonition p.admonition-title { 143 | font-family: 'Garamond', 'Georgia', serif; 144 | font-weight: normal; 145 | font-size: 24px; 146 | margin: 0 0 10px 0; 147 | 
padding: 0; 148 | line-height: 1; 149 | } 150 | 151 | div.admonition p.last { 152 | margin-bottom: 0; 153 | } 154 | 155 | div.highlight{ 156 | background-color: white; 157 | } 158 | 159 | dt:target, .highlight { 160 | background: #FAF3E8; 161 | } 162 | 163 | div.note { 164 | background-color: #eee; 165 | border: 1px solid #ccc; 166 | } 167 | 168 | div.seealso { 169 | background-color: #ffc; 170 | border: 1px solid #ff6; 171 | } 172 | 173 | div.topic { 174 | background-color: #eee; 175 | } 176 | 177 | div.warning { 178 | background-color: #ffe4e4; 179 | border: 1px solid #f66; 180 | } 181 | 182 | p.admonition-title { 183 | display: inline; 184 | } 185 | 186 | p.admonition-title:after { 187 | content: ":"; 188 | } 189 | 190 | pre, tt { 191 | font-family: 'Consolas', 'Menlo', 'Deja Vu Sans Mono', 'Bitstream Vera Sans Mono', monospace; 192 | font-size: 0.85em; 193 | } 194 | 195 | img.screenshot { 196 | } 197 | 198 | tt.descname, tt.descclassname { 199 | font-size: 0.95em; 200 | } 201 | 202 | tt.descname { 203 | padding-right: 0.08em; 204 | } 205 | 206 | img.screenshot { 207 | -moz-box-shadow: 2px 2px 4px #eee; 208 | -webkit-box-shadow: 2px 2px 4px #eee; 209 | box-shadow: 2px 2px 4px #eee; 210 | } 211 | 212 | table.docutils { 213 | border: 1px solid #888; 214 | -moz-box-shadow: 2px 2px 4px #eee; 215 | -webkit-box-shadow: 2px 2px 4px #eee; 216 | box-shadow: 2px 2px 4px #eee; 217 | } 218 | 219 | table.docutils td, table.docutils th { 220 | border: 1px solid #888; 221 | padding: 0.25em 0.7em; 222 | } 223 | 224 | table.field-list, table.footnote { 225 | border: none; 226 | -moz-box-shadow: none; 227 | -webkit-box-shadow: none; 228 | box-shadow: none; 229 | } 230 | 231 | table.footnote { 232 | margin: 15px 0; 233 | width: 100%; 234 | border: 1px solid #eee; 235 | } 236 | 237 | table.field-list th { 238 | padding: 0 0.8em 0 0; 239 | } 240 | 241 | table.field-list td { 242 | padding: 0; 243 | } 244 | 245 | table.footnote td { 246 | padding: 0.5em; 247 | } 248 | 249 | dl { 250 | margin: 0; 251 | padding: 0; 252 | } 253 | 254 | dl dd { 255 | margin-left: 30px; 256 | } 257 | 258 | pre { 259 | padding: 0; 260 | margin: 15px -30px; 261 | padding: 8px; 262 | line-height: 1.3em; 263 | padding: 7px 30px; 264 | background: #eee; 265 | border-radius: 2px; 266 | -moz-border-radius: 2px; 267 | -webkit-border-radius: 2px; 268 | } 269 | 270 | dl pre { 271 | margin-left: -60px; 272 | padding-left: 60px; 273 | } 274 | 275 | tt { 276 | background-color: #ecf0f3; 277 | color: #222; 278 | /* padding: 1px 2px; */ 279 | } 280 | 281 | tt.xref, a tt { 282 | background-color: #FBFBFB; 283 | } 284 | 285 | a:hover tt { 286 | background: #EEE; 287 | } 288 | -------------------------------------------------------------------------------- /docs/_themes/kr_small/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = basic 3 | stylesheet = flasky.css 4 | nosidebar = true 5 | pygments_style = flask_theme_support.FlaskyStyle 6 | 7 | [options] 8 | index_logo = '' 9 | index_logo_height = 120px 10 | github_fork = '' 11 | -------------------------------------------------------------------------------- /docs/build_script_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import subprocess 6 | 7 | 8 | def create_docs(command, path): 9 | proc = subprocess.Popen(["textkit", command, "--help"], stdout=subprocess.PIPE) 10 | out = str(proc.communicate()[0]).split('\\n') 11 | # 
print(out) 12 | content = "=" * len(command) + "\n" 13 | content += command + "\n" 14 | content += "=" * len(command) + "\n" 15 | 16 | content += "\n" 17 | content += "Description\n" 18 | content += "=" * len("Description") + "\n" 19 | content += "\n" 20 | content += "::\n" 21 | content += "\n" 22 | for line in out: 23 | content += " " + line.replace("b'", '').replace("'", '') + "\n" 24 | content += "\n" 25 | content += "\n" 26 | content += "Examples\n" 27 | content += "=" * len("Examples") + "\n" 28 | print(content) 29 | 30 | with open(path, 'w') as f: 31 | f.write(content) 32 | 33 | proc = subprocess.Popen(["textkit", "--help"], stdout=subprocess.PIPE) 34 | out = str(proc.communicate()[0]).split('\\n') 35 | 36 | commands = [] 37 | 38 | at_commands = False 39 | for line in out: 40 | if at_commands: 41 | command = line.split()[0] 42 | if(len(command) > 3): 43 | commands.append(command) 44 | else: 45 | if "Commands:" in line: 46 | at_commands = True 47 | 48 | 49 | print(commands) 50 | 51 | 52 | path = os.path.dirname(os.path.realpath(__file__)) 53 | doc_path = os.path.join(path, "scripts") 54 | print(doc_path) 55 | 56 | for command in commands: 57 | command_doc_path = os.path.join(doc_path, command + ".rst") 58 | if not os.path.isfile(command_doc_path): 59 | print('creating: ' + command_doc_path) 60 | create_docs(command, command_doc_path) 61 | else: 62 | print('skipping: ' + command_doc_path) 63 | -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Usage 3 | ===== 4 | 5 | textkit is composed of command-line tools that can be divided into four major categories: Tokenization, Transformation, Filtering, and Packaging. Documentation and examples for each tool are described on the following pages. 6 | 7 | Tokenization 8 | ============ 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | 13 | scripts/text2words 14 | scripts/text2ngrams 15 | scripts/text2punc 16 | scripts/text2sentences 17 | scripts/words2bigrams 18 | scripts/words2ngrams 19 | 20 | Transformation 21 | ============== 22 | 23 | .. toctree:: 24 | :maxdepth: 1 25 | 26 | scripts/tokens2lower 27 | scripts/tokens2upper 28 | scripts/tokens2stem 29 | scripts/tokens2counts 30 | scripts/tokens2pos 31 | scripts/tokens2topbigrams 32 | scripts/transliterate 33 | 34 | 35 | Filter 36 | ====== 37 | 38 | .. toctree:: 39 | :maxdepth: 1 40 | 41 | scripts/filterlengths 42 | scripts/filterpunc 43 | scripts/filterwords 44 | 45 | 46 | Package 47 | ======= 48 | 49 | .. toctree:: 50 | :maxdepth: 1 51 | 52 | scripts/texts2json 53 | scripts/tokens2json 54 | scripts/tokens2text 55 | 56 | Misc 57 | ==== 58 | 59 | .. toctree:: 60 | :maxdepth: 1 61 | 62 | scripts/download 63 | scripts/nonewlines 64 | scripts/showstops 65 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime as dt 3 | import os 4 | import sys 5 | 6 | # If extensions (or modules to document with autodoc) are in another directory, 7 | # add these directories to sys.path here. If the directory is relative to the 8 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
9 | sys.path.insert(0, os.path.abspath('..')) 10 | import textkit 11 | sys.path.append(os.path.abspath("_themes")) 12 | 13 | # -- General configuration ----------------------------------------------------- 14 | 15 | # Add any Sphinx extension module names here, as strings. They can be extensions 16 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 17 | extensions = [ 18 | 'sphinx.ext.autodoc', 19 | 'sphinx.ext.doctest', 20 | 'sphinx.ext.viewcode' 21 | ] 22 | 23 | primary_domain = 'py' 24 | default_role = 'py:obj' 25 | 26 | issues_github_path = 'learntextvis/textkit' 27 | 28 | # Add any paths that contain templates here, relative to this directory. 29 | templates_path = ['_templates'] 30 | 31 | # The suffix of source filenames. 32 | source_suffix = '.rst' 33 | 34 | # The master toctree document. 35 | master_doc = 'index' 36 | 37 | # General information about the project. 38 | project = u'textkit' 39 | copyright = u'{0:%Y} LearnTextVis'.format( 40 | dt.datetime.utcnow() 41 | ) 42 | 43 | # The version info for the project you're documenting, acts as replacement for 44 | # |version| and |release|, also used in various other places throughout the 45 | # built documents. 46 | # 47 | # The short X.Y version. 48 | # version = release = textkit.__version__ 49 | version = release = '0.0.1' 50 | exclude_patterns = ['_build'] 51 | pygments_style = 'flask_theme_support.FlaskyStyle' 52 | html_theme = 'kr' 53 | html_theme_path = ['_themes'] 54 | 55 | html_static_path = ['_static'] 56 | 57 | # Custom sidebar templates, maps document names to template names. 58 | html_sidebars = { 59 | 'index': ['side-primary.html'], 60 | '**': ['side-secondary.html', 'localtoc.html', 61 | 'relations.html'] 62 | } 63 | # Output file base name for HTML help builder. 64 | htmlhelp_basename = 'textkitdoc' 65 | 66 | 67 | # -- Options for LaTeX output -------------------------------------------------- 68 | 69 | # Grouping the document tree into LaTeX files. List of tuples 70 | # (source start file, target name, title, author, documentclass [howto/manual]). 71 | latex_documents = [ 72 | ('index', 'textkit.tex', u'textkit Documentation', 73 | u'LearnTextVis', 'manual'), 74 | ] 75 | 76 | # One entry per manual page. List of tuples 77 | # (source start file, name, description, authors, manual section). 78 | man_pages = [ 79 | ('index', 'textkit', u'textkit Documentation', 80 | [u'LearnTextVis'], 1) 81 | ] 82 | # -- Options for Texinfo output ------------------------------------------------ 83 | 84 | # Grouping the document tree into Texinfo files. List of tuples 85 | # (source start file, target name, title, author, 86 | # dir menu entry, description, category) 87 | texinfo_documents = [ 88 | ('index', 'textkit', u'textkit Documentation', 89 | u'LearnTextVis', 'textkit', 'Command-line text-processing.', 90 | 'Natural Language Processing'), 91 | ] 92 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. _contributing: 2 | 3 | Contributing 4 | ============ 5 | 6 | If you are interested in contributing to textkit we would love your help! 7 | 8 | Here is a bit more about the structure of the codebase and how to contribute. 9 | 10 | Code Structure 11 | -------------- 12 | 13 | Each command is implemented in its own file. 
These command files are organized into 14 | sub-directories: 15 | 16 | * tokenize 17 | * transform 18 | * filter 19 | * package 20 | 21 | The use of these sub-directories is primarily for developer convenience and commands 22 | can be moved around if a better structure is found. 23 | 24 | Commands 25 | -------- 26 | 27 | textkit uses `Click `_. to handle command line arguments 28 | and inputs. Click uses decorators to define these arguments and options in a succinct way. 29 | 30 | textkit strives to use text as an input and text as an output. Raw text can be processed 31 | using commands that start with ``text2`` like ``text2words``. 32 | 33 | Token documents (text files with a token on each line) can be used and produced by 34 | commands that include ``words`` in the name. 35 | 36 | Transformation functions that work on tokens should start with ``tokens2``, 37 | as in ``tokens2counts``. 38 | 39 | Utilities 40 | --------- 41 | 42 | There is a very small set of utility functions that are useful when writing textkit commands. 43 | 44 | These are contained in the ``utils.py`` file. Some that you might find helpful: 45 | 46 | ``read_tokens`` will convert a token document into a list of tokens. Use this to process the 47 | input file if your input is a token document. 48 | 49 | ``output`` is a light wrapper around the output capabilities of Click that prevents 50 | error messages if the command is exited early (like when piping to ``head``). 51 | 52 | ``write_csv`` is handy for when multiple columns of data are being output. 53 | 54 | Writing New Commands 55 | -------------------- 56 | 57 | Want to contribute a new command? Great! 58 | 59 | textkit uses GitHub Pull Requests to incorporate other developers' work. 60 | 61 | Fork the repo and then create a branch for your new command. Create and test it, 62 | then submit a Pull Request. 63 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | textkit: Command line text processing 2 | ====================================== 3 | 4 | *textkit* is a set of command line tools for text processing and analysis. 5 | 6 | You can use it to do basic natural language processing from the command line. 7 | 8 | Features 9 | -------- 10 | 11 | - Simple tools that can be combined to do fun stuff with text as data. 12 | - A Unix-style approach that promotes piping together commands to produce more complex processes. 13 | - Documentation is built in using `--help` on commands. 14 | 15 | 16 | Get it now 17 | ---------- 18 | 19 | textkit can be easily installed using pip: 20 | 21 | :: 22 | 23 | $ pip install -U textkit 24 | 25 | Try it out 26 | ---------- 27 | 28 | :: 29 | 30 | # install necessary data files 31 | $ textkit download 32 | 33 | # show all commands 34 | $ textkit --help 35 | 36 | Check out the :ref:`Quickstart guide ` to learn more about textkit's features. 37 | 38 | 39 | Guide 40 | ===== 41 | 42 | .. toctree:: 43 | :maxdepth: 2 44 | 45 | install 46 | quickstart 47 | cli 48 | contributing 49 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | ..
_install: 2 | 3 | Installation 4 | ============ 5 | 6 | Using pip 7 | +++++++++ 8 | 9 | Most users should be able to install textkit easily using pip: 10 | 11 | ``pip install -U textkit`` 12 | 13 | To ensure you have all the data files needed to run all the commands, you should then run: 14 | 15 | ``textkit download`` 16 | 17 | This will download some files that NLTK (a dependency of textkit) needs for certain commands. 18 | 19 | From Source 20 | +++++++++++ 21 | 22 | Textkit is developed and maintained on github, so building from source is also easy. 23 | 24 | First clone the repo: 25 | 26 | ``git clone git@github.com:learntextvis/textkit.git`` 27 | 28 | Then navigate to the `textkit` directory to install its requirements 29 | 30 | .. doctest:: 31 | 32 | cd textkit 33 | pip install -r requirements.txt 34 | 35 | Finally, install the local version of textkit using the `--editable` flag: 36 | 37 | .. doctest:: 38 | 39 | pip install --editable . 40 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Tutorial: Quickstart 4 | ==================== 5 | 6 | .. module:: textkit.cli 7 | 8 | Let's say we have a very short piece of text stored in ``input.txt``. It looks something like: 9 | 10 | .. doctest:: 11 | 12 | Mrs. Bennet deigned not to make any reply, but, unable to contain 13 | herself, began scolding one of her daughters. 14 | 15 | What are some of the tools in textkit that we can use on this text? 16 | 17 | Convert Text to Tokens 18 | ------------------------ 19 | 20 | Tokenization is the process of turning text into chunks of text. 21 | These chunks can be sentences, words, or even sections of words. 22 | 23 | Textkit converts a text file into a **token document** - where each line has one token per line. 24 | 25 | .. doctest:: 26 | 27 | textkit text2words input.txt 28 | 29 | This command converts our input.txt text file into a token document where each token is a word. 30 | 31 | The output would look something like: 32 | 33 | .. doctest:: 34 | 35 | Mrs. 36 | Bennet 37 | deigned 38 | not 39 | to 40 | make 41 | any 42 | reply 43 | , 44 | but 45 | , 46 | unable 47 | to 48 | contain 49 | herself 50 | , 51 | began 52 | scolding 53 | one 54 | of 55 | her 56 | daughters 57 | . 58 | 59 | 60 | This is typically the first thing we want to do when using textkit, as textkit is all about working with tokens. 61 | 62 | The output by default goes to standard out. You can redirect to a file by using ``>``. 63 | 64 | .. doctest:: 65 | 66 | textkit text2words input.txt > words.txt 67 | 68 | This would put our words into ``words.txt``. 69 | 70 | We can also get **bigrams** (two word tokens). 71 | 72 | .. doctest:: 73 | 74 | textkit text2words input.txt | textkit words2bigrams > bigrams.txt 75 | 76 | Here we first convert the text to word tokens and use that as the input for the bigram tokenization. 77 | 78 | The contents of ``bigrams.txt`` would look like: 79 | 80 | .. doctest:: 81 | 82 | Mrs. Bennet 83 | Bennet deigned 84 | deigned not 85 | not to 86 | to make 87 | make any 88 | any reply 89 | reply , 90 | , but 91 | but , 92 | , unable 93 | unable to 94 | to contain 95 | contain herself 96 | herself , 97 | , began 98 | began scolding 99 | scolding one 100 | one of 101 | of her 102 | her daughters 103 | daughters . 104 | 105 | Note the use of **|** for piping one textkit command into another. 
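For example, tokenization, filtering, and transformation steps can be chained into a single pipeline. The command below is only an illustrative combination of commands that are covered later in this guide, and the output file name ``clean_words.txt`` is an arbitrary choice:

.. doctest::

    textkit text2words input.txt | textkit filterpunc | textkit tokens2lower > clean_words.txt

This reads the raw text, splits it into word tokens, drops punctuation-only tokens, and lowercases whatever remains.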
106 | 107 | With no file passed in, many textkit commands default to standard in. 108 | This can be indicated explicitly by using a dash (``-``) in place of the file name. 109 | 110 | Commands that begin with ``text`` in textkit transform text into tokens of some sort. 111 | 112 | Any command that uses ``words`` expects to work with **token documents** that have one word per line. 113 | 114 | A bigram is just a special case of an ``NGram`` - so let's make some ngrams of size 5: 115 | 116 | .. doctest:: 117 | 118 | textkit text2words input.txt | textkit words2ngrams -n 5 119 | 120 | Which produces: 121 | 122 | .. doctest:: 123 | 124 | Mrs. Bennet deigned not to 125 | Bennet deigned not to make 126 | deigned not to make any 127 | not to make any reply 128 | to make any reply , 129 | make any reply , but 130 | any reply , but , 131 | reply , but , unable 132 | , but , unable to 133 | but , unable to contain 134 | , unable to contain herself 135 | unable to contain herself , 136 | to contain herself , began 137 | contain herself , began scolding 138 | herself , began scolding one 139 | , began scolding one of 140 | began scolding one of her 141 | scolding one of her daughters 142 | one of her daughters . 143 | 144 | Notice the ``-n`` argument, which indicates the number of words to include in each ngram. 145 | 146 | With all textkit commands, the ``--help`` flag shows all possible arguments for a command. 147 | 148 | 149 | .. doctest:: 150 | 151 | textkit words2ngrams --help 152 | 153 | .. doctest:: 154 | 155 | Usage: textkit words2ngrams [OPTIONS] [TOKENS] 156 | 157 | Tokenize words into ngrams. ngrams are n-length word tokens. Punctuation 158 | is considered as a separate token. 159 | 160 | Options: 161 | --sep TEXT Separator between words in bigram output. [default: ] 162 | -n, --length INTEGER Length of the n-gram [default: 2] 163 | --help Show this message and exit. 164 | 165 | Filter Tokens 166 | ------------- 167 | 168 | textkit includes a number of filtering capabilities that can be useful for tweaking your tokens. 169 | 170 | Notice our word and ngram tokens above include commas and periods? Let's remove them using ``filterpunc``. 171 | 172 | .. doctest:: 173 | 174 | textkit text2words input.txt | textkit filterpunc 175 | 176 | If we don't want to pipe these commands together, we can also just execute filters on the ``words.txt`` - the saved word token file. 177 | 178 | .. doctest:: 179 | 180 | textkit filterpunc words.txt 181 | 182 | 183 | In natural language processing, ``stop words`` are words so common that they provide little information about a document, and so are often removed. Textkit's ``filterwords`` will remove stop words from our token output. 184 | 185 | 186 | .. doctest:: 187 | 188 | textkit filterwords words.txt 189 | 190 | We can also just filter words that are less than a certain number of characters long: 191 | 192 | .. doctest:: 193 | 194 | textkit filterlengths -m 5 words.txt 195 | 196 | This would produce: 197 | 198 | .. doctest:: 199 | 200 | Bennet 201 | deigned 202 | reply 203 | unable 204 | contain 205 | herself 206 | began 207 | scolding 208 | daughters 209 | 210 | Transform Tokens 211 | ---------------- 212 | 213 | There are a number of tools in textkit to transform tokens in various ways. 214 | 215 | Ensuring the casing of our tokens is consistent is a common text analysis preprocessing step. 216 | 217 | This is done in textkit using ``tokens2lower`` and ``tokens2upper``. These commands work on tokens as well as raw text. 218 | 219 | ..
doctest:: 220 | 221 | textkit tokens2lower input.txt 222 | 223 | .. doctest:: 224 | 225 | mrs. bennet deigned not to make any reply, but, unable to contain 226 | herself, began scolding one of her daughters. 227 | 228 | 229 | .. doctest:: 230 | 231 | textkit tokens2upper words.txt 232 | 233 | .. doctest:: 234 | 235 | MRS. BENNET DEIGNED NOT TO MAKE ANY REPLY, BUT, UNABLE TO CONTAIN 236 | HERSELF, BEGAN SCOLDING ONE OF HER DAUGHTERS. 237 | 238 | Token Information and Stats 239 | --------------------------- 240 | 241 | textkit is also great for finding out interesting stuff about your text. 242 | 243 | Count unique tokens with ``tokens2counts``, which produces CSV-like output containing each token and its count in the document. 244 | 245 | .. doctest:: 246 | 247 | textkit tokens2counts words.txt 248 | 249 | ``TODO: topbigrams`` 250 | 251 | ``TODO: tokens2pos`` 252 | 253 | Package 254 | ------- 255 | 256 | Once the tokens are set up and transformed the way you want them, 257 | it can be useful to package up a set of documents into a single file for downstream visualization or other uses. 258 | 259 | .. doctest:: 260 | 261 | textkit tokens2json words1.txt words2.txt > out.json 262 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # pip requirements for docs generation 2 | # pip install -r docs/requirements.txt 3 | 4 | Sphinx -------------------------------------------------------------------------------- /docs/rsync_exclude: -------------------------------------------------------------------------------- 1 | .git 2 | -------------------------------------------------------------------------------- /docs/scripts/download.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | download 3 | ======== 4 | 5 | Description 6 | =========== 7 | 8 | Install required libraries. Note this library will install nltk dependencies into your user directory. 9 | 10 | :: 11 | 12 | Usage: textkit download 13 | 14 | 15 | 16 | Examples 17 | ======== 18 | -------------------------------------------------------------------------------- /docs/scripts/filterlengths.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | filterlengths 3 | ============= 4 | 5 | Description 6 | =========== 7 | 8 | Remove tokens that are shorter than the minimum length provided. 9 | 10 | :: 11 | 12 | Usage: textkit filterlengths [OPTIONS] [TOKENS] 13 | 14 | Options: 15 | -m, --minimum INTEGER Minimum length of token to not filter. [default: 3] 16 | --help Show this message and exit. 17 | 18 | 19 | 20 | Examples 21 | ======== 22 | -------------------------------------------------------------------------------- /docs/scripts/filterpunc.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | filterpunc 3 | ========== 4 | 5 | Description 6 | =========== 7 | 8 | Remove tokens that are only punctuation from a list of tokens. 9 | 10 | :: 11 | 12 | Usage: textkit filterpunc [OPTIONS] [TOKENS] 13 | 14 | Options: 15 | --help Show this message and exit.
16 | 17 | 18 | 19 | Examples 20 | ======== 21 | -------------------------------------------------------------------------------- /docs/scripts/filterwords.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | filterwords 3 | =========== 4 | 5 | Description 6 | =========== 7 | 8 | Remove stop words from tokens, returning tokens without stop words. 9 | 10 | :: 11 | 12 | Usage: textkit filterwords [OPTIONS] [TOKENS] 13 | 14 | Options: 15 | -l, --language [english|german|danish|dutch|finnish|french|hungarian|italian|norwegian|portuguese|russian|spanish|swedish|turkish] 16 | --custom FILENAME Optional token file of additional tokens to 17 | remove along with selected stop words. 18 | --help Show this message and exit. 19 | 20 | 21 | 22 | Examples 23 | ======== 24 | -------------------------------------------------------------------------------- /docs/scripts/nonewlines.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | nonewlines 3 | ========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit nonewlines [OPTIONS] [TEXT]... 11 | 12 | Remove newlines from a text file. 13 | 14 | Options: 15 | --help Show this message and exit. 16 | 17 | 18 | 19 | Examples 20 | ======== 21 | -------------------------------------------------------------------------------- /docs/scripts/showstops.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | showstops 3 | ========= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit showstops [OPTIONS] 11 | 12 | Display stop words used by textkit for a given language. 13 | 14 | Options: 15 | -l, --language [english|german|danish|dutch|finnish|french|hungarian|italian|norwegian|portuguese|russian|spanish|swedish|turkish] 16 | --help Show this message and exit. 17 | 18 | 19 | 20 | Examples 21 | ======== 22 | -------------------------------------------------------------------------------- /docs/scripts/text2ngrams.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | text2ngrams 3 | =========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit text2ngrams [OPTIONS] [TEXT]... 11 | 12 | Tokenize plain text into ngrams. ngrams are n-length word tokens. 13 | Punctuation is considered as a separate token. 14 | 15 | Options: 16 | -s, --sep TEXT Separator between words in bigram output. [default: ] 17 | -n, --num INTEGER Length of the n-gram [default: 2] 18 | --help Show this message and exit. 19 | 20 | 21 | 22 | Examples 23 | ======== 24 | -------------------------------------------------------------------------------- /docs/scripts/text2punc.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | text2punc 3 | ========= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit text2punc [OPTIONS] [TEXT]... 11 | 12 | Tokenize text into punctuation tokens. Words and numbers are removed, 13 | leaving only punctuation. 14 | 15 | Options: 16 | --help Show this message and exit. 
17 | 18 | 19 | 20 | Examples 21 | ======== 22 | -------------------------------------------------------------------------------- /docs/scripts/text2sentences.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | text2sentences 3 | ============== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit text2sentences [OPTIONS] [TEXT]... 11 | 12 | Tokenize text into sentence tokens. 13 | 14 | Options: 15 | --help Show this message and exit. 16 | 17 | 18 | 19 | Examples 20 | ======== 21 | -------------------------------------------------------------------------------- /docs/scripts/text2words.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | text2words 3 | ========== 4 | 5 | Description 6 | =========== 7 | 8 | Tokenize text into word tokens. Punctuation is considered as a separate token:: 9 | 10 | Usage: textkit text2words [OPTIONS] [TEXT]... 11 | 12 | Options: 13 | --help Show this message and exit. 14 | 15 | Examples 16 | ======== 17 | 18 | 19 | TODO 20 | -------------------------------------------------------------------------------- /docs/scripts/texts2json.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | texts2json 3 | ========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit texts2json [OPTIONS] [TEXT_DOCS]... 11 | 12 | Convert a set of text documents into a JSON array of document objects. 13 | 14 | Options: 15 | --ids PATH File with one id per text document, each separated by a new 16 | line. Ids file is used to set the id attribute in the output 17 | JSON. 18 | --names PATH File with one name per text document, each separated by a new 19 | line. Names file is used to set the name attribute in the 20 | output JSON. 21 | --field TEXT Attribute name where text will be stored in the document 22 | object. [default: text] 23 | --help Show this message and exit. 24 | 25 | 26 | 27 | Examples 28 | ======== 29 | -------------------------------------------------------------------------------- /docs/scripts/tokens2counts.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | tokens2counts 3 | ============= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2counts [OPTIONS] [TOKENS] 11 | 12 | Count unique tokens in a list of tokens. Tokens are sorted by top counts. 13 | 14 | Options: 15 | -s, --sep TEXT Separator between token and count in output. [default: ,] 16 | --limit INTEGER Only output the top N most frequent tokens 17 | --help Show this message and exit. 18 | 19 | 20 | 21 | Examples 22 | ======== 23 | -------------------------------------------------------------------------------- /docs/scripts/tokens2json.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | tokens2json 3 | =========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2json [OPTIONS] [TOKEN_DOCS]... 11 | 12 | Convert a set of token documents into a JSON array of document objects. 13 | 14 | Options: 15 | --ids PATH File with one id per token document, each separated by 16 | a new line. Ids file is used to set the id attribute 17 | in the output JSON. 18 | --names PATH File with one name per token document, each separated 19 | by a new line. Names file is used to set the name 20 | attribute in the output JSON. 
21 | --field TEXT Attribute name where tokens will be stored in the 22 | document object. [default: tokens] 23 | --split / --no-split If enabled, textkit will attempt to split input 24 | columns when packaging. This is useful when packaging 25 | multiple column output like counts. [default: False] 26 | -s, --sep TEXT Separator character between columns. Only used if 27 | split-columns flag is used. [default: ,] 28 | --help Show this message and exit. 29 | 30 | 31 | 32 | Examples 33 | ======== 34 | -------------------------------------------------------------------------------- /docs/scripts/tokens2lower.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | tokens2lower 3 | ============ 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2lower [OPTIONS] [TOKENS] 11 | 12 | Transform all tokens to lowercase. 13 | 14 | Options: 15 | --help Show this message and exit. 16 | 17 | 18 | 19 | Examples 20 | ======== 21 | -------------------------------------------------------------------------------- /docs/scripts/tokens2pos.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | tokens2pos 3 | ========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2pos [OPTIONS] [TOKENS] 11 | 12 | Tokenize words into their parts of speech. Output contains the word token 13 | followed by its part-of-speech tag, separated by the character specified 14 | by --sep. 15 | 16 | Options: 17 | -s, --sep TEXT Separator between words in the output. [default: ,] 18 | --help Show this message and exit. 19 | 20 | 21 | 22 | Examples 23 | ======== 24 | -------------------------------------------------------------------------------- /docs/scripts/tokens2stem.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | tokens2stem 3 | =========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2stem [OPTIONS] [TOKENS] 11 | 12 | Stem a list of tokens to get their root. 13 | 14 | Options: 15 | -a, --algorithm [porter|lancaster|snowball|wordnet] 16 | Specify which stemming algorithm to use. 17 | [default: porter] 18 | --help Show this message and exit. 19 | 20 | 21 | 22 | Examples 23 | ======== 24 | -------------------------------------------------------------------------------- /docs/scripts/tokens2text.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | tokens2text 3 | =========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2text [OPTIONS] [TOKENS] 11 | 12 | Combine tokens in a token document into a single text file. 13 | 14 | Options: 15 | -s, --sep TEXT Separator between token and count in output. [default: ] 16 | --help Show this message and exit. 17 | 18 | 19 | 20 | Examples 21 | ======== 22 | -------------------------------------------------------------------------------- /docs/scripts/tokens2topbigrams.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | tokens2topbigrams 3 | ================= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2topbigrams [OPTIONS] [TOKENS] 11 | 12 | Find top most interesting bi-grams in a token document. Uses the --measure 13 | argument to determine what measure to use to define interesting. 14 | 15 | Options: 16 | -s, --sep TEXT Separator between tokens and scores in 17 | output.
[default: ,] 18 | -m, --measure [likelihood|chi_sq|pmi|student_t|freq] 19 | Specify which measure to use to define 20 | interesing-ness. [default: likelihood] 21 | --freq INTEGER Minimum frequency of bi-grams to filter out. 22 | [default: 2] 23 | --scores / --no-scores Include or exclude scores in output. 24 | [default: True] 25 | --help Show this message and exit. 26 | 27 | 28 | 29 | Examples 30 | ======== 31 | -------------------------------------------------------------------------------- /docs/scripts/tokens2upper.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | tokens2upper 3 | ============ 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2upper [OPTIONS] [TOKENS] 11 | 12 | Transform all tokens to uppercase. 13 | 14 | Options: 15 | --help Show this message and exit. 16 | 17 | 18 | 19 | Examples 20 | ======== 21 | -------------------------------------------------------------------------------- /docs/scripts/transliterate.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | transliterate 3 | ============= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit transliterate [OPTIONS] [TEXT]... 11 | 12 | Transform an international text file to plain ascii 13 | 14 | Options: 15 | --help Show this message and exit. 16 | 17 | 18 | 19 | Examples 20 | ======== 21 | 22 | > echo "Hello! À bientôt… L'été à Pètechïn; 日本語, Nihongo Klüft skräms inför på fédéral électoral große Küche Mærsk" > file_full_of_international_text.md 23 | > textkit transliterate file_full_of_international_text.md 24 | Hello! A bientot... L'ete a Petechin; Ri Ben Yu , Nihongo Kluft skrams infor pa federal electoral grosse Kuche Maersk -------------------------------------------------------------------------------- /docs/scripts/words2bigrams.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | words2bigrams 3 | ============= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit words2bigrams [OPTIONS] [TOKENS] 11 | 12 | Tokenize words into bigrams. Bigrams are two word tokens. Punctuation is 13 | considered as a separate token. 14 | 15 | Options: 16 | -s, --sep TEXT Separator between words in bigram output. [default: ] 17 | --help Show this message and exit. 18 | 19 | 20 | 21 | Examples 22 | ======== 23 | -------------------------------------------------------------------------------- /docs/scripts/words2ngrams.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | words2ngrams 3 | ============ 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit words2ngrams [OPTIONS] [TOKENS] 11 | 12 | Convert word tokens into ngrams. ngrams are n-length word tokens. 13 | Punctuation is considered as a separate token. 14 | 15 | Options: 16 | -s, --sep TEXT Separator between words in bigram output. [default: ] 17 | -n, --num INTEGER Length of the n-gram [default: 2] 18 | --help Show this message and exit. 19 | 20 | 21 | 22 | Examples 23 | ======== 24 | -------------------------------------------------------------------------------- /docs/sphinx_deployment.mk: -------------------------------------------------------------------------------- 1 | # Copyright (c) Teracy, Inc. and individual contributors. 2 | # All rights reserved.
3 | 4 | # Redistribution and use in source and binary forms, with or without modification, 5 | # are permitted provided that the following conditions are met: 6 | 7 | # 1. Redistributions of source code must retain the above copyright notice, 8 | # this list of conditions and the following disclaimer. 9 | 10 | # 2. Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | 14 | # 3. Neither the name of Teracy, Inc. nor the names of its contributors may be used 15 | # to endorse or promote products derived from this software without 16 | # specific prior written permission. 17 | 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | # Deployment configurations from sphinx_deployment project 30 | 31 | # default deployment when $ make deploy 32 | # deploy_gh_pages : to $ make deploy_gh_pages 33 | # deploy_rsync : to $ make deploy_rsync 34 | # deploy_heroku : to $ make deploy_heroku 35 | # deploy_gh_pages deploy_rsync deploy_heroku : to $ make deploy_gh_pages then $ make deploy_rsync 36 | # and then $ make deploy_heroku 37 | # default value: deploy_gh_pages 38 | ifndef DEPLOY_DEFAULT 39 | DEPLOY_DEFAULT = deploy_gh_pages 40 | endif 41 | 42 | # The deployment directory to be deployed 43 | ifndef DEPLOY_DIR 44 | DEPLOY_DIR = _deploy 45 | endif 46 | 47 | # The heroku deployment directory to be deployed 48 | # we must create this separated dir to avoid any conflict with _deploy (rsync and gh_pages) 49 | ifndef DEPLOY_DIR_HEROKU 50 | DEPLOY_DIR_HEROKU = _deploy_heroku 51 | endif 52 | 53 | # Copy contents from $(BUILDDIR) to $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) directory 54 | ifndef DEPLOY_HTML_DIR 55 | DEPLOY_HTML_DIR = . 
56 | endif 57 | 58 | 59 | ## -- Rsync Deploy config -- ## 60 | # Be sure your public key is listed in your server's ~/.ssh/authorized_keys file 61 | ifndef SSH_USER 62 | SSH_USER = user@domain.com 63 | endif 64 | 65 | ifndef SSH_PORT 66 | SSH_PORT = 22 67 | endif 68 | 69 | ifndef DOCUMENT_ROOT 70 | DOCUMENT_ROOT = ~/website.com/ 71 | endif 72 | 73 | #If you choose to delete on sync, rsync will create a 1:1 match 74 | ifndef RSYNC_DELETE 75 | RSYNC_DELETE = false 76 | endif 77 | 78 | # Any extra arguments to pass to rsync 79 | ifndef RSYNC_ARGS 80 | RSYNC_ARGS = 81 | endif 82 | 83 | ## -- Github Pages Deploy config -- ## 84 | 85 | # Configure the right deployment branch 86 | ifndef DEPLOY_BRANCH_GITHUB 87 | DEPLOY_BRANCH_GITHUB = gh-pages 88 | endif 89 | 90 | #if REPO_URL_GITHUB was NOT defined by travis-ci 91 | ifndef REPO_URL_GITHUB 92 | # Configure your right github project repo 93 | REPO_URL_GITHUB = git@github.com:learntextvis/textkit.git 94 | endif 95 | 96 | ## -- Heroku Deployment Config -- ## 97 | 98 | ifndef REPO_URL_HEROKU 99 | # Configure your right heroku repo 100 | # REPO_URL_HEROKU = git@heroku.com:spxd.git 101 | endif 102 | 103 | 104 | ## end deployment configuration, don't edit anything below this line ## 105 | ####################################################################### 106 | 107 | ifeq ($(RSYNC_DELETE), true) 108 | RSYNC_DELETE_OPT = --delete 109 | endif 110 | 111 | init_gh_pages: 112 | @rm -rf $(DEPLOY_DIR) 113 | @mkdir -p $(DEPLOY_DIR) 114 | @cd $(DEPLOY_DIR); git init;\ 115 | echo 'sphinx docs comming soon...' > index.html;\ 116 | touch .nojekyll;\ 117 | git add .;\ 118 | git commit -m "sphinx docs init";\ 119 | git branch -m $(DEPLOY_BRANCH_GITHUB);\ 120 | echo $(DEPLOY_BRANCH_GITHUB) > index.html;\ 121 | git remote add origin $(REPO_URL_GITHUB); 122 | @cd $(DEPLOY_DIR);\ 123 | if ! git ls-remote origin $(DEPLOY_BRANCH_GITHUB) | grep $(DEPLOY_BRANCH_GITHUB) ; then \ 124 | echo "Preparing Github deployment branch: $(DEPLOY_BRANCH_GITHUB) for the first time only...";\ 125 | git push -u origin $(DEPLOY_BRANCH_GITHUB);\ 126 | fi 127 | 128 | setup_gh_pages: init_gh_pages 129 | @echo "Setting up gh-pages deployment..." 130 | @cd $(DEPLOY_DIR);\ 131 | git fetch origin;\ 132 | git reset --hard origin/$(DEPLOY_BRANCH_GITHUB);\ 133 | git branch --set-upstream $(DEPLOY_BRANCH_GITHUB) origin/$(DEPLOY_BRANCH_GITHUB) 134 | @echo "Now you can deploy to Github Pages with 'make generate' and then 'make deploy_gh_pages'" 135 | 136 | init_heroku: 137 | @rm -rf $(DEPLOY_DIR_HEROKU) 138 | @mkdir -p $(DEPLOY_DIR_HEROKU) 139 | @cd $(DEPLOY_DIR_HEROKU); git init;\ 140 | cp -r ../.deploy_heroku/* .;\ 141 | echo 'sphinx docs comming soon...' > public/index.html;\ 142 | git add .; git commit -m "sphinx docs init";\ 143 | git remote add origin $(REPO_URL_HEROKU); 144 | @cd $(DEPLOY_DIR_HEROKU);\ 145 | if ! git ls-remote origin master | grep master ; then\ 146 | echo "Preparing Heroku deployment for the first time only...";\ 147 | git push -u origin master;\ 148 | fi 149 | 150 | setup_heroku: init_heroku 151 | @echo "setting up heroku deployment..." 152 | @cd $(DEPLOY_DIR_HEROKU);\ 153 | git fetch origin;\ 154 | git reset --hard origin/master;\ 155 | git branch --set-upstream master origin/master 156 | @echo "Now you can deploy to Heroku with 'make generate' and then 'make deploy_heroku'" 157 | 158 | generate: html 159 | 160 | prepare_rsync_deployment: 161 | @echo "Preparing rsync deployment..." 
162 | @mkdir -p $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) 163 | @echo "Copying files from '$(BUILDDIR)/html/.' to '$(DEPLOY_DIR)/$(DEPLOY_HTML_DIR)'" 164 | @cp -r $(BUILDDIR)/html/. $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) 165 | 166 | deploy_rsync: prepare_rsync_deployment 167 | @echo "Deploying on rsync now..." 168 | rsync -avze 'ssh -p $(SSH_PORT)' --exclude-from $(realpath ./rsync_exclude) $(RSYNC_ARGS) $(RSYNC_DELETE_OPT) ${DEPLOY_DIR}/ $(SSH_USER):$(DOCUMENT_ROOT) 169 | 170 | prepare_gh_pages_deployment: 171 | @echo "Preparing gh_pages deployment..." 172 | @echo "Pulling any updates from Github Pages..." 173 | @cd $(DEPLOY_DIR); git pull; 174 | @mkdir -p $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) 175 | @echo "Copying files from '$(BUILDDIR)/html/.' to '$(DEPLOY_DIR)/$(DEPLOY_HTML_DIR)'" 176 | @cp -r $(BUILDDIR)/html/. $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) 177 | 178 | deploy_gh_pages: prepare_gh_pages_deployment 179 | @echo "Deploying on github pages now..." 180 | @cd $(DEPLOY_DIR); git add -A; git commit -m "docs updated at `date -u`";\ 181 | git push origin $(DEPLOY_BRANCH) --quiet 182 | @echo "Github Pages deploy was completed at `date -u`" 183 | 184 | prepare_heroku_deployment: 185 | @echo "Preparing heroku deployment..." 186 | @echo "Pulling any updates from Heroku..." 187 | @cd $(DEPLOY_DIR_HEROKU); git pull; 188 | @mkdir -p $(DEPLOY_DIR_HEROKU)/public/$(DEPLOY_HTML_DIR) 189 | @echo "Copying files from .deploy_heroku to $(DEPLOY_DIR_HEROKU)" 190 | @cp -r .deploy_heroku/. $(DEPLOY_DIR_HEROKU) 191 | @echo "Copying files from '$(BUILDDIR)/html/.' to '$(DEPLOY_DIR_HEROKU)/public/$(DEPLOY_HTML_DIR)'" 192 | @cp -r $(BUILDDIR)/html/. $(DEPLOY_DIR_HEROKU)/public/$(DEPLOY_HTML_DIR) 193 | 194 | 195 | deploy_heroku: prepare_heroku_deployment 196 | @echo "Deploying on heroku now..." 197 | @cd $(DEPLOY_DIR_HEROKU); git add -A; git commit -m "docs updated at `date -u`";\ 198 | git push origin master --quiet 199 | @echo "Heroku deployment was completed at `date -u`" 200 | 201 | 202 | deploy: $(DEPLOY_DEFAULT) 203 | 204 | gen_deploy: generate deploy 205 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: textkit 2 | dependencies: 3 | - click=4.1=py35_0 4 | - coverage=4.0=py35_0 5 | - nltk=3.1=py35_0 6 | - numpy=1.10.2=py35_0 7 | - openssl=1.0.2d=0 8 | - pip=7.1.2=py35_0 9 | - python=3.5.1=0 10 | - pyyaml=3.11=py35_1 11 | - readline=6.2=2 12 | - setuptools=19.1.1=py35_0 13 | - six=1.10.0=py35_0 14 | - sqlite=3.8.4.1=1 15 | - tk=8.5.18=0 16 | - wheel=0.26.0=py35_1 17 | - xz=5.0.5=0 18 | - yaml=0.1.6=0 19 | - zlib=1.2.8=0 20 | 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click>=6.2 2 | coverage>=4.0.3 3 | nltk>=3.1 4 | numpy>=1.10.2 5 | PyYAML>=3.11 6 | six>=1.10.0 7 | wheel>=0.26.0 8 | unittest2>=1.1.0 9 | pylint>=1.5.5 10 | pytest>=2.9.1 11 | unidecode>=0.4.20 12 | chardet>=2.3.0 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # This flag says that the code is written to work on both Python 2 and Python 3 | # 3. If at all possible, it is good practice to do this. If you cannot, you 4 | # will need to generate wheels for each Python version that you support. 
5 | universal=1 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import sys 6 | from setuptools import setup, find_packages 7 | 8 | 9 | def find_version(fname): 10 | """Attempts to find the version number in the file names fname. 11 | Raises RuntimeError if not found. 12 | """ 13 | version = '' 14 | with open(fname, 'r') as fp: 15 | reg = re.compile(r'__version__ = [\'"]([^\'"]*)[\'"]') 16 | for line in fp: 17 | m = reg.match(line) 18 | if m: 19 | version = m.group(1) 20 | break 21 | if not version: 22 | raise RuntimeError('Cannot find version information') 23 | return version 24 | 25 | __version__ = find_version("textkit/__init__.py") 26 | 27 | 28 | setup( 29 | name='textkit', 30 | version=__version__, 31 | description='Simple text analysis from the command line', 32 | long_description=open("README.rst").read(), 33 | packages=find_packages(exclude=['test*', 'docs']), 34 | license='MIT', 35 | author='Learn Text Vis Team', 36 | author_email='landham@gmail.com', 37 | py_modules=['textkit'], 38 | url='https://github.com/learntextvis/textkit', 39 | keywords=['text', 'analysis', 'textkit'], 40 | include_package_data=True, 41 | install_requires=[ 42 | 'click>=6.2', 43 | 'nltk>=3.1', 44 | 'unidecode>=0.4.20', 45 | 'chardet>=2.3.0' 46 | ], 47 | entry_points={ 48 | 'console_scripts': [ 49 | 'textkit = textkit.cli:cli' 50 | ] 51 | }, 52 | package_data={ 53 | 'textkit': ['data/stopwords/english.txt', 'data/stopwords/german.txt', 'data/stopwords/danish.txt', 'data/stopwords/dutch.txt', 'data/stopwords/finnish.txt', 'data/stopwords/french.txt', 'data/stopwords/hungarian.txt', 'data/stopwords/italian.txt', 'data/stopwords/norwegian.txt', 'data/stopwords/portuguese.txt', 'data/stopwords/russian.txt', 'data/stopwords/spanish.txt', 'data/stopwords/swedish.txt', 'data/stopwords/turkish.txt'] 54 | }, classifiers=[ 55 | 'Development Status :: 3 - Alpha', 56 | 'Environment :: Console', 57 | 'Intended Audience :: Developers', 58 | 'Intended Audience :: End Users/Desktop', 59 | 'Intended Audience :: Science/Research', 60 | 'License :: OSI Approved :: MIT License', 61 | 'Natural Language :: English', 62 | 'Operating System :: OS Independent', 63 | 'Programming Language :: Python', 64 | 'Programming Language :: Python :: 2.7', 65 | 'Programming Language :: Python :: 3.4', 66 | 'Topic :: Utilities' 67 | ] 68 | ) 69 | -------------------------------------------------------------------------------- /test_data/alice_short.txt: -------------------------------------------------------------------------------- 1 | CHAPTER 2 | I 3 | . 
4 | Down 5 | the 6 | Rabbit-Hole 7 | Alice 8 | was 9 | beginning 10 | to 11 | get 12 | very 13 | tired 14 | of 15 | sitting 16 | by 17 | her 18 | sister 19 | on 20 | the 21 | -------------------------------------------------------------------------------- /test_data/international.transliterate.txt: -------------------------------------------------------------------------------- 1 | GR ILIADOSA 2 | FR L'HAY-LES-ROSES 3 | KO yi bonmun naeyongeun yeongmunsaiteureul camgoha 4 | JA Mu Ci Ba Cui 5 | RU servery raspolozhennye 6 | RO Pot sa mananc sticla si ea nu ma raneste 7 | VI Shi E Shui Jing 8 | CN Wo Neng Tun Xia Bo Li Er Bu Shang Shen Ti 9 | NV yishaago 10 | 11 | -------------------------------------------------------------------------------- /test_data/international.txt: -------------------------------------------------------------------------------- 1 | GR ΙΛΙΑΔΟΣΑ 2 | FR L’HAŸ-LES-ROSES 3 | KO 의 본문 내용은 영문사이트를 참고하 4 | JA 目次抜粋 5 | RU серверы расположенные 6 | RO Pot să mănânc sticlă și ea nu mă rănește 7 | VI 世 咹 水 晶 8 | CN 我能吞下玻璃而不伤身体 9 | NV yishą́ągo 10 | -------------------------------------------------------------------------------- /test_data/word_tokens.txt: -------------------------------------------------------------------------------- 1 | This 2 | is 3 | one 4 | sentence 5 | Dude 6 | . 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_coerce.py: -------------------------------------------------------------------------------- 1 | 2 | from textkit.coerce import coerce_types 3 | 4 | 5 | def test_coerce_types(): 6 | content = [ 7 | ["happy", "9"], 8 | ["day", "8"], 9 | ["4", "7"], 10 | ["YOU!", "6"] 11 | ] 12 | 13 | tokens = coerce_types(content) 14 | assert len(tokens) == 4 15 | assert tokens[0][0] == "happy" 16 | assert tokens[0][1] == 9 17 | assert tokens[2][0] == "4" 18 | 19 | 20 | def test_coerce_types_with_mix_floats_ints(): 21 | content = [ 22 | ["happy", "9"], 23 | ["day", "8.7"], 24 | ["4", "7.0"], 25 | ["YOU!", "6"] 26 | ] 27 | 28 | tokens = coerce_types(content) 29 | assert len(tokens) == 4 30 | assert tokens[0][0] == "happy" 31 | assert tokens[0][1] == 9.0 32 | assert tokens[1][1] == 8.7 33 | assert tokens[2][1] == 7 34 | -------------------------------------------------------------------------------- /tests/test_filter.py: -------------------------------------------------------------------------------- 1 | 2 | import click 3 | from click.testing import CliRunner 4 | from textkit.filter.filter_punc import filterpunc 5 | from textkit.filter.filter_words import filterwords 6 | from textkit.filter.filter_lengths import filterlengths 7 | from tests.utils import create_single_output, create_multifile_output, compare_results 8 | 9 | 10 | def test_filterlengths(): 11 | runner = CliRunner() 12 | with runner.isolated_filesystem(): 13 | filename = 'in.txt' 14 | sentence = 'Hello\nWorld\n!\nI\n.\nnot\nwin\n' 15 | 16 | create_single_output(filename, sentence) 17 | 18 | # default length 3 19 | result = runner.invoke(filterlengths, [filename]) 20 | tokens = result.output.split('\n') 21 | expected_tokens = ['Hello', 'World', 'not', 'win'] 22 | assert result.exit_code == 0 23 | compare_results(tokens, expected_tokens) 24 | 25 | # minumum length 4 26 | 
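        # -m/--minimum raises the cutoff from its default of 3, so only tokens
        # with at least four characters ('Hello', 'World') should survive here: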
result = runner.invoke(filterlengths, ['-m', '4', filename]) 27 | tokens = result.output.split('\n') 28 | expected_tokens = ['Hello', 'World'] 29 | assert result.exit_code == 0 30 | compare_results(tokens, expected_tokens) 31 | 32 | 33 | def test_filterpunc(): 34 | runner = CliRunner() 35 | with runner.isolated_filesystem(): 36 | filename = 'in.txt' 37 | sentence = 'Hello\nWorld\n!\nI\n.\nnot' 38 | expected_tokens = ['Hello', 'World', 'I', 'not'] 39 | create_single_output(filename, sentence) 40 | result = runner.invoke(filterpunc, [filename]) 41 | tokens = result.output.split('\n') 42 | assert result.exit_code == 0 43 | compare_results(tokens, expected_tokens) 44 | 45 | 46 | def test_filterwords(): 47 | runner = CliRunner() 48 | with runner.isolated_filesystem(): 49 | 50 | filename = 'in.txt' 51 | sentence = 'Hello\nWorld\n!\nI\nam\nnot\na\ncrook\n.' 52 | expected_tokens = ['Hello', 'World', '!', 'crook', '.'] 53 | create_single_output(filename, sentence) 54 | result = runner.invoke(filterwords, ['--language', 'english', filename]) 55 | tokens = result.output.split('\n') 56 | assert result.exit_code == 0 57 | compare_results(tokens, expected_tokens) 58 | 59 | 60 | def test_filterwords_custom(): 61 | runner = CliRunner() 62 | with runner.isolated_filesystem(): 63 | 64 | filename = 'in.txt' 65 | sentence = 'Hello\nWorld\n!\nI\nam\nnot\na\ncrook\n.' 66 | expected_tokens = ['World', '!', 'crook', '.'] 67 | custom_stopword_filename = 'custom.txt' 68 | custom_stopwords = 'hello\n' 69 | 70 | create_single_output(filename, sentence) 71 | create_single_output(custom_stopword_filename, custom_stopwords) 72 | 73 | result = runner.invoke(filterwords, 74 | ['--custom', 75 | custom_stopword_filename, 76 | filename]) 77 | 78 | tokens = result.output.split('\n') 79 | assert result.exit_code == 0 80 | compare_results(tokens, expected_tokens) 81 | -------------------------------------------------------------------------------- /tests/test_tokenize.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | from textkit.tokenize.words import text2words 3 | from textkit.tokenize.bigrams import words2bigrams 4 | from textkit.tokenize.punc import text2punc 5 | from textkit.tokenize.sentences import text2sentences 6 | from textkit.tokenize.ngrams import words2ngrams, text2ngrams 7 | from tests.utils import create_single_output, create_multifile_output, compare_results 8 | 9 | 10 | def test_text2words(): 11 | runner = CliRunner() 12 | with runner.isolated_filesystem(): 13 | filename = 'in.txt' 14 | sentence = 'Hello World!\nI.\nnot sure where to go' 15 | expected_tokens = ['Hello', 'World', '!', 'I.', 16 | 'not', 'sure', 'where', 'to', 'go'] 17 | create_single_output(filename, sentence) 18 | result = runner.invoke(text2words, [filename]) 19 | tokens = result.output.split('\n') 20 | assert result.exit_code == 0 21 | compare_results(tokens, expected_tokens) 22 | 23 | 24 | def test_text2words_multifile(): 25 | runner = CliRunner() 26 | with runner.isolated_filesystem(): 27 | 28 | filenames = ['in.txt', 'in2.txt'] 29 | sentences = ('Hello World!\nI.\nnot sure where to go', 30 | 'Goodbye World!\n I.\n know everything about you') 31 | expected_tokens = ['Hello', 'World', '!', 'I.', 32 | 'not', 'sure', 'where', 'to', 'go', 33 | 'Goodbye', 'World', '!', 'I.', 'know', 34 | 'everything', 'about', 'you'] 35 | create_multifile_output(filenames, sentences) 36 | result = runner.invoke(text2words, filenames) 37 | tokens = result.output.split('\n') 38 | assert 
result.exit_code == 0 39 | compare_results(tokens, expected_tokens) 40 | 41 | 42 | def test_words2bigrams(): 43 | runner = CliRunner() 44 | with runner.isolated_filesystem(): 45 | filename = 'in.txt' 46 | sentence = 'Hello\nWorld\n!\nI\nlove\ngo\n.' 47 | expected_tokens = ['Hello World', 'World !', 48 | '! I', 'I love', 'love go', 'go .'] 49 | create_single_output(filename, sentence) 50 | result = runner.invoke(words2bigrams, [filename]) 51 | tokens = result.output.split('\n') 52 | assert result.exit_code == 0 53 | compare_results(tokens, expected_tokens) 54 | 55 | 56 | def test_sentences(): 57 | runner = CliRunner() 58 | with runner.isolated_filesystem(): 59 | filename = 'in.txt' 60 | sentence = 'Hello World! I love go.' 61 | expected_tokens = ['Hello World!', 'I love go.'] 62 | create_single_output(filename, sentence) 63 | result = runner.invoke(text2sentences, [filename]) 64 | tokens = result.output.split('\n') 65 | assert result.exit_code == 0 66 | compare_results(tokens, expected_tokens) 67 | 68 | 69 | def test_punc(): 70 | runner = CliRunner() 71 | with runner.isolated_filesystem(): 72 | filename = 'in.txt' 73 | sentence = 'Hello\nWorld\n!\nI\nlove,\ngo\n.' 74 | expected_tokens = ['!', ',', '.'] 75 | create_single_output(filename, sentence) 76 | result = runner.invoke(text2punc, [filename]) 77 | tokens = result.output.split('\n') 78 | assert result.exit_code == 0 79 | compare_results(tokens, expected_tokens) 80 | 81 | 82 | def test_punc_multifile(): 83 | runner = CliRunner() 84 | with runner.isolated_filesystem(): 85 | filenames = ['in.txt', 'in2.txt'] 86 | sentences = ['Hello\nWorld\n!\nI\nlove,\ngo\n.', 87 | 'Goodbye World!\n I...\n know everything\'s about you?'] 88 | expected_tokens = ['!', ',', '.', '!', '...', "'", '?'] 89 | create_multifile_output(filenames, sentences) 90 | result = runner.invoke(text2punc, filenames) 91 | tokens = result.output.split('\n') 92 | assert result.exit_code == 0 93 | compare_results(tokens, expected_tokens) 94 | 95 | 96 | def test_words2ngrams(): 97 | runner = CliRunner() 98 | with runner.isolated_filesystem(): 99 | filename = 'in.txt' 100 | sentence = 'Hello\nWorld\n!\nI\nlove\ngo\n.' 101 | expected_tokens = ['Hello World !', 'World ! I', '! I love', 'I love go'] 102 | create_single_output(filename, sentence) 103 | result = runner.invoke(words2ngrams, ['-n', 3, filename]) 104 | tokens = result.output.split('\n') 105 | assert result.exit_code == 0 106 | compare_results(tokens, expected_tokens) 107 | 108 | 109 | def test_text2ngrams(): 110 | runner = CliRunner() 111 | with runner.isolated_filesystem(): 112 | filename = 'in.txt' 113 | sentence = 'Hello World! I love go.' 114 | expected_tokens = ['Hello World !', 'World ! I', '! 
I love', 'I love go'] 115 | create_single_output(filename, sentence) 116 | result = runner.invoke(text2ngrams, ['-n', 3, filename]) 117 | tokens = result.output.split('\n') 118 | assert result.exit_code == 0 119 | compare_results(tokens, expected_tokens) 120 | -------------------------------------------------------------------------------- /tests/test_transform.py: -------------------------------------------------------------------------------- 1 | 2 | from click.testing import CliRunner 3 | from textkit.transform.tokens_to_lower import tokens2lower 4 | from textkit.transform.newlines import nonewlines 5 | from textkit.transform.tokens_to_upper import tokens2upper 6 | from textkit.transform.tokens_to_counts import tokens2counts 7 | from textkit.transform.tokens_to_pos import tokens2pos 8 | from textkit.transform.tokens_to_top_bigrams import tokens2topbigrams 9 | from tests.utils import create_single_output, create_multifile_output, compare_results 10 | 11 | 12 | def test_lowercase(): 13 | runner = CliRunner() 14 | with runner.isolated_filesystem(): 15 | filename = 'in.txt' 16 | sentence = 'Hello\nWorld\n!\nI\n.\nnoooo\n' 17 | expected_tokens = ['hello', 'world', '!', 'i', '.', 'noooo'] 18 | create_single_output(filename, sentence) 19 | 20 | result = runner.invoke(tokens2lower, [filename]) 21 | tokens = result.output.split('\n') 22 | assert result.exit_code == 0 23 | compare_results(tokens, expected_tokens) 24 | 25 | 26 | def test_uppercase(): 27 | runner = CliRunner() 28 | with runner.isolated_filesystem(): 29 | filename = 'in.txt' 30 | sentence = 'Hello\nWorld\n!\nI\n.\nnoooo\n' 31 | expected_tokens = ['HELLO', 'WORLD', '!', 'I', '.', 'NOOOO'] 32 | create_single_output(filename, sentence) 33 | 34 | result = runner.invoke(tokens2upper, [filename]) 35 | tokens = result.output.split('\n') 36 | assert result.exit_code == 0 37 | compare_results(tokens, expected_tokens) 38 | 39 | 40 | def test_nonewlines(): 41 | runner = CliRunner() 42 | with runner.isolated_filesystem(): 43 | filename = 'in.txt' 44 | sentence = 'Hello\nWorld\n!\nI\nam\nin.\n' 45 | expected_tokens = ['Hello World ! I am in.'] 46 | 47 | create_single_output(filename, sentence) 48 | result = runner.invoke(nonewlines, [filename]) 49 | tokens = result.output.split('\n') 50 | assert result.exit_code == 0 51 | assert len(result.output.split('\n')) == 2 52 | compare_results(tokens, expected_tokens) 53 | 54 | 55 | def test_nonewlines_multifile(): 56 | runner = CliRunner() 57 | with runner.isolated_filesystem(): 58 | filenames = ['in.txt', 'in2.txt'] 59 | sentences = ['Hello\nWorld\n!\nI\nam\nin.', 60 | 'What are you\na creature\nof mystery'] 61 | expected_tokens = ['Hello World ! I am in. 
What are you a creature of mystery'] 62 | create_multifile_output(filenames, sentences) 63 | result = runner.invoke(nonewlines, filenames) 64 | tokens = result.output.split('\n') 65 | assert result.exit_code == 0 66 | assert len(result.output.split('\n')) == 2 67 | compare_results(tokens, expected_tokens) 68 | 69 | 70 | def test_count_tokens(): 71 | runner = CliRunner() 72 | with runner.isolated_filesystem(): 73 | filename = 'in.txt' 74 | sentence = 'Hello,\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou' 75 | expected_tokens = ['love,2', 'world,2', 'and,1', 'I,1', 'you,1', 76 | 'this,1', '\"Hello,\",1', '!,1', ''] 77 | expected_tokens.sort() 78 | create_single_output(filename, sentence) 79 | result = runner.invoke(tokens2counts, [filename]) 80 | tokens = result.output.split('\n') 81 | tokens.sort() 82 | assert result.exit_code == 0 83 | compare_results(tokens, expected_tokens) 84 | 85 | 86 | def test_pos_tokens(): 87 | runner = CliRunner() 88 | with runner.isolated_filesystem(): 89 | filename = 'in.txt' 90 | sentence = 'Hello\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou' 91 | expected_tokens = ['Hello,NNP', 'world,NN', '!,.', 92 | 'I,PRP', 'love,VBP', 'this,DT', 93 | 'world,NN', 'and,CC', 'love,VB', 'you,PRP'] 94 | create_single_output(filename, sentence) 95 | result = runner.invoke(tokens2pos, [filename]) 96 | tokens = result.output.split('\n') 97 | assert result.exit_code == 0 98 | compare_results(tokens, expected_tokens) 99 | 100 | 101 | def test_top_bigrams(): 102 | runner = CliRunner() 103 | with runner.isolated_filesystem(): 104 | filename = 'in.txt' 105 | sentence = 'I\nworld\n!\nI\nlove\nyou\nthis\nworld\nand\nlove\nyou' 106 | create_single_output(filename, sentence) 107 | 108 | result = runner.invoke(tokens2topbigrams, [filename]) 109 | assert result.exit_code == 0 110 | 111 | tokens = result.output.split('\n') 112 | assert tokens[0].split(',')[0:2] == ['love', 'you'] 113 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | 2 | from textkit.utils import read_tokens 3 | 4 | 5 | def test_read_tokens(): 6 | f = open('test_data/word_tokens.txt', 'r') 7 | 8 | tokens = read_tokens(f) 9 | assert len(tokens) == 6 10 | 11 | f.close() 12 | -------------------------------------------------------------------------------- /tests/transliterate.py: -------------------------------------------------------------------------------- 1 | import click 2 | from click.testing import CliRunner 3 | from textkit.filter.transliterate import transliterate 4 | 5 | 6 | def test_transliterate(): 7 | runner = CliRunner() 8 | filename = 'test_data/international.txt' 9 | ## ??? 
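    # One possible completion (a sketch, not a definitive test): it assumes
    # the `transliterate` command takes a text file argument and writes the
    # transliterated text to stdout, and it compares the result against
    # test_data/international.transliterate.txt, which ships with the repo.
    # Note that cli.py imports the command from textkit.transform.transliterate,
    # so the textkit.filter.transliterate import above may need updating.
    expected_filename = 'test_data/international.transliterate.txt'
    result = runner.invoke(transliterate, [filename])
    assert result.exit_code == 0
    with open(expected_filename) as expected_file:
        expected = expected_file.read()
    assert result.output.strip() == expected.strip()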
10 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | def create_single_output(filename, content): 2 | """ 3 | Outputs test content into a filename 4 | """ 5 | with open(filename, 'w') as f: 6 | f.write(content) 7 | 8 | 9 | def create_multifile_output(filenames, contents): 10 | """ 11 | Outputs several text contents into several files 12 | """ 13 | for idx, filename in enumerate(filenames): 14 | with open(filename, 'w') as f: 15 | f.write(contents[idx]) 16 | 17 | 18 | def compare_results(tokens, expected_tokens): 19 | for tdx, expected_token in enumerate(expected_tokens): 20 | assert tokens[tdx] == expected_tokens[tdx] 21 | -------------------------------------------------------------------------------- /textkit/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __version__ = '0.2.3' 3 | __license__ = 'MIT' 4 | -------------------------------------------------------------------------------- /textkit/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import click 4 | from textkit.tokenize.words import text2words 5 | from textkit.tokenize.sentences import text2sentences 6 | from textkit.tokenize.bigrams import words2bigrams 7 | from textkit.tokenize.ngrams import words2ngrams, text2ngrams 8 | from textkit.tokenize.punc import text2punc 9 | from textkit.filter.filter_punc import filterpunc 10 | from textkit.filter.filter_words import filterwords 11 | from textkit.filter.filter_lengths import filterlengths 12 | from textkit.filter.filter_words import showstops 13 | from textkit.transform.tokens_to_lower import tokens2lower 14 | from textkit.transform.tokens_to_upper import tokens2upper 15 | from textkit.transform.newlines import nonewlines 16 | from textkit.transform.tokens_to_stem import tokens2stem 17 | from textkit.transform.tokens_to_counts import tokens2counts 18 | from textkit.transform.tokens_to_top_bigrams import tokens2topbigrams 19 | from textkit.transform.tokens_to_pos import tokens2pos 20 | from textkit.transform.transliterate import transliterate 21 | from textkit.package.tokens_to_json import tokens2json 22 | from textkit.package.texts_to_json import texts2json 23 | from textkit.package.tokens_to_text import tokens2text 24 | from textkit.download import download 25 | 26 | 27 | @click.group() 28 | def cli(): 29 | '''Text analysis from the command line. 
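    Commands read tokens from files or standard input and write to standard
    output, so they compose with shell pipes. An illustrative pipeline
    (corpus.txt here is just a placeholder input file):

        textkit text2words corpus.txt | textkit filterpunc | textkit tokens2lower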
30 | ''' 31 | pass 32 | 33 | cli.add_command(text2words) 34 | cli.add_command(text2sentences) 35 | cli.add_command(words2bigrams) 36 | cli.add_command(words2ngrams) 37 | cli.add_command(text2ngrams) 38 | cli.add_command(text2punc) 39 | cli.add_command(filterpunc) 40 | cli.add_command(filterwords) 41 | cli.add_command(filterlengths) 42 | cli.add_command(showstops) 43 | cli.add_command(tokens2lower) 44 | cli.add_command(tokens2upper) 45 | cli.add_command(nonewlines) 46 | cli.add_command(tokens2stem) 47 | cli.add_command(tokens2json) 48 | cli.add_command(texts2json) 49 | cli.add_command(tokens2text) 50 | cli.add_command(tokens2counts) 51 | cli.add_command(tokens2topbigrams) 52 | cli.add_command(tokens2pos) 53 | cli.add_command(transliterate) 54 | cli.add_command(download) 55 | -------------------------------------------------------------------------------- /textkit/coerce.py: -------------------------------------------------------------------------------- 1 | 2 | def isfloat(value): 3 | try: 4 | float(value) 5 | return True 6 | except: 7 | return False 8 | 9 | 10 | def isint(value): 11 | try: 12 | int(value) 13 | return True 14 | except: 15 | return False 16 | 17 | 18 | CONVERTERS = {'IntType': lambda x: int(x), 19 | 'FloatType': lambda x: float(x), 20 | 'StringType': lambda x: x} 21 | 22 | 23 | def pick_type(types): 24 | ''' if there is only one type found 25 | in a column, then use that. if multiple 26 | types are found, default back to string. 27 | ''' 28 | type_set = set(types) 29 | if len(type_set) == 1: 30 | return list(type_set)[0] 31 | elif set(['IntType', 'FloatType']) == type_set: 32 | # if there is a mix of floats and ints, then the column is floats. 33 | return 'FloatType' 34 | else: 35 | return 'StringType' 36 | 37 | 38 | def get_column_types(content): 39 | ''' Figure out what type of content is in each column 40 | of a csv-like input. This is a simple brute force method that 41 | attempts to convert the strings of the content into floats and ints. 42 | if the conversion is successful for all rows tested, 43 | that type is considered the type of the column. 44 | ''' 45 | 46 | # number of rows to check for content 47 | test_count = min(len(content), 5) 48 | 49 | # number of columns 50 | col_count = len(content[0]) 51 | 52 | all_types = [[] for i in range(col_count)] 53 | 54 | for r_ind in range(test_count): 55 | for col_ind, col in enumerate(content[r_ind]): 56 | if isint(col): 57 | all_types[col_ind].append('IntType') 58 | elif isfloat(col): 59 | all_types[col_ind].append('FloatType') 60 | else: 61 | all_types[col_ind].append('StringType') 62 | 63 | # find if conversions are consistent across rows 64 | column_types = [pick_type(types) for types in all_types] 65 | return column_types 66 | 67 | 68 | def coerce_types(content): 69 | ''' 70 | Convert types in csv-like content. 71 | The idea is that when translating to and 72 | from csv, everything is converted to strings. So, we need to undo that 73 | conversion for things like counts. 
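    A small example, mirroring the behaviour exercised in tests/test_coerce.py:

        >>> coerce_types([['happy', '9'], ['day', '8.7']])
        [['happy', 9.0], ['day', 8.7]]

    A column that mixes ints and floats is coerced to floats, while a column
    containing any non-numeric value is left as strings.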
74 | ''' 75 | if len(content) == 0: 76 | return content 77 | 78 | column_types = get_column_types(content) 79 | 80 | coerced_content = [] 81 | for row in content: 82 | c_row = [] 83 | for col_ind, col in enumerate(row): 84 | try: 85 | col = CONVERTERS[column_types[col_ind]](col) 86 | except ValueError: 87 | col = col 88 | c_row.append(col) 89 | coerced_content.append(c_row) 90 | return coerced_content 91 | -------------------------------------------------------------------------------- /textkit/data/stopwords/README.md: -------------------------------------------------------------------------------- 1 | Stopwords Corpus 2 | 3 | This corpus contains lists of stop words for several languages. 4 | 5 | They were obtained from: 6 | https://github.com/nltk 7 | 8 | And nltk in turn obtained the stopwords corpus from: 9 | http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ -------------------------------------------------------------------------------- /textkit/data/stopwords/danish.txt: -------------------------------------------------------------------------------- 1 | og 2 | i 3 | jeg 4 | det 5 | at 6 | en 7 | den 8 | til 9 | er 10 | som 11 | på 12 | de 13 | med 14 | han 15 | af 16 | for 17 | ikke 18 | der 19 | var 20 | mig 21 | sig 22 | men 23 | et 24 | har 25 | om 26 | vi 27 | min 28 | havde 29 | ham 30 | hun 31 | nu 32 | over 33 | da 34 | fra 35 | du 36 | ud 37 | sin 38 | dem 39 | os 40 | op 41 | man 42 | hans 43 | hvor 44 | eller 45 | hvad 46 | skal 47 | selv 48 | her 49 | alle 50 | vil 51 | blev 52 | kunne 53 | ind 54 | når 55 | være 56 | dog 57 | noget 58 | ville 59 | jo 60 | deres 61 | efter 62 | ned 63 | skulle 64 | denne 65 | end 66 | dette 67 | mit 68 | også 69 | under 70 | have 71 | dig 72 | anden 73 | hende 74 | mine 75 | alt 76 | meget 77 | sit 78 | sine 79 | vor 80 | mod 81 | disse 82 | hvis 83 | din 84 | nogle 85 | hos 86 | blive 87 | mange 88 | ad 89 | bliver 90 | hendes 91 | været 92 | thi 93 | jer 94 | sådan 95 | -------------------------------------------------------------------------------- /textkit/data/stopwords/dutch.txt: -------------------------------------------------------------------------------- 1 | de 2 | en 3 | van 4 | ik 5 | te 6 | dat 7 | die 8 | in 9 | een 10 | hij 11 | het 12 | niet 13 | zijn 14 | is 15 | was 16 | op 17 | aan 18 | met 19 | als 20 | voor 21 | had 22 | er 23 | maar 24 | om 25 | hem 26 | dan 27 | zou 28 | of 29 | wat 30 | mijn 31 | men 32 | dit 33 | zo 34 | door 35 | over 36 | ze 37 | zich 38 | bij 39 | ook 40 | tot 41 | je 42 | mij 43 | uit 44 | der 45 | daar 46 | haar 47 | naar 48 | heb 49 | hoe 50 | heeft 51 | hebben 52 | deze 53 | u 54 | want 55 | nog 56 | zal 57 | me 58 | zij 59 | nu 60 | ge 61 | geen 62 | omdat 63 | iets 64 | worden 65 | toch 66 | al 67 | waren 68 | veel 69 | meer 70 | doen 71 | toen 72 | moet 73 | ben 74 | zonder 75 | kan 76 | hun 77 | dus 78 | alles 79 | onder 80 | ja 81 | eens 82 | hier 83 | wie 84 | werd 85 | altijd 86 | doch 87 | wordt 88 | wezen 89 | kunnen 90 | ons 91 | zelf 92 | tegen 93 | na 94 | reeds 95 | wil 96 | kon 97 | niets 98 | uw 99 | iemand 100 | geweest 101 | andere 102 | -------------------------------------------------------------------------------- /textkit/data/stopwords/english.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | your 11 | yours 12 | yourself 13 | yourselves 14 | he 15 | him 16 | his 17 | himself 18 | she 19 | her 20 | hers 21 | herself 22 | 
it 23 | its 24 | itself 25 | they 26 | them 27 | their 28 | theirs 29 | themselves 30 | what 31 | which 32 | who 33 | whom 34 | this 35 | that 36 | these 37 | those 38 | am 39 | is 40 | are 41 | was 42 | were 43 | be 44 | been 45 | being 46 | have 47 | has 48 | had 49 | having 50 | do 51 | does 52 | did 53 | doing 54 | a 55 | an 56 | the 57 | and 58 | but 59 | if 60 | or 61 | because 62 | as 63 | until 64 | while 65 | of 66 | at 67 | by 68 | for 69 | with 70 | about 71 | against 72 | between 73 | into 74 | through 75 | during 76 | before 77 | after 78 | above 79 | below 80 | to 81 | from 82 | up 83 | down 84 | in 85 | out 86 | on 87 | off 88 | over 89 | under 90 | again 91 | further 92 | then 93 | once 94 | here 95 | there 96 | when 97 | where 98 | why 99 | how 100 | all 101 | any 102 | both 103 | each 104 | few 105 | more 106 | most 107 | other 108 | some 109 | such 110 | no 111 | nor 112 | not 113 | only 114 | own 115 | same 116 | so 117 | than 118 | too 119 | very 120 | s 121 | t 122 | can 123 | will 124 | just 125 | don 126 | should 127 | now 128 | 129 | -------------------------------------------------------------------------------- /textkit/data/stopwords/finnish.txt: -------------------------------------------------------------------------------- 1 | olla 2 | olen 3 | olet 4 | on 5 | olemme 6 | olette 7 | ovat 8 | ole 9 | oli 10 | olisi 11 | olisit 12 | olisin 13 | olisimme 14 | olisitte 15 | olisivat 16 | olit 17 | olin 18 | olimme 19 | olitte 20 | olivat 21 | ollut 22 | olleet 23 | en 24 | et 25 | ei 26 | emme 27 | ette 28 | eivät 29 | minä 30 | minun 31 | minut 32 | minua 33 | minussa 34 | minusta 35 | minuun 36 | minulla 37 | minulta 38 | minulle 39 | sinä 40 | sinun 41 | sinut 42 | sinua 43 | sinussa 44 | sinusta 45 | sinuun 46 | sinulla 47 | sinulta 48 | sinulle 49 | hän 50 | hänen 51 | hänet 52 | häntä 53 | hänessä 54 | hänestä 55 | häneen 56 | hänellä 57 | häneltä 58 | hänelle 59 | me 60 | meidän 61 | meidät 62 | meitä 63 | meissä 64 | meistä 65 | meihin 66 | meillä 67 | meiltä 68 | meille 69 | te 70 | teidän 71 | teidät 72 | teitä 73 | teissä 74 | teistä 75 | teihin 76 | teillä 77 | teiltä 78 | teille 79 | he 80 | heidän 81 | heidät 82 | heitä 83 | heissä 84 | heistä 85 | heihin 86 | heillä 87 | heiltä 88 | heille 89 | tämä 90 | tämän 91 | tätä 92 | tässä 93 | tästä 94 | tähän 95 | tallä 96 | tältä 97 | tälle 98 | tänä 99 | täksi 100 | tuo 101 | tuon 102 | tuotä 103 | tuossa 104 | tuosta 105 | tuohon 106 | tuolla 107 | tuolta 108 | tuolle 109 | tuona 110 | tuoksi 111 | se 112 | sen 113 | sitä 114 | siinä 115 | siitä 116 | siihen 117 | sillä 118 | siltä 119 | sille 120 | sinä 121 | siksi 122 | nämä 123 | näiden 124 | näitä 125 | näissä 126 | näistä 127 | näihin 128 | näillä 129 | näiltä 130 | näille 131 | näinä 132 | näiksi 133 | nuo 134 | noiden 135 | noita 136 | noissa 137 | noista 138 | noihin 139 | noilla 140 | noilta 141 | noille 142 | noina 143 | noiksi 144 | ne 145 | niiden 146 | niitä 147 | niissä 148 | niistä 149 | niihin 150 | niillä 151 | niiltä 152 | niille 153 | niinä 154 | niiksi 155 | kuka 156 | kenen 157 | kenet 158 | ketä 159 | kenessä 160 | kenestä 161 | keneen 162 | kenellä 163 | keneltä 164 | kenelle 165 | kenenä 166 | keneksi 167 | ketkä 168 | keiden 169 | ketkä 170 | keitä 171 | keissä 172 | keistä 173 | keihin 174 | keillä 175 | keiltä 176 | keille 177 | keinä 178 | keiksi 179 | mikä 180 | minkä 181 | minkä 182 | mitä 183 | missä 184 | mistä 185 | mihin 186 | millä 187 | miltä 188 | mille 189 | minä 190 | miksi 191 | mitkä 192 | joka 193 | jonka 194 | 
jota 195 | jossa 196 | josta 197 | johon 198 | jolla 199 | jolta 200 | jolle 201 | jona 202 | joksi 203 | jotka 204 | joiden 205 | joita 206 | joissa 207 | joista 208 | joihin 209 | joilla 210 | joilta 211 | joille 212 | joina 213 | joiksi 214 | että 215 | ja 216 | jos 217 | koska 218 | kuin 219 | mutta 220 | niin 221 | sekä 222 | sillä 223 | tai 224 | vaan 225 | vai 226 | vaikka 227 | kanssa 228 | mukaan 229 | noin 230 | poikki 231 | yli 232 | kun 233 | niin 234 | nyt 235 | itse 236 | -------------------------------------------------------------------------------- /textkit/data/stopwords/french.txt: -------------------------------------------------------------------------------- 1 | au 2 | aux 3 | avec 4 | ce 5 | ces 6 | dans 7 | de 8 | des 9 | du 10 | elle 11 | en 12 | et 13 | eux 14 | il 15 | je 16 | la 17 | le 18 | leur 19 | lui 20 | ma 21 | mais 22 | me 23 | même 24 | mes 25 | moi 26 | mon 27 | ne 28 | nos 29 | notre 30 | nous 31 | on 32 | ou 33 | par 34 | pas 35 | pour 36 | qu 37 | que 38 | qui 39 | sa 40 | se 41 | ses 42 | son 43 | sur 44 | ta 45 | te 46 | tes 47 | toi 48 | ton 49 | tu 50 | un 51 | une 52 | vos 53 | votre 54 | vous 55 | c 56 | d 57 | j 58 | l 59 | à 60 | m 61 | n 62 | s 63 | t 64 | y 65 | été 66 | étée 67 | étées 68 | étés 69 | étant 70 | étante 71 | étants 72 | étantes 73 | suis 74 | es 75 | est 76 | sommes 77 | êtes 78 | sont 79 | serai 80 | seras 81 | sera 82 | serons 83 | serez 84 | seront 85 | serais 86 | serait 87 | serions 88 | seriez 89 | seraient 90 | étais 91 | était 92 | étions 93 | étiez 94 | étaient 95 | fus 96 | fut 97 | fûmes 98 | fûtes 99 | furent 100 | sois 101 | soit 102 | soyons 103 | soyez 104 | soient 105 | fusse 106 | fusses 107 | fût 108 | fussions 109 | fussiez 110 | fussent 111 | ayant 112 | ayante 113 | ayantes 114 | ayants 115 | eu 116 | eue 117 | eues 118 | eus 119 | ai 120 | as 121 | avons 122 | avez 123 | ont 124 | aurai 125 | auras 126 | aura 127 | aurons 128 | aurez 129 | auront 130 | aurais 131 | aurait 132 | aurions 133 | auriez 134 | auraient 135 | avais 136 | avait 137 | avions 138 | aviez 139 | avaient 140 | eut 141 | eûmes 142 | eûtes 143 | eurent 144 | aie 145 | aies 146 | ait 147 | ayons 148 | ayez 149 | aient 150 | eusse 151 | eusses 152 | eût 153 | eussions 154 | eussiez 155 | eussent 156 | -------------------------------------------------------------------------------- /textkit/data/stopwords/german.txt: -------------------------------------------------------------------------------- 1 | aber 2 | alle 3 | allem 4 | allen 5 | aller 6 | alles 7 | als 8 | also 9 | am 10 | an 11 | ander 12 | andere 13 | anderem 14 | anderen 15 | anderer 16 | anderes 17 | anderm 18 | andern 19 | anderr 20 | anders 21 | auch 22 | auf 23 | aus 24 | bei 25 | bin 26 | bis 27 | bist 28 | da 29 | damit 30 | dann 31 | der 32 | den 33 | des 34 | dem 35 | die 36 | das 37 | daß 38 | derselbe 39 | derselben 40 | denselben 41 | desselben 42 | demselben 43 | dieselbe 44 | dieselben 45 | dasselbe 46 | dazu 47 | dein 48 | deine 49 | deinem 50 | deinen 51 | deiner 52 | deines 53 | denn 54 | derer 55 | dessen 56 | dich 57 | dir 58 | du 59 | dies 60 | diese 61 | diesem 62 | diesen 63 | dieser 64 | dieses 65 | doch 66 | dort 67 | durch 68 | ein 69 | eine 70 | einem 71 | einen 72 | einer 73 | eines 74 | einig 75 | einige 76 | einigem 77 | einigen 78 | einiger 79 | einiges 80 | einmal 81 | er 82 | ihn 83 | ihm 84 | es 85 | etwas 86 | euer 87 | eure 88 | eurem 89 | euren 90 | eurer 91 | eures 92 | für 93 | gegen 94 | gewesen 95 | hab 96 | habe 97 | haben 98 | hat 99 
| hatte 100 | hatten 101 | hier 102 | hin 103 | hinter 104 | ich 105 | mich 106 | mir 107 | ihr 108 | ihre 109 | ihrem 110 | ihren 111 | ihrer 112 | ihres 113 | euch 114 | im 115 | in 116 | indem 117 | ins 118 | ist 119 | jede 120 | jedem 121 | jeden 122 | jeder 123 | jedes 124 | jene 125 | jenem 126 | jenen 127 | jener 128 | jenes 129 | jetzt 130 | kann 131 | kein 132 | keine 133 | keinem 134 | keinen 135 | keiner 136 | keines 137 | können 138 | könnte 139 | machen 140 | man 141 | manche 142 | manchem 143 | manchen 144 | mancher 145 | manches 146 | mein 147 | meine 148 | meinem 149 | meinen 150 | meiner 151 | meines 152 | mit 153 | muss 154 | musste 155 | nach 156 | nicht 157 | nichts 158 | noch 159 | nun 160 | nur 161 | ob 162 | oder 163 | ohne 164 | sehr 165 | sein 166 | seine 167 | seinem 168 | seinen 169 | seiner 170 | seines 171 | selbst 172 | sich 173 | sie 174 | ihnen 175 | sind 176 | so 177 | solche 178 | solchem 179 | solchen 180 | solcher 181 | solches 182 | soll 183 | sollte 184 | sondern 185 | sonst 186 | über 187 | um 188 | und 189 | uns 190 | unse 191 | unsem 192 | unsen 193 | unser 194 | unses 195 | unter 196 | viel 197 | vom 198 | von 199 | vor 200 | während 201 | war 202 | waren 203 | warst 204 | was 205 | weg 206 | weil 207 | weiter 208 | welche 209 | welchem 210 | welchen 211 | welcher 212 | welches 213 | wenn 214 | werde 215 | werden 216 | wie 217 | wieder 218 | will 219 | wir 220 | wird 221 | wirst 222 | wo 223 | wollen 224 | wollte 225 | würde 226 | würden 227 | zu 228 | zum 229 | zur 230 | zwar 231 | zwischen 232 | -------------------------------------------------------------------------------- /textkit/data/stopwords/hungarian.txt: -------------------------------------------------------------------------------- 1 | a 2 | ahogy 3 | ahol 4 | aki 5 | akik 6 | akkor 7 | alatt 8 | által 9 | általában 10 | amely 11 | amelyek 12 | amelyekben 13 | amelyeket 14 | amelyet 15 | amelynek 16 | ami 17 | amit 18 | amolyan 19 | amíg 20 | amikor 21 | át 22 | abban 23 | ahhoz 24 | annak 25 | arra 26 | arról 27 | az 28 | azok 29 | azon 30 | azt 31 | azzal 32 | azért 33 | aztán 34 | azután 35 | azonban 36 | bár 37 | be 38 | belül 39 | benne 40 | cikk 41 | cikkek 42 | cikkeket 43 | csak 44 | de 45 | e 46 | eddig 47 | egész 48 | egy 49 | egyes 50 | egyetlen 51 | egyéb 52 | egyik 53 | egyre 54 | ekkor 55 | el 56 | elég 57 | ellen 58 | elõ 59 | elõször 60 | elõtt 61 | elsõ 62 | én 63 | éppen 64 | ebben 65 | ehhez 66 | emilyen 67 | ennek 68 | erre 69 | ez 70 | ezt 71 | ezek 72 | ezen 73 | ezzel 74 | ezért 75 | és 76 | fel 77 | felé 78 | hanem 79 | hiszen 80 | hogy 81 | hogyan 82 | igen 83 | így 84 | illetve 85 | ill. 
86 | ill 87 | ilyen 88 | ilyenkor 89 | ison 90 | ismét 91 | itt 92 | jó 93 | jól 94 | jobban 95 | kell 96 | kellett 97 | keresztül 98 | keressünk 99 | ki 100 | kívül 101 | között 102 | közül 103 | legalább 104 | lehet 105 | lehetett 106 | legyen 107 | lenne 108 | lenni 109 | lesz 110 | lett 111 | maga 112 | magát 113 | majd 114 | majd 115 | már 116 | más 117 | másik 118 | meg 119 | még 120 | mellett 121 | mert 122 | mely 123 | melyek 124 | mi 125 | mit 126 | míg 127 | miért 128 | milyen 129 | mikor 130 | minden 131 | mindent 132 | mindenki 133 | mindig 134 | mint 135 | mintha 136 | mivel 137 | most 138 | nagy 139 | nagyobb 140 | nagyon 141 | ne 142 | néha 143 | nekem 144 | neki 145 | nem 146 | néhány 147 | nélkül 148 | nincs 149 | olyan 150 | ott 151 | össze 152 | õ 153 | õk 154 | õket 155 | pedig 156 | persze 157 | rá 158 | s 159 | saját 160 | sem 161 | semmi 162 | sok 163 | sokat 164 | sokkal 165 | számára 166 | szemben 167 | szerint 168 | szinte 169 | talán 170 | tehát 171 | teljes 172 | tovább 173 | továbbá 174 | több 175 | úgy 176 | ugyanis 177 | új 178 | újabb 179 | újra 180 | után 181 | utána 182 | utolsó 183 | vagy 184 | vagyis 185 | valaki 186 | valami 187 | valamint 188 | való 189 | vagyok 190 | van 191 | vannak 192 | volt 193 | voltam 194 | voltak 195 | voltunk 196 | vissza 197 | vele 198 | viszont 199 | volna 200 | -------------------------------------------------------------------------------- /textkit/data/stopwords/italian.txt: -------------------------------------------------------------------------------- 1 | ad 2 | al 3 | allo 4 | ai 5 | agli 6 | all 7 | agl 8 | alla 9 | alle 10 | con 11 | col 12 | coi 13 | da 14 | dal 15 | dallo 16 | dai 17 | dagli 18 | dall 19 | dagl 20 | dalla 21 | dalle 22 | di 23 | del 24 | dello 25 | dei 26 | degli 27 | dell 28 | degl 29 | della 30 | delle 31 | in 32 | nel 33 | nello 34 | nei 35 | negli 36 | nell 37 | negl 38 | nella 39 | nelle 40 | su 41 | sul 42 | sullo 43 | sui 44 | sugli 45 | sull 46 | sugl 47 | sulla 48 | sulle 49 | per 50 | tra 51 | contro 52 | io 53 | tu 54 | lui 55 | lei 56 | noi 57 | voi 58 | loro 59 | mio 60 | mia 61 | miei 62 | mie 63 | tuo 64 | tua 65 | tuoi 66 | tue 67 | suo 68 | sua 69 | suoi 70 | sue 71 | nostro 72 | nostra 73 | nostri 74 | nostre 75 | vostro 76 | vostra 77 | vostri 78 | vostre 79 | mi 80 | ti 81 | ci 82 | vi 83 | lo 84 | la 85 | li 86 | le 87 | gli 88 | ne 89 | il 90 | un 91 | uno 92 | una 93 | ma 94 | ed 95 | se 96 | perché 97 | anche 98 | come 99 | dov 100 | dove 101 | che 102 | chi 103 | cui 104 | non 105 | più 106 | quale 107 | quanto 108 | quanti 109 | quanta 110 | quante 111 | quello 112 | quelli 113 | quella 114 | quelle 115 | questo 116 | questi 117 | questa 118 | queste 119 | si 120 | tutto 121 | tutti 122 | a 123 | c 124 | e 125 | i 126 | l 127 | o 128 | ho 129 | hai 130 | ha 131 | abbiamo 132 | avete 133 | hanno 134 | abbia 135 | abbiate 136 | abbiano 137 | avrò 138 | avrai 139 | avrà 140 | avremo 141 | avrete 142 | avranno 143 | avrei 144 | avresti 145 | avrebbe 146 | avremmo 147 | avreste 148 | avrebbero 149 | avevo 150 | avevi 151 | aveva 152 | avevamo 153 | avevate 154 | avevano 155 | ebbi 156 | avesti 157 | ebbe 158 | avemmo 159 | aveste 160 | ebbero 161 | avessi 162 | avesse 163 | avessimo 164 | avessero 165 | avendo 166 | avuto 167 | avuta 168 | avuti 169 | avute 170 | sono 171 | sei 172 | è 173 | siamo 174 | siete 175 | sia 176 | siate 177 | siano 178 | sarò 179 | sarai 180 | sarà 181 | saremo 182 | sarete 183 | saranno 184 | sarei 185 | saresti 186 | sarebbe 187 | saremmo 188 | 
sareste 189 | sarebbero 190 | ero 191 | eri 192 | era 193 | eravamo 194 | eravate 195 | erano 196 | fui 197 | fosti 198 | fu 199 | fummo 200 | foste 201 | furono 202 | fossi 203 | fosse 204 | fossimo 205 | fossero 206 | essendo 207 | faccio 208 | fai 209 | facciamo 210 | fanno 211 | faccia 212 | facciate 213 | facciano 214 | farò 215 | farai 216 | farà 217 | faremo 218 | farete 219 | faranno 220 | farei 221 | faresti 222 | farebbe 223 | faremmo 224 | fareste 225 | farebbero 226 | facevo 227 | facevi 228 | faceva 229 | facevamo 230 | facevate 231 | facevano 232 | feci 233 | facesti 234 | fece 235 | facemmo 236 | faceste 237 | fecero 238 | facessi 239 | facesse 240 | facessimo 241 | facessero 242 | facendo 243 | sto 244 | stai 245 | sta 246 | stiamo 247 | stanno 248 | stia 249 | stiate 250 | stiano 251 | starò 252 | starai 253 | starà 254 | staremo 255 | starete 256 | staranno 257 | starei 258 | staresti 259 | starebbe 260 | staremmo 261 | stareste 262 | starebbero 263 | stavo 264 | stavi 265 | stava 266 | stavamo 267 | stavate 268 | stavano 269 | stetti 270 | stesti 271 | stette 272 | stemmo 273 | steste 274 | stettero 275 | stessi 276 | stesse 277 | stessimo 278 | stessero 279 | stando 280 | -------------------------------------------------------------------------------- /textkit/data/stopwords/norwegian.txt: -------------------------------------------------------------------------------- 1 | og 2 | i 3 | jeg 4 | det 5 | at 6 | en 7 | et 8 | den 9 | til 10 | er 11 | som 12 | på 13 | de 14 | med 15 | han 16 | av 17 | ikke 18 | ikkje 19 | der 20 | så 21 | var 22 | meg 23 | seg 24 | men 25 | ett 26 | har 27 | om 28 | vi 29 | min 30 | mitt 31 | ha 32 | hadde 33 | hun 34 | nå 35 | over 36 | da 37 | ved 38 | fra 39 | du 40 | ut 41 | sin 42 | dem 43 | oss 44 | opp 45 | man 46 | kan 47 | hans 48 | hvor 49 | eller 50 | hva 51 | skal 52 | selv 53 | sjøl 54 | her 55 | alle 56 | vil 57 | bli 58 | ble 59 | blei 60 | blitt 61 | kunne 62 | inn 63 | når 64 | være 65 | kom 66 | noen 67 | noe 68 | ville 69 | dere 70 | som 71 | deres 72 | kun 73 | ja 74 | etter 75 | ned 76 | skulle 77 | denne 78 | for 79 | deg 80 | si 81 | sine 82 | sitt 83 | mot 84 | å 85 | meget 86 | hvorfor 87 | dette 88 | disse 89 | uten 90 | hvordan 91 | ingen 92 | din 93 | ditt 94 | blir 95 | samme 96 | hvilken 97 | hvilke 98 | sånn 99 | inni 100 | mellom 101 | vår 102 | hver 103 | hvem 104 | vors 105 | hvis 106 | både 107 | bare 108 | enn 109 | fordi 110 | før 111 | mange 112 | også 113 | slik 114 | vært 115 | være 116 | båe 117 | begge 118 | siden 119 | dykk 120 | dykkar 121 | dei 122 | deira 123 | deires 124 | deim 125 | di 126 | då 127 | eg 128 | ein 129 | eit 130 | eitt 131 | elles 132 | honom 133 | hjå 134 | ho 135 | hoe 136 | henne 137 | hennar 138 | hennes 139 | hoss 140 | hossen 141 | ikkje 142 | ingi 143 | inkje 144 | korleis 145 | korso 146 | kva 147 | kvar 148 | kvarhelst 149 | kven 150 | kvi 151 | kvifor 152 | me 153 | medan 154 | mi 155 | mine 156 | mykje 157 | no 158 | nokon 159 | noka 160 | nokor 161 | noko 162 | nokre 163 | si 164 | sia 165 | sidan 166 | so 167 | somt 168 | somme 169 | um 170 | upp 171 | vere 172 | vore 173 | verte 174 | vort 175 | varte 176 | vart 177 | -------------------------------------------------------------------------------- /textkit/data/stopwords/portuguese.txt: -------------------------------------------------------------------------------- 1 | de 2 | a 3 | o 4 | que 5 | e 6 | do 7 | da 8 | em 9 | um 10 | para 11 | com 12 | não 13 | uma 14 | os 15 | no 16 | se 17 | na 18 | por 19 | mais 20 
| as 21 | dos 22 | como 23 | mas 24 | ao 25 | ele 26 | das 27 | à 28 | seu 29 | sua 30 | ou 31 | quando 32 | muito 33 | nos 34 | já 35 | eu 36 | também 37 | só 38 | pelo 39 | pela 40 | até 41 | isso 42 | ela 43 | entre 44 | depois 45 | sem 46 | mesmo 47 | aos 48 | seus 49 | quem 50 | nas 51 | me 52 | esse 53 | eles 54 | você 55 | essa 56 | num 57 | nem 58 | suas 59 | meu 60 | às 61 | minha 62 | numa 63 | pelos 64 | elas 65 | qual 66 | nós 67 | lhe 68 | deles 69 | essas 70 | esses 71 | pelas 72 | este 73 | dele 74 | tu 75 | te 76 | vocês 77 | vos 78 | lhes 79 | meus 80 | minhas 81 | teu 82 | tua 83 | teus 84 | tuas 85 | nosso 86 | nossa 87 | nossos 88 | nossas 89 | dela 90 | delas 91 | esta 92 | estes 93 | estas 94 | aquele 95 | aquela 96 | aqueles 97 | aquelas 98 | isto 99 | aquilo 100 | estou 101 | está 102 | estamos 103 | estão 104 | estive 105 | esteve 106 | estivemos 107 | estiveram 108 | estava 109 | estávamos 110 | estavam 111 | estivera 112 | estivéramos 113 | esteja 114 | estejamos 115 | estejam 116 | estivesse 117 | estivéssemos 118 | estivessem 119 | estiver 120 | estivermos 121 | estiverem 122 | hei 123 | há 124 | havemos 125 | hão 126 | houve 127 | houvemos 128 | houveram 129 | houvera 130 | houvéramos 131 | haja 132 | hajamos 133 | hajam 134 | houvesse 135 | houvéssemos 136 | houvessem 137 | houver 138 | houvermos 139 | houverem 140 | houverei 141 | houverá 142 | houveremos 143 | houverão 144 | houveria 145 | houveríamos 146 | houveriam 147 | sou 148 | somos 149 | são 150 | era 151 | éramos 152 | eram 153 | fui 154 | foi 155 | fomos 156 | foram 157 | fora 158 | fôramos 159 | seja 160 | sejamos 161 | sejam 162 | fosse 163 | fôssemos 164 | fossem 165 | for 166 | formos 167 | forem 168 | serei 169 | será 170 | seremos 171 | serão 172 | seria 173 | seríamos 174 | seriam 175 | tenho 176 | tem 177 | temos 178 | tém 179 | tinha 180 | tínhamos 181 | tinham 182 | tive 183 | teve 184 | tivemos 185 | tiveram 186 | tivera 187 | tivéramos 188 | tenha 189 | tenhamos 190 | tenham 191 | tivesse 192 | tivéssemos 193 | tivessem 194 | tiver 195 | tivermos 196 | tiverem 197 | terei 198 | terá 199 | teremos 200 | terão 201 | teria 202 | teríamos 203 | teriam 204 | -------------------------------------------------------------------------------- /textkit/data/stopwords/russian.txt: -------------------------------------------------------------------------------- 1 | и 2 | в 3 | во 4 | не 5 | что 6 | он 7 | на 8 | я 9 | с 10 | со 11 | как 12 | а 13 | то 14 | все 15 | она 16 | так 17 | его 18 | но 19 | да 20 | ты 21 | к 22 | у 23 | же 24 | вы 25 | за 26 | бы 27 | по 28 | только 29 | ее 30 | мне 31 | было 32 | вот 33 | от 34 | меня 35 | еще 36 | нет 37 | о 38 | из 39 | ему 40 | теперь 41 | когда 42 | даже 43 | ну 44 | вдруг 45 | ли 46 | если 47 | уже 48 | или 49 | ни 50 | быть 51 | был 52 | него 53 | до 54 | вас 55 | нибудь 56 | опять 57 | уж 58 | вам 59 | ведь 60 | там 61 | потом 62 | себя 63 | ничего 64 | ей 65 | может 66 | они 67 | тут 68 | где 69 | есть 70 | надо 71 | ней 72 | для 73 | мы 74 | тебя 75 | их 76 | чем 77 | была 78 | сам 79 | чтоб 80 | без 81 | будто 82 | чего 83 | раз 84 | тоже 85 | себе 86 | под 87 | будет 88 | ж 89 | тогда 90 | кто 91 | этот 92 | того 93 | потому 94 | этого 95 | какой 96 | совсем 97 | ним 98 | здесь 99 | этом 100 | один 101 | почти 102 | мой 103 | тем 104 | чтобы 105 | нее 106 | сейчас 107 | были 108 | куда 109 | зачем 110 | всех 111 | никогда 112 | можно 113 | при 114 | наконец 115 | два 116 | об 117 | другой 118 | хоть 119 | после 120 | над 121 | больше 122 | тот 
123 | через 124 | эти 125 | нас 126 | про 127 | всего 128 | них 129 | какая 130 | много 131 | разве 132 | три 133 | эту 134 | моя 135 | впрочем 136 | хорошо 137 | свою 138 | этой 139 | перед 140 | иногда 141 | лучше 142 | чуть 143 | том 144 | нельзя 145 | такой 146 | им 147 | более 148 | всегда 149 | конечно 150 | всю 151 | между 152 | -------------------------------------------------------------------------------- /textkit/data/stopwords/spanish.txt: -------------------------------------------------------------------------------- 1 | de 2 | la 3 | que 4 | el 5 | en 6 | y 7 | a 8 | los 9 | del 10 | se 11 | las 12 | por 13 | un 14 | para 15 | con 16 | no 17 | una 18 | su 19 | al 20 | lo 21 | como 22 | más 23 | pero 24 | sus 25 | le 26 | ya 27 | o 28 | este 29 | sí 30 | porque 31 | esta 32 | entre 33 | cuando 34 | muy 35 | sin 36 | sobre 37 | también 38 | me 39 | hasta 40 | hay 41 | donde 42 | quien 43 | desde 44 | todo 45 | nos 46 | durante 47 | todos 48 | uno 49 | les 50 | ni 51 | contra 52 | otros 53 | ese 54 | eso 55 | ante 56 | ellos 57 | e 58 | esto 59 | mí 60 | antes 61 | algunos 62 | qué 63 | unos 64 | yo 65 | otro 66 | otras 67 | otra 68 | él 69 | tanto 70 | esa 71 | estos 72 | mucho 73 | quienes 74 | nada 75 | muchos 76 | cual 77 | poco 78 | ella 79 | estar 80 | estas 81 | algunas 82 | algo 83 | nosotros 84 | mi 85 | mis 86 | tú 87 | te 88 | ti 89 | tu 90 | tus 91 | ellas 92 | nosotras 93 | vosostros 94 | vosostras 95 | os 96 | mío 97 | mía 98 | míos 99 | mías 100 | tuyo 101 | tuya 102 | tuyos 103 | tuyas 104 | suyo 105 | suya 106 | suyos 107 | suyas 108 | nuestro 109 | nuestra 110 | nuestros 111 | nuestras 112 | vuestro 113 | vuestra 114 | vuestros 115 | vuestras 116 | esos 117 | esas 118 | estoy 119 | estás 120 | está 121 | estamos 122 | estáis 123 | están 124 | esté 125 | estés 126 | estemos 127 | estéis 128 | estén 129 | estaré 130 | estarás 131 | estará 132 | estaremos 133 | estaréis 134 | estarán 135 | estaría 136 | estarías 137 | estaríamos 138 | estaríais 139 | estarían 140 | estaba 141 | estabas 142 | estábamos 143 | estabais 144 | estaban 145 | estuve 146 | estuviste 147 | estuvo 148 | estuvimos 149 | estuvisteis 150 | estuvieron 151 | estuviera 152 | estuvieras 153 | estuviéramos 154 | estuvierais 155 | estuvieran 156 | estuviese 157 | estuvieses 158 | estuviésemos 159 | estuvieseis 160 | estuviesen 161 | estando 162 | estado 163 | estada 164 | estados 165 | estadas 166 | estad 167 | he 168 | has 169 | ha 170 | hemos 171 | habéis 172 | han 173 | haya 174 | hayas 175 | hayamos 176 | hayáis 177 | hayan 178 | habré 179 | habrás 180 | habrá 181 | habremos 182 | habréis 183 | habrán 184 | habría 185 | habrías 186 | habríamos 187 | habríais 188 | habrían 189 | había 190 | habías 191 | habíamos 192 | habíais 193 | habían 194 | hube 195 | hubiste 196 | hubo 197 | hubimos 198 | hubisteis 199 | hubieron 200 | hubiera 201 | hubieras 202 | hubiéramos 203 | hubierais 204 | hubieran 205 | hubiese 206 | hubieses 207 | hubiésemos 208 | hubieseis 209 | hubiesen 210 | habiendo 211 | habido 212 | habida 213 | habidos 214 | habidas 215 | soy 216 | eres 217 | es 218 | somos 219 | sois 220 | son 221 | sea 222 | seas 223 | seamos 224 | seáis 225 | sean 226 | seré 227 | serás 228 | será 229 | seremos 230 | seréis 231 | serán 232 | sería 233 | serías 234 | seríamos 235 | seríais 236 | serían 237 | era 238 | eras 239 | éramos 240 | erais 241 | eran 242 | fui 243 | fuiste 244 | fue 245 | fuimos 246 | fuisteis 247 | fueron 248 | fuera 249 | fueras 250 | fuéramos 251 | fuerais 252 | fueran 253 | 
fuese 254 | fueses 255 | fuésemos 256 | fueseis 257 | fuesen 258 | sintiendo 259 | sentido 260 | sentida 261 | sentidos 262 | sentidas 263 | siente 264 | sentid 265 | tengo 266 | tienes 267 | tiene 268 | tenemos 269 | tenéis 270 | tienen 271 | tenga 272 | tengas 273 | tengamos 274 | tengáis 275 | tengan 276 | tendré 277 | tendrás 278 | tendrá 279 | tendremos 280 | tendréis 281 | tendrán 282 | tendría 283 | tendrías 284 | tendríamos 285 | tendríais 286 | tendrían 287 | tenía 288 | tenías 289 | teníamos 290 | teníais 291 | tenían 292 | tuve 293 | tuviste 294 | tuvo 295 | tuvimos 296 | tuvisteis 297 | tuvieron 298 | tuviera 299 | tuvieras 300 | tuviéramos 301 | tuvierais 302 | tuvieran 303 | tuviese 304 | tuvieses 305 | tuviésemos 306 | tuvieseis 307 | tuviesen 308 | teniendo 309 | tenido 310 | tenida 311 | tenidos 312 | tenidas 313 | tened 314 | -------------------------------------------------------------------------------- /textkit/data/stopwords/swedish.txt: -------------------------------------------------------------------------------- 1 | och 2 | det 3 | att 4 | i 5 | en 6 | jag 7 | hon 8 | som 9 | han 10 | på 11 | den 12 | med 13 | var 14 | sig 15 | för 16 | så 17 | till 18 | är 19 | men 20 | ett 21 | om 22 | hade 23 | de 24 | av 25 | icke 26 | mig 27 | du 28 | henne 29 | då 30 | sin 31 | nu 32 | har 33 | inte 34 | hans 35 | honom 36 | skulle 37 | hennes 38 | där 39 | min 40 | man 41 | ej 42 | vid 43 | kunde 44 | något 45 | från 46 | ut 47 | när 48 | efter 49 | upp 50 | vi 51 | dem 52 | vara 53 | vad 54 | över 55 | än 56 | dig 57 | kan 58 | sina 59 | här 60 | ha 61 | mot 62 | alla 63 | under 64 | någon 65 | eller 66 | allt 67 | mycket 68 | sedan 69 | ju 70 | denna 71 | själv 72 | detta 73 | åt 74 | utan 75 | varit 76 | hur 77 | ingen 78 | mitt 79 | ni 80 | bli 81 | blev 82 | oss 83 | din 84 | dessa 85 | några 86 | deras 87 | blir 88 | mina 89 | samma 90 | vilken 91 | er 92 | sådan 93 | vår 94 | blivit 95 | dess 96 | inom 97 | mellan 98 | sådant 99 | varför 100 | varje 101 | vilka 102 | ditt 103 | vem 104 | vilket 105 | sitta 106 | sådana 107 | vart 108 | dina 109 | vars 110 | vårt 111 | våra 112 | ert 113 | era 114 | vilkas 115 | -------------------------------------------------------------------------------- /textkit/data/stopwords/turkish.txt: -------------------------------------------------------------------------------- 1 | acaba 2 | ama 3 | aslında 4 | az 5 | bazı 6 | belki 7 | biri 8 | birkaç 9 | birşey 10 | biz 11 | bu 12 | çok 13 | çünkü 14 | da 15 | daha 16 | de 17 | defa 18 | diye 19 | eğer 20 | en 21 | gibi 22 | hem 23 | hep 24 | hepsi 25 | her 26 | hiç 27 | için 28 | ile 29 | ise 30 | kez 31 | ki 32 | kim 33 | mı 34 | mu 35 | mü 36 | nasıl 37 | ne 38 | neden 39 | nerde 40 | nerede 41 | nereye 42 | niçin 43 | niye 44 | o 45 | sanki 46 | şey 47 | siz 48 | şu 49 | tüm 50 | ve 51 | veya 52 | ya 53 | yani 54 | -------------------------------------------------------------------------------- /textkit/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | import nltk 4 | from textkit.utils import read_tokens, output 5 | 6 | 7 | @click.command('download') 8 | def download(): 9 | ''' 10 | Install required libraries. 11 | Note this library will install nltk dependencies into your 12 | user directory. 
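    Concretely, the command looks through every directory on nltk.data.path
    for the averaged_perceptron_tagger, wordnet and punkt packages and
    downloads whichever of them are missing.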
13 | ''' 14 | 15 | click.echo("Installing nltk packages into your user directories in " + 16 | "the following order of existence (first found):\n" + 17 | '\n'.join(nltk.data.path)) 18 | 19 | extensions = [("taggers", "averaged_perceptron_tagger"), 20 | ("corpora", "wordnet"), 21 | ("tokenizers", "punkt")] 22 | 23 | missing = check_packages_exist(extensions) 24 | 25 | for ext_tuple in missing: 26 | nltk.download(ext_tuple[1]) 27 | 28 | 29 | def check_packages_exist(extensions): 30 | ''' 31 | Finds missing nltk extensions. 32 | ''' 33 | paths = nltk.data.path # there are usually quite a few, so we check them all. 34 | missing = [] 35 | for ext_tuple in extensions: 36 | ext_found = False 37 | click.echo(message="Looking for " + ext_tuple[1], nl=True) 38 | for path in paths: 39 | if os.path.exists(os.path.join(path, ext_tuple[0], ext_tuple[1])): 40 | ext_found = True 41 | click.echo(message="Found " + ext_tuple[1], nl=True) 42 | break 43 | if not ext_found: 44 | click.echo(message="Missing " + ext_tuple[1], nl=True) 45 | missing.append(ext_tuple) 46 | 47 | return missing 48 | -------------------------------------------------------------------------------- /textkit/filter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/textkit/filter/__init__.py -------------------------------------------------------------------------------- /textkit/filter/filter_lengths.py: -------------------------------------------------------------------------------- 1 | import click 2 | from textkit.utils import output, read_tokens 3 | 4 | 5 | @click.command() 6 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 7 | @click.option('-m', '--minimum', default=3, 8 | help='Minimum length of token to not filter.', show_default=True) 9 | def filterlengths(minimum, tokens): 10 | '''Remove tokens that are shorter then the minimum length provided.''' 11 | content = read_tokens(tokens) 12 | [output(token) for token in content if len(token) >= minimum] 13 | -------------------------------------------------------------------------------- /textkit/filter/filter_punc.py: -------------------------------------------------------------------------------- 1 | from string import punctuation 2 | import click 3 | from textkit.utils import output, read_tokens 4 | 5 | 6 | @click.command() 7 | # @click.option('--out', type=click.File('w'), default='-', 8 | # help='Optional output file. 
Defaults to standard out.') 9 | # @click.option('--punctuation', default=punctuation, 10 | # help='String indicating punctuation to check for.') 11 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 12 | def filterpunc(tokens): 13 | '''Remove tokens that are only punctuation from a list of tokens.''' 14 | content = read_tokens(tokens) 15 | [output(token) for token in content if token not in punctuation] 16 | -------------------------------------------------------------------------------- /textkit/filter/filter_words.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | from textkit.utils import output, read_tokens, data_item 4 | 5 | def get_stopwords(stopword_name): 6 | path = data_item('/stopwords/' + stopword_name + '.txt') 7 | stopwords = [] 8 | with open(path) as filename: 9 | stopwords = read_tokens(filename) 10 | return stopwords 11 | 12 | 13 | @click.command() 14 | @click.option('-l', '--language', type=click.Choice(['english', 'german', 'danish','dutch','finnish','french','hungarian','italian','norwegian','portuguese','russian','spanish','swedish','turkish']), 15 | default='english') 16 | @click.option('--custom', type=click.File('r'), 17 | help='Optional token file of additional tokens to remove ' + 18 | 'along with selected stop words.') 19 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 20 | def filterwords(language, custom, tokens): 21 | '''Remove stop words from tokens, returning tokens without stop words.''' 22 | content = read_tokens(tokens) 23 | stopwords = get_stopwords(language) 24 | if custom: 25 | stopwords = stopwords + read_tokens(custom) 26 | 27 | [output(token) for token in content 28 | if token.lower() not in stopwords] 29 | 30 | 31 | @click.command() 32 | @click.option('-l', '--language', type=click.Choice(['english', 'german']), 33 | default='english') 34 | def showstops(language): 35 | '''Display stop words used by textkit for a given language.''' 36 | stopwords = get_stopwords(language) 37 | 38 | [output(token) for token in stopwords] 39 | -------------------------------------------------------------------------------- /textkit/package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/textkit/package/__init__.py -------------------------------------------------------------------------------- /textkit/package/texts_to_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import OrderedDict 3 | import click 4 | from textkit.utils import output, read_tokens 5 | 6 | def read_names(names_path): 7 | names = [] 8 | if names_path: 9 | names_doc = open(names_path, 'r') 10 | names = read_tokens(names_doc) 11 | return names 12 | 13 | 14 | @click.command() 15 | @click.argument('text_docs', type=click.Path(exists=True), nargs=-1) 16 | @click.option('--ids', type=click.Path(), 17 | help="File with one id per text document, each separated by a " + 18 | "new line. Ids file is used to set the id attribute in the " + 19 | "output JSON.") 20 | @click.option('--names', type=click.Path(), 21 | help="File with one name per text document, each separated " + 22 | "by a new line. 
Names file is used to set the name attribute " + 23 | "in the output JSON.") 24 | @click.option('--field', default='text', help="Attribute name where text " + 25 | "will be stored in the document object.", show_default=True) 26 | 27 | def texts2json(ids, names, field, text_docs): 28 | '''Convert a set of text documents into a 29 | JSON array of document objects.''' 30 | 31 | docs = [] 32 | 33 | names = read_names(names) 34 | ids = read_names(ids) 35 | 36 | for idx, path in enumerate(text_docs): 37 | tokens_doc = open(path, 'r') 38 | content = "" 39 | with click.open_file(path): 40 | content = tokens_doc.read() 41 | 42 | # ordered so that these attributes stay at the top 43 | doc = OrderedDict() 44 | 45 | if idx < len(ids): 46 | doc['id'] = ids[idx] 47 | else: 48 | doc['id'] = path 49 | 50 | if idx < len(names): 51 | doc['name'] = names[idx] 52 | else: 53 | doc['name'] = path 54 | 55 | doc[field] = content 56 | docs.append(doc) 57 | tokens_doc.close() 58 | 59 | out_content = json.dumps(docs, indent=2) 60 | output(out_content) 61 | -------------------------------------------------------------------------------- /textkit/package/tokens_to_json.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | from collections import OrderedDict 4 | import click 5 | from textkit.utils import output, read_tokens, read_csv 6 | from textkit.coerce import coerce_types 7 | 8 | 9 | def read_names(names_path): 10 | names = [] 11 | if names_path: 12 | names_doc = open(names_path, 'r') 13 | names = read_tokens(names_doc) 14 | return names 15 | 16 | 17 | @click.command() 18 | @click.argument('token_docs', type=click.Path(exists=True), nargs=-1) 19 | @click.option('--ids', type=click.Path(), 20 | help="File with one id per token document, each separated " + 21 | "by a new line. Ids file is used to set the id attribute in " + 22 | "the output JSON.") 23 | @click.option('--names', type=click.Path(), 24 | help="File with one name per token document, each separated " + 25 | "by a new line. Names file is used to set the name attribute " + 26 | "in the output JSON.") 27 | @click.option('--field', default='tokens', help="Attribute name where " + 28 | "tokens will be stored in the document object.", 29 | show_default=True) 30 | @click.option('--split/--no-split', default=False, help="If enabled, " + 31 | "textkit will attempt to split input columns when " + 32 | "packaging. This is useful when packaging multiple column " + 33 | "output like counts.", 34 | show_default=True) 35 | @click.option('-s', '--sep', default=',', help="Separator character between " + 36 | "columns. 
Only used if the --split flag is used.", 37 | show_default=True) 38 | def tokens2json(ids, names, field, split, sep, token_docs): 39 | '''Convert a set of token documents into a 40 | JSON array of document objects.''' 41 | 42 | docs = [] 43 | 44 | names = read_names(names) 45 | ids = read_names(ids) 46 | 47 | for idx, path in enumerate(token_docs): 48 | if path == '-': 49 | tokens_doc = sys.stdin 50 | else: 51 | tokens_doc = open(path, 'r') 52 | if split: 53 | content = read_csv(tokens_doc, sep) 54 | content = coerce_types(content) 55 | else: 56 | content = read_tokens(tokens_doc) 57 | 58 | # ordered so that these attributes stay at the top 59 | doc = OrderedDict() 60 | 61 | if idx < len(ids): 62 | doc['id'] = ids[idx] 63 | else: 64 | doc['id'] = path 65 | 66 | if idx < len(names): 67 | doc['name'] = names[idx] 68 | else: 69 | doc['name'] = path 70 | 71 | doc[field] = content 72 | docs.append(doc) 73 | tokens_doc.close() 74 | 75 | out_content = json.dumps(docs, indent=2) 76 | output(out_content) 77 | -------------------------------------------------------------------------------- /textkit/package/tokens_to_text.py: -------------------------------------------------------------------------------- 1 | import click 2 | from textkit.utils import read_tokens, output 3 | 4 | 5 | @click.command() 6 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 7 | @click.option('-s', '--sep', default=' ', 8 | help='Separator placed between tokens in the output.', 9 | show_default=True) 10 | def tokens2text(sep, tokens): 11 | '''Combine tokens in a token document into a single text file.''' 12 | 13 | content = read_tokens(tokens) 14 | out = sep.join(content) 15 | output(out) 16 | -------------------------------------------------------------------------------- /textkit/tokenize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/textkit/tokenize/__init__.py -------------------------------------------------------------------------------- /textkit/tokenize/bigrams.py: -------------------------------------------------------------------------------- 1 | import click 2 | import nltk 3 | from textkit.utils import output, read_tokens 4 | 5 | 6 | @click.command() 7 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 8 | @click.option('-s', '--sep', default=' ', 9 | help='Separator between words in bigram output.', 10 | show_default=True) 11 | def words2bigrams(sep, tokens): 12 | '''Tokenize words into bigrams. Bigrams are two word tokens. 
13 | Punctuation is considered as a separate token.''' 14 | 15 | content = read_tokens(tokens) 16 | bigrams = [] 17 | try: 18 | bigrams = list(nltk.bigrams(content)) 19 | except LookupError as err: 20 | click.echo(message="Error with tokenization", nl=True) 21 | click.echo(message="Have you run \"textkit download\"?", nl=True) 22 | click.echo(message="\nOriginal Error:", nl=True) 23 | click.echo(err) 24 | [output(sep.join(bigram)) for bigram in bigrams] 25 | -------------------------------------------------------------------------------- /textkit/tokenize/ngrams.py: -------------------------------------------------------------------------------- 1 | import click 2 | import nltk 3 | from textkit.utils import write_csv, read_tokens 4 | 5 | 6 | @click.command('words2ngrams') 7 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 8 | @click.option('-s', '--sep', default=' ', 9 | help='Separator between words in ngram output.', 10 | show_default=True) 11 | @click.option('-n', '--num', default=2, 12 | help='Length of the n-gram', 13 | show_default=True) 14 | def words2ngrams(sep, num, tokens): 15 | '''Convert word tokens into ngrams. ngrams are n-length word tokens. 16 | Punctuation is considered as a separate token.''' 17 | 18 | content = read_tokens(tokens) 19 | ngrams = list(nltk.ngrams(content, num)) 20 | write_csv(ngrams, str(sep)) 21 | 22 | 23 | @click.command('text2ngrams') 24 | @click.argument('text', type=click.Path(exists=True), nargs=-1) 25 | @click.option('-s', '--sep', default=' ', 26 | help='Separator between words in ngram output.', 27 | show_default=True) 28 | @click.option('-n', '--num', default=2, 29 | help='Length of the n-gram', 30 | show_default=True) 31 | def text2ngrams(sep, num, text): 32 | '''Tokenize plain text into ngrams. ngrams are n-length word tokens. 33 | Punctuation is considered as a separate token.''' 34 | content = '\n'.join([open(f).read() for f in text]) 35 | try: 36 | tokens = nltk.word_tokenize(content) 37 | ngrams = list(nltk.ngrams(tokens, num)) 38 | write_csv(ngrams, str(sep)) 39 | except LookupError as err: 40 | click.echo(message="Error with tokenization", nl=True) 41 | click.echo(message="Have you run \"textkit download\"?", nl=True) 42 | click.echo(message="\nOriginal Error:", nl=True) 43 | click.echo(err) 44 | -------------------------------------------------------------------------------- /textkit/tokenize/punc.py: -------------------------------------------------------------------------------- 1 | import re 2 | from string import punctuation 3 | import click 4 | from textkit.utils import output 5 | 6 | @click.command() 7 | @click.argument('text', type=click.Path(exists=True), nargs=-1) 8 | def text2punc(text): 9 | '''Tokenize text into punctuation tokens. 
10 | Words and numbers are removed, leaving only punctuation.''' 11 | 12 | # from: http://stackoverflow.com/questions/17485092/how-to-just-keep-punctuation-with-a-string-in-python 13 | 14 | content = '\n'.join([open(f).read() for f in text]) 15 | out = re.sub(r'[^{}]+'.format(punctuation), ' ', content) 16 | out = out.split() 17 | [output(p) for p in out] 18 | -------------------------------------------------------------------------------- /textkit/tokenize/sentences.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import sent_tokenize 2 | import click 3 | from textkit.utils import output 4 | 5 | @click.command() 6 | @click.argument('text', type=click.Path(exists=True), nargs=-1) 7 | def text2sentences(text): 8 | '''Tokenize text into sentence tokens.''' 9 | content = '\n'.join([open(f).read() for f in text]) 10 | sentences = [] 11 | try: 12 | sentences = sent_tokenize(content) 13 | except LookupError as err: 14 | click.echo(message="Error with tokenization", nl=True) 15 | click.echo(message="Have you run \"textkit download\"?", nl=True) 16 | click.echo(message="\nOriginal Error:", nl=True) 17 | click.echo(err) 18 | [output(s.strip()) for s in sentences] 19 | -------------------------------------------------------------------------------- /textkit/tokenize/words.py: -------------------------------------------------------------------------------- 1 | import click 2 | import nltk 3 | from textkit.utils import output 4 | 5 | 6 | @click.command() 7 | @click.argument('text', type=click.Path(exists=True), nargs=-1) 8 | def text2words(text): 9 | '''Tokenize text into word tokens. 10 | Punctuation is considered as a separate token.''' 11 | content = '\n'.join([open(f).read() for f in text]) 12 | tokens = [] 13 | try: 14 | tokens = nltk.word_tokenize(content) 15 | except LookupError as err: 16 | click.echo(message="Error with tokenization", nl=True) 17 | click.echo(message="Have you run \"textkit download\"?", nl=True) 18 | click.echo(message="\nOriginal Error:", nl=True) 19 | click.echo(err) 20 | [output(token) for token in tokens] 21 | -------------------------------------------------------------------------------- /textkit/transform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/textkit/transform/__init__.py -------------------------------------------------------------------------------- /textkit/transform/newlines.py: -------------------------------------------------------------------------------- 1 | import click 2 | import re 3 | from textkit.utils import output 4 | 5 | 6 | @click.command() 7 | @click.argument('text', type=click.Path(exists=True), nargs=-1) 8 | def nonewlines(text): 9 | '''Remove newlines from a text file.''' 10 | content = '\n'.join([open(f).read() for f in text]) 11 | content = re.sub('\n|\r\n|\r', ' ', content).strip() 12 | output(content) 13 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_counts.py: -------------------------------------------------------------------------------- 1 | import click 2 | from collections import defaultdict 3 | from textkit.utils import read_tokens, write_csv 4 | 5 | 6 | @click.command('tokens2counts') 7 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 8 | @click.option('-s', '--sep', default=',', 9 | help='Separator between token and count in output.', 
10 | show_default=True) 11 | @click.option('--limit', default=-1, type=click.INT, 12 | help='Only output the top N most frequent tokens') 13 | def tokens2counts(sep, limit, tokens): 14 | '''Count unique tokens in a list of tokens. 15 | Tokens are sorted by top counts.''' 16 | content = read_tokens(tokens) 17 | counts = sort_counts(get_counts(content)) 18 | 19 | # we want the argument type to be an INT - but python only 20 | # has support for a float infinity. So if the limit is negative, 21 | # it becomes infinite 22 | if limit < 0: 23 | limit = float('inf') 24 | 25 | # using csv writer to ensure proper encoding of the separator. 26 | rows = [list(map(str, vals)) for ind, vals in enumerate(counts) if ind < limit] 27 | write_csv(rows, str(sep)) 28 | 29 | 30 | def get_counts(tokens): 31 | '''Count unique tokens in a list''' 32 | counts = defaultdict(int) 33 | for token in tokens: 34 | counts[token] += 1 35 | return counts 36 | 37 | 38 | def sort_counts(counts): 39 | '''Sort a dict of counts by count, returning a list of (token, count) tuples.''' 40 | return sorted(counts.items(), key=lambda count: count[1], reverse=True) 41 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_lower.py: -------------------------------------------------------------------------------- 1 | import click 2 | from textkit.utils import read_tokens, output 3 | 4 | 5 | @click.command() 6 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 7 | def tokens2lower(tokens): 8 | '''Transform all tokens to lowercase.''' 9 | content = read_tokens(tokens) 10 | [output(token.lower()) for token in content] 11 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_pos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | import nltk 4 | from textkit.utils import write_csv, read_tokens, data_item 5 | 6 | 7 | @click.command('tokens2pos') 8 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 9 | @click.option('-s', '--sep', default=',', 10 | help='Separator between words in the output.', 11 | show_default=True) 12 | def tokens2pos(sep, tokens): 13 | '''Tokenize words into their parts of speech. Output contains the 14 | word token followed by its part-of-speech tag, separated by the 15 | character specified by --sep. 
16 | ''' 17 | 18 | content = read_tokens(tokens) 19 | nltk.data.path.append(data_item()) 20 | tags = nltk.pos_tag(content) 21 | write_csv(tags, str(sep)) 22 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_stem.py: -------------------------------------------------------------------------------- 1 | import click 2 | from collections import OrderedDict 3 | from nltk.stem import PorterStemmer 4 | from nltk.stem.lancaster import LancasterStemmer 5 | from nltk.stem.snowball import EnglishStemmer 6 | from nltk.stem.wordnet import WordNetLemmatizer 7 | from textkit.utils import read_tokens, output 8 | 9 | ALGOS = OrderedDict([ 10 | ('porter', PorterStemmer), 11 | ('lancaster', LancasterStemmer), 12 | ('snowball', EnglishStemmer), 13 | ('wordnet', WordNetLemmatizer) 14 | ]) 15 | 16 | 17 | @click.command() 18 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 19 | @click.option('-a', '--algorithm', type=click.Choice(list(ALGOS.keys())), 20 | default=list(ALGOS.keys())[0], 21 | help='Specify which stemming algorithm to use.', 22 | show_default=True) 23 | def tokens2stem(tokens, algorithm): 24 | '''Stem a list of tokens to get their root.''' 25 | content = read_tokens(tokens) 26 | stemmer = ALGOS[algorithm]() 27 | 28 | if algorithm == 'wordnet': 29 | for token in content: 30 | output(stemmer.lemmatize(token)) 31 | else: 32 | for token in content: 33 | output(stemmer.stem(token)) 34 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_top_bigrams.py: -------------------------------------------------------------------------------- 1 | import click 2 | import nltk 3 | from collections import OrderedDict 4 | from textkit.utils import read_tokens, write_csv 5 | 6 | 7 | MEASURES = OrderedDict([ 8 | ('likelihood', nltk.collocations.BigramAssocMeasures.likelihood_ratio), 9 | ('chi_sq', nltk.collocations.BigramAssocMeasures.chi_sq), 10 | ('pmi', nltk.collocations.BigramAssocMeasures.pmi), 11 | ('student_t', nltk.collocations.BigramAssocMeasures.student_t), 12 | ('freq', nltk.collocations.BigramAssocMeasures.raw_freq) 13 | ]) 14 | 15 | 16 | @click.command('tokens2topbigrams') 17 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 18 | @click.option('-s', '--sep', default=',', 19 | help='Separator between tokens and scores in output.', 20 | show_default=True) 21 | @click.option('-m', '--measure', type=click.Choice(list(MEASURES.keys())), 22 | default=list(MEASURES.keys())[0], 23 | help='Specify which measure to use to define interestingness.', 24 | show_default=True) 25 | @click.option('--freq', default=2, 26 | help='Minimum frequency a bi-gram must have to be kept.', 27 | show_default=True) 28 | @click.option('--scores/--no-scores', default=True, 29 | help='Include or exclude scores in output.', 30 | show_default=True) 31 | def tokens2topbigrams(sep, measure, freq, scores, tokens): 32 | '''Find the most interesting bi-grams in a token document. 33 | Uses the --measure argument to determine what measure to use to define 34 | 'interesting'. 
35 | ''' 36 | 37 | content = read_tokens(tokens) 38 | bcf = nltk.collocations.BigramCollocationFinder.from_words(content) 39 | bcf.apply_freq_filter(freq) 40 | 41 | nltk_measure = MEASURES[measure] 42 | bigrams = bcf.score_ngrams(nltk_measure) 43 | 44 | out = [b[0] for b in bigrams] 45 | if scores: 46 | out = [b[0] + tuple([str(b[1])]) for b in bigrams] 47 | write_csv(out, str(sep)) 48 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_upper.py: -------------------------------------------------------------------------------- 1 | import click 2 | from textkit.utils import read_tokens, output 3 | 4 | 5 | @click.command() 6 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 7 | def tokens2upper(tokens): 8 | '''Transform all tokens to uppercase.''' 9 | content = read_tokens(tokens) 10 | [output(token.upper()) for token in content] 11 | -------------------------------------------------------------------------------- /textkit/transform/transliterate.py: -------------------------------------------------------------------------------- 1 | import click 2 | from textkit.utils import output 3 | from unidecode import unidecode 4 | import chardet 5 | 6 | @click.command() 7 | @click.argument('file', type=click.File('r'), default=click.open_file('-')) 8 | def transliterate(file): 9 | '''Convert international text to ASCII.''' 10 | content = ''.join(file.readlines()) 11 | try: 12 | content = content.decode(chardet.detect(content)['encoding']) 13 | except AttributeError: 14 | # Strings do not have a decode method in python 3. 15 | pass 16 | [output(unidecode(content).encode('ascii', 'ignore'))] 17 | -------------------------------------------------------------------------------- /textkit/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | import sys 4 | import csv 5 | from pkg_resources import resource_filename 6 | 7 | 8 | def read_tokens(file): 9 | '''Reads tokens from a file handle''' 10 | lines = file.readlines() 11 | return [l.rstrip('\n') for l in lines if len(l.rstrip('\n')) > 0] 12 | 13 | 14 | def read_csv(file, delim): 15 | '''Reads CSV-formatted tokens from a file handle''' 16 | lines = [] 17 | reader = csv.reader(file, delimiter=delim) 18 | lines = [line for line in reader] 19 | return lines 20 | 21 | 22 | def write_csv(rows, delim): 23 | writer = csv.writer(click.get_text_stream('stdout'), delimiter=delim, lineterminator='\n') 24 | try: 25 | [writer.writerow(row) for row in rows] 26 | except (OSError, IOError): 27 | sys.stderr.close() 28 | 29 | 30 | def output(line): 31 | try: 32 | click.echo(line) 33 | except (OSError, IOError): 34 | sys.stderr.close() 35 | 36 | 37 | def data_item(search_path=''): 38 | path = resource_filename(__name__, 'data/' + search_path) 39 | return path 40 | --------------------------------------------------------------------------------
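A minimal usage sketch, not taken from the repository above, showing how the pure helpers defined in textkit/transform/tokens_to_counts.py compose; it assumes the textkit package is installed so the module is importable under the path shown, and the token list is illustrative only.

from textkit.transform.tokens_to_counts import get_counts, sort_counts

# Illustrative tokens; in the CLI these would come from read_tokens() on stdin.
tokens = ["the", "cat", "sat", "on", "the", "mat", "the"]

# get_counts builds a token -> count mapping; sort_counts orders it by count, descending.
counts = sort_counts(get_counts(tokens))
print(counts[0])   # ('the', 3) -- the unique most frequent token comes first
print(counts[:2])  # a top-N slice like this is what the --limit option of tokens2counts emits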