├── .gitignore ├── .pylint ├── .travis.yml ├── AUTHORS.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── docs ├── .deploy_heroku │ ├── Gemfile │ ├── Gemfile.lock │ ├── config.ru │ └── public │ │ └── 404.html ├── .gitignore ├── Makefile ├── README.md ├── README_sphinx_deployment.md ├── _static │ └── logo.png ├── _templates │ ├── side-primary.html │ └── side-secondary.html ├── _themes │ ├── .gitignore │ ├── LICENSE │ ├── flask_theme_support.py │ ├── kr │ │ ├── layout.html │ │ ├── relations.html │ │ ├── static │ │ │ ├── flasky.css_t │ │ │ └── small_flask.css │ │ └── theme.conf │ └── kr_small │ │ ├── layout.html │ │ ├── static │ │ └── flasky.css_t │ │ └── theme.conf ├── build_script_docs.py ├── cli.rst ├── conf.py ├── contributing.rst ├── index.rst ├── install.rst ├── quickstart.rst ├── requirements.txt ├── rsync_exclude ├── scripts │ ├── download.rst │ ├── filterlengths.rst │ ├── filterpunc.rst │ ├── filterwords.rst │ ├── nonewlines.rst │ ├── showstops.rst │ ├── text2ngrams.rst │ ├── text2punc.rst │ ├── text2sentences.rst │ ├── text2words.rst │ ├── texts2json.rst │ ├── tokens2counts.rst │ ├── tokens2json.rst │ ├── tokens2lower.rst │ ├── tokens2pos.rst │ ├── tokens2stem.rst │ ├── tokens2text.rst │ ├── tokens2topbigrams.rst │ ├── tokens2upper.rst │ ├── transliterate.rst │ ├── words2bigrams.rst │ └── words2ngrams.rst └── sphinx_deployment.mk ├── environment.yml ├── requirements.txt ├── setup.cfg ├── setup.py ├── test_data ├── alice.txt ├── alice_doc.json ├── alice_short.txt ├── alice_words.txt ├── docs.json ├── international.transliterate.txt ├── international.txt ├── pride_and_prejudice.txt ├── pride_words.txt └── word_tokens.txt ├── tests ├── __init__.py ├── test_coerce.py ├── test_filter.py ├── test_tokenize.py ├── test_transform.py ├── test_utils.py ├── transliterate.py └── utils.py └── textkit ├── __init__.py ├── cli.py ├── coerce.py ├── data └── stopwords │ ├── README.md │ ├── danish.txt │ ├── dutch.txt │ ├── english.txt │ ├── finnish.txt │ ├── french.txt │ ├── german.txt │ ├── hungarian.txt │ ├── italian.txt │ ├── norwegian.txt │ ├── portuguese.txt │ ├── russian.txt │ ├── spanish.txt │ ├── swedish.txt │ └── turkish.txt ├── download.py ├── filter ├── __init__.py ├── filter_lengths.py ├── filter_punc.py └── filter_words.py ├── package ├── __init__.py ├── texts_to_json.py ├── tokens_to_json.py └── tokens_to_text.py ├── tokenize ├── __init__.py ├── bigrams.py ├── ngrams.py ├── punc.py ├── sentences.py └── words.py ├── transform ├── __init__.py ├── newlines.py ├── tokens_to_counts.py ├── tokens_to_lower.py ├── tokens_to_pos.py ├── tokens_to_stem.py ├── tokens_to_top_bigrams.py ├── tokens_to_upper.py └── transliterate.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | .DS_Store 4 | build 5 | textkit.egg-info 6 | dist 7 | *.swo 8 | .coverage 9 | .tox 10 | .cache 11 | docs/_build 12 | 13 | # Readme build 14 | README.html 15 | -------------------------------------------------------------------------------- /.pylint: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Add files or directories to the blacklist. They should be base names, not 11 | # paths. 12 | ignore=data,docs,test_data 13 | 14 | # Pickle collected data for later comparisons. 
15 | persistent=yes 16 | 17 | # List of plugins (as comma separated values of python modules names) to load, 18 | # usually to register additional checkers. 19 | load-plugins= 20 | 21 | # Use multiple processes to speed up Pylint. 22 | jobs=1 23 | 24 | # Allow loading of arbitrary C extensions. Extensions are imported into the 25 | # active Python interpreter and may run arbitrary code. 26 | unsafe-load-any-extension=no 27 | 28 | # A comma-separated list of package or module names from where C extensions may 29 | # be loaded. Extensions are loading into the active Python interpreter and may 30 | # run arbitrary code 31 | extension-pkg-whitelist= 32 | 33 | # Allow optimization of some AST trees. This will activate a peephole AST 34 | # optimizer, which will apply various small optimizations. For instance, it can 35 | # be used to obtain the result of joining multiple strings with the addition 36 | # operator. Joining a lot of strings can lead to a maximum recursion error in 37 | # Pylint and this flag can prevent that. It has one side effect, the resulting 38 | # AST will be different than the one from reality. 39 | optimize-ast=no 40 | 41 | 42 | [MESSAGES CONTROL] 43 | 44 | # Only show warnings with the listed confidence levels. Leave empty to show 45 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 46 | confidence= 47 | 48 | # Enable the message, report, category or checker with the given id(s). You can 49 | # either give multiple identifier separated by comma (,) or put this option 50 | # multiple time (only on the command line, not in the configuration file where 51 | # it should appear only once). See also the "--disable" option for examples. 52 | #enable= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once).You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use"--disable=all --enable=classes 62 | # --disable=W" 63 | disable=getslice-method,unicode-builtin,useless-suppression,old-raise-syntax,cmp-builtin,import-star-module-level,unpacking-in-except,standarderror-builtin,cmp-method,suppressed-message,parameter-unpacking,round-builtin,file-builtin,old-division,raw_input-builtin,print-statement,range-builtin-not-iterating,oct-method,reload-builtin,execfile-builtin,zip-builtin-not-iterating,long-builtin,using-cmp-argument,next-method-called,apply-builtin,backtick,no-absolute-import,dict-iter-method,old-octal-literal,coerce-method,long-suffix,raising-string,filter-builtin-not-iterating,delslice-method,indexing-exception,hex-method,setslice-method,dict-view-method,buffer-builtin,coerce-builtin,unichr-builtin,input-builtin,old-ne-operator,xrange-builtin,map-builtin-not-iterating,reduce-builtin,basestring-builtin,nonzero-method,metaclass-assignment,intern-builtin,missing-docstring,expression-not-assigned,bad-continuation,too-many-arguments,bad-builtin 64 | 65 | [REPORTS] 66 | 67 | # Set the output format. Available formats are text, parseable, colorized, msvs 68 | # (visual studio) and html. 
You can also give a reporter class, eg 69 | # mypackage.mymodule.MyReporterClass. 70 | output-format=text 71 | 72 | # Put messages in a separate file for each module / package specified on the 73 | # command line instead of printing them on stdout. Reports (if any) will be 74 | # written in a file name "pylint_global.[txt|html]". 75 | files-output=no 76 | 77 | # Tells whether to display a full report or only the messages 78 | reports=yes 79 | 80 | # Python expression which should return a note less than 10 (10 is the highest 81 | # note). You have access to the variables errors warning, statement which 82 | # respectively contain the number of errors / warnings messages and the total 83 | # number of statements analyzed. This is used by the global evaluation report 84 | # (RP0004). 85 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 86 | 87 | # Template used to display messages. This is a python new-style format string 88 | # used to format the message information. See doc for all details 89 | #msg-template= 90 | 91 | 92 | [BASIC] 93 | 94 | # Include a hint for the correct naming format with invalid-name 95 | include-naming-hint=yes 96 | 97 | # Regular expression matching correct class names 98 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 99 | 100 | # Naming hint for class names 101 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 102 | 103 | # Regular expression matching correct argument names 104 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 105 | 106 | # Naming hint for argument names 107 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 108 | 109 | # Regular expression matching correct class attribute names 110 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 111 | 112 | # Naming hint for class attribute names 113 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 114 | 115 | # Regular expression matching correct inline iteration names 116 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 117 | 118 | # Naming hint for inline iteration names 119 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 120 | 121 | # Regular expression matching correct attribute names 122 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 123 | 124 | # Naming hint for attribute names 125 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 126 | 127 | # Regular expression matching correct constant names 128 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 129 | 130 | # Naming hint for constant names 131 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 132 | 133 | # Regular expression matching correct variable names 134 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 135 | 136 | # Naming hint for variable names 137 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 138 | 139 | # Regular expression matching correct function names 140 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 141 | 142 | # Naming hint for function names 143 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 144 | 145 | # Regular expression matching correct method names 146 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 147 | 148 | # Naming hint for method names 149 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 150 | 151 | # Regular expression matching correct module names 152 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 153 | 154 | # Naming hint for module names 155 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 156 | 157 | # Regular expression which should only match function or class names that do 158 | # not require a docstring. 159 | no-docstring-rgx=^_ 160 | 161 | # Minimum line length for functions/classes that require docstrings, shorter 162 | # ones are exempt. 
163 | docstring-min-length=-1 164 | 165 | 166 | [ELIF] 167 | 168 | # Maximum number of nested blocks for function / method body 169 | max-nested-blocks=5 170 | 171 | [FORMAT] 172 | 173 | # Maximum number of characters on a single line. 174 | max-line-length=80 175 | 176 | # Regexp for a line that is allowed to be longer than the limit. 177 | ignore-long-lines=^\s*(# )??$ 178 | 179 | # Allow the body of an if to be on the same line as the test if there is no 180 | # else. 181 | single-line-if-stmt=no 182 | 183 | # List of optional constructs for which whitespace checking is disabled. `dict- 184 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 185 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 186 | # `empty-line` allows space-only lines. 187 | no-space-check=trailing-comma,dict-separator 188 | 189 | # Maximum number of lines in a module 190 | max-module-lines=1000 191 | 192 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 193 | # tab). 194 | indent-string=' ' 195 | 196 | # Number of spaces of indent required inside a hanging or continued line. 197 | indent-after-paren=4 198 | 199 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 200 | expected-line-ending-format= 201 | 202 | 203 | [LOGGING] 204 | 205 | # Logging modules to check that the string format arguments are in logging 206 | # function parameter format 207 | logging-modules=logging 208 | 209 | 210 | [MISCELLANEOUS] 211 | 212 | # List of note tags to take in consideration, separated by a comma. 213 | notes=FIXME,XXX,TODO 214 | 215 | [SIMILARITIES] 216 | 217 | # Minimum lines number of a similarity. 218 | min-similarity-lines=4 219 | 220 | # Ignore comments when computing similarities. 221 | ignore-comments=yes 222 | 223 | # Ignore docstrings when computing similarities. 224 | ignore-docstrings=yes 225 | 226 | # Ignore imports when computing similarities. 227 | ignore-imports=no 228 | 229 | 230 | [SPELLING] 231 | 232 | # Spelling dictionary name. Available dictionaries: none. To make it working 233 | # install python-enchant package. 234 | spelling-dict= 235 | 236 | # List of comma separated words that should not be checked. 237 | spelling-ignore-words= 238 | 239 | # A path to a file that contains private dictionary; one word per line. 240 | spelling-private-dict-file= 241 | 242 | # Tells whether to store unknown words to indicated private dictionary in 243 | # --spelling-private-dict-file option instead of raising a message. 244 | spelling-store-unknown-words=no 245 | 246 | 247 | [TYPECHECK] 248 | 249 | # Tells whether missing members accessed in mixin class should be ignored. A 250 | # mixin class is detected if its name ends with "mixin" (case insensitive). 251 | ignore-mixin-members=yes 252 | 253 | # List of module names for which member attributes should not be checked 254 | # (useful for modules/projects where namespaces are manipulated during runtime 255 | # and thus existing member attributes cannot be deduced by static analysis. It 256 | # supports qualified module names, as well as Unix pattern matching. 257 | ignored-modules= 258 | 259 | # List of classes names for which member attributes should not be checked 260 | # (useful for classes with attributes dynamically set). This supports can work 261 | # with qualified names. 262 | ignored-classes= 263 | 264 | # List of members which are set dynamically and missed by pylint inference 265 | # system, and so shouldn't trigger E1101 when accessed. 
Python regular 266 | # expressions are accepted. 267 | generated-members= 268 | 269 | 270 | [VARIABLES] 271 | 272 | # Tells whether we should check for unused import in __init__ files. 273 | init-import=no 274 | 275 | # A regular expression matching the name of dummy variables (i.e. expectedly 276 | # not used). 277 | dummy-variables-rgx=_$|dummy 278 | 279 | # List of additional names supposed to be defined in builtins. Remember that 280 | # you should avoid to define new builtins when possible. 281 | additional-builtins= 282 | 283 | # List of strings which can identify a callback function by name. A callback 284 | # name must start or end with one of those strings. 285 | callbacks=cb_,_cb 286 | 287 | 288 | [CLASSES] 289 | 290 | # List of method names used to declare (i.e. assign) instance attributes. 291 | defining-attr-methods=__init__,__new__,setUp 292 | 293 | # List of valid names for the first argument in a class method. 294 | valid-classmethod-first-arg=cls 295 | 296 | # List of valid names for the first argument in a metaclass class method. 297 | valid-metaclass-classmethod-first-arg=mcs 298 | 299 | # List of member names, which should be excluded from the protected access 300 | # warning. 301 | exclude-protected=_asdict,_fields,_replace,_source,_make 302 | 303 | 304 | [DESIGN] 305 | 306 | # Maximum number of arguments for function / method 307 | max-args=5 308 | 309 | # Argument names that match this expression will be ignored. Default to name 310 | # with leading underscore 311 | ignored-argument-names=_.* 312 | 313 | # Maximum number of locals for function / method body 314 | max-locals=15 315 | 316 | # Maximum number of return / yield for function / method body 317 | max-returns=6 318 | 319 | # Maximum number of branch for function / method body 320 | max-branches=12 321 | 322 | # Maximum number of statements in function / method body 323 | max-statements=50 324 | 325 | # Maximum number of parents for a class (see R0901). 326 | max-parents=7 327 | 328 | # Maximum number of attributes for a class (see R0902). 329 | max-attributes=7 330 | 331 | # Minimum number of public methods for a class (see R0903). 332 | min-public-methods=2 333 | 334 | # Maximum number of public methods for a class (see R0904). 335 | max-public-methods=20 336 | 337 | # Maximum number of boolean expressions in a if statement 338 | max-bool-expr=5 339 | 340 | 341 | [IMPORTS] 342 | 343 | # Deprecated modules which should not be used, separated by a comma 344 | deprecated-modules=optparse 345 | 346 | # Create a graph of every (i.e. internal and external) dependencies in the 347 | # given file (report RP0402 must not be disabled) 348 | import-graph= 349 | 350 | # Create a graph of external dependencies in the given file (report RP0402 must 351 | # not be disabled) 352 | ext-import-graph= 353 | 354 | # Create a graph of internal dependencies in the given file (report RP0402 must 355 | # not be disabled) 356 | int-import-graph= 357 | 358 | 359 | [EXCEPTIONS] 360 | 361 | # Exceptions that will emit a warning when being caught. Defaults to 362 | # "Exception" 363 | overgeneral-exceptions=Exception 364 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | before_script: 3 | - git config --global user.name "Teracy" 4 | - git config --global user.email "your-friends@teracy.com" 5 | - export REPO_URL_GITHUB="https://$GH_TOKEN@github.com/$GH_REPO.git" 6 | - . 
./.travis/setup.sh 7 | - echo $DEPLOY_HTML_DIR 8 | after_script: 9 | - sudo pip install -r docs/requirements.txt --use-mirrors 10 | - cd docs 11 | - make setup_gh_pages 12 | - make generate 13 | - make deploy 14 | env: 15 | global: 16 | - GH_REPO="teracyhq/sphinx-deployment" 17 | notifications: 18 | slack: 19 | secure: RhiJy6C19bKjM8fY5oHFSBAyHgRRDr+ZiYhK5QwlitpQmSCsABLsVGWpIrD+aA0nPKs6khtmNFUoEsXtzol65CsvV2m70SQbVK3OCbOwXd6xRSLw8hb1p+SOnpqu/VvZ89iayVl7J9Y8WpJQlUfobMXZgequED7mSRo9+bdBB4E= 20 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ******* 2 | Authors 3 | ******* 4 | 5 | Contributors (in random order) 6 | ============================ 7 | 8 | - Lynn Cherny `@arnicas `_ 9 | - Yannick Assogba `@tafsiri `_ 10 | - Jim Vallandingham `@vlandham `_ 11 | - Irene Ros `@ireneros `_ 12 | - Alfred Lee `@justalfred `_ 13 | - Jenn Schiffer`@jennschiffer `_ 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 LTV 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include textkit/data/**/* 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | test: 3 | python -m pytest tests 4 | 5 | lint: 6 | pylint --rcfile .pylint textkit/*/**.py 7 | 8 | install_dev: 9 | pip install --editable . 
10 | 11 | package: 12 | python setup.py egg_info 13 | python setup.py sdist 14 | python setup.py bdist_wheel --universal 15 | 16 | publish: clean package 17 | twine upload dist/* 18 | 19 | clean: 20 | rm -rf build 21 | rm -rf textkit.egg-info 22 | rm -rf dist 23 | 24 | readme: 25 | # This requires `pip install rst2html5` 26 | # we should document this - but we shouldn't need it as a requirement 27 | # or dependency of textkit 28 | rst2html.py README.rst > README.html 29 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | textkit 2 | ======= 3 | 4 | Simple text analysis from the command line. 5 | 6 | Homepage: `http://learntextvis.github.io/textkit/ `_ 7 | 8 | About 9 | ----- 10 | 11 | `textkit` is a series of small, Unix-style tools that provide a suite of capabilities for 12 | dealing with text as data. 13 | 14 | Think of textkit as basic natural language processing capabilities - from the command line. 15 | 16 | textkit Features 17 | ---------------- 18 | 19 | Here are some of the cool things you can do with textkit. 20 | 21 | Convert a document to a set of word tokens and remove all punctuation from the tokens: 22 | 23 | .. code-block:: bash 24 | 25 | textkit text2words input.txt | textkit filterpunc 26 | 27 | Count the most-used words in a text: 28 | 29 | .. code-block:: bash 30 | 31 | textkit text2words alice.txt | textkit count --limit 20 32 | 33 | Do the same, but with punctuation removed: 34 | 35 | .. code-block:: bash 36 | 37 | textkit text2words alice.txt | textkit filterpunc | textkit count --limit 20 38 | 39 | Installation 40 | ------------ 41 | :: 42 | 43 | $ pip install -U textkit 44 | $ textkit --help 45 | 46 | 47 | Dev install 48 | ----------- 49 | 50 | To test locally, clone the repo: 51 | 52 | :: 53 | 54 | git clone git@github.com:learntextvis/textkit.git 55 | 56 | 57 | Create a local virtual environment or `conda` environment. 58 | 59 | Here is how I created my local `conda` environment for installing and testing textkit: 60 | 61 | :: 62 | 63 | conda create --name textkit nltk 64 | 65 | source activate textkit 66 | 67 | Then I went into the `textkit` directory to install its requirements: 68 | 69 | :: 70 | 71 | cd textkit 72 | 73 | pip install -r requirements.txt 74 | 75 | Finally, I installed the local version of textkit using the `--editable` flag: 76 | 77 | :: 78 | 79 | pip install --editable . 80 | 81 | Examples 82 | -------- 83 | 84 | See more examples at the `Quickstart guide`_. 85 | 86 |
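One more sketch, chaining the tokenizer with the stopword filter before counting. This assumes the `filterwords` command drops common stopwords, as its name and its script docs (`docs/scripts/filterwords.rst`) suggest; the exact options may differ from what is shown here:

.. code-block:: bash

    textkit text2words alice.txt | textkit filterwords | textkit count --limit 20

..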
_`Quickstart guide`: http://learntextvis.github.io/textkit/quickstart.html 87 | 88 | 89 | Requirements 90 | ------------ 91 | 92 | - Python >= 2.6 or >= 3.3 93 | 94 | Project Links 95 | ------------- 96 | 97 | - Docs: http://learntextvis.github.io/textkit/ 98 | - PyPI: https://pypi.python.org/pypi/textkit 99 | - Issues: https://github.com/learntextvis/textkit/issues 100 | -------------------------------------------------------------------------------- /docs/.deploy_heroku/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | ruby '1.9.3' 3 | 4 | gem 'sinatra', '~> 1.4.2' 5 | -------------------------------------------------------------------------------- /docs/.deploy_heroku/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | rack (1.5.2) 5 | rack-protection (1.5.1) 6 | rack 7 | sinatra (1.4.4) 8 | rack (~> 1.4) 9 | rack-protection (~> 1.4) 10 | tilt (~> 1.3, >= 1.3.4) 11 | tilt (1.4.1) 12 | 13 | PLATFORMS 14 | ruby 15 | 16 | DEPENDENCIES 17 | sinatra (~> 1.4.2) 18 | -------------------------------------------------------------------------------- /docs/.deploy_heroku/config.ru: -------------------------------------------------------------------------------- 1 | require 'bundler/setup' 2 | require 'sinatra/base' 3 | 4 | # The project root directory 5 | $root = ::File.dirname(__FILE__) 6 | 7 | class SinatraStaticServer < Sinatra::Base 8 | 9 | get(/.+/) do 10 | send_sinatra_file(request.path) {404} 11 | end 12 | 13 | not_found do 14 | send_file(File.join(File.dirname(__FILE__), 'public', '404.html'), {:status => 404}) 15 | end 16 | 17 | def send_sinatra_file(path, &missing_file_block) 18 | file_path = File.join(File.dirname(__FILE__), 'public', path) 19 | file_path = File.join(file_path, 'index.html') unless file_path =~ /\.[a-z]+$/i 20 | File.exist?(file_path) ? send_file(file_path) : missing_file_block.call 21 | end 22 | 23 | end 24 | 25 | run SinatraStaticServer 26 | -------------------------------------------------------------------------------- /docs/.deploy_heroku/public/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 404 - Page Not Found 4 | 5 | 6 |

Error: 404 - Page Not Found

7 | 8 | 9 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | # default ignored by sphinx-deployment 2 | _deploy 3 | _deploy_heroku 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/textkit.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/textkit.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/textkit" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/textkit" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 
157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | 179 | include sphinx_deployment.mk 180 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # textkit documentation 2 | 3 | ## Installing prerequisites 4 | 5 | The doc building system has additional requirements past the base `textkit` install. 6 | 7 | **Note:** All commands expect you are in the `docs` sub-directory of the project. 8 | So, `cd` to that directory first: 9 | 10 | ``` 11 | cd docs 12 | ``` 13 | 14 | You can install these packages using: 15 | 16 | ``` 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Building Docs 21 | 22 | This should be done via `make`: 23 | 24 | ``` 25 | make html 26 | ``` 27 | 28 | ## Viewing Built Docs 29 | 30 | The built documentation should now be in the `_build/html` sub-directory 31 | 32 | You can view them by starting a webserver in that directory. 33 | 34 | If you are on python 3, this should do that: 35 | 36 | ``` 37 | cd _build/html/; python -m http.server 38 | ``` 39 | 40 | If you are on python 2, try: 41 | 42 | ``` 43 | cd _build/html/; python -m SimpleHTTPServer 44 | ``` 45 | 46 | This should allow you to visit [http://localhost:8000/](http://localhost:8000/) and 47 | see your updated docs. 48 | 49 | ## Deploying to gh-pages 50 | 51 | We are using the [Sphinx Deployment](https://github.com/teracyhq/sphinx-deployment) tooling 52 | to automate deploying documentation to the `gh-pages` branch of this repo. 53 | 54 | This keeps the documentation in the same location as the code, which is great for discoverability. 55 | 56 | It also provides documentation at a consistent url: [http://learntextvis.github.io/textkit/](http://learntextvis.github.io/textkit/). 57 | 58 | I believe Sphinx Deployment is all setup to allow for anyone with write capabilities to the repo 59 | (owners) to update and push documentation with the following commands: 60 | 61 | ``` 62 | make html 63 | make deploy 64 | ``` 65 | 66 | The default deployment is to `gh-pages` which is what we want. 67 | The configuration of the Sphinx Deployment process is specified in `sphinx_deployment.mk` - so check there for all the configuration parameters to ensure everything is set as we want it. 68 | 69 | For more info on the options, `README_sphinx_deployment.md` comes from the 70 | -------------------------------------------------------------------------------- /docs/README_sphinx_deployment.md: -------------------------------------------------------------------------------- 1 | sphinx-deployment 2 | ================= 3 | 4 | Automatic setup and deployment for [sphinx][] docs. 
5 | 6 | This project is intended to be used to deploy [sphinx][] project on: 7 | 8 | - [Github Pages](https://help.github.com/categories/20/articles) 9 | - [Rsync](http://en.wikipedia.org/wiki/Rsync) 10 | - PaaS services: [heroku](http://heroku.com/), etc. 11 | 12 | Usage 13 | ----- 14 | 15 | **1. `$ make generate`** 16 | 17 | For generating contents, alias for `$ make html` 18 | 19 | **2. `$ make deploy`** 20 | 21 | For short-cut deployment, it could be `$ make deploy_gh_pages`, `$ make deploy_rsync` or 22 | `$ make deploy_heroku` basing on the configuration of `DEPLOY_DEFAULT`. 23 | 24 | **3. `$ make gen_deploy`** 25 | 26 | For short-cut generation and deployment: `$ make generate` and then `$ make deploy`. 27 | 28 | **4. `$ make setup_gh_pages`** 29 | 30 | For the first time only to create `$(DEPLOY_DIR)` to track `$(DEPLOY_BRANCH)`. This is used for 31 | github pages deployment. 32 | 33 | **5. `$ make setup_heroku`** 34 | 35 | For the first time only to create `$(DEPLOY_DIR_HEROKU` to track the Heroku repo's master branch. 36 | This is used for heroku deployment. 37 | 38 | **6. `$ make deploy_gh_pages`** 39 | 40 | For deploying with github pages only. 41 | 42 | **7. `$ make deploy_rsync`** 43 | 44 | For deploying with rsync only. 45 | 46 | **8. `$ make deploy_heroku`** 47 | 48 | For deploying with heroku only. 49 | 50 | 51 | Installation 52 | ------------ 53 | 54 | **1. Bash script** 55 | 56 | Just run this bash script from your root git repository project and it's enough. 57 | 58 | You need to specify the `` to your sphinx docs directory: 59 | 60 | ``` bash 61 | $ cd 62 | $ wget https://raw.github.com/teracyhq/sphinx-deployment/master/scripts/spxd.sh && chmod +x ./spxd.sh && ./spxd.sh -p 63 | ``` 64 | 65 | For example: 66 | 67 | ``` bash 68 | $ cd my_project 69 | $ wget https://raw.github.com/teracyhq/sphinx-deployment/master/scripts/spxd.sh && chmod +x ./spxd.sh && ./spxd.sh -p ./docs 70 | ``` 71 | 72 | **2. Manual** 73 | 74 | a. You need to copy these following files to your [sphinx][] directory: 75 | 76 | - `docs/requirements` 77 | - `docs/sphinx_deployment.mk` 78 | - `docs/rsync_exclude` 79 | - `docs/.deploy_heroku/*` 80 | - `docs/.gitignore` 81 | 82 | b. Include `sphinx_deployment.mk` to your `Makefile`: 83 | 84 | - Add the content below to your `Makefile`: 85 | 86 | ``` 87 | include sphinx_deployment.mk 88 | ``` 89 | 90 | - Or do with commands on terminal: 91 | 92 | ``` bash 93 | echo '' >> Makefile 94 | echo 'include sphinx_deployment.mk' >> Makefile 95 | ``` 96 | 97 | 98 | c.. To build with `travis-ci`, you need to copy these following files to your root project directory: 99 | 100 | - `.travis.yml` 101 | - `.travis/setup.sh` 102 | 103 | 104 | Configuration 105 | ------------- 106 | 107 | You need to configure these following deployment configurations following your project settings on 108 | `sphinx_deployment.mk` file. 
109 | 110 | ``` Makefile 111 | # Deployment configurations from sphinx_deployment project 112 | 113 | # default deployment when $ make deploy 114 | # deploy_gh_pages : to $ make deploy_gh_pages 115 | # deploy_rsync : to $ make deploy_rsync 116 | # deploy_heroku : to $ make deploy_heroku 117 | # deploy_gh_pages deploy_rsync deploy_heroku : to $ make deploy_gh_pages then $ make deploy_rsync 118 | # and then $ make deploy_heroku 119 | # default value: deploy_gh_pages 120 | ifndef DEPLOY_DEFAULT 121 | DEPLOY_DEFAULT = deploy_gh_pages 122 | endif 123 | 124 | # The deployment directory to be deployed 125 | ifndef DEPLOY_DIR 126 | DEPLOY_DIR = _deploy 127 | endif 128 | 129 | # The heroku deployment directory to be deployed 130 | # we must create this separated dir to avoid any conflict with _deploy (rsync and gh_pages) 131 | ifndef DEPLOY_DIR_HEROKU 132 | DEPLOY_DIR_HEROKU = _deploy_heroku 133 | endif 134 | 135 | # Copy contents from $(BUILDDIR) to $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) directory 136 | ifndef DEPLOY_HTML_DIR 137 | DEPLOY_HTML_DIR = docs 138 | endif 139 | 140 | 141 | ## -- Rsync Deploy config -- ## 142 | # Be sure your public key is listed in your server's ~/.ssh/authorized_keys file 143 | ifndef SSH_USER 144 | SSH_USER = user@domain.com 145 | endif 146 | 147 | ifndef SSH_PORT 148 | SSH_PORT = 22 149 | endif 150 | 151 | ifndef DOCUMENT_ROOT 152 | DOCUMENT_ROOT = ~/website.com/ 153 | endif 154 | 155 | #If you choose to delete on sync, rsync will create a 1:1 match 156 | ifndef RSYNC_DELETE 157 | RSYNC_DELETE = false 158 | endif 159 | 160 | # Any extra arguments to pass to rsync 161 | ifndef RSYNC_ARGS 162 | RSYNC_ARGS = 163 | endif 164 | 165 | ## -- Github Pages Deploy config -- ## 166 | 167 | # Configure the right deployment branch 168 | ifndef DEPLOY_BRANCH_GITHUB 169 | DEPLOY_BRANCH_GITHUB = gh-pages 170 | endif 171 | 172 | #if REPO_URL_GITHUB was NOT defined by travis-ci 173 | ifndef REPO_URL_GITHUB 174 | # Configure your right github project repo 175 | # REPO_URL = git@github.com:teracy-official/sphinx-deployment.git 176 | endif 177 | 178 | ## -- Heroku Deployment Config -- ## 179 | 180 | ifndef REPO_URL_HEROKU 181 | # Configure your right heroku repo 182 | # REPO_URL_HEROKU = git@heroku.com:spxd.git 183 | endif 184 | 185 | 186 | ## end deployment configuration, don't edit anything below this line ## 187 | ####################################################################### 188 | ``` 189 | 190 | Continuous Integration Build 191 | ---------------------------- 192 | 193 | **1. `travis-ci`** 194 | 195 | Move `.travis.yml` file to your root repository project, and configure it following its 196 | instruction there. There is a supported `.travis/setup.sh` to export variables for `Makefile` 197 | depending on the being-built branch. 198 | 199 | To configure secure token for `travis-ci`, please read the similar step described at 200 | http://blog.teracy.com/2013/08/03/how-to-start-blogging-easily-with-octopress-and-teracy-dev/ 201 | 202 | 203 | **2. `jenkins`** 204 | 205 | //TODO 206 | 207 | 208 | Authors and contributors 209 | ------------------------ 210 | 211 | - Hoat Le: http://github.com/hoatle 212 | 213 | - Many thanks to http://octopress.org/docs/deploying/ for inspiration. 214 | 215 | License 216 | ------- 217 | 218 | BSD License 219 | 220 | ``` 221 | Copyright (c) Teracy, Inc. and individual contributors. 222 | All rights reserved. 
223 | 224 | Redistribution and use in source and binary forms, with or without modification, 225 | are permitted provided that the following conditions are met: 226 | 227 | 1. Redistributions of source code must retain the above copyright notice, 228 | this list of conditions and the following disclaimer. 229 | 230 | 2. Redistributions in binary form must reproduce the above copyright 231 | notice, this list of conditions and the following disclaimer in the 232 | documentation and/or other materials provided with the distribution. 233 | 234 | 3. Neither the name of Teracy, Inc. nor the names of its contributors may be used 235 | to endorse or promote products derived from this software without 236 | specific prior written permission. 237 | 238 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 239 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 240 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 241 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 242 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 243 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 244 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 245 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 246 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 247 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 248 | 249 | ``` 250 | 251 | [sphinx]: http://sphinx-doc.org 252 | -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/_templates/side-primary.html: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 |

7 | Command line tools for text processing and analysis. 8 |

9 | -------------------------------------------------------------------------------- /docs/_templates/side-secondary.html: -------------------------------------------------------------------------------- 1 | 4 | 5 |

6 | Command line tools for text processing and analysis. 7 |

8 | -------------------------------------------------------------------------------- /docs/_themes/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /docs/_themes/LICENSE: -------------------------------------------------------------------------------- 1 | Modifications: 2 | 3 | Copyright (c) 2010 Kenneth Reitz. 4 | 5 | 6 | Original Project: 7 | 8 | Copyright (c) 2010 by Armin Ronacher. 9 | 10 | 11 | Some rights reserved. 12 | 13 | Redistribution and use in source and binary forms of the theme, with or 14 | without modification, are permitted provided that the following conditions 15 | are met: 16 | 17 | * Redistributions of source code must retain the above copyright 18 | notice, this list of conditions and the following disclaimer. 19 | 20 | * Redistributions in binary form must reproduce the above 21 | copyright notice, this list of conditions and the following 22 | disclaimer in the documentation and/or other materials provided 23 | with the distribution. 24 | 25 | * The names of the contributors may not be used to endorse or 26 | promote products derived from this software without specific 27 | prior written permission. 28 | 29 | We kindly ask you to only use these themes in an unmodified manner just 30 | for Flask and Flask-related products, not for unrelated projects. If you 31 | like the visual style and want to use it for your own projects, please 32 | consider making some larger changes to the themes (such as changing 33 | font faces, sizes, colors or margins). 34 | 35 | THIS THEME IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 36 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 37 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 38 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 39 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 40 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 41 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 42 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 43 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 44 | ARISING IN ANY WAY OUT OF THE USE OF THIS THEME, EVEN IF ADVISED OF THE 45 | POSSIBILITY OF SUCH DAMAGE. 46 | -------------------------------------------------------------------------------- /docs/_themes/flask_theme_support.py: -------------------------------------------------------------------------------- 1 | # flasky extensions. 
flasky pygments style based on tango style 2 | from pygments.style import Style 3 | from pygments.token import Keyword, Name, Comment, String, Error, \ 4 | Number, Operator, Generic, Whitespace, Punctuation, Other, Literal 5 | 6 | 7 | class FlaskyStyle(Style): 8 | background_color = "#f8f8f8" 9 | default_style = "" 10 | 11 | styles = { 12 | # No corresponding class for the following: 13 | #Text: "", # class: '' 14 | Whitespace: "underline #f8f8f8", # class: 'w' 15 | Error: "#a40000 border:#ef2929", # class: 'err' 16 | Other: "#000000", # class 'x' 17 | 18 | Comment: "italic #8f5902", # class: 'c' 19 | Comment.Preproc: "noitalic", # class: 'cp' 20 | 21 | Keyword: "bold #004461", # class: 'k' 22 | Keyword.Constant: "bold #004461", # class: 'kc' 23 | Keyword.Declaration: "bold #004461", # class: 'kd' 24 | Keyword.Namespace: "bold #004461", # class: 'kn' 25 | Keyword.Pseudo: "bold #004461", # class: 'kp' 26 | Keyword.Reserved: "bold #004461", # class: 'kr' 27 | Keyword.Type: "bold #004461", # class: 'kt' 28 | 29 | Operator: "#582800", # class: 'o' 30 | Operator.Word: "bold #004461", # class: 'ow' - like keywords 31 | 32 | Punctuation: "bold #000000", # class: 'p' 33 | 34 | # because special names such as Name.Class, Name.Function, etc. 35 | # are not recognized as such later in the parsing, we choose them 36 | # to look the same as ordinary variables. 37 | Name: "#000000", # class: 'n' 38 | Name.Attribute: "#c4a000", # class: 'na' - to be revised 39 | Name.Builtin: "#004461", # class: 'nb' 40 | Name.Builtin.Pseudo: "#3465a4", # class: 'bp' 41 | Name.Class: "#000000", # class: 'nc' - to be revised 42 | Name.Constant: "#000000", # class: 'no' - to be revised 43 | Name.Decorator: "#888", # class: 'nd' - to be revised 44 | Name.Entity: "#ce5c00", # class: 'ni' 45 | Name.Exception: "bold #cc0000", # class: 'ne' 46 | Name.Function: "#000000", # class: 'nf' 47 | Name.Property: "#000000", # class: 'py' 48 | Name.Label: "#f57900", # class: 'nl' 49 | Name.Namespace: "#000000", # class: 'nn' - to be revised 50 | Name.Other: "#000000", # class: 'nx' 51 | Name.Tag: "bold #004461", # class: 'nt' - like a keyword 52 | Name.Variable: "#000000", # class: 'nv' - to be revised 53 | Name.Variable.Class: "#000000", # class: 'vc' - to be revised 54 | Name.Variable.Global: "#000000", # class: 'vg' - to be revised 55 | Name.Variable.Instance: "#000000", # class: 'vi' - to be revised 56 | 57 | Number: "#990000", # class: 'm' 58 | 59 | Literal: "#000000", # class: 'l' 60 | Literal.Date: "#000000", # class: 'ld' 61 | 62 | String: "#4e9a06", # class: 's' 63 | String.Backtick: "#4e9a06", # class: 'sb' 64 | String.Char: "#4e9a06", # class: 'sc' 65 | String.Doc: "italic #8f5902", # class: 'sd' - like a comment 66 | String.Double: "#4e9a06", # class: 's2' 67 | String.Escape: "#4e9a06", # class: 'se' 68 | String.Heredoc: "#4e9a06", # class: 'sh' 69 | String.Interpol: "#4e9a06", # class: 'si' 70 | String.Other: "#4e9a06", # class: 'sx' 71 | String.Regex: "#4e9a06", # class: 'sr' 72 | String.Single: "#4e9a06", # class: 's1' 73 | String.Symbol: "#4e9a06", # class: 'ss' 74 | 75 | Generic: "#000000", # class: 'g' 76 | Generic.Deleted: "#a40000", # class: 'gd' 77 | Generic.Emph: "italic #000000", # class: 'ge' 78 | Generic.Error: "#ef2929", # class: 'gr' 79 | Generic.Heading: "bold #000080", # class: 'gh' 80 | Generic.Inserted: "#00A000", # class: 'gi' 81 | Generic.Output: "#888", # class: 'go' 82 | Generic.Prompt: "#745334", # class: 'gp' 83 | Generic.Strong: "bold #000000", # class: 'gs' 84 | Generic.Subheading: "bold #800080", # 
class: 'gu' 85 | Generic.Traceback: "bold #a40000", # class: 'gt' 86 | } 87 | -------------------------------------------------------------------------------- /docs/_themes/kr/layout.html: -------------------------------------------------------------------------------- 1 | {%- extends "basic/layout.html" %} 2 | {%- block extrahead %} 3 | {{ super() }} 4 | {% if theme_touch_icon %} 5 | 6 | {% endif %} 7 | 8 | {% endblock %} 9 | {%- block relbar2 %}{% endblock %} 10 | {%- block footer %} 11 | 14 | 15 | Fork me on GitHub 16 | 17 | 18 | {%- endblock %} 19 | -------------------------------------------------------------------------------- /docs/_themes/kr/relations.html: -------------------------------------------------------------------------------- 1 |

Related Topics

2 | 20 | -------------------------------------------------------------------------------- /docs/_themes/kr/static/flasky.css_t: -------------------------------------------------------------------------------- 1 | /* 2 | * flasky.css_t 3 | * ~~~~~~~~~~~~ 4 | * 5 | * :copyright: Copyright 2010 by Armin Ronacher. Modifications by Kenneth Reitz. 6 | * :license: Flask Design License, see LICENSE for details. 7 | */ 8 | 9 | {% set page_width = '940px' %} 10 | {% set sidebar_width = '220px' %} 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: 'goudy old style', 'minion pro', 'bell mt', Georgia, 'Hiragino Mincho Pro'; 18 | font-size: 17px; 19 | background-color: white; 20 | color: #000; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.document { 26 | width: {{ page_width }}; 27 | margin: 30px auto 0 auto; 28 | } 29 | 30 | div.documentwrapper { 31 | float: left; 32 | width: 100%; 33 | } 34 | 35 | div.bodywrapper { 36 | margin: 0 0 0 {{ sidebar_width }}; 37 | } 38 | 39 | div.sphinxsidebar { 40 | width: {{ sidebar_width }}; 41 | } 42 | 43 | hr { 44 | border: 1px solid #B1B4B6; 45 | } 46 | 47 | div.body { 48 | background-color: #ffffff; 49 | color: #3E4349; 50 | padding: 0 30px 0 30px; 51 | } 52 | 53 | img.floatingflask { 54 | padding: 0 0 10px 10px; 55 | float: right; 56 | } 57 | 58 | div.footer { 59 | width: {{ page_width }}; 60 | margin: 20px auto 30px auto; 61 | font-size: 14px; 62 | color: #888; 63 | text-align: right; 64 | } 65 | 66 | div.footer a { 67 | color: #888; 68 | } 69 | 70 | div.related { 71 | display: none; 72 | } 73 | 74 | div.sphinxsidebar a { 75 | color: #444; 76 | text-decoration: none; 77 | border-bottom: 1px dotted #999; 78 | } 79 | 80 | div.sphinxsidebar a:hover { 81 | border-bottom: 1px solid #999; 82 | } 83 | 84 | div.sphinxsidebar { 85 | font-size: 14px; 86 | line-height: 1.5; 87 | } 88 | 89 | div.sphinxsidebarwrapper { 90 | padding: 18px 10px; 91 | } 92 | 93 | div.sphinxsidebarwrapper p.logo { 94 | padding: 0; 95 | margin: -10px 0 0 -20px; 96 | text-align: center; 97 | } 98 | 99 | div.sphinxsidebar h3, 100 | div.sphinxsidebar h4 { 101 | font-family: 'Garamond', 'Georgia', serif; 102 | color: #444; 103 | font-size: 24px; 104 | font-weight: normal; 105 | margin: 0 0 5px 0; 106 | padding: 0; 107 | } 108 | 109 | div.sphinxsidebar h4 { 110 | font-size: 20px; 111 | } 112 | 113 | div.sphinxsidebar h3 a { 114 | color: #444; 115 | } 116 | 117 | div.sphinxsidebar p.logo a, 118 | div.sphinxsidebar h3 a, 119 | div.sphinxsidebar p.logo a:hover, 120 | div.sphinxsidebar h3 a:hover { 121 | border: none; 122 | } 123 | 124 | div.sphinxsidebar p { 125 | color: #555; 126 | margin: 10px 0; 127 | } 128 | 129 | div.sphinxsidebar ul { 130 | margin: 10px 0; 131 | padding: 0; 132 | color: #000; 133 | } 134 | 135 | div.sphinxsidebar input { 136 | border: 1px solid #ccc; 137 | font-family: 'Georgia', serif; 138 | font-size: 1em; 139 | } 140 | 141 | /* -- body styles ----------------------------------------------------------- */ 142 | 143 | a { 144 | color: #004B6B; 145 | text-decoration: underline; 146 | } 147 | 148 | a:hover { 149 | color: #6D4100; 150 | text-decoration: underline; 151 | } 152 | 153 | div.body h1, 154 | div.body h2, 155 | div.body h3, 156 | div.body h4, 157 | div.body h5, 158 | div.body h6 { 159 | font-family: 'Garamond', 'Georgia', serif; 160 | font-weight: normal; 161 | margin: 30px 0px 10px 0px; 162 | padding: 0; 163 | } 164 | 165 | div.body h1 { margin-top: 0; 
padding-top: 0; font-size: 240%; } 166 | div.body h2 { font-size: 180%; } 167 | div.body h3 { font-size: 150%; } 168 | div.body h4 { font-size: 130%; } 169 | div.body h5 { font-size: 100%; } 170 | div.body h6 { font-size: 100%; } 171 | 172 | a.headerlink { 173 | color: #ddd; 174 | padding: 0 4px; 175 | text-decoration: none; 176 | } 177 | 178 | a.headerlink:hover { 179 | color: #444; 180 | background: #eaeaea; 181 | } 182 | 183 | div.body p, div.body dd, div.body li { 184 | line-height: 1.4em; 185 | } 186 | 187 | div.admonition { 188 | background: #fafafa; 189 | margin: 20px -30px; 190 | padding: 10px 30px; 191 | border-top: 1px solid #ccc; 192 | border-bottom: 1px solid #ccc; 193 | } 194 | 195 | div.admonition tt.xref, div.admonition a tt { 196 | border-bottom: 1px solid #fafafa; 197 | } 198 | 199 | dd div.admonition { 200 | margin-left: -60px; 201 | padding-left: 60px; 202 | } 203 | 204 | div.admonition p.admonition-title { 205 | font-family: 'Garamond', 'Georgia', serif; 206 | font-weight: normal; 207 | font-size: 24px; 208 | margin: 0 0 10px 0; 209 | padding: 0; 210 | line-height: 1; 211 | } 212 | 213 | div.admonition p.last { 214 | margin-bottom: 0; 215 | } 216 | 217 | div.highlight { 218 | background-color: white; 219 | } 220 | 221 | dt:target, .highlight { 222 | background: #FAF3E8; 223 | } 224 | 225 | div.note { 226 | background-color: #eee; 227 | border: 1px solid #ccc; 228 | } 229 | 230 | div.seealso { 231 | background-color: #ffc; 232 | border: 1px solid #ff6; 233 | } 234 | 235 | div.topic { 236 | background-color: #eee; 237 | } 238 | 239 | p.admonition-title { 240 | display: inline; 241 | } 242 | 243 | p.admonition-title:after { 244 | content: ":"; 245 | } 246 | 247 | pre, tt { 248 | font-family: 'Consolas', 'Menlo', 'Deja Vu Sans Mono', 'Bitstream Vera Sans Mono', monospace; 249 | font-size: 0.9em; 250 | } 251 | 252 | img.screenshot { 253 | } 254 | 255 | tt.descname, tt.descclassname { 256 | font-size: 0.95em; 257 | } 258 | 259 | tt.descname { 260 | padding-right: 0.08em; 261 | } 262 | 263 | img.screenshot { 264 | -moz-box-shadow: 2px 2px 4px #eee; 265 | -webkit-box-shadow: 2px 2px 4px #eee; 266 | box-shadow: 2px 2px 4px #eee; 267 | } 268 | 269 | table.docutils { 270 | border: 1px solid #888; 271 | -moz-box-shadow: 2px 2px 4px #eee; 272 | -webkit-box-shadow: 2px 2px 4px #eee; 273 | box-shadow: 2px 2px 4px #eee; 274 | } 275 | 276 | table.docutils td, table.docutils th { 277 | border: 1px solid #888; 278 | padding: 0.25em 0.7em; 279 | } 280 | 281 | table.field-list, table.footnote { 282 | border: none; 283 | -moz-box-shadow: none; 284 | -webkit-box-shadow: none; 285 | box-shadow: none; 286 | } 287 | 288 | table.footnote { 289 | margin: 15px 0; 290 | width: 100%; 291 | border: 1px solid #eee; 292 | background: #fdfdfd; 293 | font-size: 0.9em; 294 | } 295 | 296 | table.footnote + table.footnote { 297 | margin-top: -15px; 298 | border-top: none; 299 | } 300 | 301 | table.field-list th { 302 | padding: 0 0.8em 0 0; 303 | } 304 | 305 | table.field-list td { 306 | padding: 0; 307 | } 308 | 309 | table.footnote td.label { 310 | width: 0px; 311 | padding: 0.3em 0 0.3em 0.5em; 312 | } 313 | 314 | table.footnote td { 315 | padding: 0.3em 0.5em; 316 | } 317 | 318 | dl { 319 | margin: 0; 320 | padding: 0; 321 | } 322 | 323 | dl dd { 324 | margin-left: 30px; 325 | } 326 | 327 | blockquote { 328 | margin: 0 0 0 30px; 329 | padding: 0; 330 | } 331 | 332 | ul, ol { 333 | margin: 10px 0 10px 30px; 334 | padding: 0; 335 | } 336 | 337 | pre { 338 | background: #eee; 339 | padding: 7px 30px; 340 | 
margin: 15px -30px; 341 | line-height: 1.3em; 342 | } 343 | 344 | dl pre, blockquote pre, li pre { 345 | margin-left: -60px; 346 | padding-left: 60px; 347 | } 348 | 349 | dl dl pre { 350 | margin-left: -90px; 351 | padding-left: 90px; 352 | } 353 | 354 | tt { 355 | background-color: #ecf0f3; 356 | color: #222; 357 | /* padding: 1px 2px; */ 358 | } 359 | 360 | tt.xref, a tt { 361 | background-color: #FBFBFB; 362 | border-bottom: 1px solid white; 363 | } 364 | 365 | a.reference { 366 | text-decoration: none; 367 | border-bottom: 1px dotted #004B6B; 368 | } 369 | 370 | a.reference:hover { 371 | border-bottom: 1px solid #6D4100; 372 | } 373 | 374 | a.footnote-reference { 375 | text-decoration: none; 376 | font-size: 0.7em; 377 | vertical-align: top; 378 | border-bottom: 1px dotted #004B6B; 379 | } 380 | 381 | a.footnote-reference:hover { 382 | border-bottom: 1px solid #6D4100; 383 | } 384 | 385 | a:hover tt { 386 | background: #EEE; 387 | } 388 | 389 | 390 | @media screen and (max-width: 870px) { 391 | 392 | div.sphinxsidebar { 393 | display: none; 394 | } 395 | 396 | div.document { 397 | width: 100%; 398 | 399 | } 400 | 401 | div.documentwrapper { 402 | margin-left: 0; 403 | margin-top: 0; 404 | margin-right: 0; 405 | margin-bottom: 0; 406 | } 407 | 408 | div.bodywrapper { 409 | margin-top: 0; 410 | margin-right: 0; 411 | margin-bottom: 0; 412 | margin-left: 0; 413 | } 414 | 415 | ul { 416 | margin-left: 0; 417 | } 418 | 419 | .document { 420 | width: auto; 421 | } 422 | 423 | .footer { 424 | width: auto; 425 | } 426 | 427 | .bodywrapper { 428 | margin: 0; 429 | } 430 | 431 | .footer { 432 | width: auto; 433 | } 434 | 435 | .github { 436 | display: none; 437 | } 438 | 439 | 440 | 441 | } 442 | 443 | 444 | 445 | @media screen and (max-width: 875px) { 446 | 447 | body { 448 | margin: 0; 449 | padding: 20px 30px; 450 | } 451 | 452 | div.documentwrapper { 453 | float: none; 454 | background: white; 455 | } 456 | 457 | div.sphinxsidebar { 458 | display: block; 459 | float: none; 460 | width: 102.5%; 461 | margin: 50px -30px -20px -30px; 462 | padding: 10px 20px; 463 | background: #333; 464 | color: white; 465 | } 466 | 467 | div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p, 468 | div.sphinxsidebar h3 a { 469 | color: white; 470 | } 471 | 472 | div.sphinxsidebar a { 473 | color: #aaa; 474 | } 475 | 476 | div.sphinxsidebar p.logo { 477 | display: none; 478 | } 479 | 480 | div.document { 481 | width: 100%; 482 | margin: 0; 483 | } 484 | 485 | div.related { 486 | display: block; 487 | margin: 0; 488 | padding: 10px 0 20px 0; 489 | } 490 | 491 | div.related ul, 492 | div.related ul li { 493 | margin: 0; 494 | padding: 0; 495 | } 496 | 497 | div.footer { 498 | display: none; 499 | } 500 | 501 | div.bodywrapper { 502 | margin: 0; 503 | } 504 | 505 | div.body { 506 | min-height: 0; 507 | padding: 0; 508 | } 509 | 510 | .rtd_doc_footer { 511 | display: none; 512 | } 513 | 514 | .document { 515 | width: auto; 516 | } 517 | 518 | .footer { 519 | width: auto; 520 | } 521 | 522 | .footer { 523 | width: auto; 524 | } 525 | 526 | .github { 527 | display: none; 528 | } 529 | } 530 | 531 | 532 | /* misc. 
*/ 533 | 534 | .revsys-inline { 535 | display: none!important; 536 | } 537 | 538 | div.sphinxsidebar a.flattr-button { 539 | text-decoration: none; 540 | border-bottom: none; 541 | } -------------------------------------------------------------------------------- /docs/_themes/kr/static/small_flask.css: -------------------------------------------------------------------------------- 1 | /* 2 | * small_flask.css_t 3 | * ~~~~~~~~~~~~~~~~~ 4 | * 5 | * :copyright: Copyright 2010 by Armin Ronacher. 6 | * :license: Flask Design License, see LICENSE for details. 7 | */ 8 | 9 | body { 10 | margin: 0; 11 | padding: 20px 30px; 12 | } 13 | 14 | div.documentwrapper { 15 | float: none; 16 | background: white; 17 | } 18 | 19 | div.sphinxsidebar { 20 | display: block; 21 | float: none; 22 | width: 102.5%; 23 | margin: 50px -30px -20px -30px; 24 | padding: 10px 20px; 25 | background: #333; 26 | color: white; 27 | } 28 | 29 | div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p, 30 | div.sphinxsidebar h3 a { 31 | color: white; 32 | } 33 | 34 | div.sphinxsidebar a { 35 | color: #aaa; 36 | } 37 | 38 | div.sphinxsidebar p.logo { 39 | display: none; 40 | } 41 | 42 | div.document { 43 | width: 100%; 44 | margin: 0; 45 | } 46 | 47 | div.related { 48 | display: block; 49 | margin: 0; 50 | padding: 10px 0 20px 0; 51 | } 52 | 53 | div.related ul, 54 | div.related ul li { 55 | margin: 0; 56 | padding: 0; 57 | } 58 | 59 | div.footer { 60 | display: none; 61 | } 62 | 63 | div.bodywrapper { 64 | margin: 0; 65 | } 66 | 67 | div.body { 68 | min-height: 0; 69 | padding: 0; 70 | } 71 | 72 | .rtd_doc_footer { 73 | display: none; 74 | } 75 | 76 | .document { 77 | width: auto; 78 | } 79 | 80 | .footer { 81 | width: auto; 82 | } 83 | 84 | .footer { 85 | width: auto; 86 | } 87 | 88 | .github { 89 | display: none; 90 | } 91 | 92 | img { 93 | border: 0px 0px; 94 | } -------------------------------------------------------------------------------- /docs/_themes/kr/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = basic 3 | stylesheet = flasky.css 4 | pygments_style = flask_theme_support.FlaskyStyle 5 | 6 | [options] 7 | touch_icon = 8 | -------------------------------------------------------------------------------- /docs/_themes/kr_small/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "basic/layout.html" %} 2 | {% block header %} 3 | {{ super() }} 4 | {% if pagename == 'index' %} 5 |
6 | {% endif %} 7 | {% endblock %} 8 | {% block footer %} 9 | {% if pagename == 'index' %} 10 |
11 | {% endif %} 12 | {% endblock %} 13 | {# do not display relbars #} 14 | {% block relbar1 %}{% endblock %} 15 | {% block relbar2 %} 16 | {% if theme_github_fork %} 17 | Fork me on GitHub 19 | {% endif %} 20 | {% endblock %} 21 | {% block sidebar1 %}{% endblock %} 22 | {% block sidebar2 %}{% endblock %} 23 | -------------------------------------------------------------------------------- /docs/_themes/kr_small/static/flasky.css_t: -------------------------------------------------------------------------------- 1 | /* 2 | * flasky.css_t 3 | * ~~~~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- flasky theme based on nature theme. 6 | * 7 | * :copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: 'Georgia', serif; 18 | font-size: 17px; 19 | color: #000; 20 | background: white; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.documentwrapper { 26 | float: left; 27 | width: 100%; 28 | } 29 | 30 | div.bodywrapper { 31 | margin: 40px auto 0 auto; 32 | width: 700px; 33 | } 34 | 35 | hr { 36 | border: 1px solid #B1B4B6; 37 | } 38 | 39 | div.body { 40 | background-color: #ffffff; 41 | color: #3E4349; 42 | padding: 0 30px 30px 30px; 43 | } 44 | 45 | img.floatingflask { 46 | padding: 0 0 10px 10px; 47 | float: right; 48 | } 49 | 50 | div.footer { 51 | text-align: right; 52 | color: #888; 53 | padding: 10px; 54 | font-size: 14px; 55 | width: 650px; 56 | margin: 0 auto 40px auto; 57 | } 58 | 59 | div.footer a { 60 | color: #888; 61 | text-decoration: underline; 62 | } 63 | 64 | div.related { 65 | line-height: 32px; 66 | color: #888; 67 | } 68 | 69 | div.related ul { 70 | padding: 0 0 0 10px; 71 | } 72 | 73 | div.related a { 74 | color: #444; 75 | } 76 | 77 | /* -- body styles ----------------------------------------------------------- */ 78 | 79 | a { 80 | color: #004B6B; 81 | text-decoration: underline; 82 | } 83 | 84 | a:hover { 85 | color: #6D4100; 86 | text-decoration: underline; 87 | } 88 | 89 | div.body { 90 | padding-bottom: 40px; /* saved for footer */ 91 | } 92 | 93 | div.body h1, 94 | div.body h2, 95 | div.body h3, 96 | div.body h4, 97 | div.body h5, 98 | div.body h6 { 99 | font-family: 'Garamond', 'Georgia', serif; 100 | font-weight: normal; 101 | margin: 30px 0px 10px 0px; 102 | padding: 0; 103 | } 104 | 105 | {% if theme_index_logo %} 106 | div.indexwrapper h1 { 107 | text-indent: -999999px; 108 | background: url({{ theme_index_logo }}) no-repeat center center; 109 | height: {{ theme_index_logo_height }}; 110 | } 111 | {% endif %} 112 | 113 | div.body h2 { font-size: 180%; } 114 | div.body h3 { font-size: 150%; } 115 | div.body h4 { font-size: 130%; } 116 | div.body h5 { font-size: 100%; } 117 | div.body h6 { font-size: 100%; } 118 | 119 | a.headerlink { 120 | color: white; 121 | padding: 0 4px; 122 | text-decoration: none; 123 | } 124 | 125 | a.headerlink:hover { 126 | color: #444; 127 | background: #eaeaea; 128 | } 129 | 130 | div.body p, div.body dd, div.body li { 131 | line-height: 1.4em; 132 | } 133 | 134 | div.admonition { 135 | background: #fafafa; 136 | margin: 20px -30px; 137 | padding: 10px 30px; 138 | border-top: 1px solid #ccc; 139 | border-bottom: 1px solid #ccc; 140 | } 141 | 142 | div.admonition p.admonition-title { 143 | font-family: 'Garamond', 'Georgia', serif; 144 | font-weight: normal; 145 | font-size: 24px; 146 | margin: 0 0 10px 0; 147 | 
padding: 0; 148 | line-height: 1; 149 | } 150 | 151 | div.admonition p.last { 152 | margin-bottom: 0; 153 | } 154 | 155 | div.highlight{ 156 | background-color: white; 157 | } 158 | 159 | dt:target, .highlight { 160 | background: #FAF3E8; 161 | } 162 | 163 | div.note { 164 | background-color: #eee; 165 | border: 1px solid #ccc; 166 | } 167 | 168 | div.seealso { 169 | background-color: #ffc; 170 | border: 1px solid #ff6; 171 | } 172 | 173 | div.topic { 174 | background-color: #eee; 175 | } 176 | 177 | div.warning { 178 | background-color: #ffe4e4; 179 | border: 1px solid #f66; 180 | } 181 | 182 | p.admonition-title { 183 | display: inline; 184 | } 185 | 186 | p.admonition-title:after { 187 | content: ":"; 188 | } 189 | 190 | pre, tt { 191 | font-family: 'Consolas', 'Menlo', 'Deja Vu Sans Mono', 'Bitstream Vera Sans Mono', monospace; 192 | font-size: 0.85em; 193 | } 194 | 195 | img.screenshot { 196 | } 197 | 198 | tt.descname, tt.descclassname { 199 | font-size: 0.95em; 200 | } 201 | 202 | tt.descname { 203 | padding-right: 0.08em; 204 | } 205 | 206 | img.screenshot { 207 | -moz-box-shadow: 2px 2px 4px #eee; 208 | -webkit-box-shadow: 2px 2px 4px #eee; 209 | box-shadow: 2px 2px 4px #eee; 210 | } 211 | 212 | table.docutils { 213 | border: 1px solid #888; 214 | -moz-box-shadow: 2px 2px 4px #eee; 215 | -webkit-box-shadow: 2px 2px 4px #eee; 216 | box-shadow: 2px 2px 4px #eee; 217 | } 218 | 219 | table.docutils td, table.docutils th { 220 | border: 1px solid #888; 221 | padding: 0.25em 0.7em; 222 | } 223 | 224 | table.field-list, table.footnote { 225 | border: none; 226 | -moz-box-shadow: none; 227 | -webkit-box-shadow: none; 228 | box-shadow: none; 229 | } 230 | 231 | table.footnote { 232 | margin: 15px 0; 233 | width: 100%; 234 | border: 1px solid #eee; 235 | } 236 | 237 | table.field-list th { 238 | padding: 0 0.8em 0 0; 239 | } 240 | 241 | table.field-list td { 242 | padding: 0; 243 | } 244 | 245 | table.footnote td { 246 | padding: 0.5em; 247 | } 248 | 249 | dl { 250 | margin: 0; 251 | padding: 0; 252 | } 253 | 254 | dl dd { 255 | margin-left: 30px; 256 | } 257 | 258 | pre { 259 | padding: 0; 260 | margin: 15px -30px; 261 | padding: 8px; 262 | line-height: 1.3em; 263 | padding: 7px 30px; 264 | background: #eee; 265 | border-radius: 2px; 266 | -moz-border-radius: 2px; 267 | -webkit-border-radius: 2px; 268 | } 269 | 270 | dl pre { 271 | margin-left: -60px; 272 | padding-left: 60px; 273 | } 274 | 275 | tt { 276 | background-color: #ecf0f3; 277 | color: #222; 278 | /* padding: 1px 2px; */ 279 | } 280 | 281 | tt.xref, a tt { 282 | background-color: #FBFBFB; 283 | } 284 | 285 | a:hover tt { 286 | background: #EEE; 287 | } 288 | -------------------------------------------------------------------------------- /docs/_themes/kr_small/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = basic 3 | stylesheet = flasky.css 4 | nosidebar = true 5 | pygments_style = flask_theme_support.FlaskyStyle 6 | 7 | [options] 8 | index_logo = '' 9 | index_logo_height = 120px 10 | github_fork = '' 11 | -------------------------------------------------------------------------------- /docs/build_script_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import subprocess 6 | 7 | 8 | def create_docs(command, path): 9 | proc = subprocess.Popen(["textkit", command, "--help"], stdout=subprocess.PIPE) 10 | out = str(proc.communicate()[0]).split('\\n') 11 | # 
print(out) 12 | content = "=" * len(command) + "\n" 13 | content += command + "\n" 14 | content += "=" * len(command) + "\n" 15 | 16 | content += "\n" 17 | content += "Description\n" 18 | content += "=" * len("Description") + "\n" 19 | content += "\n" 20 | content += "::\n" 21 | content += "\n" 22 | for line in out: 23 | content += " " + line.replace("b'", '').replace("'", '') + "\n" 24 | content += "\n" 25 | content += "\n" 26 | content += "Examples\n" 27 | content += "=" * len("Examples") + "\n" 28 | print(content) 29 | 30 | with open(path, 'w') as f: 31 | f.write(content) 32 | 33 | proc = subprocess.Popen(["textkit", "--help"], stdout=subprocess.PIPE) 34 | out = str(proc.communicate()[0]).split('\\n') 35 | 36 | commands = [] 37 | 38 | at_commands = False 39 | for line in out: 40 | if at_commands: 41 | command = line.split()[0] 42 | if(len(command) > 3): 43 | commands.append(command) 44 | else: 45 | if "Commands:" in line: 46 | at_commands = True 47 | 48 | 49 | print(commands) 50 | 51 | 52 | path = os.path.dirname(os.path.realpath(__file__)) 53 | doc_path = os.path.join(path, "scripts") 54 | print(doc_path) 55 | 56 | for command in commands: 57 | command_doc_path = os.path.join(doc_path, command + ".rst") 58 | if not os.path.isfile(command_doc_path): 59 | print('creating: ' + command_doc_path) 60 | create_docs(command, command_doc_path) 61 | else: 62 | print('skipping: ' + command_doc_path) 63 | -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Usage 3 | ===== 4 | 5 | textkit is composed of command-line tools that can be divided into four major categories: Tokenization, Transformation, Filtering, and Packaging. Documentation and examples for each tool are described on the following pages. 6 | 7 | Tokenization 8 | ============ 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | 13 | scripts/text2words 14 | scripts/text2ngrams 15 | scripts/text2punc 16 | scripts/text2sentences 17 | scripts/words2bigrams 18 | scripts/words2ngrams 19 | 20 | Transformation 21 | ============== 22 | 23 | .. toctree:: 24 | :maxdepth: 1 25 | 26 | scripts/tokens2lower 27 | scripts/tokens2upper 28 | scripts/tokens2stem 29 | scripts/tokens2counts 30 | scripts/tokens2pos 31 | scripts/tokens2topbigrams 32 | scripts/transliterate 33 | 34 | 35 | Filter 36 | ====== 37 | 38 | .. toctree:: 39 | :maxdepth: 1 40 | 41 | scripts/filterlengths 42 | scripts/filterpunc 43 | scripts/filterwords 44 | 45 | 46 | Package 47 | ======= 48 | 49 | .. toctree:: 50 | :maxdepth: 1 51 | 52 | scripts/texts2json 53 | scripts/tokens2json 54 | scripts/tokens2text 55 | 56 | Misc 57 | ==== 58 | 59 | .. toctree:: 60 | :maxdepth: 1 61 | 62 | scripts/download 63 | scripts/nonewlines 64 | scripts/showstops 65 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime as dt 3 | import os 4 | import sys 5 | 6 | # If extensions (or modules to document with autodoc) are in another directory, 7 | # add these directories to sys.path here. If the directory is relative to the 8 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
9 | sys.path.insert(0, os.path.abspath('..')) 10 | import textkit 11 | sys.path.append(os.path.abspath("_themes")) 12 | 13 | # -- General configuration ----------------------------------------------------- 14 | 15 | # Add any Sphinx extension module names here, as strings. They can be extensions 16 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 17 | extensions = [ 18 | 'sphinx.ext.autodoc', 19 | 'sphinx.ext.doctest', 20 | 'sphinx.ext.viewcode' 21 | ] 22 | 23 | primary_domain = 'py' 24 | default_role = 'py:obj' 25 | 26 | issues_github_path = 'learntextvis/textkit' 27 | 28 | # Add any paths that contain templates here, relative to this directory. 29 | templates_path = ['_templates'] 30 | 31 | # The suffix of source filenames. 32 | source_suffix = '.rst' 33 | 34 | # The master toctree document. 35 | master_doc = 'index' 36 | 37 | # General information about the project. 38 | project = u'textkit' 39 | copyright = u'{0:%Y} LearnTextVis'.format( 40 | dt.datetime.utcnow() 41 | ) 42 | 43 | # The version info for the project you're documenting, acts as replacement for 44 | # |version| and |release|, also used in various other places throughout the 45 | # built documents. 46 | # 47 | # The short X.Y version. 48 | # version = release = textkit.__version__ 49 | version = release = '0.0.1' 50 | exclude_patterns = ['_build'] 51 | pygments_style = 'flask_theme_support.FlaskyStyle' 52 | html_theme = 'kr' 53 | html_theme_path = ['_themes'] 54 | 55 | html_static_path = ['_static'] 56 | 57 | # Custom sidebar templates, maps document names to template names. 58 | html_sidebars = { 59 | 'index': ['side-primary.html'], 60 | '**': ['side-secondary.html', 'localtoc.html', 61 | 'relations.html'] 62 | } 63 | # Output file base name for HTML help builder. 64 | htmlhelp_basename = 'textkitdoc' 65 | 66 | 67 | # -- Options for LaTeX output -------------------------------------------------- 68 | 69 | # Grouping the document tree into LaTeX files. List of tuples 70 | # (source start file, target name, title, author, documentclass [howto/manual]). 71 | latex_documents = [ 72 | ('index', 'textkit.tex', u'textkit Documentation', 73 | u'LearnTextVis', 'manual'), 74 | ] 75 | 76 | # One entry per manual page. List of tuples 77 | # (source start file, name, description, authors, manual section). 78 | man_pages = [ 79 | ('index', 'textkit', u'textkit Documentation', 80 | [u'LearnTextVis'], 1) 81 | ] 82 | # -- Options for Texinfo output ------------------------------------------------ 83 | 84 | # Grouping the document tree into Texinfo files. List of tuples 85 | # (source start file, target name, title, author, 86 | # dir menu entry, description, category) 87 | texinfo_documents = [ 88 | ('index', 'textkit', u'textkit Documentation', 89 | u'LearnTextVis', 'textkit', 'Command-line text-processing.', 90 | 'Natural Language Processing'), 91 | ] 92 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. _contributing: 2 | 3 | Contributing 4 | ============ 5 | 6 | If you are interested in contributing to textkit we would love your help! 7 | 8 | Here is a bit more about the structure of the codebase and how to contribute. 9 | 10 | Code Structure 11 | -------------- 12 | 13 | Each command is implemented in its own file. 
These command files are organized into 14 | sub-directories: 15 | 16 | * tokenize 17 | * transform 18 | * filter 19 | * package 20 | 21 | The use of these sub-directories is primarily for developer convenience and commands 22 | can be moved around if a better structure is found. 23 | 24 | Commands 25 | -------- 26 | 27 | textkit uses `Click `_. to handle command line arguments 28 | and inputs. Click uses decorators to define these arguments and options in a succinct way. 29 | 30 | textkit strives to use text as an input and text as an output. Raw text can be processed 31 | using commands that start with ``text2`` like ``text2words``. 32 | 33 | Token documents (text files with a token on each line) can be used and produced by 34 | commands that include ``words`` in the name. 35 | 36 | Transformation functions that work on tokens should start with ``tokens2``, 37 | as in ``tokens2counts``. 38 | 39 | Utilities 40 | --------- 41 | 42 | There is a very small set of utility functions that are useful when writing textkit commands. 43 | 44 | These are contained in the ``utils.py`` file. Some that you might find helpful: 45 | 46 | ``read_tokens`` will convert a token document into a list of tokens. Use this to process the 47 | input file if your input is a token document. 48 | 49 | ``output`` is a light wrapper around the output capabilities of Click that prevents 50 | error messages if the command is exited early (like when piping to ``head``). 51 | 52 | ``write_csv`` is handy for when multiple columns of data are being output. 53 | 54 | Writing New Commands 55 | -------------------- 56 | 57 | Want to contribute a new command? Great! 58 | 59 | textkit uses GitHub Pull Requests to incorporate other developers' work. 60 | 61 | Fork the repo and then create a branch for your new command. Create and test it, 62 | then submit a Pull Request. 63 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | textkit: Command line text processing 2 | ====================================== 3 | 4 | *textkit* is a set of command line tools for text processing and analysis. 5 | 6 | You can use it to do basic natural language processing from the command line. 7 | 8 | Features 9 | -------- 10 | 11 | - Simple tools that can be combined to do fun stuff with text as data. 12 | - A Unix-style approach that promotes piping together commands to produce more complex processes. 13 | - Documentation is built in using `--help` on commands. 14 | 15 | 16 | Get it now 17 | ---------- 18 | 19 | textkit can be easily installed using pip: 20 | 21 | :: 22 | 23 | $ pip install -U textkit 24 | 25 | Try it out 26 | ---------- 27 | 28 | :: 29 | 30 | # install necessary data files 31 | $ textkit download 32 | 33 | # show all commands 34 | $ textkit --help 35 | 36 | Check out the :ref:`Quickstart guide ` to learn more about textkit's features. 37 | 38 | 39 | Guide 40 | ===== 41 | 42 | .. toctree:: 43 | :maxdepth: 2 44 | 45 | install 46 | quickstart 47 | cli 48 | contributing 49 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | ..
_install: 2 | 3 | Installation 4 | ============ 5 | 6 | Using pip 7 | +++++++++ 8 | 9 | Most users should be able to install textkit easily using pip: 10 | 11 | ``pip install -U textkit`` 12 | 13 | To ensure you have all the data files needed to run all the commands, you should then run: 14 | 15 | ``textkit download`` 16 | 17 | This will download some files that NLTK (a dependency of textkit) needs for certain commands. 18 | 19 | From Source 20 | +++++++++++ 21 | 22 | Textkit is developed and maintained on github, so building from source is also easy. 23 | 24 | First clone the repo: 25 | 26 | ``git clone git@github.com:learntextvis/textkit.git`` 27 | 28 | Then navigate to the `textkit` directory to install its requirements 29 | 30 | .. doctest:: 31 | 32 | cd textkit 33 | pip install -r requirements.txt 34 | 35 | Finally, install the local version of textkit using the `--editable` flag: 36 | 37 | .. doctest:: 38 | 39 | pip install --editable . 40 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Tutorial: Quickstart 4 | ==================== 5 | 6 | .. module:: textkit.cli 7 | 8 | Let's say we have a very short piece of text stored in ``input.txt``. It looks something like: 9 | 10 | .. doctest:: 11 | 12 | Mrs. Bennet deigned not to make any reply, but, unable to contain 13 | herself, began scolding one of her daughters. 14 | 15 | What are some of the tools in textkit that we can use on this text? 16 | 17 | Convert Text to Tokens 18 | ------------------------ 19 | 20 | Tokenization is the process of turning text into chunks of text. 21 | These chunks can be sentences, words, or even sections of words. 22 | 23 | Textkit converts a text file into a **token document** - where each line has one token per line. 24 | 25 | .. doctest:: 26 | 27 | textkit text2words input.txt 28 | 29 | This command converts our input.txt text file into a token document where each token is a word. 30 | 31 | The output would look something like: 32 | 33 | .. doctest:: 34 | 35 | Mrs. 36 | Bennet 37 | deigned 38 | not 39 | to 40 | make 41 | any 42 | reply 43 | , 44 | but 45 | , 46 | unable 47 | to 48 | contain 49 | herself 50 | , 51 | began 52 | scolding 53 | one 54 | of 55 | her 56 | daughters 57 | . 58 | 59 | 60 | This is typically the first thing we want to do when using textkit, as textkit is all about working with tokens. 61 | 62 | The output by default goes to standard out. You can redirect to a file by using ``>``. 63 | 64 | .. doctest:: 65 | 66 | textkit text2words input.txt > words.txt 67 | 68 | This would put our words into ``words.txt``. 69 | 70 | We can also get **bigrams** (two word tokens). 71 | 72 | .. doctest:: 73 | 74 | textkit text2words input.txt | textkit words2bigrams > bigrams.txt 75 | 76 | Here we first convert the text to word tokens and use that as the input for the bigram tokenization. 77 | 78 | The contents of ``bigrams.txt`` would look like: 79 | 80 | .. doctest:: 81 | 82 | Mrs. Bennet 83 | Bennet deigned 84 | deigned not 85 | not to 86 | to make 87 | make any 88 | any reply 89 | reply , 90 | , but 91 | but , 92 | , unable 93 | unable to 94 | to contain 95 | contain herself 96 | herself , 97 | , began 98 | began scolding 99 | scolding one 100 | one of 101 | of her 102 | her daughters 103 | daughters . 104 | 105 | Note the use of **|** for piping one textkit command into another. 
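For example, tokenization, filtering, and transformation steps can be chained into a single pipeline. The command below is only an illustrative combination of commands that are covered later in this guide, and the output file name ``clean_words.txt`` is an arbitrary choice:

.. doctest::

    textkit text2words input.txt | textkit filterpunc | textkit tokens2lower > clean_words.txt

This reads the raw text, splits it into word tokens, drops punctuation-only tokens, and lowercases whatever remains.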
106 | 107 | With no file passed in, many textkit commands default to standard in. 108 | This can be indicated explicitly by using a dash (``-``) in place of the file name. 109 | 110 | Commands that begin with ``text`` in textkit transform text into tokens of some sort. 111 | 112 | Any command that uses ``words`` expects to work with **token documents** that have one word per line. 113 | 114 | A bigram is just a special case of an ``NGram`` - so let's make some ngrams of size 5: 115 | 116 | .. doctest:: 117 | 118 | textkit text2words input.txt | textkit words2ngrams -n 5 119 | 120 | Which produces: 121 | 122 | .. doctest:: 123 | 124 | Mrs. Bennet deigned not to 125 | Bennet deigned not to make 126 | deigned not to make any 127 | not to make any reply 128 | to make any reply , 129 | make any reply , but 130 | any reply , but , 131 | reply , but , unable 132 | , but , unable to 133 | but , unable to contain 134 | , unable to contain herself 135 | unable to contain herself , 136 | to contain herself , began 137 | contain herself , began scolding 138 | herself , began scolding one 139 | , began scolding one of 140 | began scolding one of her 141 | scolding one of her daughters 142 | one of her daughters . 143 | 144 | Notice the ``-n`` argument, which indicates the number of words to include in each ngram. 145 | 146 | With all textkit commands, the ``--help`` flag shows all possible arguments for a command. 147 | 148 | 149 | .. doctest:: 150 | 151 | textkit words2ngrams --help 152 | 153 | .. doctest:: 154 | 155 | Usage: textkit words2ngrams [OPTIONS] [TOKENS] 156 | 157 | Tokenize words into ngrams. ngrams are n-length word tokens. Punctuation 158 | is considered as a separate token. 159 | 160 | Options: 161 | --sep TEXT Separator between words in bigram output. [default: ] 162 | -n, --length INTEGER Length of the n-gram [default: 2] 163 | --help Show this message and exit. 164 | 165 | Filter Tokens 166 | ------------- 167 | 168 | textkit includes a number of filtering capabilities that can be useful for tweaking your tokens. 169 | 170 | Notice our word and ngram tokens above include commas and periods? Let's remove them using ``filterpunc``. 171 | 172 | .. doctest:: 173 | 174 | textkit text2words input.txt | textkit filterpunc 175 | 176 | If we don't want to pipe these commands together, we can also just execute filters on the ``words.txt`` - the saved word token file. 177 | 178 | .. doctest:: 179 | 180 | textkit filterpunc words.txt 181 | 182 | 183 | In natural language processing, ``stop words`` are words so common that they provide little information about a document, and so are often removed. Textkit's ``filterwords`` will remove stop words from our token output. 184 | 185 | 186 | .. doctest:: 187 | 188 | textkit filterwords words.txt 189 | 190 | We can also just filter words that are less than a certain number of characters long: 191 | 192 | .. doctest:: 193 | 194 | textkit filterlengths -m 5 words.txt 195 | 196 | This would produce: 197 | 198 | .. doctest:: 199 | 200 | Bennet 201 | deigned 202 | reply 203 | unable 204 | contain 205 | herself 206 | began 207 | scolding 208 | daughters 209 | 210 | Transform Tokens 211 | ---------------- 212 | 213 | There are a number of tools in textkit to transform tokens in various ways. 214 | 215 | Ensuring the casing of our tokens is consistent is a common text analysis preprocessing step. 216 | 217 | This is done in textkit using ``tokens2lower`` and ``tokens2upper``. These commands work on tokens as well as raw text. 218 | 219 | ..
doctest:: 220 | 221 | textkit tokens2lower input.txt 222 | 223 | .. doctest:: 224 | 225 | mrs. bennet deigned not to make any reply, but, unable to contain 226 | herself, began scolding one of her daughters. 227 | 228 | 229 | .. doctest:: 230 | 231 | textkit tokens2upper words.txt 232 | 233 | .. doctest:: 234 | 235 | MRS. BENNET DEIGNED NOT TO MAKE ANY REPLY, BUT, UNABLE TO CONTAIN 236 | HERSELF, BEGAN SCOLDING ONE OF HER DAUGHTERS. 237 | 238 | Token Information and Stats 239 | --------------------------- 240 | 241 | textkit is also great for finding out interesting stuff about your text. 242 | 243 | Count unique tokens with ``tokens2counts``, which produces CSV-like output containing each token and its count in the document. 244 | 245 | .. doctest:: 246 | 247 | textkit tokens2counts words.txt 248 | 249 | ``TODO: topbigrams`` 250 | 251 | ``TODO: tokens2pos`` 252 | 253 | Package 254 | ------- 255 | 256 | Once the tokens are set up and transformed the way you want them, 257 | it can be useful to package up a set of documents into a single file for downstream visualization or other uses. 258 | 259 | .. doctest:: 260 | 261 | textkit tokens2json words1.txt words2.txt > out.json 262 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # pip requirements for docs generation 2 | # pip install -r docs/requirements.txt 3 | 4 | Sphinx -------------------------------------------------------------------------------- /docs/rsync_exclude: -------------------------------------------------------------------------------- 1 | .git 2 | -------------------------------------------------------------------------------- /docs/scripts/download.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | download 3 | ======== 4 | 5 | Description 6 | =========== 7 | 8 | Install required libraries. Note this library will install nltk dependencies into your user directory. 9 | 10 | :: 11 | 12 | Usage: textkit download 13 | 14 | 15 | 16 | Examples 17 | ======== 18 | -------------------------------------------------------------------------------- /docs/scripts/filterlengths.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | filterlengths 3 | ============= 4 | 5 | Description 6 | =========== 7 | 8 | Remove tokens that are shorter than the minimum length provided. 9 | 10 | :: 11 | 12 | Usage: textkit filterlengths [OPTIONS] [TOKENS] 13 | 14 | Options: 15 | -m, --minimum INTEGER Minimum length of token to not filter. [default: 3] 16 | --help Show this message and exit. 17 | 18 | 19 | 20 | Examples 21 | ======== 22 | -------------------------------------------------------------------------------- /docs/scripts/filterpunc.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | filterpunc 3 | ========== 4 | 5 | Description 6 | =========== 7 | 8 | Remove tokens that are only punctuation from a list of tokens. 9 | 10 | :: 11 | 12 | Usage: textkit filterpunc [OPTIONS] [TOKENS] 13 | 14 | Options: 15 | --help Show this message and exit.
16 | 17 | 18 | 19 | Examples 20 | ======== 21 | -------------------------------------------------------------------------------- /docs/scripts/filterwords.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | filterwords 3 | =========== 4 | 5 | Description 6 | =========== 7 | 8 | Remove stop words from tokens, returning tokens without stop words. 9 | 10 | :: 11 | 12 | Usage: textkit filterwords [OPTIONS] [TOKENS] 13 | 14 | Options: 15 | -l, --language [english|german|danish|dutch|finnish|french|hungarian|italian|norwegian|portuguese|russian|spanish|swedish|turkish] 16 | --custom FILENAME Optional token file of additional tokens to 17 | remove along with selected stop words. 18 | --help Show this message and exit. 19 | 20 | 21 | 22 | Examples 23 | ======== 24 | -------------------------------------------------------------------------------- /docs/scripts/nonewlines.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | nonewlines 3 | ========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit nonewlines [OPTIONS] [TEXT]... 11 | 12 | Remove newlines from a text file. 13 | 14 | Options: 15 | --help Show this message and exit. 16 | 17 | 18 | 19 | Examples 20 | ======== 21 | -------------------------------------------------------------------------------- /docs/scripts/showstops.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | showstops 3 | ========= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit showstops [OPTIONS] 11 | 12 | Display stop words used by textkit for a given language. 13 | 14 | Options: 15 | -l, --language [english|german|danish|dutch|finnish|french|hungarian|italian|norwegian|portuguese|russian|spanish|swedish|turkish] 16 | --help Show this message and exit. 17 | 18 | 19 | 20 | Examples 21 | ======== 22 | -------------------------------------------------------------------------------- /docs/scripts/text2ngrams.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | text2ngrams 3 | =========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit text2ngrams [OPTIONS] [TEXT]... 11 | 12 | Tokenize plain text into ngrams. ngrams are n-length word tokens. 13 | Punctuation is considered as a separate token. 14 | 15 | Options: 16 | -s, --sep TEXT Separator between words in bigram output. [default: ] 17 | -n, --num INTEGER Length of the n-gram [default: 2] 18 | --help Show this message and exit. 19 | 20 | 21 | 22 | Examples 23 | ======== 24 | -------------------------------------------------------------------------------- /docs/scripts/text2punc.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | text2punc 3 | ========= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit text2punc [OPTIONS] [TEXT]... 11 | 12 | Tokenize text into punctuation tokens. Words and numbers are removed, 13 | leaving only punctuation. 14 | 15 | Options: 16 | --help Show this message and exit. 
17 | 18 | 19 | 20 | Examples 21 | ======== 22 | -------------------------------------------------------------------------------- /docs/scripts/text2sentences.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | text2sentences 3 | ============== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit text2sentences [OPTIONS] [TEXT]... 11 | 12 | Tokenize text into sentence tokens. 13 | 14 | Options: 15 | --help Show this message and exit. 16 | 17 | 18 | 19 | Examples 20 | ======== 21 | -------------------------------------------------------------------------------- /docs/scripts/text2words.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | text2words 3 | ========== 4 | 5 | Description 6 | =========== 7 | 8 | Tokenize text into word tokens. Punctuation is considered as a separate token:: 9 | 10 | Usage: textkit text2words [OPTIONS] [TEXT]... 11 | 12 | Options: 13 | --help Show this message and exit. 14 | 15 | Examples 16 | ======== 17 | 18 | 19 | TODO 20 | -------------------------------------------------------------------------------- /docs/scripts/texts2json.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | texts2json 3 | ========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit texts2json [OPTIONS] [TEXT_DOCS]... 11 | 12 | Convert a set of text documents into a JSON array of document objects. 13 | 14 | Options: 15 | --ids PATH File with one id per text document, each separated by a new 16 | line. Ids file is used to set the id attribute in the output 17 | JSON. 18 | --names PATH File with one name per text document, each separated by a new 19 | line. Names file is used to set the name attribute in the 20 | output JSON. 21 | --field TEXT Attribute name where text will be stored in the document 22 | object. [default: text] 23 | --help Show this message and exit. 24 | 25 | 26 | 27 | Examples 28 | ======== 29 | -------------------------------------------------------------------------------- /docs/scripts/tokens2counts.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | tokens2counts 3 | ============= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2counts [OPTIONS] [TOKENS] 11 | 12 | Count unique tokens in a list of tokens. Tokens are sorted by top counts. 13 | 14 | Options: 15 | -s, --sep TEXT Separator between token and count in output. [default: ,] 16 | --limit INTEGER Only output the top N most frequent tokens 17 | --help Show this message and exit. 18 | 19 | 20 | 21 | Examples 22 | ======== 23 | -------------------------------------------------------------------------------- /docs/scripts/tokens2json.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | tokens2json 3 | =========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2json [OPTIONS] [TOKEN_DOCS]... 11 | 12 | Convert a set of token documents into a JSON array of document objects. 13 | 14 | Options: 15 | --ids PATH File with one id per token document, each separated by 16 | a new line. Ids file is used to set the id attribute 17 | in the output JSON. 18 | --names PATH File with one name per token document, each separated 19 | by a new line. Names file is used to set the name 20 | attribute in the output JSON. 
21 | --field TEXT Attribute name where tokens will be stored in the 22 | document object. [default: tokens] 23 | --split / --no-split If enabled, textkit will attempt to split input 24 | columns when packaging. This is useful when packaging 25 | multiple column output like counts. [default: False] 26 | -s, --sep TEXT Separator character between columns. Only used if 27 | split-columns flag is used. [default: ,] 28 | --help Show this message and exit. 29 | 30 | 31 | 32 | Examples 33 | ======== 34 | -------------------------------------------------------------------------------- /docs/scripts/tokens2lower.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | tokens2lower 3 | ============ 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2lower [OPTIONS] [TOKENS] 11 | 12 | Transform all tokens to lowercase. 13 | 14 | Options: 15 | --help Show this message and exit. 16 | 17 | 18 | 19 | Examples 20 | ======== 21 | -------------------------------------------------------------------------------- /docs/scripts/tokens2pos.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | tokens2pos 3 | ========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2pos [OPTIONS] [TOKENS] 11 | 12 | Tokenize words into their parts of speech. Output contains the word token 13 | followed by its part-of-speech tag, separated by the character specified 14 | by --sep. 15 | 16 | Options: 17 | -s, --sep TEXT Separator between words in the output. [default: ,] 18 | --help Show this message and exit. 19 | 20 | 21 | 22 | Examples 23 | ======== 24 | -------------------------------------------------------------------------------- /docs/scripts/tokens2stem.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | tokens2stem 3 | =========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2stem [OPTIONS] [TOKENS] 11 | 12 | Stem a list of tokens to get their root. 13 | 14 | Options: 15 | -a, --algorithm [porter|lancaster|snowball|wordnet] 16 | Specify which stemming algorithm to use. 17 | [default: porter] 18 | --help Show this message and exit. 19 | 20 | 21 | 22 | Examples 23 | ======== 24 | -------------------------------------------------------------------------------- /docs/scripts/tokens2text.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | tokens2text 3 | =========== 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2text [OPTIONS] [TOKENS] 11 | 12 | Combine tokens in a token document into a single text file. 13 | 14 | Options: 15 | -s, --sep TEXT Separator between token and count in output. [default: ] 16 | --help Show this message and exit. 17 | 18 | 19 | 20 | Examples 21 | ======== 22 | -------------------------------------------------------------------------------- /docs/scripts/tokens2topbigrams.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | tokens2topbigrams 3 | ================= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2topbigrams [OPTIONS] [TOKENS] 11 | 12 | Find top most interesting bi-grams in a token document. Uses the --measure 13 | argument to determine what measure to use to define interesting. 14 | 15 | Options: 16 | -s, --sep TEXT Separator between tokens and scores in 17 | output.
[default: ,] 18 | -m, --measure [likelihood|chi_sq|pmi|student_t|freq] 19 | Specify which measure to use to define 20 | interesing-ness. [default: likelihood] 21 | --freq INTEGER Minimum frequency of bi-grams to filter out. 22 | [default: 2] 23 | --scores / --no-scores Include or exclude scores in output. 24 | [default: True] 25 | --help Show this message and exit. 26 | 27 | 28 | 29 | Examples 30 | ======== 31 | -------------------------------------------------------------------------------- /docs/scripts/tokens2upper.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | tokens2upper 3 | ============ 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit tokens2upper [OPTIONS] [TOKENS] 11 | 12 | Transform all tokens to uppercase. 13 | 14 | Options: 15 | --help Show this message and exit. 16 | 17 | 18 | 19 | Examples 20 | ======== 21 | -------------------------------------------------------------------------------- /docs/scripts/transliterate.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | transliterate 3 | ============= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit transliterate [OPTIONS] [TEXT]... 11 | 12 | Transform an international text file to plain ascii 13 | 14 | Options: 15 | --help Show this message and exit. 16 | 17 | 18 | 19 | Examples 20 | ======== 21 | 22 | > echo "Hello! À bientôt… L'été à Pètechïn; 日本語, Nihongo Klüft skräms inför på fédéral électoral große Küche Mærsk" > file_full_of_international_text.md 23 | > textkit transliterate file_full_of_international_text.md 24 | Hello! A bientot... L'ete a Petechin; Ri Ben Yu , Nihongo Kluft skrams infor pa federal electoral grosse Kuche Maersk -------------------------------------------------------------------------------- /docs/scripts/words2bigrams.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | words2bigrams 3 | ============= 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit words2bigrams [OPTIONS] [TOKENS] 11 | 12 | Tokenize words into bigrams. Bigrams are two word tokens. Punctuation is 13 | considered as a separate token. 14 | 15 | Options: 16 | -s, --sep TEXT Separator between words in bigram output. [default: ] 17 | --help Show this message and exit. 18 | 19 | 20 | 21 | Examples 22 | ======== 23 | -------------------------------------------------------------------------------- /docs/scripts/words2ngrams.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | words2ngrams 3 | ============ 4 | 5 | Description 6 | =========== 7 | 8 | :: 9 | 10 | Usage: textkit words2ngrams [OPTIONS] [TOKENS] 11 | 12 | Convert word tokens into ngrams. ngrams are n-length word tokens. 13 | Punctuation is considered as a separate token. 14 | 15 | Options: 16 | -s, --sep TEXT Separator between words in bigram output. [default: ] 17 | -n, --num INTEGER Length of the n-gram [default: 2] 18 | --help Show this message and exit. 19 | 20 | 21 | 22 | Examples 23 | ======== 24 | -------------------------------------------------------------------------------- /docs/sphinx_deployment.mk: -------------------------------------------------------------------------------- 1 | # Copyright (c) Teracy, Inc. and individual contributors. 2 | # All rights reserved.
3 | 4 | # Redistribution and use in source and binary forms, with or without modification, 5 | # are permitted provided that the following conditions are met: 6 | 7 | # 1. Redistributions of source code must retain the above copyright notice, 8 | # this list of conditions and the following disclaimer. 9 | 10 | # 2. Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | 14 | # 3. Neither the name of Teracy, Inc. nor the names of its contributors may be used 15 | # to endorse or promote products derived from this software without 16 | # specific prior written permission. 17 | 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | # Deployment configurations from sphinx_deployment project 30 | 31 | # default deployment when $ make deploy 32 | # deploy_gh_pages : to $ make deploy_gh_pages 33 | # deploy_rsync : to $ make deploy_rsync 34 | # deploy_heroku : to $ make deploy_heroku 35 | # deploy_gh_pages deploy_rsync deploy_heroku : to $ make deploy_gh_pages then $ make deploy_rsync 36 | # and then $ make deploy_heroku 37 | # default value: deploy_gh_pages 38 | ifndef DEPLOY_DEFAULT 39 | DEPLOY_DEFAULT = deploy_gh_pages 40 | endif 41 | 42 | # The deployment directory to be deployed 43 | ifndef DEPLOY_DIR 44 | DEPLOY_DIR = _deploy 45 | endif 46 | 47 | # The heroku deployment directory to be deployed 48 | # we must create this separated dir to avoid any conflict with _deploy (rsync and gh_pages) 49 | ifndef DEPLOY_DIR_HEROKU 50 | DEPLOY_DIR_HEROKU = _deploy_heroku 51 | endif 52 | 53 | # Copy contents from $(BUILDDIR) to $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) directory 54 | ifndef DEPLOY_HTML_DIR 55 | DEPLOY_HTML_DIR = . 
56 | endif 57 | 58 | 59 | ## -- Rsync Deploy config -- ## 60 | # Be sure your public key is listed in your server's ~/.ssh/authorized_keys file 61 | ifndef SSH_USER 62 | SSH_USER = user@domain.com 63 | endif 64 | 65 | ifndef SSH_PORT 66 | SSH_PORT = 22 67 | endif 68 | 69 | ifndef DOCUMENT_ROOT 70 | DOCUMENT_ROOT = ~/website.com/ 71 | endif 72 | 73 | #If you choose to delete on sync, rsync will create a 1:1 match 74 | ifndef RSYNC_DELETE 75 | RSYNC_DELETE = false 76 | endif 77 | 78 | # Any extra arguments to pass to rsync 79 | ifndef RSYNC_ARGS 80 | RSYNC_ARGS = 81 | endif 82 | 83 | ## -- Github Pages Deploy config -- ## 84 | 85 | # Configure the right deployment branch 86 | ifndef DEPLOY_BRANCH_GITHUB 87 | DEPLOY_BRANCH_GITHUB = gh-pages 88 | endif 89 | 90 | #if REPO_URL_GITHUB was NOT defined by travis-ci 91 | ifndef REPO_URL_GITHUB 92 | # Configure your right github project repo 93 | REPO_URL_GITHUB = git@github.com:learntextvis/textkit.git 94 | endif 95 | 96 | ## -- Heroku Deployment Config -- ## 97 | 98 | ifndef REPO_URL_HEROKU 99 | # Configure your right heroku repo 100 | # REPO_URL_HEROKU = git@heroku.com:spxd.git 101 | endif 102 | 103 | 104 | ## end deployment configuration, don't edit anything below this line ## 105 | ####################################################################### 106 | 107 | ifeq ($(RSYNC_DELETE), true) 108 | RSYNC_DELETE_OPT = --delete 109 | endif 110 | 111 | init_gh_pages: 112 | @rm -rf $(DEPLOY_DIR) 113 | @mkdir -p $(DEPLOY_DIR) 114 | @cd $(DEPLOY_DIR); git init;\ 115 | echo 'sphinx docs comming soon...' > index.html;\ 116 | touch .nojekyll;\ 117 | git add .;\ 118 | git commit -m "sphinx docs init";\ 119 | git branch -m $(DEPLOY_BRANCH_GITHUB);\ 120 | echo $(DEPLOY_BRANCH_GITHUB) > index.html;\ 121 | git remote add origin $(REPO_URL_GITHUB); 122 | @cd $(DEPLOY_DIR);\ 123 | if ! git ls-remote origin $(DEPLOY_BRANCH_GITHUB) | grep $(DEPLOY_BRANCH_GITHUB) ; then \ 124 | echo "Preparing Github deployment branch: $(DEPLOY_BRANCH_GITHUB) for the first time only...";\ 125 | git push -u origin $(DEPLOY_BRANCH_GITHUB);\ 126 | fi 127 | 128 | setup_gh_pages: init_gh_pages 129 | @echo "Setting up gh-pages deployment..." 130 | @cd $(DEPLOY_DIR);\ 131 | git fetch origin;\ 132 | git reset --hard origin/$(DEPLOY_BRANCH_GITHUB);\ 133 | git branch --set-upstream $(DEPLOY_BRANCH_GITHUB) origin/$(DEPLOY_BRANCH_GITHUB) 134 | @echo "Now you can deploy to Github Pages with 'make generate' and then 'make deploy_gh_pages'" 135 | 136 | init_heroku: 137 | @rm -rf $(DEPLOY_DIR_HEROKU) 138 | @mkdir -p $(DEPLOY_DIR_HEROKU) 139 | @cd $(DEPLOY_DIR_HEROKU); git init;\ 140 | cp -r ../.deploy_heroku/* .;\ 141 | echo 'sphinx docs comming soon...' > public/index.html;\ 142 | git add .; git commit -m "sphinx docs init";\ 143 | git remote add origin $(REPO_URL_HEROKU); 144 | @cd $(DEPLOY_DIR_HEROKU);\ 145 | if ! git ls-remote origin master | grep master ; then\ 146 | echo "Preparing Heroku deployment for the first time only...";\ 147 | git push -u origin master;\ 148 | fi 149 | 150 | setup_heroku: init_heroku 151 | @echo "setting up heroku deployment..." 152 | @cd $(DEPLOY_DIR_HEROKU);\ 153 | git fetch origin;\ 154 | git reset --hard origin/master;\ 155 | git branch --set-upstream master origin/master 156 | @echo "Now you can deploy to Heroku with 'make generate' and then 'make deploy_heroku'" 157 | 158 | generate: html 159 | 160 | prepare_rsync_deployment: 161 | @echo "Preparing rsync deployment..." 
162 | @mkdir -p $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) 163 | @echo "Copying files from '$(BUILDDIR)/html/.' to '$(DEPLOY_DIR)/$(DEPLOY_HTML_DIR)'" 164 | @cp -r $(BUILDDIR)/html/. $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) 165 | 166 | deploy_rsync: prepare_rsync_deployment 167 | @echo "Deploying on rsync now..." 168 | rsync -avze 'ssh -p $(SSH_PORT)' --exclude-from $(realpath ./rsync_exclude) $(RSYNC_ARGS) $(RSYNC_DELETE_OPT) ${DEPLOY_DIR}/ $(SSH_USER):$(DOCUMENT_ROOT) 169 | 170 | prepare_gh_pages_deployment: 171 | @echo "Preparing gh_pages deployment..." 172 | @echo "Pulling any updates from Github Pages..." 173 | @cd $(DEPLOY_DIR); git pull; 174 | @mkdir -p $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) 175 | @echo "Copying files from '$(BUILDDIR)/html/.' to '$(DEPLOY_DIR)/$(DEPLOY_HTML_DIR)'" 176 | @cp -r $(BUILDDIR)/html/. $(DEPLOY_DIR)/$(DEPLOY_HTML_DIR) 177 | 178 | deploy_gh_pages: prepare_gh_pages_deployment 179 | @echo "Deploying on github pages now..." 180 | @cd $(DEPLOY_DIR); git add -A; git commit -m "docs updated at `date -u`";\ 181 | git push origin $(DEPLOY_BRANCH) --quiet 182 | @echo "Github Pages deploy was completed at `date -u`" 183 | 184 | prepare_heroku_deployment: 185 | @echo "Preparing heroku deployment..." 186 | @echo "Pulling any updates from Heroku..." 187 | @cd $(DEPLOY_DIR_HEROKU); git pull; 188 | @mkdir -p $(DEPLOY_DIR_HEROKU)/public/$(DEPLOY_HTML_DIR) 189 | @echo "Copying files from .deploy_heroku to $(DEPLOY_DIR_HEROKU)" 190 | @cp -r .deploy_heroku/. $(DEPLOY_DIR_HEROKU) 191 | @echo "Copying files from '$(BUILDDIR)/html/.' to '$(DEPLOY_DIR_HEROKU)/public/$(DEPLOY_HTML_DIR)'" 192 | @cp -r $(BUILDDIR)/html/. $(DEPLOY_DIR_HEROKU)/public/$(DEPLOY_HTML_DIR) 193 | 194 | 195 | deploy_heroku: prepare_heroku_deployment 196 | @echo "Deploying on heroku now..." 197 | @cd $(DEPLOY_DIR_HEROKU); git add -A; git commit -m "docs updated at `date -u`";\ 198 | git push origin master --quiet 199 | @echo "Heroku deployment was completed at `date -u`" 200 | 201 | 202 | deploy: $(DEPLOY_DEFAULT) 203 | 204 | gen_deploy: generate deploy 205 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: textkit 2 | dependencies: 3 | - click=4.1=py35_0 4 | - coverage=4.0=py35_0 5 | - nltk=3.1=py35_0 6 | - numpy=1.10.2=py35_0 7 | - openssl=1.0.2d=0 8 | - pip=7.1.2=py35_0 9 | - python=3.5.1=0 10 | - pyyaml=3.11=py35_1 11 | - readline=6.2=2 12 | - setuptools=19.1.1=py35_0 13 | - six=1.10.0=py35_0 14 | - sqlite=3.8.4.1=1 15 | - tk=8.5.18=0 16 | - wheel=0.26.0=py35_1 17 | - xz=5.0.5=0 18 | - yaml=0.1.6=0 19 | - zlib=1.2.8=0 20 | 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click>=6.2 2 | coverage>=4.0.3 3 | nltk>=3.1 4 | numpy>=1.10.2 5 | PyYAML>=3.11 6 | six>=1.10.0 7 | wheel>=0.26.0 8 | unittest2>=1.1.0 9 | pylint>=1.5.5 10 | pytest>=2.9.1 11 | unidecode>=0.4.20 12 | chardet>=2.3.0 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # This flag says that the code is written to work on both Python 2 and Python 3 | # 3. If at all possible, it is good practice to do this. If you cannot, you 4 | # will need to generate wheels for each Python version that you support. 
5 | universal=1 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import sys 6 | from setuptools import setup, find_packages 7 | 8 | 9 | def find_version(fname): 10 | """Attempts to find the version number in the file names fname. 11 | Raises RuntimeError if not found. 12 | """ 13 | version = '' 14 | with open(fname, 'r') as fp: 15 | reg = re.compile(r'__version__ = [\'"]([^\'"]*)[\'"]') 16 | for line in fp: 17 | m = reg.match(line) 18 | if m: 19 | version = m.group(1) 20 | break 21 | if not version: 22 | raise RuntimeError('Cannot find version information') 23 | return version 24 | 25 | __version__ = find_version("textkit/__init__.py") 26 | 27 | 28 | setup( 29 | name='textkit', 30 | version=__version__, 31 | description='Simple text analysis from the command line', 32 | long_description=open("README.rst").read(), 33 | packages=find_packages(exclude=['test*', 'docs']), 34 | license='MIT', 35 | author='Learn Text Vis Team', 36 | author_email='landham@gmail.com', 37 | py_modules=['textkit'], 38 | url='https://github.com/learntextvis/textkit', 39 | keywords=['text', 'analysis', 'textkit'], 40 | include_package_data=True, 41 | install_requires=[ 42 | 'click>=6.2', 43 | 'nltk>=3.1', 44 | 'unidecode>=0.4.20', 45 | 'chardet>=2.3.0' 46 | ], 47 | entry_points={ 48 | 'console_scripts': [ 49 | 'textkit = textkit.cli:cli' 50 | ] 51 | }, 52 | package_data={ 53 | 'textkit': ['data/stopwords/english.txt', 'data/stopwords/german.txt', 'data/stopwords/danish.txt', 'data/stopwords/dutch.txt', 'data/stopwords/finnish.txt', 'data/stopwords/french.txt', 'data/stopwords/hungarian.txt', 'data/stopwords/italian.txt', 'data/stopwords/norwegian.txt', 'data/stopwords/portuguese.txt', 'data/stopwords/russian.txt', 'data/stopwords/spanish.txt', 'data/stopwords/swedish.txt', 'data/stopwords/turkish.txt'] 54 | }, classifiers=[ 55 | 'Development Status :: 3 - Alpha', 56 | 'Environment :: Console', 57 | 'Intended Audience :: Developers', 58 | 'Intended Audience :: End Users/Desktop', 59 | 'Intended Audience :: Science/Research', 60 | 'License :: OSI Approved :: MIT License', 61 | 'Natural Language :: English', 62 | 'Operating System :: OS Independent', 63 | 'Programming Language :: Python', 64 | 'Programming Language :: Python :: 2.7', 65 | 'Programming Language :: Python :: 3.4', 66 | 'Topic :: Utilities' 67 | ] 68 | ) 69 | -------------------------------------------------------------------------------- /test_data/alice_short.txt: -------------------------------------------------------------------------------- 1 | CHAPTER 2 | I 3 | . 
4 | Down 5 | the 6 | Rabbit-Hole 7 | Alice 8 | was 9 | beginning 10 | to 11 | get 12 | very 13 | tired 14 | of 15 | sitting 16 | by 17 | her 18 | sister 19 | on 20 | the 21 | -------------------------------------------------------------------------------- /test_data/international.transliterate.txt: -------------------------------------------------------------------------------- 1 | GR ILIADOSA 2 | FR L'HAY-LES-ROSES 3 | KO yi bonmun naeyongeun yeongmunsaiteureul camgoha 4 | JA Mu Ci Ba Cui 5 | RU servery raspolozhennye 6 | RO Pot sa mananc sticla si ea nu ma raneste 7 | VI Shi E Shui Jing 8 | CN Wo Neng Tun Xia Bo Li Er Bu Shang Shen Ti 9 | NV yishaago 10 | 11 | -------------------------------------------------------------------------------- /test_data/international.txt: -------------------------------------------------------------------------------- 1 | GR ΙΛΙΑΔΟΣΑ 2 | FR L’HAŸ-LES-ROSES 3 | KO 의 본문 내용은 영문사이트를 참고하 4 | JA 目次抜粋 5 | RU серверы расположенные 6 | RO Pot să mănânc sticlă și ea nu mă rănește 7 | VI 世 咹 水 晶 8 | CN 我能吞下玻璃而不伤身体 9 | NV yishą́ągo 10 | -------------------------------------------------------------------------------- /test_data/word_tokens.txt: -------------------------------------------------------------------------------- 1 | This 2 | is 3 | one 4 | sentence 5 | Dude 6 | . 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_coerce.py: -------------------------------------------------------------------------------- 1 | 2 | from textkit.coerce import coerce_types 3 | 4 | 5 | def test_coerce_types(): 6 | content = [ 7 | ["happy", "9"], 8 | ["day", "8"], 9 | ["4", "7"], 10 | ["YOU!", "6"] 11 | ] 12 | 13 | tokens = coerce_types(content) 14 | assert len(tokens) == 4 15 | assert tokens[0][0] == "happy" 16 | assert tokens[0][1] == 9 17 | assert tokens[2][0] == "4" 18 | 19 | 20 | def test_coerce_types_with_mix_floats_ints(): 21 | content = [ 22 | ["happy", "9"], 23 | ["day", "8.7"], 24 | ["4", "7.0"], 25 | ["YOU!", "6"] 26 | ] 27 | 28 | tokens = coerce_types(content) 29 | assert len(tokens) == 4 30 | assert tokens[0][0] == "happy" 31 | assert tokens[0][1] == 9.0 32 | assert tokens[1][1] == 8.7 33 | assert tokens[2][1] == 7 34 | -------------------------------------------------------------------------------- /tests/test_filter.py: -------------------------------------------------------------------------------- 1 | 2 | import click 3 | from click.testing import CliRunner 4 | from textkit.filter.filter_punc import filterpunc 5 | from textkit.filter.filter_words import filterwords 6 | from textkit.filter.filter_lengths import filterlengths 7 | from tests.utils import create_single_output, create_multifile_output, compare_results 8 | 9 | 10 | def test_filterlengths(): 11 | runner = CliRunner() 12 | with runner.isolated_filesystem(): 13 | filename = 'in.txt' 14 | sentence = 'Hello\nWorld\n!\nI\n.\nnot\nwin\n' 15 | 16 | create_single_output(filename, sentence) 17 | 18 | # default length 3 19 | result = runner.invoke(filterlengths, [filename]) 20 | tokens = result.output.split('\n') 21 | expected_tokens = ['Hello', 'World', 'not', 'win'] 22 | assert result.exit_code == 0 23 | compare_results(tokens, expected_tokens) 24 | 25 | # minumum length 4 26 | 
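        # -m/--minimum raises the cutoff from its default of 3, so only tokens
        # with at least four characters ('Hello', 'World') should survive here: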
result = runner.invoke(filterlengths, ['-m', '4', filename]) 27 | tokens = result.output.split('\n') 28 | expected_tokens = ['Hello', 'World'] 29 | assert result.exit_code == 0 30 | compare_results(tokens, expected_tokens) 31 | 32 | 33 | def test_filterpunc(): 34 | runner = CliRunner() 35 | with runner.isolated_filesystem(): 36 | filename = 'in.txt' 37 | sentence = 'Hello\nWorld\n!\nI\n.\nnot' 38 | expected_tokens = ['Hello', 'World', 'I', 'not'] 39 | create_single_output(filename, sentence) 40 | result = runner.invoke(filterpunc, [filename]) 41 | tokens = result.output.split('\n') 42 | assert result.exit_code == 0 43 | compare_results(tokens, expected_tokens) 44 | 45 | 46 | def test_filterwords(): 47 | runner = CliRunner() 48 | with runner.isolated_filesystem(): 49 | 50 | filename = 'in.txt' 51 | sentence = 'Hello\nWorld\n!\nI\nam\nnot\na\ncrook\n.' 52 | expected_tokens = ['Hello', 'World', '!', 'crook', '.'] 53 | create_single_output(filename, sentence) 54 | result = runner.invoke(filterwords, ['--language', 'english', filename]) 55 | tokens = result.output.split('\n') 56 | assert result.exit_code == 0 57 | compare_results(tokens, expected_tokens) 58 | 59 | 60 | def test_filterwords_custom(): 61 | runner = CliRunner() 62 | with runner.isolated_filesystem(): 63 | 64 | filename = 'in.txt' 65 | sentence = 'Hello\nWorld\n!\nI\nam\nnot\na\ncrook\n.' 66 | expected_tokens = ['World', '!', 'crook', '.'] 67 | custom_stopword_filename = 'custom.txt' 68 | custom_stopwords = 'hello\n' 69 | 70 | create_single_output(filename, sentence) 71 | create_single_output(custom_stopword_filename, custom_stopwords) 72 | 73 | result = runner.invoke(filterwords, 74 | ['--custom', 75 | custom_stopword_filename, 76 | filename]) 77 | 78 | tokens = result.output.split('\n') 79 | assert result.exit_code == 0 80 | compare_results(tokens, expected_tokens) 81 | -------------------------------------------------------------------------------- /tests/test_tokenize.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | from textkit.tokenize.words import text2words 3 | from textkit.tokenize.bigrams import words2bigrams 4 | from textkit.tokenize.punc import text2punc 5 | from textkit.tokenize.sentences import text2sentences 6 | from textkit.tokenize.ngrams import words2ngrams, text2ngrams 7 | from tests.utils import create_single_output, create_multifile_output, compare_results 8 | 9 | 10 | def test_text2words(): 11 | runner = CliRunner() 12 | with runner.isolated_filesystem(): 13 | filename = 'in.txt' 14 | sentence = 'Hello World!\nI.\nnot sure where to go' 15 | expected_tokens = ['Hello', 'World', '!', 'I.', 16 | 'not', 'sure', 'where', 'to', 'go'] 17 | create_single_output(filename, sentence) 18 | result = runner.invoke(text2words, [filename]) 19 | tokens = result.output.split('\n') 20 | assert result.exit_code == 0 21 | compare_results(tokens, expected_tokens) 22 | 23 | 24 | def test_text2words_multifile(): 25 | runner = CliRunner() 26 | with runner.isolated_filesystem(): 27 | 28 | filenames = ['in.txt', 'in2.txt'] 29 | sentences = ('Hello World!\nI.\nnot sure where to go', 30 | 'Goodbye World!\n I.\n know everything about you') 31 | expected_tokens = ['Hello', 'World', '!', 'I.', 32 | 'not', 'sure', 'where', 'to', 'go', 33 | 'Goodbye', 'World', '!', 'I.', 'know', 34 | 'everything', 'about', 'you'] 35 | create_multifile_output(filenames, sentences) 36 | result = runner.invoke(text2words, filenames) 37 | tokens = result.output.split('\n') 38 | assert 
result.exit_code == 0 39 | compare_results(tokens, expected_tokens) 40 | 41 | 42 | def test_words2bigrams(): 43 | runner = CliRunner() 44 | with runner.isolated_filesystem(): 45 | filename = 'in.txt' 46 | sentence = 'Hello\nWorld\n!\nI\nlove\ngo\n.' 47 | expected_tokens = ['Hello World', 'World !', 48 | '! I', 'I love', 'love go', 'go .'] 49 | create_single_output(filename, sentence) 50 | result = runner.invoke(words2bigrams, [filename]) 51 | tokens = result.output.split('\n') 52 | assert result.exit_code == 0 53 | compare_results(tokens, expected_tokens) 54 | 55 | 56 | def test_sentences(): 57 | runner = CliRunner() 58 | with runner.isolated_filesystem(): 59 | filename = 'in.txt' 60 | sentence = 'Hello World! I love go.' 61 | expected_tokens = ['Hello World!', 'I love go.'] 62 | create_single_output(filename, sentence) 63 | result = runner.invoke(text2sentences, [filename]) 64 | tokens = result.output.split('\n') 65 | assert result.exit_code == 0 66 | compare_results(tokens, expected_tokens) 67 | 68 | 69 | def test_punc(): 70 | runner = CliRunner() 71 | with runner.isolated_filesystem(): 72 | filename = 'in.txt' 73 | sentence = 'Hello\nWorld\n!\nI\nlove,\ngo\n.' 74 | expected_tokens = ['!', ',', '.'] 75 | create_single_output(filename, sentence) 76 | result = runner.invoke(text2punc, [filename]) 77 | tokens = result.output.split('\n') 78 | assert result.exit_code == 0 79 | compare_results(tokens, expected_tokens) 80 | 81 | 82 | def test_punc_multifile(): 83 | runner = CliRunner() 84 | with runner.isolated_filesystem(): 85 | filenames = ['in.txt', 'in2.txt'] 86 | sentences = ['Hello\nWorld\n!\nI\nlove,\ngo\n.', 87 | 'Goodbye World!\n I...\n know everything\'s about you?'] 88 | expected_tokens = ['!', ',', '.', '!', '...', "'", '?'] 89 | create_multifile_output(filenames, sentences) 90 | result = runner.invoke(text2punc, filenames) 91 | tokens = result.output.split('\n') 92 | assert result.exit_code == 0 93 | compare_results(tokens, expected_tokens) 94 | 95 | 96 | def test_words2ngrams(): 97 | runner = CliRunner() 98 | with runner.isolated_filesystem(): 99 | filename = 'in.txt' 100 | sentence = 'Hello\nWorld\n!\nI\nlove\ngo\n.' 101 | expected_tokens = ['Hello World !', 'World ! I', '! I love', 'I love go'] 102 | create_single_output(filename, sentence) 103 | result = runner.invoke(words2ngrams, ['-n', 3, filename]) 104 | tokens = result.output.split('\n') 105 | assert result.exit_code == 0 106 | compare_results(tokens, expected_tokens) 107 | 108 | 109 | def test_text2ngrams(): 110 | runner = CliRunner() 111 | with runner.isolated_filesystem(): 112 | filename = 'in.txt' 113 | sentence = 'Hello World! I love go.' 114 | expected_tokens = ['Hello World !', 'World ! I', '! 
I love', 'I love go'] 115 | create_single_output(filename, sentence) 116 | result = runner.invoke(text2ngrams, ['-n', 3, filename]) 117 | tokens = result.output.split('\n') 118 | assert result.exit_code == 0 119 | compare_results(tokens, expected_tokens) 120 | -------------------------------------------------------------------------------- /tests/test_transform.py: -------------------------------------------------------------------------------- 1 | 2 | from click.testing import CliRunner 3 | from textkit.transform.tokens_to_lower import tokens2lower 4 | from textkit.transform.newlines import nonewlines 5 | from textkit.transform.tokens_to_upper import tokens2upper 6 | from textkit.transform.tokens_to_counts import tokens2counts 7 | from textkit.transform.tokens_to_pos import tokens2pos 8 | from textkit.transform.tokens_to_top_bigrams import tokens2topbigrams 9 | from tests.utils import create_single_output, create_multifile_output, compare_results 10 | 11 | 12 | def test_lowercase(): 13 | runner = CliRunner() 14 | with runner.isolated_filesystem(): 15 | filename = 'in.txt' 16 | sentence = 'Hello\nWorld\n!\nI\n.\nnoooo\n' 17 | expected_tokens = ['hello', 'world', '!', 'i', '.', 'noooo'] 18 | create_single_output(filename, sentence) 19 | 20 | result = runner.invoke(tokens2lower, [filename]) 21 | tokens = result.output.split('\n') 22 | assert result.exit_code == 0 23 | compare_results(tokens, expected_tokens) 24 | 25 | 26 | def test_uppercase(): 27 | runner = CliRunner() 28 | with runner.isolated_filesystem(): 29 | filename = 'in.txt' 30 | sentence = 'Hello\nWorld\n!\nI\n.\nnoooo\n' 31 | expected_tokens = ['HELLO', 'WORLD', '!', 'I', '.', 'NOOOO'] 32 | create_single_output(filename, sentence) 33 | 34 | result = runner.invoke(tokens2upper, [filename]) 35 | tokens = result.output.split('\n') 36 | assert result.exit_code == 0 37 | compare_results(tokens, expected_tokens) 38 | 39 | 40 | def test_nonewlines(): 41 | runner = CliRunner() 42 | with runner.isolated_filesystem(): 43 | filename = 'in.txt' 44 | sentence = 'Hello\nWorld\n!\nI\nam\nin.\n' 45 | expected_tokens = ['Hello World ! I am in.'] 46 | 47 | create_single_output(filename, sentence) 48 | result = runner.invoke(nonewlines, [filename]) 49 | tokens = result.output.split('\n') 50 | assert result.exit_code == 0 51 | assert len(result.output.split('\n')) == 2 52 | compare_results(tokens, expected_tokens) 53 | 54 | 55 | def test_nonewlines_multifile(): 56 | runner = CliRunner() 57 | with runner.isolated_filesystem(): 58 | filenames = ['in.txt', 'in2.txt'] 59 | sentences = ['Hello\nWorld\n!\nI\nam\nin.', 60 | 'What are you\na creature\nof mystery'] 61 | expected_tokens = ['Hello World ! I am in. 
What are you a creature of mystery'] 62 | create_multifile_output(filenames, sentences) 63 | result = runner.invoke(nonewlines, filenames) 64 | tokens = result.output.split('\n') 65 | assert result.exit_code == 0 66 | assert len(result.output.split('\n')) == 2 67 | compare_results(tokens, expected_tokens) 68 | 69 | 70 | def test_count_tokens(): 71 | runner = CliRunner() 72 | with runner.isolated_filesystem(): 73 | filename = 'in.txt' 74 | sentence = 'Hello,\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou' 75 | expected_tokens = ['love,2', 'world,2', 'and,1', 'I,1', 'you,1', 76 | 'this,1', '\"Hello,\",1', '!,1', ''] 77 | expected_tokens.sort() 78 | create_single_output(filename, sentence) 79 | result = runner.invoke(tokens2counts, [filename]) 80 | tokens = result.output.split('\n') 81 | tokens.sort() 82 | assert result.exit_code == 0 83 | compare_results(tokens, expected_tokens) 84 | 85 | 86 | def test_pos_tokens(): 87 | runner = CliRunner() 88 | with runner.isolated_filesystem(): 89 | filename = 'in.txt' 90 | sentence = 'Hello\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou' 91 | expected_tokens = ['Hello,NNP', 'world,NN', '!,.', 92 | 'I,PRP', 'love,VBP', 'this,DT', 93 | 'world,NN', 'and,CC', 'love,VB', 'you,PRP'] 94 | create_single_output(filename, sentence) 95 | result = runner.invoke(tokens2pos, [filename]) 96 | tokens = result.output.split('\n') 97 | assert result.exit_code == 0 98 | compare_results(tokens, expected_tokens) 99 | 100 | 101 | def test_top_bigrams(): 102 | runner = CliRunner() 103 | with runner.isolated_filesystem(): 104 | filename = 'in.txt' 105 | sentence = 'I\nworld\n!\nI\nlove\nyou\nthis\nworld\nand\nlove\nyou' 106 | create_single_output(filename, sentence) 107 | 108 | result = runner.invoke(tokens2topbigrams, [filename]) 109 | assert result.exit_code == 0 110 | 111 | tokens = result.output.split('\n') 112 | assert tokens[0].split(',')[0:2] == ['love', 'you'] 113 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | 2 | from textkit.utils import read_tokens 3 | 4 | 5 | def test_read_tokens(): 6 | f = open('test_data/word_tokens.txt', 'r') 7 | 8 | tokens = read_tokens(f) 9 | assert len(tokens) == 6 10 | 11 | f.close() 12 | -------------------------------------------------------------------------------- /tests/transliterate.py: -------------------------------------------------------------------------------- 1 | import click 2 | from click.testing import CliRunner 3 | from textkit.filter.transliterate import transliterate 4 | 5 | 6 | def test_transliterate(): 7 | runner = CliRunner() 8 | filename = 'test_data/international.txt' 9 | ## ??? 
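    # One possible completion (a sketch, not a definitive test): it assumes
    # the `transliterate` command takes a text file argument and writes the
    # transliterated text to stdout, and it compares the result against
    # test_data/international.transliterate.txt, which ships with the repo.
    # Note that cli.py imports the command from textkit.transform.transliterate,
    # so the textkit.filter.transliterate import above may need updating.
    expected_filename = 'test_data/international.transliterate.txt'
    result = runner.invoke(transliterate, [filename])
    assert result.exit_code == 0
    with open(expected_filename) as expected_file:
        expected = expected_file.read()
    assert result.output.strip() == expected.strip()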
10 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | def create_single_output(filename, content): 2 | """ 3 | Outputs test content into a filename 4 | """ 5 | with open(filename, 'w') as f: 6 | f.write(content) 7 | 8 | 9 | def create_multifile_output(filenames, contents): 10 | """ 11 | Outputs several text contents into several files 12 | """ 13 | for idx, filename in enumerate(filenames): 14 | with open(filename, 'w') as f: 15 | f.write(contents[idx]) 16 | 17 | 18 | def compare_results(tokens, expected_tokens): 19 | for tdx, expected_token in enumerate(expected_tokens): 20 | assert tokens[tdx] == expected_tokens[tdx] 21 | -------------------------------------------------------------------------------- /textkit/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __version__ = '0.2.3' 3 | __license__ = 'MIT' 4 | -------------------------------------------------------------------------------- /textkit/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import click 4 | from textkit.tokenize.words import text2words 5 | from textkit.tokenize.sentences import text2sentences 6 | from textkit.tokenize.bigrams import words2bigrams 7 | from textkit.tokenize.ngrams import words2ngrams, text2ngrams 8 | from textkit.tokenize.punc import text2punc 9 | from textkit.filter.filter_punc import filterpunc 10 | from textkit.filter.filter_words import filterwords 11 | from textkit.filter.filter_lengths import filterlengths 12 | from textkit.filter.filter_words import showstops 13 | from textkit.transform.tokens_to_lower import tokens2lower 14 | from textkit.transform.tokens_to_upper import tokens2upper 15 | from textkit.transform.newlines import nonewlines 16 | from textkit.transform.tokens_to_stem import tokens2stem 17 | from textkit.transform.tokens_to_counts import tokens2counts 18 | from textkit.transform.tokens_to_top_bigrams import tokens2topbigrams 19 | from textkit.transform.tokens_to_pos import tokens2pos 20 | from textkit.transform.transliterate import transliterate 21 | from textkit.package.tokens_to_json import tokens2json 22 | from textkit.package.texts_to_json import texts2json 23 | from textkit.package.tokens_to_text import tokens2text 24 | from textkit.download import download 25 | 26 | 27 | @click.group() 28 | def cli(): 29 | '''Text analysis from the command line. 
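    Commands read tokens from files or standard input and write to standard
    output, so they compose with shell pipes. An illustrative pipeline
    (corpus.txt here is just a placeholder input file):

        textkit text2words corpus.txt | textkit filterpunc | textkit tokens2lower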
30 | ''' 31 | pass 32 | 33 | cli.add_command(text2words) 34 | cli.add_command(text2sentences) 35 | cli.add_command(words2bigrams) 36 | cli.add_command(words2ngrams) 37 | cli.add_command(text2ngrams) 38 | cli.add_command(text2punc) 39 | cli.add_command(filterpunc) 40 | cli.add_command(filterwords) 41 | cli.add_command(filterlengths) 42 | cli.add_command(showstops) 43 | cli.add_command(tokens2lower) 44 | cli.add_command(tokens2upper) 45 | cli.add_command(nonewlines) 46 | cli.add_command(tokens2stem) 47 | cli.add_command(tokens2json) 48 | cli.add_command(texts2json) 49 | cli.add_command(tokens2text) 50 | cli.add_command(tokens2counts) 51 | cli.add_command(tokens2topbigrams) 52 | cli.add_command(tokens2pos) 53 | cli.add_command(transliterate) 54 | cli.add_command(download) 55 | -------------------------------------------------------------------------------- /textkit/coerce.py: -------------------------------------------------------------------------------- 1 | 2 | def isfloat(value): 3 | try: 4 | float(value) 5 | return True 6 | except: 7 | return False 8 | 9 | 10 | def isint(value): 11 | try: 12 | int(value) 13 | return True 14 | except: 15 | return False 16 | 17 | 18 | CONVERTERS = {'IntType': lambda x: int(x), 19 | 'FloatType': lambda x: float(x), 20 | 'StringType': lambda x: x} 21 | 22 | 23 | def pick_type(types): 24 | ''' if there is only one type found 25 | in a column, then use that. if multiple 26 | types are found, default back to string. 27 | ''' 28 | type_set = set(types) 29 | if len(type_set) == 1: 30 | return list(type_set)[0] 31 | elif set(['IntType', 'FloatType']) == type_set: 32 | # if there is a mix of floats and ints, then the column is floats. 33 | return 'FloatType' 34 | else: 35 | return 'StringType' 36 | 37 | 38 | def get_column_types(content): 39 | ''' Figure out what type of content is in each column 40 | of a csv-like input. This is a simple brute force method that 41 | attempts to convert the strings of the content into floats and ints. 42 | if the conversion is successful for all rows tested, 43 | that type is considered the type of the column. 44 | ''' 45 | 46 | # number of rows to check for content 47 | test_count = min(len(content), 5) 48 | 49 | # number of columns 50 | col_count = len(content[0]) 51 | 52 | all_types = [[] for i in range(col_count)] 53 | 54 | for r_ind in range(test_count): 55 | for col_ind, col in enumerate(content[r_ind]): 56 | if isint(col): 57 | all_types[col_ind].append('IntType') 58 | elif isfloat(col): 59 | all_types[col_ind].append('FloatType') 60 | else: 61 | all_types[col_ind].append('StringType') 62 | 63 | # find if conversions are consistent across rows 64 | column_types = [pick_type(types) for types in all_types] 65 | return column_types 66 | 67 | 68 | def coerce_types(content): 69 | ''' 70 | Convert types in csv-like content. 71 | The idea is that when translating to and 72 | from csv, everything is converted to strings. So, we need to undo that 73 | conversion for things like counts. 
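    A small example, mirroring the behaviour exercised in tests/test_coerce.py:

        >>> coerce_types([['happy', '9'], ['day', '8.7']])
        [['happy', 9.0], ['day', 8.7]]

    A column that mixes ints and floats is coerced to floats, while a column
    containing any non-numeric value is left as strings.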
74 | ''' 75 | if len(content) == 0: 76 | return content 77 | 78 | column_types = get_column_types(content) 79 | 80 | coerced_content = [] 81 | for row in content: 82 | c_row = [] 83 | for col_ind, col in enumerate(row): 84 | try: 85 | col = CONVERTERS[column_types[col_ind]](col) 86 | except ValueError: 87 | col = col 88 | c_row.append(col) 89 | coerced_content.append(c_row) 90 | return coerced_content 91 | -------------------------------------------------------------------------------- /textkit/data/stopwords/README.md: -------------------------------------------------------------------------------- 1 | Stopwords Corpus 2 | 3 | This corpus contains lists of stop words for several languages. 4 | 5 | They were obtained from: 6 | https://github.com/nltk 7 | 8 | And nltk in turn obtained the stopwords corpus from: 9 | http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ -------------------------------------------------------------------------------- /textkit/data/stopwords/danish.txt: -------------------------------------------------------------------------------- 1 | og 2 | i 3 | jeg 4 | det 5 | at 6 | en 7 | den 8 | til 9 | er 10 | som 11 | på 12 | de 13 | med 14 | han 15 | af 16 | for 17 | ikke 18 | der 19 | var 20 | mig 21 | sig 22 | men 23 | et 24 | har 25 | om 26 | vi 27 | min 28 | havde 29 | ham 30 | hun 31 | nu 32 | over 33 | da 34 | fra 35 | du 36 | ud 37 | sin 38 | dem 39 | os 40 | op 41 | man 42 | hans 43 | hvor 44 | eller 45 | hvad 46 | skal 47 | selv 48 | her 49 | alle 50 | vil 51 | blev 52 | kunne 53 | ind 54 | når 55 | være 56 | dog 57 | noget 58 | ville 59 | jo 60 | deres 61 | efter 62 | ned 63 | skulle 64 | denne 65 | end 66 | dette 67 | mit 68 | også 69 | under 70 | have 71 | dig 72 | anden 73 | hende 74 | mine 75 | alt 76 | meget 77 | sit 78 | sine 79 | vor 80 | mod 81 | disse 82 | hvis 83 | din 84 | nogle 85 | hos 86 | blive 87 | mange 88 | ad 89 | bliver 90 | hendes 91 | været 92 | thi 93 | jer 94 | sådan 95 | -------------------------------------------------------------------------------- /textkit/data/stopwords/dutch.txt: -------------------------------------------------------------------------------- 1 | de 2 | en 3 | van 4 | ik 5 | te 6 | dat 7 | die 8 | in 9 | een 10 | hij 11 | het 12 | niet 13 | zijn 14 | is 15 | was 16 | op 17 | aan 18 | met 19 | als 20 | voor 21 | had 22 | er 23 | maar 24 | om 25 | hem 26 | dan 27 | zou 28 | of 29 | wat 30 | mijn 31 | men 32 | dit 33 | zo 34 | door 35 | over 36 | ze 37 | zich 38 | bij 39 | ook 40 | tot 41 | je 42 | mij 43 | uit 44 | der 45 | daar 46 | haar 47 | naar 48 | heb 49 | hoe 50 | heeft 51 | hebben 52 | deze 53 | u 54 | want 55 | nog 56 | zal 57 | me 58 | zij 59 | nu 60 | ge 61 | geen 62 | omdat 63 | iets 64 | worden 65 | toch 66 | al 67 | waren 68 | veel 69 | meer 70 | doen 71 | toen 72 | moet 73 | ben 74 | zonder 75 | kan 76 | hun 77 | dus 78 | alles 79 | onder 80 | ja 81 | eens 82 | hier 83 | wie 84 | werd 85 | altijd 86 | doch 87 | wordt 88 | wezen 89 | kunnen 90 | ons 91 | zelf 92 | tegen 93 | na 94 | reeds 95 | wil 96 | kon 97 | niets 98 | uw 99 | iemand 100 | geweest 101 | andere 102 | -------------------------------------------------------------------------------- /textkit/data/stopwords/english.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | your 11 | yours 12 | yourself 13 | yourselves 14 | he 15 | him 16 | his 17 | himself 18 | she 19 | her 20 | hers 21 | herself 22 | 
it 23 | its 24 | itself 25 | they 26 | them 27 | their 28 | theirs 29 | themselves 30 | what 31 | which 32 | who 33 | whom 34 | this 35 | that 36 | these 37 | those 38 | am 39 | is 40 | are 41 | was 42 | were 43 | be 44 | been 45 | being 46 | have 47 | has 48 | had 49 | having 50 | do 51 | does 52 | did 53 | doing 54 | a 55 | an 56 | the 57 | and 58 | but 59 | if 60 | or 61 | because 62 | as 63 | until 64 | while 65 | of 66 | at 67 | by 68 | for 69 | with 70 | about 71 | against 72 | between 73 | into 74 | through 75 | during 76 | before 77 | after 78 | above 79 | below 80 | to 81 | from 82 | up 83 | down 84 | in 85 | out 86 | on 87 | off 88 | over 89 | under 90 | again 91 | further 92 | then 93 | once 94 | here 95 | there 96 | when 97 | where 98 | why 99 | how 100 | all 101 | any 102 | both 103 | each 104 | few 105 | more 106 | most 107 | other 108 | some 109 | such 110 | no 111 | nor 112 | not 113 | only 114 | own 115 | same 116 | so 117 | than 118 | too 119 | very 120 | s 121 | t 122 | can 123 | will 124 | just 125 | don 126 | should 127 | now 128 | 129 | -------------------------------------------------------------------------------- /textkit/data/stopwords/finnish.txt: -------------------------------------------------------------------------------- 1 | olla 2 | olen 3 | olet 4 | on 5 | olemme 6 | olette 7 | ovat 8 | ole 9 | oli 10 | olisi 11 | olisit 12 | olisin 13 | olisimme 14 | olisitte 15 | olisivat 16 | olit 17 | olin 18 | olimme 19 | olitte 20 | olivat 21 | ollut 22 | olleet 23 | en 24 | et 25 | ei 26 | emme 27 | ette 28 | eivät 29 | minä 30 | minun 31 | minut 32 | minua 33 | minussa 34 | minusta 35 | minuun 36 | minulla 37 | minulta 38 | minulle 39 | sinä 40 | sinun 41 | sinut 42 | sinua 43 | sinussa 44 | sinusta 45 | sinuun 46 | sinulla 47 | sinulta 48 | sinulle 49 | hän 50 | hänen 51 | hänet 52 | häntä 53 | hänessä 54 | hänestä 55 | häneen 56 | hänellä 57 | häneltä 58 | hänelle 59 | me 60 | meidän 61 | meidät 62 | meitä 63 | meissä 64 | meistä 65 | meihin 66 | meillä 67 | meiltä 68 | meille 69 | te 70 | teidän 71 | teidät 72 | teitä 73 | teissä 74 | teistä 75 | teihin 76 | teillä 77 | teiltä 78 | teille 79 | he 80 | heidän 81 | heidät 82 | heitä 83 | heissä 84 | heistä 85 | heihin 86 | heillä 87 | heiltä 88 | heille 89 | tämä 90 | tämän 91 | tätä 92 | tässä 93 | tästä 94 | tähän 95 | tallä 96 | tältä 97 | tälle 98 | tänä 99 | täksi 100 | tuo 101 | tuon 102 | tuotä 103 | tuossa 104 | tuosta 105 | tuohon 106 | tuolla 107 | tuolta 108 | tuolle 109 | tuona 110 | tuoksi 111 | se 112 | sen 113 | sitä 114 | siinä 115 | siitä 116 | siihen 117 | sillä 118 | siltä 119 | sille 120 | sinä 121 | siksi 122 | nämä 123 | näiden 124 | näitä 125 | näissä 126 | näistä 127 | näihin 128 | näillä 129 | näiltä 130 | näille 131 | näinä 132 | näiksi 133 | nuo 134 | noiden 135 | noita 136 | noissa 137 | noista 138 | noihin 139 | noilla 140 | noilta 141 | noille 142 | noina 143 | noiksi 144 | ne 145 | niiden 146 | niitä 147 | niissä 148 | niistä 149 | niihin 150 | niillä 151 | niiltä 152 | niille 153 | niinä 154 | niiksi 155 | kuka 156 | kenen 157 | kenet 158 | ketä 159 | kenessä 160 | kenestä 161 | keneen 162 | kenellä 163 | keneltä 164 | kenelle 165 | kenenä 166 | keneksi 167 | ketkä 168 | keiden 169 | ketkä 170 | keitä 171 | keissä 172 | keistä 173 | keihin 174 | keillä 175 | keiltä 176 | keille 177 | keinä 178 | keiksi 179 | mikä 180 | minkä 181 | minkä 182 | mitä 183 | missä 184 | mistä 185 | mihin 186 | millä 187 | miltä 188 | mille 189 | minä 190 | miksi 191 | mitkä 192 | joka 193 | jonka 194 | 
jota 195 | jossa 196 | josta 197 | johon 198 | jolla 199 | jolta 200 | jolle 201 | jona 202 | joksi 203 | jotka 204 | joiden 205 | joita 206 | joissa 207 | joista 208 | joihin 209 | joilla 210 | joilta 211 | joille 212 | joina 213 | joiksi 214 | että 215 | ja 216 | jos 217 | koska 218 | kuin 219 | mutta 220 | niin 221 | sekä 222 | sillä 223 | tai 224 | vaan 225 | vai 226 | vaikka 227 | kanssa 228 | mukaan 229 | noin 230 | poikki 231 | yli 232 | kun 233 | niin 234 | nyt 235 | itse 236 | -------------------------------------------------------------------------------- /textkit/data/stopwords/french.txt: -------------------------------------------------------------------------------- 1 | au 2 | aux 3 | avec 4 | ce 5 | ces 6 | dans 7 | de 8 | des 9 | du 10 | elle 11 | en 12 | et 13 | eux 14 | il 15 | je 16 | la 17 | le 18 | leur 19 | lui 20 | ma 21 | mais 22 | me 23 | même 24 | mes 25 | moi 26 | mon 27 | ne 28 | nos 29 | notre 30 | nous 31 | on 32 | ou 33 | par 34 | pas 35 | pour 36 | qu 37 | que 38 | qui 39 | sa 40 | se 41 | ses 42 | son 43 | sur 44 | ta 45 | te 46 | tes 47 | toi 48 | ton 49 | tu 50 | un 51 | une 52 | vos 53 | votre 54 | vous 55 | c 56 | d 57 | j 58 | l 59 | à 60 | m 61 | n 62 | s 63 | t 64 | y 65 | été 66 | étée 67 | étées 68 | étés 69 | étant 70 | étante 71 | étants 72 | étantes 73 | suis 74 | es 75 | est 76 | sommes 77 | êtes 78 | sont 79 | serai 80 | seras 81 | sera 82 | serons 83 | serez 84 | seront 85 | serais 86 | serait 87 | serions 88 | seriez 89 | seraient 90 | étais 91 | était 92 | étions 93 | étiez 94 | étaient 95 | fus 96 | fut 97 | fûmes 98 | fûtes 99 | furent 100 | sois 101 | soit 102 | soyons 103 | soyez 104 | soient 105 | fusse 106 | fusses 107 | fût 108 | fussions 109 | fussiez 110 | fussent 111 | ayant 112 | ayante 113 | ayantes 114 | ayants 115 | eu 116 | eue 117 | eues 118 | eus 119 | ai 120 | as 121 | avons 122 | avez 123 | ont 124 | aurai 125 | auras 126 | aura 127 | aurons 128 | aurez 129 | auront 130 | aurais 131 | aurait 132 | aurions 133 | auriez 134 | auraient 135 | avais 136 | avait 137 | avions 138 | aviez 139 | avaient 140 | eut 141 | eûmes 142 | eûtes 143 | eurent 144 | aie 145 | aies 146 | ait 147 | ayons 148 | ayez 149 | aient 150 | eusse 151 | eusses 152 | eût 153 | eussions 154 | eussiez 155 | eussent 156 | -------------------------------------------------------------------------------- /textkit/data/stopwords/german.txt: -------------------------------------------------------------------------------- 1 | aber 2 | alle 3 | allem 4 | allen 5 | aller 6 | alles 7 | als 8 | also 9 | am 10 | an 11 | ander 12 | andere 13 | anderem 14 | anderen 15 | anderer 16 | anderes 17 | anderm 18 | andern 19 | anderr 20 | anders 21 | auch 22 | auf 23 | aus 24 | bei 25 | bin 26 | bis 27 | bist 28 | da 29 | damit 30 | dann 31 | der 32 | den 33 | des 34 | dem 35 | die 36 | das 37 | daß 38 | derselbe 39 | derselben 40 | denselben 41 | desselben 42 | demselben 43 | dieselbe 44 | dieselben 45 | dasselbe 46 | dazu 47 | dein 48 | deine 49 | deinem 50 | deinen 51 | deiner 52 | deines 53 | denn 54 | derer 55 | dessen 56 | dich 57 | dir 58 | du 59 | dies 60 | diese 61 | diesem 62 | diesen 63 | dieser 64 | dieses 65 | doch 66 | dort 67 | durch 68 | ein 69 | eine 70 | einem 71 | einen 72 | einer 73 | eines 74 | einig 75 | einige 76 | einigem 77 | einigen 78 | einiger 79 | einiges 80 | einmal 81 | er 82 | ihn 83 | ihm 84 | es 85 | etwas 86 | euer 87 | eure 88 | eurem 89 | euren 90 | eurer 91 | eures 92 | für 93 | gegen 94 | gewesen 95 | hab 96 | habe 97 | haben 98 | hat 99 
| hatte 100 | hatten 101 | hier 102 | hin 103 | hinter 104 | ich 105 | mich 106 | mir 107 | ihr 108 | ihre 109 | ihrem 110 | ihren 111 | ihrer 112 | ihres 113 | euch 114 | im 115 | in 116 | indem 117 | ins 118 | ist 119 | jede 120 | jedem 121 | jeden 122 | jeder 123 | jedes 124 | jene 125 | jenem 126 | jenen 127 | jener 128 | jenes 129 | jetzt 130 | kann 131 | kein 132 | keine 133 | keinem 134 | keinen 135 | keiner 136 | keines 137 | können 138 | könnte 139 | machen 140 | man 141 | manche 142 | manchem 143 | manchen 144 | mancher 145 | manches 146 | mein 147 | meine 148 | meinem 149 | meinen 150 | meiner 151 | meines 152 | mit 153 | muss 154 | musste 155 | nach 156 | nicht 157 | nichts 158 | noch 159 | nun 160 | nur 161 | ob 162 | oder 163 | ohne 164 | sehr 165 | sein 166 | seine 167 | seinem 168 | seinen 169 | seiner 170 | seines 171 | selbst 172 | sich 173 | sie 174 | ihnen 175 | sind 176 | so 177 | solche 178 | solchem 179 | solchen 180 | solcher 181 | solches 182 | soll 183 | sollte 184 | sondern 185 | sonst 186 | über 187 | um 188 | und 189 | uns 190 | unse 191 | unsem 192 | unsen 193 | unser 194 | unses 195 | unter 196 | viel 197 | vom 198 | von 199 | vor 200 | während 201 | war 202 | waren 203 | warst 204 | was 205 | weg 206 | weil 207 | weiter 208 | welche 209 | welchem 210 | welchen 211 | welcher 212 | welches 213 | wenn 214 | werde 215 | werden 216 | wie 217 | wieder 218 | will 219 | wir 220 | wird 221 | wirst 222 | wo 223 | wollen 224 | wollte 225 | würde 226 | würden 227 | zu 228 | zum 229 | zur 230 | zwar 231 | zwischen 232 | -------------------------------------------------------------------------------- /textkit/data/stopwords/hungarian.txt: -------------------------------------------------------------------------------- 1 | a 2 | ahogy 3 | ahol 4 | aki 5 | akik 6 | akkor 7 | alatt 8 | által 9 | általában 10 | amely 11 | amelyek 12 | amelyekben 13 | amelyeket 14 | amelyet 15 | amelynek 16 | ami 17 | amit 18 | amolyan 19 | amíg 20 | amikor 21 | át 22 | abban 23 | ahhoz 24 | annak 25 | arra 26 | arról 27 | az 28 | azok 29 | azon 30 | azt 31 | azzal 32 | azért 33 | aztán 34 | azután 35 | azonban 36 | bár 37 | be 38 | belül 39 | benne 40 | cikk 41 | cikkek 42 | cikkeket 43 | csak 44 | de 45 | e 46 | eddig 47 | egész 48 | egy 49 | egyes 50 | egyetlen 51 | egyéb 52 | egyik 53 | egyre 54 | ekkor 55 | el 56 | elég 57 | ellen 58 | elõ 59 | elõször 60 | elõtt 61 | elsõ 62 | én 63 | éppen 64 | ebben 65 | ehhez 66 | emilyen 67 | ennek 68 | erre 69 | ez 70 | ezt 71 | ezek 72 | ezen 73 | ezzel 74 | ezért 75 | és 76 | fel 77 | felé 78 | hanem 79 | hiszen 80 | hogy 81 | hogyan 82 | igen 83 | így 84 | illetve 85 | ill. 
86 | ill 87 | ilyen 88 | ilyenkor 89 | ison 90 | ismét 91 | itt 92 | jó 93 | jól 94 | jobban 95 | kell 96 | kellett 97 | keresztül 98 | keressünk 99 | ki 100 | kívül 101 | között 102 | közül 103 | legalább 104 | lehet 105 | lehetett 106 | legyen 107 | lenne 108 | lenni 109 | lesz 110 | lett 111 | maga 112 | magát 113 | majd 114 | majd 115 | már 116 | más 117 | másik 118 | meg 119 | még 120 | mellett 121 | mert 122 | mely 123 | melyek 124 | mi 125 | mit 126 | míg 127 | miért 128 | milyen 129 | mikor 130 | minden 131 | mindent 132 | mindenki 133 | mindig 134 | mint 135 | mintha 136 | mivel 137 | most 138 | nagy 139 | nagyobb 140 | nagyon 141 | ne 142 | néha 143 | nekem 144 | neki 145 | nem 146 | néhány 147 | nélkül 148 | nincs 149 | olyan 150 | ott 151 | össze 152 | õ 153 | õk 154 | õket 155 | pedig 156 | persze 157 | rá 158 | s 159 | saját 160 | sem 161 | semmi 162 | sok 163 | sokat 164 | sokkal 165 | számára 166 | szemben 167 | szerint 168 | szinte 169 | talán 170 | tehát 171 | teljes 172 | tovább 173 | továbbá 174 | több 175 | úgy 176 | ugyanis 177 | új 178 | újabb 179 | újra 180 | után 181 | utána 182 | utolsó 183 | vagy 184 | vagyis 185 | valaki 186 | valami 187 | valamint 188 | való 189 | vagyok 190 | van 191 | vannak 192 | volt 193 | voltam 194 | voltak 195 | voltunk 196 | vissza 197 | vele 198 | viszont 199 | volna 200 | -------------------------------------------------------------------------------- /textkit/data/stopwords/italian.txt: -------------------------------------------------------------------------------- 1 | ad 2 | al 3 | allo 4 | ai 5 | agli 6 | all 7 | agl 8 | alla 9 | alle 10 | con 11 | col 12 | coi 13 | da 14 | dal 15 | dallo 16 | dai 17 | dagli 18 | dall 19 | dagl 20 | dalla 21 | dalle 22 | di 23 | del 24 | dello 25 | dei 26 | degli 27 | dell 28 | degl 29 | della 30 | delle 31 | in 32 | nel 33 | nello 34 | nei 35 | negli 36 | nell 37 | negl 38 | nella 39 | nelle 40 | su 41 | sul 42 | sullo 43 | sui 44 | sugli 45 | sull 46 | sugl 47 | sulla 48 | sulle 49 | per 50 | tra 51 | contro 52 | io 53 | tu 54 | lui 55 | lei 56 | noi 57 | voi 58 | loro 59 | mio 60 | mia 61 | miei 62 | mie 63 | tuo 64 | tua 65 | tuoi 66 | tue 67 | suo 68 | sua 69 | suoi 70 | sue 71 | nostro 72 | nostra 73 | nostri 74 | nostre 75 | vostro 76 | vostra 77 | vostri 78 | vostre 79 | mi 80 | ti 81 | ci 82 | vi 83 | lo 84 | la 85 | li 86 | le 87 | gli 88 | ne 89 | il 90 | un 91 | uno 92 | una 93 | ma 94 | ed 95 | se 96 | perché 97 | anche 98 | come 99 | dov 100 | dove 101 | che 102 | chi 103 | cui 104 | non 105 | più 106 | quale 107 | quanto 108 | quanti 109 | quanta 110 | quante 111 | quello 112 | quelli 113 | quella 114 | quelle 115 | questo 116 | questi 117 | questa 118 | queste 119 | si 120 | tutto 121 | tutti 122 | a 123 | c 124 | e 125 | i 126 | l 127 | o 128 | ho 129 | hai 130 | ha 131 | abbiamo 132 | avete 133 | hanno 134 | abbia 135 | abbiate 136 | abbiano 137 | avrò 138 | avrai 139 | avrà 140 | avremo 141 | avrete 142 | avranno 143 | avrei 144 | avresti 145 | avrebbe 146 | avremmo 147 | avreste 148 | avrebbero 149 | avevo 150 | avevi 151 | aveva 152 | avevamo 153 | avevate 154 | avevano 155 | ebbi 156 | avesti 157 | ebbe 158 | avemmo 159 | aveste 160 | ebbero 161 | avessi 162 | avesse 163 | avessimo 164 | avessero 165 | avendo 166 | avuto 167 | avuta 168 | avuti 169 | avute 170 | sono 171 | sei 172 | è 173 | siamo 174 | siete 175 | sia 176 | siate 177 | siano 178 | sarò 179 | sarai 180 | sarà 181 | saremo 182 | sarete 183 | saranno 184 | sarei 185 | saresti 186 | sarebbe 187 | saremmo 188 | 
sareste 189 | sarebbero 190 | ero 191 | eri 192 | era 193 | eravamo 194 | eravate 195 | erano 196 | fui 197 | fosti 198 | fu 199 | fummo 200 | foste 201 | furono 202 | fossi 203 | fosse 204 | fossimo 205 | fossero 206 | essendo 207 | faccio 208 | fai 209 | facciamo 210 | fanno 211 | faccia 212 | facciate 213 | facciano 214 | farò 215 | farai 216 | farà 217 | faremo 218 | farete 219 | faranno 220 | farei 221 | faresti 222 | farebbe 223 | faremmo 224 | fareste 225 | farebbero 226 | facevo 227 | facevi 228 | faceva 229 | facevamo 230 | facevate 231 | facevano 232 | feci 233 | facesti 234 | fece 235 | facemmo 236 | faceste 237 | fecero 238 | facessi 239 | facesse 240 | facessimo 241 | facessero 242 | facendo 243 | sto 244 | stai 245 | sta 246 | stiamo 247 | stanno 248 | stia 249 | stiate 250 | stiano 251 | starò 252 | starai 253 | starà 254 | staremo 255 | starete 256 | staranno 257 | starei 258 | staresti 259 | starebbe 260 | staremmo 261 | stareste 262 | starebbero 263 | stavo 264 | stavi 265 | stava 266 | stavamo 267 | stavate 268 | stavano 269 | stetti 270 | stesti 271 | stette 272 | stemmo 273 | steste 274 | stettero 275 | stessi 276 | stesse 277 | stessimo 278 | stessero 279 | stando 280 | -------------------------------------------------------------------------------- /textkit/data/stopwords/norwegian.txt: -------------------------------------------------------------------------------- 1 | og 2 | i 3 | jeg 4 | det 5 | at 6 | en 7 | et 8 | den 9 | til 10 | er 11 | som 12 | på 13 | de 14 | med 15 | han 16 | av 17 | ikke 18 | ikkje 19 | der 20 | så 21 | var 22 | meg 23 | seg 24 | men 25 | ett 26 | har 27 | om 28 | vi 29 | min 30 | mitt 31 | ha 32 | hadde 33 | hun 34 | nå 35 | over 36 | da 37 | ved 38 | fra 39 | du 40 | ut 41 | sin 42 | dem 43 | oss 44 | opp 45 | man 46 | kan 47 | hans 48 | hvor 49 | eller 50 | hva 51 | skal 52 | selv 53 | sjøl 54 | her 55 | alle 56 | vil 57 | bli 58 | ble 59 | blei 60 | blitt 61 | kunne 62 | inn 63 | når 64 | være 65 | kom 66 | noen 67 | noe 68 | ville 69 | dere 70 | som 71 | deres 72 | kun 73 | ja 74 | etter 75 | ned 76 | skulle 77 | denne 78 | for 79 | deg 80 | si 81 | sine 82 | sitt 83 | mot 84 | å 85 | meget 86 | hvorfor 87 | dette 88 | disse 89 | uten 90 | hvordan 91 | ingen 92 | din 93 | ditt 94 | blir 95 | samme 96 | hvilken 97 | hvilke 98 | sånn 99 | inni 100 | mellom 101 | vår 102 | hver 103 | hvem 104 | vors 105 | hvis 106 | både 107 | bare 108 | enn 109 | fordi 110 | før 111 | mange 112 | også 113 | slik 114 | vært 115 | være 116 | båe 117 | begge 118 | siden 119 | dykk 120 | dykkar 121 | dei 122 | deira 123 | deires 124 | deim 125 | di 126 | då 127 | eg 128 | ein 129 | eit 130 | eitt 131 | elles 132 | honom 133 | hjå 134 | ho 135 | hoe 136 | henne 137 | hennar 138 | hennes 139 | hoss 140 | hossen 141 | ikkje 142 | ingi 143 | inkje 144 | korleis 145 | korso 146 | kva 147 | kvar 148 | kvarhelst 149 | kven 150 | kvi 151 | kvifor 152 | me 153 | medan 154 | mi 155 | mine 156 | mykje 157 | no 158 | nokon 159 | noka 160 | nokor 161 | noko 162 | nokre 163 | si 164 | sia 165 | sidan 166 | so 167 | somt 168 | somme 169 | um 170 | upp 171 | vere 172 | vore 173 | verte 174 | vort 175 | varte 176 | vart 177 | -------------------------------------------------------------------------------- /textkit/data/stopwords/portuguese.txt: -------------------------------------------------------------------------------- 1 | de 2 | a 3 | o 4 | que 5 | e 6 | do 7 | da 8 | em 9 | um 10 | para 11 | com 12 | não 13 | uma 14 | os 15 | no 16 | se 17 | na 18 | por 19 | mais 20 
| as 21 | dos 22 | como 23 | mas 24 | ao 25 | ele 26 | das 27 | à 28 | seu 29 | sua 30 | ou 31 | quando 32 | muito 33 | nos 34 | já 35 | eu 36 | também 37 | só 38 | pelo 39 | pela 40 | até 41 | isso 42 | ela 43 | entre 44 | depois 45 | sem 46 | mesmo 47 | aos 48 | seus 49 | quem 50 | nas 51 | me 52 | esse 53 | eles 54 | você 55 | essa 56 | num 57 | nem 58 | suas 59 | meu 60 | às 61 | minha 62 | numa 63 | pelos 64 | elas 65 | qual 66 | nós 67 | lhe 68 | deles 69 | essas 70 | esses 71 | pelas 72 | este 73 | dele 74 | tu 75 | te 76 | vocês 77 | vos 78 | lhes 79 | meus 80 | minhas 81 | teu 82 | tua 83 | teus 84 | tuas 85 | nosso 86 | nossa 87 | nossos 88 | nossas 89 | dela 90 | delas 91 | esta 92 | estes 93 | estas 94 | aquele 95 | aquela 96 | aqueles 97 | aquelas 98 | isto 99 | aquilo 100 | estou 101 | está 102 | estamos 103 | estão 104 | estive 105 | esteve 106 | estivemos 107 | estiveram 108 | estava 109 | estávamos 110 | estavam 111 | estivera 112 | estivéramos 113 | esteja 114 | estejamos 115 | estejam 116 | estivesse 117 | estivéssemos 118 | estivessem 119 | estiver 120 | estivermos 121 | estiverem 122 | hei 123 | há 124 | havemos 125 | hão 126 | houve 127 | houvemos 128 | houveram 129 | houvera 130 | houvéramos 131 | haja 132 | hajamos 133 | hajam 134 | houvesse 135 | houvéssemos 136 | houvessem 137 | houver 138 | houvermos 139 | houverem 140 | houverei 141 | houverá 142 | houveremos 143 | houverão 144 | houveria 145 | houveríamos 146 | houveriam 147 | sou 148 | somos 149 | são 150 | era 151 | éramos 152 | eram 153 | fui 154 | foi 155 | fomos 156 | foram 157 | fora 158 | fôramos 159 | seja 160 | sejamos 161 | sejam 162 | fosse 163 | fôssemos 164 | fossem 165 | for 166 | formos 167 | forem 168 | serei 169 | será 170 | seremos 171 | serão 172 | seria 173 | seríamos 174 | seriam 175 | tenho 176 | tem 177 | temos 178 | tém 179 | tinha 180 | tínhamos 181 | tinham 182 | tive 183 | teve 184 | tivemos 185 | tiveram 186 | tivera 187 | tivéramos 188 | tenha 189 | tenhamos 190 | tenham 191 | tivesse 192 | tivéssemos 193 | tivessem 194 | tiver 195 | tivermos 196 | tiverem 197 | terei 198 | terá 199 | teremos 200 | terão 201 | teria 202 | teríamos 203 | teriam 204 | -------------------------------------------------------------------------------- /textkit/data/stopwords/russian.txt: -------------------------------------------------------------------------------- 1 | и 2 | в 3 | во 4 | не 5 | что 6 | он 7 | на 8 | я 9 | с 10 | со 11 | как 12 | а 13 | то 14 | все 15 | она 16 | так 17 | его 18 | но 19 | да 20 | ты 21 | к 22 | у 23 | же 24 | вы 25 | за 26 | бы 27 | по 28 | только 29 | ее 30 | мне 31 | было 32 | вот 33 | от 34 | меня 35 | еще 36 | нет 37 | о 38 | из 39 | ему 40 | теперь 41 | когда 42 | даже 43 | ну 44 | вдруг 45 | ли 46 | если 47 | уже 48 | или 49 | ни 50 | быть 51 | был 52 | него 53 | до 54 | вас 55 | нибудь 56 | опять 57 | уж 58 | вам 59 | ведь 60 | там 61 | потом 62 | себя 63 | ничего 64 | ей 65 | может 66 | они 67 | тут 68 | где 69 | есть 70 | надо 71 | ней 72 | для 73 | мы 74 | тебя 75 | их 76 | чем 77 | была 78 | сам 79 | чтоб 80 | без 81 | будто 82 | чего 83 | раз 84 | тоже 85 | себе 86 | под 87 | будет 88 | ж 89 | тогда 90 | кто 91 | этот 92 | того 93 | потому 94 | этого 95 | какой 96 | совсем 97 | ним 98 | здесь 99 | этом 100 | один 101 | почти 102 | мой 103 | тем 104 | чтобы 105 | нее 106 | сейчас 107 | были 108 | куда 109 | зачем 110 | всех 111 | никогда 112 | можно 113 | при 114 | наконец 115 | два 116 | об 117 | другой 118 | хоть 119 | после 120 | над 121 | больше 122 | тот 
123 | через 124 | эти 125 | нас 126 | про 127 | всего 128 | них 129 | какая 130 | много 131 | разве 132 | три 133 | эту 134 | моя 135 | впрочем 136 | хорошо 137 | свою 138 | этой 139 | перед 140 | иногда 141 | лучше 142 | чуть 143 | том 144 | нельзя 145 | такой 146 | им 147 | более 148 | всегда 149 | конечно 150 | всю 151 | между 152 | -------------------------------------------------------------------------------- /textkit/data/stopwords/spanish.txt: -------------------------------------------------------------------------------- 1 | de 2 | la 3 | que 4 | el 5 | en 6 | y 7 | a 8 | los 9 | del 10 | se 11 | las 12 | por 13 | un 14 | para 15 | con 16 | no 17 | una 18 | su 19 | al 20 | lo 21 | como 22 | más 23 | pero 24 | sus 25 | le 26 | ya 27 | o 28 | este 29 | sí 30 | porque 31 | esta 32 | entre 33 | cuando 34 | muy 35 | sin 36 | sobre 37 | también 38 | me 39 | hasta 40 | hay 41 | donde 42 | quien 43 | desde 44 | todo 45 | nos 46 | durante 47 | todos 48 | uno 49 | les 50 | ni 51 | contra 52 | otros 53 | ese 54 | eso 55 | ante 56 | ellos 57 | e 58 | esto 59 | mí 60 | antes 61 | algunos 62 | qué 63 | unos 64 | yo 65 | otro 66 | otras 67 | otra 68 | él 69 | tanto 70 | esa 71 | estos 72 | mucho 73 | quienes 74 | nada 75 | muchos 76 | cual 77 | poco 78 | ella 79 | estar 80 | estas 81 | algunas 82 | algo 83 | nosotros 84 | mi 85 | mis 86 | tú 87 | te 88 | ti 89 | tu 90 | tus 91 | ellas 92 | nosotras 93 | vosostros 94 | vosostras 95 | os 96 | mío 97 | mía 98 | míos 99 | mías 100 | tuyo 101 | tuya 102 | tuyos 103 | tuyas 104 | suyo 105 | suya 106 | suyos 107 | suyas 108 | nuestro 109 | nuestra 110 | nuestros 111 | nuestras 112 | vuestro 113 | vuestra 114 | vuestros 115 | vuestras 116 | esos 117 | esas 118 | estoy 119 | estás 120 | está 121 | estamos 122 | estáis 123 | están 124 | esté 125 | estés 126 | estemos 127 | estéis 128 | estén 129 | estaré 130 | estarás 131 | estará 132 | estaremos 133 | estaréis 134 | estarán 135 | estaría 136 | estarías 137 | estaríamos 138 | estaríais 139 | estarían 140 | estaba 141 | estabas 142 | estábamos 143 | estabais 144 | estaban 145 | estuve 146 | estuviste 147 | estuvo 148 | estuvimos 149 | estuvisteis 150 | estuvieron 151 | estuviera 152 | estuvieras 153 | estuviéramos 154 | estuvierais 155 | estuvieran 156 | estuviese 157 | estuvieses 158 | estuviésemos 159 | estuvieseis 160 | estuviesen 161 | estando 162 | estado 163 | estada 164 | estados 165 | estadas 166 | estad 167 | he 168 | has 169 | ha 170 | hemos 171 | habéis 172 | han 173 | haya 174 | hayas 175 | hayamos 176 | hayáis 177 | hayan 178 | habré 179 | habrás 180 | habrá 181 | habremos 182 | habréis 183 | habrán 184 | habría 185 | habrías 186 | habríamos 187 | habríais 188 | habrían 189 | había 190 | habías 191 | habíamos 192 | habíais 193 | habían 194 | hube 195 | hubiste 196 | hubo 197 | hubimos 198 | hubisteis 199 | hubieron 200 | hubiera 201 | hubieras 202 | hubiéramos 203 | hubierais 204 | hubieran 205 | hubiese 206 | hubieses 207 | hubiésemos 208 | hubieseis 209 | hubiesen 210 | habiendo 211 | habido 212 | habida 213 | habidos 214 | habidas 215 | soy 216 | eres 217 | es 218 | somos 219 | sois 220 | son 221 | sea 222 | seas 223 | seamos 224 | seáis 225 | sean 226 | seré 227 | serás 228 | será 229 | seremos 230 | seréis 231 | serán 232 | sería 233 | serías 234 | seríamos 235 | seríais 236 | serían 237 | era 238 | eras 239 | éramos 240 | erais 241 | eran 242 | fui 243 | fuiste 244 | fue 245 | fuimos 246 | fuisteis 247 | fueron 248 | fuera 249 | fueras 250 | fuéramos 251 | fuerais 252 | fueran 253 | 
fuese 254 | fueses 255 | fuésemos 256 | fueseis 257 | fuesen 258 | sintiendo 259 | sentido 260 | sentida 261 | sentidos 262 | sentidas 263 | siente 264 | sentid 265 | tengo 266 | tienes 267 | tiene 268 | tenemos 269 | tenéis 270 | tienen 271 | tenga 272 | tengas 273 | tengamos 274 | tengáis 275 | tengan 276 | tendré 277 | tendrás 278 | tendrá 279 | tendremos 280 | tendréis 281 | tendrán 282 | tendría 283 | tendrías 284 | tendríamos 285 | tendríais 286 | tendrían 287 | tenía 288 | tenías 289 | teníamos 290 | teníais 291 | tenían 292 | tuve 293 | tuviste 294 | tuvo 295 | tuvimos 296 | tuvisteis 297 | tuvieron 298 | tuviera 299 | tuvieras 300 | tuviéramos 301 | tuvierais 302 | tuvieran 303 | tuviese 304 | tuvieses 305 | tuviésemos 306 | tuvieseis 307 | tuviesen 308 | teniendo 309 | tenido 310 | tenida 311 | tenidos 312 | tenidas 313 | tened 314 | -------------------------------------------------------------------------------- /textkit/data/stopwords/swedish.txt: -------------------------------------------------------------------------------- 1 | och 2 | det 3 | att 4 | i 5 | en 6 | jag 7 | hon 8 | som 9 | han 10 | på 11 | den 12 | med 13 | var 14 | sig 15 | för 16 | så 17 | till 18 | är 19 | men 20 | ett 21 | om 22 | hade 23 | de 24 | av 25 | icke 26 | mig 27 | du 28 | henne 29 | då 30 | sin 31 | nu 32 | har 33 | inte 34 | hans 35 | honom 36 | skulle 37 | hennes 38 | där 39 | min 40 | man 41 | ej 42 | vid 43 | kunde 44 | något 45 | från 46 | ut 47 | när 48 | efter 49 | upp 50 | vi 51 | dem 52 | vara 53 | vad 54 | över 55 | än 56 | dig 57 | kan 58 | sina 59 | här 60 | ha 61 | mot 62 | alla 63 | under 64 | någon 65 | eller 66 | allt 67 | mycket 68 | sedan 69 | ju 70 | denna 71 | själv 72 | detta 73 | åt 74 | utan 75 | varit 76 | hur 77 | ingen 78 | mitt 79 | ni 80 | bli 81 | blev 82 | oss 83 | din 84 | dessa 85 | några 86 | deras 87 | blir 88 | mina 89 | samma 90 | vilken 91 | er 92 | sådan 93 | vår 94 | blivit 95 | dess 96 | inom 97 | mellan 98 | sådant 99 | varför 100 | varje 101 | vilka 102 | ditt 103 | vem 104 | vilket 105 | sitta 106 | sådana 107 | vart 108 | dina 109 | vars 110 | vårt 111 | våra 112 | ert 113 | era 114 | vilkas 115 | -------------------------------------------------------------------------------- /textkit/data/stopwords/turkish.txt: -------------------------------------------------------------------------------- 1 | acaba 2 | ama 3 | aslında 4 | az 5 | bazı 6 | belki 7 | biri 8 | birkaç 9 | birşey 10 | biz 11 | bu 12 | çok 13 | çünkü 14 | da 15 | daha 16 | de 17 | defa 18 | diye 19 | eğer 20 | en 21 | gibi 22 | hem 23 | hep 24 | hepsi 25 | her 26 | hiç 27 | için 28 | ile 29 | ise 30 | kez 31 | ki 32 | kim 33 | mı 34 | mu 35 | mü 36 | nasıl 37 | ne 38 | neden 39 | nerde 40 | nerede 41 | nereye 42 | niçin 43 | niye 44 | o 45 | sanki 46 | şey 47 | siz 48 | şu 49 | tüm 50 | ve 51 | veya 52 | ya 53 | yani 54 | -------------------------------------------------------------------------------- /textkit/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | import nltk 4 | from textkit.utils import read_tokens, output 5 | 6 | 7 | @click.command('download') 8 | def download(): 9 | ''' 10 | Install required libraries. 11 | Note this library will install nltk dependencies into your 12 | user directory. 
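    Concretely, the command looks through every directory on nltk.data.path
    for the averaged_perceptron_tagger, wordnet and punkt packages and
    downloads whichever of them are missing.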
13 | ''' 14 | 15 | click.echo("Installing nltk packages into your user directories in " + 16 | "the following order of existence (first found):\n" + 17 | '\n'.join(nltk.data.path)) 18 | 19 | extensions = [("taggers", "averaged_perceptron_tagger"), 20 | ("corpora", "wordnet"), 21 | ("tokenizers", "punkt")] 22 | 23 | missing = check_packages_exist(extensions) 24 | 25 | for ext_tuple in missing: 26 | nltk.download(ext_tuple[1]) 27 | 28 | 29 | def check_packages_exist(extensions): 30 | ''' 31 | Finds missing nltk extensions. 32 | ''' 33 | paths = nltk.data.path # there are usually quite a few, so we check them all. 34 | missing = [] 35 | for ext_tuple in extensions: 36 | ext_found = False 37 | click.echo(message="Looking for " + ext_tuple[1], nl=True) 38 | for path in paths: 39 | if os.path.exists(os.path.join(path, ext_tuple[0], ext_tuple[1])): 40 | ext_found = True 41 | click.echo(message="Found " + ext_tuple[1], nl=True) 42 | break 43 | if not ext_found: 44 | click.echo(message="Missing " + ext_tuple[1], nl=True) 45 | missing.append(ext_tuple) 46 | 47 | return missing 48 | -------------------------------------------------------------------------------- /textkit/filter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/textkit/filter/__init__.py -------------------------------------------------------------------------------- /textkit/filter/filter_lengths.py: -------------------------------------------------------------------------------- 1 | import click 2 | from textkit.utils import output, read_tokens 3 | 4 | 5 | @click.command() 6 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 7 | @click.option('-m', '--minimum', default=3, 8 | help='Minimum length of token to not filter.', show_default=True) 9 | def filterlengths(minimum, tokens): 10 | '''Remove tokens that are shorter then the minimum length provided.''' 11 | content = read_tokens(tokens) 12 | [output(token) for token in content if len(token) >= minimum] 13 | -------------------------------------------------------------------------------- /textkit/filter/filter_punc.py: -------------------------------------------------------------------------------- 1 | from string import punctuation 2 | import click 3 | from textkit.utils import output, read_tokens 4 | 5 | 6 | @click.command() 7 | # @click.option('--out', type=click.File('w'), default='-', 8 | # help='Optional output file. 
Defaults to standard out.') 9 | # @click.option('--punctuation', default=punctuation, 10 | # help='String indicating punctuation to check for.') 11 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 12 | def filterpunc(tokens): 13 | '''Remove tokens that are only punctuation from a list of tokens.''' 14 | content = read_tokens(tokens) 15 | [output(token) for token in content if token not in punctuation] 16 | -------------------------------------------------------------------------------- /textkit/filter/filter_words.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | from textkit.utils import output, read_tokens, data_item 4 | 5 | def get_stopwords(stopword_name): 6 | path = data_item('/stopwords/' + stopword_name + '.txt') 7 | stopwords = [] 8 | with open(path) as filename: 9 | stopwords = read_tokens(filename) 10 | return stopwords 11 | 12 | 13 | @click.command() 14 | @click.option('-l', '--language', type=click.Choice(['english', 'german', 'danish','dutch','finnish','french','hungarian','italian','norwegian','portuguese','russian','spanish','swedish','turkish']), 15 | default='english') 16 | @click.option('--custom', type=click.File('r'), 17 | help='Optional token file of additional tokens to remove ' + 18 | 'along with selected stop words.') 19 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 20 | def filterwords(language, custom, tokens): 21 | '''Remove stop words from tokens, returning tokens without stop words.''' 22 | content = read_tokens(tokens) 23 | stopwords = get_stopwords(language) 24 | if custom: 25 | stopwords = stopwords + read_tokens(custom) 26 | 27 | [output(token) for token in content 28 | if token.lower() not in stopwords] 29 | 30 | 31 | @click.command() 32 | @click.option('-l', '--language', type=click.Choice(['english', 'german']), 33 | default='english') 34 | def showstops(language): 35 | '''Display stop words used by textkit for a given language.''' 36 | stopwords = get_stopwords(language) 37 | 38 | [output(token) for token in stopwords] 39 | -------------------------------------------------------------------------------- /textkit/package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/textkit/package/__init__.py -------------------------------------------------------------------------------- /textkit/package/texts_to_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections import OrderedDict 3 | import click 4 | from textkit.utils import output, read_tokens 5 | 6 | def read_names(names_path): 7 | names = [] 8 | if names_path: 9 | names_doc = open(names_path, 'r') 10 | names = read_tokens(names_doc) 11 | return names 12 | 13 | 14 | @click.command() 15 | @click.argument('text_docs', type=click.Path(exists=True), nargs=-1) 16 | @click.option('--ids', type=click.Path(), 17 | help="File with one id per text document, each separated by a " + 18 | "new line. Ids file is used to set the id attribute in the " + 19 | "output JSON.") 20 | @click.option('--names', type=click.Path(), 21 | help="File with one name per text document, each separated " + 22 | "by a new line. 
Names file is used to set the name attribute " + 23 | "in the output JSON.") 24 | @click.option('--field', default='text', help="Attribute name where text " + 25 | "will be stored in the document object.", show_default=True) 26 | 27 | def texts2json(ids, names, field, text_docs): 28 | '''Convert a set of text documents into a 29 | JSON array of document objects.''' 30 | 31 | docs = [] 32 | 33 | names = read_names(names) 34 | ids = read_names(ids) 35 | 36 | for idx, path in enumerate(text_docs): 37 | tokens_doc = open(path, 'r') 38 | content = "" 39 | with click.open_file(path): 40 | content = tokens_doc.read() 41 | 42 | # ordered so that these attributes stay at the top 43 | doc = OrderedDict() 44 | 45 | if idx < len(ids): 46 | doc['id'] = ids[idx] 47 | else: 48 | doc['id'] = path 49 | 50 | if idx < len(names): 51 | doc['name'] = names[idx] 52 | else: 53 | doc['name'] = path 54 | 55 | doc[field] = content 56 | docs.append(doc) 57 | tokens_doc.close() 58 | 59 | out_content = json.dumps(docs, indent=2) 60 | output(out_content) 61 | -------------------------------------------------------------------------------- /textkit/package/tokens_to_json.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | from collections import OrderedDict 4 | import click 5 | from textkit.utils import output, read_tokens, read_csv 6 | from textkit.coerce import coerce_types 7 | 8 | 9 | def read_names(names_path): 10 | names = [] 11 | if names_path: 12 | names_doc = open(names_path, 'r') 13 | names = read_tokens(names_doc) 14 | return names 15 | 16 | 17 | @click.command() 18 | @click.argument('token_docs', type=click.Path(exists=True), nargs=-1) 19 | @click.option('--ids', type=click.Path(), 20 | help="File with one id per token document, each separated " + 21 | "by a new line. Ids file is used to set the id attribute in " + 22 | "the output JSON.") 23 | @click.option('--names', type=click.Path(), 24 | help="File with one name per token document, each separated " + 25 | "by a new line. Names file is used to set the name attribute " + 26 | "in the output JSON.") 27 | @click.option('--field', default='tokens', help="Attribute name where " + 28 | "tokens will be stored in the document object.", 29 | show_default=True) 30 | @click.option('--split/--no-split', default=False, help="If enabled, " + 31 | "textkit will attempt to split input columns when " + 32 | "packaging. This is useful when packaging multiple column " + 33 | "output like counts.", 34 | show_default=True) 35 | @click.option('-s', '--sep', default=',', help="Separator character between " + 36 | "columns. 
Only used if the --split flag is used.", 37 | show_default=True) 38 | def tokens2json(ids, names, field, split, sep, token_docs): 39 | '''Convert a set of token documents into a 40 | JSON array of document objects.''' 41 | 42 | docs = [] 43 | 44 | names = read_names(names) 45 | ids = read_names(ids) 46 | 47 | for idx, path in enumerate(token_docs): 48 | if path == '-': 49 | tokens_doc = sys.stdin 50 | else: 51 | tokens_doc = open(path, 'r') 52 | if split: 53 | content = read_csv(tokens_doc, sep) 54 | content = coerce_types(content) 55 | else: 56 | content = read_tokens(tokens_doc) 57 | 58 | # ordered so that these attributes stay at the top 59 | doc = OrderedDict() 60 | 61 | if idx < len(ids): 62 | doc['id'] = ids[idx] 63 | else: 64 | doc['id'] = path 65 | 66 | if idx < len(names): 67 | doc['name'] = names[idx] 68 | else: 69 | doc['name'] = path 70 | 71 | doc[field] = content 72 | docs.append(doc) 73 | tokens_doc.close() 74 | 75 | out_content = json.dumps(docs, indent=2) 76 | output(out_content) 77 | -------------------------------------------------------------------------------- /textkit/package/tokens_to_text.py: -------------------------------------------------------------------------------- 1 | import click 2 | from textkit.utils import read_tokens, output 3 | 4 | 5 | @click.command() 6 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 7 | @click.option('-s', '--sep', default=' ', 8 | help='Separator placed between tokens in the output.', 9 | show_default=True) 10 | def tokens2text(sep, tokens): 11 | '''Combine tokens in a token document into a single text file.''' 12 | 13 | content = read_tokens(tokens) 14 | out = sep.join(content) 15 | output(out) 16 | -------------------------------------------------------------------------------- /textkit/tokenize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/textkit/tokenize/__init__.py -------------------------------------------------------------------------------- /textkit/tokenize/bigrams.py: -------------------------------------------------------------------------------- 1 | import click 2 | import nltk 3 | from textkit.utils import output, read_tokens 4 | 5 | 6 | @click.command() 7 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 8 | @click.option('-s', '--sep', default=' ', 9 | help='Separator between words in bigram output.', 10 | show_default=True) 11 | def words2bigrams(sep, tokens): 12 | '''Tokenize words into bigrams. Bigrams are two word tokens. 
13 | Punctuation is considered as a separate token.''' 14 | 15 | content = read_tokens(tokens) 16 | bigrams = [] 17 | try: 18 | bigrams = list(nltk.bigrams(content)) 19 | except LookupError as err: 20 | click.echo(message="Error with tokenization", nl=True) 21 | click.echo(message="Have you run \"textkit download\"?", nl=True) 22 | click.echo(message="\nOriginal Error:", nl=True) 23 | click.echo(err) 24 | [output(sep.join(bigram)) for bigram in bigrams] 25 | -------------------------------------------------------------------------------- /textkit/tokenize/ngrams.py: -------------------------------------------------------------------------------- 1 | import click 2 | import nltk 3 | from textkit.utils import write_csv, read_tokens 4 | 5 | 6 | @click.command('words2ngrams') 7 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 8 | @click.option('-s', '--sep', default=' ', 9 | help='Separator between words in ngram output.', 10 | show_default=True) 11 | @click.option('-n', '--num', default=2, 12 | help='Length of the n-gram', 13 | show_default=True) 14 | def words2ngrams(sep, num, tokens): 15 | '''Convert word tokens into ngrams. ngrams are n-length word tokens. 16 | Punctuation is considered as a separate token.''' 17 | 18 | content = read_tokens(tokens) 19 | ngrams = list(nltk.ngrams(content, num)) 20 | write_csv(ngrams, str(sep)) 21 | 22 | 23 | @click.command('text2ngrams') 24 | @click.argument('text', type=click.Path(exists=True), nargs=-1) 25 | @click.option('-s', '--sep', default=' ', 26 | help='Separator between words in ngram output.', 27 | show_default=True) 28 | @click.option('-n', '--num', default=2, 29 | help='Length of the n-gram', 30 | show_default=True) 31 | def text2ngrams(sep, num, text): 32 | '''Tokenize plain text into ngrams. ngrams are n-length word tokens. 33 | Punctuation is considered as a separate token.''' 34 | content = '\n'.join([open(f).read() for f in text]) 35 | try: 36 | tokens = nltk.word_tokenize(content) 37 | ngrams = list(nltk.ngrams(tokens, num)) 38 | write_csv(ngrams, str(sep)) 39 | except LookupError as err: 40 | click.echo(message="Error with tokenization", nl=True) 41 | click.echo(message="Have you run \"textkit download\"?", nl=True) 42 | click.echo(message="\nOriginal Error:", nl=True) 43 | click.echo(err) 44 | -------------------------------------------------------------------------------- /textkit/tokenize/punc.py: -------------------------------------------------------------------------------- 1 | import re 2 | from string import punctuation 3 | import click 4 | from textkit.utils import output 5 | 6 | @click.command() 7 | @click.argument('text', type=click.Path(exists=True), nargs=-1) 8 | def text2punc(text): 9 | '''Tokenize text into punctuation tokens. 
10 | Words and numbers are removed, leaving only punctuation.''' 11 | 12 | # from: http://stackoverflow.com/questions/17485092/how-to-just-keep-punctuation-with-a-string-in-python 13 | 14 | content = '\n'.join([open(f).read() for f in text]) 15 | out = re.sub(r'[^{}]+'.format(punctuation), ' ', content) 16 | out = out.split() 17 | [output(p) for p in out] 18 | -------------------------------------------------------------------------------- /textkit/tokenize/sentences.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import sent_tokenize 2 | import click 3 | from textkit.utils import output 4 | 5 | @click.command() 6 | @click.argument('text', type=click.Path(exists=True), nargs=-1) 7 | def text2sentences(text): 8 | '''Tokenize text into sentence tokens.''' 9 | content = '\n'.join([open(f).read() for f in text]) 10 | sentences = [] 11 | try: 12 | sentences = sent_tokenize(content) 13 | except LookupError as err: 14 | click.echo(message="Error with tokenization", nl=True) 15 | click.echo(message="Have you run \"textkit download\"?", nl=True) 16 | click.echo(message="\nOriginal Error:", nl=True) 17 | click.echo(err) 18 | [output(s.strip()) for s in sentences] 19 | -------------------------------------------------------------------------------- /textkit/tokenize/words.py: -------------------------------------------------------------------------------- 1 | import click 2 | import nltk 3 | from textkit.utils import output 4 | 5 | 6 | @click.command() 7 | @click.argument('text', type=click.Path(exists=True), nargs=-1) 8 | def text2words(text): 9 | '''Tokenize text into word tokens. 10 | Punctuation is considered as a separate token.''' 11 | content = '\n'.join([open(f).read() for f in text]) 12 | tokens = [] 13 | try: 14 | tokens = nltk.word_tokenize(content) 15 | except LookupError as err: 16 | click.echo(message="Error with tokenization", nl=True) 17 | click.echo(message="Have you run \"textkit download\"?", nl=True) 18 | click.echo(message="\nOriginal Error:", nl=True) 19 | click.echo(err) 20 | [output(token) for token in tokens] 21 | -------------------------------------------------------------------------------- /textkit/transform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learntextvis/textkit/6c482ea7d378022e10ee1002f9d9db7925856294/textkit/transform/__init__.py -------------------------------------------------------------------------------- /textkit/transform/newlines.py: -------------------------------------------------------------------------------- 1 | import click 2 | import re 3 | from textkit.utils import output 4 | 5 | 6 | @click.command() 7 | @click.argument('text', type=click.Path(exists=True), nargs=-1) 8 | def nonewlines(text): 9 | '''Remove newlines from a text file.''' 10 | content = '\n'.join([open(f).read() for f in text]) 11 | content = re.sub('\n|\r\n|\r', ' ', content).strip() 12 | output(content) 13 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_counts.py: -------------------------------------------------------------------------------- 1 | import click 2 | from collections import defaultdict 3 | from textkit.utils import read_tokens, write_csv 4 | 5 | 6 | @click.command('tokens2counts') 7 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 8 | @click.option('-s', '--sep', default=',', 9 | help='Separator between token and count in output.', 
10 | show_default=True) 11 | @click.option('--limit', default=-1, type=click.INT, 12 | help='Only output the top N most frequent tokens') 13 | def tokens2counts(sep, limit, tokens): 14 | '''Count unique tokens in a list of tokens. 15 | Tokens are sorted by top counts.''' 16 | content = read_tokens(tokens) 17 | counts = sort_counts(get_counts(content)) 18 | 19 | # we want the argument type to be an INT - but python only 20 | # has support for a float infinity. So if the limit is negative, 21 | # it becomes infinite 22 | if limit < 0: 23 | limit = float('inf') 24 | 25 | # using csv writer to ensure proper encoding of the separator. 26 | rows = [list(map(str, vals)) for ind, vals in enumerate(counts) if ind < limit] 27 | write_csv(rows, str(sep)) 28 | 29 | 30 | def get_counts(tokens): 31 | '''Count unique tokens in a list''' 32 | counts = defaultdict(int) 33 | for token in tokens: 34 | counts[token] += 1 35 | return counts 36 | 37 | 38 | def sort_counts(counts): 39 | '''Sort a dict of counts by count, returning a list of (token, count) tuples.''' 40 | return sorted(counts.items(), key=lambda count: count[1], reverse=True) 41 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_lower.py: -------------------------------------------------------------------------------- 1 | import click 2 | from textkit.utils import read_tokens, output 3 | 4 | 5 | @click.command() 6 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 7 | def tokens2lower(tokens): 8 | '''Transform all tokens to lowercase.''' 9 | content = read_tokens(tokens) 10 | [output(token.lower()) for token in content] 11 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_pos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | import nltk 4 | from textkit.utils import write_csv, read_tokens, data_item 5 | 6 | 7 | @click.command('tokens2pos') 8 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 9 | @click.option('-s', '--sep', default=',', 10 | help='Separator between words in the output.', 11 | show_default=True) 12 | def tokens2pos(sep, tokens): 13 | '''Tokenize words into their parts of speech. Output contains the 14 | word token followed by its part-of-speech tag, separated by the 15 | character specified by --sep. 
16 | ''' 17 | 18 | content = read_tokens(tokens) 19 | nltk.data.path.append(data_item()) 20 | tags = nltk.pos_tag(content) 21 | write_csv(tags, str(sep)) 22 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_stem.py: -------------------------------------------------------------------------------- 1 | import click 2 | from collections import OrderedDict 3 | from nltk.stem import PorterStemmer 4 | from nltk.stem.lancaster import LancasterStemmer 5 | from nltk.stem.snowball import EnglishStemmer 6 | from nltk.stem.wordnet import WordNetLemmatizer 7 | from textkit.utils import read_tokens, output 8 | 9 | ALGOS = OrderedDict([ 10 | ('porter', PorterStemmer), 11 | ('lancaster', LancasterStemmer), 12 | ('snowball', EnglishStemmer), 13 | ('wordnet', WordNetLemmatizer) 14 | ]) 15 | 16 | 17 | @click.command() 18 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 19 | @click.option('-a', '--algorithm', type=click.Choice(list(ALGOS.keys())), 20 | default=list(ALGOS.keys())[0], 21 | help='Specify which stemming algorithm to use.', 22 | show_default=True) 23 | def tokens2stem(tokens, algorithm): 24 | '''Stem a list of tokens to get their root.''' 25 | content = read_tokens(tokens) 26 | stemmer = ALGOS[algorithm]() 27 | 28 | if algorithm == 'wordnet': 29 | for token in content: 30 | output(stemmer.lemmatize(token)) 31 | else: 32 | for token in content: 33 | output(stemmer.stem(token)) 34 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_top_bigrams.py: -------------------------------------------------------------------------------- 1 | import click 2 | import nltk 3 | from collections import OrderedDict 4 | from textkit.utils import read_tokens, write_csv 5 | 6 | 7 | MEASURES = OrderedDict([ 8 | ('likelihood', nltk.collocations.BigramAssocMeasures.likelihood_ratio), 9 | ('chi_sq', nltk.collocations.BigramAssocMeasures.chi_sq), 10 | ('pmi', nltk.collocations.BigramAssocMeasures.pmi), 11 | ('student_t', nltk.collocations.BigramAssocMeasures.student_t), 12 | ('freq', nltk.collocations.BigramAssocMeasures.raw_freq) 13 | ]) 14 | 15 | 16 | @click.command('tokens2topbigrams') 17 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 18 | @click.option('-s', '--sep', default=',', 19 | help='Separator between tokens and scores in output.', 20 | show_default=True) 21 | @click.option('-m', '--measure', type=click.Choice(list(MEASURES.keys())), 22 | default=list(MEASURES.keys())[0], 23 | help='Specify which measure to use to define interestingness.', 24 | show_default=True) 25 | @click.option('--freq', default=2, 26 | help='Minimum frequency a bi-gram must have to be kept.', 27 | show_default=True) 28 | @click.option('--scores/--no-scores', default=True, 29 | help='Include or exclude scores in output.', 30 | show_default=True) 31 | def tokens2topbigrams(sep, measure, freq, scores, tokens): 32 | '''Find the most interesting bi-grams in a token document. 33 | Uses the --measure argument to determine what measure to use to define 34 | 'interesting'. 
35 | ''' 36 | 37 | content = read_tokens(tokens) 38 | bcf = nltk.collocations.BigramCollocationFinder.from_words(content) 39 | bcf.apply_freq_filter(freq) 40 | 41 | nltk_measure = MEASURES[measure] 42 | bigrams = bcf.score_ngrams(nltk_measure) 43 | 44 | out = [b[0] for b in bigrams] 45 | if scores: 46 | out = [b[0] + tuple([str(b[1])]) for b in bigrams] 47 | write_csv(out, str(sep)) 48 | -------------------------------------------------------------------------------- /textkit/transform/tokens_to_upper.py: -------------------------------------------------------------------------------- 1 | import click 2 | from textkit.utils import read_tokens, output 3 | 4 | 5 | @click.command() 6 | @click.argument('tokens', type=click.File('r'), default=click.open_file('-')) 7 | def tokens2upper(tokens): 8 | '''Transform all tokens to uppercase.''' 9 | content = read_tokens(tokens) 10 | [output(token.upper()) for token in content] 11 | -------------------------------------------------------------------------------- /textkit/transform/transliterate.py: -------------------------------------------------------------------------------- 1 | import click 2 | from textkit.utils import output 3 | from unidecode import unidecode 4 | import chardet 5 | 6 | @click.command() 7 | @click.argument('file', type=click.File('r'), default=click.open_file('-')) 8 | def transliterate(file): 9 | '''Convert international text to ASCII.''' 10 | content = ''.join(file.readlines()) 11 | try: 12 | content = content.decode(chardet.detect(content)['encoding']) 13 | except AttributeError: 14 | # Strings do not have a decode method in python 3. 15 | pass 16 | [output(unidecode(content).encode('ascii', 'ignore'))] 17 | -------------------------------------------------------------------------------- /textkit/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | import sys 4 | import csv 5 | from pkg_resources import resource_filename 6 | 7 | 8 | def read_tokens(file): 9 | '''Reads tokens from a file handle''' 10 | lines = file.readlines() 11 | return [l.rstrip('\n') for l in lines if len(l.rstrip('\n')) > 0] 12 | 13 | 14 | def read_csv(file, delim): 15 | '''Reads CSV-formatted tokens from a file handle''' 16 | lines = [] 17 | reader = csv.reader(file, delimiter=delim) 18 | lines = [line for line in reader] 19 | return lines 20 | 21 | 22 | def write_csv(rows, delim): 23 | writer = csv.writer(click.get_text_stream('stdout'), delimiter=delim, lineterminator='\n') 24 | try: 25 | [writer.writerow(row) for row in rows] 26 | except (OSError, IOError): 27 | sys.stderr.close() 28 | 29 | 30 | def output(line): 31 | try: 32 | click.echo(line) 33 | except (OSError, IOError): 34 | sys.stderr.close() 35 | 36 | 37 | def data_item(search_path=''): 38 | path = resource_filename(__name__, 'data/' + search_path) 39 | return path 40 | --------------------------------------------------------------------------------
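A minimal usage sketch, not taken from the repository above, showing how the pure helpers defined in textkit/transform/tokens_to_counts.py compose; it assumes the textkit package is installed so the module is importable under the path shown, and the token list is illustrative only.

from textkit.transform.tokens_to_counts import get_counts, sort_counts

# Illustrative tokens; in the CLI these would come from read_tokens() on stdin.
tokens = ["the", "cat", "sat", "on", "the", "mat", "the"]

# get_counts builds a token -> count mapping; sort_counts orders it by count, descending.
counts = sort_counts(get_counts(tokens))
print(counts[0])   # ('the', 3) -- the unique most frequent token comes first
print(counts[:2])  # a top-N slice like this is what the --limit option of tokens2counts emits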