├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── README ├── README.rst ├── docs ├── Makefile ├── _build │ ├── doctrees │ │ ├── disqus_jnlp.html.doctree │ │ ├── environment.pickle │ │ └── index.doctree │ └── html │ │ ├── .buildinfo │ │ ├── _sources │ │ ├── disqus_jnlp.html.txt │ │ └── index.txt │ │ ├── _static │ │ ├── ajax-loader.gif │ │ ├── basic.css │ │ ├── comment-bright.png │ │ ├── comment-close.png │ │ ├── comment.png │ │ ├── default.css │ │ ├── dialog-note.png │ │ ├── dialog-seealso.png │ │ ├── dialog-topic.png │ │ ├── dialog-warning.png │ │ ├── doctools.js │ │ ├── down-pressed.png │ │ ├── down.png │ │ ├── epub.css │ │ ├── file.png │ │ ├── footerbg.png │ │ ├── headerbg.png │ │ ├── ie6.css │ │ ├── jquery.js │ │ ├── middlebg.png │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── pyramid.css │ │ ├── searchtools.js │ │ ├── sidebar.js │ │ ├── transparent.gif │ │ ├── underscore.js │ │ ├── up-pressed.png │ │ ├── up.png │ │ └── websupport.js │ │ ├── disqus_jnlp.html.html │ │ ├── genindex.html │ │ ├── index.html │ │ ├── objects.inv │ │ ├── search.html │ │ └── searchindex.js ├── conf.py ├── disqus_jnlp.html.rst ├── index.rst └── make.bat ├── scripts └── vcabocha.py ├── setup.py └── src ├── jNlp ├── __init__.py ├── aquisition │ ├── OpenSubtitles.py │ ├── SubtitleDatabase.py │ ├── __init__.py │ ├── aquire.py │ ├── download_subs.xml │ └── movies.txt ├── callunix.py ├── data │ ├── JapaneseSentiWordNet.txt │ ├── __init__.py │ ├── chasen_pos.txt │ ├── hiraganaChart.txt │ └── katakanaChart.txt ├── eProcessing.py ├── edict_search_monash │ ├── __init__.py │ ├── ambiguous_words.p │ ├── edict_examples.p │ ├── edict_examples.py │ └── edict_search.py ├── jCabocha.py ├── jColor.py ├── jConvert.py ├── jProcessing.py ├── jSentiments.py ├── jTokenize.py ├── summarize.py ├── url2text.py └── vcabocha.py └── jProcessing.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt └── top_level.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.*[#~] 2 | #.*?# 3 | upload.sh 4 | push.sh 5 | *.pyc 6 | dist/ 7 | build/ 8 | src/jNlp/*.p 9 | src/jNlp/_dicts/ 10 | src/jNlp/classifiers/ 11 | runsetup.py 12 | src/jNlp/_corpora -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2011, Pulkit Kathuria 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 18 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 19 | # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 20 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 25 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | # POSSIBILITY OF SUCH DAMAGE. 27 | 28 | 29 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft src 2 | prune src/jNlp/.git 3 | exclude src/jNlp/jnlp/upload.sh 4 | exclude push.sh 5 | exclude src/jNlp/*.p 6 | prune src/jNlp/_dicts 7 | prune src/jNlp/_corpora 8 | prune src/jNlp/classifiers 9 | exclude runsetup.py 10 | 11 | 12 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ==================== 2 | Japanese NLP Library 3 | ==================== 4 | 5 | 6 | Requirements 7 | ============ 8 | 9 | - Third Party Dependencies 10 | 11 | - Cabocha Japanese Morphological parser http://sourceforge.net/projects/cabocha/ 12 | 13 | - Python Dependencies 14 | 15 | - ``Python 2.6.*`` or above 16 | 17 | 18 | ``Links`` 19 | --------- 20 | 21 | - All code at jProcessing Repo GitHub_ 22 | 23 | .. _GitHub: https://github.com/kevincobain2000/jProcessing 24 | 25 | - Documentation_ and HomePage_ and Sphinx_ 26 | 27 | .. _Documentation: http://www.jaist.ac.jp/~s1010205/jnlp 28 | 29 | .. _HomePage: http://www.jaist.ac.jp/~s1010205/ 30 | 31 | .. _Sphinx: http://readthedocs.org/docs/jprocessing/en/latest/ 32 | 33 | 34 | - PyPi_ Python Package 35 | 36 | .. _PyPi: http://pypi.python.org/pypi/jProcessing/0.1 37 | 38 | :: 39 | 40 | clone git@github.com:kevincobain2000/jProcessing.git 41 | 42 | 43 | ``Install`` 44 | ----------- 45 | 46 | In ``Terminal`` :: 47 | 48 | >>>bash$ python setup.py install 49 | 50 | History 51 | ------- 52 | 53 | - ``0.2`` 54 | 55 | + Sentiment Analysis of Japanese Text 56 | 57 | - ``0.1`` 58 | + Morphologically Tokenize Japanese Sentence 59 | + Kanji / Hiragana / Katakana to Romaji Converter 60 | + Edict Dictionary Search - borrowed 61 | + Edict Examples Search - incomplete 62 | + Sentence Similarity between two JP Sentences 63 | + Run Cabocha(ISO--8859-1 configured) in Python. 64 | + Longest Common String between Sentences 65 | + Kanji to Katakana Pronunciation 66 | + Hiragana, Katakana Chart Parser 67 | 68 | Contacts 69 | ======== 70 | 71 | - ContactForm_ 72 | - BugReport_ 73 | - Contribute_ 74 | 75 | .. _ContactForm: http://www.jaist.ac.jp/~s1010205/styled-2/index.html 76 | .. _BugReport: http://www.jaist.ac.jp/~s1010205/styled/index.html 77 | .. _Contribute: https://github.com/kevincobain2000/jProcessing 78 | 79 | :Author: `pulkit[at]jaist.ac.jp` [change ``at`` with ``@``] 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 | 4 | 5 | 6 | 7 | .. raw:: html 8 | 9 |
Back to Home
10 |
11 | ====================
12 | Japanese NLP Library
13 | ====================
14 |
15 |
16 | .. sectnum::
17 | .. contents::
18 |
19 | Requirements
20 | ============
21 |
22 | - Third Party Dependencies
23 |
24 |   - Cabocha Japanese Morphological parser http://sourceforge.net/projects/cabocha/
25 |
26 | - Python Dependencies
27 |
28 |   - ``Python 2.6.*`` or above
29 |
30 |
31 | ``Links``
32 | ---------
33 |
34 | - All code at jProcessing Repo GitHub_
35 |
36 | .. _GitHub: https://github.com/kevincobain2000/jProcessing
37 |
38 | - Documentation_ and HomePage_ and Sphinx_
39 |
40 | .. _Documentation: http://www.jaist.ac.jp/~s1010205/jnlp
41 |
42 | .. _HomePage: http://www.jaist.ac.jp/~s1010205/
43 |
44 | .. _Sphinx: http://readthedocs.org/docs/jprocessing/en/latest/
45 |
46 |
47 | - PyPi_ Python Package
48 |
49 | .. _PyPi: http://pypi.python.org/pypi/jProcessing/0.1
50 |
51 | ::
52 |
53 |    clone git@github.com:kevincobain2000/jProcessing.git
54 |
55 |
56 | ``Install``
57 | -----------
58 |
59 | In ``Terminal`` ::
60 |
61 |    bash$ python setup.py install
62 |
63 | History
64 | -------
65 |
66 | - ``0.2``
67 |
68 |   + Sentiment Analysis of Japanese Text
69 |
70 | - ``0.1``
71 |   + Morphologically Tokenize Japanese Sentence
72 |   + Kanji / Hiragana / Katakana to Romaji Converter
73 |   + Edict Dictionary Search - borrowed
74 |   + Edict Examples Search - incomplete
75 |   + Sentence Similarity between two JP Sentences
76 |   + Run Cabocha (ISO-8859-1 configured) in Python.
77 |   + Longest Common String between Sentences
78 |   + Kanji to Katakana Pronunciation
79 |   + Hiragana, Katakana Chart Parser
80 |
81 | Libraries and Modules
82 | =====================
83 |
84 | Tokenize ``jTokenize.py``
85 | -------------------------
86 | In ``Python`` ::
87 |
88 |    >>> from jNlp.jTokenize import jTokenize
89 |    >>> input_sentence = u'私は彼を5日前、つまりこの前の金曜日に駅で見かけた'
90 |    >>> list_of_tokens = jTokenize(input_sentence)
91 |    >>> print list_of_tokens
92 |    >>> print '--'.join(list_of_tokens).encode('utf-8')
93 |
94 | Returns:
95 |
96 | ::
97 |
98 |    ... [u'\u79c1', u'\u306f', u'\u5f7c', u'\u3092', u'\uff15'...]
99 |    ... 私--は--彼--を--5--日--前--、--つまり--この--前--の--金曜日--に--駅--で--見かけ--た
100 |
101 | Katakana Pronunciation:
102 |
103 | ::
104 |
105 |    >>> print '--'.join(jReads(input_sentence)).encode('utf-8')
106 |    ... ワタシ--ハ--カレ--ヲ--ゴ--ニチ--マエ--、--ツマリ--コノ--マエ--ノ--キンヨウビ--ニ--エキ--デ--ミカケ--タ
107 |
108 |
109 | Cabocha ``jCabocha.py``
110 | -----------------------
111 |
112 | Run Cabocha_ with its original ``EUCJP`` or ``ISO-8859-1`` configured encoding, from ``utf8`` Python
113 |
114 | .. _Cabocha: http://code.google.com/p/cabocha/
115 |
116 | - If cabocha is configured as ``utf8`` then see this http://nltk.googlecode.com/svn/trunk/doc/book-jp/ch12.html#cabocha
117 |
118 | .. code-block:: python
119 |
120 |    >>> from jNlp.jCabocha import cabocha
121 |    >>> print cabocha(input_sentence).encode('utf-8')
122 |
123 | Output:
124 |
125 | .. code-block:: xml
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 | Kanji / Katakana / Hiragana to Tokenized Romaji ``jConvert.py``
146 | ---------------------------------------------------------------
147 |
148 | Uses ``data/katakanaChart.txt`` and parses the chart. See katakanaChart_. A rough sketch of the chart-parsing idea appears at the end of this section.
149 |
150 | .. code-block:: python
151 |
152 |    >>> from jNlp.jConvert import *
153 |    >>> input_sentence = u'気象庁が21日午前4時48分、発表した天気概況によると、'
154 |    >>> print ' '.join(tokenizedRomaji(input_sentence))
155 |    >>> print tokenizedRomaji(input_sentence)
156 |
157 | .. code-block:: python
158 |
159 |    ...kisyoutyou ga ni ichi nichi gozen yon ji yon hachi hun hapyou si ta tenki gaikyou ni yoru to
160 |    ...[u'kisyoutyou', u'ga', u'ni', u'ichi', u'nichi', u'gozen',...]
161 |
162 |
163 | **katakanaChart.txt**
164 |
165 |
166 | .. _katakanaChart:
167 |
168 | - katakanaChartFile_ and hiraganaChartFile_
169 |
170 | .. _katakanaChartFile: https://raw.github.com/kevincobain2000/jProcessing/master/src/jNlp/data/katakanaChart.txt
171 |
172 | .. _hiraganaChartFile: https://raw.github.com/kevincobain2000/jProcessing/master/src/jNlp/data/hiraganaChart.txt
173 |
174 |
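The chart parsing mentioned above can be pictured as follows. This is only a rough, hypothetical sketch under an assumed chart layout (one consonant per row, one kana per vowel column, ``-`` marking the plain-vowel row); the real ``jConvert.py`` parser, the actual layout of ``katakanaChart.txt``, and the helper names ``parse_kana_chart`` / ``to_romaji`` are not taken from the library.

.. code-block:: python

   # -*- coding: utf-8 -*-
   # Hypothetical sketch only; the real chart format and jConvert code may differ.
   import codecs

   def parse_kana_chart(path, vowels=(u'a', u'i', u'u', u'e', u'o')):
       """Assumed row layout: '<consonant> <kana_a> <kana_i> <kana_u> <kana_e> <kana_o>'."""
       kana2romaji = {}
       for line in codecs.open(path, 'r', 'utf-8'):
           cols = line.split()
           if len(cols) < 2:
               continue
           consonant = u'' if cols[0] == u'-' else cols[0]
           for kana, vowel in zip(cols[1:], vowels):
               kana2romaji[kana] = consonant + vowel
       return kana2romaji

   def to_romaji(readings, kana2romaji):
       # 'readings' are katakana strings such as the jReads() output shown earlier;
       # digraphs like キョ are ignored in this character-by-character sketch
       return [u''.join(kana2romaji.get(ch, ch) for ch in reading) for reading in readings]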
175 | Longest Common String Japanese ``jProcessing.py``
176 | --------------------------------------------------
177 |
178 | On English Strings ::
179 |
180 |    >>> from jNlp.jProcessing import long_substr
181 |    >>> a = 'Once upon a time in Italy'
182 |    >>> b = 'Thre was a time in America'
183 |    >>> print long_substr(a, b)
184 |
185 | Output ::
186 |
187 |    ...a time in
188 |
189 | On Japanese Strings ::
190 |
191 |    >>> a = u'これでアナタも冷え知らず'
192 |    >>> b = u'これでア冷え知らずナタも'
193 |    >>> print long_substr(a, b).encode('utf-8')
194 |
195 | Output ::
196 |
197 |    ...冷え知らず
198 |
199 | Similarity between two sentences ``jProcessing.py``
200 | ----------------------------------------------------
201 | Uses MinHash to estimate the token overlap (Jaccard similarity) between the two sentences, see http://en.wikipedia.org/wiki/MinHash. A rough sketch of the idea follows this section.
202 |
203 | :English Strings:
204 |
205 | >>> from jNlp.jProcessing import Similarities
206 | >>> s = Similarities()
207 | >>> a = 'There was'
208 | >>> b = 'There is'
209 | >>> print s.minhash(a,b)
210 | ...0.444444444444
211 |
212 | :Japanese Strings:
213 |
214 | >>> from jNlp.jProcessing import *
215 | >>> a = u'これは何ですか?'
216 | >>> b = u'これはわからないです'
217 | >>> print s.minhash(' '.join(jTokenize(a)), ' '.join(jTokenize(b)))
218 | ...0.210526315789
219 |
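The scores above can be read as estimates of the Jaccard overlap between the two token sets. The block below is a minimal, self-contained sketch of the MinHash idea only; it is not the actual ``Similarities.minhash`` implementation, and the function name, the seeded-hash scheme and the number of hash functions are assumptions.

.. code-block:: python

   # Illustration of MinHash-style set similarity (not jNlp's own code).
   import hashlib

   def minhash_similarity(tokens_a, tokens_b, num_hashes=100):
       """Approximate the Jaccard similarity of two token sets."""
       set_a, set_b = set(tokens_a), set(tokens_b)
       if not set_a or not set_b:
           return 0.0
       def signature(tokens):
           sig = []
           for seed in range(num_hashes):
               # the smallest hash value under each seeded hash is one signature component
               sig.append(min(int(hashlib.md5((u'%d:%s' % (seed, t)).encode('utf-8')).hexdigest(), 16)
                              for t in tokens))
           return sig
       matches = sum(1.0 for x, y in zip(signature(set_a), signature(set_b)) if x == y)
       # the fraction of matching components estimates the Jaccard overlap
       return matches / num_hashes

   # e.g. minhash_similarity(jTokenize(a), jTokenize(b)) for the Japanese sentences above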
220 | Edict Japanese Dictionary Search with Example sentences
221 | ========================================================
222 |
223 | Sample Output Demo
224 | ------------------
225 |
226 | .. raw:: html
227 |
228 |
244 |
245 |
246 |
247 | Edict dictionary and example sentences parser.
248 | ----------------------------------------------
249 |
250 | This package uses the EDICT_ and KANJIDIC_ dictionary files.
251 | These files are the property of the
252 | Electronic Dictionary Research and Development Group_ , and
253 | are used in conformance with the Group's licence_ .
254 |
255 | .. _EDICT: http://www.csse.monash.edu.au/~jwb/edict.html
256 | .. _KANJIDIC: http://www.csse.monash.edu.au/~jwb/kanjidic.html
257 | .. _Group: http://www.edrdg.org/
258 | .. _licence: http://www.edrdg.org/edrdg/licence.html
259 |
260 | Edict Parser by **Paul Goins**, see ``edict_search.py``
261 | Edict Example sentences Parser by query, **Pulkit Kathuria**, see ``edict_examples.py``
262 | Edict examples pickle files are provided but the latest example files can be downloaded from the links provided.
263 |
264 | Charset
265 | -------
266 | Two files:
267 |
268 | - ``utf8`` Charset example file if not using ``src/jNlp/data/edict_examples``
269 |
270 |   To convert ``EUCJP/ISO-8859-1`` to ``utf8`` ::
271 |
272 |      iconv -f EUCJP -t UTF-8 path/to/edict_examples > path/to/save_with_utf-8
273 |
274 | - ``ISO-8859-1`` edict_dictionary file
275 |
276 | Outputs example sentences for a query in Japanese only for ambiguous words.
277 |
278 |
279 | Links
280 | -----
281 |
282 | **Latest** Dictionary files can be downloaded here_
283 |
284 | .. _here: http://www.csse.monash.edu.au/~jwb/edict.html
285 |
286 | ``edict_search.py``
287 | -------------------
288 | :author: Paul Goins, `License included`, see linkToOriginal_
289 |
290 | .. _linkToOriginal: http://repo.or.cz/w/jbparse.git/blame/8e42831ca5f721c0320b27d7d83cb553d6e9c68f:/jbparse/edict.py
291 |
292 | For all entries and their sense definitions:
293 |
294 | >>> from jNlp.edict_search import *
295 | >>> query = u'認める'
296 | >>> edict_path = 'src/jNlp/data/edict-yy-mm-dd'
297 | >>> kp = Parser(edict_path)
298 | >>> for i, entry in enumerate(kp.search(query)):
299 | ...     print entry.to_string().encode('utf-8')
300 |
301 |
302 | ``edict_examples.py``
303 | ---------------------
304 | :`Note`: Only outputs example sentences for ambiguous words (words with more than one sense)
305 |
306 | :author: Pulkit Kathuria
307 |
308 | >>> from jNlp.edict_examples import *
309 | >>> query = u'認める'
310 | >>> edict_path = 'src/jNlp/data/edict-yy-mm-dd'
311 | >>> edict_examples_path = 'src/jNlp/data/edict_examples'
312 | >>> search_with_example(edict_path, edict_examples_path, query)
313 |
314 | Output ::
315 |
316 |    認める
317 |
318 |    Sense (1) to recognize;
319 |    EX:01 我々は彼の才能を*認*めている。We appreciate his talent.
320 |
321 |    Sense (2) to observe;
322 |    EX:01 x線写真で異状が*認*められます。We have detected an abnormality on your x-ray.
323 |
324 |    Sense (3) to admit;
325 |    EX:01 母は私の計画をよいと*認*めた。Mother approved my plan.
326 |    EX:02 母は決して私の結婚を*認*めないだろう。Mother will never approve of my marriage.
327 |    EX:03 父は決して私の結婚を*認*めないだろう。Father will never approve of my marriage.
328 |    EX:04 彼は女性の喫煙をいいものだと*認*めない。He doesn't approve of women smoking.
329 |    ...
330 |
331 | Sentiment Analysis of Japanese Text
332 | ===================================
333 |
334 | This section covers Sentiment Analysis on Japanese text using Word Sense Disambiguation, Wordnet-jp_ (Japanese WordNet, file name ``wnjpn-all.tab``) and SentiWordnet_ (English SentiWordNet, file name ``SentiWordNet_3.*.txt``).
335 |
336 | .. _Wordnet-jp: http://nlpwww.nict.go.jp/wn-ja/eng/downloads.html
337 | .. _SentiWordnet: http://sentiwordnet.isti.cnr.it/
338 |
339 | Wordnet files download links
340 | ----------------------------
341 |
342 | 1. http://nlpwww.nict.go.jp/wn-ja/eng/downloads.html
343 | 2. http://sentiwordnet.isti.cnr.it/
344 |
345 | How to Use
346 | ----------
347 |
348 | The following classifier is a baseline: it maps Japanese words to English senses via WordNet and classifies on the polarity scores from SentiWordNet. A sketch of one possible implementation follows the example below.
349 |
350 | - (Adnouns, nouns, verbs, ... all included)
351 | - No WSD module on the Japanese sentence
352 | - Uses a word's most common sense for its polarity score
353 |
354 | >>> from jNlp.jSentiments import *
355 | >>> jp_wn = '../../../../data/wnjpn-all.tab'
356 | >>> en_swn = '../../../../data/SentiWordNet_3.0.0_20100908.txt'
357 | >>> classifier = Sentiment()
358 | >>> classifier.train(en_swn, jp_wn)
359 | >>> text = u'監督、俳優、ストーリー、演出、全部最高!'
360 | >>> print classifier.baseline(text)
361 | ...Pos Score = 0.625 Neg Score = 0.125
362 | ...Text is Positive
363 |
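``classifier.baseline(text)`` itself is not spelled out in this document, so the following is only a sketch of how such a baseline could be assembled from the pieces that are shown (``jTokenize`` and the ``sentiwordnet`` / ``jpwordnet`` mappings returned by ``train``, where ``sentiwordnet[jpwordnet[word]]`` holds the positive and negative scores). The helper name ``baseline_polarity`` and the score aggregation are assumptions, not the actual ``jSentiments`` code.

.. code-block:: python

   # -*- coding: utf-8 -*-
   # Hypothetical sketch; the real classifier.baseline() may aggregate differently.
   from jNlp.jSentiments import *
   from jNlp.jTokenize import jTokenize

   def baseline_polarity(text, sentiwordnet, jpwordnet):
       """Sum the polarity scores of every token that has a WordNet mapping."""
       pos_score, neg_score = 0.0, 0.0
       for token in jTokenize(text):
           if token in jpwordnet:                              # Japanese word -> English sense key
               pos_score += sentiwordnet[jpwordnet[token]][0]  # positive score
               neg_score += sentiwordnet[jpwordnet[token]][1]  # negative score
       return 'Positive' if pos_score >= neg_score else 'Negative'

   jp_wn = '../../../../data/wnjpn-all.tab'
   en_swn = '../../../../data/SentiWordNet_3.0.0_20100908.txt'
   classifier = Sentiment()
   sentiwordnet, jpwordnet = classifier.train(en_swn, jp_wn)
   print baseline_polarity(u'監督、俳優、ストーリー、演出、全部最高!', sentiwordnet, jpwordnet)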
364 | Japanese Word Polarity Score
365 | ----------------------------
366 |
367 | >>> from jNlp.jSentiments import *
368 | >>> jp_wn = '_dicts/wnjpn-all.tab' #path to Japanese Word Net
369 | >>> en_swn = '_dicts/SentiWordNet_3.0.0_20100908.txt' #Path to SentiWordNet
370 | >>> classifier = Sentiment()
371 | >>> sentiwordnet, jpwordnet = classifier.train(en_swn, jp_wn)
372 | >>> positive_score = sentiwordnet[jpwordnet[u'全部']][0]
373 | >>> negative_score = sentiwordnet[jpwordnet[u'全部']][1]
374 | >>> print 'pos score = {0}, neg score = {1}'.format(positive_score, negative_score)
375 | ...pos score = 0.625, neg score = 0.0
376 |
377 |
378 | Contacts
379 | ========
380 |
381 | :Author: `pulkit[at]jaist.ac.jp` [change ``at`` with ``@``]
382 |
383 |
384 | .. include:: disqus_jnlp.html.rst
385 |
386 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 |
10 | # Internal variables.
11 | PAPEROPT_a4 = -D latex_paper_size=a4
12 | PAPEROPT_letter = -D latex_paper_size=letter
13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
14 | # the i18n builder cannot share the environment and doctrees with the others
15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
16 |
17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
18 |
19 | help:
20 | @echo "Please use \`make ' where is one of"
21 | @echo " html to make standalone HTML files"
22 | @echo " dirhtml to make HTML files named index.html in directories"
23 | @echo " singlehtml to make a single large HTML file"
24 | @echo " pickle to make pickle files"
25 | @echo " json to make JSON files"
26 | @echo " htmlhelp to make HTML files and a HTML help project"
27 | @echo " qthelp to make HTML files and a qthelp project"
28 | @echo " devhelp to make HTML files and a Devhelp project"
29 | @echo " epub to make an epub"
30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
31 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
32 | @echo " text to make text files"
33 | @echo " man to make manual pages"
34 | @echo " texinfo to make Texinfo files"
35 | @echo " info to make Texinfo files and run them through makeinfo"
36 | @echo " gettext to make PO message catalogs"
37 | @echo " changes to make an overview of all changed/added/deprecated items"
38 | @echo " linkcheck to check all external links for integrity"
39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
40 |
41 | clean:
42 | -rm -rf $(BUILDDIR)/*
43 |
44 | html:
45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
46 | @echo
47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
48 |
49 | dirhtml:
50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
51 | @echo
52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
53 |
54 | singlehtml:
55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
56 | @echo
57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/jProcessing.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/jProcessing.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/jProcessing" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/jProcessing" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 
149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/_build/doctrees/disqus_jnlp.html.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/doctrees/disqus_jnlp.html.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/_build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/doctrees/index.doctree -------------------------------------------------------------------------------- /docs/_build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: c5db09488a4b62fff5c534b5b975854f 4 | tags: fbb0d17656682115ca4d033fb2f83ba1 5 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/disqus_jnlp.html.txt: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 |
4 | 15 | 16 | blog comments powered by Disqus 17 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/index.txt: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 | 4 | 5 | 6 | 7 | .. raw:: html 8 | 9 |
Back to Home 10 | 11 | ==================== 12 | Japanese NLP Library 13 | ==================== 14 | 15 | 16 | .. sectnum:: 17 | .. contents:: 18 | 19 | Requirements 20 | ============ 21 | 22 | - Third Party Dependencies 23 | 24 | - Cabocha Japanese Morphological parser http://sourceforge.net/projects/cabocha/ 25 | 26 | - Python Dependencies 27 | 28 | - ``Python 2.6.*`` or above 29 | 30 | 31 | ``Links`` 32 | --------- 33 | 34 | - All code at jProcessing Repo GitHub_ 35 | 36 | .. _GitHub: https://github.com/kevincobain2000/jProcessing 37 | 38 | - Documentation_ and HomePage_ and Sphinx_ 39 | 40 | .. _Documentation: http://www.jaist.ac.jp/~s1010205/jnlp 41 | 42 | .. _HomePage: http://www.jaist.ac.jp/~s1010205/ 43 | 44 | .. _Sphinx: http://readthedocs.org/docs/jprocessing/en/latest/ 45 | 46 | 47 | - PyPi_ Python Package 48 | 49 | .. _PyPi: http://pypi.python.org/pypi/jProcessing/0.1 50 | 51 | :: 52 | 53 | clone git@github.com:kevincobain2000/jProcessing.git 54 | 55 | 56 | ``Install`` 57 | ----------- 58 | 59 | In ``Terminal`` :: 60 | 61 | bash$ python setup.py install 62 | 63 | History 64 | ------- 65 | 66 | - ``0.2`` 67 | 68 | + Sentiment Analysis of Japanese Text 69 | 70 | - ``0.1`` 71 | + Morphologically Tokenize Japanese Sentence 72 | + Kanji / Hiragana / Katakana to Romaji Converter 73 | + Edict Dictionary Search - borrowed 74 | + Edict Examples Search - incomplete 75 | + Sentence Similarity between two JP Sentences 76 | + Run Cabocha(ISO--8859-1 configured) in Python. 77 | + Longest Common String between Sentences 78 | + Kanji to Katakana Pronunciation 79 | + Hiragana, Katakana Chart Parser 80 | 81 | Libraries and Modules 82 | ===================== 83 | 84 | Tokenize ``jTokenize.py`` 85 | ------------------------- 86 | In ``Python`` :: 87 | 88 | >>> from jNlp.jTokenize import jTokenize 89 | >>> input_sentence = u'私は彼を5日前、つまりこの前の金曜日に駅で見かけた' 90 | >>> list_of_tokens = jTokenize(input_sentence) 91 | >>> print list_of_tokens 92 | >>> print '--'.join(list_of_tokens).encode('utf-8') 93 | 94 | Returns: 95 | 96 | :: 97 | 98 | ... [u'\u79c1', u'\u306f', u'\u5f7c', u'\u3092', u'\uff15'...] 99 | ... 私--は--彼--を--5--日--前--、--つまり--この--前--の--金曜日--に--駅--で--見かけ--た 100 | 101 | Katakana Pronunciation: 102 | 103 | :: 104 | 105 | >>> print '--'.join(jReads(input_sentence)).encode('utf-8') 106 | ... ワタシ--ハ--カレ--ヲ--ゴ--ニチ--マエ--、--ツマリ--コノ--マエ--ノ--キンヨウビ--ニ--エキ--デ--ミカケ--タ 107 | 108 | 109 | Cabocha ``jCabocha.py`` 110 | ----------------------- 111 | 112 | Run Cabocha_ with original ``EUCJP`` or ``IS0-8859-1`` configured encoding, with ``utf8`` python 113 | 114 | .. _Cabocha: http://code.google.com/p/cabocha/ 115 | 116 | - If cobocha is configured as ``utf8`` then see this http://nltk.googlecode.com/svn/trunk/doc/book-jp/ch12.html#cabocha 117 | 118 | .. code-block:: python 119 | 120 | >>> from jNlp.jCabocha import cabocha 121 | >>> print cabocha(input_sentence).encode('utf-8') 122 | 123 | Output: 124 | 125 | .. code-block:: xml 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | Kanji / Katakana /Hiragana to Tokenized Romaji ``jConvert.py`` 146 | -------------------------------------------------------------- 147 | 148 | Uses ``data/katakanaChart.txt`` and parses the chart. See katakanaChart_. 149 | 150 | .. code-block:: python 151 | 152 | >>> from jNlp.jConvert import * 153 | >>> input_sentence = u'気象庁が21日午前4時48分、発表した天気概況によると、' 154 | >>> print ' '.join(tokenizedRomaji(input_sentence)) 155 | >>> print tokenizedRomaji(input_sentence) 156 | 157 | .. 
code-block:: python 158 | 159 | ...kisyoutyou ga ni ichi nichi gozen yon ji yon hachi hun hapyou si ta tenki gaikyou ni yoru to 160 | ...[u'kisyoutyou', u'ga', u'ni', u'ichi', u'nichi', u'gozen',...] 161 | 162 | 163 | **katakanaChart.txt** 164 | 165 | 166 | .. _katakanaChart: 167 | 168 | - katakanaChartFile_ and hiraganaChartFile_ 169 | 170 | .. _katakanaChartFile: https://raw.github.com/kevincobain2000/jProcessing/master/src/jNlp/data/katakanaChart.txt 171 | 172 | .. _hiraganaChartFile: https://raw.github.com/kevincobain2000/jProcessing/master/src/jNlp/data/hiraganaChart.txt 173 | 174 | 175 | Longest Common String Japanese ``jProcessing.py`` 176 | ------------------------------------------------- 177 | 178 | On English Strings :: 179 | 180 | >>> from jNlp.jProcessing import long_substr 181 | >>> a = 'Once upon a time in Italy' 182 | >>> b = 'Thre was a time in America' 183 | >>> print long_substr(a, b) 184 | 185 | Output :: 186 | 187 | ...a time in 188 | 189 | On Japanese Strings :: 190 | 191 | >>> a = u'これでアナタも冷え知らず' 192 | >>> b = u'これでア冷え知らずナタも' 193 | >>> print long_substr(a, b).encode('utf-8') 194 | 195 | Output :: 196 | 197 | ...冷え知らず 198 | 199 | Similarity between two sentences ``jProcessing.py`` 200 | --------------------------------------------------- 201 | Uses MinHash by checking the overlap http://en.wikipedia.org/wiki/MinHash 202 | 203 | :English Strings: 204 | 205 | >>> from jNlp.jProcessing import Similarities 206 | >>> s = Similarities() 207 | >>> a = 'There was' 208 | >>> b = 'There is' 209 | >>> print s.minhash(a,b) 210 | ...0.444444444444 211 | 212 | :Japanese Strings: 213 | 214 | >>> from jNlp.jProcessing import * 215 | >>> a = u'これは何ですか?' 216 | >>> b = u'これはわからないです' 217 | >>> print s.minhash(' '.join(jTokenize(a)), ' '.join(jTokenize(b))) 218 | ...0.210526315789 219 | 220 | Edict Japanese Dictionary Search with Example sentences 221 | ======================================================= 222 | 223 | Sample Ouput Demo 224 | ----------------- 225 | 226 | .. raw:: html 227 | 228 | 244 | 245 | 246 | 247 | Edict dictionary and example sentences parser. 248 | ---------------------------------------------- 249 | 250 | This package uses the EDICT_ and KANJIDIC_ dictionary files. 251 | These files are the property of the 252 | Electronic Dictionary Research and Development Group_ , and 253 | are used in conformance with the Group's licence_ . 254 | 255 | .. _EDICT: http://www.csse.monash.edu.au/~jwb/edict.html 256 | .. _KANJIDIC: http://www.csse.monash.edu.au/~jwb/kanjidic.html 257 | .. _Group: http://www.edrdg.org/ 258 | .. _licence: http://www.edrdg.org/edrdg/licence.html 259 | 260 | Edict Parser By **Paul Goins**, see ``edict_search.py`` 261 | Edict Example sentences Parse by query, **Pulkit Kathuria**, see ``edict_examples.py`` 262 | Edict examples pickle files are provided but latest example files can be downloaded from the links provided. 263 | 264 | Charset 265 | ------- 266 | Two files 267 | 268 | - ``utf8`` Charset example file if not using ``src/jNlp/data/edict_examples`` 269 | 270 | To convert ``EUCJP/ISO-8859-1`` to ``utf8`` :: 271 | 272 | iconv -f EUCJP -t UTF-8 path/to/edict_examples > path/to/save_with_utf-8 273 | 274 | - ``ISO-8859-1`` edict_dictionary file 275 | 276 | Outputs example sentences for a query in Japanese only for ambiguous words. 277 | 278 | 279 | Links 280 | ----- 281 | 282 | **Latest** Dictionary files can be downloaded here_ 283 | 284 | .. 
_here: http://www.csse.monash.edu.au/~jwb/edict.html 285 | 286 | ``edict_search.py`` 287 | ------------------- 288 | :author: Paul Goins `License included` linkToOriginal_: 289 | 290 | .. _linkToOriginal: http://repo.or.cz/w/jbparse.git/blame/8e42831ca5f721c0320b27d7d83cb553d6e9c68f:/jbparse/edict.py 291 | 292 | For all entries of sense definitions 293 | 294 | >>> from jNlp.edict_search import * 295 | >>> query = u'認める' 296 | >>> edict_path = 'src/jNlp/data/edict-yy-mm-dd' 297 | >>> kp = Parser(edict_path) 298 | >>> for i, entry in enumerate(kp.search(query)): 299 | ... print entry.to_string().encode('utf-8') 300 | 301 | 302 | ``edict_examples.py`` 303 | --------------------- 304 | :`Note`: Only outputs the examples sentences for ambiguous words (if word has one or more senses) 305 | 306 | :author: Pulkit Kathuria 307 | 308 | >>> from jNlp.edict_examples import * 309 | >>> query = u'認める' 310 | >>> edict_path = 'src/jNlp/data/edict-yy-mm-dd' 311 | >>> edict_examples_path = 'src/jNlp/data/edict_examples' 312 | >>> search_with_example(edict_path, edict_examples_path, query) 313 | 314 | Output :: 315 | 316 | 認める 317 | 318 | Sense (1) to recognize; 319 | EX:01 我々は彼の才能を*認*めている。We appreciate his talent. 320 | 321 | Sense (2) to observe; 322 | EX:01 x線写真で異状が*認*められます。We have detected an abnormality on your x-ray. 323 | 324 | Sense (3) to admit; 325 | EX:01 母は私の計画をよいと*認*めた。Mother approved my plan. 326 | EX:02 母は決して私の結婚を*認*めないだろう。Mother will never approve of my marriage. 327 | EX:03 父は決して私の結婚を*認*めないだろう。Father will never approve of my marriage. 328 | EX:04 彼は女性の喫煙をいいものだと*認*めない。He doesn't approve of women smoking. 329 | ... 330 | 331 | Sentiment Analysis Japanese Text 332 | ================================ 333 | 334 | This section covers (1) Sentiment Analysis on Japanese text using Word Sense Disambiguation, Wordnet-jp_ (Japanese Word Net file name ``wnjpn-all.tab``), SentiWordnet_ (English SentiWordNet file name ``SentiWordNet_3.*.txt``). 335 | 336 | .. _Wordnet-jp: http://nlpwww.nict.go.jp/wn-ja/eng/downloads.html 337 | .. _SentiWordnet: http://sentiwordnet.isti.cnr.it/ 338 | 339 | Wordnet files download links 340 | ---------------------------- 341 | 342 | 1. http://nlpwww.nict.go.jp/wn-ja/eng/downloads.html 343 | 2. http://sentiwordnet.isti.cnr.it/ 344 | 345 | How to Use 346 | ---------- 347 | 348 | The following classifier is baseline, which works as simple mapping of Eng to Japanese using Wordnet and classify on polarity score using SentiWordnet. 349 | 350 | - (Adnouns, nouns, verbs, .. all included) 351 | - No WSD module on Japanese Sentence 352 | - Uses word as its common sense for polarity score 353 | 354 | >>> from jNlp.jSentiments import * 355 | >>> jp_wn = '../../../../data/wnjpn-all.tab' 356 | >>> en_swn = '../../../../data/SentiWordNet_3.0.0_20100908.txt' 357 | >>> classifier = Sentiment() 358 | >>> classifier.train(en_swn, jp_wn) 359 | >>> text = u'監督、俳優、ストーリー、演出、全部最高!' 
360 | >>> print classifier.baseline(text) 361 | ...Pos Score = 0.625 Neg Score = 0.125 362 | ...Text is Positive 363 | 364 | Japanese Word Polarity Score 365 | ---------------------------- 366 | 367 | >>> from jNlp.jSentiments import * 368 | >>> jp_wn = '_dicts/wnjpn-all.tab' #path to Japanese Word Net 369 | >>> en_swn = '_dicts/SentiWordNet_3.0.0_20100908.txt' #Path to SentiWordNet 370 | >>> classifier = Sentiment() 371 | >>> sentiwordnet, jpwordnet = classifier.train(en_swn, jp_wn) 372 | >>> positive_score = sentiwordnet[jpwordnet[u'全部']][0] 373 | >>> negative_score = sentiwordnet[jpwordnet[u'全部']][1] 374 | >>> print 'pos score = {0}, neg score = {1}'.format(positive_score, negative_score) 375 | ...pos score = 0.625, neg score = 0.0 376 | 377 | 378 | Contacts 379 | ======== 380 | 381 | :Author: `pulkit[at]jaist.ac.jp` [change ``at`` with ``@``] 382 | 383 | 384 | .. include:: disqus_jnlp.html.rst 385 | 386 | -------------------------------------------------------------------------------- /docs/_build/html/_static/ajax-loader.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/ajax-loader.gif -------------------------------------------------------------------------------- /docs/_build/html/_static/basic.css: -------------------------------------------------------------------------------- 1 | /* 2 | * basic.css 3 | * ~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- basic theme. 6 | * 7 | * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | 12 | /* -- main layout ----------------------------------------------------------- */ 13 | 14 | div.clearer { 15 | clear: both; 16 | } 17 | 18 | /* -- relbar ---------------------------------------------------------------- */ 19 | 20 | div.related { 21 | width: 100%; 22 | font-size: 90%; 23 | } 24 | 25 | div.related h3 { 26 | display: none; 27 | } 28 | 29 | div.related ul { 30 | margin: 0; 31 | padding: 0 0 0 10px; 32 | list-style: none; 33 | } 34 | 35 | div.related li { 36 | display: inline; 37 | } 38 | 39 | div.related li.right { 40 | float: right; 41 | margin-right: 5px; 42 | } 43 | 44 | /* -- sidebar --------------------------------------------------------------- */ 45 | 46 | div.sphinxsidebarwrapper { 47 | padding: 10px 5px 0 10px; 48 | } 49 | 50 | div.sphinxsidebar { 51 | float: left; 52 | width: 230px; 53 | margin-left: -100%; 54 | font-size: 90%; 55 | } 56 | 57 | div.sphinxsidebar ul { 58 | list-style: none; 59 | } 60 | 61 | div.sphinxsidebar ul ul, 62 | div.sphinxsidebar ul.want-points { 63 | margin-left: 20px; 64 | list-style: square; 65 | } 66 | 67 | div.sphinxsidebar ul ul { 68 | margin-top: 0; 69 | margin-bottom: 0; 70 | } 71 | 72 | div.sphinxsidebar form { 73 | margin-top: 10px; 74 | } 75 | 76 | div.sphinxsidebar input { 77 | border: 1px solid #98dbcc; 78 | font-family: sans-serif; 79 | font-size: 1em; 80 | } 81 | 82 | div.sphinxsidebar input[type="text"] { 83 | width: 170px; 84 | } 85 | 86 | div.sphinxsidebar input[type="submit"] { 87 | width: 30px; 88 | } 89 | 90 | img { 91 | border: 0; 92 | } 93 | 94 | /* -- search page ----------------------------------------------------------- */ 95 | 96 | ul.search { 97 | margin: 10px 0 0 20px; 98 | padding: 0; 99 | } 100 | 101 | ul.search li { 102 | padding: 5px 0 5px 20px; 103 | background-image: url(file.png); 104 | background-repeat: no-repeat; 105 | 
background-position: 0 7px; 106 | } 107 | 108 | ul.search li a { 109 | font-weight: bold; 110 | } 111 | 112 | ul.search li div.context { 113 | color: #888; 114 | margin: 2px 0 0 30px; 115 | text-align: left; 116 | } 117 | 118 | ul.keywordmatches li.goodmatch a { 119 | font-weight: bold; 120 | } 121 | 122 | /* -- index page ------------------------------------------------------------ */ 123 | 124 | table.contentstable { 125 | width: 90%; 126 | } 127 | 128 | table.contentstable p.biglink { 129 | line-height: 150%; 130 | } 131 | 132 | a.biglink { 133 | font-size: 1.3em; 134 | } 135 | 136 | span.linkdescr { 137 | font-style: italic; 138 | padding-top: 5px; 139 | font-size: 90%; 140 | } 141 | 142 | /* -- general index --------------------------------------------------------- */ 143 | 144 | table.indextable { 145 | width: 100%; 146 | } 147 | 148 | table.indextable td { 149 | text-align: left; 150 | vertical-align: top; 151 | } 152 | 153 | table.indextable dl, table.indextable dd { 154 | margin-top: 0; 155 | margin-bottom: 0; 156 | } 157 | 158 | table.indextable tr.pcap { 159 | height: 10px; 160 | } 161 | 162 | table.indextable tr.cap { 163 | margin-top: 10px; 164 | background-color: #f2f2f2; 165 | } 166 | 167 | img.toggler { 168 | margin-right: 3px; 169 | margin-top: 3px; 170 | cursor: pointer; 171 | } 172 | 173 | div.modindex-jumpbox { 174 | border-top: 1px solid #ddd; 175 | border-bottom: 1px solid #ddd; 176 | margin: 1em 0 1em 0; 177 | padding: 0.4em; 178 | } 179 | 180 | div.genindex-jumpbox { 181 | border-top: 1px solid #ddd; 182 | border-bottom: 1px solid #ddd; 183 | margin: 1em 0 1em 0; 184 | padding: 0.4em; 185 | } 186 | 187 | /* -- general body styles --------------------------------------------------- */ 188 | 189 | a.headerlink { 190 | visibility: hidden; 191 | } 192 | 193 | h1:hover > a.headerlink, 194 | h2:hover > a.headerlink, 195 | h3:hover > a.headerlink, 196 | h4:hover > a.headerlink, 197 | h5:hover > a.headerlink, 198 | h6:hover > a.headerlink, 199 | dt:hover > a.headerlink { 200 | visibility: visible; 201 | } 202 | 203 | div.body p.caption { 204 | text-align: inherit; 205 | } 206 | 207 | div.body td { 208 | text-align: left; 209 | } 210 | 211 | .field-list ul { 212 | padding-left: 1em; 213 | } 214 | 215 | .first { 216 | margin-top: 0 !important; 217 | } 218 | 219 | p.rubric { 220 | margin-top: 30px; 221 | font-weight: bold; 222 | } 223 | 224 | img.align-left, .figure.align-left, object.align-left { 225 | clear: left; 226 | float: left; 227 | margin-right: 1em; 228 | } 229 | 230 | img.align-right, .figure.align-right, object.align-right { 231 | clear: right; 232 | float: right; 233 | margin-left: 1em; 234 | } 235 | 236 | img.align-center, .figure.align-center, object.align-center { 237 | display: block; 238 | margin-left: auto; 239 | margin-right: auto; 240 | } 241 | 242 | .align-left { 243 | text-align: left; 244 | } 245 | 246 | .align-center { 247 | text-align: center; 248 | } 249 | 250 | .align-right { 251 | text-align: right; 252 | } 253 | 254 | /* -- sidebars -------------------------------------------------------------- */ 255 | 256 | div.sidebar { 257 | margin: 0 0 0.5em 1em; 258 | border: 1px solid #ddb; 259 | padding: 7px 7px 0 7px; 260 | background-color: #ffe; 261 | width: 40%; 262 | float: right; 263 | } 264 | 265 | p.sidebar-title { 266 | font-weight: bold; 267 | } 268 | 269 | /* -- topics ---------------------------------------------------------------- */ 270 | 271 | div.topic { 272 | border: 1px solid #ccc; 273 | padding: 7px 7px 0 7px; 274 | margin: 10px 0 10px 
0; 275 | } 276 | 277 | p.topic-title { 278 | font-size: 1.1em; 279 | font-weight: bold; 280 | margin-top: 10px; 281 | } 282 | 283 | /* -- admonitions ----------------------------------------------------------- */ 284 | 285 | div.admonition { 286 | margin-top: 10px; 287 | margin-bottom: 10px; 288 | padding: 7px; 289 | } 290 | 291 | div.admonition dt { 292 | font-weight: bold; 293 | } 294 | 295 | div.admonition dl { 296 | margin-bottom: 0; 297 | } 298 | 299 | p.admonition-title { 300 | margin: 0px 10px 5px 0px; 301 | font-weight: bold; 302 | } 303 | 304 | div.body p.centered { 305 | text-align: center; 306 | margin-top: 25px; 307 | } 308 | 309 | /* -- tables ---------------------------------------------------------------- */ 310 | 311 | table.docutils { 312 | border: 0; 313 | border-collapse: collapse; 314 | } 315 | 316 | table.docutils td, table.docutils th { 317 | padding: 1px 8px 1px 5px; 318 | border-top: 0; 319 | border-left: 0; 320 | border-right: 0; 321 | border-bottom: 1px solid #aaa; 322 | } 323 | 324 | table.field-list td, table.field-list th { 325 | border: 0 !important; 326 | } 327 | 328 | table.footnote td, table.footnote th { 329 | border: 0 !important; 330 | } 331 | 332 | th { 333 | text-align: left; 334 | padding-right: 5px; 335 | } 336 | 337 | table.citation { 338 | border-left: solid 1px gray; 339 | margin-left: 1px; 340 | } 341 | 342 | table.citation td { 343 | border-bottom: none; 344 | } 345 | 346 | /* -- other body styles ----------------------------------------------------- */ 347 | 348 | ol.arabic { 349 | list-style: decimal; 350 | } 351 | 352 | ol.loweralpha { 353 | list-style: lower-alpha; 354 | } 355 | 356 | ol.upperalpha { 357 | list-style: upper-alpha; 358 | } 359 | 360 | ol.lowerroman { 361 | list-style: lower-roman; 362 | } 363 | 364 | ol.upperroman { 365 | list-style: upper-roman; 366 | } 367 | 368 | dl { 369 | margin-bottom: 15px; 370 | } 371 | 372 | dd p { 373 | margin-top: 0px; 374 | } 375 | 376 | dd ul, dd table { 377 | margin-bottom: 10px; 378 | } 379 | 380 | dd { 381 | margin-top: 3px; 382 | margin-bottom: 10px; 383 | margin-left: 30px; 384 | } 385 | 386 | dt:target, .highlighted { 387 | background-color: #fbe54e; 388 | } 389 | 390 | dl.glossary dt { 391 | font-weight: bold; 392 | font-size: 1.1em; 393 | } 394 | 395 | .field-list ul { 396 | margin: 0; 397 | padding-left: 1em; 398 | } 399 | 400 | .field-list p { 401 | margin: 0; 402 | } 403 | 404 | .refcount { 405 | color: #060; 406 | } 407 | 408 | .optional { 409 | font-size: 1.3em; 410 | } 411 | 412 | .versionmodified { 413 | font-style: italic; 414 | } 415 | 416 | .system-message { 417 | background-color: #fda; 418 | padding: 5px; 419 | border: 3px solid red; 420 | } 421 | 422 | .footnote:target { 423 | background-color: #ffa; 424 | } 425 | 426 | .line-block { 427 | display: block; 428 | margin-top: 1em; 429 | margin-bottom: 1em; 430 | } 431 | 432 | .line-block .line-block { 433 | margin-top: 0; 434 | margin-bottom: 0; 435 | margin-left: 1.5em; 436 | } 437 | 438 | .guilabel, .menuselection { 439 | font-family: sans-serif; 440 | } 441 | 442 | .accelerator { 443 | text-decoration: underline; 444 | } 445 | 446 | .classifier { 447 | font-style: oblique; 448 | } 449 | 450 | abbr, acronym { 451 | border-bottom: dotted 1px; 452 | cursor: help; 453 | } 454 | 455 | /* -- code displays --------------------------------------------------------- */ 456 | 457 | pre { 458 | overflow: auto; 459 | overflow-y: hidden; /* fixes display issues on Chrome browsers */ 460 | } 461 | 462 | td.linenos pre { 463 | padding: 5px 
0px; 464 | border: 0; 465 | background-color: transparent; 466 | color: #aaa; 467 | } 468 | 469 | table.highlighttable { 470 | margin-left: 0.5em; 471 | } 472 | 473 | table.highlighttable td { 474 | padding: 0 0.5em 0 0.5em; 475 | } 476 | 477 | tt.descname { 478 | background-color: transparent; 479 | font-weight: bold; 480 | font-size: 1.2em; 481 | } 482 | 483 | tt.descclassname { 484 | background-color: transparent; 485 | } 486 | 487 | tt.xref, a tt { 488 | background-color: transparent; 489 | font-weight: bold; 490 | } 491 | 492 | h1 tt, h2 tt, h3 tt, h4 tt, h5 tt, h6 tt { 493 | background-color: transparent; 494 | } 495 | 496 | .viewcode-link { 497 | float: right; 498 | } 499 | 500 | .viewcode-back { 501 | float: right; 502 | font-family: sans-serif; 503 | } 504 | 505 | div.viewcode-block:target { 506 | margin: -1px -10px; 507 | padding: 0 10px; 508 | } 509 | 510 | /* -- math display ---------------------------------------------------------- */ 511 | 512 | img.math { 513 | vertical-align: middle; 514 | } 515 | 516 | div.body div.math p { 517 | text-align: center; 518 | } 519 | 520 | span.eqno { 521 | float: right; 522 | } 523 | 524 | /* -- printout stylesheet --------------------------------------------------- */ 525 | 526 | @media print { 527 | div.document, 528 | div.documentwrapper, 529 | div.bodywrapper { 530 | margin: 0 !important; 531 | width: 100%; 532 | } 533 | 534 | div.sphinxsidebar, 535 | div.related, 536 | div.footer, 537 | #top-link { 538 | display: none; 539 | } 540 | } -------------------------------------------------------------------------------- /docs/_build/html/_static/comment-bright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/comment-bright.png -------------------------------------------------------------------------------- /docs/_build/html/_static/comment-close.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/comment-close.png -------------------------------------------------------------------------------- /docs/_build/html/_static/comment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/comment.png -------------------------------------------------------------------------------- /docs/_build/html/_static/default.css: -------------------------------------------------------------------------------- 1 | /* 2 | * default.css_t 3 | * ~~~~~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- default theme. 6 | * 7 | * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 
9 | * 10 | */ 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: sans-serif; 18 | font-size: 100%; 19 | background-color: #11303d; 20 | color: #000; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.document { 26 | background-color: #1c4e63; 27 | } 28 | 29 | div.documentwrapper { 30 | float: left; 31 | width: 100%; 32 | } 33 | 34 | div.bodywrapper { 35 | margin: 0 0 0 230px; 36 | } 37 | 38 | div.body { 39 | background-color: #ffffff; 40 | color: #000000; 41 | padding: 0 20px 30px 20px; 42 | } 43 | 44 | div.footer { 45 | color: #ffffff; 46 | width: 100%; 47 | padding: 9px 0 9px 0; 48 | text-align: center; 49 | font-size: 75%; 50 | } 51 | 52 | div.footer a { 53 | color: #ffffff; 54 | text-decoration: underline; 55 | } 56 | 57 | div.related { 58 | background-color: #133f52; 59 | line-height: 30px; 60 | color: #ffffff; 61 | } 62 | 63 | div.related a { 64 | color: #ffffff; 65 | } 66 | 67 | div.sphinxsidebar { 68 | } 69 | 70 | div.sphinxsidebar h3 { 71 | font-family: 'Trebuchet MS', sans-serif; 72 | color: #ffffff; 73 | font-size: 1.4em; 74 | font-weight: normal; 75 | margin: 0; 76 | padding: 0; 77 | } 78 | 79 | div.sphinxsidebar h3 a { 80 | color: #ffffff; 81 | } 82 | 83 | div.sphinxsidebar h4 { 84 | font-family: 'Trebuchet MS', sans-serif; 85 | color: #ffffff; 86 | font-size: 1.3em; 87 | font-weight: normal; 88 | margin: 5px 0 0 0; 89 | padding: 0; 90 | } 91 | 92 | div.sphinxsidebar p { 93 | color: #ffffff; 94 | } 95 | 96 | div.sphinxsidebar p.topless { 97 | margin: 5px 10px 10px 10px; 98 | } 99 | 100 | div.sphinxsidebar ul { 101 | margin: 10px; 102 | padding: 0; 103 | color: #ffffff; 104 | } 105 | 106 | div.sphinxsidebar a { 107 | color: #98dbcc; 108 | } 109 | 110 | div.sphinxsidebar input { 111 | border: 1px solid #98dbcc; 112 | font-family: sans-serif; 113 | font-size: 1em; 114 | } 115 | 116 | 117 | 118 | /* -- hyperlink styles ------------------------------------------------------ */ 119 | 120 | a { 121 | color: #355f7c; 122 | text-decoration: none; 123 | } 124 | 125 | a:visited { 126 | color: #355f7c; 127 | text-decoration: none; 128 | } 129 | 130 | a:hover { 131 | text-decoration: underline; 132 | } 133 | 134 | 135 | 136 | /* -- body styles ----------------------------------------------------------- */ 137 | 138 | div.body h1, 139 | div.body h2, 140 | div.body h3, 141 | div.body h4, 142 | div.body h5, 143 | div.body h6 { 144 | font-family: 'Trebuchet MS', sans-serif; 145 | background-color: #f2f2f2; 146 | font-weight: normal; 147 | color: #20435c; 148 | border-bottom: 1px solid #ccc; 149 | margin: 20px -20px 10px -20px; 150 | padding: 3px 0 3px 10px; 151 | } 152 | 153 | div.body h1 { margin-top: 0; font-size: 200%; } 154 | div.body h2 { font-size: 160%; } 155 | div.body h3 { font-size: 140%; } 156 | div.body h4 { font-size: 120%; } 157 | div.body h5 { font-size: 110%; } 158 | div.body h6 { font-size: 100%; } 159 | 160 | a.headerlink { 161 | color: #c60f0f; 162 | font-size: 0.8em; 163 | padding: 0 4px 0 4px; 164 | text-decoration: none; 165 | } 166 | 167 | a.headerlink:hover { 168 | background-color: #c60f0f; 169 | color: white; 170 | } 171 | 172 | div.body p, div.body dd, div.body li { 173 | text-align: justify; 174 | line-height: 130%; 175 | } 176 | 177 | div.admonition p.admonition-title + p { 178 | display: inline; 179 | } 180 | 181 | div.admonition p { 182 | margin-bottom: 5px; 183 | } 184 | 185 | div.admonition pre { 186 | margin-bottom: 5px; 187 | } 188 | 189 | 
div.admonition ul, div.admonition ol { 190 | margin-bottom: 5px; 191 | } 192 | 193 | div.note { 194 | background-color: #eee; 195 | border: 1px solid #ccc; 196 | } 197 | 198 | div.seealso { 199 | background-color: #ffc; 200 | border: 1px solid #ff6; 201 | } 202 | 203 | div.topic { 204 | background-color: #eee; 205 | } 206 | 207 | div.warning { 208 | background-color: #ffe4e4; 209 | border: 1px solid #f66; 210 | } 211 | 212 | p.admonition-title { 213 | display: inline; 214 | } 215 | 216 | p.admonition-title:after { 217 | content: ":"; 218 | } 219 | 220 | pre { 221 | padding: 5px; 222 | background-color: #eeffcc; 223 | color: #333333; 224 | line-height: 120%; 225 | border: 1px solid #ac9; 226 | border-left: none; 227 | border-right: none; 228 | } 229 | 230 | tt { 231 | background-color: #ecf0f3; 232 | padding: 0 1px 0 1px; 233 | font-size: 0.95em; 234 | } 235 | 236 | th { 237 | background-color: #ede; 238 | } 239 | 240 | .warning tt { 241 | background: #efc2c2; 242 | } 243 | 244 | .note tt { 245 | background: #d6d6d6; 246 | } 247 | 248 | .viewcode-back { 249 | font-family: sans-serif; 250 | } 251 | 252 | div.viewcode-block:target { 253 | background-color: #f4debf; 254 | border-top: 1px solid #ac9; 255 | border-bottom: 1px solid #ac9; 256 | } -------------------------------------------------------------------------------- /docs/_build/html/_static/dialog-note.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/dialog-note.png -------------------------------------------------------------------------------- /docs/_build/html/_static/dialog-seealso.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/dialog-seealso.png -------------------------------------------------------------------------------- /docs/_build/html/_static/dialog-topic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/dialog-topic.png -------------------------------------------------------------------------------- /docs/_build/html/_static/dialog-warning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/dialog-warning.png -------------------------------------------------------------------------------- /docs/_build/html/_static/doctools.js: -------------------------------------------------------------------------------- 1 | /* 2 | * doctools.js 3 | * ~~~~~~~~~~~ 4 | * 5 | * Sphinx JavaScript utilities for all documentation. 6 | * 7 | * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 
9 | * 10 | */ 11 | 12 | /** 13 | * select a different prefix for underscore 14 | */ 15 | $u = _.noConflict(); 16 | 17 | /** 18 | * make the code below compatible with browsers without 19 | * an installed firebug like debugger 20 | if (!window.console || !console.firebug) { 21 | var names = ["log", "debug", "info", "warn", "error", "assert", "dir", 22 | "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace", 23 | "profile", "profileEnd"]; 24 | window.console = {}; 25 | for (var i = 0; i < names.length; ++i) 26 | window.console[names[i]] = function() {}; 27 | } 28 | */ 29 | 30 | /** 31 | * small helper function to urldecode strings 32 | */ 33 | jQuery.urldecode = function(x) { 34 | return decodeURIComponent(x).replace(/\+/g, ' '); 35 | } 36 | 37 | /** 38 | * small helper function to urlencode strings 39 | */ 40 | jQuery.urlencode = encodeURIComponent; 41 | 42 | /** 43 | * This function returns the parsed url parameters of the 44 | * current request. Multiple values per key are supported, 45 | * it will always return arrays of strings for the value parts. 46 | */ 47 | jQuery.getQueryParameters = function(s) { 48 | if (typeof s == 'undefined') 49 | s = document.location.search; 50 | var parts = s.substr(s.indexOf('?') + 1).split('&'); 51 | var result = {}; 52 | for (var i = 0; i < parts.length; i++) { 53 | var tmp = parts[i].split('=', 2); 54 | var key = jQuery.urldecode(tmp[0]); 55 | var value = jQuery.urldecode(tmp[1]); 56 | if (key in result) 57 | result[key].push(value); 58 | else 59 | result[key] = [value]; 60 | } 61 | return result; 62 | }; 63 | 64 | /** 65 | * small function to check if an array contains 66 | * a given item. 67 | */ 68 | jQuery.contains = function(arr, item) { 69 | for (var i = 0; i < arr.length; i++) { 70 | if (arr[i] == item) 71 | return true; 72 | } 73 | return false; 74 | }; 75 | 76 | /** 77 | * highlight a given string on a jquery object by wrapping it in 78 | * span elements with the given class name. 79 | */ 80 | jQuery.fn.highlightText = function(text, className) { 81 | function highlight(node) { 82 | if (node.nodeType == 3) { 83 | var val = node.nodeValue; 84 | var pos = val.toLowerCase().indexOf(text); 85 | if (pos >= 0 && !jQuery(node.parentNode).hasClass(className)) { 86 | var span = document.createElement("span"); 87 | span.className = className; 88 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 89 | node.parentNode.insertBefore(span, node.parentNode.insertBefore( 90 | document.createTextNode(val.substr(pos + text.length)), 91 | node.nextSibling)); 92 | node.nodeValue = val.substr(0, pos); 93 | } 94 | } 95 | else if (!jQuery(node).is("button, select, textarea")) { 96 | jQuery.each(node.childNodes, function() { 97 | highlight(this); 98 | }); 99 | } 100 | } 101 | return this.each(function() { 102 | highlight(this); 103 | }); 104 | }; 105 | 106 | /** 107 | * Small JavaScript module for the documentation. 108 | */ 109 | var Documentation = { 110 | 111 | init : function() { 112 | this.fixFirefoxAnchorBug(); 113 | this.highlightSearchWords(); 114 | this.initIndexTable(); 115 | }, 116 | 117 | /** 118 | * i18n support 119 | */ 120 | TRANSLATIONS : {}, 121 | PLURAL_EXPR : function(n) { return n == 1 ? 
0 : 1; }, 122 | LOCALE : 'unknown', 123 | 124 | // gettext and ngettext don't access this so that the functions 125 | // can safely bound to a different name (_ = Documentation.gettext) 126 | gettext : function(string) { 127 | var translated = Documentation.TRANSLATIONS[string]; 128 | if (typeof translated == 'undefined') 129 | return string; 130 | return (typeof translated == 'string') ? translated : translated[0]; 131 | }, 132 | 133 | ngettext : function(singular, plural, n) { 134 | var translated = Documentation.TRANSLATIONS[singular]; 135 | if (typeof translated == 'undefined') 136 | return (n == 1) ? singular : plural; 137 | return translated[Documentation.PLURALEXPR(n)]; 138 | }, 139 | 140 | addTranslations : function(catalog) { 141 | for (var key in catalog.messages) 142 | this.TRANSLATIONS[key] = catalog.messages[key]; 143 | this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')'); 144 | this.LOCALE = catalog.locale; 145 | }, 146 | 147 | /** 148 | * add context elements like header anchor links 149 | */ 150 | addContextElements : function() { 151 | $('div[id] > :header:first').each(function() { 152 | $('\u00B6'). 153 | attr('href', '#' + this.id). 154 | attr('title', _('Permalink to this headline')). 155 | appendTo(this); 156 | }); 157 | $('dt[id]').each(function() { 158 | $('\u00B6'). 159 | attr('href', '#' + this.id). 160 | attr('title', _('Permalink to this definition')). 161 | appendTo(this); 162 | }); 163 | }, 164 | 165 | /** 166 | * workaround a firefox stupidity 167 | */ 168 | fixFirefoxAnchorBug : function() { 169 | if (document.location.hash && $.browser.mozilla) 170 | window.setTimeout(function() { 171 | document.location.href += ''; 172 | }, 10); 173 | }, 174 | 175 | /** 176 | * highlight the search words provided in the url in the text 177 | */ 178 | highlightSearchWords : function() { 179 | var params = $.getQueryParameters(); 180 | var terms = (params.highlight) ? 
params.highlight[0].split(/\s+/) : []; 181 | if (terms.length) { 182 | var body = $('div.body'); 183 | window.setTimeout(function() { 184 | $.each(terms, function() { 185 | body.highlightText(this.toLowerCase(), 'highlighted'); 186 | }); 187 | }, 10); 188 | $('') 190 | .appendTo($('#searchbox')); 191 | } 192 | }, 193 | 194 | /** 195 | * init the domain index toggle buttons 196 | */ 197 | initIndexTable : function() { 198 | var togglers = $('img.toggler').click(function() { 199 | var src = $(this).attr('src'); 200 | var idnum = $(this).attr('id').substr(7); 201 | $('tr.cg-' + idnum).toggle(); 202 | if (src.substr(-9) == 'minus.png') 203 | $(this).attr('src', src.substr(0, src.length-9) + 'plus.png'); 204 | else 205 | $(this).attr('src', src.substr(0, src.length-8) + 'minus.png'); 206 | }).css('display', ''); 207 | if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) { 208 | togglers.click(); 209 | } 210 | }, 211 | 212 | /** 213 | * helper function to hide the search marks again 214 | */ 215 | hideSearchWords : function() { 216 | $('#searchbox .highlight-link').fadeOut(300); 217 | $('span.highlighted').removeClass('highlighted'); 218 | }, 219 | 220 | /** 221 | * make the url absolute 222 | */ 223 | makeURL : function(relativeURL) { 224 | return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL; 225 | }, 226 | 227 | /** 228 | * get the current relative url 229 | */ 230 | getCurrentURL : function() { 231 | var path = document.location.pathname; 232 | var parts = path.split(/\//); 233 | $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() { 234 | if (this == '..') 235 | parts.pop(); 236 | }); 237 | var url = parts.join('/'); 238 | return path.substring(url.lastIndexOf('/') + 1, path.length - 1); 239 | } 240 | }; 241 | 242 | // quick alias for translations 243 | _ = Documentation.gettext; 244 | 245 | $(document).ready(function() { 246 | Documentation.init(); 247 | }); 248 | -------------------------------------------------------------------------------- /docs/_build/html/_static/down-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/down-pressed.png -------------------------------------------------------------------------------- /docs/_build/html/_static/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/down.png -------------------------------------------------------------------------------- /docs/_build/html/_static/epub.css: -------------------------------------------------------------------------------- 1 | /* 2 | * default.css_t 3 | * ~~~~~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- default theme. 6 | * 7 | * :copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 
9 | * 10 | */ 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: {{ theme_bodyfont }}; 18 | font-size: 100%; 19 | background-color: {{ theme_footerbgcolor }}; 20 | color: #000; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.document { 26 | background-color: {{ theme_sidebarbgcolor }}; 27 | } 28 | 29 | div.documentwrapper { 30 | float: left; 31 | width: 100%; 32 | } 33 | 34 | div.bodywrapper { 35 | margin: 0 0 0 230px; 36 | } 37 | 38 | div.body { 39 | background-color: {{ theme_bgcolor }}; 40 | color: {{ theme_textcolor }}; 41 | padding: 0 20px 30px 20px; 42 | } 43 | 44 | {%- if theme_rightsidebar|tobool %} 45 | div.bodywrapper { 46 | margin: 0 230px 0 0; 47 | } 48 | {%- endif %} 49 | 50 | div.footer { 51 | color: {{ theme_footertextcolor }}; 52 | width: 100%; 53 | padding: 9px 0 9px 0; 54 | text-align: center; 55 | font-size: 75%; 56 | } 57 | 58 | div.footer a { 59 | color: {{ theme_footertextcolor }}; 60 | text-decoration: underline; 61 | } 62 | 63 | div.related { 64 | background-color: {{ theme_relbarbgcolor }}; 65 | line-height: 30px; 66 | color: {{ theme_relbartextcolor }}; 67 | } 68 | 69 | div.related a { 70 | color: {{ theme_relbarlinkcolor }}; 71 | } 72 | 73 | div.sphinxsidebar { 74 | {%- if theme_stickysidebar|tobool %} 75 | top: 30px; 76 | bottom: 0; 77 | margin: 0; 78 | position: fixed; 79 | overflow: auto; 80 | height: auto; 81 | {%- endif %} 82 | {%- if theme_rightsidebar|tobool %} 83 | float: right; 84 | {%- if theme_stickysidebar|tobool %} 85 | right: 0; 86 | {%- endif %} 87 | {%- endif %} 88 | } 89 | 90 | {%- if theme_stickysidebar|tobool %} 91 | /* this is nice, but it it leads to hidden headings when jumping 92 | to an anchor */ 93 | /* 94 | div.related { 95 | position: fixed; 96 | } 97 | 98 | div.documentwrapper { 99 | margin-top: 30px; 100 | } 101 | */ 102 | {%- endif %} 103 | 104 | div.sphinxsidebar h3 { 105 | font-family: {{ theme_headfont }}; 106 | color: {{ theme_sidebartextcolor }}; 107 | font-size: 1.4em; 108 | font-weight: normal; 109 | margin: 0; 110 | padding: 0; 111 | } 112 | 113 | div.sphinxsidebar h3 a { 114 | color: {{ theme_sidebartextcolor }}; 115 | } 116 | 117 | div.sphinxsidebar h4 { 118 | font-family: {{ theme_headfont }}; 119 | color: {{ theme_sidebartextcolor }}; 120 | font-size: 1.3em; 121 | font-weight: normal; 122 | margin: 5px 0 0 0; 123 | padding: 0; 124 | } 125 | 126 | div.sphinxsidebar p { 127 | color: {{ theme_sidebartextcolor }}; 128 | } 129 | 130 | div.sphinxsidebar p.topless { 131 | margin: 5px 10px 10px 10px; 132 | } 133 | 134 | div.sphinxsidebar ul { 135 | margin: 10px; 136 | padding: 0; 137 | color: {{ theme_sidebartextcolor }}; 138 | } 139 | 140 | div.sphinxsidebar a { 141 | color: {{ theme_sidebarlinkcolor }}; 142 | } 143 | 144 | div.sphinxsidebar input { 145 | border: 1px solid {{ theme_sidebarlinkcolor }}; 146 | font-family: sans-serif; 147 | font-size: 1em; 148 | } 149 | 150 | {% if theme_collapsiblesidebar|tobool %} 151 | /* for collapsible sidebar */ 152 | div#sidebarbutton { 153 | background-color: {{ theme_sidebarbtncolor }}; 154 | } 155 | {% endif %} 156 | 157 | /* -- hyperlink styles ------------------------------------------------------ */ 158 | 159 | a { 160 | color: {{ theme_linkcolor }}; 161 | text-decoration: none; 162 | } 163 | 164 | a:visited { 165 | color: {{ theme_visitedlinkcolor }}; 166 | text-decoration: none; 167 | } 168 | 169 | a:hover { 170 | text-decoration: underline; 171 | } 172 | 173 | {% if 
theme_externalrefs|tobool %} 174 | a.external { 175 | text-decoration: none; 176 | border-bottom: 1px dashed {{ theme_linkcolor }}; 177 | } 178 | 179 | a.external:hover { 180 | text-decoration: none; 181 | border-bottom: none; 182 | } 183 | 184 | a.external:visited { 185 | text-decoration: none; 186 | border-bottom: 1px dashed {{ theme_visitedlinkcolor }}; 187 | } 188 | {% endif %} 189 | 190 | /* -- body styles ----------------------------------------------------------- */ 191 | 192 | div.body h1, 193 | div.body h2, 194 | div.body h3, 195 | div.body h4, 196 | div.body h5, 197 | div.body h6 { 198 | font-family: {{ theme_headfont }}; 199 | background-color: {{ theme_headbgcolor }}; 200 | font-weight: normal; 201 | color: {{ theme_headtextcolor }}; 202 | border-bottom: 1px solid #ccc; 203 | margin: 20px -20px 10px -20px; 204 | padding: 3px 0 3px 10px; 205 | } 206 | 207 | div.body h1 { margin-top: 0; font-size: 200%; } 208 | div.body h2 { font-size: 160%; } 209 | div.body h3 { font-size: 140%; } 210 | div.body h4 { font-size: 120%; } 211 | div.body h5 { font-size: 110%; } 212 | div.body h6 { font-size: 100%; } 213 | 214 | a.headerlink { 215 | color: {{ theme_headlinkcolor }}; 216 | font-size: 0.8em; 217 | padding: 0 4px 0 4px; 218 | text-decoration: none; 219 | } 220 | 221 | a.headerlink:hover { 222 | background-color: {{ theme_headlinkcolor }}; 223 | color: white; 224 | } 225 | 226 | div.body p, div.body dd, div.body li { 227 | text-align: justify; 228 | line-height: 130%; 229 | } 230 | 231 | div.admonition p.admonition-title + p { 232 | display: inline; 233 | } 234 | 235 | div.admonition p { 236 | margin-bottom: 5px; 237 | } 238 | 239 | div.admonition pre { 240 | margin-bottom: 5px; 241 | } 242 | 243 | div.admonition ul, div.admonition ol { 244 | margin-bottom: 5px; 245 | } 246 | 247 | div.note { 248 | background-color: #eee; 249 | border: 1px solid #ccc; 250 | } 251 | 252 | div.seealso { 253 | background-color: #ffc; 254 | border: 1px solid #ff6; 255 | } 256 | 257 | div.topic { 258 | background-color: #eee; 259 | } 260 | 261 | div.warning { 262 | background-color: #ffe4e4; 263 | border: 1px solid #f66; 264 | } 265 | 266 | p.admonition-title { 267 | display: inline; 268 | } 269 | 270 | p.admonition-title:after { 271 | content: ":"; 272 | } 273 | 274 | pre { 275 | padding: 5px; 276 | background-color: {{ theme_codebgcolor }}; 277 | color: {{ theme_codetextcolor }}; 278 | line-height: 120%; 279 | border: 1px solid #ac9; 280 | border-left: none; 281 | border-right: none; 282 | } 283 | 284 | tt { 285 | background-color: #ecf0f3; 286 | padding: 0 1px 0 1px; 287 | font-size: 0.95em; 288 | } 289 | 290 | th { 291 | background-color: #ede; 292 | } 293 | 294 | .warning tt { 295 | background: #efc2c2; 296 | } 297 | 298 | .note tt { 299 | background: #d6d6d6; 300 | } 301 | 302 | .viewcode-back { 303 | font-family: {{ theme_bodyfont }}; 304 | } 305 | 306 | div.viewcode-block:target { 307 | background-color: #f4debf; 308 | border-top: 1px solid #ac9; 309 | border-bottom: 1px solid #ac9; 310 | } 311 | -------------------------------------------------------------------------------- /docs/_build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/file.png -------------------------------------------------------------------------------- /docs/_build/html/_static/footerbg.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/footerbg.png -------------------------------------------------------------------------------- /docs/_build/html/_static/headerbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/headerbg.png -------------------------------------------------------------------------------- /docs/_build/html/_static/ie6.css: -------------------------------------------------------------------------------- 1 | * html img, 2 | * html .png{position:relative;behavior:expression((this.runtimeStyle.behavior="none")&&(this.pngSet?this.pngSet=true:(this.nodeName == "IMG" && this.src.toLowerCase().indexOf('.png')>-1?(this.runtimeStyle.backgroundImage = "none", 3 | this.runtimeStyle.filter = "progid:DXImageTransform.Microsoft.AlphaImageLoader(src='" + this.src + "',sizingMethod='image')", 4 | this.src = "_static/transparent.gif"):(this.origBg = this.origBg? this.origBg :this.currentStyle.backgroundImage.toString().replace('url("','').replace('")',''), 5 | this.runtimeStyle.filter = "progid:DXImageTransform.Microsoft.AlphaImageLoader(src='" + this.origBg + "',sizingMethod='crop')", 6 | this.runtimeStyle.backgroundImage = "none")),this.pngSet=true) 7 | );} 8 | -------------------------------------------------------------------------------- /docs/_build/html/_static/middlebg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/middlebg.png -------------------------------------------------------------------------------- /docs/_build/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/minus.png -------------------------------------------------------------------------------- /docs/_build/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/plus.png -------------------------------------------------------------------------------- /docs/_build/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #ffffcc } 2 | .highlight { background: #eeffcc; } 3 | .highlight .c { color: #408090; font-style: italic } /* Comment */ 4 | .highlight .err { border: 1px solid #FF0000 } /* Error */ 5 | .highlight .k { color: #007020; font-weight: bold } /* Keyword */ 6 | .highlight .o { color: #666666 } /* Operator */ 7 | .highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */ 8 | .highlight .cp { color: #007020 } /* Comment.Preproc */ 9 | .highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */ 10 | .highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */ 11 | .highlight .gd { color: #A00000 } /* Generic.Deleted */ 12 | .highlight .ge { font-style: italic } /* Generic.Emph */ 13 | .highlight .gr { color: #FF0000 } 
/* Generic.Error */ 14 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 15 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 16 | .highlight .go { color: #303030 } /* Generic.Output */ 17 | .highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ 18 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 19 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 20 | .highlight .gt { color: #0040D0 } /* Generic.Traceback */ 21 | .highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ 22 | .highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ 23 | .highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ 24 | .highlight .kp { color: #007020 } /* Keyword.Pseudo */ 25 | .highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ 26 | .highlight .kt { color: #902000 } /* Keyword.Type */ 27 | .highlight .m { color: #208050 } /* Literal.Number */ 28 | .highlight .s { color: #4070a0 } /* Literal.String */ 29 | .highlight .na { color: #4070a0 } /* Name.Attribute */ 30 | .highlight .nb { color: #007020 } /* Name.Builtin */ 31 | .highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ 32 | .highlight .no { color: #60add5 } /* Name.Constant */ 33 | .highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ 34 | .highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ 35 | .highlight .ne { color: #007020 } /* Name.Exception */ 36 | .highlight .nf { color: #06287e } /* Name.Function */ 37 | .highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ 38 | .highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ 39 | .highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ 40 | .highlight .nv { color: #bb60d5 } /* Name.Variable */ 41 | .highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ 42 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 43 | .highlight .mf { color: #208050 } /* Literal.Number.Float */ 44 | .highlight .mh { color: #208050 } /* Literal.Number.Hex */ 45 | .highlight .mi { color: #208050 } /* Literal.Number.Integer */ 46 | .highlight .mo { color: #208050 } /* Literal.Number.Oct */ 47 | .highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ 48 | .highlight .sc { color: #4070a0 } /* Literal.String.Char */ 49 | .highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ 50 | .highlight .s2 { color: #4070a0 } /* Literal.String.Double */ 51 | .highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ 52 | .highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ 53 | .highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */ 54 | .highlight .sx { color: #c65d09 } /* Literal.String.Other */ 55 | .highlight .sr { color: #235388 } /* Literal.String.Regex */ 56 | .highlight .s1 { color: #4070a0 } /* Literal.String.Single */ 57 | .highlight .ss { color: #517918 } /* Literal.String.Symbol */ 58 | .highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ 59 | .highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ 60 | .highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ 61 | .highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ 62 | .highlight .il { color: #208050 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/_build/html/_static/pyramid.css: 
-------------------------------------------------------------------------------- 1 | /* 2 | * pylons.css_t 3 | * ~~~~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- pylons theme. 6 | * 7 | * :copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: "Nobile", sans-serif; 18 | font-size: 100%; 19 | background-color: #393939; 20 | color: #ffffff; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.documentwrapper { 26 | float: left; 27 | width: 100%; 28 | } 29 | 30 | div.bodywrapper { 31 | margin: 0 0 0 230px; 32 | } 33 | 34 | hr { 35 | border: 1px solid #B1B4B6; 36 | } 37 | 38 | div.document { 39 | background-color: #eee; 40 | } 41 | 42 | div.header { 43 | width:100%; 44 | background: #f4ad32 url(headerbg.png) repeat-x 0 top; 45 | border-bottom: 2px solid #ffffff; 46 | } 47 | 48 | div.logo { 49 | text-align: center; 50 | padding-top: 10px; 51 | } 52 | 53 | div.body { 54 | background-color: #ffffff; 55 | color: #3E4349; 56 | padding: 0 30px 30px 30px; 57 | font-size: 1em; 58 | border: 2px solid #ddd; 59 | border-right-style: none; 60 | overflow: auto; 61 | } 62 | 63 | div.footer { 64 | color: #ffffff; 65 | width: 100%; 66 | padding: 13px 0; 67 | text-align: center; 68 | font-size: 75%; 69 | background: transparent; 70 | clear:both; 71 | } 72 | 73 | div.footer a { 74 | color: #ffffff; 75 | text-decoration: none; 76 | } 77 | 78 | div.footer a:hover { 79 | color: #e88f00; 80 | text-decoration: underline; 81 | } 82 | 83 | div.related { 84 | line-height: 30px; 85 | color: #373839; 86 | font-size: 0.8em; 87 | background-color: #eee; 88 | } 89 | 90 | div.related a { 91 | color: #1b61d6; 92 | } 93 | 94 | div.related ul { 95 | padding-left: 240px; 96 | } 97 | 98 | div.sphinxsidebar { 99 | font-size: 0.75em; 100 | line-height: 1.5em; 101 | } 102 | 103 | div.sphinxsidebarwrapper{ 104 | padding: 10px 0; 105 | } 106 | 107 | div.sphinxsidebar h3, 108 | div.sphinxsidebar h4 { 109 | font-family: "Neuton", sans-serif; 110 | color: #373839; 111 | font-size: 1.4em; 112 | font-weight: normal; 113 | margin: 0; 114 | padding: 5px 10px; 115 | border-bottom: 2px solid #ddd; 116 | } 117 | 118 | div.sphinxsidebar h4{ 119 | font-size: 1.3em; 120 | } 121 | 122 | div.sphinxsidebar h3 a { 123 | color: #000000; 124 | } 125 | 126 | 127 | div.sphinxsidebar p { 128 | color: #888; 129 | padding: 5px 20px; 130 | } 131 | 132 | div.sphinxsidebar p.topless { 133 | } 134 | 135 | div.sphinxsidebar ul { 136 | margin: 10px 20px; 137 | padding: 0; 138 | color: #373839; 139 | } 140 | 141 | div.sphinxsidebar a { 142 | color: #444; 143 | } 144 | 145 | div.sphinxsidebar input { 146 | border: 1px solid #ccc; 147 | font-family: sans-serif; 148 | font-size: 1em; 149 | } 150 | 151 | div.sphinxsidebar input[type=text]{ 152 | margin-left: 20px; 153 | } 154 | 155 | /* -- sidebars -------------------------------------------------------------- */ 156 | 157 | div.sidebar { 158 | margin: 0 0 0.5em 1em; 159 | border: 2px solid #c6d880; 160 | background-color: #e6efc2; 161 | width: 40%; 162 | float: right; 163 | border-right-style: none; 164 | border-left-style: none; 165 | padding: 10px 20px; 166 | } 167 | 168 | p.sidebar-title { 169 | font-weight: bold; 170 | } 171 | 172 | /* -- body styles ----------------------------------------------------------- */ 173 | 174 | a, a .pre { 175 | color: #1b61d6; 176 | text-decoration: none; 177 
| } 178 | 179 | a:hover, a:hover .pre { 180 | text-decoration: underline; 181 | } 182 | 183 | div.body h1, 184 | div.body h2, 185 | div.body h3, 186 | div.body h4, 187 | div.body h5, 188 | div.body h6 { 189 | font-family: "Neuton", sans-serif; 190 | background-color: #ffffff; 191 | font-weight: normal; 192 | color: #373839; 193 | margin: 30px 0px 10px 0px; 194 | padding: 5px 0; 195 | } 196 | 197 | div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 200%; } 198 | div.body h2 { font-size: 150%; background-color: #ffffff; } 199 | div.body h3 { font-size: 120%; background-color: #ffffff; } 200 | div.body h4 { font-size: 110%; background-color: #ffffff; } 201 | div.body h5 { font-size: 100%; background-color: #ffffff; } 202 | div.body h6 { font-size: 100%; background-color: #ffffff; } 203 | 204 | a.headerlink { 205 | color: #1b61d6; 206 | font-size: 0.8em; 207 | padding: 0 4px 0 4px; 208 | text-decoration: none; 209 | } 210 | 211 | a.headerlink:hover { 212 | text-decoration: underline; 213 | } 214 | 215 | div.body p, div.body dd, div.body li { 216 | line-height: 1.5em; 217 | } 218 | 219 | div.admonition p.admonition-title + p { 220 | display: inline; 221 | } 222 | 223 | div.highlight{ 224 | background-color: white; 225 | } 226 | 227 | div.note { 228 | border: 2px solid #7a9eec; 229 | border-right-style: none; 230 | border-left-style: none; 231 | padding: 10px 20px 10px 60px; 232 | background: #e1ecfe url(dialog-note.png) no-repeat 10px 8px; 233 | } 234 | 235 | div.seealso { 236 | background: #fff6bf url(dialog-seealso.png) no-repeat 10px 8px; 237 | border: 2px solid #ffd324; 238 | border-left-style: none; 239 | border-right-style: none; 240 | padding: 10px 20px 10px 60px; 241 | } 242 | 243 | div.topic { 244 | background: #eeeeee; 245 | border: 2px solid #C6C9CB; 246 | padding: 10px 20px; 247 | border-right-style: none; 248 | border-left-style: none; 249 | } 250 | 251 | div.warning { 252 | background: #fbe3e4 url(dialog-warning.png) no-repeat 10px 8px; 253 | border: 2px solid #fbc2c4; 254 | border-right-style: none; 255 | border-left-style: none; 256 | padding: 10px 20px 10px 60px; 257 | } 258 | 259 | p.admonition-title { 260 | display: none; 261 | } 262 | 263 | p.admonition-title:after { 264 | content: ":"; 265 | } 266 | 267 | pre { 268 | padding: 10px; 269 | background-color: #fafafa; 270 | color: #222; 271 | line-height: 1.2em; 272 | border: 2px solid #C6C9CB; 273 | font-size: 1.1em; 274 | margin: 1.5em 0 1.5em 0; 275 | border-right-style: none; 276 | border-left-style: none; 277 | } 278 | 279 | tt { 280 | background-color: transparent; 281 | color: #222; 282 | font-size: 1.1em; 283 | font-family: monospace; 284 | } 285 | 286 | .viewcode-back { 287 | font-family: "Nobile", sans-serif; 288 | } 289 | 290 | div.viewcode-block:target { 291 | background-color: #fff6bf; 292 | border: 2px solid #ffd324; 293 | border-left-style: none; 294 | border-right-style: none; 295 | padding: 10px 20px; 296 | } 297 | 298 | table.highlighttable { 299 | width: 100%; 300 | } 301 | 302 | table.highlighttable td { 303 | padding: 0; 304 | } 305 | 306 | a em.std-term { 307 | color: #007f00; 308 | } 309 | 310 | a:hover em.std-term { 311 | text-decoration: underline; 312 | } 313 | 314 | .download { 315 | font-family: "Nobile", sans-serif; 316 | font-weight: normal; 317 | font-style: normal; 318 | } 319 | 320 | tt.xref { 321 | font-weight: normal; 322 | font-style: normal; 323 | } -------------------------------------------------------------------------------- /docs/_build/html/_static/sidebar.js: 
-------------------------------------------------------------------------------- 1 | /* 2 | * sidebar.js 3 | * ~~~~~~~~~~ 4 | * 5 | * This script makes the Sphinx sidebar collapsible. 6 | * 7 | * .sphinxsidebar contains .sphinxsidebarwrapper. This script adds 8 | * in .sphixsidebar, after .sphinxsidebarwrapper, the #sidebarbutton 9 | * used to collapse and expand the sidebar. 10 | * 11 | * When the sidebar is collapsed the .sphinxsidebarwrapper is hidden 12 | * and the width of the sidebar and the margin-left of the document 13 | * are decreased. When the sidebar is expanded the opposite happens. 14 | * This script saves a per-browser/per-session cookie used to 15 | * remember the position of the sidebar among the pages. 16 | * Once the browser is closed the cookie is deleted and the position 17 | * reset to the default (expanded). 18 | * 19 | * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 20 | * :license: BSD, see LICENSE for details. 21 | * 22 | */ 23 | 24 | $(function() { 25 | // global elements used by the functions. 26 | // the 'sidebarbutton' element is defined as global after its 27 | // creation, in the add_sidebar_button function 28 | var bodywrapper = $('.bodywrapper'); 29 | var sidebar = $('.sphinxsidebar'); 30 | var sidebarwrapper = $('.sphinxsidebarwrapper'); 31 | 32 | // for some reason, the document has no sidebar; do not run into errors 33 | if (!sidebar.length) return; 34 | 35 | // original margin-left of the bodywrapper and width of the sidebar 36 | // with the sidebar expanded 37 | var bw_margin_expanded = bodywrapper.css('margin-left'); 38 | var ssb_width_expanded = sidebar.width(); 39 | 40 | // margin-left of the bodywrapper and width of the sidebar 41 | // with the sidebar collapsed 42 | var bw_margin_collapsed = '.8em'; 43 | var ssb_width_collapsed = '.8em'; 44 | 45 | // colors used by the current theme 46 | var dark_color = $('.related').css('background-color'); 47 | var light_color = $('.document').css('background-color'); 48 | 49 | function sidebar_is_collapsed() { 50 | return sidebarwrapper.is(':not(:visible)'); 51 | } 52 | 53 | function toggle_sidebar() { 54 | if (sidebar_is_collapsed()) 55 | expand_sidebar(); 56 | else 57 | collapse_sidebar(); 58 | } 59 | 60 | function collapse_sidebar() { 61 | sidebarwrapper.hide(); 62 | sidebar.css('width', ssb_width_collapsed); 63 | bodywrapper.css('margin-left', bw_margin_collapsed); 64 | sidebarbutton.css({ 65 | 'margin-left': '0', 66 | 'height': bodywrapper.height() 67 | }); 68 | sidebarbutton.find('span').text('»'); 69 | sidebarbutton.attr('title', _('Expand sidebar')); 70 | document.cookie = 'sidebar=collapsed'; 71 | } 72 | 73 | function expand_sidebar() { 74 | bodywrapper.css('margin-left', bw_margin_expanded); 75 | sidebar.css('width', ssb_width_expanded); 76 | sidebarwrapper.show(); 77 | sidebarbutton.css({ 78 | 'margin-left': ssb_width_expanded-12, 79 | 'height': bodywrapper.height() 80 | }); 81 | sidebarbutton.find('span').text('«'); 82 | sidebarbutton.attr('title', _('Collapse sidebar')); 83 | document.cookie = 'sidebar=expanded'; 84 | } 85 | 86 | function add_sidebar_button() { 87 | sidebarwrapper.css({ 88 | 'float': 'left', 89 | 'margin-right': '0', 90 | 'width': ssb_width_expanded - 28 91 | }); 92 | // create the button 93 | sidebar.append( 94 | '
«
' 95 | ); 96 | var sidebarbutton = $('#sidebarbutton'); 97 | light_color = sidebarbutton.css('background-color'); 98 | // find the height of the viewport to center the '<<' in the page 99 | var viewport_height; 100 | if (window.innerHeight) 101 | viewport_height = window.innerHeight; 102 | else 103 | viewport_height = $(window).height(); 104 | sidebarbutton.find('span').css({ 105 | 'display': 'block', 106 | 'margin-top': (viewport_height - sidebar.position().top - 20) / 2 107 | }); 108 | 109 | sidebarbutton.click(toggle_sidebar); 110 | sidebarbutton.attr('title', _('Collapse sidebar')); 111 | sidebarbutton.css({ 112 | 'color': '#FFFFFF', 113 | 'border-left': '1px solid ' + dark_color, 114 | 'font-size': '1.2em', 115 | 'cursor': 'pointer', 116 | 'height': bodywrapper.height(), 117 | 'padding-top': '1px', 118 | 'margin-left': ssb_width_expanded - 12 119 | }); 120 | 121 | sidebarbutton.hover( 122 | function () { 123 | $(this).css('background-color', dark_color); 124 | }, 125 | function () { 126 | $(this).css('background-color', light_color); 127 | } 128 | ); 129 | } 130 | 131 | function set_position_from_cookie() { 132 | if (!document.cookie) 133 | return; 134 | var items = document.cookie.split(';'); 135 | for(var k=0; k=e.computed&&(e={value:f,computed:g})});return e.value};b.min=function(a,c,d){if(!c&&b.isArray(a))return Math.min.apply(Math,a);var e={computed:Infinity};b.each(a,function(f,g,h){g=c?c.call(d,f,g,h):f;gf?1:0}),"value")};b.sortedIndex=function(a,c,d){d=d||b.identity;for(var e=0,f=a.length;e>1;d(a[g])=0})})};b.zip=function(){for(var a=b.toArray(arguments),c=b.max(b.pluck(a,"length")),d=new Array(c),e=0;e0?f-c:c-f)>=0)return e;e[g++]=f}};b.bind=function(a,c){var d=b.rest(arguments,2);return function(){return a.apply(c||j,d.concat(b.toArray(arguments)))}};b.bindAll=function(a){var c=b.rest(arguments);if(c.length==0)c=b.functions(a);b.each(c,function(d){a[d]=b.bind(a[d],a)}); 17 | return a};b.delay=function(a,c){var d=b.rest(arguments,2);return setTimeout(function(){return a.apply(a,d)},c)};b.defer=function(a){return b.delay.apply(b,[a,1].concat(b.rest(arguments)))};b.wrap=function(a,c){return function(){var d=[a].concat(b.toArray(arguments));return c.apply(c,d)}};b.compose=function(){var a=b.toArray(arguments);return function(){for(var c=b.toArray(arguments),d=a.length-1;d>=0;d--)c=[a[d].apply(this,c)];return c[0]}};b.keys=function(a){if(b.isArray(a))return b.range(0,a.length); 18 | var c=[];for(var d in a)q.call(a,d)&&c.push(d);return c};b.values=function(a){return b.map(a,b.identity)};b.functions=function(a){return b.select(b.keys(a),function(c){return b.isFunction(a[c])}).sort()};b.extend=function(a,c){for(var d in c)a[d]=c[d];return a};b.clone=function(a){if(b.isArray(a))return a.slice(0);return b.extend({},a)};b.tap=function(a,c){c(a);return a};b.isEqual=function(a,c){if(a===c)return true;var d=typeof a;if(d!=typeof c)return false;if(a==c)return true;if(!a&&c||a&&!c)return false; 19 | if(a.isEqual)return a.isEqual(c);if(b.isDate(a)&&b.isDate(c))return a.getTime()===c.getTime();if(b.isNaN(a)&&b.isNaN(c))return true;if(b.isRegExp(a)&&b.isRegExp(c))return a.source===c.source&&a.global===c.global&&a.ignoreCase===c.ignoreCase&&a.multiline===c.multiline;if(d!=="object")return false;if(a.length&&a.length!==c.length)return false;d=b.keys(a);var e=b.keys(c);if(d.length!=e.length)return false;for(var f in a)if(!b.isEqual(a[f],c[f]))return false;return true};b.isEmpty=function(a){return b.keys(a).length== 20 | 
0};b.isElement=function(a){return!!(a&&a.nodeType==1)};b.isArray=function(a){return!!(a&&a.concat&&a.unshift)};b.isArguments=function(a){return a&&b.isNumber(a.length)&&!b.isArray(a)&&!r.call(a,"length")};b.isFunction=function(a){return!!(a&&a.constructor&&a.call&&a.apply)};b.isString=function(a){return!!(a===""||a&&a.charCodeAt&&a.substr)};b.isNumber=function(a){return p.call(a)==="[object Number]"};b.isDate=function(a){return!!(a&&a.getTimezoneOffset&&a.setUTCFullYear)};b.isRegExp=function(a){return!!(a&& 21 | a.test&&a.exec&&(a.ignoreCase||a.ignoreCase===false))};b.isNaN=function(a){return b.isNumber(a)&&isNaN(a)};b.isNull=function(a){return a===null};b.isUndefined=function(a){return typeof a=="undefined"};b.noConflict=function(){j._=n;return this};b.identity=function(a){return a};b.breakLoop=function(){throw m;};var s=0;b.uniqueId=function(a){var c=s++;return a?a+c:c};b.template=function(a,c){a=new Function("obj","var p=[],print=function(){p.push.apply(p,arguments);};with(obj){p.push('"+a.replace(/[\r\t\n]/g, 22 | " ").replace(/'(?=[^%]*%>)/g,"\t").split("'").join("\\'").split("\t").join("'").replace(/<%=(.+?)%>/g,"',$1,'").split("<%").join("');").split("%>").join("p.push('")+"');}return p.join('');");return c?a(c):a};b.forEach=b.each;b.foldl=b.inject=b.reduce;b.foldr=b.reduceRight;b.filter=b.select;b.every=b.all;b.some=b.any;b.head=b.first;b.tail=b.rest;b.methods=b.functions;var l=function(a,c){return c?b(a).chain():a};b.each(b.functions(b),function(a){var c=b[a];i.prototype[a]=function(){var d=b.toArray(arguments); 23 | o.call(d,this._wrapped);return l(c.apply(b,d),this._chain)}});b.each(["pop","push","reverse","shift","sort","splice","unshift"],function(a){var c=Array.prototype[a];i.prototype[a]=function(){c.apply(this._wrapped,arguments);return l(this._wrapped,this._chain)}});b.each(["concat","join","slice"],function(a){var c=Array.prototype[a];i.prototype[a]=function(){return l(c.apply(this._wrapped,arguments),this._chain)}});i.prototype.chain=function(){this._chain=true;return this};i.prototype.value=function(){return this._wrapped}})(); 24 | -------------------------------------------------------------------------------- /docs/_build/html/_static/up-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/up-pressed.png -------------------------------------------------------------------------------- /docs/_build/html/_static/up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/_static/up.png -------------------------------------------------------------------------------- /docs/_build/html/disqus_jnlp.html.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | <no title> — Japanese Natural Language Processing 11 | 12 | 13 | 14 | 15 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 33 | 34 | 35 | 36 | 37 | 43 | 44 |
45 |
46 |
47 |
48 | 49 |
50 | 61 | 62 | blog comments powered by Disqus 63 | 64 |
65 |
66 |
67 |
68 |
69 | 81 | 82 |
83 |
84 |
85 |
86 | 92 | 95 | 96 | -------------------------------------------------------------------------------- /docs/_build/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Index — Japanese Natural Language Processing 13 | 14 | 15 | 16 | 17 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 35 | 36 | 37 | 38 | 39 | 48 | 49 |
50 |
51 |
52 |
53 | 54 | 55 |

Index

56 | 57 |
58 | 59 |
60 | 61 | 62 |
63 |
64 |
65 |
66 |
67 | 68 | 69 | 70 | 82 | 83 |
84 |
85 |
86 |
87 | 96 | 100 | 101 | -------------------------------------------------------------------------------- /docs/_build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/docs/_build/html/objects.inv -------------------------------------------------------------------------------- /docs/_build/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Search — Japanese Natural Language Processing 11 | 12 | 13 | 14 | 15 | 24 | 25 | 26 | 27 | 28 | 29 | 32 | 33 | 34 | 35 | 38 | 39 | 40 | 41 | 42 | 43 | 49 | 50 |
51 |
52 |
53 |
54 | 55 |

Search

56 |
57 | 58 |

59 | Please activate JavaScript to enable the search 60 | functionality. 61 |

62 |
63 |

64 | From here you can search these documents. Enter your search 65 | words into the box below and click "search". Note that the search 66 | function will automatically search for all of the words. Pages 67 | containing fewer words won't appear in the result list. 68 |

69 |
70 | 71 | 72 | 73 |
74 | 75 |
76 | 77 |
78 | 79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 | 94 | 97 | 98 | -------------------------------------------------------------------------------- /docs/_build/html/searchindex.js: -------------------------------------------------------------------------------- 1 | Search.setIndex({objects:{},terms:{all:0,code:0,queri:0,follow:0,"\u3064\u307e\u308a\u3053\u306e\u524d\u306e\u91d1\u66dc\u65e5\u306b\u99c5\u3067\u898b\u304b\u3051\u305f":0,research:0,morpholog:0,iconv:0,depend:0,father:0,"\u4e00\u822c":0,contentwindow:0,cform:0,bugreport:[],sens:0,hiraganachartfil:0,"\u79c1":0,string:0,"\u30cb":0,electron:0,verb:0,join:0,hachi:0,wsd:0,iframe1:0,button:0,div:[0,1],"\u8aad\u70b9":0,pleas:[0,1],past:[0,1],"\uff15":0,download:0,appendchild:[0,1],neg:0,section:0,version:[],newheight:0,net:0,can:0,gozen:0,jconvert:0,never:0,nichi:0,bodi:[0,1],trunk:0,path:0,u5f7c:0,gaikyou:0,search:0,talent:0,forum:[0,1],"\u6c17\u8c61\u5e81\u304c\uff12\uff11\u65e5\u5348\u524d\uff14\u6642\uff14\uff18\u5206":0,"\u5f7c":0,smoke:0,chang:0,head:[0,1],search_with_exampl:0,scrollwidth:0,negative_scor:0,modul:0,"\u8a8d":0,"\u3081\u3066\u3044\u308b":0,href:[0,1],u79c1:0,instal:0,txt:0,"\u30ef\u30bf\u30b7":0,from:0,thre:0,"\u898b\u304b\u3051":0,two:0,kevincobain2000:0,stylesheet:0,minhash:0,disambigu:0,edict_search:0,edict_dictionari:0,more:0,src:[0,1],ctype:0,"\u4fc2\u52a9\u8a5e":0,train:0,women:0,"\u30cb\u30c1":0,word:0,setup:0,work:0,histori:0,disqus_thread:[0,1],tab:0,eng:0,charset:0,kisyouty:0,classifi:0,"\u30ab\u30ec":0,how:0,"\uff58\u7dda\u5199\u771f\u3067\u7570\u72b6\u304c":0,"\u540d\u8a5e":0,simpl:0,css:0,map:0,"\u6f14\u51fa":0,clone:0,"\u524d":0,befor:[0,1],date:0,marginheight:0,data:0,"\u30c4\u30de\u30ea":0,github:0,noun:0,third:0,author:0,ambigu:0,jcabocha:0,nltk:0,ouput:0,approv:0,jp_wn:0,nbsp:0,edict_examples_path:0,input_sent:0,katakanachart:0,group:0,hiragana:0,"return":0,"\u30c7":0,python:0,sentenc:0,"\u30df\u30ab\u30b1":0,"\u51b7\u3048\u77e5\u3089\u305a":0,hapyou:0,pulkit:0,"\u65e5":0,name:0,edit:[0,1],"\u91d1\u66dc\u65e5":0,token:0,"\u3053\u308c\u3067\u30a2\u30ca\u30bf\u3082\u51b7\u3048\u77e5\u3089\u305a":0,createel:[0,1],"\u76e3\u7763":0,replac:[0,1],chunk:0,ifram:0,recogn:0,baselin:0,variabl:[0,1],"\u6bcd\u306f\u6c7a\u3057\u3066\u79c1\u306e\u7d50\u5a5a\u3092":0,sentiment:0,is0:0,dsq:[0,1],content:0,rel:0,print:0,yoru:0,"\u5168\u90e8":0,base:0,dictionari:0,wnjpn:0,org:0,"\u30ce":0,bash:0,sentiwordnet_3:0,jsentiment:0,iso:0,getelementsbytagnam:[0,1],origin:0,onc:0,"\u3053\u308c\u306f\u308f\u304b\u3089\u306a\u3044\u3067\u3059":0,script:[0,1],licens:0,tok:0,long_substr:0,termin:0,licenc:0,disqu:[0,1],hun:0,pars:0,pronunci:0,provid:0,project:0,contactform:[],posit:0,analysi:0,jread:0,ichi:0,rai:0,packag:0,"\u63a5\u5c3e":0,have:0,kanjid:0,katakana:0,u306f:0,"150px":0,adnoun:0,isti:0,note:0,exampl:[0,1],which:0,"\u5f7c\u306f\u5973\u6027\u306e\u55ab\u7159\u3092\u3044\u3044\u3082\u306e\u3060\u3068":0,"\u8a18\u53f7":0,romaji:0,english:0,chart:0,plan:0,america:0,homepag:0,"class":[0,1],kathuria:0,don:[0,1],"\u52a9\u8a5e":0,doc:0,cover:0,"\u30f2":0,abnorm:0,shortnam:[0,1],"\u3064\u307e\u308a":0,"\u3081\u306a\u3044":0,cabocha:0,nlp:0,wikipedia:0,sentiwordnet:0,"_dict":0,onli:0,configur:[0,1],"\u4ff3\u512a":0,edict:0,"\u3081\u3089\u308c\u307e\u3059":0,contribut:[],pypi:0,"\u6bcd\u306f\u79c1\u306e\u8a08\u753b\u3092\u3088\u3044\u3068":0,repo:0,longest:0,"\u8a8d\u3081\u308b":0,requir:[0,1],enabl:[0,1],emb:[0,1],jprocess:0,borrow:0,common:0,view:[0,1],wiki:0,conform:0,"\u5168\u90e8\u6700\u9ad8":0,see:0,detect:0,enumer:0,en_swn:0,score:0,between:0,edict_search_app:0,"impo
rt":0,appreci:0,"\u683c\u52a9\u8a5e":0,javascript:[0,1],here:0,"\u6211\u3005\u306f\u5f7c\u306e\u624d\u80fd\u3092":0,ch12:0,"\u4ee3\u540d\u8a5e":0,admit:0,"\u79c1\u306f\u5f7c\u3092\uff15\u65e5\u524d":0,"\u526f\u8a5e\u53ef\u80fd":0,com:[0,1],disqus_shortnam:[0,1],comment:[0,1],height:0,jnlp:[0,1],list_of_token:0,ref_noscript:[0,1],convert:0,func:0,positive_scor:0,nlpwww:0,properti:0,sourceforg:0,cobocha:0,"\u6570":0,marriag:0,abov:0,"\u52a9\u6570\u8a5e":0,observ:0,demo:0,wordnet:0,develop:0,japanes:0,parti:0,"\u30de\u30a8":0,read:0,html:0,itali:0,document:[0,1],scrollheight:0,brlink:[0,1],http:[0,1],utf8:0,upon:0,"\u3081\u306a\u3044\u3060\u308d\u3046":0,jtoken:0,tokenizedromaji:0,"\u767a\u8868\u3057\u305f\u5929\u6c17\u6982\u6cc1\u306b\u3088\u308b\u3068":0,entri:0,getelementbyid:0,pickl:0,contact:0,thi:[0,1],mother:0,latest:0,paul:0,languag:0,noscript:[0,1],onload:0,blog:[0,1],framebord:0,"\u30bf":0,bin:0,"\u99c5":0,"\u3053\u308c\u3067\u30a2\u51b7\u3048\u77e5\u3089\u305a\u30ca\u30bf\u3082":0,format:0,webpag:[0,1],"\u30cf":0,uff15:0,output:0,tenki:0,www:0,edict_exampl:0,back:0,sampl:0,home:0,librari:0,cnr:0,definit:0,overlap:0,"\u30b4":0,"\u30ad\u30f3\u30e8\u30a6\u30d3":0,cgi:0,run:0,power:[0,1],usag:[],async:[0,1],"0_20100908":0,eucjp:0,edict_path:0,encod:0,"\u7236\u306f\u6c7a\u3057\u3066\u79c1\u306e\u7d50\u5a5a\u3092":0,your:[0,1],git:0,span:[0,1],s1010205:0,width:0,includ:0,newwidth:0,"var":[0,1],icon:0,"function":[0,1],jpwordnet:0,link:0,"\u30a8\u30ad":0,line:[0,1],"true":[0,1],utf:0,type:[0,1],googlecod:0,below:[0,1],linktoorigin:0,similar:0,parser:0,doesn:0,incomplet:0,"\u30b3\u30ce":0,file:0,logo:[0,1],check:0,yon:0,titl:1,save_with_utf:0,nict:0,book:0,katakanachartfil:0,polar:0,"\u30b9\u30c8\u30fc\u30ea\u30fc":0,u3092:0,"\u3053\u308c\u306f\u4f55\u3067\u3059\u304b":0,goin:0,svn:0,sphinx:0,kanji:0,to_str:0,text:[0,1],time:0,autores:0,jaist:0},objtypes:{},titles:["1   Japanese NLP Library","<no title>"],objnames:{},filenames:["index","disqus_jnlp.html"]}) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # jProcessing documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Mar 7 20:02:01 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = [] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 
34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'jProcessing' 44 | copyright = u'2012, Pulkit Kathuria' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'pyramid' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | html_title = "Japanese Natural Language Processing" 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 
123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | html_use_index = False 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | html_show_sourcelink = False 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | html_show_sphinx = False 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'jProcessingdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | #'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | #'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 'jProcessing.tex', u'jProcessing Documentation', 187 | u'Pulkit Kathuria', 'manual'), 188 | ] 189 | 190 | # The name of an image file (relative to this directory) to place at the top of 191 | # the title page. 192 | #latex_logo = None 193 | 194 | # For "manual" documents, if this is true, then toplevel headings are parts, 195 | # not chapters. 196 | #latex_use_parts = False 197 | 198 | # If true, show page references after internal links. 199 | #latex_show_pagerefs = False 200 | 201 | # If true, show URL addresses after external links. 202 | #latex_show_urls = False 203 | 204 | # Documents to append as an appendix to all manuals. 205 | #latex_appendices = [] 206 | 207 | # If false, no module index is generated. 208 | #latex_domain_indices = True 209 | 210 | 211 | # -- Options for manual page output -------------------------------------------- 212 | 213 | # One entry per manual page. List of tuples 214 | # (source start file, name, description, authors, manual section). 215 | man_pages = [ 216 | ('index', 'jprocessing', u'jProcessing Documentation', 217 | [u'Pulkit Kathuria'], 1) 218 | ] 219 | 220 | # If true, show URL addresses after external links. 
221 | #man_show_urls = False 222 | 223 | 224 | # -- Options for Texinfo output ------------------------------------------------ 225 | 226 | # Grouping the document tree into Texinfo files. List of tuples 227 | # (source start file, target name, title, author, 228 | # dir menu entry, description, category) 229 | texinfo_documents = [ 230 | ('index', 'jProcessing', u'jProcessing Documentation', 231 | u'Pulkit Kathuria', 'jProcessing', 'One line description of project.', 232 | 'Miscellaneous'), 233 | ] 234 | 235 | # Documents to append as an appendix to all manuals. 236 | #texinfo_appendices = [] 237 | 238 | # If false, no module index is generated. 239 | #texinfo_domain_indices = True 240 | 241 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 242 | #texinfo_show_urls = 'footnote' 243 | -------------------------------------------------------------------------------- /docs/disqus_jnlp.html.rst: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 |
4 | 15 | 16 | blog comments powered by Disqus 17 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 | 4 | 5 | 6 | 7 | .. raw:: html 8 | 9 |
Back to Home 10 | 11 | ==================== 12 | Japanese NLP Library 13 | ==================== 14 | 15 | 16 | .. sectnum:: 17 | .. contents:: 18 | 19 | Requirements 20 | ============ 21 | 22 | - Third Party Dependencies 23 | 24 | - Cabocha Japanese Morphological parser http://sourceforge.net/projects/cabocha/ 25 | 26 | - Python Dependencies 27 | 28 | - ``Python 2.6.*`` or above 29 | 30 | 31 | ``Links`` 32 | --------- 33 | 34 | - All code at jProcessing Repo GitHub_ 35 | 36 | .. _GitHub: https://github.com/kevincobain2000/jProcessing 37 | 38 | - Documentation_ and HomePage_ and Sphinx_ 39 | 40 | .. _Documentation: http://www.jaist.ac.jp/~s1010205/jnlp 41 | 42 | .. _HomePage: http://www.jaist.ac.jp/~s1010205/ 43 | 44 | .. _Sphinx: http://readthedocs.org/docs/jprocessing/en/latest/ 45 | 46 | 47 | - PyPi_ Python Package 48 | 49 | .. _PyPi: http://pypi.python.org/pypi/jProcessing/0.1 50 | 51 | :: 52 | 53 | clone git@github.com:kevincobain2000/jProcessing.git 54 | 55 | 56 | ``Install`` 57 | ----------- 58 | 59 | In ``Terminal`` :: 60 | 61 | bash$ python setup.py install 62 | 63 | History 64 | ------- 65 | 66 | - ``0.2`` 67 | 68 | + Sentiment Analysis of Japanese Text 69 | 70 | - ``0.1`` 71 | + Morphologically Tokenize Japanese Sentence 72 | + Kanji / Hiragana / Katakana to Romaji Converter 73 | + Edict Dictionary Search - borrowed 74 | + Edict Examples Search - incomplete 75 | + Sentence Similarity between two JP Sentences 76 | + Run Cabocha(ISO--8859-1 configured) in Python. 77 | + Longest Common String between Sentences 78 | + Kanji to Katakana Pronunciation 79 | + Hiragana, Katakana Chart Parser 80 | 81 | Libraries and Modules 82 | ===================== 83 | 84 | Tokenize ``jTokenize.py`` 85 | ------------------------- 86 | In ``Python`` :: 87 | 88 | >>> from jNlp.jTokenize import jTokenize 89 | >>> input_sentence = u'私は彼を5日前、つまりこの前の金曜日に駅で見かけた' 90 | >>> list_of_tokens = jTokenize(input_sentence) 91 | >>> print list_of_tokens 92 | >>> print '--'.join(list_of_tokens).encode('utf-8') 93 | 94 | Returns: 95 | 96 | :: 97 | 98 | ... [u'\u79c1', u'\u306f', u'\u5f7c', u'\u3092', u'\uff15'...] 99 | ... 私--は--彼--を--5--日--前--、--つまり--この--前--の--金曜日--に--駅--で--見かけ--た 100 | 101 | Katakana Pronunciation: 102 | 103 | :: 104 | 105 | >>> print '--'.join(jReads(input_sentence)).encode('utf-8') 106 | ... ワタシ--ハ--カレ--ヲ--ゴ--ニチ--マエ--、--ツマリ--コノ--マエ--ノ--キンヨウビ--ニ--エキ--デ--ミカケ--タ 107 | 108 | 109 | Cabocha ``jCabocha.py`` 110 | ----------------------- 111 | 112 | Run Cabocha_ with original ``EUCJP`` or ``IS0-8859-1`` configured encoding, with ``utf8`` python 113 | 114 | .. _Cabocha: http://code.google.com/p/cabocha/ 115 | 116 | - If cabocha is configured as ``utf8`` then see this http://nltk.googlecode.com/svn/trunk/doc/book-jp/ch12.html#cabocha 117 | 118 | .. code-block:: python 119 | 120 | >>> from jNlp.jCabocha import cabocha 121 | >>> print cabocha(input_sentence).encode('utf-8') 122 | 123 | Output: 124 | 125 | .. code-block:: xml 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | Kanji / Katakana /Hiragana to Tokenized Romaji ``jConvert.py`` 146 | -------------------------------------------------------------- 147 | 148 | Uses ``data/katakanaChart.txt`` and parses the chart. See katakanaChart_. 149 | 150 | .. code-block:: python 151 | 152 | >>> from jNlp.jConvert import * 153 | >>> input_sentence = u'気象庁が21日午前4時48分、発表した天気概況によると、' 154 | >>> print ' '.join(tokenizedRomaji(input_sentence)) 155 | >>> print tokenizedRomaji(input_sentence) 156 | 157 | .. 
code-block:: python 158 | 159 | ...kisyoutyou ga ni ichi nichi gozen yon ji yon hachi hun hapyou si ta tenki gaikyou ni yoru to 160 | ...[u'kisyoutyou', u'ga', u'ni', u'ichi', u'nichi', u'gozen',...] 161 | 162 | 163 | **katakanaChart.txt** 164 | 165 | 166 | .. _katakanaChart: 167 | 168 | - katakanaChartFile_ and hiraganaChartFile_ 169 | 170 | .. _katakanaChartFile: https://raw.github.com/kevincobain2000/jProcessing/master/src/jNlp/data/katakanaChart.txt 171 | 172 | .. _hiraganaChartFile: https://raw.github.com/kevincobain2000/jProcessing/master/src/jNlp/data/hiraganaChart.txt 173 | 174 | 175 | Longest Common String Japanese ``jProcessing.py`` 176 | ------------------------------------------------- 177 | 178 | On English Strings :: 179 | 180 | >>> from jNlp.jProcessing import long_substr 181 | >>> a = 'Once upon a time in Italy' 182 | >>> b = 'There was a time in America' 183 | >>> print long_substr(a, b) 184 | 185 | Output :: 186 | 187 | ...a time in 188 | 189 | On Japanese Strings :: 190 | 191 | >>> a = u'これでアナタも冷え知らず' 192 | >>> b = u'これでア冷え知らずナタも' 193 | >>> print long_substr(a, b).encode('utf-8') 194 | 195 | Output :: 196 | 197 | ...冷え知らず 198 | 199 | Similarity between two sentences ``jProcessing.py`` 200 | --------------------------------------------------- 201 | Uses MinHash by checking the overlap; see http://en.wikipedia.org/wiki/MinHash 202 | 203 | :English Strings: 204 | 205 | >>> from jNlp.jProcessing import Similarities 206 | >>> s = Similarities() 207 | >>> a = 'There was' 208 | >>> b = 'There is' 209 | >>> print s.minhash(a,b) 210 | ...0.444444444444 211 | 212 | :Japanese Strings: 213 | 214 | >>> from jNlp.jProcessing import * 215 | >>> a = u'これは何ですか?' 216 | >>> b = u'これはわからないです' 217 | >>> print s.minhash(' '.join(jTokenize(a)), ' '.join(jTokenize(b))) 218 | ...0.210526315789 219 | 220 | Edict Japanese Dictionary Search with Example sentences 221 | ======================================================= 222 | 223 | Sample Output Demo 224 | ------------------ 225 | 226 | .. raw:: html 227 | 228 | 244 | 245 | 246 | 247 | Edict dictionary and example sentences parser. 248 | ---------------------------------------------- 249 | 250 | This package uses the EDICT_ and KANJIDIC_ dictionary files. 251 | These files are the property of the 252 | Electronic Dictionary Research and Development Group_ , and 253 | are used in conformance with the Group's licence_ . 254 | 255 | .. _EDICT: http://www.csse.monash.edu.au/~jwb/edict.html 256 | .. _KANJIDIC: http://www.csse.monash.edu.au/~jwb/kanjidic.html 257 | .. _Group: http://www.edrdg.org/ 258 | .. _licence: http://www.edrdg.org/edrdg/licence.html 259 | 260 | Edict parser by **Paul Goins**, see ``edict_search.py``. 261 | Edict example sentences search by query, by **Pulkit Kathuria**, see ``edict_examples.py``. 262 | Edict examples pickle files are provided, but the latest example files can be downloaded from the links provided. 263 | 264 | Charset 265 | ------- 266 | Two files are used: 267 | 268 | - ``utf8`` Charset example file if not using ``src/jNlp/data/edict_examples`` 269 | 270 | To convert ``EUCJP/ISO-8859-1`` to ``utf8`` :: 271 | 272 | iconv -f EUCJP -t UTF-8 path/to/edict_examples > path/to/save_with_utf-8 273 | 274 | - ``ISO-8859-1`` edict_dictionary file 275 | 276 | Outputs example sentences for a query in Japanese only for ambiguous words. 277 | 278 | 279 | Links 280 | ----- 281 | 282 | **Latest** Dictionary files can be downloaded here_ 283 | 284 | .. 
_here: http://www.csse.monash.edu.au/~jwb/edict.html 285 | 286 | ``edict_search.py`` 287 | ------------------- 288 | :author: Paul Goins (license included); original source: linkToOriginal_ 289 | 290 | .. _linkToOriginal: http://repo.or.cz/w/jbparse.git/blame/8e42831ca5f721c0320b27d7d83cb553d6e9c68f:/jbparse/edict.py 291 | 292 | To list all matching entries with their sense definitions: 293 | 294 | >>> from jNlp.edict_search import * 295 | >>> query = u'認める' 296 | >>> edict_path = 'src/jNlp/data/edict-yy-mm-dd' 297 | >>> kp = Parser(edict_path) 298 | >>> for i, entry in enumerate(kp.search(query)): 299 | ... print entry.to_string().encode('utf-8') 300 | 301 | 302 | ``edict_examples.py`` 303 | --------------------- 304 | :Note: Only outputs example sentences for ambiguous words (words with more than one sense) 305 | 306 | :author: Pulkit Kathuria 307 | 308 | >>> from jNlp.edict_examples import * 309 | >>> query = u'認める' 310 | >>> edict_path = 'src/jNlp/data/edict-yy-mm-dd' 311 | >>> edict_examples_path = 'src/jNlp/data/edict_examples' 312 | >>> search_with_example(edict_path, edict_examples_path, query) 313 | 314 | Output :: 315 | 316 | 認める 317 | 318 | Sense (1) to recognize; 319 | EX:01 我々は彼の才能を*認*めている。We appreciate his talent. 320 | 321 | Sense (2) to observe; 322 | EX:01 x線写真で異状が*認*められます。We have detected an abnormality on your x-ray. 323 | 324 | Sense (3) to admit; 325 | EX:01 母は私の計画をよいと*認*めた。Mother approved my plan. 326 | EX:02 母は決して私の結婚を*認*めないだろう。Mother will never approve of my marriage. 327 | EX:03 父は決して私の結婚を*認*めないだろう。Father will never approve of my marriage. 328 | EX:04 彼は女性の喫煙をいいものだと*認*めない。He doesn't approve of women smoking. 329 | ... 330 | 331 | Sentiment Analysis of Japanese Text 332 | ===================================== 333 | 334 | This section covers sentiment analysis of Japanese text through word sense mapping with Wordnet-jp_ (Japanese WordNet, file name ``wnjpn-all.tab``) and SentiWordnet_ (English SentiWordNet, file name ``SentiWordNet_3.*.txt``). 335 | 336 | .. _Wordnet-jp: http://nlpwww.nict.go.jp/wn-ja/eng/downloads.html 337 | .. _SentiWordnet: http://sentiwordnet.isti.cnr.it/ 338 | 339 | Wordnet files download links 340 | ---------------------------- 341 | 342 | 1. http://nlpwww.nict.go.jp/wn-ja/eng/downloads.html 343 | 2. http://sentiwordnet.isti.cnr.it/ 344 | 345 | How to Use 346 | ---------- 347 | 348 | The following classifier is a baseline: it uses a simple WordNet mapping between English and Japanese and classifies on polarity scores from SentiWordNet. 349 | 350 | - All parts of speech are included (adnouns, nouns, verbs, ...) 351 | - No WSD module is applied to the Japanese sentence 352 | - Each word's most common sense is used for its polarity score 353 | 354 | >>> from jNlp.jSentiments import * 355 | >>> jp_wn = '../../../../data/wnjpn-all.tab' 356 | >>> en_swn = '../../../../data/SentiWordNet_3.0.0_20100908.txt' 357 | >>> classifier = Sentiment() 358 | >>> classifier.train(en_swn, jp_wn) 359 | >>> text = u'監督、俳優、ストーリー、演出、全部最高!' 
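>>> # baseline(text) prints the text's positive and negative scores and an overall polarity judgement (see the output below)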
360 | >>> print classifier.baseline(text) 361 | ...Pos Score = 0.625 Neg Score = 0.125 362 | ...Text is Positive 363 | 364 | Japanese Word Polarity Score 365 | ---------------------------- 366 | 367 | >>> from jNlp.jSentiments import * 368 | >>> jp_wn = '_dicts/wnjpn-all.tab' #path to Japanese Word Net 369 | >>> en_swn = '_dicts/SentiWordNet_3.0.0_20100908.txt' #Path to SentiWordNet 370 | >>> classifier = Sentiment() 371 | >>> sentiwordnet, jpwordnet = classifier.train(en_swn, jp_wn) 372 | >>> positive_score = sentiwordnet[jpwordnet[u'全部']][0] 373 | >>> negative_score = sentiwordnet[jpwordnet[u'全部']][1] 374 | >>> print 'pos score = {0}, neg score = {1}'.format(positive_score, negative_score) 375 | ...pos score = 0.625, neg score = 0.0 376 | 377 | 378 | Contacts 379 | ======== 380 | 381 | :Author: `pulkit[at]jaist.ac.jp` [change ``at`` with ``@``] 382 | 383 | 384 | .. include:: disqus_jnlp.html.rst 385 | 386 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 
75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\jProcessing.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\jProcessing.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 
187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /scripts/vcabocha.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from jNlp.jCabocha import * 4 | from jNlp.jTokenize import * 5 | import argparse 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(add_help = True) 9 | parser = argparse.ArgumentParser(description= 'No description sepecified') 10 | parser.add_argument('-a', action="store", dest="action", type=unicode, help='-a [cabocha, tokenize, base, read, pos]') 11 | parser.add_argument('-s', action="store", dest="sentence", type=str, help='-s Sentence') 12 | myarguments = parser.parse_args() 13 | sent = unicode(myarguments.sentence,'utf-8') 14 | print myarguments.action 15 | if myarguments.action == "cabocha": 16 | print cabocha(sent).encode('utf-8') 17 | elif myarguments.action == "tokenize": 18 | print 'Tokenized' 19 | print '=========' 20 | print '\n'.join(jTokenize(sent)) 21 | elif myarguments.action: 22 | tokenized = jTokenize(sent) 23 | info = jInfo(sent, infotype=myarguments.action) 24 | mxlen = len(max(max(tokenized, key=len), max(info, key=len))) + 30 25 | print '{0:{mx}}{1:}'.format('Sent',myarguments.action, mx = mxlen) 26 | print '{0:{mx}}{1:}'.format('====','='*len(myarguments.action), mx = mxlen) 27 | 28 | for i, j in zip(tokenized, info): 29 | i = i.encode('utf-8') 30 | j = j.encode('utf-8') 31 | print '{0:{mx}}{1:<}'.format(i,j, mx = mxlen) 32 | else: 33 | print cabocha(sent).encode('utf-8') 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from setuptools import setup, find_packages 5 | def read(fname): 6 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 7 | setup( 8 | name = 'jProcessing', #First Level Dir 9 | version='0.1', 10 | author='KATHURIA Pulkit', 11 | author_email='pulkit@jaist.ac.jp', 12 | packages= find_packages('src'), 13 | scripts = ['scripts/vcabocha.py'], 14 | package_dir = {'':'src'}, 15 | package_data = {'': ['data/*'], 16 | }, 17 | include_package_data = True, 18 | exclude_package_data = {'': ['jNlp/*.p']}, 19 | url='http://www.jaist.ac.jp/~s1010205', 20 | license='LICENSE.txt', 21 | description='Japanese NLP Utilities', 22 | long_description=open('README').read(), 23 | classifiers=['Development Status :: 2 - Pre-Alpha','Natural Language :: Japanese', 24 | 'Topic :: Scientific/Engineering :: Artificial Intelligence'], 25 | 26 | ) 27 | 28 | """ 29 | File System 30 | =========== 31 | jNlp/ 32 | setup.py 33 | README 34 | LICENCE.txt 35 | scripts/ 36 | ... 37 | src/ 38 | jNlp/ 39 | __init__.py 40 | jCabocha.py #see foo.py to check how to access somefile.dat 41 | jTokenize.py 42 | jConvert.py 43 | jColor.py 44 | edict_search.py 45 | edict_examples.py 46 | jSentiments.py 47 | 48 | classifiers/ 49 | .. 50 | data/ 51 | katakanaChart.txt 52 | hiraganaChart.txt 53 | edict dictionary files *not included* 54 | jnlp/ 55 | *not with this package*#see MANIFEST.in 56 | ... 
57 | _dicts/ 58 | dict files *NA* 59 | """ 60 | -------------------------------------------------------------------------------- /src/jNlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/src/jNlp/__init__.py -------------------------------------------------------------------------------- /src/jNlp/aquisition/OpenSubtitles.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # This file is part of periscope. 4 | # 5 | # periscope is free software; you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation; either version 2 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # periscope is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with periscope; if not, write to the Free Software 17 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | 19 | import os, struct, xmlrpclib, commands, gzip, traceback, logging 20 | import socket # For timeout purposes 21 | 22 | import SubtitleDatabase 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | OS_LANGS ={ "en": "eng", 27 | "fr" : "fre", 28 | "hu": "hun", 29 | "cs": "cze", 30 | "pl" : "pol", 31 | "sk" : "slo", 32 | "pt" : "por", 33 | "pt-br" : "pob", 34 | "es" : "spa", 35 | "el" : "ell", 36 | "ar":"ara", 37 | 'sq':'alb', 38 | "hy":"arm", 39 | "ay":"ass", 40 | "bs":"bos", 41 | "bg":"bul", 42 | "ca":"cat", 43 | "zh":"chi", 44 | "hr":"hrv", 45 | "da":"dan", 46 | "nl":"dut", 47 | "eo":"epo", 48 | "et":"est", 49 | "fi":"fin", 50 | "gl":"glg", 51 | "ka":"geo", 52 | "de":"ger", 53 | "he":"heb", 54 | "hi":"hin", 55 | "is":"ice", 56 | "id":"ind", 57 | "it":"ita", 58 | "ja":"jpn", 59 | "kk":"kaz", 60 | "ko":"kor", 61 | "lv":"lav", 62 | "lt":"lit", 63 | "lb":"ltz", 64 | "mk":"mac", 65 | "ms":"may", 66 | "no":"nor", 67 | "oc":"oci", 68 | "fa":"per", 69 | "ro":"rum", 70 | "ru":"rus", 71 | "sr":"scc", 72 | "sl":"slv", 73 | "sv":"swe", 74 | "th":"tha", 75 | "tr":"tur", 76 | "uk":"ukr", 77 | "vi":"vie"} 78 | 79 | class OpenSubtitles(SubtitleDatabase.SubtitleDB): 80 | url = "http://www.opensubtitles.org/" 81 | site_name = "OpenSubtitles" 82 | 83 | def __init__(self):#, config, cache_folder_path): 84 | super(OpenSubtitles, self).__init__(OS_LANGS) 85 | self.server_url = 'http://api.opensubtitles.org/xml-rpc' 86 | self.revertlangs = dict(map(lambda item: (item[1],item[0]), self.langs.items())) 87 | 88 | def process(self, filepath, langs): 89 | ''' main method to call on the plugin, pass the filename and the wished 90 | languages and it will query OpenSubtitles.org ''' 91 | if os.path.isfile(filepath): 92 | filehash = self.hashFile(filepath) 93 | log.debug(filehash) 94 | size = os.path.getsize(filepath) 95 | fname = self.getFileName(filepath) 96 | return self.query(moviehash=filehash, langs=langs, bytesize=size, filename=fname) 97 | else: 98 | fname = self.getFileName(filepath) 99 | return self.query(langs=langs, filename=fname) 100 | 101 | def createFile(self, subtitle): 102 | '''pass the URL of the sub and the file it 
matches, will unzip it 103 | and return the path to the created file''' 104 | suburl = subtitle["link"] 105 | videofilename = subtitle["filename"] 106 | srtbasefilename = videofilename.rsplit(".", 1)[0] 107 | self.downloadFile(suburl, srtbasefilename + ".srt.gz") 108 | f = gzip.open(srtbasefilename+".srt.gz") 109 | dump = open(srtbasefilename+".srt", "wb") 110 | dump.write(f.read()) 111 | dump.close() 112 | f.close() 113 | os.remove(srtbasefilename+".srt.gz") 114 | return srtbasefilename+".srt" 115 | 116 | def query(self, filename, imdbID=None, moviehash=None, bytesize=None, langs=None): 117 | ''' Makes a query on opensubtitles and returns info about found subtitles. 118 | Note: if using moviehash, bytesize is required. ''' 119 | log.debug('query') 120 | #Prepare the search 121 | search = {} 122 | sublinks = [] 123 | if moviehash: search['moviehash'] = moviehash 124 | if imdbID: search['imdbid'] = imdbID 125 | if bytesize: search['moviebytesize'] = str(bytesize) 126 | if langs: search['sublanguageid'] = ",".join([self.getLanguage(lang) for lang in langs]) 127 | if len(search) == 0: 128 | log.debug("No search term, we'll use the filename") 129 | # Let's try to guess what to search: 130 | guessed_data = self.guessFileData(filename) 131 | search['query'] = guessed_data['name'] 132 | log.debug(search['query']) 133 | 134 | #Login 135 | self.server = xmlrpclib.Server(self.server_url) 136 | socket.setdefaulttimeout(10) 137 | try: 138 | log_result = self.server.LogIn("","","eng","periscope") 139 | log.debug(log_result) 140 | token = log_result["token"] 141 | except Exception: 142 | log.error("Open subtitles could not be contacted for login") 143 | token = None 144 | socket.setdefaulttimeout(None) 145 | return [] 146 | if not token: 147 | log.error("Open subtitles did not return a token after logging in.") 148 | return [] 149 | 150 | # Search 151 | self.filename = filename #Used to order the results 152 | sublinks += self.get_results(token, search) 153 | 154 | # Logout 155 | try: 156 | self.server.LogOut(token) 157 | except: 158 | log.error("Open subtitles could not be contacted for logout") 159 | socket.setdefaulttimeout(None) 160 | return sublinks 161 | 162 | 163 | def get_results(self, token, search): 164 | log.debug("query: token='%s', search='%s'" % (token, search)) 165 | try: 166 | if search: 167 | results = self.server.SearchSubtitles(token, [search]) 168 | except Exception, e: 169 | log.error("Could not query the server OpenSubtitles") 170 | log.debug(e) 171 | return [] 172 | log.debug("Result: %s" %str(results)) 173 | 174 | sublinks = [] 175 | if results['data']: 176 | log.debug(results['data']) 177 | # OpenSubtitles hash function is not robust ... 
We'll use the MovieReleaseName to help us select the best candidate 178 | for r in sorted(results['data'], self.sort_by_moviereleasename): 179 | # Only added if the MovieReleaseName matches the file 180 | result = {} 181 | result["release"] = r['SubFileName'] 182 | result["link"] = r['SubDownloadLink'] 183 | result["page"] = r['SubDownloadLink'] 184 | result["lang"] = self.getLG(r['SubLanguageID']) 185 | if search.has_key("query") : #We are using the guessed file name, let's remove some results 186 | if r["MovieReleaseName"].startswith(self.filename): 187 | sublinks.append(result) 188 | else: 189 | log.debug("Removing %s because release '%s' has not right start %s" %(result["release"], r["MovieReleaseName"], self.filename)) 190 | else : 191 | sublinks.append(result) 192 | return sublinks 193 | 194 | def sort_by_moviereleasename(self, x, y): 195 | ''' sorts based on the movierelease name tag. More matching, returns 1''' 196 | #TODO add also support for subtitles release 197 | xmatch = x['MovieReleaseName'] and (x['MovieReleaseName'].find(self.filename)>-1 or self.filename.find(x['MovieReleaseName'])>-1) 198 | ymatch = y['MovieReleaseName'] and (y['MovieReleaseName'].find(self.filename)>-1 or self.filename.find(y['MovieReleaseName'])>-1) 199 | #print "analyzing %s and %s = %s and %s" %(x['MovieReleaseName'], y['MovieReleaseName'], xmatch, ymatch) 200 | if xmatch and ymatch: 201 | if x['MovieReleaseName'] == self.filename or x['MovieReleaseName'].startswith(self.filename) : 202 | return -1 203 | return 0 204 | if not xmatch and not ymatch: 205 | return 0 206 | if xmatch and not ymatch: 207 | return -1 208 | if not xmatch and ymatch: 209 | return 1 210 | return 0 211 | 212 | if __name__ == "__main__": 213 | subs = OpenSubtitles() 214 | print subs.query('Titanic') 215 | -------------------------------------------------------------------------------- /src/jNlp/aquisition/SubtitleDatabase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # This file is part of periscope. 4 | # 5 | # periscope is free software; you can redistribute it and/or modify 6 | # it under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation; either version 2 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # periscope is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public License 16 | # along with periscope; if not, write to the Free Software 17 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | 19 | import os, shutil, urllib2, sys, logging, traceback, zipfile 20 | import struct 21 | import socket # For timeout purposes 22 | import re 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.3)' 27 | 28 | class SubtitleDB(object): 29 | ''' Base (kind of abstract) class that represent a SubtitleDB, usually a website. 
Should be rewritten using abc module in Python 2.6/3K''' 30 | def __init__(self, langs, revertlangs = None): 31 | if langs: 32 | self.langs = langs 33 | self.revertlangs = dict(map(lambda item: (item[1],item[0]), self.langs.items())) 34 | if revertlangs: 35 | self.revertlangs = revertlangs 36 | self.langs = dict(map(lambda item: (item[1],item[0]), self.revertlangs.items())) 37 | self.tvshowRegex = re.compile('(?P.*)S(?P[0-9]{2})E(?P[0-9]{2}).(?P.*)', re.IGNORECASE) 38 | self.tvshowRegex2 = re.compile('(?P.*).(?P[0-9]{1,2})x(?P[0-9]{1,2}).(?P.*)', re.IGNORECASE) 39 | self.movieRegex = re.compile('(?P.*)[\.|\[|\(| ]{1}(?P(?:(?:19|20)[0-9]{2}))(?P.*)', re.IGNORECASE) 40 | 41 | def searchInThread(self, queue, filename, langs): 42 | ''' search subtitles with the given filename for the given languages''' 43 | try: 44 | subs = self.process(filename, langs) 45 | map(lambda item: item.setdefault("plugin", self), subs) 46 | map(lambda item: item.setdefault("filename", filename), subs) 47 | log.info("%s writing %s items to queue" % (self.__class__.__name__, len(subs))) 48 | except: 49 | log.exception("Error occured") 50 | subs = [] 51 | queue.put(subs, True) # Each plugin must write as the caller periscopy.py waits for an result on the queue 52 | 53 | def process(self, filepath, langs): 54 | ''' main method to call on the plugin, pass the filename and the wished 55 | languages and it will query the subtitles source ''' 56 | fname = self.getFileName(filepath) 57 | try: 58 | return self.query(fname, langs) 59 | except Exception, e: 60 | log.exception("Error occured") 61 | return [] 62 | 63 | def createFile(self, subtitle): 64 | '''pass the URL of the sub and the file it matches, will unzip it 65 | and return the path to the created file''' 66 | suburl = subtitle["link"] 67 | videofilename = subtitle["filename"] 68 | srtbasefilename = videofilename.rsplit(".", 1)[0] 69 | zipfilename = srtbasefilename +".zip" 70 | self.downloadFile(suburl, zipfilename) 71 | 72 | if zipfile.is_zipfile(zipfilename): 73 | log.debug("Unzipping file " + zipfilename) 74 | zf = zipfile.ZipFile(zipfilename, "r") 75 | for el in zf.infolist(): 76 | if el.orig_filename.rsplit(".", 1)[1] in ("srt", "sub", "txt"): 77 | outfile = open(srtbasefilename + "." + el.orig_filename.rsplit(".", 1)[1], "wb") 78 | outfile.write(zf.read(el.orig_filename)) 79 | outfile.flush() 80 | outfile.close() 81 | else: 82 | log.info("File %s does not seem to be valid " %el.orig_filename) 83 | # Deleting the zip file 84 | zf.close() 85 | os.remove(zipfilename) 86 | return srtbasefilename + ".srt" 87 | else: 88 | log.info("Unexpected file type (not zip)") 89 | os.remove(zipfilename) 90 | return None 91 | 92 | def downloadContent(self, url, timeout = None): 93 | ''' Downloads the given url and returns its contents.''' 94 | try: 95 | log.debug("Downloading %s" % url) 96 | req = urllib2.Request(url, headers={'Referer' : url, 'User-Agent' : USER_AGENT}) 97 | if timeout: 98 | socket.setdefaulttimeout(timeout) 99 | f = urllib2.urlopen(req) 100 | content = f.read() 101 | f.close() 102 | return content 103 | except urllib2.HTTPError, e: 104 | log.warning("HTTP Error: %s - %s" % (e.code, url)) 105 | except urllib2.URLError, e: 106 | log.warning("URL Error: %s - %s" % (e.reason, url)) 107 | 108 | def downloadFile(self, url, filename): 109 | ''' Downloads the given url to the given filename ''' 110 | content = self.downloadContent(url) 111 | dump = open(filename, "wb") 112 | dump.write(content) 113 | dump.close() 114 | log.debug("Download finished to file %s. 
Size : %s"%(filename,os.path.getsize(filename))) 115 | 116 | def getLG(self, language): 117 | ''' Returns the short (two-character) representation of the long language name''' 118 | try: 119 | return self.revertlangs[language] 120 | except KeyError, e: 121 | log.warn("Ooops, you found a missing language in the config file of %s: %s. Send a bug report to have it added." %(self.__class__.__name__, language)) 122 | 123 | def getLanguage(self, lg): 124 | ''' Returns the long naming of the language on a two character code ''' 125 | try: 126 | return self.langs[lg] 127 | except KeyError, e: 128 | log.warn("Ooops, you found a missing language in the config file of %s: %s. Send a bug report to have it added." %(self.__class__.__name__, lg)) 129 | 130 | def query(self, token): 131 | raise TypeError("%s has not implemented method '%s'" %(self.__class__.__name__, sys._getframe().f_code.co_name)) 132 | 133 | def fileExtension(self, filename): 134 | ''' Returns the file extension (without the dot)''' 135 | return os.path.splitext(filename)[1][1:].lower() 136 | 137 | def getFileName(self, filepath): 138 | if os.path.isfile(filepath): 139 | filename = os.path.basename(filepath) 140 | else: 141 | filename = filepath 142 | if filename.endswith(('.avi', '.wmv', '.mov', '.mp4', '.mpeg', '.mpg', '.mkv')): 143 | fname = filename.rsplit('.', 1)[0] 144 | else: 145 | fname = filename 146 | return fname 147 | 148 | def guessFileData(self, filename): 149 | filename = unicode(self.getFileName(filename).lower()) 150 | matches_tvshow = self.tvshowRegex.match(filename) 151 | if matches_tvshow: # It looks like a tv show 152 | (tvshow, season, episode, teams) = matches_tvshow.groups() 153 | tvshow = tvshow.replace(".", " ").strip() 154 | teams = teams.split('.') 155 | return {'type' : 'tvshow', 'name' : tvshow.strip(), 'season' : int(season), 'episode' : int(episode), 'teams' : teams} 156 | else: 157 | matches_tvshow = self.tvshowRegex2.match(filename) 158 | if matches_tvshow: 159 | (tvshow, season, episode, teams) = matches_tvshow.groups() 160 | tvshow = tvshow.replace(".", " ").strip() 161 | teams = teams.split('.') 162 | return {'type' : 'tvshow', 'name' : tvshow.strip(), 'season' : int(season), 'episode' : int(episode), 'teams' : teams} 163 | else: 164 | matches_movie = self.movieRegex.match(filename) 165 | if matches_movie: 166 | (movie, year, teams) = matches_movie.groups() 167 | teams = teams.split('.') 168 | part = None 169 | if "cd1" in teams : 170 | teams.remove('cd1') 171 | part = 1 172 | if "cd2" in teams : 173 | teams.remove('cd2') 174 | part = 2 175 | return {'type' : 'movie', 'name' : movie.strip(), 'year' : year, 'teams' : teams, 'part' : part} 176 | else: 177 | return {'type' : 'unknown', 'name' : filename, 'teams' : [] } 178 | 179 | def hashFile(self, name): 180 | ''' 181 | Calculates the Hash à-la Media Player Classic as it is the hash used by OpenSubtitles. 182 | By the way, this is not a very robust hash code. 
183 | ''' 184 | longlongformat = 'Q' # unsigned long long little endian 185 | bytesize = struct.calcsize(longlongformat) 186 | format= "<%d%s" % (65536//bytesize, longlongformat) 187 | 188 | f = open(name, "rb") 189 | filesize = os.fstat(f.fileno()).st_size 190 | hash = filesize 191 | 192 | if filesize < 65536 * 2: 193 | log.error('File is too small') 194 | return "SizeError" 195 | 196 | buffer= f.read(65536) 197 | longlongs= struct.unpack(format, buffer) 198 | hash+= sum(longlongs) 199 | 200 | f.seek(-65536, os.SEEK_END) # size is always > 131072 201 | buffer= f.read(65536) 202 | longlongs= struct.unpack(format, buffer) 203 | hash+= sum(longlongs) 204 | hash&= 0xFFFFFFFFFFFFFFFF 205 | 206 | f.close() 207 | returnedhash = "%016x" % hash 208 | return returnedhash 209 | 210 | 211 | class InvalidFileException(Exception): 212 | ''' Exception object to be raised when the file is invalid''' 213 | def __init__(self, filename, reason): 214 | self.filename = filename 215 | self.reason = reason 216 | def __str__(self): 217 | return (repr(filename), repr(reason)) 218 | -------------------------------------------------------------------------------- /src/jNlp/aquisition/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/src/jNlp/aquisition/__init__.py -------------------------------------------------------------------------------- /src/jNlp/aquisition/aquire.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from OpenSubtitles import * 4 | 5 | def get_movie_names(directory): 6 | movienames = [] 7 | for line in open('movies.txt').readlines(): 8 | if not line.strip():continue 9 | movienames.append(line.strip()) 10 | return movienames 11 | 12 | if __name__ == '__main__': 13 | opensubs = OpenSubtitles() 14 | out = open('download_subs.xml','wb') 15 | for moviename in get_movie_names('movies.txt'): 16 | avail_en = '' 17 | avail_jp = '' 18 | try: 19 | all_langs = opensubs.query(moviename) 20 | for info_dic in all_langs: 21 | if info_dic['lang'] == 'en': 22 | avail_en = 'en' 23 | down_en = info_dic['link'] 24 | if info_dic['lang'] == 'ja': 25 | avail_jp = 'jp' 26 | down_jp = info_dic['link'] 27 | 28 | if avail_en and avail_jp: 29 | print moviename 30 | output = ""%(moviename, down_en, down_jp) 31 | out.write(output) 32 | out.write('\n') 33 | except: pass 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/jNlp/aquisition/download_subs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /src/jNlp/callunix.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import subprocess 4 | from subprocess import call 5 | def _checklist(argument): 6 | if not type(argument) is list: 7 | return argument.split() 8 | else: return argument 9 | 10 | def shell_out(command): 11 | command = _checklist(command) 12 | process = subprocess.Popen(command, stdout=subprocess.PIPE) 13 | return process.communicate()[0] 14 | 15 | def shell_call(command): 16 | command = _checklist(command) 17 | subprocess.Popen(command, stdout=subprocess.PIPE) 18 | return '' 19 | 20 | 21 | -------------------------------------------------------------------------------- /src/jNlp/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/src/jNlp/data/__init__.py -------------------------------------------------------------------------------- /src/jNlp/data/chasen_pos.txt: -------------------------------------------------------------------------------- 1 | 0 0 BOS/EOS 2 | 1 1 名詞 3 | 1 2 名詞-一般 4 | 1 3 名詞-固有名詞 5 | 1 4 名詞-固有名詞-一般 6 | 1 5 名詞-固有名詞-人名 7 | 1 6 名詞-固有名詞-人名-一般 8 | 1 7 名詞-固有名詞-人名-姓 9 | 1 8 名詞-固有名詞-人名-名 10 | 1 9 名詞-固有名詞-組織 11 | 1 10 名詞-固有名詞-地域 12 | 1 11 名詞-固有名詞-地域-一般 13 | 1 12 名詞-固有名詞-地域-国 14 | 1 13 名詞-代名詞 15 | 1 14 名詞-代名詞-一般 16 | 1 15 名詞-代名詞-縮約 17 | 1 16 名詞-副詞可能 18 | 1 17 名詞-サ変接続 19 | 1 18 名詞-形容動詞語幹 20 | 0 19 名詞-数 21 | 0 20 名詞-非自立 22 | 0 21 名詞-非自立-一般 23 | 0 22 名詞-非自立-副詞可能 24 | 0 23 名詞-非自立-助動詞語幹 25 | 0 24 名詞-非自立-形容動詞語幹 26 | 0 25 名詞-特殊 27 | 0 26 名詞-特殊-助動詞語幹 28 | 1 27 名詞-接尾 29 | 1 28 名詞-接尾-一般 30 | 1 29 名詞-接尾-人名 31 | 1 30 名詞-接尾-地域 32 | 1 31 名詞-接尾-サ変接続 33 | 0 32 名詞-接尾-助動詞語幹 34 | 1 33 名詞-接尾-形容動詞語幹 35 | 1 34 名詞-接尾-副詞可能 36 | 1 35 名詞-接尾-助数詞 37 | 0 36 名詞-接尾-特殊 38 | 0 37 名詞-接続詞的 39 | 0 38 名詞-動詞非自立的 40 | 0 39 名詞-引用文字列 41 | 1 40 名詞-ナイ形容詞語幹 42 | 0 41 接頭詞 43 | 0 42 接頭詞-名詞接続 44 | 0 43 接頭詞-動詞接続 45 | 0 44 接頭詞-形容詞接続 46 | 0 45 接頭詞-数接続 47 | 1 46 動詞 48 | 1 47 動詞-自立 49 | 0 48 動詞-非自立 50 | 0 49 動詞-接尾 51 | 1 50 形容詞 52 | 1 51 形容詞-自立 53 | 0 52 形容詞-非自立 54 | 1 53 形容詞-接尾 55 | 1 54 副詞 56 | 1 55 副詞-一般 57 | 1 56 副詞-助詞類接続 58 | 0 57 連体詞 59 | 0 58 接続詞 60 | 0 59 助詞 61 | 0 60 助詞-格助詞 62 | 0 61 助詞-格助詞-一般 63 | 0 62 助詞-格助詞-引用 64 | 0 63 助詞-格助詞-連語 65 | 0 64 助詞-接続助詞 66 | 0 65 助詞-係助詞 67 | 0 66 助詞-副助詞 68 | 0 67 助詞-間投助詞 69 | 0 68 助詞-並立助詞 70 | 0 69 助詞-終助詞 71 | 0 70 助詞-副助詞/並立助詞/終助詞 72 | 0 71 助詞-連体化 73 | 0 72 助詞-副詞化 74 | 0 73 助詞-特殊 75 | 0 74 助動詞 76 | 0 75 感動詞 77 | 0 76 記号 78 | 0 77 記号-一般 79 | 0 78 記号-句点 80 | 0 79 記号-読点 81 | 0 80 記号-空白 82 | 0 81 記号-アルファベット 83 | 0 82 記号-括弧開 84 | 0 83 記号-括弧閉 85 | 0 84 その他 86 | 0 85 その他-間投 87 | 0 86 フィラー 88 | 0 87 非言語音 89 | 0 88 語断片 90 | -------------------------------------------------------------------------------- /src/jNlp/data/hiraganaChart.txt: -------------------------------------------------------------------------------- 1 | a i u e o ya yu yo n 2 | X あ い う え お X X X ん 3 | k か き く け こ きゃ きゅ きょ X 4 | a さ し す せ そ しゃ しゅ しょ X 5 | t た X X て と ちゃ ちゅ ちょ X 6 | n な に ぬ ね の にゃ にゅ にょ X 7 | h は ひ ふ へ ほ ひゃ ひゅ ひょ X 8 | m ま み む め も みゃ みゅ みょ X 9 | y や X ゆ X よ X X X X 10 | r ら り る れ ろ りゃ りゅ りょ X 11 | w わ ゐ X ゑ を X X X X 12 | g が ぎ ぐ げ ご ぎゃ ぎゅ ぎょ X 13 | z ざ じ ず ぜ ぞ じゃ じゅ じょ X 14 | d だ ぢ づ で ど ぢゃ ぢゅ ぢょ X 15 | b ば び ぶ べ ぼ びゃ びゅ びょ X 16 | p ぱ ぴ ぷ ぺ ぽ ぴゃ ぴゅ ぴょ X 17 | ch X ち X X X X X X X 18 | ts X X つ X X X X X X 19 | -------------------------------------------------------------------------------- /src/jNlp/data/katakanaChart.txt: -------------------------------------------------------------------------------- 1 | a i u e o ya 
yu yo n 2 | X ア イ ウ エ オ X X X ン 3 | k カ キ ク ケ コ キャ キュ キョ X 4 | s サ シ ス セ ソ シャ シュ ショ X 5 | t タ X X テ ト チャ チュ チョ X 6 | n ナ ニ ヌ ネ ノ ニャ ニュ ニョ X 7 | h ハ ヒ フ ヘ ホ ヒャ ヒュ ヒョ X 8 | m マ ミ ム メ モ ミャ ミュ ミョ X 9 | y ヤ X ユ X ヨ X X X X 10 | r ラ リ ル レ ロ リャ リュ リョ X 11 | w ワ ヰ X ヱ ヲ X X X X 12 | g ガ ギ グ ゲ ゴ ギャ ギュ ギョ X 13 | z ザ X ズ ゼ ゾ X X X X 14 | d ダ ヂ ヅ デ ド ヂャ ヂュ ヂョ X 15 | b バ ビ ブ ベ ボ ビャ ビュ ビョ X 16 | p パ ピ プ ペ ポ ピャ ピュ ピョ X 17 | ch X チ X X X X X X X 18 | ts X X ツ X X X X X X 19 | j X ジ X X X ジャ ジュ ジョ X 20 | -------------------------------------------------------------------------------- /src/jNlp/eProcessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from pkg_resources import resource_stream 4 | import sys, os, subprocess 5 | from subprocess import call 6 | import xml.etree.cElementTree as etree 7 | 8 | import nltk 9 | from nltk.stem.wordnet import WordNetLemmatizer 10 | 11 | 12 | 13 | if __name__ == '__main__': 14 | pass 15 | -------------------------------------------------------------------------------- /src/jNlp/edict_search_monash/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/src/jNlp/edict_search_monash/__init__.py -------------------------------------------------------------------------------- /src/jNlp/edict_search_monash/edict_examples.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kevincobain2000/jProcessing/5ea303cc4bf6e8aaa4a3c5f9d023368191919f75/src/jNlp/edict_search_monash/edict_examples.p -------------------------------------------------------------------------------- /src/jNlp/edict_search_monash/edict_examples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This package uses the EDICT_ and KANJIDIC_ dictionary files. 5 | These files are the property of the 6 | Electronic Dictionary Research and Development Group_ , and 7 | are used in conformance with the Group's licence_ . 8 | 9 | .. _EDICT: http://www.csse.monash.edu.au/~jwb/edict.html 10 | .. _KANJIDIC: http://www.csse.monash.edu.au/~jwb/kanjidic.html 11 | .. _Group: http://www.edrdg.org/ 12 | .. _licence: http://www.edrdg.org/edrdg/licence.html 13 | .. 14 | """ 15 | # Copyright (c) 2011, Pulkit Kathuria 16 | # All rights reserved. 17 | # 18 | # Redistribution and use in source and binary forms, with or without 19 | # modification, are permitted provided that the following conditions 20 | # are met: 21 | # 22 | # * Redistributions of source code must retain the above copyright 23 | # notice, this list of conditions and the following disclaimer. 24 | # * Redistributions in binary form must reproduce the above 25 | # copyright notice, this list of conditions and the following 26 | # disclaimer in the documentation and/or other materials provided 27 | # with the distribution. 28 | # 29 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 30 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 31 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 32 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 33 | # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 34 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 35 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 36 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 37 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 38 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 39 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 40 | # POSSIBILITY OF SUCH DAMAGE. 41 | 42 | """ 43 | Edict Parser By **Paul Goins**, see ``edict_search.py`` 44 | Edict Example sentences, by search query, **Pulkit Kathuria** 45 | Edict examples pickle files are provided but latest example files 46 | can be downloaded from the links provided. 47 | 48 | Charset: 49 | 50 | - utf-8 charset example file 51 | - ISO-8859-1 edict_dictionary file 52 | 53 | Outputs example sentences for a query in Japanese only for ambiguous words. 54 | """ 55 | 56 | import re, os, subprocess 57 | from jNlp.edict_search_monash.edict_search import Parser 58 | import cPickle as pickle 59 | 60 | 61 | def word_and_id(BSent): 62 | results = [] 63 | for item in BSent.split(): 64 | brackets = re.compile('\[.*?\]') 65 | flter = re.sub('\(.*?\)','',item) 66 | word = re.split('\[|\]', re.sub('\{.*?\}','',flter))[0] 67 | try: s_id = re.split('\[|\]', re.sub('\{.*?\}','',flter))[1] 68 | except: pass 69 | if re.search(brackets, flter): 70 | results.append((word, s_id)) 71 | return results 72 | 73 | def parse_examples(edict_examples_file): 74 | """ 75 | Edict examples format 76 | --------------------- 77 | :: 78 | 79 | A: 誰にでも長所と.. Everyone has....points.#ID=276471_4870 80 | B: 才[01]{歳} 以上[01] 生きる (こと){こと} は 決して .. 81 | 82 | ambiguous_words: @type = dictionary 83 | format: Kanji ==> id ==> [examples_sent_id, ..] 84 | 才 ==> 01 ==> [#ID=276471_4870, ...] 85 | call: 86 | >>> ambiguous_words[kanji][01] 87 | ...[#ID=276471_4870, ...] 88 | 89 | edict_examples: @type = dictionary 90 | format: 91 | ID ==> u'example_sentence' 92 | #ID=276471_4870 ==> u'誰にでも長所と.. 
Everyone has....points' 93 | 94 | """ 95 | ambiguous_words = {} 96 | edict_examples = {} 97 | for line in edict_examples_file.readlines(): 98 | line = unicode(line,'utf-8') 99 | if line.startswith('A:'): 100 | eg_sent = line.split('#ID=')[0] 101 | eg_sent_id = line.split('#ID=')[1] 102 | edict_examples[eg_sent_id] = eg_sent 103 | continue 104 | for item in word_and_id(line): 105 | word = item[0] 106 | s_id = int(item[1]) 107 | if not ambiguous_words.has_key(word): ambiguous_words[word] = {} 108 | if not ambiguous_words[word].has_key(s_id): ambiguous_words[word][s_id] = [] 109 | ambiguous_words[word][s_id].append(eg_sent_id) 110 | return ambiguous_words, edict_examples 111 | 112 | def edict_entry(edict_file_path, query): 113 | kp = Parser(edict_file_path) 114 | for entry in kp.search(query): 115 | if entry.to_string().split()[0] == query: 116 | entry = entry.to_string() 117 | glosses = re.findall('\(\d\).*?;',entry) 118 | s_ids = [int(re.search('\d',gloss).group(0)) for gloss in glosses] 119 | return s_ids, glosses 120 | return [],[] 121 | 122 | def check_pickles(edict_examples_path): 123 | f = open(edict_examples_path) 124 | __checkpickles__ = ['edict_examples.p','ambiguous_words.p'] 125 | for pickl in __checkpickles__: 126 | if not os.path.exists(pickl): 127 | ambiguous_words, edict_examples = parse_examples(f) 128 | pickle.dump(ambiguous_words, open("ambiguous_words.p",'wb')) 129 | pickle.dump(edict_examples, open("edict_examples.p",'wb')) 130 | else: 131 | ambiguous_words = pickle.load(open('ambiguous_words.p')) 132 | edict_examples = pickle.load(open('edict_examples.p')) 133 | return ambiguous_words, edict_examples 134 | 135 | def search_with_example(edict_path, edict_examples_path, query): 136 | ambiguous_words, edict_examples = check_pickles(edict_examples_path) 137 | s_ids, glosses = edict_entry(edict_path, query) 138 | print query.encode('utf-8') 139 | for s_id, gloss in enumerate(glosses): 140 | print 141 | print 'Sense', gloss 142 | if ambiguous_words.has_key(query) and ambiguous_words[query].has_key(s_ids[s_id]): 143 | for ex_num, ex_id in enumerate(ambiguous_words[query][s_ids[s_id]], 1): 144 | ex_sentence = edict_examples[ex_id].replace(query[0], '*'+query[0]+'*') 145 | print '\t', ex_sentence.replace('A:','EX:'+str(ex_num).zfill(2)).encode('utf-8') 146 | 147 | def _mime(f_path): 148 | command = ['file','--mime',f_path] 149 | process = subprocess.Popen(command, stdout=subprocess.PIPE) 150 | charset = process.communicate()[0].split('charset=')[1] 151 | return charset.strip() 152 | 153 | def _encoding_check(edict_path, edict_examples_path): 154 | if _mime(edict_path) <> 'iso-8859-1' or _mime(edict_examples_path) <>'utf-8': 155 | print _mime(edict_path) 156 | print 'examples file must utf-8 encoded' 157 | print 'edict dictionary must be iso-8859-1 encoded' 158 | print 'man iconv' 159 | return True 160 | 161 | if __name__ == '__main__': 162 | query = u'水' 163 | edict_path = '../_dicts/edict-2011-08-30' 164 | edict_examples_path = '../_dicts/edict_examples' 165 | search_with_example(edict_path, edict_examples_path, query) 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /src/jNlp/edict_search_monash/edict_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2009, Paul Goins 5 | # All rights reserved. 
6 | # 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions 9 | # are met: 10 | # 11 | # * Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions and the following disclaimer. 13 | # * Redistributions in binary form must reproduce the above 14 | # copyright notice, this list of conditions and the following 15 | # disclaimer in the documentation and/or other materials provided 16 | # with the distribution. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22 | # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 24 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 28 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | # POSSIBILITY OF SUCH DAMAGE. 30 | 31 | """A parser for EDICT. 32 | 33 | This version is intended to be a more-or-less complete EDICT parser, 34 | with the exception of not doing special parsing for loan word tags. 35 | If you require special handling for those, then you probably ought to 36 | be using JMdict instead. 37 | 38 | """ 39 | 40 | import sys, os, re, gzip, gettext 41 | gettext.install('pyjben', unicode=True) 42 | 43 | 44 | # Below follows the information codes sorted more-or-less as they are 45 | # on http://www.csse.monash.edu.au/~jwb/edict_doc.html, however more 46 | # up to date. These sets are accurate as of 2009-Jul-17. 
47 | 48 | # Part of speech codes 49 | valid_pos_codes = set(( 50 | "adj-i", "adj-na", "adj-no", "adj-pn", "adj-t", "adj-f", "adj", 51 | "adv", "adv-to", "aux", "aux-v", "aux-adj", "conj", "ctr", "exp", 52 | "int", "iv", "n", "n-adv", "n-suf", "n-pref", "n-t", "num", "pn", 53 | "pref", "prt", "suf", "v1", "v2a-s", "v4h", "v4r", "v5", "v5aru", 54 | "v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r", "v5r-i", "v5s", 55 | "v5t", "v5u", "v5u-s", "v5uru", "v5z", "vz", "vi", "vk", "vn", 56 | "vr", "vs", "vs-s", "vs-i", "vt", 57 | )) 58 | 59 | # Field of application codes 60 | valid_foa_codes = set(( 61 | "Buddh", "MA", "comp", "food", "geom", "ling", "math", "mil", 62 | "physics", "chem" 63 | )) 64 | 65 | # Miscellaneous marking codes 66 | valid_misc_codes = set(( 67 | "X", "abbr", "arch", "ateji", "chn", "col", "derog", "eK", "ek", 68 | "fam", "fem", "gikun", "hon", "hum", "iK", "id", "ik", "io", 69 | "m-sl", "male", "male-sl", "oK", "obs", "obsc", "ok", "on-mim", 70 | "poet", "pol", "rare", "sens", "sl", "uK", "uk", "vulg" 71 | )) 72 | 73 | # Dialect codes 74 | valid_dialect_codes = set(( 75 | "kyb", "osb", "ksb", "ktb", "tsb", "thb", "tsug", "kyu", "rkb", 76 | "nab" 77 | )) 78 | 79 | # Grab all ()'s before a gloss 80 | all_paren_match = re.compile("^(\([^)]*\)[ ]*)+") 81 | # Grab the first () data entry, with group(1) set to the contents 82 | paren_match = re.compile(u"^[ ]*\(([^)]+)\)[ ]*") 83 | 84 | def info_field_valid(i_field): 85 | """Returns whether a given info code is valid.""" 86 | 87 | # Validity is a sticky issue since there's so many fields: 88 | # 89 | # - Sense markers (1, 2, 3, ...) 90 | # - Part of speech markers (n, adv, v5r) 91 | # - Field of application markers (comp, math, mil) 92 | # - Miscellaneous meanings (X, abbr, arch, ateji, ..........) 93 | # - Word priority (P) 94 | # ? Okurigana variants (Maybe this is JMdict only?) 95 | # - Loan words, a.k.a. Gairaigo 96 | # - Regional Japanese words (Kansai-ben, etc.) 97 | # 98 | # Thankfully, this function should be reusable in the edict2 parser... 99 | 100 | if i_field in valid_pos_codes: return True 101 | if i_field == "P": return True 102 | if i_field in valid_misc_codes: return True 103 | if i_field in valid_foa_codes: return True 104 | if i_field[:-1] in valid_dialect_codes: return True 105 | # Check for (1), (2), etc. 106 | try: 107 | i = int(i_field) 108 | return True 109 | except: 110 | return False 111 | 112 | class EdictEntry(object): 113 | 114 | def __init__(self, raw_entry, quick_parsing=True): 115 | 116 | # Japanese - note, if only a kana reading is present, it's 117 | # stored as "japanese", and furigana is left as None. 118 | self.japanese = None 119 | self.furigana = None 120 | # Native language glosses 121 | self.glosses = [] 122 | # Info fields should be inserted here as "tags". 123 | self.tags = set() 124 | # Currently unhandled stuff goes here... 125 | self.unparsed = [] 126 | 127 | # Most people don't need ultra-fancy parsing and can happily 128 | # take glosses with keywords stuck in them. In this case, 129 | # they can save processing time by using parse_entry_quick. 130 | # However, this will mean that "J-Ben"-style entry sorting may 131 | # not work exactly as expected because of tags being appended 132 | # to the beginning or end. 133 | 134 | # Note: Even with full parsing, due to a few entries with tags 135 | # at the end of their glosses, there's a few entries which will not 136 | # successfully match on an "ends with" search. 137 | 138 | # ENABLE THIS once parse_entry_quick is implemented. 
139 | if quick_parsing: 140 | self.parse_entry_quick(raw_entry) 141 | else: 142 | self.parse_entry(raw_entry) 143 | 144 | def parse_entry(self, raw_entry): 145 | if not raw_entry: 146 | return None 147 | 148 | jdata, ndata = raw_entry.split(u'/', 1) 149 | 150 | # Get Japanese 151 | pieces = jdata.split(u'[', 1) 152 | self.japanese = pieces[0].strip() 153 | if len(pieces) > 1: 154 | # Store furigana without '[]' 155 | self.furigana = pieces[1].strip()[:-1] 156 | 157 | #if self.furigana: 158 | # print "JAPANESE: %s, FURIGANA: %s" % (self.japanese, self.furigana) 159 | #else: 160 | # print "JAPANESE: %s" % self.japanese 161 | 162 | # Get native language data 163 | glosses = ndata.split(u'/') 164 | for gloss in glosses: 165 | # For each gloss, we need to check for ()'s at the beginning. 166 | # Multiple such ()'s may be present. 167 | # The actual gloss does not begin until the last set (or 168 | # an unhandled one) is encountered. 169 | 170 | if not gloss: continue 171 | #print "Unparsed gloss: [%s]" % gloss 172 | 173 | info = None 174 | m = all_paren_match.match(gloss) 175 | if m: 176 | info = m.group(0) 177 | if info: 178 | gloss_start = m.span()[1] 179 | gloss = gloss[gloss_start:] 180 | #print "Info field captured: [%s]" % info 181 | 182 | while info: 183 | m = paren_match.match(info) 184 | #if not m: break # Shouldn't ever happen... 185 | i_field = m.group(1) 186 | #print "INFO FIELD FOUND:", i_field 187 | i_fields = i_field.split(u',') 188 | 189 | # Check that all i_fields are valid 190 | bools = map(info_field_valid, i_fields) 191 | ok = reduce(lambda x, y: x and y, bools) 192 | 193 | if not ok: 194 | #print "INVALID INFO FIELD FOUND, REVERTING" 195 | #print "INFO WAS %s, GLOSS WAS %s" % (info, gloss) 196 | print info 197 | gloss = info + gloss 198 | #print "RESTORED GLOSS:", gloss 199 | break 200 | 201 | for tag in i_fields: 202 | self.tags.add(tag.rstrip(':')) # Handles "ksb:" 203 | # and other 204 | # dialect codes 205 | #print "INFO FIELD FOUND:", i 206 | next_i = m.span()[1] 207 | info = info[next_i:] 208 | 209 | #print "APPENDING GLOSS:", gloss 210 | self.glosses.append(gloss) 211 | 212 | def parse_entry_quick(self, raw_entry): 213 | if not raw_entry: 214 | return None 215 | 216 | jdata, ndata = raw_entry.split(u'/', 1) 217 | 218 | # Get Japanese 219 | pieces = jdata.split(u'[', 1) 220 | self.japanese = pieces[0].strip() 221 | if len(pieces) > 1: 222 | # Store furigana without '[]' 223 | self.furigana = pieces[1].strip()[:-1] 224 | 225 | # Get native language data 226 | self.glosses = [g for g in ndata.split(u'/') if g] 227 | 228 | def to_string(self, **kwargs): 229 | if self.furigana: 230 | ja = _(u"%s [%s]") % (self.japanese, self.furigana) 231 | else: 232 | ja = self.japanese 233 | native = _(u"; ").join(self.glosses) 234 | return _(u"%s: %s") % (ja, native) 235 | 236 | def __unicode__(self): 237 | """Dummy string dumper""" 238 | return unicode(self.__repr__()) 239 | 240 | class Parser(object): 241 | def __init__(self, filename, use_cache=True, encoding="EUC-JP"): 242 | if not os.path.exists(filename): 243 | raise Exception("Dictionary file does not exist.") 244 | self.filename = filename 245 | self.encoding = encoding 246 | self.use_cache = use_cache 247 | self.cache = {} 248 | 249 | def search(self, query): 250 | """Returns a list of entries matching the query.""" 251 | results = [] 252 | 253 | def proc_entry(entry): 254 | if query in entry.japanese: 255 | results.append(entry) 256 | else: 257 | for gloss in entry.glosses: 258 | if query in gloss: 259 | results.append(entry) 
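# one matching gloss is enough; stop so the same entry is not added twice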
260 | break 261 | 262 | if self.use_cache and self.cache: 263 | # Read from cache 264 | for k, entry in self.cache.iteritems(): 265 | proc_entry(entry) 266 | else: 267 | # Read from file 268 | if len(self.filename) >= 3 and self.filename[-3:] == ".gz": 269 | f = gzip.open(self.filename) 270 | else: 271 | f = open(self.filename, "rb") 272 | fdata = f.read() 273 | f.close() 274 | fdata = fdata.decode(self.encoding) 275 | lines = fdata.splitlines() 276 | lines = [line for line in lines if line and (line[0] != u"#")] 277 | 278 | data = {} 279 | for line in lines: 280 | entry = EdictEntry(line) 281 | if self.use_cache: 282 | self.cache[entry.japanese] = entry 283 | proc_entry(entry) 284 | 285 | # Very simple sorting of results. 286 | # (Requires that (P) is left in glosses...) 287 | common = [] 288 | other = [] 289 | 290 | for item in results: 291 | is_common = False 292 | for gloss in item.glosses: 293 | if u'(P)' in gloss: 294 | is_common = True 295 | break 296 | if is_common: 297 | common.append(item) 298 | else: 299 | other.append(item) 300 | 301 | results = common 302 | results.extend(other) 303 | 304 | # Return results 305 | return results 306 | 307 | if __name__ == "__main__": 308 | kp = Parser('../_dicts/edict-2011-08-30') 309 | query = u'私' 310 | for i, entry in enumerate(kp.search(query)): 311 | print entry.to_string().encode('utf-8') 312 | 313 | -------------------------------------------------------------------------------- /src/jNlp/jCabocha.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys, subprocess, os 4 | from subprocess import call 5 | from tempfile import NamedTemporaryFile 6 | 7 | def formdamage(sent): 8 | rectify = [] 9 | for ch in sent: 10 | try: rectify.append(ch.encode('utf-8')) 11 | except: pass 12 | return ''.join(rectify) 13 | 14 | def cabocha(sent): 15 | if os.path.exists('/home_lab_local/s1010205/tmp/'): 16 | temp = NamedTemporaryFile(delete=False, dir='/home_lab_local/s1010205/tmp/') 17 | else: 18 | temp = NamedTemporaryFile(delete=False) 19 | try: sent = sent.encode('utf-8') 20 | except: sent = formdamage(sent) 21 | temp.write(sent) 22 | temp.close() 23 | command = ['cabocha', '-f', '3'] 24 | process = subprocess.Popen(command, stdin=open(temp.name,'r'), stdout=subprocess.PIPE) 25 | output = process.communicate()[0] 26 | os.unlink(temp.name) 27 | return unicode(output, 'utf-8') 28 | 29 | def main(): 30 | pass 31 | 32 | if __name__ == '__main__': 33 | input_sentence = u'私が五年前にこの団体を仲間たちと結成したのはマルコス疑惑などで日本のODA(政府開発援助)が問題になり、国まかせでなく、民間による国際協力が必要だと痛感したのが大きな理由です。' 34 | print cabocha(input_sentence).encode('utf-8') 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /src/jNlp/jColor.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | def color(raw_string, colour): 4 | """ 5 | @returns a bold font 6 | usage: color("raw string here", 'red') 7 | """ 8 | black = ('28', '1') 9 | red = ('31','1') 10 | green = ('32','1') 11 | return '\x1b[%sm%s\x1b[0m' % (';'.join(eval(colour)), raw_string) 12 | 13 | if __name__ == "__main__": 14 | print color("this string","black") 15 | -------------------------------------------------------------------------------- /src/jNlp/jConvert.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | from jNlp.jTokenize import jTokenize, jReads 5 | from jNlp.jCabocha import cabocha 6 | from pkg_resources import resource_stream 7 | 8 | class ChartParser(object): 9 | def __init__(self, chartFile): 10 | self.chart = resource_stream('jNlp', chartFile).read() 11 | def chartParse(self): 12 | """ 13 | @return chartDict 14 | ガ ==> g,a 15 | キ ==> k,i 16 | キャ ==> k,ya 17 | Similarily for Hiragana 18 | @setrofim : http://www.python-forum.org/pythonforum/viewtopic.php?f=3&t=31935 19 | """ 20 | lines = self.chart.split('\n') 21 | chartDict = {} 22 | output = {} 23 | col_headings = lines.pop(0).split() 24 | for line in lines: 25 | cells = line.split() 26 | for i, c in enumerate(cells[1:]): 27 | output[c] = (cells[0], col_headings[i]) 28 | for k in sorted(output.keys()): 29 | #@k = katakana 30 | #@r = first romaji in row 31 | #@c = concatinating romaji in column 32 | r, c = output[k] 33 | k, r, c = [unicode(item,'utf-8') for item in [k,r,c]] 34 | if k == 'X':continue 35 | romaji = ''.join([item.replace('X', '') for item in [r,c]]) 36 | chartDict[k] = romaji 37 | return chartDict 38 | 39 | def tokenizedRomaji(jSent): 40 | kataDict = ChartParser('data/katakanaChart.txt').chartParse() 41 | tokenizeRomaji = [] 42 | for kataChunk in jReads(jSent): 43 | romaji = '' 44 | for idx, kata in enumerate(kataChunk,1): 45 | if idx != len(kataChunk): 46 | doubles = kata+kataChunk[idx] 47 | if kataDict.has_key(doubles): 48 | romaji += kataDict[doubles] 49 | continue 50 | if kataDict.has_key(kata): 51 | romaji += kataDict[kata] 52 | else: 53 | pass 54 | #checkPunctuation(kata) 55 | tokenizeRomaji.append(romaji) 56 | return tokenizeRomaji 57 | 58 | if __name__ == '__main__': 59 | #kataDict = ChartParser('data/katakanaChart.txt').chartParse() 60 | sent = u'気象庁が21日午前4時48分、発表した天気概況によると、' 61 | print ' '.join(tokenizedRomaji(sent)).encode('utf-8') 62 | #print tokenizedRomaji(sent) 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /src/jNlp/jProcessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from jNlp.jTokenize import * 4 | from pkg_resources import resource_stream 5 | import sys, os, subprocess, re 6 | from subprocess import call 7 | import xml.etree.cElementTree as etree 8 | 9 | def long_substr(str1, str2): 10 | data = [str1, str2] 11 | substr = '' 12 | if len(data) > 1 and len(data[0]) > 0: 13 | for i in range(len(data[0])): 14 | for j in range(len(data[0])-i+1): 15 | if j > len(substr) and all(data[0][i:i+j] in x for x in data): 16 | substr = data[0][i:i+j] 17 | return substr.strip() 18 | 19 | class Similarities(object): 20 | def minhash(self, *args): 21 | """ 22 | :*args: tokenized string like a nd b 23 | :Sentences: should be tokenized in string 24 | a = u"これ はな ん です" 25 | b = u"かこ れ何 です" 26 | """ 27 | score = 0.0 28 | tok_sent_1 = args[0] 29 | tok_sent_2 = args[1] 30 | shingles = lambda s: set(s[i:i+3] for i in range(len(s)-2)) 31 | try: 32 | jaccard_distance = lambda seta, setb: len(seta & setb)/float(len(seta | setb)) 33 | score = jaccard_distance(shingles(tok_sent_1), shingles(tok_sent_2)) 34 | return score 35 | except ZeroDivisionError: return score 36 | 37 | class Property(object): 38 | def __init__(self): 39 | pass 40 | def kanaChars(self): 41 | Chars = [] 42 | tables = ['hiraganaChart.txt', 
'katakanaChart.txt'] 43 | for table in tables: 44 | buff = resource_stream('jNlp', 'data/%s'%table).readlines() 45 | for line in buff: 46 | line = unicode(line, 'utf-8') 47 | Chars += line.split() 48 | return Chars 49 | 50 | def iscontent(self, pos): 51 | self.pos = pos 52 | self.file = resource_stream('jNlp', 'data/chasen_pos.txt').readlines() 53 | self.content = {} 54 | for line in self.file: 55 | if not line.strip(): continue 56 | line = unicode(line,'utf-8') 57 | pos = line.split()[2].strip() 58 | self.content[pos] = int(line.split()[0].strip()) 59 | if self.content.has_key(self.pos) and self.content[self.pos]: return True 60 | return False 61 | def tok_xml(self, sent, word): 62 | #Usage 63 | #tok_xml(u'これでアナタも冷え知らず', u'冷').get('pos') 64 | self.sent = sent.replace(word, '*'+word+'*') 65 | cTree = jCabocha_with_target(self.sent) 66 | for chunk in cTree.getchildren():#chunks 67 | for tok in chunk.getchildren(): 68 | if tok.get('target'):return tok 69 | return etree.fromstring(u'') 70 | def iskana(self, word): 71 | romaji = ['a', 'b', 'c', 'd', 'e', 'f', 'g', \ 72 | 'h', 'i', 'j', 'k', 'l', 'm', 'n', \ 73 | 'o', 'p', 'q', 'r', 's', 't', 'u', \ 74 | 'v', 'w', 'x', 'y', 'z'] 75 | if len(word) == 1 and word in self.kanaChars() and word not in romaji: 76 | return True 77 | else: return False 78 | 79 | 80 | 81 | if __name__ == '__main__': 82 | a = 'Once upon a time in Italy' 83 | b = 'Thre was a time in America' 84 | #print long_substr(a, b) 85 | a = u'これでアナタも冷え知らず' 86 | b = u'これでア冷え知らずナタも' 87 | #print long_substr(a, b).encode('utf-8') 88 | #similarity = Similarities() 89 | #print similarity.minhash(' '.join(jTokenize(a)), ' '.join(jTokenize(b))) 90 | pos = Property() 91 | #print pos.iscontent(u'地域') 92 | #print pos.tok_xml(u'これでアナタも冷え知らず', u'冷').get('pos') 93 | print pos.iskana(u'冷') 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /src/jNlp/jSentiments.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import division 4 | import sys, subprocess, argparse 5 | from subprocess import call 6 | from jNlp.jTokenize import jTokenize 7 | from jNlp.jColor import color 8 | 9 | class Sentiment(object): 10 | def train(self, senti_path, wnjpn_path): 11 | """ 12 | ``idSenti & idjWord = type `` :: 13 | 14 | idSenti[00004980] = [posScore, negScore] 15 | idjWord[u'kanji/jword'] = 00004980 16 | """ 17 | self.idSenti = {} 18 | self.idjWord = {} 19 | with open(senti_path) as senti_f: 20 | senti_text = senti_f.readlines() 21 | for line in senti_text: 22 | if line.startswith('#'): continue 23 | try: 24 | ID, pScore, nScore = line.split()[1:4] 25 | self.idSenti[ID] = [float(pScore), float(nScore)] 26 | except (IndexError, ValueError): pass 27 | with open(wnjpn_path) as jwn_f: 28 | jwn_text = jwn_f.readlines() 29 | for line in jwn_text: 30 | ID = line.split()[0].split('-')[0] 31 | jWord = unicode(line.split()[1].strip(), 'utf-8') 32 | self.idjWord[jWord] = ID 33 | return self.idSenti, self.idjWord 34 | 35 | def polarScores_word(self, word): 36 | """ 37 | returns pos, neg score for one kanji 38 | """ 39 | if not self.idjWord.has_key(word): return 0.0, 0.0 40 | pScore = self.idSenti[self.idjWord[word]][0] 41 | nScore = self.idSenti[self.idjWord[word]][1] 42 | return pScore, nScore 43 | 44 | def polarScores_text(self, text): 45 | pScore = 0.0 46 | nScore = 0.0 47 | for sent in text.split(u'。'): 48 | if len(sent.strip()) == 0: continue 49 | for word in jTokenize(sent): 50 | if not self.idjWord.has_key(word): continue 51 | pScore += self.idSenti[self.idjWord[word]][0] 52 | nScore += self.idSenti[self.idjWord[word]][1] 53 | return pScore, nScore 54 | 55 | def baseline(self, text): 56 | pScore, nScore = self.polarScores_text(text) 57 | print 'Pos Score = %.3f Neg Score = %.3f'%(pScore, nScore) 58 | if pScore == nScore: 59 | print 'Text is Neural or Cannot Determine' 60 | return '' 61 | if pScore > nScore: 62 | print 'Text is', color('Positive', "green") 63 | return '' 64 | else: 65 | print 'Text is', color('Negative',"red") 66 | return '' 67 | 68 | if __name__ == '__main__': 69 | parser = argparse.ArgumentParser(add_help = True) 70 | parser = argparse.ArgumentParser(description= 'Sentiment Classifier for Japanese Text') 71 | #parser.add_argument('-f', action="store", nargs = 2, dest="files", type=argparse.FileType('rt'), help='-f senti.txt jwn.txt') 72 | myarguments = parser.parse_args() 73 | 74 | jp_wn = '_dicts/wnjpn-all.tab' 75 | en_swn = '_dicts/SentiWordNet_3.0.0_20100908.txt' 76 | classifier = Sentiment() 77 | sentiwordnet, jpwordnet = classifier.train(en_swn, jp_wn) 78 | positive_score = sentiwordnet[jpwordnet[u'全部']][0] 79 | negative_score = sentiwordnet[jpwordnet[u'全部']][1] 80 | print 'pos score = {0}, neg score = {1}'.format(positive_score, negative_score) 81 | 82 | text = u'監督、俳優、ストーリー、演出、全部最高!' 83 | print classifier.baseline(text) 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /src/jNlp/jTokenize.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import xml.etree.cElementTree as etree 5 | #Package imports 6 | from jNlp.jCabocha import * 7 | import argparse 8 | def add_target(jCabocha_tree,target_sent,**kwargs): 9 | """ 10 | Following is to mark a target word 11 | Not called 12 | See jCabocha_with_target() 13 | """ 14 | if kwargs.has_key('id'): attach_id = kwargs['id'] 15 | else: attach_id = 'unknown' 16 | start_pos = len(target_sent.split('*')[0]) 17 | tw = target_sent.split('*')[1] 18 | sent = u'' 19 | for chunk in jCabocha_tree.getchildren(): 20 | for tok in chunk: 21 | if tw in tok.text and len(sent) >= start_pos -3: 22 | tok.set("target", attach_id) 23 | return jCabocha_tree 24 | else: sent += tok.text 25 | return jCabocha_tree 26 | 27 | def jTokenize(target_sent): 28 | default_marker = '*' 29 | target = target_sent.replace(default_marker,'') 30 | sentence = etree.fromstring(cabocha(target).encode('utf-8')) 31 | jTokenized_sent = [] 32 | if default_marker in target_sent: 33 | added_target = add_target(sentence, target_sent) 34 | else: added_target = sentence 35 | for chunk in added_target.findall('chunk'): 36 | for tok in chunk.findall('tok'): 37 | if tok.get("target"): jTokenized_sent.append('*'+tok.text+'*') 38 | else: jTokenized_sent.append(tok.text) 39 | return jTokenized_sent 40 | 41 | def jReads(target_sent): 42 | sentence = etree.fromstring(cabocha(target_sent).encode('utf-8')) 43 | jReadsToks = [] 44 | for chunk in sentence: 45 | for tok in chunk.findall('tok'): 46 | if tok.get("feature"): 47 | read_tag = tok.get("feature").split(',')[-2] 48 | if read_tag == '*': read_tag = '' 49 | elif tok.get("read"): 50 | read_tag = tok.get("read") 51 | else: 52 | pass 53 | if read_tag: jReadsToks.append(read_tag) 54 | return jReadsToks 55 | 56 | def jCabocha_with_target(target_sent, *args): 57 | #target_sent has to be marked with * 58 | if '*' not in target_sent: return cabocha(target_sent) 59 | if args: attach_id = args[0] 60 | else: attach_id = "unknown" 61 | sent_plain = etree.fromstring(cabocha(target_sent.replace('*', '')).encode('utf-8')) 62 | return add_target(sent_plain, target_sent, id = attach_id) 63 | 64 | def jInfo(target_sent, infotype='base'): 65 | #return Info 66 | #Eg for base form do 67 | #>>>jInfo(target_sent, infotype='base') 68 | #...returns [word1baseform, word2baseform, ..] 
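#Other infotype values follow the attribute names on each <tok> element in
#cabocha's '-f 3' XML output; 'read' and 'pos' are the ones used elsewhere in
#this package (see jReads above and vcabocha.py), e.g. as a rough sketch:
#>>>jInfo(target_sent, infotype='pos')
#...returns [word1pos, word2pos, ..]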
69 | sentence = etree.fromstring(cabocha(target_sent).encode('utf-8')) 70 | Info = [] 71 | for chunk in sentence: 72 | for tok in chunk: 73 | if tok.get(infotype): Info.append(tok.get(infotype)) 74 | return Info 75 | 76 | 77 | if __name__ == '__main__': 78 | parser = argparse.ArgumentParser(add_help = True) 79 | parser = argparse.ArgumentParser(description= 'No description sepecified') 80 | parser.add_argument('-a', action="store", dest="action", type=unicode, help='-a base') 81 | parser.add_argument('-s', action="store", dest="sentence", type=str, help='-s Sentence') 82 | myarguments = parser.parse_args() 83 | print cabocha(unicode(myarguments.sentence,'utf-8')).encode('utf-8') 84 | print jReads(unicode(myarguments.sentence,'utf-8')) 85 | 86 | """ 87 | TO Mark the target word use * 1byte 88 | """ 89 | """ 90 | a = u'私は彼を5日前、つまりこの前の金曜日に駅で見かけた' 91 | print jTokenize(a) 92 | #print '--'.join(jTokenize(a)).encode('utf-8') 93 | #print '--'.join(jReads(a)).encode('utf-8') 94 | #--------------------------------------------------------------# 95 | a = u'私は彼を5日*前*、つまりこの前の金曜日に駅で見かけた' 96 | #print jTokenize(a) 97 | #input sentence has to be marked with target word otherwise target is not marked 98 | #print etree.tostring(jCabocha_with_target(a, 'nn:00:11'), 'utf-8') 99 | #print etree.tostring(jCabocha_with_target(a), 'utf-8') #default id = 'unknown' 100 | 101 | sent = u'日本最大級のポータルサイト' 102 | print jInfo(sent, 'base') 103 | #print ' '.join(jReads(a)).encode('utf-8') 104 | """ 105 | -------------------------------------------------------------------------------- /src/jNlp/summarize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from collections import defaultdict 3 | from itertools import repeat 4 | import re 5 | 6 | class Summary(object): 7 | def __init__(self): 8 | pass 9 | 10 | def tokenize(self, text): 11 | return text.split() 12 | 13 | def split_to_sentences(self, text): 14 | sentences = [] 15 | start = 0 16 | for match in re.finditer("(\s*[.!?]\s*)|(\n{2,})", text): 17 | sentences.append(text[start:match.end()].strip()) 18 | start = match.end() 19 | if start < len(text): 20 | sentences.append(text[start:].strip()) 21 | return sentences 22 | 23 | def token_frequency(self, text): 24 | '''Return frequency (count) for each token in the text''' 25 | frequencies = defaultdict(repeat(0).next) 26 | for token in self.tokenize(text): 27 | frequencies[token] += 1 28 | return frequencies 29 | 30 | def sentence_score(self, sentence, frequencies): 31 | return sum((frequencies[token] for token in self.tokenize(sentence))) 32 | 33 | def create_summary(self, sentences, max_length): 34 | summary = [] 35 | size = 0 36 | for sentence in sentences: 37 | size += len(sentence) 38 | if size >= max_length: break 39 | summary.append(sentence) 40 | return "\n".join(summary) 41 | 42 | def summarize(self, text, max_summary_size): 43 | frequencies = self.token_frequency(text) 44 | sentences = self.split_to_sentences(text) 45 | sentences.sort(key=lambda s: self.sentence_score(s, frequencies), reverse=1) 46 | summary = self.create_summary(sentences, max_summary_size) 47 | return summary 48 | 49 | if __name__ == "__main__": 50 | 51 | raw_text = """you know , i've seen network before , and it's a much better film . bulworth is , in the kindest of words , an " homage " to that picture , and at least it has an excellent role model . 
simply take the story about a tv newsman who goes nuts , stirs up controversy , and fatally angers the establishment and change it to a us senator who does the same thing , and you've got bulworth . warren beatty's title role performance is the only reason bulworth has anything going for it at all . much like tom cruise in jerry maguire , beatty takes a difficult character and makes it his own , and while beatty as a foul-mouthed politician is not exactly playing against type , it's still his very aggressive performance that carries the picture . everything else , from the dismal supporting cast ( halle berry has never looked so lost ) to the throw-away one-liners ( you've seen all the best over and over again on the trailers ) is cut-and-pasted from network or clearly dredged from some late night rewrite session . still , beatty's in fine form , and his outrageous wackiness takes the film halfway to where it could have been . ( and geez , he directed , produced , wrote , and starred in the film . . . maybe someone was a little too busy ? ) but overall , the missed opportunities , the overtly silly anti-pc message backed up by nothing , and the all-too-forseeable ending make bulworth little more than a fable that we already knew : that anyone involved with politics is totally insane . """ 52 | s= Summary() 53 | MAX_SUMMARY_SIZE = len(raw_text)/3 54 | print s.summarize(raw_text, MAX_SUMMARY_SIZE) 55 | -------------------------------------------------------------------------------- /src/jNlp/url2text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import division 4 | from HTMLParser import HTMLParser 5 | from re import sub 6 | from sys import stderr 7 | from traceback import print_exc 8 | from urllib import * 9 | import re, string 10 | 11 | class Parser(HTMLParser): 12 | def __init__(self): 13 | HTMLParser.__init__(self) 14 | self.__text = [] 15 | 16 | def handle_data(self, data): 17 | text = data.strip() 18 | if len(text) > 0: 19 | text = sub('[ \t\r\n]+', ' ', text) 20 | self.__text.append(text + ' ') 21 | 22 | def handle_starttag(self, tag, attrs): 23 | if tag == 'p': 24 | self.__text.append('\n\n') 25 | elif tag == 'br': 26 | self.__text.append('\n') 27 | 28 | def handle_startendtag(self, tag, attrs): 29 | if tag == 'br': 30 | self.__text.append('\n\n') 31 | 32 | def text(self): 33 | return ''.join(self.__text).strip() 34 | class Url2Text(object): 35 | def raw_text(self, html_text): 36 | try: 37 | parser = Parser() 38 | parser.feed(html_text) 39 | parser.close() 40 | return parser.text() 41 | except: 42 | print "Couldn't extract" 43 | exit() 44 | 45 | def url2text(self, url): 46 | clean_text = [] 47 | html_text = urlopen(url).read() 48 | count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1))) 49 | counts = [] 50 | text = self.raw_text(html_text) 51 | for line in text.splitlines(): 52 | counts.append(count(line, string.punctuation)) 53 | for line, punct in zip(text.splitlines(), counts): 54 | if line and punct < max(counts)/3: 55 | clean_text.append(line.strip()) 56 | return clean_text 57 | 58 | if __name__ == '__main__': 59 | url = "http://content.usatoday.com/communities/onpolitics/post/2012/03/mitt-romney-super-tuesday-results-rick-santorum-ohio/1" 60 | #url = 'http://www.terminally-incoherent.com/blog/2007/09/19/latex-squeezing-the-vertical-white-space/' 61 | a = Url2Text() 62 | print a.url2text(url) 63 | 64 | 65 | 66 | 
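# A minimal usage sketch (illustration only, not code shipped in this module):
# the cleaned lines returned above can feed jNlp.summarize, whose
# Summary.summarize(text, max_summary_size) is defined earlier in this repository.
#
#   from jNlp.url2text import Url2Text
#   from jNlp.summarize import Summary
#   page = '\n'.join(Url2Text().url2text(url))
#   print Summary().summarize(page, len(page) / 3)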
-------------------------------------------------------------------------------- /src/jNlp/vcabocha.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from jNlp.jCabocha import * 4 | from jNlp.jTokenize import * 5 | import argparse 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(add_help = True) 9 | parser = argparse.ArgumentParser(description= 'No description sepecified') 10 | parser.add_argument('-a', action="store", dest="action", type=unicode, help='-a [cabocha, tokenize, base, read, pos]') 11 | parser.add_argument('-s', action="store", dest="sentence", type=str, help='-s Sentence') 12 | myarguments = parser.parse_args() 13 | sent = unicode(myarguments.sentence,'utf-8') 14 | print myarguments.action 15 | if myarguments.action == "cabocha": 16 | print cabocha(sent).encode('utf-8') 17 | elif myarguments.action == "tokenize": 18 | print 'Tokenized' 19 | print '=========' 20 | print '\n'.join(jTokenize(sent)) 21 | elif myarguments.action: 22 | tokenized = jTokenize(sent) 23 | info = jInfo(sent, infotype=myarguments.action) 24 | mxlen = len(max(max(tokenized, key=len), max(info, key=len))) + 30 25 | print '{0:{mx}}{1:}'.format('Sent',myarguments.action, mx = mxlen) 26 | print '{0:{mx}}{1:}'.format('====','='*len(myarguments.action), mx = mxlen) 27 | 28 | for i, j in zip(tokenized, info): 29 | i = i.encode('utf-8') 30 | j = j.encode('utf-8') 31 | print '{0:{mx}}{1:<}'.format(i,j, mx = mxlen) 32 | else: 33 | print cabocha(sent).encode('utf-8') 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /src/jProcessing.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: jProcessing 3 | Version: 0.1 4 | Summary: Japanese NLP Utilities 5 | Home-page: http://www.jaist.ac.jp/~s1010205 6 | Author: KATHURIA Pulkit 7 | Author-email: pulkit@jaist.ac.jp 8 | License: LICENSE.txt 9 | Description: ==================== 10 | Japanese NLP Library 11 | ==================== 12 | 13 | 14 | Requirements 15 | ============ 16 | 17 | - Third Party Dependencies 18 | 19 | - Cabocha Japanese Morphological parser http://sourceforge.net/projects/cabocha/ 20 | 21 | - Python Dependencies 22 | 23 | - ``Python 2.6.*`` or above 24 | 25 | 26 | ``Links`` 27 | --------- 28 | 29 | - All code at jProcessing Repo GitHub_ 30 | 31 | .. _GitHub: https://github.com/kevincobain2000/jProcessing 32 | 33 | - Documentation_ and HomePage_ and Sphinx_ 34 | 35 | .. _Documentation: http://www.jaist.ac.jp/~s1010205/jnlp 36 | 37 | .. _HomePage: http://www.jaist.ac.jp/~s1010205/ 38 | 39 | .. _Sphinx: http://readthedocs.org/docs/jprocessing/en/latest/ 40 | 41 | 42 | - PyPi_ Python Package 43 | 44 | .. _PyPi: http://pypi.python.org/pypi/jProcessing/0.1 45 | 46 | :: 47 | 48 | clone git@github.com:kevincobain2000/jProcessing.git 49 | 50 | 51 | ``Install`` 52 | ----------- 53 | 54 | In ``Terminal`` :: 55 | 56 | >>>bash$ python setup.py install 57 | 58 | History 59 | ------- 60 | 61 | - ``0.2`` 62 | 63 | + Sentiment Analysis of Japanese Text 64 | 65 | - ``0.1`` 66 | + Morphologically Tokenize Japanese Sentence 67 | + Kanji / Hiragana / Katakana to Romaji Converter 68 | + Edict Dictionary Search - borrowed 69 | + Edict Examples Search - incomplete 70 | + Sentence Similarity between two JP Sentences 71 | + Run Cabocha(ISO--8859-1 configured) in Python. 
72 | + Longest Common String between Sentences 73 | + Kanji to Katakana Pronunciation 74 | + Hiragana, Katakana Chart Parser 75 | 76 | Contacts 77 | ======== 78 | 79 | - ContactForm_ 80 | - BugReport_ 81 | - Contribute_ 82 | 83 | .. _ContactForm: http://www.jaist.ac.jp/~s1010205/styled-2/index.html 84 | .. _BugReport: http://www.jaist.ac.jp/~s1010205/styled/index.html 85 | .. _Contribute: https://github.com/kevincobain2000/jProcessing 86 | 87 | :Author: `pulkit[at]jaist.ac.jp` [change ``at`` with ``@``] 88 | 89 | 90 | 91 | 92 | 93 | Platform: UNKNOWN 94 | Classifier: Development Status :: 2 - Pre-Alpha 95 | Classifier: Natural Language :: Japanese 96 | Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence 97 | -------------------------------------------------------------------------------- /src/jProcessing.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | MANIFEST.in 2 | README 3 | setup.py 4 | scripts 5 | scripts/vcabocha.py 6 | src/jNlp/__init__.py 7 | src/jNlp/eProcessing.py 8 | src/jNlp/jCabocha.py 9 | src/jNlp/jCabocha.pyc 10 | src/jNlp/jColor.py 11 | src/jNlp/jConvert.py 12 | src/jNlp/jProcessing.py 13 | src/jNlp/jSentiments.py 14 | src/jNlp/jTokenize.py 15 | src/jNlp/vcabocha.py 16 | src/jNlp/aquisition/OpenSubtitles.py 17 | src/jNlp/aquisition/OpenSubtitles.pyc 18 | src/jNlp/aquisition/SubtitleDatabase.py 19 | src/jNlp/aquisition/SubtitleDatabase.pyc 20 | src/jNlp/aquisition/__init__.py 21 | src/jNlp/aquisition/aquire.py 22 | src/jNlp/aquisition/download_subs.xml 23 | src/jNlp/aquisition/movies.txt 24 | src/jNlp/data/JapaneseSentiWordNet.txt 25 | src/jNlp/data/__init__.py 26 | src/jNlp/data/chasen_pos.txt 27 | src/jNlp/data/hiraganaChart.txt 28 | src/jNlp/data/katakanaChart.txt 29 | src/jNlp/edict_search_monash/__init__.py 30 | src/jNlp/edict_search_monash/ambiguous_words.p 31 | src/jNlp/edict_search_monash/edict_examples.p 32 | src/jNlp/edict_search_monash/edict_examples.py 33 | src/jNlp/edict_search_monash/edict_search.py 34 | src/jProcessing.egg-info/PKG-INFO 35 | src/jProcessing.egg-info/SOURCES.txt 36 | src/jProcessing.egg-info/dependency_links.txt 37 | src/jProcessing.egg-info/top_level.txt -------------------------------------------------------------------------------- /src/jProcessing.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/jProcessing.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | jNlp 2 | --------------------------------------------------------------------------------