├── README ├── project.clj ├── resources ├── ChangeLog ├── INSTALL.txt ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── NOTICE.txt ├── README.md ├── README.txt ├── RELEASE-HOWTO ├── distribute_setup.py ├── emacs │ ├── doctest-mode.el │ ├── psvn.el │ ├── pycomplete.el │ ├── pycomplete.py │ ├── python-mode.el │ ├── rst-mode.el │ └── rst.el ├── examples │ ├── grammars │ │ ├── Makefile │ │ ├── basque_grammars │ │ │ ├── basque1.cfg │ │ │ ├── basque1.fcfg │ │ │ ├── basque1.pcfg │ │ │ ├── basque1.regexp │ │ │ ├── basque2.cfg │ │ │ ├── basque2.fcfg │ │ │ ├── basque2.pcfg │ │ │ ├── basque2.regexp │ │ │ ├── basque3.cfg │ │ │ ├── basque3.fcfg │ │ │ ├── basque3.regexp │ │ │ ├── basque4.regexp │ │ │ └── basque5.regexp │ │ ├── book_grammars │ │ │ ├── background.fol │ │ │ ├── discourse.fcfg │ │ │ ├── drt.fcfg │ │ │ ├── feat0.fcfg │ │ │ ├── feat1.fcfg │ │ │ ├── german.fcfg │ │ │ ├── simple-sem.fcfg │ │ │ ├── sql0.fcfg │ │ │ ├── sql1.fcfg │ │ │ └── storage.fcfg │ │ ├── sample_grammars │ │ │ ├── background0.fol │ │ │ ├── bindop.fcfg │ │ │ ├── chat80.fcfg │ │ │ ├── chat_pnames.fcfg │ │ │ ├── dep_test2.dep │ │ │ ├── drt_glue.semtype │ │ │ ├── drt_glue_event.semtype │ │ │ ├── event.fcfg │ │ │ ├── glue.semtype │ │ │ ├── glue_event.semtype │ │ │ ├── glue_train.conll │ │ │ ├── gluesemantics.fcfg │ │ │ ├── hole.fcfg │ │ │ ├── np.fcfg │ │ │ ├── sem0.fcfg │ │ │ ├── sem1.fcfg │ │ │ ├── sem2.fcfg │ │ │ ├── sql.fcfg │ │ │ ├── toy.cfg │ │ │ └── valuation1.val │ │ └── spanish_grammars │ │ │ ├── spanish1.cfg │ │ │ ├── spanish1.fcfg │ │ │ ├── spanish1.pcfg │ │ │ ├── spanish1.regexp │ │ │ ├── spanish2.cfg │ │ │ ├── spanish2.fcfg │ │ │ ├── spanish2.pcfg │ │ │ ├── spanish2.regexp │ │ │ ├── spanish3.cfg │ │ │ ├── spanish3.regexp │ │ │ ├── spanish4.regexp │ │ │ └── spanish5.regexp │ ├── school │ │ ├── README │ │ ├── categories.py │ │ ├── count.py │ │ ├── generate.py │ │ ├── parse1.py │ │ ├── parse2.py │ │ ├── parse3.py │ │ ├── parser.py │ │ ├── search.py │ │ └── words.py │ └── semantics │ │ ├── chat.db │ │ ├── chat80.cfg │ │ ├── chat_pnames.cfg │ │ ├── chat_sentences │ │ ├── demo_sentences │ │ ├── model0.py │ │ ├── model1.py │ │ ├── sem0.cfg │ │ ├── sem1.cfg │ │ ├── sem2.cfg │ │ ├── sem3.cfg │ │ └── syn2sem.py ├── javasrc │ ├── Makefile │ ├── README.txt │ └── org │ │ └── nltk │ │ └── mallet │ │ ├── CRFInfo.java │ │ ├── RunCRF.java │ │ └── TrainCRF.java ├── nltk │ ├── VERSION │ ├── __init__.py │ ├── align.py │ ├── app │ │ ├── __init__.py │ │ ├── chartparser_app.py │ │ ├── chunkparser_app.py │ │ ├── collocations_app.py │ │ ├── concordance_app.py │ │ ├── nemo_app.py │ │ ├── rdparser_app.py │ │ ├── srparser_app.py │ │ ├── wordfreq_app.py │ │ └── wordnet_app.py │ ├── book.py │ ├── ccg │ │ ├── __init__.py │ │ ├── api.py │ │ ├── chart.py │ │ ├── combinator.py │ │ └── lexicon.py │ ├── chat │ │ ├── __init__.py │ │ ├── eliza.py │ │ ├── iesha.py │ │ ├── rude.py │ │ ├── suntsu.py │ │ ├── util.py │ │ └── zen.py │ ├── chunk │ │ ├── __init__.py │ │ ├── api.py │ │ ├── named_entity.py │ │ ├── regexp.py │ │ └── util.py │ ├── classify │ │ ├── __init__.py │ │ ├── api.py │ │ ├── decisiontree.py │ │ ├── mallet.py │ │ ├── maxent.py │ │ ├── megam.py │ │ ├── naivebayes.py │ │ ├── positivenaivebayes.py │ │ ├── rte_classify.py │ │ ├── scikitlearn.py │ │ ├── svm.py │ │ ├── tadm.py │ │ ├── util.py │ │ └── weka.py │ ├── cluster │ │ ├── __init__.py │ │ ├── api.py │ │ ├── em.py │ │ ├── gaac.py │ │ ├── kmeans.py │ │ └── util.py │ ├── collocations.py │ ├── corpus │ │ ├── __init__.py │ │ ├── europarl_raw.py │ │ ├── reader │ │ │ ├── __init__.py │ │ │ ├── 
aligned.py │ │ │ ├── api.py │ │ │ ├── bnc.py │ │ │ ├── bracket_parse.py │ │ │ ├── chasen.py │ │ │ ├── childes.py │ │ │ ├── chunked.py │ │ │ ├── cmudict.py │ │ │ ├── conll.py │ │ │ ├── dependency.py │ │ │ ├── ieer.py │ │ │ ├── indian.py │ │ │ ├── ipipan.py │ │ │ ├── knbc.py │ │ │ ├── lin.py │ │ │ ├── nombank.py │ │ │ ├── nps_chat.py │ │ │ ├── pl196x.py │ │ │ ├── plaintext.py │ │ │ ├── ppattach.py │ │ │ ├── propbank.py │ │ │ ├── rte.py │ │ │ ├── semcor.py │ │ │ ├── senseval.py │ │ │ ├── sinica_treebank.py │ │ │ ├── string_category.py │ │ │ ├── switchboard.py │ │ │ ├── tagged.py │ │ │ ├── timit.py │ │ │ ├── toolbox.py │ │ │ ├── util.py │ │ │ ├── verbnet.py │ │ │ ├── wordlist.py │ │ │ ├── wordnet.py │ │ │ ├── xmldocs.py │ │ │ └── ycoe.py │ │ └── util.py │ ├── data.py │ ├── decorators.py │ ├── downloader.py │ ├── draw │ │ ├── __init__.py │ │ ├── cfg.py │ │ ├── dispersion.py │ │ ├── table.py │ │ ├── tree.py │ │ └── util.py │ ├── examples │ │ ├── __init__.py │ │ └── pt.py │ ├── featstruct.py │ ├── grammar.py │ ├── help.py │ ├── inference │ │ ├── __init__.py │ │ ├── api.py │ │ ├── discourse.py │ │ ├── mace.py │ │ ├── nonmonotonic.py │ │ ├── prover9.py │ │ ├── resolution.py │ │ └── tableau.py │ ├── internals.py │ ├── lazyimport.py │ ├── metrics │ │ ├── __init__.py │ │ ├── agreement.py │ │ ├── artstein_poesio_example.txt │ │ ├── association.py │ │ ├── confusionmatrix.py │ │ ├── distance.py │ │ ├── scores.py │ │ ├── segmentation.py │ │ ├── spearman.py │ │ └── windowdiff.py │ ├── misc │ │ ├── __init__.py │ │ ├── babelfish.py │ │ ├── chomsky.py │ │ ├── minimalset.py │ │ ├── sort.py │ │ └── wordfinder.py │ ├── model │ │ ├── __init__.py │ │ ├── api.py │ │ └── ngram.py │ ├── nltk.jar │ ├── parse │ │ ├── __init__.py │ │ ├── api.py │ │ ├── broker_test.cfg │ │ ├── chart.py │ │ ├── dependencygraph.py │ │ ├── earleychart.py │ │ ├── featurechart.py │ │ ├── generate.py │ │ ├── generate2.py │ │ ├── malt.py │ │ ├── nonprojectivedependencyparser.py │ │ ├── pchart.py │ │ ├── projectivedependencyparser.py │ │ ├── rd.py │ │ ├── sr.py │ │ ├── test.cfg │ │ ├── util.py │ │ └── viterbi.py │ ├── probability.py │ ├── sem │ │ ├── __init__.py │ │ ├── boxer.py │ │ ├── chat80.py │ │ ├── cooper_storage.py │ │ ├── drt.py │ │ ├── drt_glue_demo.py │ │ ├── evaluate.py │ │ ├── glue.py │ │ ├── hole.py │ │ ├── lfg.py │ │ ├── linearlogic.py │ │ ├── logic.py │ │ ├── relextract.py │ │ ├── skolemize.py │ │ └── util.py │ ├── sourcedstring.py │ ├── stem │ │ ├── __init__.py │ │ ├── api.py │ │ ├── isri.py │ │ ├── lancaster.py │ │ ├── porter.py │ │ ├── regexp.py │ │ ├── rslp.py │ │ ├── snowball.py │ │ └── wordnet.py │ ├── tag │ │ ├── __init__.py │ │ ├── api.py │ │ ├── brill.py │ │ ├── crf.py │ │ ├── hmm.py │ │ ├── hunpos.py │ │ ├── senna.py │ │ ├── simplify.py │ │ ├── stanford.py │ │ ├── tnt.py │ │ └── util.py │ ├── test │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── align.doctest │ │ ├── all.py │ │ ├── ccg.doctest │ │ ├── chat80.doctest │ │ ├── childes.doctest │ │ ├── chunk.doctest │ │ ├── classify.doctest │ │ ├── collocations.doctest │ │ ├── corpus.doctest │ │ ├── data.doctest │ │ ├── dependency.doctest │ │ ├── discourse.doctest │ │ ├── doctest_driver.py │ │ ├── doctest_nose_plugin.py │ │ ├── doctest_utils.py │ │ ├── drt.doctest │ │ ├── featgram.doctest │ │ ├── featstruct.doctest │ │ ├── floresta.txt │ │ ├── gluesemantics.doctest │ │ ├── grammar.doctest │ │ ├── grammartestsuites.doctest │ │ ├── inference.doctest │ │ ├── internals.doctest │ │ ├── japanese.doctest │ │ ├── logic.doctest │ │ ├── metrics.doctest │ │ ├── misc.doctest │ │ ├── 
nonmonotonic.doctest │ │ ├── onto1.fol │ │ ├── parse.doctest │ │ ├── portuguese.doctest_latin1 │ │ ├── portuguese_en.doctest │ │ ├── probability.doctest │ │ ├── relextract.doctest │ │ ├── resolution.doctest │ │ ├── runtests.py │ │ ├── segmentation.doctest │ │ ├── sem3.cfg │ │ ├── semantics.doctest │ │ ├── simple.doctest │ │ ├── sourcedstring.doctest │ │ ├── stem.doctest │ │ ├── tag.doctest │ │ ├── tokenize.doctest │ │ ├── toolbox.doctest │ │ ├── toy.cfg │ │ ├── tree.doctest │ │ ├── treetransforms.doctest │ │ ├── util.doctest │ │ └── wordnet.doctest │ ├── text.py │ ├── tokenize │ │ ├── __init__.py │ │ ├── api.py │ │ ├── punkt.py │ │ ├── regexp.py │ │ ├── sexpr.py │ │ ├── simple.py │ │ ├── texttiling.py │ │ ├── treebank.py │ │ └── util.py │ ├── toolbox.py │ ├── tree.py │ ├── treetransforms.py │ ├── util.py │ └── yamltags.py ├── papers │ ├── acl-02 │ │ ├── .cvsignore │ │ ├── Makefile │ │ ├── acl-02.tex │ │ ├── acl.bst │ │ ├── acl2002.sty │ │ ├── chartparse.eps.gz │ │ ├── contest.ps.gz │ │ └── nltk.bib │ ├── acl-04 │ │ ├── .cvsignore │ │ ├── Makefile │ │ ├── acl-04.tex │ │ ├── acl.bst │ │ ├── acl04.sty │ │ ├── chart-matrix.gif │ │ ├── chart.eps.gz │ │ └── nltk.bib │ ├── acl-06 │ │ ├── acl-06.tex │ │ ├── acl.bst │ │ ├── colacl06.sty │ │ ├── rdparser.eps.gz │ │ └── srparser.eps.gz │ ├── acl-08 │ │ ├── acl-08.bib │ │ ├── acl-08.tex │ │ ├── acl08.sty │ │ ├── grammar1.py │ │ ├── grammar2.py │ │ └── police.py │ ├── altw-06 │ │ ├── acl.bst │ │ ├── altw-06.bib │ │ ├── altw-06.tex │ │ └── colacl06.sty │ ├── icon-05 │ │ ├── acl.bst │ │ ├── acl2005.sty │ │ └── icon-05.tex │ └── iwcs-08 │ │ ├── drs.png │ │ ├── garrette-klein.tar.gz │ │ ├── iwcs.doctest │ │ ├── lingmacros.sty │ │ ├── modules.graffle │ │ ├── modules.pdf │ │ ├── nltk_iwcs_09.bib │ │ └── nltk_iwcs_09.tex ├── setup.cfg ├── setup.py ├── tools │ ├── find_deprecated.py │ ├── global_replace.py │ ├── nltk_term_index.py │ ├── nltk_term_index.stoplist │ └── svnmime.py ├── tox.ini └── web │ ├── Makefile │ ├── api │ └── nltk.rst │ ├── conf.py │ ├── data.rst │ ├── dev │ ├── jenkins.rst │ └── local_testing.rst │ ├── images │ ├── book.gif │ └── tree.gif │ ├── index.rst │ ├── install.rst │ └── news.rst ├── src └── clojure_nltk │ └── core.clj └── test └── clojure_nltk └── core_test.clj /README: -------------------------------------------------------------------------------- 1 | # clojure-nltk 2 | 3 | Most of the functionality in the Python-based Natural Language Toolkit (NLTK) 4 | works in Jython (it has a few dependencies only available in CPython). However 5 | with some minor tweaks it is possible to use a sizable subset of NLTK in Jython, 6 | and by extension, in Clojure. 7 | 8 | ## Usage 9 | 10 | (ns clojure-nltk.core 11 | (:require [clojure-nltk.core :as nltk])) 12 | 13 | (nltk/init) ; initialize nltk 14 | 15 | ## Installation 16 | 17 | To include as a dependency: 18 | 19 | Copy the config section found at http://clojars.org/clojure-nltk into your 20 | dependencies in your project's project.clj. 21 | 22 | ## License 23 | 24 | Copyright (C) 2010-2012 Robert P. Levy 25 | 26 | Distributed under the Eclipse Public License, the same as Clojure. 27 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject clojure-nltk "2.0.3-clj-0" 2 | :description "Python's NLTK for Clojure (interop / partial port)." 
3 | :license {:name "Eclipse Public License" 4 | :url "http://www.eclipse.org/legal/epl-v10.html"} 5 | :dependencies [[org.clojure/clojure "1.4.0"] 6 | [clojure-python "0.4.1"]] 7 | :profiles {:dev {:dependencies [[midje "1.4.0"]]}} 8 | :plugins [[lein-midje "2.0.0"]]) 9 | -------------------------------------------------------------------------------- /resources/INSTALL.txt: -------------------------------------------------------------------------------- 1 | To install NLTK, run setup.py from an administrator account, e.g.: 2 | 3 | sudo python setup.py install 4 | 5 | For full installation instructions, please see http://nltk.github.com/install.html 6 | 7 | -------------------------------------------------------------------------------- /resources/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2001-2012 NLTK Project 2 | 3 | Licensed under the Apache License, Version 2.0 (the 'License'); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an 'AS IS' BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /resources/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt INSTALL.txt README.txt MANIFEST.in 2 | include setup.py distribute_setup.py 3 | include nltk/nltk.jar 4 | include nltk/test/*.doctest 5 | include nltk/VERSION 6 | recursive-include javasrc *.java *.txt Makefile 7 | global-exclude *~ 8 | -------------------------------------------------------------------------------- /resources/Makefile: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: source Makefile 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # Edward Loper 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | PYTHON = python 10 | VERSION = $(shell $(PYTHON) -c 'import nltk; print nltk.__version__' | sed '/^Warning: */d') 11 | NLTK_URL = $(shell $(PYTHON) -c 'import nltk; print nltk.__url__' | sed '/^Warning: */d') 12 | 13 | .PHONY: all clean clean_code 14 | 15 | all: dist 16 | 17 | ######################################################################## 18 | # TESTING 19 | ######################################################################## 20 | 21 | DOCTEST_DRIVER = nltk/test/doctest_driver.py 22 | DOCTEST_FLAGS = --ellipsis --normalize_whitespace 23 | DOCTEST_FILES = nltk/test/*.doctest 24 | DOCTEST_CODE_FILES = nltk/*.py nltk/*/*.py 25 | 26 | doctest: 27 | $(PYTHON) $(DOCTEST_DRIVER) $(DOCTEST_FLAGS) $(DOCTEST_FILES) 28 | 29 | doctest_code: 30 | $(PYTHON) $(DOCTEST_DRIVER) $(DOCTEST_FLAGS) $(DOCTEST_CODE_FILES) 31 | 32 | demotest: 33 | find nltk -name "*.py"\ 34 | -and -not -path *misc* \ 35 | -and -not -name brown_ic.py \ 36 | -exec echo ==== '{}' ==== \; -exec python '{}' \; 37 | 38 | ######################################################################## 39 | # JAVA 40 | ######################################################################## 41 | 42 | jar: nltk/nltk.jar 43 | 44 | JAVA_SRC = $(shell find javasrc/org/nltk 
-name '*.java') 45 | nltk/nltk.jar: $(JAVA_SRC) 46 | $(MAKE) -C javasrc jar 47 | cp javasrc/nltk.jar nltk/nltk.jar 48 | 49 | ######################################################################## 50 | # DISTRIBUTIONS 51 | ######################################################################## 52 | 53 | dist: zipdist gztardist windist 54 | 55 | gztardist: clean_code 56 | $(PYTHON) setup.py -q sdist --format=gztar 57 | zipdist: clean_code 58 | $(PYTHON) setup.py -q sdist --format=zip 59 | windist: clean_code 60 | $(PYTHON) setup.py -q bdist --format=wininst --plat-name=win32 61 | 62 | ######################################################################## 63 | # CLEAN 64 | ######################################################################## 65 | 66 | clean: clean_code 67 | rm -rf build iso dist api MANIFEST nltk-$(VERSION) nltk.egg-info 68 | $(MAKE) -C javasrc clean 69 | # rm -f nltk/nltk.jar 70 | 71 | clean_code: 72 | rm -f `find . -name '*.pyc'` 73 | rm -f `find . -name '*.pyo'` 74 | rm -f `find . -name '*~'` 75 | rm -f MANIFEST # regenerate manifest from MANIFEST.in 76 | -------------------------------------------------------------------------------- /resources/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Natural Language Toolkit (NLTK) http://www.nltk.org/ 2 | 3 | Copyright (C) 2001-2012 NLTK Project 4 | 5 | Bird, Steven, Edward Loper and Ewan Klein (2009). 6 | Natural Language Processing with Python. O'Reilly Media Inc. 7 | -------------------------------------------------------------------------------- /resources/README.md: -------------------------------------------------------------------------------- 1 | Natural Language Toolkit (NLTK) www.nltk.org 2 | ==================================== 3 | 4 | Authors 5 | ---------------- 6 | - Steven Bird 7 | - Edward Loper 8 | - Ewan Klein 9 | 10 | Copyright (C) 2001-2012 NLTK Project 11 | 12 | For license information, see LICENSE.txt 13 | 14 | NLTK -- the Natural Language Toolkit -- is a suite of open source 15 | Python modules, data sets and tutorials supporting research and 16 | development in Natural Language Processing. 17 | 18 | Documentation 19 | ------------------------ 20 | A substantial amount of documentation about NLTK is available: 21 | 22 | - The [NLTK website](http://nltk.org/) has information about the NLTK community. 23 | 24 | - The [NLTK Book](https://sites.google.com/site/naturallanguagetoolkit/book) covers a wide range of introductory topics in NLP, and 25 | shows how to do all the processing tasks using the toolkit. 26 | 27 | - The [API Documentation](http://nltk.github.com/api/) describes every module, 28 | interface, class, method, function, and variable in the toolkit. 29 | 30 | Mailing Lists 31 | -------------------- 32 | There are several mailing lists associated with NLTK: 33 | 34 | - [nltk](http://groups.google.com/group/nltk): Public information and announcements about NLTK (very low volume). 35 | - [nltk-users](http://groups.google.com/group/nltk-users): Discussions amongst NLTK users. 36 | - [nltk-dev](http://groups.google.com/group/nltk-dev): Discussions amongst NLTK developers. 37 | - [nltk-translation](http://groups.google.com/group/nltk-translation): Discussions about translating the NLTK book. 38 | 39 | 40 | Contributing 41 | ------------------ 42 | If you would like to contribute to NLTK, please post your ideas to nltk-dev, or [fork nltk on github](https://github.com/nltk/nltk). 
43 | 44 | Donating 45 | --------------- 46 | Have you found the toolkit helpful? Please support NLTK development 47 | by donating to the project via PayPal, using the link on the NLTK homepage. 48 | 49 | Redistributing 50 | ---------------------- 51 | NLTK source code is distributed under the Apache 2.0 License. 52 | NLTK documentation is distributed under the Creative Commons Attribution-Noncommercial-No Derivative Works 3.0 United States license. 53 | NLTK corpora are provided under the terms given in the README file for each corpus; all are redistributable, and available for non-commercial use. 54 | NLTK may be freely redistributed, subject to the provisions of these licenses. 55 | 56 | Citing 57 | --------- 58 | If you publish work that uses NLTK, please cite the NLTK book, as follows: 59 | 60 | Bird, Steven, Edward Loper and Ewan Klein (2009). 61 | Natural Language Processing with Python. O'Reilly Media Inc. 62 | -------------------------------------------------------------------------------- /resources/README.txt: -------------------------------------------------------------------------------- 1 | Natural Language Toolkit (NLTK) www.nltk.org 2 | 3 | Authors: Steven Bird 4 | Edward Loper 5 | Ewan Klein 6 | 7 | Copyright (C) 2001-2012 NLTK Project 8 | 9 | For license information, see LICENSE.txt 10 | 11 | NLTK -- the Natural Language Toolkit -- is a suite of open source 12 | Python modules, data sets and tutorials supporting research and 13 | development in Natural Language Processing. 14 | 15 | Documentation: A substantial amount of documentation about how 16 | to use NLTK, including a textbook and API documention, is 17 | available from the NLTK website: http://www.nltk.org/ 18 | 19 | - The book covers a wide range of introductory topics in NLP, and 20 | shows how to do all the processing tasks using the toolkit. 21 | 22 | - The toolkit's reference documentation describes every module, 23 | interface, class, method, function, and variable in the toolkit. 24 | This documentation should be useful to both users and developers. 25 | 26 | Mailing Lists: There are several mailing lists associated with NLTK: 27 | 28 | - nltk: Public information and announcements about NLTK (very low volume) 29 | http://groups.google.com/group/nltk 30 | - nltk-users: Discussions amongst NLTK users 31 | http://groups.google.com/group/nltk-users 32 | - nltk-dev: Discussions amongst NLTK developers 33 | http://groups.google.com/group/nltk-dev 34 | - nltk-translation: Discussions about translating the NLTK book 35 | http://groups.google.com/group/nltk-translation 36 | - nltk-commits: Subversion commit logs for NLTK 37 | http://groups.google.com/group/nltk-commits 38 | 39 | Contributing: If you would like to contribute to NLTK, 40 | please see http://www.nltk.org/contribute 41 | 42 | Donating: Have you found the toolkit helpful? Please support NLTK development 43 | by donating to the project via PayPal, using the link on the NLTK homepage. 44 | 45 | Redistributing: NLTK source code is distributed under the Apache 2.0 License. 46 | NLTK documentation is distributed under the Creative Commons 47 | Attribution-Noncommercial-No Derivative Works 3.0 United States license. 48 | NLTK corpora are provided under the terms given in the README file 49 | for each corpus; all are redistributable, and available for non-commercial use. 50 | NLTK may be freely redistributed, subject to the provisions of these licenses. 
51 | 52 | Citing: If you publish work that uses NLTK, please cite the NLTK book, as follows: 53 | 54 | Bird, Steven, Edward Loper and Ewan Klein (2009). 55 | Natural Language Processing with Python. O'Reilly Media Inc. 56 | -------------------------------------------------------------------------------- /resources/RELEASE-HOWTO: -------------------------------------------------------------------------------- 1 | Building an NLTK distribution 2 | ---------------------------------- 3 | 4 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 5 | @@@ BUILD 6 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 7 | 8 | A. PREPARATION 9 | 10 | 1. Check that installation instructions are up-to-date 11 | 2. Update the data index (make data_index) and commit 12 | 3. Update the ChangeLog (for nltk, nltk_data) 13 | git log --since=20XX-YY-ZZ 14 | 4. install the new version, since its the installed code that is checked 15 | 5. cd nltk/test; make (run the tests in nltk.test) 16 | 6. make demotest (run the demonstration code included in many modules) 17 | 18 | B. BUILD 19 | 20 | 1. Modify nltk/VERSION with the version number and commit 21 | 2. Make dist 22 | ?. (cd ../nltk_contrib; make dist???) 23 | 24 | D. RELEASE 25 | 26 | 1. Update the news page in nltk/web/news.rst 27 | 2. git tag -a 2.X.Y -m "version 2.X.Y" 28 | 3. sudo python setup.py register 29 | 4. Log in to http://pypi.python.org/pypi and upload distributions 30 | 5. post announcement to NLTK the mailing lists: 31 | nltk-dev (for beta releases) 32 | nltk (for final releases) 33 | 6. post announcement to external mailing lists, for major N.N releases only 34 | CORPORA@uib.no, linguist@linguistlist.org, 35 | PythonSIL@lists.sil.org, edu-sig@python.org 36 | mailing lists for any local courses using NLTK 37 | 38 | 39 | 40 | 41 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 42 | @@@ BOOK BUILD 43 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 44 | 45 | The build requires docutils, pdflatex, python imaging library, epydoc, 46 | cdrtools, ImageMagick 47 | 48 | 1. Check out a clean copy of the subversion repository (or make clean) 49 | and install locally with sudo python setup.py install; make clean 50 | 2. make doc (slow; see doc/ for the results) and commit 51 | 52 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 53 | @@@ INSTALL 54 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 55 | 56 | D. INSTALLATION 57 | 58 | 1. download and install new version on all machines 59 | 2. contact relevant sysads to install new version 60 | 3. copy dist directory to memory stick 61 | 62 | E. NEW VERSION NUMBER (optional) 63 | 64 | 1. update the version numbers in the repository so that builds 65 | off the repository don't have the same version as the release, 66 | e.g. after release 0.9.6, update repository version to 0.9.7a (alpha) 67 | -------------------------------------------------------------------------------- /resources/emacs/pycomplete.el: -------------------------------------------------------------------------------- 1 | ;;; Complete symbols at point using Pymacs. 2 | 3 | ;;; See pycomplete.py for the Python side of things and a short description 4 | ;;; of what to expect. 
5 | 6 | (require 'pymacs) 7 | (require 'python-mode) 8 | 9 | (pymacs-load "pycomplete") 10 | 11 | (defun py-complete () 12 | (interactive) 13 | (let ((pymacs-forget-mutability t)) 14 | (insert (pycomplete-pycomplete (py-symbol-near-point) 15 | (py-find-global-imports))))) 16 | 17 | (defun py-find-global-imports () 18 | (save-excursion 19 | (let (first-class-or-def imports) 20 | (goto-char (point-min)) 21 | (setq first-class-or-def 22 | (re-search-forward "^ *\\(def\\|class\\) " nil t)) 23 | (goto-char (point-min)) 24 | (setq imports nil) 25 | (while (re-search-forward 26 | "^\\(import \\|from \\([A-Za-z_][A-Za-z_0-9]*\\) import \\).*" 27 | nil t) 28 | (setq imports (append imports 29 | (list (buffer-substring 30 | (match-beginning 0) 31 | (match-end 0)))))) 32 | imports))) 33 | 34 | (define-key py-mode-map "\M-\C-i" 'py-complete) 35 | 36 | (provide 'pycomplete) 37 | -------------------------------------------------------------------------------- /resources/emacs/pycomplete.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Python dot expression completion using Pymacs. 4 | 5 | This almost certainly needs work, but if you add 6 | 7 | (require 'pycomplete) 8 | 9 | to your .xemacs/init.el file (untried w/ GNU Emacs so far) and have Pymacs 10 | installed, when you hit M-TAB it will try to complete the dot expression 11 | before point. For example, given this import at the top of the file: 12 | 13 | import time 14 | 15 | typing "time.cl" then hitting M-TAB should complete "time.clock". 16 | 17 | This is unlikely to be done the way Emacs completion ought to be done, but 18 | it's a start. Perhaps someone with more Emacs mojo can take this stuff and 19 | do it right. 20 | 21 | See pycomplete.el for the Emacs Lisp side of things. 22 | """ 23 | 24 | import sys 25 | import os.path 26 | 27 | try: 28 | x = set 29 | except NameError: 30 | from sets import Set as set 31 | else: 32 | del x 33 | 34 | def get_all_completions(s, imports=None): 35 | """Return contextual completion of s (string of >= zero chars). 36 | 37 | If given, imports is a list of import statements to be executed first. 38 | """ 39 | locald = {} 40 | if imports is not None: 41 | for stmt in imports: 42 | try: 43 | exec stmt in globals(), locald 44 | except TypeError: 45 | raise TypeError, "invalid type: %s" % stmt 46 | 47 | dots = s.split(".") 48 | if not s or len(dots) == 1: 49 | keys = set() 50 | keys.update(locald.keys()) 51 | keys.update(globals().keys()) 52 | import __builtin__ 53 | keys.update(dir(__builtin__)) 54 | keys = list(keys) 55 | keys.sort() 56 | if s: 57 | return [k for k in keys if k.startswith(s)] 58 | else: 59 | return keys 60 | 61 | sym = None 62 | for i in range(1, len(dots)): 63 | s = ".".join(dots[:i]) 64 | try: 65 | sym = eval(s, globals(), locald) 66 | except NameError: 67 | try: 68 | sym = __import__(s, globals(), locald, []) 69 | except ImportError: 70 | return [] 71 | if sym is not None: 72 | s = dots[-1] 73 | return [k for k in dir(sym) if k.startswith(s)] 74 | 75 | def pycomplete(s, imports=None): 76 | completions = get_all_completions(s, imports) 77 | dots = s.split(".") 78 | return os.path.commonprefix([k[len(dots[-1]):] for k in completions]) 79 | 80 | if __name__ == "__main__": 81 | print " ->", pycomplete("") 82 | print "sys.get ->", pycomplete("sys.get") 83 | print "sy ->", pycomplete("sy") 84 | print "sy (sys in context) ->", pycomplete("sy", imports=["import sys"]) 85 | print "foo. 
->", pycomplete("foo.") 86 | print "Enc (email * imported) ->", 87 | print pycomplete("Enc", imports=["from email import *"]) 88 | print "E (email * imported) ->", 89 | print pycomplete("E", imports=["from email import *"]) 90 | 91 | print "Enc ->", pycomplete("Enc") 92 | print "E ->", pycomplete("E") 93 | 94 | # Local Variables : 95 | # pymacs-auto-reload : t 96 | # End : 97 | -------------------------------------------------------------------------------- /resources/examples/grammars/Makefile: -------------------------------------------------------------------------------- 1 | # NLTK: Documentation Makefile 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Ewan Klein 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | DATADIR = ../../../nltk_data 9 | PUBLISH = $(DATADIR)/packages/grammars 10 | 11 | PACKAGE_DIRS = book_grammars sample_grammars #basque_grammars spanish_grammars 12 | PACKAGES := $(addsuffix .zip, $(PACKAGE_DIRS)) 13 | 14 | ZIP = zip 15 | 16 | define remove 17 | $(if $(wildcard $1), rm $1,) 18 | endef 19 | 20 | all: publish 21 | 22 | ci: 23 | git ci -m "updated grammar files" 24 | 25 | zip: clean $(PACKAGES) 26 | 27 | 28 | clean: 29 | $(call remove, *.zip) 30 | 31 | %.zip: % 32 | $(ZIP) -r $< $< 33 | git add *zip 34 | 35 | publish: zip 36 | cp $(PACKAGES) $(PUBLISH) 37 | $(MAKE) -C $(DATADIR) grammars 38 | $(MAKE) -C $(DATADIR) pkg_index 39 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque1.cfg: -------------------------------------------------------------------------------- 1 | P -> IS AS 2 | AS -> IS ADI 3 | AS -> ADI 4 | IS -> IM erl_atz 5 | IM -> ize_arr 6 | IM -> ize_izb 7 | ADI -> adt 8 | erl_atz -> "k" | "a" 9 | ize_arr -> "ardo" | "egunkari" | "baloi" 10 | ize_izb -> "Pintxo" | "Kepa" 11 | adt -> "dakar" | "darama" 12 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque1.fcfg: -------------------------------------------------------------------------------- 1 | % start AS 2 | # ############################ 3 | # Grammar Rules 4 | # ############################ 5 | # AS expansion rules 6 | AS[ergnum=?n1, absnum=?n2] -> IS[kas=erg, num=?n1] AS[ergnum=?n1, absnum=?n2] 7 | AS[ergnum=?n1, absnum=?n2] -> AS[ergnum=?n1, absnum=?n2] IS[kas=erg, num=?n1] 8 | AS[ergnum=?n1, absnum=?n2] -> IS[kas=abs, num=?n2] AS[ergnum=?n1, absnum=?n2] 9 | AS[ergnum=?n1, absnum=?n2] -> AS[ergnum=?n1, absnum=?n2] IS[kas=abs, num=?n2] 10 | IS[kas=?k, num=?n] -> ize[azp=arr] knmdek[kas=?k, num=?n] 11 | AS[ergnum=?n1, absnum=?n2] -> adt[ergnum=?n1, absnum=?n2] 12 | # ############################ 13 | # Lexicon 14 | # ############################ 15 | adt[ergnum=hu, absnum=hu] -> 'dakar' | 'darama' 16 | adt[ergnum=hk, absnum=hu] -> 'dakarte' | 'daramate' 17 | knmdek[kas=erg, num=hu] -> 'ak' 18 | knmdek[kas=erg, num=hk] -> 'ek' 19 | knmdek[kas=abs, num=hk] -> 'ak' 20 | knmdek[kas=abs, num=hu] -> 'a' 21 | ize[azp=arr] -> 'zakur' | 'gizon' 22 | 23 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque1.pcfg: -------------------------------------------------------------------------------- 1 | as -> mendekoa as [0.15] 2 | as -> adlg mendekoa as [0.31] 3 | as -> adlg adlg mendekoa as [0.08] 4 | as -> adi adl [0.46] 5 | mendekoa -> adlg mendekoa [0.37] 6 | mendekoa -> adlg adlg mendekoa [0.09] 7 | mendekoa -> 'joatea' [0.18] 8 | mendekoa 
-> 'joateko' [0.27] 9 | mendekoa -> 'sartzera' [0.09] 10 | adi -> 'esan' [0.5] 11 | adi -> 'debekatzen' [0.33] 12 | adi -> 'eraman' [0.17] 13 | adl -> 'zuen' [0.17] 14 | adl -> 'zioten' [0.83] 15 | adlg -> 'bozgorailuarekin' [0.28] 16 | adlg -> 'euskal_presoekin' [0.18] 17 | adlg -> 'epaitegian' [0.09] 18 | adlg -> 'mendira' [0.18] 19 | adlg -> 'ejertzitoan' [0.09] 20 | adlg -> 'derrigorrean' [0.09] 21 | adlg -> 'lagunekin' [0.09] 22 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque1.regexp: -------------------------------------------------------------------------------- 1 | NP: {+**} """ # adjetibo edo determinatzaileei loturiko izenak nahiz izen segidak topatzen ditu 2 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque2.cfg: -------------------------------------------------------------------------------- 1 | S -> is as 2 | is -> ize adj | ior 3 | ize -> 'gaizkile' | 'epaile' | 'bizilagun' 4 | adj -> 'gaiztoek' | 'gaiztoak' | 'kanpotarrak' | 'kanpotarrek' | 'berriak' | 'berriek' 5 | ior -> 'haiek' | 'hark' 6 | as -> mendekoa as | adlg mendekoa as | adlg adlg mendekoa as | adi adl 7 | mendekoa -> adlg mendekoa | adlg adlg mendekoa | 'joatea' | 'joateko' | 'sartzera' 8 | adi -> 'esan' | 'debekatzen' | 'eraman' 9 | adl -> 'zuen' |'zioten' 10 | adlg -> 'bozgorailuarekin' | 'euskal_presoekin' | 'epaitegian' | 'mendira' | 'ejertzitoan' | 'derrigorrean' | 'lagunekin' 11 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque2.fcfg: -------------------------------------------------------------------------------- 1 | % start S 2 | # ############################ 3 | # Grammar Rules 4 | # ############################ 5 | S -> IS[kas=erg] AS/IS 6 | # IS erregelak 7 | IS[kas=?k, num=?n] -> ize[azp=arr] knmdek[kas=?k, num=?n] 8 | IS[kas=?k, num=?n] -> ize[azp=ber] knmdek[kas=?k, num=?n] 9 | IS[kas=?k, num=?n]/IS -> 10 | # AS erregelak 11 | AS[ergnum=?n1, absnum=?n2]/?x -> IS[kas=abs, num=?n1]/?x AS[ergnum=?n1, absnum=?n2] 12 | AS[ergnum=?n1, absnum=?n2] -> adi adl[ergnum=?n1, absnum=?n2] 13 | # ############################ 14 | # Lexicon 15 | # ############################ 16 | knmdek[kas=erg, num=hu] -> 'ak' 17 | knmdek[kas=erg, num=hk] -> 'ek' 18 | knmdek[kas=abs, num=hk] -> 'ak' 19 | knmdek[kas=abs, num=hu] -> 'a' 20 | ize[azp=arr] -> 'bizilagun' | 'aita' | 'gizon' | 'emakume' 21 | ize[azp=ber] -> 'Kepa' | 'Ainara' 22 | adi -> 'ekarri' | 'eraman' | 'puskatu' | 'lapurtu' 23 | adl[ergnum=hu, absnum=hu] -> 'du' | 'zuen' 24 | adl[ergnum=hk, absnum=hu] -> 'dute' | 'zuten' 25 | adl[ergnum=hu, absnum=hk] -> 'ditu' | 'zituen' 26 | adl[ergnum=hk, absnum=hk] -> 'dituzte' | 'zituzten' 27 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque2.pcfg: -------------------------------------------------------------------------------- 1 | IS -> IZE_ARR [0.5] | IZE_ARR ADJ [0.3] | IS LOT IS [0.2] 2 | IZE_ARR -> 'gizon' [0.1] | 'emakume' [0.2] | 'ume' [0.3] | IZE_ARR LOT IZE_ARR [0.4] 3 | ADJ -> 'zaharrak' [0.4] | 'gazteak' [0.6] 4 | LOT -> 'eta' [0.9] | 'edo' [0.1] 5 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque2.regexp: -------------------------------------------------------------------------------- 1 | NP: 
{**} # adjetibo edo determinatzaileei loturiko izenak topatzen ditu 2 | NP: {+} # izen segidak topatzen ditu 3 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque3.cfg: -------------------------------------------------------------------------------- 1 | IS -> IZE_ARR | IZE_ARR ADJ | IS LOT IS 2 | IZE_ARR -> 'gizon' | 'emakume' | 'ume' | IZE_ARR LOT IZE_ARR 3 | ADJ -> 'zaharrak' | 'gazteak' 4 | LOT -> 'eta' | 'edo' 5 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque3.fcfg: -------------------------------------------------------------------------------- 1 | % start S 2 | # ############################ 3 | # Grammar Rules 4 | # ############################ 5 | 6 | ## NORK-NOR Kasuak 7 | 8 | S -> IS[kas=erg] AS/IS 9 | # IS erregelak 10 | IS[kas=?k, num=?n] -> ize[azp=arr] knmdek[kas=?k, num=?n] 11 | IS[kas=?k, num=?n] -> ize[azp=ber] knmdek[kas=?k, num=?n] 12 | 13 | IS[kas=?k, num=?n]/IS -> 14 | 15 | # AS erregelak 16 | AS[ergnum=?n1, absnum=?n2]/?x -> IS[kas=abs, num=?n1]/?x AS[ergnum=?n1, absnum=?n2] 17 | AS[ergnum=?n1, absnum=?n2] -> adi adl[ergnum=?n1, absnum=?n2] 18 | # ############################ 19 | # Lexicon 20 | # ############################ 21 | 22 | knmdek[kas=erg, num=hu] -> 'ak' 23 | knmdek[kas=erg, num=hk] -> 'ek' 24 | 25 | knmdek[kas=abs, num=hk] -> 'ak' 26 | knmdek[kas=abs, num=hu] -> 'a' 27 | 28 | ize[azp=arr] -> 'bizilagun' | 'aita' | 'gizon' | 'emakume' 29 | ize[azp=ber] -> 'Kepa' | 'Ainara' 30 | 31 | adi -> 'ekarri' | 'eraman' | 'puskatu' | 'lapurtu' 32 | 33 | adl[ergnum=hu, absnum=hu] -> 'du' | 'zuen' 34 | adl[ergnum=hk, absnum=hu] -> 'dute' | 'zuten' 35 | adl[ergnum=hu, absnum=hk] -> 'ditu' | 'zituen' 36 | adl[ergnum=hk, absnum=hk] -> 'dituzte' | 'zituzten' 37 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque3.regexp: -------------------------------------------------------------------------------- 1 | IS: 2 | {<.*>+} # Edozer Onartzen Duen Chunkerra 3 | }+{ # Chink Bezala Barneratu Aditzak (ADI.*, ADT.* eta ADL.*), Adberbioak (ADB.*), Preposizioak (POST.*), Loturak (LOT.*) Eta Puntuazio Ikurrak (PUNT.*) 4 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque4.regexp: -------------------------------------------------------------------------------- 1 | IS: {(**+**)*} #noun phrase chunks 2 | AS: {(*)+} # verb phrase chunks 3 | PS: {+} # prepositional phrase chunks 4 | -------------------------------------------------------------------------------- /resources/examples/grammars/basque_grammars/basque5.regexp: -------------------------------------------------------------------------------- 1 | IS: {(*****)*} #noun phrase chunks 2 | AS: {(+)+*} # verb phrase chunks 3 | PS: {+} # prepositional phrase chunks 4 | S: {} 5 | {} # Chunk NP, VP 6 | -------------------------------------------------------------------------------- /resources/examples/grammars/book_grammars/background.fol: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: background1.fol 2 | ## 3 | ## Illustration of simple knowledge base for use with inference tools. 4 | ## To accompany sem4.fcfg 5 | ## 6 | ## Author: Ewan Klein 7 | ## URL: 8 | ## For license information, see LICENSE.TXT 9 | 10 | all x. 
(boxerdog(x) -> dog(x)) 11 | all x. (boxer(x) -> person(x)) 12 | 13 | all x. (-(dog(x) & person(x))) 14 | 15 | all x. (married(x) <-> exists y. marry(x,y)) 16 | all x. (bark(x) -> dog(x)) 17 | 18 | all x. all y. (marry(x,y) -> (person(x) & person(y))) 19 | 20 | (-(Vincent = Mia)) 21 | (-(Vincent = Fido)) 22 | (-(Mia = Fido)) 23 | -------------------------------------------------------------------------------- /resources/examples/grammars/book_grammars/feat0.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: feat0.fcfg 2 | ## 3 | ## First example of a feature-based grammar for English, illustrating 4 | ## value-sharing of NUM and TENSE features. 5 | ## Used in Feature-Based Grammars chapter. 6 | ## 7 | ## Author: Ewan Klein 8 | ## URL: 9 | ## For license information, see LICENSE.TXT 10 | 11 | % start S 12 | # ################### 13 | # Grammar Productions 14 | # ################### 15 | 16 | # S expansion productions 17 | S -> NP[NUM=?n] VP[NUM=?n] 18 | 19 | # NP expansion productions 20 | NP[NUM=?n] -> N[NUM=?n] 21 | NP[NUM=?n] -> PropN[NUM=?n] 22 | NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n] 23 | NP[NUM=pl] -> N[NUM=pl] 24 | 25 | # VP expansion productions 26 | VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n] 27 | VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP 28 | 29 | # ################### 30 | # Lexical Productions 31 | # ################### 32 | 33 | Det[NUM=sg] -> 'this' | 'every' 34 | Det[NUM=pl] -> 'these' | 'all' 35 | Det -> 'the' | 'some' | 'several' 36 | 37 | PropN[NUM=sg]-> 'Kim' | 'Jody' 38 | 39 | N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child' 40 | N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children' 41 | 42 | IV[TENSE=pres, NUM=sg] -> 'disappears' | 'walks' 43 | TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes' 44 | 45 | IV[TENSE=pres, NUM=pl] -> 'disappear' | 'walk' 46 | TV[TENSE=pres, NUM=pl] -> 'see' | 'like' 47 | 48 | IV[TENSE=past] -> 'disappeared' | 'walked' 49 | TV[TENSE=past] -> 'saw' | 'liked' 50 | -------------------------------------------------------------------------------- /resources/examples/grammars/book_grammars/feat1.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: feat1.fcfg 2 | ## 3 | ## Second example of a feature-based grammar, illustrating 4 | ## SUBCAT and slash features. Also introduces SBar and embedded 5 | ## clauses. 6 | ## Used in Feature-Based Grammars chapter. 
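The feature-based grammars above, such as feat0.fcfg, are meant to be loaded into one of NLTK's feature chart parsers. A minimal sketch of doing so, assuming the book_grammars data package has been installed with nltk.download() and using the NLTK 2.x API that matches this snapshot (later releases replace nbest_parse with parse):

    # Sketch: load feat0.fcfg and parse a sentence with a feature chart parser.
    # Assumes the 'book_grammars' data package is installed so nltk.data can
    # resolve the grammar path.
    from nltk import load_parser

    cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=2)
    tokens = 'Kim likes children'.split()
    for tree in cp.nbest_parse(tokens):   # NLTK 2.x; later versions use cp.parse(tokens)
        print(tree)

The trace output shows the NUM and TENSE values being shared between the subject NP and the VP during unification, which is exactly what feat0.fcfg is meant to illustrate.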
7 | ## 8 | ## Author: Ewan Klein 9 | ## URL: 10 | ## For license information, see LICENSE.TXT 11 | 12 | % start S 13 | # ################### 14 | # Grammar Productions 15 | # ################### 16 | 17 | S[-INV] -> NP VP 18 | S[-INV]/?x -> NP VP/?x 19 | 20 | S[-INV] -> NP S/NP 21 | S[-INV] -> Adv[+NEG] S[+INV] 22 | 23 | S[+INV] -> V[+AUX] NP VP 24 | S[+INV]/?x -> V[+AUX] NP VP/?x 25 | 26 | SBar -> Comp S[-INV] 27 | SBar/?x -> Comp S[-INV]/?x 28 | 29 | VP -> V[SUBCAT=intrans, -AUX] 30 | 31 | VP -> V[SUBCAT=trans, -AUX] NP 32 | VP/?x -> V[SUBCAT=trans, -AUX] NP/?x 33 | 34 | VP -> V[SUBCAT=clause, -AUX] SBar 35 | VP/?x -> V[SUBCAT=clause, -AUX] SBar/?x 36 | 37 | VP -> V[+AUX] VP 38 | VP/?x -> V[+AUX] VP/?x 39 | 40 | # ################### 41 | # Lexical Productions 42 | # ################### 43 | V[SUBCAT=intrans, -AUX] -> 'walk' | 'sing' 44 | V[SUBCAT=trans, -AUX] -> 'see' | 'like' 45 | V[SUBCAT=clause, -AUX] -> 'say' | 'claim' 46 | V[+AUX] -> 'do' | 'can' 47 | 48 | NP[-WH] -> 'you' | 'cats' 49 | NP[+WH] -> 'who' 50 | 51 | Adv[+NEG] -> 'rarely' | 'never' 52 | 53 | NP/NP -> 54 | 55 | Comp -> 'that' 56 | -------------------------------------------------------------------------------- /resources/examples/grammars/book_grammars/german.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: german.fcfg 2 | ## 3 | ## Example of a feature-based grammar for German, illustrating 4 | ## CASE and AGR features (PER, GND, NUM) working as a bundle. 5 | ## Used in Feature-Based Grammars chapter. 6 | ## 7 | ## Author: Michaela Atterer 8 | ## Ewan Klein 9 | ## 10 | ## Plural transitive verbs productions by Jordan Boyd-Graber (ezubaric at users.sourceforge.net) 11 | 12 | % start S 13 | ##################### 14 | # Grammar Productions 15 | ##################### 16 | S -> NP[CASE=nom, AGR=?a] VP[AGR=?a] 17 | 18 | NP[CASE=?c, AGR=?a] -> PRO[CASE=?c, AGR=?a] 19 | NP[CASE=?c, AGR=?a] -> Det[CASE=?c, AGR=?a] N[CASE=?c, AGR=?a] 20 | 21 | VP[AGR=?a] -> IV[AGR=?a] 22 | VP[AGR=?a] -> TV[OBJCASE=?c, AGR=?a] NP[CASE=?c] 23 | 24 | ##################### 25 | # Lexical Productions 26 | ##################### 27 | # Singular determiners 28 | 29 | # masc 30 | Det[CASE=nom, AGR=[GND=masc,PER=3,NUM=sg]] -> 'der' 31 | Det[CASE=dat, AGR=[GND=masc,PER=3,NUM=sg]] -> 'dem' 32 | Det[CASE=acc, AGR=[GND=masc,PER=3,NUM=sg]] -> 'den' 33 | 34 | # fem 35 | Det[CASE=nom, AGR=[GND=fem,PER=3,NUM=sg]] -> 'die' 36 | Det[CASE=dat, AGR=[GND=fem,PER=3,NUM=sg]] -> 'der' 37 | Det[CASE=acc, AGR=[GND=fem,PER=3,NUM=sg]] -> 'die' 38 | 39 | # Plural determiners 40 | Det[CASE=nom, AGR=[PER=3,NUM=pl]] -> 'die' 41 | Det[CASE=dat, AGR=[PER=3,NUM=pl]] -> 'den' 42 | Det[CASE=acc, AGR=[PER=3,NUM=pl]] -> 'die' 43 | 44 | # Nouns 45 | N[AGR=[GND=masc,PER=3,NUM=sg]] -> 'Hund' 46 | N[CASE=nom, AGR=[GND=masc,PER=3,NUM=pl]] -> 'Hunde' 47 | N[CASE=dat, AGR=[GND=masc,PER=3,NUM=pl]] -> 'Hunden' 48 | N[CASE=acc, AGR=[GND=masc,PER=3,NUM=pl]] -> 'Hunde' 49 | 50 | N[AGR=[GND=fem,PER=3,NUM=sg]] -> 'Katze' 51 | N[AGR=[GND=fem,PER=3,NUM=pl]] -> 'Katzen' 52 | 53 | # Pronouns 54 | PRO[CASE=nom, AGR=[PER=1,NUM=sg]] -> 'ich' 55 | PRO[CASE=acc, AGR=[PER=1,NUM=sg]] -> 'mich' 56 | PRO[CASE=dat, AGR=[PER=1,NUM=sg]] -> 'mir' 57 | PRO[CASE=nom, AGR=[PER=2,NUM=sg]] -> 'du' 58 | PRO[CASE=nom, AGR=[PER=3,NUM=sg]] -> 'er' | 'sie' | 'es' 59 | PRO[CASE=nom, AGR=[PER=1,NUM=pl]] -> 'wir' 60 | PRO[CASE=acc, AGR=[PER=1,NUM=pl]] -> 'uns' 61 | PRO[CASE=dat, AGR=[PER=1,NUM=pl]] -> 'uns' 62 | PRO[CASE=nom, AGR=[PER=2,NUM=pl]] -> 'ihr' 63 | 
PRO[CASE=nom, AGR=[PER=3,NUM=pl]] -> 'sie' 64 | 65 | # Verbs 66 | IV[AGR=[NUM=sg,PER=1]] -> 'komme' 67 | IV[AGR=[NUM=sg,PER=2]] -> 'kommst' 68 | IV[AGR=[NUM=sg,PER=3]] -> 'kommt' 69 | IV[AGR=[NUM=pl, PER=1]] -> 'kommen' 70 | IV[AGR=[NUM=pl, PER=2]] -> 'kommt' 71 | IV[AGR=[NUM=pl, PER=3]] -> 'kommen' 72 | 73 | TV[OBJCASE=acc, AGR=[NUM=sg,PER=1]] -> 'sehe' | 'mag' 74 | TV[OBJCASE=acc, AGR=[NUM=sg,PER=2]] -> 'siehst' | 'magst' 75 | TV[OBJCASE=acc, AGR=[NUM=sg,PER=3]] -> 'sieht' | 'mag' 76 | TV[OBJCASE=dat, AGR=[NUM=sg,PER=1]] -> 'folge' | 'helfe' 77 | TV[OBJCASE=dat, AGR=[NUM=sg,PER=2]] -> 'folgst' | 'hilfst' 78 | TV[OBJCASE=dat, AGR=[NUM=sg,PER=3]] -> 'folgt' | 'hilft' 79 | TV[OBJCASE=acc, AGR=[NUM=pl,PER=1]] -> 'sehen' | 'moegen' 80 | TV[OBJCASE=acc, AGR=[NUM=pl,PER=2]] -> 'sieht' | 'moegt' 81 | TV[OBJCASE=acc, AGR=[NUM=pl,PER=3]] -> 'sehen' | 'moegen' 82 | TV[OBJCASE=dat, AGR=[NUM=pl,PER=1]] -> 'folgen' | 'helfen' 83 | TV[OBJCASE=dat, AGR=[NUM=pl,PER=2]] -> 'folgt' | 'helft' 84 | TV[OBJCASE=dat, AGR=[NUM=pl,PER=3]] -> 'folgen' | 'helfen' 85 | 86 | 87 | -------------------------------------------------------------------------------- /resources/examples/grammars/book_grammars/simple-sem.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sem3.fcfg 2 | ## 3 | ## Alternative simple grammar with transitive verbs and 4 | ## quantifiers for the book. 5 | ## 6 | ## Author: Ewan Klein 7 | ## URL: 8 | ## For license information, see LICENSE.TXT 9 | 10 | 11 | % start S 12 | ############################ 13 | # Grammar Rules 14 | ############################# 15 | 16 | S[SEM = ] -> NP[NUM=?n,SEM=?subj] VP[NUM=?n,SEM=?vp] 17 | 18 | NP[NUM=?n,SEM= ] -> Det[NUM=?n,SEM=?det] Nom[NUM=?n,SEM=?nom] 19 | NP[LOC=?l,NUM=?n,SEM=?np] -> PropN[LOC=?l,NUM=?n,SEM=?np] 20 | 21 | Nom[NUM=?n,SEM=?nom] -> N[NUM=?n,SEM=?nom] 22 | 23 | VP[NUM=?n,SEM=?v] -> IV[NUM=?n,SEM=?v] 24 | VP[NUM=?n,SEM=] -> TV[NUM=?n,SEM=?v] NP[SEM=?obj] 25 | VP[NUM=?n,SEM=] -> DTV[NUM=?n,SEM=?v] NP[SEM=?obj] PP[+TO,SEM=?pp] 26 | 27 | PP[+TO, SEM=?np] -> P[+TO] NP[SEM=?np] 28 | 29 | ############################# 30 | # Lexical Rules 31 | ############################# 32 | 33 | PropN[-LOC,NUM=sg,SEM=<\P.P(angus)>] -> 'Angus' 34 | PropN[-LOC,NUM=sg,SEM=<\P.P(cyril)>] -> 'Cyril' 35 | PropN[-LOC,NUM=sg,SEM=<\P.P(irene)>] -> 'Irene' 36 | 37 | Det[NUM=sg,SEM=<\P Q.all x.(P(x) -> Q(x))>] -> 'every' 38 | Det[NUM=pl,SEM=<\P Q.all x.(P(x) -> Q(x))>] -> 'all' 39 | Det[SEM=<\P Q.exists x.(P(x) & Q(x))>] -> 'some' 40 | Det[NUM=sg,SEM=<\P Q.exists x.(P(x) & Q(x))>] -> 'a' 41 | Det[NUM=sg,SEM=<\P Q.exists x.(P(x) & Q(x))>] -> 'an' 42 | 43 | N[NUM=sg,SEM=<\x.man(x)>] -> 'man' 44 | N[NUM=sg,SEM=<\x.girl(x)>] -> 'girl' 45 | N[NUM=sg,SEM=<\x.boy(x)>] -> 'boy' 46 | N[NUM=sg,SEM=<\x.bone(x)>] -> 'bone' 47 | N[NUM=sg,SEM=<\x.ankle(x)>] -> 'ankle' 48 | N[NUM=sg,SEM=<\x.dog(x)>] -> 'dog' 49 | N[NUM=pl,SEM=<\x.dog(x)>] -> 'dogs' 50 | 51 | IV[NUM=sg,SEM=<\x.bark(x)>,TNS=pres] -> 'barks' 52 | IV[NUM=pl,SEM=<\x.bark(x)>,TNS=pres] -> 'bark' 53 | IV[NUM=sg,SEM=<\x.walk(x)>,TNS=pres] -> 'walks' 54 | IV[NUM=pl,SEM=<\x.walk(x)>,TNS=pres] -> 'walk' 55 | TV[NUM=sg,SEM=<\X x.X(\y.chase(x,y))>,TNS=pres] -> 'chases' 56 | TV[NUM=pl,SEM=<\X x.X(\y.chase(x,y))>,TNS=pres] -> 'chase' 57 | TV[NUM=sg,SEM=<\X x.X(\y.see(x,y))>,TNS=pres] -> 'sees' 58 | TV[NUM=pl,SEM=<\X x.X(\y.see(x,y))>,TNS=pres] -> 'see' 59 | TV[NUM=sg,SEM=<\X x.X(\y.bite(x,y))>,TNS=pres] -> 'bites' 60 | TV[NUM=pl,SEM=<\X x.X(\y.bite(x,y))>,TNS=pres] -> 'bite' 
61 | DTV[NUM=sg,SEM=<\Y X x.X(\z.Y(\y.give(x,y,z)))>,TNS=pres] -> 'gives' 62 | DTV[NUM=pl,SEM=<\Y X x.X(\z.Y(\y.give(x,y,z)))>,TNS=pres] -> 'give' 63 | 64 | P[+to] -> 'to' 65 | 66 | -------------------------------------------------------------------------------- /resources/examples/grammars/book_grammars/sql0.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sql.fcfg 2 | ## 3 | ## Deliberately naive string-based grammar for 4 | ## deriving SQL queries from English 5 | ## 6 | ## Author: Ewan Klein 7 | ## URL: 8 | ## For license information, see LICENSE.TXT 9 | 10 | % start S 11 | 12 | S[SEM=(?np + WHERE + ?vp)] -> NP[SEM=?np] VP[SEM=?vp] 13 | 14 | VP[SEM=(?v + ?pp)] -> IV[SEM=?v] PP[SEM=?pp] 15 | VP[SEM=(?v + ?ap)] -> IV[SEM=?v] AP[SEM=?ap] 16 | NP[SEM=(?det + ?n)] -> Det[SEM=?det] N[SEM=?n] 17 | PP[SEM=(?p + ?np)] -> P[SEM=?p] NP[SEM=?np] 18 | AP[SEM=?pp] -> A[SEM=?a] PP[SEM=?pp] 19 | 20 | NP[SEM='Country="greece"'] -> 'Greece' 21 | NP[SEM='Country="china"'] -> 'China' 22 | 23 | Det[SEM='SELECT'] -> 'Which' | 'What' 24 | 25 | N[SEM='City FROM city_table'] -> 'cities' 26 | 27 | IV[SEM=''] -> 'are' 28 | A -> 'located' 29 | P[SEM=''] -> 'in' 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /resources/examples/grammars/book_grammars/sql1.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sql.fcfg 2 | ## 3 | ## Deliberately naive string-based grammar for 4 | ## deriving SQL queries from English 5 | ## 6 | ## Author: Ewan Klein 7 | ## URL: 8 | ## For license information, see LICENSE.TXT 9 | 10 | % start S 11 | 12 | S[SEM=(?np + WHERE + ?vp)] -> NP[SEM=?np] VP[SEM=?vp] 13 | 14 | VP[SEM=(?v + ?pp)] -> IV[SEM=?v] PP[SEM=?pp] 15 | VP[SEM=(?v + ?ap)] -> IV[SEM=?v] AP[SEM=?ap] 16 | VP[SEM=(?v + ?np)] -> TV[SEM=?v] NP[SEM=?np] 17 | VP[SEM=(?vp1 + ?c + ?vp2)] -> VP[SEM=?vp1] Conj[SEM=?c] VP[SEM=?vp2] 18 | 19 | NP[SEM=(?det + ?n)] -> Det[SEM=?det] N[SEM=?n] 20 | NP[SEM=(?n + ?pp)] -> N[SEM=?n] PP[SEM=?pp] 21 | NP[SEM=?n] -> N[SEM=?n] | CardN[SEM=?n] 22 | 23 | ## NB Numbers in the Chat-80 database represent thousands. 24 | CardN[SEM='1000'] -> '1,000,000' 25 | 26 | PP[SEM=(?p + ?np)] -> P[SEM=?p] NP[SEM=?np] 27 | AP[SEM=?pp] -> A[SEM=?a] PP[SEM=?pp] 28 | 29 | NP[SEM='Country="greece"'] -> 'Greece' 30 | NP[SEM='Country="china"'] -> 'China' 31 | 32 | Det[SEM='SELECT'] -> 'Which' | 'What' 33 | Conj[SEM='AND'] -> 'and' 34 | 35 | N[SEM='City FROM city_table'] -> 'cities' 36 | N[SEM='Population'] -> 'populations' 37 | 38 | IV[SEM=''] -> 'are' 39 | TV[SEM=''] -> 'have' 40 | A -> 'located' 41 | P[SEM=''] -> 'in' 42 | P[SEM='>'] -> 'above' 43 | 44 | 45 | -------------------------------------------------------------------------------- /resources/examples/grammars/book_grammars/storage.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: storage.fcfg 2 | ## 3 | ## Feature-based grammar that implements Cooper storage by dividing the 4 | ## semantics for each phrase into two pieces: the core semantics 5 | ## ('SEM','CORE') and a sequence of binding operators ('SEM','STORE'). 6 | ## Each binding operator is encoded as a logic term , 7 | ## where is a quantifier expression and the individual variable 8 | ## <@var> specifies the 'address' of the quantifier in the core 9 | ## semantics. and is a predicate describing that variable. 
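The sql0.fcfg and sql1.fcfg grammars above build a SQL string by concatenating the string-valued SEM features of the words in an English question. A minimal sketch of running such a query against the Chat-80 city database, assuming the book_grammars data and the city_database corpus are installed and using the NLTK 2.x tree.node API that matches this snapshot:

    # Sketch: translate an English question into SQL with sql0.fcfg and run it
    # against the Chat-80 city database (grammar and corpus assumed installed
    # via nltk.download()).
    from nltk import load_parser
    from nltk.sem import chat80

    cp = load_parser('grammars/book_grammars/sql0.fcfg')
    question = 'What cities are located in China'
    trees = cp.nbest_parse(question.split())   # NLTK 2.x; later versions use cp.parse()
    sql = ' '.join(trees[0].node['SEM'])       # later versions use trees[0].label()
    for row in chat80.sql_query('corpora/city_database/city.db', sql):
        print(row[0])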
10 | 11 | ## In order for this grammar to generate the correct results, all 12 | ## variables of the form <@var> must be instantiated (i.e., replaced 13 | ## by unique new variables) whenever they are used. This can be 14 | ## accomplished by using the InstantiateVarsChart class when parsing. 15 | ## 16 | ## Author: Edward Loper , 17 | ## Ewan Klein 18 | ## Robin Cooper 19 | ## URL: 20 | ## For license information, see LICENSE.TXT 21 | 22 | %start S 23 | 24 | S[SEM=[CORE=, STORE=(?b1+?b2)]] -> NP[SEM=[CORE=?subj, STORE=?b1]] VP[SEM=[CORE=?vp, STORE=?b2]] 25 | 26 | VP[SEM=?s] -> IV[SEM=?s] 27 | VP[SEM=[CORE=, STORE=(?b1+?b2)]] -> TV[SEM=[CORE=?v, STORE=?b1]] NP[SEM=[CORE=?obj, STORE=?b2]] 28 | VP[SEM=[CORE=, STORE=(?b1+?b2+?b3)]] -> DTV[SEM=[CORE=?v, STORE=?b1]] NP[SEM=[CORE=?obj, STORE=?b2]] PP[+TO, SEM=[CORE=?pp, STORE=?b3]] 29 | 30 | NP[SEM=[CORE=<@x>, STORE=(()+?b1+?b2)]] -> Det[SEM=[CORE=?det, STORE=?b1]] N[SEM=[CORE=?n, STORE=?b2]] 31 | 32 | PP[+TO, SEM=[CORE=?np, STORE=?b1]] -> P NP[SEM=[CORE=?np, STORE=?b1]] 33 | 34 | # Lexical items: 35 | Det[SEM=[CORE=<\Q P.exists x.(Q(x) & P(x))>, STORE=(/)]] -> 'a' 36 | Det[SEM=[CORE=<\Q P.all x.(Q(x) implies P(x))>, STORE=(/)]] -> 'every' 37 | 38 | N[SEM=[CORE=, STORE=(/)]] -> 'dog' 39 | N[SEM=[CORE=, STORE=(/)]] -> 'bone' 40 | N[SEM=[CORE=, STORE=(/)]] -> 'girl' 41 | N[SEM=[CORE=, STORE=(/)]] -> 'man' 42 | 43 | IV[SEM=[CORE=<\x.smile(x)>, STORE=(/)]] -> 'smiles' 44 | IV[SEM=[CORE=<\x.walk(x)>, STORE=(/)]] -> 'walks' 45 | 46 | TV[SEM=[CORE=<\y x.feed(x,y)>, STORE=(/)]] -> 'feeds' 47 | TV[SEM=[CORE=<\y x.chase(x,y)>, STORE=(/)]] -> 'chases' 48 | 49 | DTV[SEM=[CORE=<\z y x.give(x,y,z)>, STORE=(/)]] -> 'gives' 50 | 51 | NP[SEM=[CORE=<@x>, STORE=()]] -> 'Angus' 52 | NP[SEM=[CORE=<@x>, STORE=()]] -> 'Cyril' 53 | 54 | P[+TO] -> 'to' 55 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/background0.fol: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: background0.fol 2 | ## 3 | ## Illustration of simple knowledge base for use with inference tools. 4 | ## To accompany sem4.fcfg 5 | ## 6 | ## Author: Ewan Klein 7 | ## URL: 8 | ## For license information, see LICENSE.TXT 9 | 10 | all x. (boxerdog(x) -> dog(x)) 11 | all x. (boxer(x) -> person(x)) 12 | 13 | all x. (-(dog(x) & person(x))) 14 | 15 | some x. boxer(x) 16 | some x. boxerdog(x) 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/bindop.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sem0.fcfg 2 | ## 3 | ## Feature-based grammar that divides the semantics for each element 4 | ## into two pieces: the core semantics, with path ('SEM','CORE'), and a set of 5 | ## binding operators, with path ('SEM','BO'). Each binding operator is encoded 6 | ## as a lambda-calculus expression , specifying 7 | ## that <@var> is an individual variable that should be instantiated, 8 | ## and is an expression that can bind that variable. 9 | ## 10 | ## In order for this grammar to generate the correct results, all 11 | ## variables of the form <@var> must be instantiated (i.e., replaced 12 | ## by unique new variables) whenever they are used. This can be 13 | ## accomplished by using the InstantiateVarsChart class when parsing. 
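NLTK packages this parsing step in nltk.sem.cooper_storage, which parses with the InstantiateVarsChart mentioned above. A minimal sketch of retrieving the scoped readings licensed by storage.fcfg, hedged on the NLTK 2.x tree.node API that matches this snapshot:

    # Sketch: Cooper storage with storage.fcfg; parse, split the semantics into
    # its CORE and STORE parts, then retrieve the possible quantifier scopings.
    from nltk.sem import cooper_storage as cs

    trees = cs.parse_with_bindops('every girl chases a dog',
                                  grammar='grammars/book_grammars/storage.fcfg')
    semrep = trees[0].node['SEM']     # NLTK 2.x; later versions use trees[0].label()
    store = cs.CooperStore(semrep)
    store.s_retrieve(trace=False)     # pull each binding operator off the store
    for reading in store.readings:    # one formula per quantifier ordering
        print(reading)

Each reading differs only in the relative scope of the quantifiers pulled from the store.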
14 | ## 15 | ## Author: Edward Loper , 16 | ## Ewan Klein 17 | ## URL: 18 | ## For license information, see LICENSE.TXT 19 | 20 | %start S 21 | ## Grammar summary: 22 | ## S -> NP VP 23 | ## VP -> TV NP | IV 24 | ## NP -> Det N | proper nouns... 25 | ## TV -> transitive verbs... 26 | ## IV -> intransitive verbs... 27 | ## Det -> determiners... 28 | 29 | S[SEM=[CORE=, BO={?b1+?b2}]] -> NP[SEM=[CORE=?subj, BO=?b1]] VP[SEM=[CORE=?vp, BO=?b2]] 30 | 31 | VP[SEM=[CORE=, BO={?b1+?b2}]] -> TV[SEM=[CORE=?v, BO=?b1]] NP[SEM=[CORE=?obj, BO=?b2]] 32 | 33 | VP[SEM=?s] -> IV[SEM=?s] 34 | 35 | NP[SEM=[CORE=<@x>, BO={{}+?b1+?b2}]] -> Det[SEM=[CORE=?det, BO=?b1]] N[SEM=[CORE=?n, BO=?b2]] 36 | 37 | # Lexical items: 38 | Det[SEM=[CORE=<\Q P.exists x.(Q(x) & P(x))>, BO={/}]] -> 'a' 39 | N[SEM=[CORE=, BO={/}]] -> 'dog' | 'cat' | 'mouse' 40 | IV[SEM=[CORE=<\x.bark(x)>, BO={/}]] -> 'barks' | 'eats' | 'walks' 41 | TV[SEM=[CORE=<\x y.feed(y,x)>, BO={/}]] -> 'feeds' | 'walks' 42 | NP[SEM=[CORE=<@x>, BO={}]] -> 'john' | 'alex' 43 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/dep_test2.dep: -------------------------------------------------------------------------------- 1 | 1 John _ NNP _ _ 2 SUBJ _ _ 2 | 2 sees _ VB _ _ 0 ROOT _ _ 3 | 3 a _ DT _ _ 4 SPEC _ _ 4 | 4 dog _ NN _ _ 2 OBJ _ _ 5 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/event.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: event.fcfg 2 | ## 3 | ## Illustrating Davidson-style event semantics 4 | ## 5 | ## Author: Ewan Klein 6 | ## URL: 7 | ## For license information, see LICENSE.TXT 8 | 9 | % start S 10 | ############################ 11 | # Grammar Rules 12 | ############################# 13 | 14 | S[sem = ] -> NP[num=?n,sem=?subj] VP[num=?n,sem=?vp] 15 | 16 | NP[num=?n,sem= ] -> Det[num=?n,sem=?det] Nom[num=?n,sem=?nom] 17 | NP[loc=?l,num=?n,sem=?np] -> PropN[loc=?l,num=?n,sem=?np] 18 | 19 | Nom[num=?n,sem=?nom] -> N[num=?n,sem=?nom] 20 | Nom[num=?n,sem=] -> N[num=?n,sem=?nom] PP[sem=?pp] 21 | 22 | VP[num=?n,sem=?v] -> IV[num=?n,sem=?v] 23 | VP[num=?n,sem=] -> TV[num=?n,sem=?v] NP[sem=?obj] 24 | VP[num=?n,sem=] -> DTV[num=?n,sem=?v] NP[sem=?obj] PP[+to, sem=?pp] 25 | 26 | 27 | VP[num=?n,sem=] -> VP[num=?n,sem=?vp] PP[sem=?pp] 28 | VP[num=?n,sem=] -> VP[num=?n,sem=?vp] Adv[sem=?adv] 29 | 30 | PP[sem=] -> P[loc=?l,sem=?p] NP[loc=?l,sem=?np] 31 | 32 | ############################# 33 | # Lexical Rules 34 | ############################# 35 | 36 | PropN[-loc,num=sg,sem=<\e R.R(e,angus)>] -> 'Angus' 37 | PropN[-loc,num=sg,sem=<\e R.R(e,pat)>] -> 'Pat' 38 | PropN[-loc,num=sg,sem=<\e R.R(e,irene)>] -> 'Irene' 39 | PropN[-loc,num=sg,sem=<\e R.R(e,cyril)>] -> 'Cyril' 40 | PropN[+loc, num=sg,sem=<\e R.R(e,stockbridge)>] -> 'Stockbridge' 41 | 42 | NP[-loc, num=sg, sem=<\P.\x.P(x)>] -> 'who' 43 | 44 | Det[num=sg,sem=<\P R e.all x.(P(x) -> R(e,x))>] -> 'every' 45 | Det[num=pl,sem=<\P R e.all x.(P(x) -> R(e,x))>] -> 'all' 46 | Det[sem=<\P R e.exists x.(P(x) & R(e,x))>] -> 'some' 47 | Det[num=sg,sem=<\P R e.exists x.(P(x) & R(e,x))>] -> 'a' 48 | 49 | N[num=sg,sem=] -> 'boy' 50 | N[num=pl,sem=] -> 'boys' 51 | N[num=sg,sem=] -> 'girl' 52 | N[num=pl,sem=] -> 'girls' 53 | N[num=sg,sem=] -> 'bone' 54 | N[num=sg,sem=] -> 'dog' 55 | 56 | IV[num=sg,sem=<\e x.(bark(e) & agent(e,x))>,tns=pres] -> 'barks' 57 | IV[num=pl,sem=<\e x.(bark(e) & 
agent(e,x))>,tns=pres] -> 'bark' 58 | IV[num=sg,sem=<\e x.(walk(e) & agent(e,x))>,tns=pres] -> 'walks' 59 | IV[num=pl,sem=<\e x.( walk(e) & agent(e,x))>,tns=pres] -> 'walk' 60 | TV[num=sg,sem=<\X y.X(\e x.(chase(e) & agent(e,y) & patient(e,x)))>,tns=pres] -> 'chases' 61 | TV[num=pl,sem=<\X y.X(\e x.(chase(e) & agent(e,y) & patient(e,x)))>,tns=pres] -> 'chase' 62 | TV[num=sg,sem=<\X y.X(\e x.(see(e) & agent(e,y) & patient(e,x)))>,tns=pres] -> 'sees' 63 | TV[num=pl,sem=<\X y.X(\e x.(see(e) & agent(e,y) & patient(e,x)))>,tns=pres] -> 'see' 64 | DTV[num=sg,sem=<\Y X x.X(\z.Y(\e y.(give(e) & agent(e,x) & theme(e,y) & recip(e,z))))>,tns=pres] -> 'gives' 65 | DTV[num=pl,sem=<\Y X x.X(\z.Y(\e y.(give(e) & agent(e,x) & theme(e,y) & recip(e,z))))>,tns=pres] -> 'give' 66 | 67 | P[+loc,sem=<\X P e.X(\y.(P(e) & in(e,y)))>] -> 'in' 68 | P[-loc,sem=<\X P e.X(\y.(P(e) & with(e,y)))>] -> 'with' 69 | P[+to,sem=<\X.X>] -> 'to' 70 | 71 | Adv[sem=<\R e x.(slow(e) & R(e,x))>] -> 'slowly' 72 | Adv[sem=<\R e x.(thoughtful(e) & R(e,x))>] -> 'thoughtfully' 73 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/glue.semtype: -------------------------------------------------------------------------------- 1 | ######################################################################## 2 | # Glue Semantics Formulas Using Event Representation 3 | # 4 | # Entries are made up of three parts, separated by colons (":") 5 | # 6 | # 1) The semtype name. 7 | # - May appear multiple times with different relationship sets (3) 8 | # - May "extend" other semtypes: "type(parent)" 9 | # 10 | # 2) The glue formulas. 11 | # - A comma-separated list of tuples representing glue formulas 12 | # - If the entry is an extension, then the listed formulas will be added to 13 | # the list from the super type 14 | # 15 | # 3) The relationship set (OPTIONAL) 16 | # - If not specified, then assume the entry covers ALL relationship sets 17 | # - If the entry is an extension, then the relationship set dictates which 18 | # particular entry should be extended. If no relationship set is 19 | # specified, then every entry of the parent type is extended. 
20 | # 21 | ######################################################################## 22 | 23 | #Quantifiers 24 | def_art : (\P Q.exists x.(P(x) & all y.(Q(y) <-> (x = y))), ((super.v -o super.r) -o ((super.f -o super.var) -o super.var))) 25 | ex_quant : (\P Q.exists x.(P(x) & Q(x)), ((super.v -o super.r) -o ((super.f -o super.var) -o super.var))) 26 | univ_quant : (\P Q.all x.(P(x) -> Q(x)), ((super.v -o super.r) -o ((super.f -o super.var) -o super.var))) 27 | no_quant : (\P Q.-exists x.(P(x) & Q(x)), ((super.v -o super.r) -o ((super.f -o super.var) -o super.var))) 28 | 29 | #Nouns 30 | NN : (\x.(x), (v -o r)) : [spec] 31 | NN : (\P Q.exists x.(P(x) & Q(x)), ((v -o r) -o ((f -o var) -o var))), (\x.(x), (v -o r)) : [] # treat a noun missing its spec as implicitly existentially quantified 32 | NNP : (\P Q.exists x.(P(x) & Q(x)), ((v -o r) -o ((f -o var) -o var))), (\x.(x), (v -o r)) 33 | NNS(NN) 34 | PRP : (\P Q.exists x.(P(x) & Q(x)), ((v -o r) -o ((f -o var) -o var))), (\x.PRO(x), (v -o r)) 35 | 36 | #Verbs 37 | VB : (\x.(x), (subj -o f)) : [subj] #iv 38 | VB : (\x y.(x,y), (subj -o (obj -o f))) : [subj, obj] #tv 39 | VB : (\y.exists x.(x,y), (obj -o f)) : [obj] #incomplete tv 40 | VB : (\x y z.(x,y,z), (subj -o (obj -o (theme -o f)))) : [subj, obj, theme] #dtv 41 | VB : (\y z.exists x.(x,y,z), obj -o (theme -o f)) : [obj, theme] #incomplete dtv 42 | VB : (\x z.exists y.(x,y,z), subj -o (theme -o f)) : [subj, theme] #incomplete dtv 43 | VB : (\z.exists x y.(x,y,z), theme -o f) : [theme] #incomplete dtv 44 | VB : (\x y.(x,y), (subj -o (comp -o f))) : [subj, comp] #tv_comp 45 | VB : (\x P.(x,P), (subj -o ((xcomp.subj -o xcomp) -o f))) : [subj, xcomp] #equi 46 | VB : (\x y P.(x,y,P), (subj -o (obj -o ((xcomp.subj -o xcomp) -o f)))) : [subj, obj, xcomp] # object equi 47 | VB : (\P.(P), (xcomp -o f)) : [xcomp] #raising 48 | VBD(VB) : (\P.PAST(P), (f -o f)) 49 | VBZ(VB) 50 | 51 | #Modifiers 52 | nmod : (\Q P x.(P(x) & Q(x)), (f -o ((super.v -o super.r) -o (super.v -o super.r)))), (\x.(x), f) 53 | JJ(nmod) 54 | vmod : (\P.(P), (super.f -o super.f)) 55 | RB(vmod) 56 | tense : (\P.(P), (super.f -o super.f)) 57 | 58 | #Conjunctions 59 | cc_clause : (\P Q.(P & Q), (a -o (b -o f))) 60 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/glue_train.conll: -------------------------------------------------------------------------------- 1 | 1 John _ NNP _ _ 2 SUBJ _ _ 2 | 2 runs _ VB _ _ 0 ROOT _ _ 3 | 4 | 1 a _ DT _ _ 2 SPEC _ _ 5 | 2 man _ NN _ _ 3 SUBJ _ _ 6 | 3 runs _ VB _ _ 0 ROOT _ _ 7 | 8 | 1 John _ NNP _ _ 2 SUBJ _ _ 9 | 2 sees _ VB _ _ 0 ROOT _ _ 10 | 3 Mary _ NNP _ _ 2 OBJ _ _ 11 | 12 | 1 every _ DT _ _ 2 SPEC _ _ 13 | 2 girl _ NN _ _ 3 SUBJ _ _ 14 | 3 chases _ VB _ _ 0 ROOT _ _ 15 | 4 an _ DT _ _ 5 SPEC _ _ 16 | 5 animal _ NN _ _ 3 OBJ _ _ 17 | 18 | 1 Bill _ NNP _ _ 2 SUBJ _ _ 19 | 2 sees _ VB _ _ 0 ROOT _ _ 20 | 3 a _ DT _ _ 4 SPEC _ _ 21 | 4 dog _ NN _ _ 2 OBJ _ _ 22 | 23 | 1 every _ DT _ _ 2 SPEC _ _ 24 | 2 girl _ NN _ _ 3 SUBJ _ _ 25 | 3 chases _ VB _ _ 0 ROOT _ _ 26 | 4 John _ NNP _ _ 3 OBJ _ _ 27 | 28 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/hole.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: hole.fcfg 2 | ## 3 | ## Minimal feature-based grammar with lambda semantics for use by the hole.py 4 | ## module for Hole Semantics (see Blackburn and Bos). 
5 | ## 6 | ## Author: Dan Garrette 7 | ## Robin Cooper 8 | ## URL: 9 | ## For license information, see LICENSE.TXT 10 | 11 | % start S 12 | 13 | S[SEM=] -> NP[SEM=?subj] VP[SEM=?vp] 14 | VP[SEM=?v] -> IV[SEM=?v] 15 | VP[NUM=?n,SEM=] -> TV[NUM=?n,SEM=?v] NP[SEM=?obj] 16 | NP[SEM=] -> Det[SEM=?det] N[SEM=?n] 17 | 18 | Det[SEM=<\P Q h l.exists h1 l1 l2 l3 x.(ALL(l2,x,l3) & IMP(l3,l1,h1) & LEQ(l,h1) & LEQ(l2,h) & P(x)(h)(l1) & Q(x)(h)(l) & HOLE(h) & HOLE(h1) & LABEL(l) & LABEL(l1) & LABEL(l2) & LABEL(l3))>] -> 'every' 19 | Det[SEM=<\P Q h l.exists h1 l1 l2 l3 x.(EXISTS(l2,x,l3) & AND(l3,l1,h1) & LEQ(l,h1) & LEQ(l2,h) & P(x)(h)(l1) & Q(x)(h)(l) & HOLE(h) & HOLE(h1) & LABEL(l) & LABEL(l1) & LABEL(l2) & LABEL(l3))>] -> 'a' 20 | N[SEM=<\x h l.(PRED(l,girl,x) & LEQ(l,h) & HOLE(h) & LABEL(l))>] -> 'girl' 21 | N[SEM=<\x h l.(PRED(l,dog,x) & LEQ(l,h) & HOLE(h) & LABEL(l))>] -> 'dog' 22 | IV[SEM=<\x h l.(PRED(l,bark,x) & LEQ(l,h) & HOLE(h) & LABEL(l))>] -> 'barks' 23 | TV[SEM=<\P x.P(\y h l.(PRED(l,chase,x,y) & LEQ(l,h) & HOLE(h) & LABEL(l)))>] -> 'chases' 24 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/np.fcfg: -------------------------------------------------------------------------------- 1 | % start NP 2 | NP[AGR=?a] -> Det[AGR=?a] N[AGR=?a] 3 | Det[AGR=[NUM='sg', PER=3]] -> 'this' | 'that' 4 | Det[AGR=[NUM='pl', PER=3]] -> 'these' | 'those' 5 | Det[AGR=[NUM='pl', PER=1]] -> 'we' 6 | Det[AGR=[PER=2]] -> 'you' 7 | N[AGR=[NUM='sg', GND='m']] -> 'boy' 8 | N[AGR=[NUM='pl', GND='m']] -> 'boys' 9 | N[AGR=[NUM='sg', GND='f']] -> 'girl' 10 | N[AGR=[NUM='pl', GND='f']] -> 'girls' 11 | N[AGR=[NUM='sg']] -> 'student' 12 | N[AGR=[NUM='pl']] -> 'students' 13 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/sem0.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sem0.fcfg 2 | ## 3 | ## Minimal feature-based grammar with lambda semantics. 4 | ## 5 | ## Author: Ewan Klein 6 | ## URL: 7 | ## For license information, see LICENSE.TXT 8 | 9 | % start S 10 | 11 | S[SEM=] -> NP[SEM=?subj] VP[SEM=?vp] 12 | VP[SEM=?v] -> V[SEM=?v] 13 | NP[SEM=] -> 'Cyril' 14 | V[SEM=<\x.bark(x)>] -> 'barks' 15 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/sem1.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sem1.fcfg 2 | ## 3 | ## Minimal feature-based grammar to illustrate the interpretation of 4 | ## determiner phrases. 5 | ## 6 | ## Author: Ewan Klein 7 | ## URL: 8 | ## For license information, see LICENSE.TXT 9 | 10 | % start S 11 | 12 | S[SEM = ] -> NP[SEM=?subj] VP[SEM=?vp] 13 | VP[SEM=?v] -> IV[SEM=?v] 14 | NP[SEM=] -> Det[SEM=?det] N[SEM=?n] 15 | 16 | Det[SEM=<\Q P.exists x.(Q(x) & P(x))>] -> 'a' 17 | Det[SEM=<\Q P.all x.(Q(x) -> P(x))>] -> 'every' 18 | N[SEM=<\x.dog(x)>] -> 'dog' 19 | IV[SEM=<\x.bark(x)>] -> 'barks' 20 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/sem2.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sem2.fcfg 2 | ## 3 | ## Longer feature-based grammar with more quantifers, and illustrating 4 | ## transitive verbs and prepositional phrases (PPs). 
The 5 | ## interpretation of PPs is a bit weird and could do with further 6 | ## work. 7 | ## 8 | ## Author: Ewan Klein 9 | ## URL: 10 | ## For license information, see LICENSE.TXT 11 | 12 | % start S 13 | ############################ 14 | # Grammar Rules 15 | ############################# 16 | 17 | S[SEM = ] -> NP[NUM=?n,SEM=?subj] VP[NUM=?n,SEM=?vp] 18 | 19 | NP[NUM=?n,SEM= ] -> Det[NUM=?n,SEM=?det] Nom[NUM=?n,SEM=?nom] 20 | NP[LOC=?l,NUM=?n,SEM=?np] -> PropN[LOC=?l,NUM=?n,SEM=?np] 21 | 22 | Nom[NUM=?n,SEM=?nom] -> N[NUM=?n,SEM=?nom] 23 | Nom[NUM=?n,SEM=] -> N[NUM=?n,SEM=?nom] PP[SEM=?pp] 24 | 25 | VP[NUM=?n,SEM=] -> TV[NUM=?n,SEM=?v] NP[SEM=?obj] 26 | VP[NUM=?n,SEM=?v] -> IV[NUM=?n,SEM=?v] 27 | 28 | VP[NUM=?n,SEM=] -> VP[NUM=?n,SEM=?vp] PP[SEM=?pp] 29 | 30 | PP[SEM=] -> P[LOC=?l,SEM=?p] NP[LOC=?l,SEM=?np] 31 | 32 | ############################# 33 | # Lexical Rules 34 | ############################# 35 | 36 | PropN[-LOC,NUM=sg,SEM=<\P.P(john)>] -> 'John' 37 | PropN[-LOC,NUM=sg,SEM=<\P.P(mary)>] -> 'Mary' 38 | PropN[-LOC,NUM=sg,SEM=<\P.P(suzie)>] -> 'Suzie' 39 | PropN[-LOC,NUM=sg,SEM=<\P.P(fido)>] -> 'Fido' 40 | PropN[+LOC, NUM=sg,SEM=<\P.P(noosa)>] -> 'Noosa' 41 | 42 | NP[-LOC, NUM=sg, SEM=<\P.\x.P(x)>] -> 'who' 43 | 44 | Det[NUM=sg,SEM=<\P Q.all x.(P(x) -> Q(x))>] -> 'every' 45 | Det[NUM=pl,SEM=<\P Q.all x.(P(x) -> Q(x))>] -> 'all' 46 | Det[SEM=<\P Q.exists x.(P(x) & Q(x))>] -> 'some' 47 | Det[NUM=sg,SEM=<\P Q.exists x.(P(x) & Q(x))>] -> 'a' 48 | 49 | N[NUM=sg,SEM=<\x.boy(x)>] -> 'boy' 50 | N[NUM=pl,SEM=<\x.boy(x)>] -> 'boys' 51 | N[NUM=sg,SEM=<\x.girl(x)>] -> 'girl' 52 | N[NUM=pl,SEM=<\x.girl(x)>] -> 'girls' 53 | N[NUM=sg,SEM=<\x.dog(x)>] -> 'dog' 54 | N[NUM=pl,SEM=<\x.dog(x)>] -> 'dogs' 55 | 56 | TV[NUM=sg,SEM=<\X y.X(\x.chase(y,x))>,TNS=pres] -> 'chases' 57 | TV[NUM=pl,SEM=<\X y.X(\x.chase(y,x))>,TNS=pres] -> 'chase' 58 | TV[NUM=sg,SEM=<\X y.X(\x.see(y,x))>,TNS=pres] -> 'sees' 59 | TV[NUM=pl,SEM=<\X y.X(\x.see(y,x))>,TNS=pres] -> 'see' 60 | TV[NUM=sg,SEM=<\X y.X(\x.chase(y,x))>,TNS=pres] -> 'chases' 61 | TV[NUM=pl,SEM=<\X y.X(\x.chase(y,x))>,TNS=pres] -> 'chase' 62 | IV[NUM=sg,SEM=<\x.bark(x)>,TNS=pres] -> 'barks' 63 | IV[NUM=pl,SEM=<\x.bark(x)>,TNS=pres] -> 'bark' 64 | IV[NUM=sg,SEM=<\x.walk(x)>,TNS=pres] -> 'walks' 65 | IV[NUM=pl,SEM=<\x.walk(x)>,TNS=pres] -> 'walk' 66 | 67 | P[+LOC,SEM=<\X P x.X(\y.(P(x) & in(x,y)))>] -> 'in' 68 | P[-LOC,SEM=<\X P x.X(\y.(P(x) & with(x,y)))>] -> 'with' 69 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/sql.fcfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sql.fcfg 2 | ## 3 | ## Deliberately naive string-based grammar for 4 | ## deriving SQL queries from English 5 | ## 6 | ## Author: Ewan Klein 7 | ## URL: 8 | ## For license information, see LICENSE.TXT 9 | 10 | % start S 11 | 12 | S[sem=(?np + ?vp)] -> NP[sem=?np] VP[sem=?vp] 13 | 14 | VP[sem=(?v + ?pp)] -> IV[sem=?v] PP[sem=?pp] 15 | VP[sem=(?v + ?np)] -> TV[sem=?v] NP[sem=?np] 16 | 17 | NP[sem=(?det + ?n)] -> Det[sem=?det] N[sem=?n] 18 | NP[sem='Country="japan"'] -> 'Japan' 19 | NP[sem='Country="united_states"'] -> 'USA' 20 | 21 | Det[sem='SELECT'] -> 'Which' 22 | N[sem='City FROM city_table'] -> 'cities' 23 | 24 | IV[sem='WHERE'] -> 'are' 25 | PP[sem=?np] -> P[sem=?p] NP[sem=?np] 26 | P -> 'in' 27 | 28 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/toy.cfg: 
-------------------------------------------------------------------------------- 1 | S -> NP VP 2 | PP -> P NP 3 | NP -> Det N | NP PP 4 | VP -> V NP | VP PP 5 | Det -> 'a' | 'the' 6 | N -> 'dog' | 'cat' 7 | V -> 'chased' | 'sat' 8 | P -> 'on' | 'in' 9 | 10 | -------------------------------------------------------------------------------- /resources/examples/grammars/sample_grammars/valuation1.val: -------------------------------------------------------------------------------- 1 | john => b1 2 | mary => g1 3 | suzie => g2 4 | fido => d1 5 | tess => d2 6 | noosa => n 7 | girl => {g1, g2} 8 | boy => {b1, b2} 9 | dog => {d1, d2} 10 | bark => {d1, d2} 11 | walk => {b1, g2, d1} 12 | chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)} 13 | see => {(b1, g1), (b2, d2), (g1, b1),(d2, b1), (g2, n)} 14 | in => {(b1, n), (b2, n), (d2, n)} 15 | with => {(b1, g1), (g1, b1), (d1, b1), (b1, d1)} 16 | -------------------------------------------------------------------------------- /resources/examples/grammars/spanish_grammars/spanish1.cfg: -------------------------------------------------------------------------------- 1 | S -> SN SV 2 | SV -> v SN 3 | SV -> v 4 | SN -> det GN 5 | GN -> nom_com 6 | GN -> nom_prop 7 | det -> "el" | "la" | "los" | "las" | "un" | "una" | "unos" | "unas" 8 | nom_com -> "vecino" | "ladrones" | "mujeres" | "bosques" | "noche" | "flauta" | "ventana" 9 | nom_prop -> "Jose" | "Lucas" | "Pedro" | "Marta" 10 | v -> "toca" | "moja" | "adoran" | "robaron" | "escondieron" | "rompió" 11 | -------------------------------------------------------------------------------- /resources/examples/grammars/spanish_grammars/spanish1.fcfg: -------------------------------------------------------------------------------- 1 | % start S 2 | # ############################ 3 | # Grammar Rules 4 | # ############################ 5 | S -> SN[num=?n,gen=?g] SV[num=?n,tiempo=?t] 6 | SN[num=?n,gen=?g,+PROP] -> NP[num=?n] 7 | SN[num=?n,gen=?g,-PROP] -> DET[num=?n,gen=?g] NC[num=?n,gen=?g] 8 | SN[num=plural,gen=?g,-PROP] -> DET[num=plural,gen=?g] NC[num=plural,gen=?g] 9 | SV[tiempo=?t,num=?n] -> VI[tiempo=?t,num=?n] 10 | SV[tiempo=?t,num=?n] -> VT[tiempo=?t,num=?n] SN[-PROP] 11 | SV[tiempo=?t,num=?n] -> VT[tiempo=?t,num=?n] PREP SN 12 | # ############################ 13 | # Lexical Rules 14 | # ############################ 15 | DET[num=singular,gen=masculino] -> 'un' | 'el' 16 | DET[num=singular,gen=femenino] -> 'una' | 'la' 17 | DET[num=plural,gen=masculino] -> 'unos' | 'los' 18 | DET[num=plural,gen=femenino] -> 'unas' | 'las' 19 | PREP -> 'a' 20 | NP[num=singular] -> 'Miguel' | 'Sara' | 'Pedro' 21 | NC[num=singular,gen=masculino] -> 'perro' | 'gato' | 'vecino' | 'profesor' 22 | NC[num=singular,gen=femenino] -> 'perra' | 'gata' | 'vecina' | 'profesora' 23 | NC[num=plural,gen=masculino] -> 'perros' | 'gatos' | 'vecinos' | 'profesores' 24 | NC[num=plural,gen=femenino] -> 'perras' | 'gatas' | 'vecinas' | 'profesoras' 25 | VI[tiempo=pasado,num=singular] -> 'desaparecio' | 'anduvo' | 'murio' 26 | VI[tiempo=presente,num=singular] -> 'desaparece' | 'anda' | 'muere' 27 | VI[tiempo=pasado,num=plural] -> 'desaparecion' | 'anduvieron' | 'murieron' 28 | VI[tiempo=presente,num=plural] -> 'desaparecen' | 'andan' | 'mueren' 29 | VT[tiempo=pasado,num=singular] -> 'vio' | 'adoró' | 'gritó' | 'odio' 30 | VT[tiempo=presente,num=singular] -> 've' | 'adora' | 'grita' | 'odia' 31 | VT[tiempo=pasado,num=plural] -> 'vieron' | 'adoraron' | 'gritaron' | 'odiaron' 32 | VT[tiempo=presente,num=plural] -> 'ven' | 'adoran' | 'gritan' | 'odian' 
33 | -------------------------------------------------------------------------------- /resources/examples/grammars/spanish_grammars/spanish1.pcfg: -------------------------------------------------------------------------------- 1 | S -> SN SV [1.0] 2 | SV -> VTrans SN [0.4] 3 | SV -> VIntrans [0.3] 4 | SV -> VSupl SN SN [0.3] 5 | VTrans -> "bebió" [1.0] 6 | VIntrans -> "murió" [1.0] 7 | VSupl -> "regaló" [1.0] 8 | SN -> "flores" [0.6] 9 | SN -> "agua" [0.4] 10 | -------------------------------------------------------------------------------- /resources/examples/grammars/spanish_grammars/spanish1.regexp: -------------------------------------------------------------------------------- 1 | 2 | NP: {*+*} # busca determinantes y adjetivos que acompañen a nombres 3 | -------------------------------------------------------------------------------- /resources/examples/grammars/spanish_grammars/spanish2.cfg: -------------------------------------------------------------------------------- 1 | S -> SN SV 2 | SP -> P SN 3 | SN -> Det N | SN SP 4 | SV -> V SN | SV SP 5 | Det -> "el" | "la" | "un" | "una" | "los" | "las" 6 | N -> "tren" | "telescopio" | "noticia" | "mesa" | "hombre" | "casa" | "amiga" 7 | V -> "vio" | "leí" | "encontró" 8 | P -> "en" | "sobre" | "con" | "de" | "a" 9 | -------------------------------------------------------------------------------- /resources/examples/grammars/spanish_grammars/spanish2.fcfg: -------------------------------------------------------------------------------- 1 | % start S 2 | # ############################ 3 | # Grammar Rules 4 | # ############################ 5 | S -> SN S/SN 6 | S/?x -> SV/?x 7 | S/?x -> V[+aux] COMP SV/?x 8 | SN/SN -> 9 | SV/?x -> V[-aux] SN/?x 10 | # ############################ 11 | # Lexical Rules 12 | # ############################ 13 | V[-aux] -> 'adoras' | 'odias' 14 | V[+aux] -> 'dices' 15 | 16 | SN -> 'quien' | 'que' 17 | COMP -> 'que' 18 | -------------------------------------------------------------------------------- /resources/examples/grammars/spanish_grammars/spanish2.pcfg: -------------------------------------------------------------------------------- 1 | SN -> N [0.5]| N Adj [0.3]| SN Conj SN [0.2] 2 | N -> 'hombres' [0.1]| 'mujeres' [0.2]| 'niños' [0.3]| N Conj N [0.4] 3 | Adj -> 'mayores' [0.3]| 'jovenes' [0.7] 4 | Conj -> 'y' [0.6]| 'o' [0.3] | 'e' [0.1] 5 | -------------------------------------------------------------------------------- /resources/examples/grammars/spanish_grammars/spanish2.regexp: -------------------------------------------------------------------------------- 1 | 2 | NP: {**} # Busca det + nombre + adjetivo 3 | NP: {*+} # Busca seguidas de nombres 4 | 5 | -------------------------------------------------------------------------------- /resources/examples/grammars/spanish_grammars/spanish3.cfg: -------------------------------------------------------------------------------- 1 | SN -> N | N Adj | SN Conj SN 2 | N -> 'hombres' | 'mujeres' | 'niños' | N Conj N 3 | Adj -> 'mayores' | 'jovenes' 4 | Conj -> 'y' | 'o' | 'e' 5 | -------------------------------------------------------------------------------- /resources/examples/grammars/spanish_grammars/spanish3.regexp: -------------------------------------------------------------------------------- 1 | 2 | SN: 3 | {<.*>+} # Crea Un Chunk Con Cualquier Cosa 4 | }+{ # Considerar Como Chink Apariciones De Verbos (v.*), Preposiciones (sp.*) y Signos De Puntuación (F.*) 5 | -------------------------------------------------------------------------------- 
/resources/examples/grammars/spanish_grammars/spanish4.regexp: -------------------------------------------------------------------------------- 1 | 2 | SN: {?+*} # noun phrase chunks 3 | SV: {?} # verb phrase chunks 4 | SP: {} # prepositional phrase chunks 5 | 6 | -------------------------------------------------------------------------------- /resources/examples/grammars/spanish_grammars/spanish5.regexp: -------------------------------------------------------------------------------- 1 | 2 | SN: {?+*} # noun phrase chunks 3 | SV: {?+*} # verb phrase chunks 4 | SP: {} # prepositional phrase chunks 5 | S: {} # Chunk NP, VP 6 | 7 | -------------------------------------------------------------------------------- /resources/examples/school/README: -------------------------------------------------------------------------------- 1 | The files in this directory were created for teaching computational 2 | linguistics in secondary school English classes. For instructions 3 | and lesson plans, please see http://nltk.org/index.php/Electronic_Grammar 4 | -------------------------------------------------------------------------------- /resources/examples/school/count.py: -------------------------------------------------------------------------------- 1 | from words import * 2 | words = read_words('corpus/telephone.txt') 3 | counts = count_words(words) 4 | print_freq(counts) 5 | 6 | 7 | 8 | 9 | from words import * 10 | words = read_words('corpus/rural.txt') 11 | counts = count_pairs(words) 12 | print_freq(counts) 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /resources/examples/school/generate.py: -------------------------------------------------------------------------------- 1 | from words import * 2 | 3 | telephone_words = read_words('corpus/telephone.txt') 4 | model = train(telephone_words) 5 | generate(model) 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /resources/examples/school/parse1.py: -------------------------------------------------------------------------------- 1 | from parser import * 2 | 3 | grammar = """ 4 | NP -> P | D J N 5 | D -> 'a' 6 | J -> 'red' | 'green' 7 | N -> 'chair' | 'house' 8 | """ 9 | 10 | phrase = 'a red chair' 11 | 12 | parse_draw(phrase, grammar) 13 | 14 | 15 | -------------------------------------------------------------------------------- /resources/examples/school/parse2.py: -------------------------------------------------------------------------------- 1 | from parser import * 2 | 3 | grammar = """ 4 | S -> NP VP | VP 5 | VP -> V NP | VP PP 6 | NP -> Det N | NP PP 7 | PP -> P NP 8 | NP -> 'I' 9 | Det -> 'the' | 'my' 10 | N -> 'elephant' | 'pajamas' 11 | V -> 'shot' 12 | P -> 'in' 13 | """ 14 | 15 | sent = 'I shot the elephant in my pajamas' 16 | parse_draw(sent, grammar) 17 | 18 | -------------------------------------------------------------------------------- /resources/examples/school/parse3.py: -------------------------------------------------------------------------------- 1 | from parser import * 2 | 3 | grammar = """ 4 | S -> NP VP | VP 5 | PP -> P NP 6 | NP -> N | Det N | N N | NP PP | N VP 7 | VP -> V | V NP | VP PP | VP ADVP 8 | ADVP -> ADV NP 9 | Det -> 'a' | 'an' | 'the' 10 | N -> 'flies' | 'banana' | 'fruit' | 'arrow' | 'time' 11 | V -> 'like' | 'flies' | 'time' 12 | P -> 'on' | 'in' | 'by' 13 | ADV -> 'like' 14 | """ 15 | 16 | sent = 'time flies like an arrow' 17 | 18 | parse_draw(sent, grammar) 19 | 20 | 21 | 
22 | 23 | -------------------------------------------------------------------------------- /resources/examples/school/parser.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | def parse(sent, grammar): 4 | gr = nltk.parse_cfg(grammar) 5 | parser = nltk.parse.ChartParse(gr, nltk.parse.TD_STRATEGY) 6 | return parser.get_parse_list(sent.split()) 7 | 8 | def parse_draw(sent, grammar): 9 | trees = parse(sent, grammar) 10 | nltk.draw.draw_trees(*trees) 11 | 12 | def parse_print(sent, grammar): 13 | trees = parse(sent, grammar) 14 | for tree in trees: 15 | print tree 16 | 17 | -------------------------------------------------------------------------------- /resources/examples/school/search.py: -------------------------------------------------------------------------------- 1 | from words import * 2 | words = read_text('corpus/telephone.txt') 3 | concordance(" um", words) 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /resources/examples/semantics/chat.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/examples/semantics/chat.db -------------------------------------------------------------------------------- /resources/examples/semantics/chat_sentences: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Demo Sentences 2 | # 3 | # Author: Ewan Klein 4 | # URL: 5 | # For license information, see LICENSE.TXT 6 | ############################################ 7 | # Some example sentences for the Chat-80 demo 8 | 9 | what is the capital of France 10 | which sea borders France 11 | what contains Berlin 12 | which Asian countries border the_Mediterranean 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /resources/examples/semantics/demo_sentences: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Demo Sentences 2 | # 3 | # Author: Ewan Klein 4 | # URL: 5 | # For license information, see LICENSE.TXT 6 | ############################################ 7 | # Some example sentences for the sem2.cfg demo 8 | 9 | Fido sees a boy with Mary 10 | John sees Mary 11 | every girl chases a dog 12 | every boy chases a girl 13 | John walks with a girl in Noosa 14 | who walks 15 | -------------------------------------------------------------------------------- /resources/examples/semantics/model0.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Example Model 2 | # 3 | # Author: Ewan Klein 4 | # URL: 5 | # For license information, see LICENSE.TXT 6 | 7 | """ 8 | This is a sample model to accompany the U{sem2.cfg} grammar, and is 9 | intended to be imported as a module. 
10 | """ 11 | 12 | from nltk.semantics import * 13 | 14 | val = Valuation() 15 | #Initialize a valuation of non-logical constants.""" 16 | 17 | v = [('john', 'b1'), 18 | ('mary', 'g1'), 19 | ('suzie', 'g2'), 20 | ('fido', 'd1'), 21 | ('tess', 'd2'), 22 | ('noosa', 'n'), 23 | ('girl', set(['g1', 'g2'])), 24 | ('boy', set(['b1', 'b2'])), 25 | ('dog', set(['d1', 'd2'])), 26 | ('bark', set(['d1', 'd2'])), 27 | ('walk', set(['b1', 'g2', 'd1'])), 28 | ('chase', set([('b1', 'g1'), ('b2', 'g1'), ('g1', 'd1'), ('g2', 'd2')])), 29 | ('see', set([('b1', 'g1'), ('b2', 'd2'), ('g1', 'b1'),('d2', 'b1'), ('g2', 'n')])), 30 | ('in', set([('b1', 'n'), ('b2', 'n'), ('d2', 'n')])), 31 | ('with', set([('b1', 'g1'), ('g1', 'b1'), ('d1', 'b1'), ('b1', 'd1')])) 32 | ] 33 | 34 | 35 | #Read in the data from C{v} 36 | val.read(v) 37 | 38 | #Bind C{dom} to the C{domain} property of C{val} 39 | dom = val.domain 40 | 41 | #Initialize a model with parameters C{dom} and C{val}. 42 | m = Model(dom, val) 43 | 44 | #Initialize a variable assignment with parameter C{dom} 45 | g = Assignment(dom) 46 | -------------------------------------------------------------------------------- /resources/examples/semantics/model1.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Example Model 2 | # 3 | # Author: Ewan Klein 4 | # URL: 5 | # For license information, see LICENSE.TXT 6 | 7 | """ 8 | This is a sample model to accompany the U{chat80.cfg} grammar} and is 9 | intended to be imported as a module. 10 | """ 11 | 12 | from nltk.semantics import * 13 | from nltk.corpora import chat80 14 | 15 | rels = chat80.rels 16 | concept_map = chat80.process_bundle(rels) 17 | concepts = concept_map.values() 18 | val = chat80.make_valuation(concepts, read=True) 19 | 20 | #Bind C{dom} to the C{domain} property of C{val}. 21 | dom = val.domain 22 | 23 | #Initialize a model with parameters C{dom} and C{val}. 24 | m = Model(dom, val) 25 | 26 | #Initialize a variable assignment with parameter C{dom}. 27 | g = Assignment(dom) 28 | -------------------------------------------------------------------------------- /resources/examples/semantics/sem0.cfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sem0.cfg 2 | ## 3 | ## Minimal feature-based grammar with lambda semantics. 4 | ## 5 | ## Author: Ewan Klein 6 | ## URL: 7 | ## For license information, see LICENSE.TXT 8 | 9 | % start S 10 | 11 | S[sem = ] -> NP[sem=?subj] VP[sem=?vp] 12 | VP[sem=?v] -> V[sem=?v] 13 | NP[sem=] -> 'John' 14 | V[sem=<\x.(walk x)>] -> 'walks' 15 | -------------------------------------------------------------------------------- /resources/examples/semantics/sem1.cfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sem1.cfg 2 | ## 3 | ## Minimal feature-based grammar to illustrate the interpretation of 4 | ## determiner phrases. 5 | ## 6 | ## Author: Ewan Klein 7 | ## URL: 8 | ## For license information, see LICENSE.TXT 9 | 10 | % start S 11 | 12 | S[sem = ] -> NP[sem=?subj] VP[sem=?vp] 13 | VP[sem=?v] -> IV[sem=?v] 14 | NP[sem=] -> Det[sem=?det] N[sem=?n] 15 | 16 | Det[sem=<\Q P. some x. 
((Q x) and (P x))>] -> 'a' 17 | N[sem=] -> 'dog' 18 | IV[sem=<\x.(bark x)>] -> 'barks' 19 | -------------------------------------------------------------------------------- /resources/examples/semantics/sem2.cfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sem2.cfg 2 | ## 3 | ## Longer feature-based grammar with more quantifers, and illustrating 4 | ## transitive verbs and prepositional phrases (PPs). The 5 | ## interpretation of PPs is a bit weird and could do with further 6 | ## work. 7 | ## 8 | ## Author: Ewan Klein 9 | ## URL: 10 | ## For license information, see LICENSE.TXT 11 | 12 | % start S 13 | ############################ 14 | # Grammar Rules 15 | ############################# 16 | 17 | S[sem = ] -> NP[num=?n,sem=?subj] VP[num=?n,sem=?vp] 18 | 19 | NP[num=?n,sem= ] -> Det[num=?n,sem=?det] Nom[num=?n,sem=?nom] 20 | NP[loc=?l,num=?n,sem=?np] -> PropN[loc=?l,num=?n,sem=?np] 21 | 22 | Nom[num=?n,sem=?nom] -> N[num=?n,sem=?nom] 23 | Nom[num=?n,sem=] -> N[num=?n,sem=?nom] PP[sem=?pp] 24 | 25 | VP[num=?n,sem=] -> TV[num=?n,sem=?v] NP[sem=?obj] 26 | VP[num=?n,sem=?v] -> IV[num=?n,sem=?v] 27 | 28 | VP[num=?n,sem=] -> VP[num=?n,sem=?vp] PP[sem=?pp] 29 | 30 | PP[sem=] -> P[loc=?l,sem=?p] NP[loc=?l,sem=?np] 31 | 32 | ############################# 33 | # Lexical Rules 34 | ############################# 35 | 36 | PropN[-loc,num=sg,sem=<\P.(P john)>] -> 'John' 37 | PropN[-loc,num=sg,sem=<\P.(P mary)>] -> 'Mary' 38 | PropN[-loc,num=sg,sem=<\P.(P suzie)>] -> 'Suzie' 39 | PropN[-loc,num=sg,sem=<\P.(P fido)>] -> 'Fido' 40 | PropN[+loc, num=sg,sem=<\P.(P noosa)>] -> 'Noosa' 41 | 42 | NP[-loc, num=sg, sem=<\P.\x.(P x)>] -> 'who' 43 | 44 | Det[num=sg,sem=<\P Q. all x. ((P x) implies (Q x))>] -> 'every' 45 | Det[num=pl,sem=<\P Q. all x. ((P x) implies (Q x))>] -> 'all' 46 | Det[sem=<\P Q. some x. ((P x) and (Q x))>] -> 'some' 47 | Det[num=sg,sem=<\P Q. some x. ((P x) and (Q x))>] -> 'a' 48 | 49 | N[num=sg,sem=] -> 'boy' 50 | N[num=pl,sem=] -> 'boys' 51 | N[num=sg,sem=] -> 'girl' 52 | N[num=pl,sem=] -> 'girls' 53 | N[num=sg,sem=] -> 'dog' 54 | N[num=pl,sem=] -> 'dogs' 55 | 56 | TV[num=sg,sem=<\X y. (X \x. (chase x y))>,tns=pres] -> 'chases' 57 | TV[num=pl,sem=<\X y. (X \x. (chase x y))>,tns=pres] -> 'chase' 58 | TV[num=sg,sem=<\X y. (X \x. (see x y))>,tns=pres] -> 'sees' 59 | TV[num=pl,sem=<\X y. (X \x. (see x y))>,tns=pres] -> 'see' 60 | TV[num=sg,sem=<\X y. (X \x. (chase x y))>,tns=pres] -> 'chases' 61 | TV[num=pl,sem=<\X y. (X \x. (chase x y))>,tns=pres] -> 'chase' 62 | IV[num=sg,sem=<\x. (bark x)>,tns=pres] -> 'barks' 63 | IV[num=pl,sem=<\x. (bark x)>,tns=pres] -> 'bark' 64 | IV[num=sg,sem=<\x. (walk x)>,tns=pres] -> 'walks' 65 | IV[num=pl,sem=<\x. (walk x)>,tns=pres] -> 'walk' 66 | 67 | P[+loc,sem=<\X P x. (X \y. ((P x) and (in y x)))>] -> 'in' 68 | P[-loc,sem=<\X P x. (X \y. ((P x) and (with y x)))>] -> 'with' 69 | -------------------------------------------------------------------------------- /resources/examples/semantics/sem3.cfg: -------------------------------------------------------------------------------- 1 | ## Natural Language Toolkit: sem3.cfg 2 | ## 3 | ## First attempt at HPSG-style feature-based semantics. 4 | ## This version doesn't work properly! 
5 | ## 6 | ## Author: Ewan Klein 7 | ## URL: 8 | ## For license information, see LICENSE.TXT 9 | 10 | % start S 11 | 12 | S[sem=?vp] -> NP[sem=?np] VP[subj=?np, sem=?vp] 13 | VP[sem=?v, subj=?np] -> IV[sem=?v, subj=?np] 14 | NP[sem=[index='k',name='kim']] -> 'Kim' 15 | IV[sem=[rel='bark', arg=?i], subj=[sem=[index=?i]]] -> 'barks' 16 | #IV[fsem=[rel='bark', arg=(1)[]], subj=[fsem=[index->(1)]]] -> 'barks' 17 | 18 | -------------------------------------------------------------------------------- /resources/javasrc/Makefile: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: java interface code Makefile 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | # Dependencies. 9 | MALLET_HOME = /usr/local/mallet-0.4 10 | 11 | # Locate the NLTK java source code 12 | JAVA_SRC = $(shell find org/nltk -name '*.java') 13 | JAVA_CLS = $(JAVA_SRC:.java=.class) 14 | 15 | # Set up java. 16 | JAVAC=javac 17 | CLASSPATH = .:$(MALLET_HOME)/class/:$(MALLET_HOME)/lib/mallet-deps.jar:$(MALLET_HOME)/lib/mallet.jar 18 | 19 | ######################################################################## 20 | # Targets 21 | ######################################################################## 22 | 23 | .PHONY: find-mallet javac clean jar jar2 24 | 25 | jar: find-mallet nltk.jar 26 | 27 | find-mallet: 28 | @if [ -d $(MALLET_HOME) ]; then \ 29 | echo "Found Mallet: $(MALLET_HOME)"; \ 30 | else \ 31 | echo; \ 32 | echo "Unable to locate required Mallet dependencies. Use:"; \ 33 | echo " make MALLET_HOME=/path/to/mallet [target...]"; \ 34 | echo "to specify the location of Mallet. Mallet can be "; \ 35 | echo "downloaded from http://mallet.cs.umass.edu/"; \ 36 | echo; false; fi 37 | 38 | nltk.jar: $(JAVA_SRC) 39 | $(JAVAC) -cp "$(CLASSPATH)" $(JAVA_SRC) 40 | jar -cf nltk.jar `find org/nltk -name '*.class'` 41 | 42 | clean: 43 | rm -f $(JAVA_CLS) nltk.jar 44 | -------------------------------------------------------------------------------- /resources/javasrc/README.txt: -------------------------------------------------------------------------------- 1 | NLTK-Java Interface Code 2 | 3 | Copyright (C) 2001-2012 NLTK Project 4 | For license information, see LICENSE.TXT 5 | 6 | The Java code in this directory is used by NLTK to communicate with 7 | external Java packages, such as Mallet. In particular, this directory 8 | defines several command-line interfaces that are used by NLTK to 9 | communicate with external Java packages, by spawning them as 10 | subprocesses. In cases where an external Java package already provides 11 | a command-line interface, the replacement interface provided here is 12 | either more functional or more stable (or both). 13 | 14 | These command-line interfaces may be called directly by users, but 15 | they are primarily intended for use by NLTK.
16 | -------------------------------------------------------------------------------- /resources/nltk/VERSION: -------------------------------------------------------------------------------- 1 | 2.0.3 2 | -------------------------------------------------------------------------------- /resources/nltk/app/__init__.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Applications package 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # Steven Bird 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | """ 10 | Interactive NLTK Applications: 11 | 12 | chartparser: Chart Parser 13 | chunkparser: Regular-Expression Chunk Parser 14 | collocations: Find collocations in text 15 | concordance: Part-of-speech concordancer 16 | nemo: Finding (and Replacing) Nemo regular expression tool 17 | rdparser: Recursive Descent Parser 18 | srparser: Shift-Reduce Parser 19 | wordnet: WordNet Browser 20 | """ 21 | 22 | 23 | # Import Tkinter-based modules if Tkinter is installed 24 | try: 25 | import Tkinter 26 | except ImportError: 27 | import warnings 28 | warnings.warn("nltk.app package not loaded " 29 | "(please install Tkinter library).") 30 | else: 31 | from chartparser_app import app as chartparser 32 | from chunkparser_app import app as chunkparser 33 | from collocations_app import app as collocations 34 | from concordance_app import app as concordance 35 | from nemo_app import app as nemo 36 | from rdparser_app import app as rdparser 37 | from srparser_app import app as srparser 38 | from wordnet_app import app as wordnet 39 | 40 | try: 41 | import pylab 42 | except ImportError: 43 | import warnings 44 | warnings.warn("nltk.app.wordfreq not loaded " 45 | "(requires the pylab library).") 46 | else: 47 | from wordfreq_app import app as wordfreq 48 | -------------------------------------------------------------------------------- /resources/nltk/app/wordfreq_app.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Wordfreq Application 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Sumukh Ghodke 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | import pylab 9 | import nltk.text 10 | from nltk.corpus import gutenberg 11 | 12 | def plot_word_freq_dist(text): 13 | fd = text.vocab() 14 | 15 | samples = fd.keys()[:50] 16 | values = [fd[sample] for sample in samples] 17 | values = [sum(values[:i+1]) * 100.0/fd.N() for i in range(len(values))] 18 | pylab.title(text.name) 19 | pylab.xlabel("Samples") 20 | pylab.ylabel("Cumulative Percentage") 21 | pylab.plot(values) 22 | pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90) 23 | pylab.show() 24 | 25 | def app(): 26 | t1 = nltk.Text(gutenberg.words('melville-moby_dick.txt')) 27 | plot_word_freq_dist(t1) 28 | 29 | if __name__ == '__main__': 30 | app() 31 | 32 | __all__ = ['app'] 33 | -------------------------------------------------------------------------------- /resources/nltk/ccg/__init__.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Combinatory Categorial Grammar 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Graeme Gange 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | """ 9 | Combinatory Categorial Grammar. 
10 | 11 | For more information see nltk/doc/contrib/ccg/ccg.pdf 12 | """ 13 | 14 | from nltk.ccg.combinator import (UndirectedBinaryCombinator, DirectedBinaryCombinator, 15 | ForwardCombinator, BackwardCombinator, 16 | UndirectedFunctionApplication, ForwardApplication, 17 | BackwardApplication, UndirectedComposition, 18 | ForwardComposition, BackwardComposition, 19 | BackwardBx, UndirectedSubstitution, ForwardSubstitution, 20 | BackwardSx, UndirectedTypeRaise, ForwardT, BackwardT) 21 | from nltk.ccg.chart import CCGEdge, CCGLeafEdge, CCGChartParser, CCGChart 22 | from nltk.ccg.lexicon import CCGLexicon 23 | -------------------------------------------------------------------------------- /resources/nltk/chat/__init__.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Chatbots 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Authors: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | # Based on an Eliza implementation by Joe Strout , 9 | # Jeff Epler and Jez Higgins . 10 | 11 | """ 12 | A class for simple chatbots. These perform simple pattern matching on sentences 13 | typed by users, and respond with automatically generated sentences. 14 | 15 | These chatbots may not work using the windows command line or the 16 | windows IDLE GUI. 17 | """ 18 | 19 | from util import Chat 20 | from eliza import eliza_chat 21 | from iesha import iesha_chat 22 | from rude import rude_chat 23 | from suntsu import suntsu_chat 24 | from zen import zen_chat 25 | 26 | bots = [ 27 | (eliza_chat, 'Eliza (psycho-babble)'), 28 | (iesha_chat, 'Iesha (teen anime junky)'), 29 | (rude_chat, 'Rude (abusive bot)'), 30 | (suntsu_chat, 'Suntsu (Chinese sayings)'), 31 | (zen_chat, 'Zen (gems of wisdom)')] 32 | 33 | def chatbots(): 34 | import sys 35 | print 'Which chatbot would you like to talk to?' 36 | botcount = len(bots) 37 | for i in range(botcount): 38 | print ' %d: %s' % (i+1, bots[i][1]) 39 | while True: 40 | print '\nEnter a number in the range 1-%d: ' % botcount, 41 | choice = sys.stdin.readline().strip() 42 | if choice.isdigit() and (int(choice) - 1) in range(botcount): 43 | break 44 | else: 45 | print ' Error: bad chatbot number' 46 | 47 | chatbot = bots[int(choice)-1][0] 48 | chatbot() 49 | -------------------------------------------------------------------------------- /resources/nltk/chat/rude.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Rude Chatbot 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Peter Spiller 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | from util import Chat, reflections 9 | 10 | pairs = ( 11 | (r'We (.*)', 12 | ("What do you mean, 'we'?", 13 | "Don't include me in that!", 14 | "I wouldn't be so sure about that.")), 15 | 16 | (r'You should (.*)', 17 | ("Don't tell me what to do, buddy.", 18 | "Really? I should, should I?")), 19 | 20 | (r'You\'re(.*)', 21 | ("More like YOU'RE %1!", 22 | "Hah! Look who's talking.", 23 | "Come over here and tell me I'm %1.")), 24 | 25 | (r'You are(.*)', 26 | ("More like YOU'RE %1!", 27 | "Hah! Look who's talking.", 28 | "Come over here and tell me I'm %1.")), 29 | 30 | (r'I can\'t(.*)', 31 | ("You do sound like the type who can't %1.", 32 | "Hear that splashing sound? That's my heart bleeding for you.", 33 | "Tell somebody who might actually care.")), 34 | 35 | (r'I think (.*)', 36 | ("I wouldn't think too hard if I were you.", 37 | "You actually think? 
I'd never have guessed...")), 38 | 39 | (r'I (.*)', 40 | ("I'm getting a bit tired of hearing about you.", 41 | "How about we talk about me instead?", 42 | "Me, me, me... Frankly, I don't care.")), 43 | 44 | (r'How (.*)', 45 | ("How do you think?", 46 | "Take a wild guess.", 47 | "I'm not even going to dignify that with an answer.")), 48 | 49 | (r'What (.*)', 50 | ("Do I look like an encyclopedia?", 51 | "Figure it out yourself.")), 52 | 53 | (r'Why (.*)', 54 | ("Why not?", 55 | "That's so obvious I thought even you'd have already figured it out.")), 56 | 57 | (r'(.*)shut up(.*)', 58 | ("Make me.", 59 | "Getting angry at a feeble NLP assignment? Somebody's losing it.", 60 | "Say that again, I dare you.")), 61 | 62 | (r'Shut up(.*)', 63 | ("Make me.", 64 | "Getting angry at a feeble NLP assignment? Somebody's losing it.", 65 | "Say that again, I dare you.")), 66 | 67 | (r'Hello(.*)', 68 | ("Oh good, somebody else to talk to. Joy.", 69 | "'Hello'? How original...")), 70 | 71 | (r'(.*)', 72 | ("I'm getting bored here. Become more interesting.", 73 | "Either become more thrilling or get lost, buddy.", 74 | "Change the subject before I die of fatal boredom.")) 75 | ) 76 | 77 | rude_chatbot = Chat(pairs, reflections) 78 | 79 | def rude_chat(): 80 | print "Talk to the program by typing in plain English, using normal upper-" 81 | print 'and lower-case letters and punctuation. Enter "quit" when done.' 82 | print '='*72 83 | print "I suppose I should say hello." 84 | 85 | rude_chatbot.converse() 86 | 87 | def demo(): 88 | rude_chat() 89 | 90 | if __name__ == "__main__": 91 | demo() 92 | -------------------------------------------------------------------------------- /resources/nltk/chunk/api.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Chunk parsing API 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # Steven Bird (minor additions) 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | ##////////////////////////////////////////////////////// 10 | ## Chunk Parser Interface 11 | ##////////////////////////////////////////////////////// 12 | 13 | from nltk.parse import ParserI 14 | 15 | from nltk.chunk.util import ChunkScore 16 | 17 | class ChunkParserI(ParserI): 18 | """ 19 | A processing interface for identifying non-overlapping groups in 20 | unrestricted text. Typically, chunk parsers are used to find base 21 | syntactic constituents, such as base noun phrases. Unlike 22 | ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method 23 | will always generate a parse. 24 | """ 25 | def parse(self, tokens): 26 | """ 27 | Find the best chunk structure for the given tokens 28 | and return it as a tree. 29 | 30 | :param tokens: The list of (word, tag) tokens to be chunked. 31 | :type tokens: list(tuple) 32 | :rtype: Tree 33 | """ 34 | raise NotImplementedError() 35 | 36 | def evaluate(self, gold): 37 | """ 38 | Score the accuracy of the chunker against the gold standard. 39 | Remove the chunking from the gold standard text, rechunk it using 40 | the chunker, and return a ``ChunkScore`` object 41 | reflecting the performance of this chunk parser. 42 | 43 | :type gold: list(Tree) 44 | :param gold: The list of chunked sentences to score the chunker on.
45 | :rtype: ChunkScore 46 | """ 47 | chunkscore = ChunkScore() 48 | for correct in gold: 49 | chunkscore.score(correct, self.parse(correct.leaves())) 50 | return chunkscore 51 | 52 | -------------------------------------------------------------------------------- /resources/nltk/classify/mallet.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Interface to Mallet Machine Learning Package 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | """ 9 | A set of functions used to interface with the external Mallet_ machine learning 10 | package. Before mallet can be used, you should tell NLTK where it can find 11 | the mallet package, using the ``config_mallet()`` function. Typical usage: 12 | 13 | .. doctest:: 14 | :options: +SKIP 15 | 16 | >>> from nltk.classify import mallet 17 | >>> mallet.config_mallet() # pass path to mallet as argument if needed 18 | [Found mallet: ...] 19 | 20 | .. _Mallet: http://mallet.cs.umass.edu/ 21 | """ 22 | 23 | import os 24 | import os.path 25 | 26 | from nltk.internals import find_binary, java 27 | 28 | ###################################################################### 29 | #{ Configuration 30 | ###################################################################### 31 | 32 | _mallet_home = None 33 | _mallet_classpath = None 34 | def config_mallet(mallet_home=None): 35 | """ 36 | Configure NLTK's interface to the Mallet machine learning package. 37 | 38 | :type mallet_home: str 39 | :param mallet_home: The full path to the mallet directory. If not 40 | specified, then NLTK will search the system for a mallet directory; 41 | and if one is not found, it will raise a ``LookupError`` exception. 42 | """ 43 | global _mallet_home, _mallet_classpath 44 | 45 | # We don't actually care about this binary -- we just use it to 46 | # make sure we've found the right directory. 47 | mallethon_bin = find_binary( 48 | 'mallet', mallet_home, 49 | env_vars=['MALLET', 'MALLET_HOME'], 50 | binary_names=['mallethon'], 51 | url='http://mallet.cs.umass.edu') 52 | # Record the location where mallet lives. 53 | bin_dir = os.path.split(mallethon_bin)[0] 54 | _mallet_home = os.path.split(bin_dir)[0] 55 | # Construct a classpath for using mallet. 56 | lib_dir = os.path.join(_mallet_home, 'lib') 57 | if not os.path.isdir(lib_dir): 58 | raise ValueError('While configuring mallet: directory %r ' 59 | 'not found.' % lib_dir) 60 | _mallet_classpath = os.path.pathsep.join([os.path.join(lib_dir, filename) 61 | for filename in sorted(os.listdir(lib_dir)) 62 | if filename.endswith('.jar')]) 63 | 64 | 65 | def call_mallet(cmd, classpath=None, stdin=None, stdout=None, stderr=None, 66 | blocking=True): 67 | """ 68 | Call `nltk.internals.java` with the given command, and with the classpath 69 | modified to include both ``nltk.jar`` and all the ``.jar`` files defined by 70 | Mallet. 71 | 72 | See `nltk.internals.java` for parameter and return value descriptions. 
73 | """ 74 | if _mallet_classpath is None: 75 | config_mallet() 76 | 77 | # Set up the classpath 78 | if classpath is None: 79 | classpath = _mallet_classpath 80 | else: 81 | classpath += os.path.pathsep + _mallet_classpath 82 | # Delegate to java() 83 | return java(cmd, classpath, stdin, stdout, stderr, blocking) 84 | -------------------------------------------------------------------------------- /resources/nltk/cluster/api.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Clusterer Interfaces 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Trevor Cohn 5 | # Porting: Steven Bird 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | from nltk.probability import DictionaryProbDist 10 | 11 | class ClusterI(object): 12 | """ 13 | Interface covering basic clustering functionality. 14 | """ 15 | 16 | def cluster(self, vectors, assign_clusters=False): 17 | """ 18 | Assigns the vectors to clusters, learning the clustering parameters 19 | from the data. Returns a cluster identifier for each vector. 20 | """ 21 | raise NotImplementedError() 22 | 23 | def classify(self, token): 24 | """ 25 | Classifies the token into a cluster, setting the token's CLUSTER 26 | parameter to that cluster identifier. 27 | """ 28 | raise NotImplementedError() 29 | 30 | def likelihood(self, vector, label): 31 | """ 32 | Returns the likelihood (a float) of the token having the 33 | corresponding cluster. 34 | """ 35 | if self.classify(vector) == label: 36 | return 1.0 37 | else: 38 | return 0.0 39 | 40 | def classification_probdist(self, vector): 41 | """ 42 | Classifies the token into a cluster, returning 43 | a probability distribution over the cluster identifiers. 44 | """ 45 | likelihoods = {} 46 | sum = 0.0 47 | for cluster in self.cluster_names(): 48 | likelihoods[cluster] = self.likelihood(vector, cluster) 49 | sum += likelihoods[cluster] 50 | for cluster in self.cluster_names(): 51 | likelihoods[cluster] /= sum 52 | return DictionaryProbDist(likelihoods) 53 | 54 | def num_clusters(self): 55 | """ 56 | Returns the number of clusters. 57 | """ 58 | raise NotImplementedError() 59 | 60 | def cluster_names(self): 61 | """ 62 | Returns the names of the clusters. 63 | """ 64 | return range(self.num_clusters()) 65 | 66 | def cluster_name(self, index): 67 | """ 68 | Returns the names of the cluster at index. 
69 | """ 70 | return index 71 | -------------------------------------------------------------------------------- /resources/nltk/corpus/europarl_raw.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Europarl Corpus Readers 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Nitin Madnani 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | import re 9 | from util import LazyCorpusLoader 10 | from reader import * 11 | 12 | # Create a new corpus reader instance for each European language 13 | danish = LazyCorpusLoader( 14 | 'europarl_raw/danish', EuroparlCorpusReader, r'ep-.*\.da', encoding='utf-8') 15 | 16 | dutch = LazyCorpusLoader( 17 | 'europarl_raw/dutch', EuroparlCorpusReader, r'ep-.*\.nl', encoding='utf-8') 18 | 19 | english = LazyCorpusLoader( 20 | 'europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8') 21 | 22 | finnish = LazyCorpusLoader( 23 | 'europarl_raw/finnish', EuroparlCorpusReader, r'ep-.*\.fi', encoding='utf-8') 24 | 25 | french = LazyCorpusLoader( 26 | 'europarl_raw/french', EuroparlCorpusReader, r'ep-.*\.fr', encoding='utf-8') 27 | 28 | german = LazyCorpusLoader( 29 | 'europarl_raw/german', EuroparlCorpusReader, r'ep-.*\.de', encoding='utf-8') 30 | 31 | greek = LazyCorpusLoader( 32 | 'europarl_raw/greek', EuroparlCorpusReader, r'ep-.*\.el', encoding='utf-8') 33 | 34 | italian = LazyCorpusLoader( 35 | 'europarl_raw/italian', EuroparlCorpusReader, r'ep-.*\.it', encoding='utf-8') 36 | 37 | portuguese = LazyCorpusLoader( 38 | 'europarl_raw/portuguese', EuroparlCorpusReader, r'ep-.*\.pt', encoding='utf-8') 39 | 40 | spanish = LazyCorpusLoader( 41 | 'europarl_raw/spanish', EuroparlCorpusReader, r'ep-.*\.es', encoding='utf-8') 42 | 43 | swedish = LazyCorpusLoader( 44 | 'europarl_raw/swedish', EuroparlCorpusReader, r'ep-.*\.sv', encoding='utf-8') 45 | -------------------------------------------------------------------------------- /resources/nltk/corpus/reader/indian.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # Edward Loper 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | """ 10 | Indian Language POS-Tagged Corpus 11 | Collected by A Kumaran, Microsoft Research, India 12 | Distributed with permission 13 | 14 | Contents: 15 | - Bangla: IIT Kharagpur 16 | - Hindi: Microsoft Research India 17 | - Marathi: IIT Bombay 18 | - Telugu: IIIT Hyderabad 19 | """ 20 | 21 | import codecs 22 | 23 | from nltk.tag.util import str2tuple 24 | 25 | from util import * 26 | from api import * 27 | 28 | class IndianCorpusReader(CorpusReader): 29 | """ 30 | List of words, one per line. Blank lines are ignored. 
31 | """ 32 | def words(self, fileids=None): 33 | return concat([IndianCorpusView(fileid, enc, 34 | False, False) 35 | for (fileid, enc) in self.abspaths(fileids, True)]) 36 | 37 | def tagged_words(self, fileids=None, simplify_tags=False): 38 | if simplify_tags: 39 | tag_mapping_function = self._tag_mapping_function 40 | else: 41 | tag_mapping_function = None 42 | return concat([IndianCorpusView(fileid, enc, 43 | True, False, tag_mapping_function) 44 | for (fileid, enc) in self.abspaths(fileids, True)]) 45 | 46 | def sents(self, fileids=None): 47 | return concat([IndianCorpusView(fileid, enc, 48 | False, True) 49 | for (fileid, enc) in self.abspaths(fileids, True)]) 50 | 51 | def tagged_sents(self, fileids=None, simplify_tags=False): 52 | if simplify_tags: 53 | tag_mapping_function = self._tag_mapping_function 54 | else: 55 | tag_mapping_function = None 56 | return concat([IndianCorpusView(fileid, enc, 57 | True, True, tag_mapping_function) 58 | for (fileid, enc) in self.abspaths(fileids, True)]) 59 | 60 | def raw(self, fileids=None): 61 | if fileids is None: fileids = self._fileids 62 | elif isinstance(fileids, basestring): fileids = [fileids] 63 | return concat([self.open(f).read() for f in fileids]) 64 | 65 | 66 | class IndianCorpusView(StreamBackedCorpusView): 67 | def __init__(self, corpus_file, encoding, tagged, 68 | group_by_sent, tag_mapping_function=None): 69 | self._tagged = tagged 70 | self._group_by_sent = group_by_sent 71 | self._tag_mapping_function = tag_mapping_function 72 | StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) 73 | 74 | def read_block(self, stream): 75 | line = stream.readline() 76 | if line.startswith('<'): 77 | return [] 78 | sent = [str2tuple(word, sep='_') for word in line.split()] 79 | if self._tag_mapping_function: 80 | sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent] 81 | if not self._tagged: sent = [w for (w,t) in sent] 82 | if self._group_by_sent: 83 | return [sent] 84 | else: 85 | return sent 86 | 87 | 88 | -------------------------------------------------------------------------------- /resources/nltk/corpus/reader/nps_chat.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: NPS Chat Corpus Reader 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | import re 9 | import textwrap 10 | 11 | from nltk.util import LazyConcatenation 12 | from nltk.internals import ElementWrapper 13 | 14 | from util import * 15 | from api import * 16 | from xmldocs import * 17 | 18 | class NPSChatCorpusReader(XMLCorpusReader): 19 | 20 | def __init__(self, root, fileids, wrap_etree=False, tag_mapping_function=None): 21 | XMLCorpusReader.__init__(self, root, fileids, wrap_etree) 22 | self._tag_mapping_function = tag_mapping_function 23 | 24 | def xml_posts(self, fileids=None): 25 | if self._wrap_etree: 26 | return concat([XMLCorpusView(fileid, 'Session/Posts/Post', 27 | self._wrap_elt) 28 | for fileid in self.abspaths(fileids)]) 29 | else: 30 | return concat([XMLCorpusView(fileid, 'Session/Posts/Post') 31 | for fileid in self.abspaths(fileids)]) 32 | 33 | def posts(self, fileids=None): 34 | return concat([XMLCorpusView(fileid, 'Session/Posts/Post/terminals', 35 | self._elt_to_words) 36 | for fileid in self.abspaths(fileids)]) 37 | 38 | def tagged_posts(self, fileids=None, simplify_tags=False): 39 | def reader(elt, handler): 40 | return self._elt_to_tagged_words(elt, handler, simplify_tags) 
41 | return concat([XMLCorpusView(fileid, 'Session/Posts/Post/terminals', 42 | reader) 43 | for fileid in self.abspaths(fileids)]) 44 | 45 | def words(self, fileids=None): 46 | return LazyConcatenation(self.posts(fileids)) 47 | 48 | def tagged_words(self, fileids=None, simplify_tags=False): 49 | return LazyConcatenation(self.tagged_posts(fileids, simplify_tags)) 50 | 51 | def _wrap_elt(self, elt, handler): 52 | return ElementWrapper(elt) 53 | 54 | def _elt_to_words(self, elt, handler): 55 | return [self._simplify_username(t.attrib['word']) 56 | for t in elt.findall('t')] 57 | 58 | def _elt_to_tagged_words(self, elt, handler, simplify_tags=False): 59 | tagged_post = [(self._simplify_username(t.attrib['word']), 60 | t.attrib['pos']) for t in elt.findall('t')] 61 | if simplify_tags: 62 | tagged_post = [(w, self._tag_mapping_function(t)) 63 | for (w,t) in tagged_post] 64 | return tagged_post 65 | 66 | @staticmethod 67 | def _simplify_username(word): 68 | if 'User' in word: 69 | word = 'U' + word.split('User', 1)[1] 70 | return word 71 | -------------------------------------------------------------------------------- /resources/nltk/corpus/reader/ppattach.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: PP Attachment Corpus Reader 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # Edward Loper 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | """ 10 | Read lines from the Prepositional Phrase Attachment Corpus. 11 | 12 | The PP Attachment Corpus contains several files having the format: 13 | 14 | sentence_id verb noun1 preposition noun2 attachment 15 | 16 | For example: 17 | 18 | 42960 gives authority to administration V 19 | 46742 gives inventors of microchip N 20 | 21 | The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.: 22 | 23 | (VP gives (NP authority) (PP to administration)) 24 | (VP gives (NP inventors (PP of microchip))) 25 | 26 | The corpus contains the following files: 27 | 28 | training: training set 29 | devset: development test set, used for algorithm development. 30 | test: test set, used to report results 31 | bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal. 32 | 33 | Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional 34 | Phrase Attachment. Proceedings of the ARPA Human Language Technology 35 | Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps] 36 | 37 | The PP Attachment Corpus is distributed with NLTK with the permission 38 | of the author. 
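# Illustrative usage sketch (not from the vendored file above): typical use of the
# NPSChatCorpusReader via the prepackaged `nltk.corpus.nps_chat` object. The session
# fileid '10-19-20s_706posts.xml' is assumed to be one shipped in the NLTK data package.
from nltk.corpus import nps_chat
posts = nps_chat.xml_posts('10-19-20s_706posts.xml')
print posts[0].get('class'), posts[0].get('user')   # dialogue-act class and (anonymised) user
print nps_chat.tagged_words()[:5]                   # (word, pos) pairs from the terminal elements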
39 | """ 40 | 41 | import codecs 42 | 43 | from util import * 44 | from api import * 45 | 46 | class PPAttachment: 47 | def __init__(self, sent, verb, noun1, prep, noun2, attachment): 48 | self.sent = sent 49 | self.verb = verb 50 | self.noun1 = noun1 51 | self.prep = prep 52 | self.noun2 = noun2 53 | self.attachment = attachment 54 | 55 | def __repr__(self): 56 | return ('PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, ' 57 | 'noun2=%r, attachment=%r)' % 58 | (self.sent, self.verb, self.noun1, self.prep, 59 | self.noun2, self.attachment)) 60 | 61 | class PPAttachmentCorpusReader(CorpusReader): 62 | """ 63 | sentence_id verb noun1 preposition noun2 attachment 64 | """ 65 | def attachments(self, fileids): 66 | return concat([StreamBackedCorpusView(fileid, self._read_obj_block, 67 | encoding=enc) 68 | for (fileid, enc) in self.abspaths(fileids, True)]) 69 | 70 | def tuples(self, fileids): 71 | return concat([StreamBackedCorpusView(fileid, self._read_tuple_block, 72 | encoding=enc) 73 | for (fileid, enc) in self.abspaths(fileids, True)]) 74 | 75 | def raw(self, fileids=None): 76 | if fileids is None: fileids = self._fileids 77 | elif isinstance(fileids, basestring): fileids = [fileids] 78 | return concat([self.open(f).read() for f in fileids]) 79 | 80 | def _read_tuple_block(self, stream): 81 | line = stream.readline() 82 | if line: 83 | return [tuple(line.split())] 84 | else: 85 | return [] 86 | 87 | def _read_obj_block(self, stream): 88 | line = stream.readline() 89 | if line: 90 | return [PPAttachment(*line.split())] 91 | else: 92 | return [] 93 | 94 | -------------------------------------------------------------------------------- /resources/nltk/corpus/reader/sinica_treebank.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Sinica Treebank Reader 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | """ 9 | Sinica Treebank Corpus Sample 10 | 11 | http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm 12 | 13 | 10,000 parsed sentences, drawn from the Academia Sinica Balanced 14 | Corpus of Modern Chinese. Parse tree notation is based on 15 | Information-based Case Grammar. Tagset documentation is available 16 | at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html 17 | 18 | Language and Knowledge Processing Group, Institute of Information 19 | Science, Academia Sinica 20 | 21 | It is distributed with the Natural Language Toolkit under the terms of 22 | the Creative Commons Attribution-NonCommercial-ShareAlike License 23 | [http://creativecommons.org/licenses/by-nc-sa/2.5/]. 24 | 25 | References: 26 | 27 | Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999) 28 | The Construction of Sinica Treebank. Computational Linguistics and 29 | Chinese Language Processing, 4, pp 87-104. 30 | 31 | Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming 32 | Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria, 33 | Annotation Guidelines, and On-line Interface. Proceedings of 2nd 34 | Chinese Language Processing Workshop, Association for Computational 35 | Linguistics. 36 | 37 | Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar 38 | Extraction, Proceedings of IJCNLP-04, pp560-565. 
39 | """ 40 | 41 | import os 42 | import re 43 | 44 | import nltk 45 | 46 | from util import * 47 | from api import * 48 | 49 | IDENTIFIER = re.compile(r'^#\S+\s') 50 | APPENDIX = re.compile(r'(?<=\))#.*$') 51 | TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)') 52 | WORD = re.compile(r':[^:()|]+:([^:()|]+)') 53 | 54 | class SinicaTreebankCorpusReader(SyntaxCorpusReader): 55 | """ 56 | Reader for the sinica treebank. 57 | """ 58 | def _read_block(self, stream): 59 | sent = stream.readline() 60 | sent = IDENTIFIER.sub('', sent) 61 | sent = APPENDIX.sub('', sent) 62 | return [sent] 63 | 64 | def _parse(self, sent): 65 | return nltk.tree.sinica_parse(sent) 66 | 67 | def _tag(self, sent, simplify_tags=None): 68 | tagged_sent = [(w,t) for (t,w) in TAGWORD.findall(sent)] 69 | if simplify_tags: 70 | tagged_sent = [(w, self._tag_mapping_function(t)) 71 | for (w,t) in tagged_sent] 72 | return tagged_sent 73 | 74 | def _word(self, sent): 75 | return WORD.findall(sent) 76 | -------------------------------------------------------------------------------- /resources/nltk/corpus/reader/string_category.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: String Category Corpus Reader 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # Edward Loper 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | """ 10 | Read tuples from a corpus consisting of categorized strings. 11 | For example, from the question classification corpus: 12 | 13 | NUM:dist How far is it from Denver to Aspen ? 14 | LOC:city What county is Modesto , California in ? 15 | HUM:desc Who was Galileo ? 16 | DESC:def What is an atom ? 17 | NUM:date When did Hawaii become a state ? 18 | """ 19 | 20 | # based on PPAttachmentCorpusReader 21 | 22 | import os 23 | 24 | from util import * 25 | from api import * 26 | 27 | # [xx] Should the order of the tuple be reversed -- in most other places 28 | # in nltk, we use the form (data, tag) -- e.g., tagged words and 29 | # labeled texts for classifiers. 30 | class StringCategoryCorpusReader(CorpusReader): 31 | def __init__(self, root, fileids, delimiter=' ', encoding=None): 32 | """ 33 | :param root: The root directory for this corpus. 34 | :param fileids: A list or regexp specifying the fileids in this corpus. 35 | :param delimiter: Field delimiter 36 | """ 37 | CorpusReader.__init__(self, root, fileids, encoding) 38 | self._delimiter = delimiter 39 | 40 | def tuples(self, fileids=None): 41 | if fileids is None: fileids = self._fileids 42 | elif isinstance(fileids, basestring): fileids = [fileids] 43 | return concat([StreamBackedCorpusView(fileid, self._read_tuple_block, 44 | encoding=enc) 45 | for (fileid, enc) in self.abspaths(fileids, True)]) 46 | 47 | def raw(self, fileids=None): 48 | """ 49 | :return: the text contents of the given fileids, as a single string. 
50 | """ 51 | if fileids is None: fileids = self._fileids 52 | elif isinstance(fileids, basestring): fileids = [fileids] 53 | return concat([self.open(f).read() for f in fileids]) 54 | 55 | def _read_tuple_block(self, stream): 56 | line = stream.readline().strip() 57 | if line: 58 | return [tuple(line.split(self._delimiter, 1))] 59 | else: 60 | return [] 61 | -------------------------------------------------------------------------------- /resources/nltk/corpus/reader/toolbox.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Toolbox Reader 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Greg Aumann 5 | # Stuart Robinson 6 | # Steven Bird 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | """ 11 | Module for reading, writing and manipulating 12 | Toolbox databases and settings fileids. 13 | """ 14 | 15 | import os 16 | import re 17 | import codecs 18 | 19 | from nltk.toolbox import ToolboxData 20 | 21 | from util import * 22 | from api import * 23 | 24 | class ToolboxCorpusReader(CorpusReader): 25 | def xml(self, fileids, key=None): 26 | return concat([ToolboxData(path, enc).parse(key) 27 | for (path, enc) in self.abspaths(fileids, True)]) 28 | 29 | def fields(self, fileids, strip=True, unwrap=True, encoding=None, 30 | errors='strict', unicode_fields=None): 31 | return concat([list(ToolboxData(fileid,enc).fields( 32 | strip, unwrap, encoding, errors, unicode_fields)) 33 | for (fileid, enc) 34 | in self.abspaths(fileids, include_encoding=True)]) 35 | 36 | # should probably be done lazily: 37 | def entries(self, fileids, **kwargs): 38 | if 'key' in kwargs: 39 | key = kwargs['key'] 40 | del kwargs['key'] 41 | else: 42 | key = 'lx' # the default key in MDF 43 | entries = [] 44 | for marker, contents in self.fields(fileids, **kwargs): 45 | if marker == key: 46 | entries.append((contents, [])) 47 | else: 48 | try: 49 | entries[-1][-1].append((marker, contents)) 50 | except IndexError: 51 | pass 52 | return entries 53 | 54 | def words(self, fileids, key='lx'): 55 | return [contents for marker, contents in self.fields(fileids) if marker == key] 56 | 57 | def raw(self, fileids): 58 | if fileids is None: fileids = self._fileids 59 | elif isinstance(fileids, basestring): fileids = [fileids] 60 | return concat([self.open(f).read() for f in fileids]) 61 | 62 | 63 | def demo(): 64 | pass 65 | 66 | if __name__ == '__main__': 67 | demo() 68 | -------------------------------------------------------------------------------- /resources/nltk/corpus/reader/wordlist.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Word List Corpus Reader 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # Edward Loper 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | from nltk.tokenize import line_tokenize 10 | 11 | from util import * 12 | from api import * 13 | 14 | class WordListCorpusReader(CorpusReader): 15 | """ 16 | List of words, one per line. Blank lines are ignored. 
17 | """ 18 | def words(self, fileids=None): 19 | return line_tokenize(self.raw(fileids)) 20 | 21 | def raw(self, fileids=None): 22 | if fileids is None: fileids = self._fileids 23 | elif isinstance(fileids, basestring): fileids = [fileids] 24 | return concat([self.open(f).read() for f in fileids]) 25 | 26 | 27 | class SwadeshCorpusReader(WordListCorpusReader): 28 | def entries(self, fileids=None): 29 | """ 30 | :return: a tuple of words for the specified fileids. 31 | """ 32 | if not fileids: 33 | fileids = self.fileids() 34 | 35 | wordlists = [self.words(f) for f in fileids] 36 | return zip(*wordlists) 37 | -------------------------------------------------------------------------------- /resources/nltk/corpus/reader/ycoe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/nltk/corpus/reader/ycoe.py -------------------------------------------------------------------------------- /resources/nltk/corpus/util.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Corpus Reader Utility Functions 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | ###################################################################### 9 | #{ Lazy Corpus Loader 10 | ###################################################################### 11 | 12 | import re 13 | import nltk 14 | 15 | TRY_ZIPFILE_FIRST = False 16 | 17 | class LazyCorpusLoader(object): 18 | """ 19 | A proxy object which is used to stand in for a corpus object 20 | before the corpus is loaded. This allows NLTK to create an object 21 | for each corpus, but defer the costs associated with loading those 22 | corpora until the first time that they're actually accessed. 23 | 24 | The first time this object is accessed in any way, it will load 25 | the corresponding corpus, and transform itself into that corpus 26 | (by modifying its own ``__class__`` and ``__dict__`` attributes). 27 | 28 | If the corpus can not be found, then accessing this object will 29 | raise an exception, displaying installation instructions for the 30 | NLTK data package. Once they've properly installed the data 31 | package (or modified ``nltk.data.path`` to point to its location), 32 | they can then use the corpus object without restarting python. 33 | """ 34 | def __init__(self, name, reader_cls, *args, **kwargs): 35 | from nltk.corpus.reader.api import CorpusReader 36 | assert issubclass(reader_cls, CorpusReader) 37 | self.__name = self.__name__ = name 38 | self.__reader_cls = reader_cls 39 | self.__args = args 40 | self.__kwargs = kwargs 41 | 42 | def __load(self): 43 | # Find the corpus root directory. 44 | zip_name = re.sub(r'(([^/]*)(/.*)?)', r'\2.zip/\1/', self.__name) 45 | if TRY_ZIPFILE_FIRST: 46 | try: 47 | root = nltk.data.find('corpora/%s' % zip_name) 48 | except LookupError: 49 | raise 50 | root = nltk.data.find('corpora/%s' % self.__name) 51 | else: 52 | try: 53 | root = nltk.data.find('corpora/%s' % self.__name) 54 | except LookupError, e: 55 | try: root = nltk.data.find('corpora/%s' % zip_name) 56 | except LookupError: raise e 57 | 58 | # Load the corpus. 59 | corpus = self.__reader_cls(root, *self.__args, **self.__kwargs) 60 | 61 | # This is where the magic happens! 
Transform ourselves into 62 | # the corpus by modifying our own __dict__ and __class__ to 63 | # match that of the corpus. 64 | self.__dict__ = corpus.__dict__ 65 | self.__class__ = corpus.__class__ 66 | 67 | def __getattr__(self, attr): 68 | self.__load() 69 | # This looks circular, but its not, since __load() changes our 70 | # __class__ to something new: 71 | return getattr(self, attr) 72 | 73 | def __repr__(self): 74 | return '<%s in %r (not loaded yet)>' % ( 75 | self.__reader_cls.__name__, '.../corpora/'+self.__name) 76 | -------------------------------------------------------------------------------- /resources/nltk/draw/__init__.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: graphical representations package 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # Steven Bird 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | # Import Tkinter-based modules if Tkinter is installed 10 | try: 11 | import Tkinter 12 | except ImportError: 13 | import warnings 14 | warnings.warn("nltk.draw package not loaded " 15 | "(please install Tkinter library).") 16 | else: 17 | from cfg import ProductionList, CFGEditor, CFGDemo 18 | from tree import (TreeSegmentWidget, tree_to_treesegment, 19 | TreeWidget, TreeView, draw_trees) 20 | from dispersion import dispersion_plot 21 | from table import Table 22 | -------------------------------------------------------------------------------- /resources/nltk/draw/dispersion.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Dispersion Plots 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | """ 9 | A utility for displaying lexical dispersion. 10 | """ 11 | 12 | def dispersion_plot(text, words, ignore_case=False): 13 | """ 14 | Generate a lexical dispersion plot. 15 | 16 | :param text: The source text 17 | :type text: list(str) or enum(str) 18 | :param words: The target words 19 | :type words: list of str 20 | :param ignore_case: flag to set if case should be ignored when searching text 21 | :type ignore_case: bool 22 | """ 23 | 24 | try: 25 | import pylab 26 | except ImportError: 27 | raise ValueError('The plot function requires the matplotlib package (aka pylab).' 
28 | 'See http://matplotlib.sourceforge.net/') 29 | 30 | text = list(text) 31 | words.reverse() 32 | 33 | if ignore_case: 34 | words_to_comp = map(str.lower, words) 35 | text_to_comp = map(str.lower, text) 36 | else: 37 | words_to_comp = words 38 | text_to_comp = text 39 | 40 | points = [(x,y) for x in range(len(text_to_comp)) 41 | for y in range(len(words_to_comp)) 42 | if text_to_comp[x] == words_to_comp[y]] 43 | if points: 44 | x, y = zip(*points) 45 | else: 46 | x = y = () 47 | pylab.plot(x, y, "b|", scalex=.1) 48 | pylab.yticks(range(len(words)), words, color="b") 49 | pylab.ylim(-1, len(words)) 50 | pylab.title("Lexical Dispersion Plot") 51 | pylab.xlabel("Word Offset") 52 | pylab.show() 53 | 54 | if __name__ == '__main__': 55 | from nltk.corpus import gutenberg 56 | words = ['Elinor', 'Marianne', 'Edward', 'Willoughby'] 57 | dispersion_plot(gutenberg.words('austen-sense.txt'), words) 58 | -------------------------------------------------------------------------------- /resources/nltk/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/nltk/examples/__init__.py -------------------------------------------------------------------------------- /resources/nltk/examples/pt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/nltk/examples/pt.py -------------------------------------------------------------------------------- /resources/nltk/help.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit (NLTK) Help 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Authors: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | """ 9 | Provide structured access to documentation. 10 | """ 11 | 12 | import re 13 | from textwrap import wrap 14 | 15 | from nltk.data import load 16 | 17 | def brown_tagset(tagpattern=None): 18 | _format_tagset("brown_tagset", tagpattern) 19 | 20 | def claws5_tagset(tagpattern=None): 21 | _format_tagset("claws5_tagset", tagpattern) 22 | 23 | def upenn_tagset(tagpattern=None): 24 | _format_tagset("upenn_tagset", tagpattern) 25 | 26 | ##################################################################### 27 | # UTILITIES 28 | ##################################################################### 29 | 30 | def _print_entries(tags, tagdict): 31 | for tag in tags: 32 | entry = tagdict[tag] 33 | defn = [tag + ": " + entry[0]] 34 | examples = wrap(entry[1], width=75, initial_indent=' ', subsequent_indent=' ') 35 | print "\n".join(defn + examples) 36 | 37 | def _format_tagset(tagset, tagpattern=None): 38 | tagdict = load("help/tagsets/" + tagset + ".pickle") 39 | if not tagpattern: 40 | _print_entries(sorted(tagdict), tagdict) 41 | elif tagpattern in tagdict: 42 | _print_entries([tagpattern], tagdict) 43 | else: 44 | tagpattern = re.compile(tagpattern) 45 | tags = [tag for tag in sorted(tagdict) if tagpattern.match(tag)] 46 | if tags: 47 | _print_entries(tags, tagdict) 48 | else: 49 | print "No matching tags found." 
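# Illustrative sketch (not from the vendored files above): how corpus objects are
# typically wired up with the LazyCorpusLoader shown earlier in corpus/util.py.
# The corpus name 'my_wordlists' and the fileid pattern below are hypothetical.
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import WordListCorpusReader
my_wordlists = LazyCorpusLoader('my_wordlists', WordListCorpusReader, r'.*\.txt')
# Nothing is read yet; the first attribute access triggers the lazy __load(), which
# looks for corpora/my_wordlists (or my_wordlists.zip) under nltk.data.path, e.g.:
# my_wordlists.words()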
50 | 51 | if __name__ == '__main__': 52 | brown_tagset(r'NN.*') 53 | upenn_tagset(r'.*\$') 54 | claws5_tagset('UNDEFINED') 55 | brown_tagset(r'NN') 56 | -------------------------------------------------------------------------------- /resources/nltk/inference/__init__.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Inference 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Dan Garrette 5 | # Ewan Klein 6 | # 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | """ 11 | Classes and interfaces for theorem proving and model building. 12 | """ 13 | 14 | from api import ParallelProverBuilder, ParallelProverBuilderCommand 15 | from mace import Mace, MaceCommand 16 | from prover9 import Prover9, Prover9Command 17 | from resolution import ResolutionProver, ResolutionProverCommand 18 | from tableau import TableauProver, TableauProverCommand 19 | from discourse import (ReadingCommand, CfgReadingCommand, 20 | DrtGlueReadingCommand, DiscourseTester) 21 | -------------------------------------------------------------------------------- /resources/nltk/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Metrics 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # Edward Loper 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | # 9 | 10 | """ 11 | NLTK Metrics 12 | 13 | Classes and methods for scoring processing modules. 14 | """ 15 | 16 | from nltk.metrics.scores import (accuracy, precision, recall, f_measure, 17 | log_likelihood, approxrand) 18 | from nltk.metrics.confusionmatrix import ConfusionMatrix 19 | from nltk.metrics.distance import (edit_distance, binary_distance, 20 | jaccard_distance, masi_distance, 21 | interval_distance, custom_distance, 22 | presence, fractional_presence) 23 | from nltk.metrics.segmentation import windowdiff, ghd, pk 24 | from nltk.metrics.agreement import AnnotationTask 25 | from nltk.metrics.association import (NgramAssocMeasures, BigramAssocMeasures, 26 | TrigramAssocMeasures, ContingencyMeasures) 27 | from nltk.metrics.spearman import (spearman_correlation, ranks_from_sequence, 28 | ranks_from_scores) 29 | -------------------------------------------------------------------------------- /resources/nltk/metrics/artstein_poesio_example.txt: -------------------------------------------------------------------------------- 1 | a 1 stat 2 | b 1 stat 3 | a 2 stat 4 | b 2 stat 5 | a 3 stat 6 | b 3 stat 7 | a 4 stat 8 | b 4 stat 9 | a 5 stat 10 | b 5 stat 11 | a 6 stat 12 | b 6 stat 13 | a 7 stat 14 | b 7 stat 15 | a 8 stat 16 | b 8 stat 17 | a 9 stat 18 | b 9 stat 19 | a 10 stat 20 | b 10 stat 21 | a 11 stat 22 | b 11 stat 23 | a 12 stat 24 | b 12 stat 25 | a 13 stat 26 | b 13 stat 27 | a 14 stat 28 | b 14 stat 29 | a 15 stat 30 | b 15 stat 31 | a 16 stat 32 | b 16 stat 33 | a 17 stat 34 | b 17 stat 35 | a 18 stat 36 | b 18 stat 37 | a 19 stat 38 | b 19 stat 39 | a 20 stat 40 | b 20 stat 41 | a 21 stat 42 | b 21 stat 43 | a 22 stat 44 | b 22 stat 45 | a 23 stat 46 | b 23 stat 47 | a 24 stat 48 | b 24 stat 49 | a 25 stat 50 | b 25 stat 51 | a 26 stat 52 | b 26 stat 53 | a 27 stat 54 | b 27 stat 55 | a 28 stat 56 | b 28 stat 57 | a 29 stat 58 | b 29 stat 59 | a 30 stat 60 | b 30 stat 61 | a 31 stat 62 | b 31 stat 63 | a 32 stat 64 | b 32 stat 65 | a 33 stat 66 | b 33 stat 67 | a 34 stat 68 | b 34 stat 69 | a 35 stat 70 | b 35 stat 71 | a 36 stat 72 | b 36 stat 73 | a 
37 stat 74 | b 37 stat 75 | a 38 stat 76 | b 38 stat 77 | a 39 stat 78 | b 39 stat 79 | a 40 stat 80 | b 40 stat 81 | a 41 stat 82 | b 41 stat 83 | a 42 stat 84 | b 42 stat 85 | a 43 stat 86 | b 43 stat 87 | a 44 stat 88 | b 44 stat 89 | a 45 stat 90 | b 45 stat 91 | a 46 stat 92 | b 46 stat 93 | a 47 ireq 94 | b 47 stat 95 | a 48 ireq 96 | b 48 stat 97 | a 49 ireq 98 | b 49 stat 99 | a 50 ireq 100 | b 50 stat 101 | a 51 ireq 102 | b 51 stat 103 | a 52 ireq 104 | b 52 stat 105 | a 53 ireq 106 | b 53 ireq 107 | a 54 ireq 108 | b 54 ireq 109 | a 55 ireq 110 | b 55 ireq 111 | a 56 ireq 112 | b 56 ireq 113 | a 57 ireq 114 | b 57 ireq 115 | a 58 ireq 116 | b 58 ireq 117 | a 59 ireq 118 | b 59 ireq 119 | a 60 ireq 120 | b 60 ireq 121 | a 61 ireq 122 | b 61 ireq 123 | a 62 ireq 124 | b 62 ireq 125 | a 63 ireq 126 | b 63 ireq 127 | a 64 ireq 128 | b 64 ireq 129 | a 65 ireq 130 | b 65 ireq 131 | a 66 ireq 132 | b 66 ireq 133 | a 67 ireq 134 | b 67 ireq 135 | a 68 ireq 136 | b 68 ireq 137 | a 69 ireq 138 | b 69 ireq 139 | a 70 ireq 140 | b 70 ireq 141 | a 71 ireq 142 | b 71 ireq 143 | a 72 ireq 144 | b 72 ireq 145 | a 73 ireq 146 | b 73 ireq 147 | a 74 ireq 148 | b 74 ireq 149 | a 75 ireq 150 | b 75 ireq 151 | a 76 ireq 152 | b 76 ireq 153 | a 77 ireq 154 | b 77 ireq 155 | a 78 ireq 156 | b 78 ireq 157 | a 79 ireq 158 | b 79 ireq 159 | a 80 ireq 160 | b 80 ireq 161 | a 81 ireq 162 | b 81 ireq 163 | a 82 ireq 164 | b 82 ireq 165 | a 83 ireq 166 | b 83 ireq 167 | a 84 ireq 168 | b 84 ireq 169 | a 85 ireq 170 | b 85 chck 171 | a 86 ireq 172 | b 86 chck 173 | a 87 ireq 174 | b 87 chck 175 | a 88 ireq 176 | b 88 chck 177 | a 89 ireq 178 | b 89 chck 179 | a 90 ireq 180 | b 90 chck 181 | a 91 chck 182 | b 91 chck 183 | a 92 chck 184 | b 92 chck 185 | a 93 chck 186 | b 93 chck 187 | a 94 chck 188 | b 94 chck 189 | a 95 chck 190 | b 95 chck 191 | a 96 chck 192 | b 96 chck 193 | a 97 chck 194 | b 97 chck 195 | a 98 chck 196 | b 98 chck 197 | a 99 chck 198 | b 99 chck 199 | a 100 chck 200 | b 100 chck 201 | -------------------------------------------------------------------------------- /resources/nltk/metrics/spearman.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Spearman Rank Correlation 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Joel Nothman 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | """ 9 | Tools for comparing ranked lists. 10 | """ 11 | 12 | def _rank_dists(ranks1, ranks2): 13 | """Finds the difference between the values in ranks1 and ranks2 for keys 14 | present in both dicts. If the arguments are not dicts, they are converted 15 | from (key, rank) sequences. 16 | """ 17 | ranks1 = dict(ranks1) 18 | ranks2 = dict(ranks2) 19 | for k, v1 in ranks1.iteritems(): 20 | try: 21 | yield k, v1 - ranks2[k] 22 | except KeyError: 23 | pass 24 | 25 | 26 | def spearman_correlation(ranks1, ranks2): 27 | """Returns the Spearman correlation coefficient for two rankings, which 28 | should be dicts or sequences of (key, rank). 
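# Illustrative usage sketch (not from the vendored files above): the data file just
# listed is the worked inter-annotator example from Artstein & Poesio; each line is a
# (coder, item, label) triple, which is the input format expected by
# nltk.metrics.agreement.AnnotationTask. The relative path is assumed to resolve
# against nltk/metrics/.
from nltk.metrics.agreement import AnnotationTask
task = AnnotationTask(data=[line.split() for line in open('artstein_poesio_example.txt')])
print task.avg_Ao()    # observed agreement
print task.kappa()     # chance-corrected agreement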
The coefficient ranges from 29 | -1.0 (ranks are opposite) to 1.0 (ranks are identical), and is only 30 | calculated for keys in both rankings (for meaningful results, remove keys 31 | present in only one list before ranking).""" 32 | n = 0 33 | res = 0 34 | for k, d in _rank_dists(ranks1, ranks2): 35 | res += d * d 36 | n += 1 37 | try: 38 | return 1 - (6 * float(res) / (n * (n*n - 1))) 39 | except ZeroDivisionError: 40 | # Result is undefined if only one item is ranked 41 | return 0.0 42 | 43 | 44 | def ranks_from_sequence(seq): 45 | """Given a sequence, yields each element with an increasing rank, suitable 46 | for use as an argument to ``spearman_correlation``. 47 | """ 48 | return ((k, i) for i, k in enumerate(seq)) 49 | 50 | 51 | def ranks_from_scores(scores, rank_gap=1e-15): 52 | """Given a sequence of (key, score) tuples, yields each key with an 53 | increasing rank, tying with previous key's rank if the difference between 54 | their scores is less than rank_gap. Suitable for use as an argument to 55 | ``spearman_correlation``. 56 | """ 57 | prev_score = None 58 | rank = 0 59 | for i, (key, score) in enumerate(scores): 60 | try: 61 | if abs(score - prev_score) > rank_gap: 62 | rank = i 63 | except TypeError: 64 | pass 65 | 66 | yield key, rank 67 | prev_score = score 68 | 69 | -------------------------------------------------------------------------------- /resources/nltk/metrics/windowdiff.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Windowdiff 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # Steven Bird 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | ########################################################################## 10 | # Windowdiff 11 | # Pevzner, L., and Hearst, M., A Critique and Improvement of 12 | # an Evaluation Metric for Text Segmentation, 13 | # Computational Linguistics,, 28 (1), March 2002, pp. 19-36 14 | ########################################################################## 15 | 16 | def windowdiff(seg1, seg2, k, boundary="1"): 17 | """ 18 | Compute the windowdiff score for a pair of segmentations. A segmentation is any sequence 19 | over a vocabulary of two items (e.g. "0", "1"), where the specified boundary value is used 20 | to mark the edge of a segmentation. 
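# Illustrative usage sketch (not from the vendored files above): comparing two rankings
# of the same items with the functions defined in spearman.py.
from nltk.metrics.spearman import spearman_correlation, ranks_from_sequence
gold = ['the', 'of', 'and', 'to', 'a']
test = ['the', 'and', 'of', 'a', 'to']
print spearman_correlation(ranks_from_sequence(gold),
                           ranks_from_sequence(test))    # 0.8 for these two orderings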
21 | 22 | >>> from nltk.metrics.windowdiff import windowdiff 23 | >>> s1 = "00000010000000001000000" 24 | >>> s2 = "00000001000000010000000" 25 | >>> s3 = "00010000000000000001000" 26 | >>> windowdiff(s1, s1, 3) 27 | 0 28 | >>> windowdiff(s1, s2, 3) 29 | 4 30 | >>> windowdiff(s2, s3, 3) 31 | 16 32 | 33 | :param seg1: a segmentation 34 | :type seg1: str or list 35 | :param seg2: a segmentation 36 | :type seg2: str or list 37 | :param k: window width 38 | :type k: int 39 | :param boundary: boundary value 40 | :type boundary: str or int or bool 41 | :rtype: int 42 | """ 43 | 44 | if len(seg1) != len(seg2): 45 | raise ValueError, "Segmentations have unequal length" 46 | wd = 0 47 | for i in range(len(seg1) - k): 48 | wd += abs(seg1[i:i+k+1].count(boundary) - seg2[i:i+k+1].count(boundary)) 49 | return wd 50 | 51 | def demo(): 52 | s1 = "00000010000000001000000" 53 | s2 = "00000001000000010000000" 54 | s3 = "00010000000000000001000" 55 | print "s1:", s1 56 | print "s2:", s2 57 | print "s3:", s3 58 | 59 | print "windowdiff(s1, s1, 3) = ", windowdiff(s1, s1, 3) 60 | print "windowdiff(s1, s2, 3) = ", windowdiff(s1, s2, 3) 61 | print "windowdiff(s2, s3, 3) = ", windowdiff(s2, s3, 3) 62 | -------------------------------------------------------------------------------- /resources/nltk/misc/__init__.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Miscellaneous modules 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | from chomsky import generate_chomsky 9 | from wordfinder import word_finder 10 | from minimalset import MinimalSet 11 | from babelfish import babelize, babelize_shell 12 | 13 | -------------------------------------------------------------------------------- /resources/nltk/misc/minimalset.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Minimal Sets 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | from collections import defaultdict 9 | 10 | class MinimalSet(object): 11 | """ 12 | Find contexts where more than one possible target value can 13 | appear. E.g. if targets are word-initial letters, and contexts 14 | are the remainders of words, then we would like to find cases like 15 | "fat" vs "cat", and "training" vs "draining". If targets are 16 | parts-of-speech and contexts are words, then we would like to find 17 | cases like wind (noun) 'air in rapid motion', vs wind (verb) 18 | 'coil, wrap'. 19 | """ 20 | def __init__(self, parameters=None): 21 | """ 22 | Create a new minimal set. 23 | 24 | :param parameters: The (context, target, display) tuples for the item 25 | :type parameters: list(tuple(str, str, str)) 26 | """ 27 | self._targets = set() # the contrastive information 28 | self._contexts = set() # what we are controlling for 29 | self._seen = defaultdict(set) # to record what we have seen 30 | self._displays = {} # what we will display 31 | 32 | if parameters: 33 | for context, target, display in parameters: 34 | self.add(context, target, display) 35 | 36 | def add(self, context, target, display): 37 | """ 38 | Add a new item to the minimal set, having the specified 39 | context, target, and display form. 
40 | 41 | :param context: The context in which the item of interest appears 42 | :type context: str 43 | :param target: The item of interest 44 | :type target: str 45 | :param display: The information to be reported for each item 46 | :type display: str 47 | """ 48 | # Store the set of targets that occurred in this context 49 | self._seen[context].add(target) 50 | 51 | # Keep track of which contexts and targets we have seen 52 | self._contexts.add(context) 53 | self._targets.add(target) 54 | 55 | # For a given context and target, store the display form 56 | self._displays[(context, target)] = display 57 | 58 | def contexts(self, minimum=2): 59 | """ 60 | Determine which contexts occurred with enough distinct targets. 61 | 62 | :param minimum: the minimum number of distinct target forms 63 | :type minimum: int 64 | :rtype list 65 | """ 66 | return [c for c in self._contexts if len(self._seen[c]) >= minimum] 67 | 68 | def display(self, context, target, default=""): 69 | if (context, target) in self._displays: 70 | return self._displays[(context, target)] 71 | else: 72 | return default 73 | 74 | def display_all(self, context): 75 | result = [] 76 | for target in self._targets: 77 | x = self.display(context, target) 78 | if x: result.append(x) 79 | return result 80 | 81 | def targets(self): 82 | return self._targets 83 | 84 | -------------------------------------------------------------------------------- /resources/nltk/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Language Models 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | from ngram import NgramModel 9 | -------------------------------------------------------------------------------- /resources/nltk/model/api.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: API for Language Models 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | 9 | # should this be a subclass of ConditionalProbDistI? 10 | 11 | class ModelI(object): 12 | """ 13 | A processing interface for assigning a probability to the next word. 14 | """ 15 | 16 | def __init__(self): 17 | '''Create a new language model.''' 18 | raise NotImplementedError() 19 | 20 | def prob(self, word, context): 21 | '''Evaluate the probability of this word in this context.''' 22 | raise NotImplementedError() 23 | 24 | def logprob(self, word, context): 25 | '''Evaluate the (negative) log probability of this word in this context.''' 26 | raise NotImplementedError() 27 | 28 | def choose_random_word(self, context): 29 | '''Randomly select a word that is likely to appear in this context.''' 30 | raise NotImplementedError() 31 | 32 | def generate(self, n): 33 | '''Generate n words of text from the language model.''' 34 | raise NotImplementedError() 35 | 36 | def entropy(self, text): 37 | '''Evaluate the total entropy of a message with respect to the model. 
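# Illustrative usage sketch (not from the vendored files above): finding minimal pairs
# of word-initial letters with the MinimalSet class shown earlier, using
# (context, target, display) = (rest of word, first letter, whole word).
from nltk.misc import MinimalSet
words = ['cat', 'fat', 'mat', 'dog', 'dig']
ms = MinimalSet((w[1:], w[0], w) for w in words)
for context in ms.contexts(minimum=2):
    print context, ms.display_all(context)    # e.g. at ['cat', 'fat', 'mat'] (order may vary)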
38 | This is the sum of the log probability of each word in the message.''' 39 | raise NotImplementedError() 40 | 41 | -------------------------------------------------------------------------------- /resources/nltk/nltk.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/nltk/nltk.jar -------------------------------------------------------------------------------- /resources/nltk/parse/broker_test.cfg: -------------------------------------------------------------------------------- 1 | %start S 2 | 3 | S[sem=] -> NP[sem=?subj] VP[sem=?vp] 4 | VP[sem = ] -> V[sem = ?v] NP[sem=?obj] 5 | VP[sem = ?v] -> V[sem = ?v] 6 | NP[sem = ] -> 'Kim' 7 | NP[sem = ] -> 'I' 8 | V[sem = <\x y.(like x y)>, tns=pres] -> 'like' 9 | V[sem = <\x.(sleeps x)>, tns=pres] -> 'sleeps' 10 | 11 | -------------------------------------------------------------------------------- /resources/nltk/parse/generate.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Generating from a CFG 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | # 8 | 9 | from nltk.grammar import Nonterminal, parse_cfg 10 | 11 | def generate(grammar, start=None): 12 | if not start: 13 | start = grammar.start() 14 | return _generate_all(grammar, [start])[0] 15 | 16 | def _generate_all(grammar, items): 17 | frags = [] 18 | if len(items) == 1: 19 | if isinstance(items[0], Nonterminal): 20 | for prod in grammar.productions(lhs=items[0]): 21 | frags.append(_generate_all(grammar, prod.rhs())) 22 | else: 23 | frags.append(items[0]) 24 | else: 25 | for frag1 in _generate_all(grammar, [items[0]]): 26 | for frag2 in _generate_all(grammar, items[1:]): 27 | for frag in _multiply(frag1, frag2): 28 | frags.append(frag) 29 | return frags 30 | 31 | def _multiply(frag1, frag2): 32 | frags = [] 33 | if len(frag1) == 1: 34 | frag1 = [frag1] 35 | if len(frag2) == 1: 36 | frag2 = [frag2] 37 | for f1 in frag1: 38 | for f2 in frag2: 39 | frags.append(f1+f2) 40 | return frags 41 | 42 | grammar = parse_cfg(""" 43 | S -> NP VP 44 | NP -> Det N 45 | VP -> V NP 46 | Det -> 'the' 47 | Det -> 'a' 48 | N -> 'man' | 'park' | 'dog' | 'telescope' 49 | V -> 'saw' | 'walked' 50 | P -> 'in' | 'with' 51 | """) 52 | 53 | for sent in generate(grammar): 54 | print ' '.join(sent) 55 | 56 | -------------------------------------------------------------------------------- /resources/nltk/parse/generate2.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Generating from a CFG 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | # 8 | 9 | from nltk.grammar import Nonterminal, parse_cfg 10 | 11 | def all_combsi(lol): 12 | lens = map(lambda x: len(x), lol) 13 | num_combs = reduce(lambda x, y: x*y, lens, 1) 14 | for i in xrange(num_combs): 15 | tmp = [0]*len(lol) 16 | for j in xrange(len(tmp)): 17 | tmp[j] = lol[j][i % lens[j]] 18 | i = i / lens[j] 19 | yield tmp 20 | 21 | def expand_nonterm(symbol, grammar): 22 | if isinstance(symbol, Nonterminal): 23 | return map(lambda prod: list(prod.rhs()), grammar.productions(lhs=symbol)) 24 | else: 25 | return symbol 26 | 27 | def tree_traverse(root, get_children, isleaf, maxdepth): 28 | if isleaf(root): 29 | yield root 30 | 
elif maxdepth > 0: 31 | for child in get_children(root): 32 | for x in tree_traverse(child, get_children, isleaf, maxdepth - 1): 33 | yield x 34 | 35 | def flatten(lst): 36 | val = [] 37 | for x in lst: 38 | if isinstance(x, list): 39 | val = val + x 40 | else: 41 | val.append(x) 42 | return val 43 | 44 | def generate(grammar, start=None, depth=10): 45 | def is_terminal(lofs): 46 | tmp = map(lambda x: not isinstance(x, Nonterminal), lofs) 47 | return all(tmp) 48 | 49 | def get_children(l_of_symbols): 50 | x = map(lambda x: expand_nonterm(x, grammar), l_of_symbols) 51 | x = map(lambda x: x if isinstance(x, list) else [x], x) 52 | for comb in all_combsi(x): 53 | yield flatten(comb) 54 | 55 | if not start: 56 | start = grammar.start() 57 | return [x for x in tree_traverse([start], get_children, is_terminal, depth)] 58 | 59 | def _generate_demo(): 60 | g = parse_cfg(""" 61 | S -> NP VP 62 | NP -> Det N 63 | VP -> V NP 64 | Det -> 'the' 65 | Det -> 'a' 66 | N -> 'man' | 'park' | 'dog' | 'telescope' 67 | V -> 'saw' | 'walked' 68 | P -> 'in' | 'with' 69 | """) 70 | for s in generate(g): 71 | print ' '.join(s) 72 | 73 | if __name__ == "__main__": 74 | _generate_demo() 75 | -------------------------------------------------------------------------------- /resources/nltk/parse/test.cfg: -------------------------------------------------------------------------------- 1 | %start S 2 | 3 | S[sem=] -> NP[sem=?subj] VP[sem=?vp] 4 | VP[sem = ] -> V[sem = ?v] NP[sem=?obj] 5 | VP[sem = ?v] -> V[sem = ?v] 6 | NP[sem = ] -> 'Kim' 7 | NP[sem = ] -> 'I' 8 | V[sem = <\x y.(like x y)>, tns=pres] -> 'like' 9 | V[sem = <\x.(sleeps x)>, tns=pres] -> 'sleeps' 10 | 11 | -------------------------------------------------------------------------------- /resources/nltk/sem/__init__.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Semantic Interpretation 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Ewan Klein 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | """ 9 | NLTK Semantic Interpretation Package 10 | 11 | This package contains classes for representing semantic structure in 12 | formulas of first-order logic and for evaluating such formulas in 13 | set-theoretic models. 14 | 15 | >>> from nltk.sem import logic 16 | >>> logic._counter._value = 0 17 | 18 | The package has two main components: 19 | 20 | - ``logic`` provides a parser for analyzing expressions of First 21 | Order Logic (FOL). 22 | - ``evaluate`` allows users to recursively determine truth in a 23 | model for formulas of FOL. 24 | 25 | A model consists of a domain of discourse and a valuation function, 26 | which assigns values to non-logical constants. We assume that entities 27 | in the domain are represented as strings such as ``'b1'``, ``'g1'``, 28 | etc. A ``Valuation`` is initialized with a list of (symbol, value) 29 | pairs, where values are entities, sets of entities or sets of tuples 30 | of entities. 31 | The domain of discourse can be inferred from the valuation, and model 32 | is then created with domain and valuation as parameters. 33 | 34 | >>> from nltk.sem import Valuation, Model 35 | >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), 36 | ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), 37 | ... ('dog', set(['d1'])), 38 | ... 
('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] 39 | >>> val = Valuation(v) 40 | >>> dom = val.domain 41 | >>> m = Model(dom, val) 42 | """ 43 | 44 | from nltk.sem.util import (batch_parse, batch_interpret, batch_evaluate, 45 | root_semrep, parse_valuation) 46 | from nltk.sem.evaluate import (Valuation, Assignment, Model, Undefined, 47 | is_rel, set2rel, arity) 48 | from nltk.sem.logic import (LogicParser, boolean_ops, binding_ops, 49 | equality_preds, parse_logic) 50 | from nltk.sem.skolemize import skolemize 51 | from nltk.sem.lfg import FStructure 52 | from nltk.sem.relextract import extract_rels 53 | from nltk.sem.boxer import Boxer 54 | from nltk.sem.drt import DrtParser, DRS 55 | from nltk.sem.linearlogic import LinearLogicParser 56 | 57 | # from nltk.sem.glue import Glue 58 | # from nltk.sem.hole import HoleSemantics 59 | # from nltk.sem.cooper_storage import CooperStore 60 | 61 | # don't import chat80 as its names are too generic 62 | -------------------------------------------------------------------------------- /resources/nltk/stem/__init__.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Stemmers 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Trevor Cohn 5 | # Edward Loper 6 | # Steven Bird 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | """ 11 | NLTK Stemmers 12 | 13 | Interfaces used to remove morphological affixes from words, leaving 14 | only the word stem. Stemming algorithms aim to remove those affixes 15 | required for eg. grammatical role, tense, derivational morphology 16 | leaving only the stem of the word. This is a difficult problem due to 17 | irregular words (eg. common verbs in English), complicated 18 | morphological rules, and part-of-speech and sense ambiguities 19 | (eg. ``ceil-`` is not the stem of ``ceiling``). 20 | 21 | StemmerI defines a standard interface for stemmers. 22 | """ 23 | 24 | from nltk.stem.api import StemmerI 25 | from nltk.stem.regexp import RegexpStemmer 26 | from nltk.stem.lancaster import LancasterStemmer 27 | from nltk.stem.isri import ISRIStemmer 28 | from nltk.stem.porter import PorterStemmer 29 | from nltk.stem.snowball import SnowballStemmer 30 | from nltk.stem.wordnet import WordNetLemmatizer 31 | from nltk.stem.rslp import RSLPStemmer 32 | 33 | 34 | if __name__ == "__main__": 35 | import doctest 36 | doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) 37 | -------------------------------------------------------------------------------- /resources/nltk/stem/api.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Stemmer Interface 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Trevor Cohn 5 | # Edward Loper 6 | # Steven Bird 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | class StemmerI(object): 11 | """ 12 | A processing interface for removing morphological affixes from 13 | words. This process is known as stemming. 14 | 15 | """ 16 | def stem(self, token): 17 | """ 18 | Strip affixes from the token and return the stem. 19 | 20 | :param token: The token that should be stemmed. 
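# Illustrative usage sketch (not from the vendored files above): evaluating first-order
# formulas in a model, following the pattern of the nltk.sem package docstring shown
# earlier (a smaller valuation here, plus an Assignment for free variables).
from nltk.sem import Valuation, Model, Assignment
val = Valuation([('adam', 'b1'), ('betty', 'g1'),
                 ('boy', set(['b1', 'b2'])), ('girl', set(['g1', 'g2'])),
                 ('love', set([('b1', 'g1'), ('g1', 'b1')]))])
dom = val.domain
m = Model(dom, val)
g = Assignment(dom)
print m.evaluate('love(adam, betty)', g)             # True
print m.evaluate('all x.(boy(x) -> -girl(x))', g)    # True: no boy is also a girl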
21 | :type token: str 22 | """ 23 | raise NotImplementedError() 24 | 25 | 26 | if __name__ == "__main__": 27 | import doctest 28 | doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) 29 | -------------------------------------------------------------------------------- /resources/nltk/stem/regexp.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Stemmers 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Trevor Cohn 5 | # Edward Loper 6 | # Steven Bird 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import re 11 | 12 | from api import StemmerI 13 | 14 | class RegexpStemmer(StemmerI): 15 | """ 16 | A stemmer that uses regular expressions to identify morphological 17 | affixes. Any substrings that match the regular expressions will 18 | be removed. 19 | 20 | >>> from nltk.stem import RegexpStemmer 21 | >>> st = RegexpStemmer('ing$|s$|e$', min=4) 22 | >>> st.stem('cars') 23 | 'car' 24 | >>> st.stem('mass') 25 | 'mas' 26 | >>> st.stem('was') 27 | 'was' 28 | >>> st.stem('bee') 29 | 'bee' 30 | >>> st.stem('compute') 31 | 'comput' 32 | 33 | :type regexp: str or regexp 34 | :param regexp: The regular expression that should be used to 35 | identify morphological affixes. 36 | :type min: int 37 | :param min: The minimum length of string to stem 38 | """ 39 | def __init__(self, regexp, min=0): 40 | 41 | if not hasattr(regexp, 'pattern'): 42 | regexp = re.compile(regexp) 43 | self._regexp = regexp 44 | self._min = min 45 | 46 | def stem(self, word): 47 | if len(word) < self._min: 48 | return word 49 | else: 50 | return self._regexp.sub('', word) 51 | 52 | def __repr__(self): 53 | return '' % self._regexp.pattern 54 | 55 | 56 | 57 | if __name__ == "__main__": 58 | import doctest 59 | doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) 60 | 61 | -------------------------------------------------------------------------------- /resources/nltk/stem/wordnet.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: WordNet stemmer interface 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # Edward Loper 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | from nltk.corpus.reader.wordnet import NOUN 10 | from nltk.corpus import wordnet 11 | 12 | class WordNetLemmatizer(object): 13 | """ 14 | WordNet Lemmatizer 15 | 16 | Lemmatize using WordNet's built-in morphy function. 17 | Returns the input word unchanged if it cannot be found in WordNet. 
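# Illustrative sketch (not from the vendored files above): contrast between the
# suffix-stripping stemmers listed in nltk.stem and the WordNet lemmatizer described
# just above. The exact outputs depend on each algorithm's rules and are only indicative.
from nltk.stem import PorterStemmer, WordNetLemmatizer
print PorterStemmer().stem('wolves')             # a clipped stem such as 'wolv'
print WordNetLemmatizer().lemmatize('wolves')    # the dictionary form 'wolf', via morphy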
18 | 19 | >>> from nltk.stem import WordNetLemmatizer 20 | >>> wnl = WordNetLemmatizer() 21 | >>> wnl.lemmatize('dogs') 22 | 'dog' 23 | >>> wnl.lemmatize('churches') 24 | 'church' 25 | >>> wnl.lemmatize('aardwolves') 26 | 'aardwolf' 27 | >>> wnl.lemmatize('abaci') 28 | 'abacus' 29 | >>> wnl.lemmatize('hardrock') 30 | 'hardrock' 31 | """ 32 | 33 | def __init__(self): 34 | pass 35 | 36 | def lemmatize(self, word, pos=NOUN): 37 | lemmas = wordnet._morphy(word, pos) 38 | return min(lemmas, key=len) if lemmas else word 39 | 40 | def __repr__(self): 41 | return '' 42 | 43 | 44 | if __name__ == "__main__": 45 | import doctest 46 | doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) 47 | -------------------------------------------------------------------------------- /resources/nltk/tag/util.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Tagger Utilities 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # Steven Bird 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | def str2tuple(s, sep='/'): 10 | """ 11 | Given the string representation of a tagged token, return the 12 | corresponding tuple representation. The rightmost occurrence of 13 | *sep* in *s* will be used to divide *s* into a word string and 14 | a tag string. If *sep* does not occur in *s*, return (s, None). 15 | 16 | >>> from nltk.tag.util import str2tuple 17 | >>> str2tuple('fly/NN') 18 | ('fly', 'NN') 19 | 20 | :type s: str 21 | :param s: The string representation of a tagged token. 22 | :type sep: str 23 | :param sep: The separator string used to separate word strings 24 | from tags. 25 | """ 26 | loc = s.rfind(sep) 27 | if loc >= 0: 28 | return (s[:loc], s[loc+len(sep):].upper()) 29 | else: 30 | return (s, None) 31 | 32 | def tuple2str(tagged_token, sep='/'): 33 | """ 34 | Given the tuple representation of a tagged token, return the 35 | corresponding string representation. This representation is 36 | formed by concatenating the token's word string, followed by the 37 | separator, followed by the token's tag. (If the tag is None, 38 | then just return the bare word string.) 39 | 40 | >>> from nltk.tag.util import tuple2str 41 | >>> tagged_token = ('fly', 'NN') 42 | >>> tuple2str(tagged_token) 43 | 'fly/NN' 44 | 45 | :type tagged_token: tuple(str, str) 46 | :param tagged_token: The tuple representation of a tagged token. 47 | :type sep: str 48 | :param sep: The separator string used to separate word strings 49 | from tags. 50 | """ 51 | word, tag = tagged_token 52 | if tag is None: 53 | return word 54 | else: 55 | assert sep not in tag, 'tag may not contain sep!' 56 | return '%s%s%s' % (word, sep, tag) 57 | 58 | def untag(tagged_sentence): 59 | """ 60 | Given a tagged sentence, return an untagged version of that 61 | sentence. I.e., return a list containing the first element 62 | of each tuple in *tagged_sentence*. 
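# Illustrative usage sketch (not from the vendored file above): composing the tagger
# utilities defined in tag/util.py; parse a tagged string, strip the tags, then rebuild it.
from nltk.tag.util import str2tuple, tuple2str, untag
tagged = [str2tuple(t) for t in 'The/DT dog/NN barked/VBD'.split()]
print tagged                          # [('The', 'DT'), ('dog', 'NN'), ('barked', 'VBD')]
print untag(tagged)                   # ['The', 'dog', 'barked']
print [tuple2str(t) for t in tagged]  # back to 'word/TAG' strings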
63 | 64 | >>> from nltk.tag.util import untag 65 | >>> untag([('John', 'NNP'), ('saw', 'VBD'), ('Mary', 'NNP')]) 66 | ['John', 'saw', 'Mary'] 67 | 68 | """ 69 | return [w for (w, t) in tagged_sentence] 70 | 71 | 72 | 73 | if __name__ == "__main__": 74 | import doctest 75 | doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) 76 | -------------------------------------------------------------------------------- /resources/nltk/test/Makefile: -------------------------------------------------------------------------------- 1 | .SUFFIXES: .doctest .errs 2 | 3 | TESTS = $(wildcard *.doctest) 4 | 5 | ERRS := $(TESTS:.doctest=.errs) 6 | 7 | .doctest.errs: 8 | python ./doctest_driver.py $< > $@ 9 | 10 | all: $(ERRS) 11 | 12 | clean: 13 | rm -f *.errs 14 | -------------------------------------------------------------------------------- /resources/nltk/test/__init__.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Unit Tests 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | """ 9 | Unit tests for the NLTK modules. These tests are intended to ensure 10 | that changes that we make to NLTK's code don't accidentally introduce 11 | bugs. 12 | 13 | Use doctest_driver.py to run the tests:: 14 | 15 | doctest_driver.py --help 16 | 17 | NB. Popular options for NLTK documentation are:: 18 | 19 | --ellipsis --normalize_whitespace 20 | 21 | """ 22 | -------------------------------------------------------------------------------- /resources/nltk/test/all.py: -------------------------------------------------------------------------------- 1 | """Test suite that runs all NLTK tests. 2 | 3 | This module, `nltk.test.all`, is named as the NLTK ``test_suite`` in the 4 | project's ``setup-eggs.py`` file. Here, we create a test suite that 5 | runs all of our doctests, and return it for processing by the setuptools 6 | test harness. 7 | 8 | """ 9 | import doctest, unittest 10 | from glob import glob 11 | import os.path 12 | 13 | def additional_tests(): 14 | #print "here-000000000000000" 15 | #print "-----", glob(os.path.join(os.path.dirname(__file__), '*.doctest')) 16 | dir = os.path.dirname(__file__) 17 | paths = glob(os.path.join(dir, '*.doctest')) 18 | files = [ os.path.basename(path) for path in paths ] 19 | return unittest.TestSuite( 20 | [ doctest.DocFileSuite(file) for file in files ] 21 | ) 22 | #if os.path.split(path)[-1] != 'index.rst' 23 | # skips time-dependent doctest in index.rst 24 | -------------------------------------------------------------------------------- /resources/nltk/test/doctest_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | def float_equal(a, b, eps=1e-8): 5 | return abs(a-b) < eps 6 | -------------------------------------------------------------------------------- /resources/nltk/test/floresta.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/nltk/test/floresta.txt -------------------------------------------------------------------------------- /resources/nltk/test/grammar.doctest: -------------------------------------------------------------------------------- 1 | .. Copyright (C) 2001-2012 NLTK Project 2 | .. 
For license information, see LICENSE.TXT 3 | 4 | =============== 5 | Grammar Parsing 6 | =============== 7 | 8 | Grammars can be parsed from strings: 9 | 10 | >>> from nltk import parse_cfg 11 | >>> grammar = parse_cfg(""" 12 | ... S -> NP VP 13 | ... PP -> P NP 14 | ... NP -> Det N | NP PP 15 | ... VP -> V NP | VP PP 16 | ... Det -> 'a' | 'the' 17 | ... N -> 'dog' | 'cat' 18 | ... V -> 'chased' | 'sat' 19 | ... P -> 'on' | 'in' 20 | ... """) 21 | >>> grammar 22 | 23 | >>> grammar.start() 24 | S 25 | >>> grammar.productions() # doctest: +NORMALIZE_WHITESPACE 26 | [S -> NP VP, PP -> P NP, NP -> Det N, NP -> NP PP, VP -> V NP, VP -> VP PP, 27 | Det -> 'a', Det -> 'the', N -> 'dog', N -> 'cat', V -> 'chased', V -> 'sat', 28 | P -> 'on', P -> 'in'] 29 | 30 | Probabilistic CFGs: 31 | 32 | >>> from nltk import parse_pcfg 33 | >>> toy_pcfg1 = parse_pcfg(""" 34 | ... S -> NP VP [1.0] 35 | ... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] 36 | ... Det -> 'the' [0.8] | 'my' [0.2] 37 | ... N -> 'man' [0.5] | 'telescope' [0.5] 38 | ... VP -> VP PP [0.1] | V NP [0.7] | V [0.2] 39 | ... V -> 'ate' [0.35] | 'saw' [0.65] 40 | ... PP -> P NP [1.0] 41 | ... P -> 'with' [0.61] | 'under' [0.39] 42 | ... """) 43 | 44 | Chomsky Normal Form grammar (Test for bug 474) 45 | 46 | >>> g = parse_cfg("VP^ -> VBP NP^") 47 | >>> g.productions()[0].lhs() 48 | VP^ 49 | -------------------------------------------------------------------------------- /resources/nltk/test/japanese.doctest: -------------------------------------------------------------------------------- 1 | .. Copyright (C) 2001-2012 NLTK Project 2 | .. For license information, see LICENSE.TXT 3 | 4 | ============================ 5 | Japanese Language Processing 6 | ============================ 7 | 8 | >>> from nltk import * 9 | 10 | ------------- 11 | Corpus Access 12 | ------------- 13 | 14 | KNB Corpus 15 | ---------- 16 | 17 | Currently, the interface returns objects of the wrong type. 18 | 19 | >>> from nltk.corpus import knbc 20 | 21 | Access the words: this should produce a list of strings: 22 | 23 | >>> type(knbc.words()[0]) 24 | 25 | 26 | Access the sentences: this should produce a list of lists of strings: 27 | 28 | >>> type(knbc.sents()[0][0]) 29 | 30 | 31 | Access the tagged words: this should produce a list of word, tag pairs: 32 | 33 | >>> type(knbc.tagged_words()[0]) 34 | 35 | 36 | Access the tagged sentences: this should produce a list of lists of word, tag pairs: 37 | 38 | >>> type(knbc.tagged_sents()[0][0]) 39 | 40 | 41 | 42 | JEITA Corpus 43 | ------------ 44 | 45 | >>> from nltk.corpus import jeita 46 | 47 | Access the tagged words: this should produce a list of word, tag pairs, where a tag is a string: 48 | 49 | >>> type(jeita.tagged_words()[0][1]) 50 | 51 | -------------------------------------------------------------------------------- /resources/nltk/test/onto1.fol: -------------------------------------------------------------------------------- 1 | all x. ((boxer2 x) implies (dog x)) 2 | all x. ((boxer1 x) implies (person x)) 3 | all x. (not ((dog x) and (person x))) 4 | all x. (not ((kitchen x) and (garden x))) 5 | all x. ((kitchen x) implies (location x)) 6 | all x. 
((garden x) implies (location x)) -------------------------------------------------------------------------------- /resources/nltk/test/portuguese.doctest_latin1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/nltk/test/portuguese.doctest_latin1 -------------------------------------------------------------------------------- /resources/nltk/test/portuguese_en.doctest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/nltk/test/portuguese_en.doctest -------------------------------------------------------------------------------- /resources/nltk/test/runtests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import absolute_import 4 | import sys 5 | import os 6 | import nose 7 | 8 | NLTK_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) 9 | sys.path.insert(0, NLTK_ROOT) 10 | 11 | NLTK_TEST_DIR = os.path.join(NLTK_ROOT, 'nltk') 12 | 13 | 14 | # These tests are expected to fail. 15 | # NOTE: Remember to remove tests from this list after they have been fixed. 16 | FAILING_TESTS = [ 17 | "ccg.doctest", # This test randomly fails - nondeterministic output 18 | "collocations.doctest", 19 | "corpus.doctest", 20 | "portuguese_en.doctest", 21 | "probability.doctest", 22 | "relextract.doctest", 23 | ] 24 | 25 | # These tests require extra dependencies and should not run by default 26 | # TODO: Run the tests if the relevant dependeices are present on the system 27 | DEPENDENT_TESTS = [ 28 | # "classify.doctest", 29 | "discourse.doctest", 30 | "drt.doctest", 31 | "gluesemantics.doctest", 32 | "inference.doctest", 33 | "nonmonotonic.doctest", 34 | ] 35 | 36 | EXCLUDED_TESTS = FAILING_TESTS + DEPENDENT_TESTS 37 | _EXCLUDE_ARGV = ['--exclude='+test for test in EXCLUDED_TESTS] 38 | 39 | if __name__ == '__main__': 40 | from nltk.test.doctest_nose_plugin import DoctestFix 41 | from nose.plugins.manager import PluginManager 42 | from nose.plugins.doctests import Doctest 43 | from nose.plugins import builtin 44 | 45 | class NltkPluginManager(PluginManager): 46 | """ 47 | Nose plugin manager that replaces standard doctest plugin 48 | with a patched version. 
49 | """ 50 | def loadPlugins(self): 51 | for plug in builtin.plugins: 52 | if plug != Doctest: 53 | self.addPlugin(plug()) 54 | self.addPlugin(DoctestFix()) 55 | super(NltkPluginManager, self).loadPlugins() 56 | 57 | manager = NltkPluginManager() 58 | manager.loadPlugins() 59 | 60 | # allow passing extra options and running individual tests 61 | # Examples: 62 | # 63 | # python runtests.py semantics.doctest 64 | # python runtests.py --with-id -v 65 | # python runtests.py --with-id -v nltk.featstruct 66 | 67 | args = sys.argv[1:] 68 | if not args: 69 | args = [NLTK_TEST_DIR] 70 | 71 | if all(arg.startswith('-') for arg in args): 72 | # only extra options were passed 73 | args += [NLTK_TEST_DIR] 74 | 75 | nose.main(argv=_EXCLUDE_ARGV + [ 76 | #'--with-xunit', 77 | #'--xunit-file=$WORKSPACE/nosetests.xml', 78 | '--with-doctest', 79 | '--doctest-extension=.doctest', 80 | '--doctest-options=+ELLIPSIS,+NORMALIZE_WHITESPACE,+IGNORE_EXCEPTION_DETAIL', 81 | #'--verbosity=3', 82 | ] + args, plugins=manager.plugins) 83 | -------------------------------------------------------------------------------- /resources/nltk/test/segmentation.doctest: -------------------------------------------------------------------------------- 1 | .. Copyright (C) 2001-2012 NLTK Project 2 | .. For license information, see LICENSE.TXT 3 | 4 | ========================= 5 | Text Segmentation Metrics 6 | ========================= 7 | 8 | The `nltk.metrics.segmentation` module provides a variety of 9 | *evaluation measures* which can be used for evaluating text 10 | segmentation methods 11 | 12 | A segmentation is any sequence over a vocabulary of two items 13 | (e.g. "0", "1"), where the specified boundary value is used to 14 | mark the edge of a segmentation. 15 | 16 | >>> from nltk.metrics import windowdiff, ghd, pk 17 | 18 | ---------- 19 | Windowdiff 20 | ---------- 21 | 22 | Compute the windowdiff score for a pair of segmentations. 23 | 24 | >>> s1 = "00000010000000001000000" 25 | >>> s2 = "00000001000000010000000" 26 | >>> s3 = "00010000000000000001000" 27 | >>> windowdiff(s1, s1, 3) 28 | 0 29 | >>> windowdiff(s1, s2, 3) 30 | 4 31 | >>> windowdiff(s2, s3, 3) 32 | 16 33 | 34 | 35 | ---------------------------- 36 | Generalized Hamming Distance 37 | ---------------------------- 38 | 39 | Generalized Hamming Distance may be used as an evaluation metric for 40 | text segmentation. It compares two segmentations, and returns the cost 41 | of transforming one segmentation into the other. The transformation 42 | is done though boundary insertions, deletions and shifts. Each 43 | operation may have a different cost. 44 | 45 | >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5) 46 | 0.5 47 | >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5) 48 | 2.0 49 | >>> ghd('011', '110', 1.0, 1.0, 0.5) 50 | 1.0 51 | >>> ghd('1', '0', 1.0, 1.0, 0.5) 52 | 1.0 53 | >>> ghd('111', '000', 1.0, 1.0, 0.5) 54 | 3.0 55 | >>> ghd('000', '111', 1.0, 2.0, 0.5) 56 | 6.0 57 | 58 | 59 | -------------- 60 | Befferman's Pk 61 | -------------- 62 | 63 | Beeferman's Pk was proposed as an evaluation metric for text 64 | segmentation. It takes a reference segmentation as first argument, an 65 | hypothesis segmentation as second argument. It returns the 66 | propability that randomly chosen pair of words a distance of k words 67 | is inconsistently classified. 
68 | 69 | >>> print pk('1000100', '1000100', 3) 70 | 0.0 71 | >>> print pk('100', '010', 2) 72 | 0.5 73 | >>> print pk('100100', '111111', 2) 74 | 0.64 75 | >>> print pk('100100', '000000', 2) 76 | 0.04 77 | >>> print pk('100100', '111111', 3) 78 | 0.25 79 | >>> print pk('100100', '000000', 3) 80 | 0.25 81 | -------------------------------------------------------------------------------- /resources/nltk/test/sem3.cfg: -------------------------------------------------------------------------------- 1 | ####################################### 2 | # sem1.cfg 3 | ####################################### 4 | # Minimal feature-based grammar with determiner semantics. 5 | 6 | 7 | % start S 8 | 9 | S[sem=?vp] -> NP[sem=?np] VP[subj=?np, sem=?vp] 10 | VP[sem=?v, subj=?np] -> IV[sem=?v, subj=?np] 11 | NP[sem=[index='k',name='kim']] -> 'Kim' 12 | IV[sem=[rel='bark', arg=?i], subj=[sem=[index=?i]]] -> 'barks' 13 | #IV[fsem=[rel='bark', arg=(1)[]], subj=[fsem=[index->(1)]]] -> 'barks' 14 | 15 | -------------------------------------------------------------------------------- /resources/nltk/test/simple.doctest: -------------------------------------------------------------------------------- 1 | .. Copyright (C) 2001-2012 NLTK Project 2 | .. For license information, see LICENSE.TXT 3 | 4 | ================= 5 | EasyInstall Tests 6 | ================= 7 | 8 | This file contains some simple tests that will be run by EasyInstall in 9 | order to test the installation when NLTK-Data is absent. 10 | 11 | >>> from nltk.test.doctest_utils import * 12 | 13 | ------------ 14 | Tokenization 15 | ------------ 16 | 17 | >>> from nltk.tokenize import wordpunct_tokenize 18 | >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n" 19 | ... "two of them.\n\nThanks.") 20 | >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE 21 | ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 22 | 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] 23 | 24 | ------- 25 | Metrics 26 | ------- 27 | 28 | >>> from nltk.metrics import precision, recall, f_measure 29 | >>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split() 30 | >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() 31 | >>> reference_set = set(reference) 32 | >>> test_set = set(test) 33 | >>> precision(reference_set, test_set) 34 | 1.0 35 | >>> float_equal(recall(reference_set, test_set), 0.8) 36 | True 37 | >>> float_equal(f_measure(reference_set, test_set), 0.88888888888888) 38 | True 39 | 40 | ------------------ 41 | Feature Structures 42 | ------------------ 43 | 44 | >>> from nltk import FeatStruct 45 | >>> fs1 = FeatStruct(PER=3, NUM='pl', GND='fem') 46 | >>> fs2 = FeatStruct(POS='N', AGR=fs1) 47 | >>> print fs2 48 | [ [ GND = 'fem' ] ] 49 | [ AGR = [ NUM = 'pl' ] ] 50 | [ [ PER = 3 ] ] 51 | [ ] 52 | [ POS = 'N' ] 53 | >>> print fs2['AGR'] 54 | [ GND = 'fem' ] 55 | [ NUM = 'pl' ] 56 | [ PER = 3 ] 57 | >>> print fs2['AGR']['PER'] 58 | 3 59 | 60 | ------- 61 | Parsing 62 | ------- 63 | 64 | >>> from nltk.parse.rd import RecursiveDescentParser, parse_cfg 65 | >>> grammar = parse_cfg(""" 66 | ... S -> NP VP 67 | ... PP -> P NP 68 | ... NP -> 'the' N | N PP | 'the' N PP 69 | ... VP -> V NP | V PP | V NP PP 70 | ... N -> 'cat' | 'dog' | 'rug' 71 | ... V -> 'chased' 72 | ... P -> 'on' 73 | ... """) 74 | >>> rd = RecursiveDescentParser(grammar) 75 | >>> sent = 'the cat chased the dog on the rug'.split() 76 | >>> for t in rd.nbest_parse(sent): 77 | ... 
print t 78 | (S 79 | (NP the (N cat)) 80 | (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug)))))) 81 | (S 82 | (NP the (N cat)) 83 | (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug))))) 84 | 85 | -------------------------------------------------------------------------------- /resources/nltk/test/tag.doctest: -------------------------------------------------------------------------------- 1 | .. Copyright (C) 2001-2012 NLTK Project 2 | .. For license information, see LICENSE.TXT 3 | 4 | Regression Tests 5 | ~~~~~~~~~~~~~~~~ 6 | 7 | Sequential Taggers 8 | ------------------ 9 | 10 | Add tests for: 11 | - make sure backoff is being done correctly. 12 | - make sure ngram taggers don't use previous sentences for context. 13 | - make sure ngram taggers see 'beginning of the sentence' as a 14 | unique context 15 | - make sure regexp tagger's regexps are tried in order 16 | - train on some simple examples, & make sure that the size & the 17 | generated models are correct. 18 | - make sure cutoff works as intended 19 | - make sure that ngram models only exclude contexts covered by the 20 | backoff tagger if the backoff tagger gets that context correct at 21 | *all* locations. 22 | 23 | Brill Tagger 24 | ------------ 25 | - test that fast & normal trainers get identical results when 26 | deterministic=True is used. 27 | - check on some simple examples to make sure they're doing the 28 | right thing. 29 | 30 | Make sure that get_neighborhoods is implemented correctly -- in 31 | particular, given *index*, it should return the indices *i* such that 32 | applicable_rules(token, i, ...) depends on the value of the 33 | *index*\ th token. There used to be a bug where this was swapped -- 34 | i.e., it calculated the values of *i* such that 35 | applicable_rules(token, index, ...) depended on *i*. 36 | 37 | >>> from nltk.tag.brill import ProximateTokensTemplate, ProximateWordsRule 38 | >>> t = ProximateTokensTemplate(ProximateWordsRule, (2,3)) 39 | >>> for i in range(10): 40 | ... print sorted(t.get_neighborhood('abcdefghijkl', i)) 41 | [0] 42 | [1] 43 | [0, 2] 44 | [0, 1, 3] 45 | [1, 2, 4] 46 | [2, 3, 5] 47 | [3, 4, 6] 48 | [4, 5, 7] 49 | [5, 6, 8] 50 | [6, 7, 9] 51 | 52 | -------------------------------------------------------------------------------- /resources/nltk/test/toy.cfg: -------------------------------------------------------------------------------- 1 | S -> NP VP 2 | PP -> P NP 3 | NP -> Det N | NP PP 4 | VP -> V NP | VP PP 5 | Det -> 'a' | 'the' 6 | N -> 'dog' | 'cat' 7 | V -> 'chased' | 'sat' 8 | P -> 'on' | 'in' 9 | 10 | -------------------------------------------------------------------------------- /resources/nltk/test/util.doctest: -------------------------------------------------------------------------------- 1 | .. Copyright (C) 2001-2012 NLTK Project 2 | .. For license information, see LICENSE.TXT 3 | 4 | ================= 5 | Utility functions 6 | ================= 7 | 8 | >>> from nltk.util import * 9 | >>> from nltk.tree import Tree 10 | 11 | >>> print_string("This is a long string, therefore it should break", 25) 12 | This is a long string, 13 | therefore it should break 14 | 15 | >>> re_show("[a-z]+", "sdf123") 16 | {sdf}123 17 | 18 | >>> tree = Tree(5, 19 | ... [Tree(4, [Tree(2, [1, 3])]), 20 | ... Tree(8, [Tree(6, [7]), 9])]) 21 | >>> for x in breadth_first(tree): 22 | ... if isinstance(x, int): print x 23 | ... else: print x.node 24 | 5 25 | 4 26 | 8 27 | 2 28 | 6 29 | 9 30 | 1 31 | 3 32 | 7 33 | >>> for x in breadth_first(tree, maxdepth=2): 34 | ... 
if isinstance(x, int): print x 35 | ... else: print x.node 36 | 5 37 | 4 38 | 8 39 | 2 40 | 6 41 | 9 42 | 43 | >>> invert_dict({1: 2}) 44 | defaultdict(, {2: 1}) 45 | 46 | >>> invert_dict({1: [3, 4, 5]}) 47 | defaultdict(, {3: [1], 4: [1], 5: [1]}) 48 | 49 | Testing HTML cleaning 50 | --------------------- 51 | 52 | >>> html = """ 53 | ...
54 | ...
65 | ... 73 | ... 79 | ... 82 | ... 83 | ... """ 84 | >>> [link.strip() for link in re.split("\n+", clean_html(html))] 85 | ['Skip Links', 'AOL', 'My AOL', 'Mail', '', '', 'Get The All-Amer... Ringtones'] 86 | >>> clean_html("

Heading

Test

") 87 | 'Heading Test' 88 | >>> clean_html(" aaa

bbb ") 89 | 'aaa bbb' 90 | -------------------------------------------------------------------------------- /resources/nltk/tokenize/api.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Tokenizer Interface 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # Steven Bird 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | """ 10 | Tokenizer Interface 11 | """ 12 | 13 | from nltk.internals import overridden 14 | from nltk.tokenize.util import string_span_tokenize 15 | 16 | class TokenizerI(object): 17 | """ 18 | A processing interface for tokenizing a string. 19 | Subclasses must define ``tokenize()`` or ``batch_tokenize()`` (or both). 20 | """ 21 | def tokenize(self, s): 22 | """ 23 | Return a tokenized copy of *s*. 24 | 25 | :rtype: list of str 26 | """ 27 | if overridden(self.batch_tokenize): 28 | return self.batch_tokenize([s])[0] 29 | else: 30 | raise NotImplementedError() 31 | 32 | def span_tokenize(self, s): 33 | """ 34 | Identify the tokens using integer offsets ``(start_i, end_i)``, 35 | where ``s[start_i:end_i]`` is the corresponding token. 36 | 37 | :rtype: iter(tuple(int, int)) 38 | """ 39 | raise NotImplementedError() 40 | 41 | def batch_tokenize(self, strings): 42 | """ 43 | Apply ``self.tokenize()`` to each element of ``strings``. I.e.: 44 | 45 | return [self.tokenize(s) for s in strings] 46 | 47 | :rtype: list(list(str)) 48 | """ 49 | return [self.tokenize(s) for s in strings] 50 | 51 | def batch_span_tokenize(self, strings): 52 | """ 53 | Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.: 54 | 55 | return [self.span_tokenize(s) for s in strings] 56 | 57 | :rtype: iter(list(tuple(int, int))) 58 | """ 59 | for s in strings: 60 | yield list(self.span_tokenize(s)) 61 | 62 | 63 | class StringTokenizer(TokenizerI): 64 | """A tokenizer that divides a string into substrings by splitting 65 | on the specified string (defined in subclasses). 66 | """ 67 | 68 | def tokenize(self, s): 69 | return s.split(self._string) 70 | 71 | def span_tokenize(self, s): 72 | for span in string_span_tokenize(s, self._string): 73 | yield span 74 | 75 | 76 | if __name__ == "__main__": 77 | import doctest 78 | doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) 79 | -------------------------------------------------------------------------------- /resources/nltk/yamltags.py: -------------------------------------------------------------------------------- 1 | """ 2 | Register YAML tags in the NLTK namespace with the YAML loader, by telling it 3 | what module and class to look for. 4 | 5 | NLTK uses simple '!' tags to mark the types of objects, but the fully-qualified 6 | "tag:nltk.org,2011:" prefix is also accepted in case anyone ends up 7 | using it. 
8 | """ 9 | 10 | #import yaml 11 | 12 | def custom_import(name): 13 | components = name.split('.') 14 | module_path = '.'.join(components[:-1]) 15 | mod = __import__(module_path) 16 | for comp in components[1:]: 17 | mod = getattr(mod, comp) 18 | return mod 19 | 20 | def metaloader(classpath): 21 | def loader(*args, **kwds): 22 | classref = custom_import(classpath) 23 | return classref.from_yaml(*args, **kwds) 24 | return loader 25 | 26 | def register_tag(tag, classpath): 27 | yaml.add_constructor(u'!'+tag, metaloader(classpath)) 28 | yaml.add_constructor(u'tag:nltk.org,2011:'+tag, 29 | metaloader(classpath)) 30 | 31 | register_tag(u'tag.Unigram', 'nltk.tag.unigram.Unigram') 32 | register_tag(u'tag.Brill', 'nltk.tag.brill.Brill') 33 | 34 | __all__ = ['custom_import', 'metaloader', 'register_tag'] 35 | -------------------------------------------------------------------------------- /resources/papers/acl-02/.cvsignore: -------------------------------------------------------------------------------- 1 | *.aux 2 | *.dvi 3 | *.log 4 | *.ps 5 | *.bbl 6 | *.blg 7 | *.pdf 8 | -------------------------------------------------------------------------------- /resources/papers/acl-02/Makefile: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Technical report Makefile 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | ############################################## 9 | ## The name of the report 10 | REPORT = acl02 11 | 12 | help: usage 13 | usage: 14 | @echo 15 | @echo make '[dvi | ps | pdf | clean]' 16 | @echo 17 | 18 | # We're using bibtex: 19 | $(REPORT).dvi: $(REPORT).bbl 20 | BIBFILE = nltk.bib 21 | 22 | ############################################## 23 | ## Figure dependancies 24 | 25 | 26 | ############################################## 27 | ## You shouldn't have to change anything below here. 28 | 29 | # Find the name of the dvi and ps files. 30 | DVI := $(REPORT).dvi 31 | PS := $(REPORT).ps 32 | PDF := $(REPORT).pdf 33 | 34 | # Top-level rules. 
35 | dvi: $(DVI) 36 | ps: $(PS) 37 | pdf: $(PDF) 38 | clean: 39 | rm -f *.log *.aux *.dvi *.ps *.toc *.pdf *.bbl *.blg 40 | 41 | %.bbl: %.tex $(BIBFILE) 42 | latex $*.tex || (rm -f $*.dvi && false) 43 | bibtex $* || (rm -f $*.dvi $@ && false) 44 | 45 | %.dvi: %.tex 46 | latex $*.tex || (rm -f $@ && false) 47 | latex $*.tex || (rm -f $@ && false) 48 | 49 | %.ps: %.dvi 50 | dvips -t letter -o $@ $< -G0 -Ppdf 51 | 52 | %.eps: %.dot 53 | dot -Tps -o $@ $< 54 | 55 | %.eps: %.obj 56 | tgif -print -eps $< 57 | 58 | %.pdf: %.ps 59 | ps2pdf -sPAPERSIZE=letter -dMaxSubsetPct=100 \ 60 | -dCompatibilityLevel=1.2 -dSubsetFonts=true \ 61 | -dEmbedAllFonts=true $< $@ 62 | 63 | -------------------------------------------------------------------------------- /resources/papers/acl-02/chartparse.eps.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/papers/acl-02/chartparse.eps.gz -------------------------------------------------------------------------------- /resources/papers/acl-02/contest.ps.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/papers/acl-02/contest.ps.gz -------------------------------------------------------------------------------- /resources/papers/acl-04/.cvsignore: -------------------------------------------------------------------------------- 1 | ! 2 | #* 3 | *.aux 4 | *.dvi 5 | *.eps 6 | *.log 7 | *.pdf 8 | *.ps 9 | *.toc 10 | *~ 11 | acl04.bbl 12 | acl04.blg 13 | -------------------------------------------------------------------------------- /resources/papers/acl-04/Makefile: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Technical report Makefile 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | ############################################## 9 | ## The name of the report 10 | REPORT = acl04 11 | 12 | ############################################## 13 | ## Figure dependancies 14 | 15 | ############################################## 16 | ## You shouldn't have to change anything below here. 17 | 18 | # Find the name of the dvi and ps files. 19 | DVI := $(REPORT).dvi 20 | PS := $(REPORT).ps 21 | PDF := $(REPORT).pdf 22 | 23 | # Top-level rules. 
24 | dvi: $(DVI) 25 | ps: $(PS) 26 | pdf: $(PDF) 27 | clean: 28 | rm -f *.eps *.log *.aux *.dvi *.ps *.toc *.pdf 29 | 30 | # General rules 31 | %.dvi: %.tex 32 | latex $< 33 | latex $< 34 | 35 | %.ps: %.dvi 36 | dvips -t letter -o $@ $< 37 | 38 | %.eps: %.dot 39 | dot -Tps -o $@ $< 40 | 41 | %.eps: %.obj 42 | tgif -print -eps $< 43 | 44 | %.pdf: %.ps 45 | ps2pdf $< $@ 46 | -------------------------------------------------------------------------------- /resources/papers/acl-04/chart-matrix.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/papers/acl-04/chart-matrix.gif -------------------------------------------------------------------------------- /resources/papers/acl-04/chart.eps.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/papers/acl-04/chart.eps.gz -------------------------------------------------------------------------------- /resources/papers/acl-04/nltk.bib: -------------------------------------------------------------------------------- 1 | 2 | @Book{Rossum03intro, 3 | author = {Guido Van Rossum}, 4 | title = {An Introduction to Python}, 5 | publisher = {Network Theory Ltd}, 6 | year = 2003 7 | } 8 | 9 | @Book{Rossum03ref, 10 | author = {Guido Van Rossum}, 11 | title = {The Python Language Reference}, 12 | publisher = {Network Theory Ltd}, 13 | year = 2003 14 | } 15 | 16 | @InProceedings{LoperBird02, 17 | author = {Edward Loper and Steven Bird}, 18 | title = {{NLTK: The Natural Language Toolkit}}, 19 | booktitle = {Proceedings of the ACL Workshop on Effective Tools and 20 | Methodologies for Teaching Natural Language Processing and Computational 21 | Linguistics}, 22 | year = 2002, 23 | publisher={Somerset, NJ: Association for Computational Linguistics}, 24 | pages={62--69}, 25 | note = {\url{http://arXiv.org/abs/cs/0205028}}, 26 | } 27 | 28 | @InProceedings{Loper04, 29 | author = {Edward Loper}, 30 | title = {{NLTK}: Building a Pedagogical Toolkit in {Python}}, 31 | booktitle = {PyCon DC 2004}, 32 | year = 2004, 33 | publisher = {Python Software Foundation}, 34 | note = {\url{http://www.python.org/pycon/dc2004/papers/}} 35 | } 36 | 37 | @Misc{tkinter, 38 | author = {Fredrik Lundh}, 39 | title = {An Introduction to Tkinter}, 40 | note = {\url{http://www.pythonware.com/library/tkinter/introduction/index.htm}}, 41 | year = 1999 42 | } 43 | 44 | @Misc{epydoc, 45 | author = {Edward Loper}, 46 | title = {Epydoc}, 47 | year = 2002, 48 | note = {\url{http://epydoc.sourceforge.net/}} 49 | } 50 | 51 | -------------------------------------------------------------------------------- /resources/papers/acl-06/rdparser.eps.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/papers/acl-06/rdparser.eps.gz -------------------------------------------------------------------------------- /resources/papers/acl-06/srparser.eps.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/papers/acl-06/srparser.eps.gz -------------------------------------------------------------------------------- /resources/papers/acl-08/grammar1.py: 
-------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | def parse(sent, grammar): 4 | gr = nltk.cfg.parse_cfg(grammar) 5 | parser = nltk.ChartParser(gr, nltk.parse.TD_STRATEGY) 6 | trees = parser.nbest_parse(sent.split()) 7 | nltk.draw.draw_trees(*trees) 8 | 9 | grammar = """ 10 | S -> NP VP 11 | VP -> V NP | VP PP 12 | NP -> Det N | NP PP 13 | PP -> P NP 14 | NP -> 'I' 15 | Det -> 'the' | 'a' | 'my' 16 | N -> 'elephant' | 'pajamas' | 'man' | 'park' | 'telescope' 17 | V -> 'shot' | 'saw' 18 | P -> 'in' | 'on' | 'with' 19 | """ 20 | 21 | sent = 'I shot the elephant in my pajamas' 22 | parse(sent, grammar) 23 | -------------------------------------------------------------------------------- /resources/papers/acl-08/grammar2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | def parse(sent, grammar): 4 | gr = nltk.cfg.parse_cfg(grammar) 5 | parser = nltk.ChartParser(gr, nltk.parse.TD_STRATEGY) 6 | trees = parser.nbest_parse(sent.split()) 7 | nltk.draw.draw_trees(*trees) 8 | 9 | grammar = """ 10 | S -> NP VP 11 | VP -> V NP | VP PP 12 | NP -> Det N | NP PP 13 | PP -> P NP 14 | NP -> 'I' 15 | Det -> 'the' | 'a' | 'my' 16 | N -> 'elephant' | 'pajamas' | 'man' | 'park' | 'telescope' 17 | V -> 'shot' | 'saw' 18 | P -> 'in' | 'on' | 'with' 19 | """ 20 | 21 | sent = 'I saw the man in the park with a telescope' 22 | parse(sent, grammar) 23 | -------------------------------------------------------------------------------- /resources/papers/acl-08/police.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | def parse(sent, grammar): 4 | gr = nltk.cfg.parse_cfg(grammar) 5 | parser = nltk.ChartParser(gr, nltk.parse.TD_STRATEGY) 6 | trees = parser.nbest_parse(sent.split()) 7 | nltk.draw.draw_trees(*trees) 8 | 9 | grammar = """ 10 | S -> NP V NP 11 | NP -> NP Sbar 12 | Sbar -> NP V 13 | NP -> 'fish' | 'police' 14 | V -> 'fish' | 'police' 15 | """ 16 | 17 | sent = 'police police police police police police police police police' 18 | parse(sent, grammar) 19 | 20 | -------------------------------------------------------------------------------- /resources/papers/altw-06/altw-06.bib: -------------------------------------------------------------------------------- 1 | @Book{Blackburn:2005:RINL, 2 | author = {Patrick Blackburn and Johan Bos}, 3 | title = {Representation and Inference for Natural Language: A First Course in Computational Semantics}, 4 | publisher = {CSLI Publications}, 5 | year = 2005} 6 | 7 | 8 | @InCollection{Montague:1974:PTQ, 9 | author = {Richard Montague}, 10 | title = {The Proper Treatment of Quantification in Ordinary {E}nglish}, 11 | booktitle = {Formal Philosphy: Selected Papers of Richard Montague}, 12 | pages = {247--270}, 13 | publisher = {Yale University Press}, 14 | year = 1974, 15 | editor = {R. H. Thomason}, 16 | address = {New Haven}} 17 | 18 | @Book{Dowty:1981:IMS, 19 | author = {D. R. Dowty and R. E. Wall and S. 
Peters}, 20 | title = {Introduction to {M}ontague {S}emantics}, 21 | publisher = {Reidel}, 22 | year = 1981, 23 | series = {Studies in Linguistics and Philosophy}, 24 | address = {Dordrecht}} 25 | 26 | @InProceedings{Bird:2005:NES, 27 | author = {Steven Bird}, 28 | title = {{NLTK-Lite}: Efficient Scripting for Natural Language Processing}, 29 | booktitle = {Proceedings of the 4th International Conference on Natural Language Processing (ICON)}, 30 | pages = {11--18}, 31 | year = 2005, 32 | address = {New Delhi}, 33 | month = {December}, 34 | publisher = {Allied Publishers}} 35 | 36 | 37 | @Book{vanRossum:2006:PT, 38 | author = {Guido van Rossum}, 39 | title = {Python Tutorial}, 40 | year = 2006, 41 | month = {March}, 42 | note = {Release 2.4.3}, 43 | url = {http://docs.python.org/tut/tut.html} 44 | } 45 | 46 | 47 | @Book{Russell:2003:AIMA, 48 | author = {Stuart Russell and Peter Norvig}, 49 | title = {Artifical Intelligence: A Modern Approach}, 50 | publisher = {Prentice Hall}, 51 | year = 2003, 52 | note = {2nd edition}} 53 | 54 | -------------------------------------------------------------------------------- /resources/papers/iwcs-08/drs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/papers/iwcs-08/drs.png -------------------------------------------------------------------------------- /resources/papers/iwcs-08/garrette-klein.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/papers/iwcs-08/garrette-klein.tar.gz -------------------------------------------------------------------------------- /resources/papers/iwcs-08/modules.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/papers/iwcs-08/modules.graffle -------------------------------------------------------------------------------- /resources/papers/iwcs-08/modules.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/papers/iwcs-08/modules.pdf -------------------------------------------------------------------------------- /resources/papers/iwcs-08/nltk_iwcs_09.bib: -------------------------------------------------------------------------------- 1 | @book{Dalrymple2001, 2 | author = {Mary Dalrymple}, 3 | title = {Lexical Functional Grammar}, 4 | series = {Syntax and Semantics}, 5 | volume = {34}, 6 | publisher = {Academic Press}, 7 | address = {New York}, 8 | year = {2001} 9 | } 10 | 11 | 12 | @InCollection{Dalrymple:1999:RRB, 13 | author = {Mary Dalrymple and V. Gupta and John Lamping and V. 
Saraswat}, 14 | title = {Relating resource-based 15 | semantics to categorial semantics}, 16 | booktitle = {Semantics and syntax in {Lexical Functional Grammar}: the resource 17 | logic approach}, 18 | pages = { 261--280}, 19 | publisher = {MIT Press}, 20 | year = 1999, 21 | editor = {Mary Dalrymple}, 22 | address = {Cambridge, MA}} 23 | 24 | 25 | 26 | 27 | @book{BB, 28 | author = {Patrick Blackburn and Johan Bos}, 29 | title = {Representation and Inference for Natural Language: A First Course in Computational Semantics}, 30 | publisher = {CSLI Publications}, 31 | address = {New York}, 32 | year = {2005} 33 | } 34 | 35 | @book{KampReyle, 36 | author = {Hans Kamp and Uwe Reyle}, 37 | title = {From Discourse to the Lexicon: Introduction to Modeltheoretic Semantics of Natural Language, Formal Logic and Discourse Representation Theory}, 38 | publisher = {Kluwer Academic Publishers}, 39 | year = {1993} 40 | } 41 | 42 | @inproceedings{Multidisciplinary, 43 | author = {Steven Bird and Ewan Klein and Edward Loper and Jason Baldridge}, 44 | title = {Multidisciplinary instruction with the {Natural Language Toolkit}}, 45 | booktitle = {Proceedings of the Third Workshop on Issues in Teaching Computational Linguistics}, 46 | address = {Columbus, Ohio, USA}, 47 | month = {June}, 48 | year = {2008} 49 | } 50 | 51 | @Misc{McCune, 52 | author = {William McCune}, 53 | title = {Prover9: Automated theorem prover for first-order and equational logic}, 54 | year = 2008, 55 | note = {\url{http://www.cs.unm.edu/~mccune/mace4/manual-examples.html}} 56 | } 57 | 58 | @inproceedings{BosRTE, 59 | author = {Johan Bos and Katja Markert}, 60 | title = {Recognising textual entailment with logical inference}, 61 | booktitle = {Proceedings of the conference on Human Language Technology and Empirical Methods in Natural Language Processing}, 62 | address = {Vancouver, British Columbia, Canada}, 63 | year = {2005} 64 | } 65 | 66 | @InProceedings{Klein06altw, 67 | author = {Ewan Klein}, 68 | title = {Computational semantics in the {Natural Language Toolkit}}, 69 | booktitle = {Proceedings of the Australasian Language Technology Workshop}, 70 | pages = {26--33}, 71 | year = 2006 72 | } 73 | -------------------------------------------------------------------------------- /resources/setup.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/setup.cfg -------------------------------------------------------------------------------- /resources/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Distribute setup script for the Natural Language Toolkit 4 | # 5 | # Copyright (C) 2001-2012 NLTK Project 6 | # Author: Steven Bird 7 | # Edward Loper 8 | # Ewan Klein 9 | # URL: 10 | # For license information, see LICENSE.TXT 11 | 12 | # python2.5 compatibility 13 | from __future__ import with_statement 14 | 15 | import os 16 | 17 | # Use the VERSION file to get NLTK version 18 | version_file = os.path.join(os.path.dirname(__file__), 'nltk', 'VERSION') 19 | with open(version_file) as fh: 20 | nltk_version = fh.read().strip() 21 | 22 | import distribute_setup 23 | distribute_setup.use_setuptools() 24 | 25 | from setuptools import setup, find_packages 26 | 27 | # 28 | # Prevent setuptools from trying to add extra files to the source code 29 | # manifest by scanning the version control system for its contents. 
30 | # 31 | from setuptools.command import sdist 32 | del sdist.finders[:] 33 | 34 | setup( 35 | name = "nltk", 36 | description = "Natural Language Toolkit", 37 | version = nltk_version, 38 | url = "http://nltk.org/", 39 | long_description = """\ 40 | The Natural Language Toolkit (NLTK) is a Python package for 41 | natural language processing. NLTK requires Python 2.5 or higher.""", 42 | license = "Apache License, Version 2.0", 43 | keywords = ['NLP', 'CL', 'natural language processing', 44 | 'computational linguistics', 'parsing', 'tagging', 45 | 'tokenizing', 'syntax', 'linguistics', 'language', 46 | 'natural language', 'text analytics'], 47 | maintainer = "Steven Bird", 48 | maintainer_email = "stevenbird1@gmail.com", 49 | author = "Steven Bird", 50 | author_email = "stevenbird1@gmail.com", 51 | classifiers = [ 52 | 'Development Status :: 5 - Production/Stable', 53 | 'Intended Audience :: Developers', 54 | 'Intended Audience :: Education', 55 | 'Intended Audience :: Information Technology', 56 | 'Intended Audience :: Science/Research', 57 | 'License :: OSI Approved :: Apache Software License', 58 | 'Operating System :: OS Independent', 59 | 'Programming Language :: Python :: 2.5', 60 | 'Programming Language :: Python :: 2.6', 61 | 'Programming Language :: Python :: 2.7', 62 | 'Topic :: Scientific/Engineering', 63 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 64 | 'Topic :: Scientific/Engineering :: Human Machine Interfaces', 65 | 'Topic :: Scientific/Engineering :: Information Analysis', 66 | 'Topic :: Text Processing', 67 | 'Topic :: Text Processing :: Filters', 68 | 'Topic :: Text Processing :: General', 69 | 'Topic :: Text Processing :: Indexing', 70 | 'Topic :: Text Processing :: Linguistic', 71 | ], 72 | package_data = {'nltk': ['nltk.jar', 'test/*.doctest', 'VERSION']}, 73 | packages = find_packages(), 74 | zip_safe=False, # since normal files will be present too? 
75 | install_requires=['PyYAML>=3.09'], 76 | test_suite = 'nltk.test.simple', 77 | ) 78 | -------------------------------------------------------------------------------- /resources/tools/global_replace.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | ## Natural Language Toolkit: substitute a pattern with a replacement in every file 4 | # 5 | # Copyright (C) 2001-2012 NLTK Project 6 | # Author: Edward Loper 7 | # Steven Bird 8 | # URL: 9 | # For license information, see LICENSE.TXT 10 | 11 | # NB Should work on all platforms, http://www.python.org/doc/2.5.2/lib/os-file-dir.html 12 | 13 | import os, stat, sys 14 | 15 | def update(file, pattern, replacement, verbose=False): 16 | if verbose: 17 | print "Updating:", file 18 | 19 | # make sure we can write the file 20 | old_perm = os.stat(file)[0] 21 | if not os.access(file, os.W_OK): 22 | os.chmod(file, old_perm | stat.S_IWRITE) 23 | 24 | # write the file 25 | s = open(file, 'rb').read() 26 | t = s.replace(pattern, replacement) 27 | out = open(file, 'wb') 28 | out.write(t) 29 | out.close() 30 | 31 | # restore permissions 32 | os.chmod(file, old_perm) 33 | 34 | return s != t 35 | 36 | if __name__ == '__main__': 37 | 38 | if len(sys.argv) != 3: 39 | exit("Usage: %s " % sys.argv[0]) 40 | 41 | pattern = sys.argv[1] 42 | replacement = sys.argv[2] 43 | count = 0 44 | 45 | for root, dirs, files in os.walk('.'): 46 | if '/.git' not in root: 47 | for file in files: 48 | path = os.path.join(root, file) 49 | if update(path, pattern, replacement): 50 | print "Updated:", path 51 | count += 1 52 | 53 | print "Updated %d files" % count 54 | -------------------------------------------------------------------------------- /resources/tools/nltk_term_index.stoplist: -------------------------------------------------------------------------------- 1 | __init__ 2 | Comment 3 | Plot 4 | about 5 | add 6 | all 7 | analysis 8 | args 9 | book 10 | bubble 11 | categories 12 | close 13 | concatenate 14 | contains 15 | copy 16 | coverage 17 | defaultdict 18 | demo 19 | describe 20 | dict 21 | discourse 22 | doctype 23 | documents 24 | dump 25 | end 26 | ends 27 | fileids 28 | files 29 | find 30 | first 31 | free 32 | goal 33 | groups 34 | help 35 | incorrect 36 | insert 37 | instances 38 | items 39 | join 40 | key 41 | labels 42 | lhs 43 | line 44 | lines 45 | list 46 | lookup 47 | matches 48 | max 49 | means 50 | min 51 | missed 52 | name 53 | next 54 | nltk 55 | nltk.book 56 | open 57 | pairs 58 | play 59 | plot 60 | pop 61 | pos 62 | pp 63 | pprint 64 | prev 65 | process 66 | purge 67 | put 68 | quick 69 | raw 70 | read 71 | reader 72 | readings 73 | readme 74 | repr 75 | rhs 76 | root 77 | run 78 | second 79 | see 80 | select 81 | sentences 82 | sents 83 | set 84 | simple 85 | size 86 | sorted 87 | span 88 | start 89 | step 90 | stop 91 | str 92 | table 93 | test 94 | text 95 | texts 96 | trace 97 | type 98 | update 99 | verbs 100 | view 101 | vocab 102 | walk 103 | wav 104 | width 105 | words 106 | write 107 | -------------------------------------------------------------------------------- /resources/tools/svnmime.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # NB, this wouldn't be needed if everyone had .subversion/config 4 | # configured to automatically set mime types 5 | # http://code.google.com/p/support/wiki/FAQ 6 | 7 | import os 8 | import sys 9 | 10 | types_map = { 11 | 'ai': 'application/postscript', 12 | 'coverage': 
'text/plain', 13 | 'css': 'text/css', 14 | 'eps': 'application/postscript', 15 | 'exe': 'application/octet-stream', 16 | 'errs': 'text/plain', 17 | 'gif': 'image/gif', 18 | 'htm': 'text/html', 19 | 'html': 'text/html', 20 | 'jpeg': 'image/jpeg', 21 | 'jpg': 'image/jpeg', 22 | 'js': 'application/x-javascript', 23 | 'pbm': 'image/x-portable-bitmap', 24 | 'pdf': 'application/pdf', 25 | 'pgm': 'image/x-portable-graymap', 26 | 'pnm': 'image/x-portable-anymap', 27 | 'png': 'image/png', 28 | 'ppm': 'image/x-portable-pixmap', 29 | 'py': 'text/x-python', 30 | 'ps': 'application/postscript', 31 | 'rst': 'text/plain', 32 | 'tex': 'application/x-tex', 33 | 'txt': 'text/plain', 34 | 'xml': 'text/xml', 35 | 'xsl': 'text/plain', 36 | 'zip': 'application/zip', 37 | } 38 | 39 | def usage(): 40 | exit("Usage: svnmime files") 41 | 42 | for file in sys.argv[1:]: 43 | if "." in file: 44 | extension = file.rsplit('.', 1)[1] 45 | if extension in types_map: 46 | os.system("svn propset svn:mime-type %s %s" % (types_map[extension], file)) 47 | else: 48 | print "Unrecognized extension", extension 49 | -------------------------------------------------------------------------------- /resources/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py25,py26,py27,pypy 3 | 4 | [testenv] 5 | 6 | ; simplify numpy installation 7 | setenv = 8 | LAPACK= 9 | ATLAS=None 10 | 11 | deps = 12 | ; epydoc 13 | numpy 14 | nose 15 | svmlight 16 | 17 | 18 | changedir = nltk/test 19 | commands = 20 | ; scipy and scikit-learn requires numpy even to run setup.py so 21 | ; they can't be installed in one command 22 | 23 | pip install --download-cache={toxworkdir}/_download scipy scikit-learn 24 | python runtests.py [] 25 | 26 | [testenv:pypy] 27 | ; pysvmlight don't work with pypy; numpy is bundled with pypy. 28 | deps = 29 | epydoc 30 | nose 31 | 32 | commands = 33 | python runtests.py [] 34 | -------------------------------------------------------------------------------- /resources/web/api/nltk.rst: -------------------------------------------------------------------------------- 1 | .. manually constructed -- removed several low-level packages 2 | 3 | nltk Package 4 | ============ 5 | 6 | :mod:`nltk` Package 7 | ------------------- 8 | 9 | .. automodule:: nltk.__init__ 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | 14 | :mod:`align` Module 15 | ------------------- 16 | 17 | .. automodule:: nltk.align 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | :mod:`collocations` Module 23 | -------------------------- 24 | 25 | .. automodule:: nltk.collocations 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | :mod:`data` Module 31 | ------------------ 32 | 33 | .. automodule:: nltk.data 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | :mod:`downloader` Module 39 | ------------------------ 40 | 41 | .. automodule:: nltk.downloader 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | :mod:`featstruct` Module 47 | ------------------------ 48 | 49 | .. automodule:: nltk.featstruct 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | :mod:`grammar` Module 55 | --------------------- 56 | 57 | .. automodule:: nltk.grammar 58 | :members: 59 | :undoc-members: 60 | :show-inheritance: 61 | 62 | :mod:`help` Module 63 | ------------------ 64 | 65 | .. 
automodule:: nltk.help 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | 70 | :mod:`probability` Module 71 | ------------------------- 72 | 73 | .. automodule:: nltk.probability 74 | :members: 75 | :undoc-members: 76 | :show-inheritance: 77 | 78 | :mod:`sourcedstring` Module 79 | --------------------------- 80 | 81 | .. automodule:: nltk.sourcedstring 82 | :members: 83 | :undoc-members: 84 | :show-inheritance: 85 | 86 | :mod:`text` Module 87 | ------------------ 88 | 89 | .. automodule:: nltk.text 90 | :members: 91 | :undoc-members: 92 | :show-inheritance: 93 | 94 | :mod:`toolbox` Module 95 | --------------------- 96 | 97 | .. automodule:: nltk.toolbox 98 | :members: 99 | :undoc-members: 100 | :show-inheritance: 101 | 102 | :mod:`tree` Module 103 | ------------------ 104 | 105 | .. automodule:: nltk.tree 106 | :members: 107 | :undoc-members: 108 | :show-inheritance: 109 | 110 | :mod:`treetransforms` Module 111 | ---------------------------- 112 | 113 | .. automodule:: nltk.treetransforms 114 | :members: 115 | :undoc-members: 116 | :show-inheritance: 117 | 118 | :mod:`util` Module 119 | ------------------ 120 | 121 | .. automodule:: nltk.util 122 | :members: 123 | :undoc-members: 124 | :show-inheritance: 125 | 126 | Subpackages 127 | ----------- 128 | 129 | .. toctree:: 130 | 131 | nltk.app 132 | nltk.ccg 133 | nltk.chat 134 | nltk.chunk 135 | nltk.classify 136 | nltk.cluster 137 | nltk.corpus 138 | nltk.draw 139 | nltk.examples 140 | nltk.inference 141 | nltk.metrics 142 | nltk.misc 143 | nltk.model 144 | nltk.parse 145 | nltk.sem 146 | nltk.stem 147 | nltk.tag 148 | nltk.test 149 | nltk.tokenize 150 | 151 | -------------------------------------------------------------------------------- /resources/web/data.rst: -------------------------------------------------------------------------------- 1 | Installing NLTK Data 2 | ==================== 3 | 4 | NLTK comes with many corpora, toy grammars, trained models, etc. A complete list is posted at: http://nltk.org/nltk_data/ 5 | 6 | To install the data, first install NLTK (see http://nltk.org/install.html), then use NLTK's data downloader as described below. 7 | 8 | Apart from individual data packages, you can download the entire collection (using "all"), or just the data required for the examples and exercises in the book (using "book"), or just the corpora and no grammars or trained models (using "all-corpora"). 9 | 10 | Interactive installer 11 | --------------------- 12 | 13 | *For central installation on a multi-user machine, do the following from an administrator account.* 14 | 15 | Run the Python interpreter and type the commands: 16 | 17 | >>> import nltk 18 | >>> nltk.download() 19 | 20 | A new window should open, showing the NLTK Downloader. Click on the File menu and select Change Download Directory. For central installation, set this to ``C:\nltk_data`` (Windows), or ``/usr/share/nltk_data`` (Mac, Unix). Next, select the packages or collections you want to download. 21 | 22 | If you did not install the data to one of the above central locations, you will need to set the ``NLTK_DATA`` environment variable to specify the location of the data. (On a Windows machine, right click on "My Computer" then select ``Properties > Advanced > Environment Variables > User Variables > New...``) 23 | 24 | Test that the data has been installed as follows. (This assumes you downloaded the Brown Corpus): 25 | 26 | >>> from nltk.corpus import brown 27 | >>> brown.words() 28 | ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...] 
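If editing the environment is awkward (for example, in a one-off script), the
same effect can be had at run time, since ``nltk.data.path`` is the list of
directories that the data loader consults. The directory below is purely
illustrative; substitute the location you actually used:

>>> import nltk.data
>>> nltk.data.path.append("/home/alice/nltk_data")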
29 | 30 | Installing via a proxy web server 31 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 32 | 33 | If your web connection uses a proxy server, you should specify the proxy address as follows. In the case of an authenticating proxy, specify a username and password. If the proxy is set to None, this function will attempt to detect the system proxy. 34 | 35 | >>> nltk.set_proxy('http://proxy.example.com:3128', ('USERNAME', 'PASSWORD')) 36 | >>> nltk.download() 37 | 38 | Command line installation 39 | ------------------------- 40 | 41 | The downloader will search for an existing ``nltk_data`` directory to install NLTK data. If one does not exist, it will attempt to create one in a central location (when using an administrator account) or otherwise in the user's filespace. If necessary, run the download command from an administrator account, or using sudo. The default system location on Windows is ``C:\nltk_data``; on Mac and Unix it is ``/usr/share/nltk_data``. You can use the ``-d`` flag to specify a different location (but if you do this, be sure to set the ``NLTK_DATA`` environment variable accordingly). 42 | 43 | Python 2.5-2.7: Run the command ``python -m nltk.downloader all``. To ensure central installation, run the command ``sudo python -m nltk.downloader -d /usr/share/nltk_data all``. 44 | 45 | Windows: Use the "Run..." option on the Start menu. Windows Vista users need to first turn on this option, using ``Start -> Properties -> Customize`` to check the box to activate the "Run..." option. 46 | 47 | Test the installation: Check that the user environment and privileges are set correctly by logging in to a user account, 48 | starting the Python interpreter, and accessing the Brown Corpus (see the previous section). 49 | 50 | -------------------------------------------------------------------------------- /resources/web/dev/local_testing.rst: -------------------------------------------------------------------------------- 1 | NLTK testing 2 | ============ 3 | 4 | 1. Obtain the nltk source code; 5 | 2. install virtualenv and tox:: 6 | 7 | pip install virtualenv 8 | pip install tox 9 | 10 | 3. make sure the python2.5, python2.6, python2.7 and pypy executables are 11 | in the system PATH. It is OK not to have all of the executables; tests will 12 | be executed for the available interpreters. 13 | 14 | 4. Make sure all NLTK data is downloaded (see `nltk.download()`); 15 | 16 | 5. run the 'tox' command from the root nltk folder. It will install dependencies 17 | and run the `nltk/test/runtests.py` script for all available interpreters. 18 | You may pass any options to the runtests.py script by separating them with '--'. 19 | 20 | It may take a long time on the first run, but subsequent runs will be much faster. 21 | Please consult http://tox.testrun.org/ for more info about the tox tool.
22 | 23 | Examples 24 | -------- 25 | 26 | Run tests for python 2.7 in verbose mode; executing only tests 27 | that failed in the last test run:: 28 | 29 | tox -e py27 -- -v --failed 30 | 31 | 32 | Run tree doctests for all available interpreters:: 33 | 34 | tox -- tree.doctest 35 | 36 | -------------------------------------------------------------------------------- /resources/web/images/book.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/web/images/book.gif -------------------------------------------------------------------------------- /resources/web/images/tree.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rplevy/clojure-nltk/b4119d66b21f3c619ca4d385f605417901e47349/resources/web/images/tree.gif -------------------------------------------------------------------------------- /resources/web/index.rst: -------------------------------------------------------------------------------- 1 | Natural Language Toolkit 2 | ======================== 3 | 4 | NLTK is a leading platform for building Python programs to work with human language data. 5 | It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, 6 | along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning. 7 | 8 | Thanks to a hands-on guide introducing programming fundamentals alongside topics in computational linguistics, 9 | NLTK is suitable for linguists, engineers, students, educators, researchers, and industry users alike. 10 | NLTK is available for Windows, Mac OS X, and Linux. Best of all, NLTK is a free, open source, community-driven project. 11 | 12 | NLTK has been called "a wonderful tool for teaching, and working in, computational linguistics using Python," 13 | and "an amazing library to play with natural language." 14 | 15 | `Natural Language Processing with Python `_ provides a practical 16 | introduction to programming for language processing. 17 | Written by the creators of NLTK, it guides the reader through the fundamentals 18 | of writing Python programs, working with corpora, categorizing text, analyzing linguistic structure, 19 | and more. 20 | 21 | Some simple things you can do with NLTK 22 | --------------------------------------- 23 | 24 | Tokenize and tag some text: 25 | 26 | >>> import nltk 27 | >>> sentence = """At eight o'clock on Thursday morning 28 | ... Arthur didn't feel very good.""" 29 | >>> tokens = nltk.word_tokenize(sentence) 30 | >>> tokens 31 | ['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', 32 | 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.'] 33 | >>> tagged = nltk.pos_tag(tokens) 34 | >>> tagged[0:6] 35 | [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), 36 | ('Thursday', 'NNP'), ('morning', 'NN')] 37 | 38 | Identify named entities: 39 | 40 | >>> entities = nltk.chunk.ne_chunk(tagged) 41 | >>> entities 42 | Tree('S', [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), 43 | ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN'), 44 | Tree('PERSON', [('Arthur', 'NNP')]), 45 | ('did', 'VBD'), ("n't", 'RB'), ('feel', 'VB'), 46 | ('very', 'RB'), ('good', 'JJ'), ('.', '.')]) 47 | 48 | Display a parse tree: 49 | 50 | .. 
doctest:: 51 | :options: +SKIP 52 | 53 | >>> from nltk.corpus import treebank 54 | >>> t = treebank.parsed_sents('wsj_0001.mrg')[0] 55 | >>> t.draw() 56 | 57 | .. image:: images/tree.gif 58 | 59 | Links 60 | ----- 61 | 62 | * NLTK-Users mailing list: http://groups.google.com/group/nltk-users 63 | * NLTK's previous website: https://sites.google.com/site/naturallanguagetoolkit 64 | * NLTK development: https://github.com/nltk 65 | * NLTK-Dev mailing list: http://groups.google.com/group/nltk-dev 66 | * Publications about NLTK: http://scholar.google.com.au/scholar?q=NLTK 67 | 68 | Contents 69 | ======== 70 | 71 | .. toctree:: 72 | :maxdepth: 1 73 | 74 | news 75 | install 76 | data 77 | api/nltk 78 | 79 | * :ref:`genindex` 80 | * :ref:`modindex` 81 | * :ref:`search` 82 | -------------------------------------------------------------------------------- /resources/web/install.rst: -------------------------------------------------------------------------------- 1 | Installing NLTK 2 | =============== 3 | 4 | NLTK requires Python versions 2.5-2.7. 5 | 6 | Mac/Unix 7 | -------- 8 | 9 | #. Open ``Finder>Applications>Utilities>Terminal`` and type ``python -V`` to find out what version of Python is installed 10 | #. Install Setuptools: Download the corresponding version of Setuptools from 11 | http://pypi.python.org/pypi/setuptools (scroll to the bottom, and pick the filename that contains the right version number and which has the extension .egg). Install it by typing ``sudo sh Downloads/setuptools-...egg``, giving the location of the downloaded file. 12 | #. Install Pip: run ``sudo easy_install pip`` 13 | #. Install Numpy (optional): run ``sudo pip install -U numpy`` 14 | #. Install PyYAML and NLTK: run ``sudo pip install -U pyyaml nltk`` 15 | #. Test installation: run ``python`` then type ``import nltk`` 16 | 17 | Windows 18 | ------- 19 | 20 | These instructions assume that you do not already have Python installed on your machine. 21 | If you do, you can skip to the final step and just install NLTK. 22 | 23 | 32-bit binary installation 24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 25 | 26 | #. Install Python: http://www.python.org/download/releases/2.7.3/ 27 | #. Install Numpy (optional): http://sourceforge.net/projects/numpy/files/NumPy/1.6.2/numpy-1.6.2-win32-superpack-python2.7.exe 28 | #. Install NLTK: http://pypi.python.org/pypi/nltk 29 | #. Install PyYAML: http://pyyaml.org/wiki/PyYAML 30 | #. Test installation: ``Start>Python27``, then type ``import nltk`` 31 | 32 | Source installation (for 32-bit or 64-bit Windows) 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | #. Install Python: http://www.python.org/download/releases/2.7.3/ 36 | #. Install Numpy (optional): http://www.lfd.uci.edu/~gohlke/pythonlibs/#numpy 37 | #. Install Setuptools: http://pypi.python.org/packages/2.7/s/setuptools/setuptools-0.6c11.win32-py2.7.exe 38 | #. Install Pip: ``Start>Run... c:\Python27\Scripts\easy_install pip`` 39 | #. Install PyYAML and NLTK: ``Start>Run... c:\Python27\Scripts\pip install pyyaml nltk`` 40 | #. Test installation: ``Start>All Programs>Python27>IDLE``, then type ``import nltk`` 41 | 42 | -------------------------------------------------------------------------------- /src/clojure_nltk/core.clj: -------------------------------------------------------------------------------- 1 | (ns clojure-nltk.core 2 | (:require [clojure-python.core :as py] 3 | [clojure.java.io :as io])) 4 | 5 | (defmacro nltk-init 6 | "set up ntlk. 
currently supported usages: 7 | (nltk-init (:import foo bar baz))" 8 | [& clauses] 9 | (let [import-clauses (set (apply concat 10 | (map #(if (= :import (first %)) 11 | (rest %)) clauses)))] 12 | `(do 13 | (py/init (io/resource "nltk/")) 14 | (py/py-import-lib ~'nltk) 15 | ~@(map (fn [module] 16 | `(py/py-import-lib 17 | ~'nltk 18 | ~module)) 19 | import-clauses)))) 20 | 21 | (defmacro corpus-base [corpus-name method & params] 22 | `(py/pyobj-iterate 23 | (py/_> [~'corpus ~corpus-name ~method] 24 | ~@params))) 25 | (defmacro corpus-words [corpus-name & params] 26 | `(py/corpus-base ~corpus-name ~'words ~@params)) 27 | (defmacro corpus-categories [corpus-name & params] 28 | `(py/corpus-base ~corpus-name ~'categories ~@params)) 29 | (defmacro corpus-fileids [corpus-name & params] 30 | `(py/corpus-base ~corpus-name ~'fileids ~@params)) 31 | -------------------------------------------------------------------------------- /test/clojure_nltk/core_test.clj: -------------------------------------------------------------------------------- 1 | (ns clojure-nltk.core-test 2 | (:require [clojure-nltk.core :as base] 3 | [midje.sweet :refer :all])) 4 | 5 | (fact (base/nltk-init) => anything) 6 | --------------------------------------------------------------------------------