├── slides
│   ├── book.bib
│   ├── demos
│   │   ├── parse.py
│   │   ├── names.py
│   │   ├── similar_words_2.py
│   │   └── similar_words.py
│   ├── beamer.tex
│   ├── Makefile
│   ├── index.rst
│   └── lsa110_1.tex
├── book
│   ├── .gitignore
│   ├── corpus.txt
│   ├── dict.htm
│   ├── ch00-pt.rst
│   ├── revision.rst
│   ├── feedback.txt
│   ├── callouts
│   │   ├── callout1.gif
│   │   ├── callout2.gif
│   │   ├── callout3.gif
│   │   ├── callout4.gif
│   │   ├── callout5.gif
│   │   ├── callout6.gif
│   │   ├── callout7.gif
│   │   ├── callout8.gif
│   │   ├── callout9.gif
│   │   ├── callout10.gif
│   │   ├── callout11.gif
│   │   ├── callout12.gif
│   │   ├── callout13.gif
│   │   ├── callout14.gif
│   │   └── callout15.gif
│   ├── term_index.rst
│   ├── footer-pt.rst
│   ├── definitions-pt.rst
│   ├── dict.csv
│   ├── docbook-issues.txt
│   ├── ch6-fmla.tex
│   ├── ch12-extras.rst
│   ├── footer.rst
│   ├── bib_template.html
│   ├── book.rst
│   ├── ch02-extras.rst
│   ├── print.rst
│   ├── reprint1-4.txt
│   ├── ch05-extras.rst
│   ├── reprint1-1.txt
│   ├── errata2.txt
│   ├── copy-edits.txt
│   ├── reprint1-2.txt
│   ├── book.xml
│   ├── ch03-extras.rst
│   ├── guidelines.txt
│   ├── DOCUTILS
│   ├── regexp-defns.rst
│   ├── reprint1-3.txt
│   ├── intro-outline.txt
│   ├── introduction-code.txt
│   ├── SCHEDULE
│   ├── CheckList.txt
│   ├── image_scaling.rst
│   ├── ch01-extras.rst
│   ├── second-edition.txt
│   └── ch01-notes.rst
├── pt-br
│   ├── Makefile
│   └── index.txt
├── nltk.ppt
├── images
│   ├── T9.png
│   ├── are.png
│   ├── xp.png
│   ├── avm1.pdf
│   ├── avm1.png
│   ├── blank.png
│   ├── brent.png
│   ├── dag01.png
│   ├── dag02.png
│   ├── dag03.png
│   ├── dag04.png
│   ├── drs1.png
│   ├── drs2.png
│   ├── empty.png
│   ├── maps.png
│   ├── tally.png
│   ├── timit.png
│   ├── utf8.png
│   ├── ambig02.png
│   ├── ambig03.png
│   ├── authors.png
│   ├── cup-test.png
│   ├── dag04-1.png
│   ├── dag04-2.png
│   ├── dag04-3.png
│   ├── dialogue.png
│   ├── drs1.graffle
│   ├── drs2.graffle
│   ├── iu-mien.png
│   ├── jigsaw.png
│   ├── lee-dog.png
│   ├── lexicon.png
│   ├── maps01.png
│   ├── maps02.png
│   ├── semint.png
│   ├── tally2.png
│   ├── target.png
│   ├── unicode.png
│   ├── xp-mod.png
│   ├── ambig01-a.png
│   ├── ambig01-b.png
│   ├── cfd-gender.png
│   ├── chart_fr1.png
│   ├── chart_fr2.png
│   ├── chart_np0.png
│   ├── chart_np1.png
│   ├── chart_np2.png
│   ├── chunk-muc.png
│   ├── corpus-org.pdf
│   ├── corpus-org.png
│   ├── datatypes.png
│   ├── depgraph0.png
│   ├── dog-graph.png
│   ├── evolution.png
│   ├── fdist-moby.png
│   ├── ic_diagram.pdf
│   ├── ic_diagram.png
│   ├── inaugural.png
│   ├── inaugural2.png
│   ├── indexing01.png
│   ├── indexing02.png
│   ├── locations.png
│   ├── mergesort.png
│   ├── model_kids.png
│   ├── nl_to_fol.png
│   ├── parse_draw.png
│   ├── parse_tree.png
│   ├── pipeline1.png
│   ├── rdparser1.png
│   ├── rdparser2.png
│   ├── rdparser3.png
│   ├── rdparser4.png
│   ├── rdparser5.png
│   ├── rdparser6.png
│   ├── srparser1.png
│   ├── srparser2.png
│   ├── srparser3.png
│   ├── srparser4.png
│   ├── srparser5.png
│   ├── srparser6.png
│   ├── srparser7.png
│   ├── srparser8.png
│   ├── srparser9.png
│   ├── tag-indian.png
│   ├── tag-lookup.png
│   ├── trees_tree.png
│   ├── windowdiff.png
│   ├── 2nd_ed_plan.png
│   ├── array-memory.png
│   ├── chart_bu_ex1.png
│   ├── chart_bu_ex2.png
│   ├── chart_bu_ex3.png
│   ├── chart_bu_fr.png
│   ├── chart_bu_init.png
│   ├── chart_demo1.png
│   ├── chart_demo1.tiff
│   ├── chart_demo2.png
│   ├── chart_demo2.tiff
│   ├── chart_init0.png
│   ├── chart_init1.png
│   ├── chart_prods.png
│   ├── chart_td_ex1.png
│   ├── chart_td_ex2.png
│   ├── chart_td_ex3.png
│   ├── chart_td_ex4.png
│   ├── chart_td_init.png
│   ├── chart_wfst1.png
│   ├── chart_wfst2.png
│   ├── chunk-coref.png
│   ├── chunk-tagrep.png
│   ├── chunk-treerep.png
│   ├── decision-tree.png
│   ├── exploration.png
│   ├── findtheblock1.png
│   ├── findtheblock2.png
│   ├── findtheblock3.png
│   ├── findtheblock4.png
│   ├── mod_relation.png
│   ├── modal_genre.png
│   ├── models_admire.png
│   ├── multi-module.png
│   ├── partialtree.png
│   ├── polish-utf8.png
│   ├── quant-ambig.png
│   ├── rdparser1-6.png
│   ├── sensibility.png
│   ├── sinica-tree.png
│   ├── srparser1-6.png
│   ├── stack-queue.png
│   ├── string-memory.png
│   ├── syntax-tree.png
│   ├── tag-context.png
│   ├── word-len-dist.png
│   ├── 2nd_ed_plan.graffle
│   ├── chart_bottom_up.png
│   ├── chart_td_match1.png
│   ├── chart_td_match2.png
│   ├── chart_top_down.png
│   ├── classification.png
│   ├── corpus-org.graffle
│   ├── dialogue-90dpi.png
│   ├── drs_screenshot0.png
│   ├── drs_screenshot1.png
│   ├── findtheblock1.tiff
│   ├── findtheblock2.tiff
│   ├── findtheblock3.tiff
│   ├── findtheblock4.tiff
│   ├── ie-architecture.png
│   ├── mimo-and-bruno.jpg
│   ├── mimo-and-bruno.png
│   ├── models_walk_cf.png
│   ├── nltk-downloader.png
│   ├── string-slicing.png
│   ├── timit-structure.png
│   ├── chart_bu_predict1.png
│   ├── chart_bu_predict2.png
│   ├── chart_fundamental.png
│   ├── chart_intro_2edges.png
│   ├── chart_intro_3edges.png
│   ├── chart_intro_empty.png
│   ├── chart_positions1.png
│   ├── chart_positions2.png
│   ├── chart_td_expand1.png
│   ├── chart_td_expand2.png
│   ├── chart_useless_edge.png
│   ├── chunk-segmentation.png
│   ├── decision-tree.graffle
│   ├── feature-extraction.png
│   ├── ic_diagram_labeled.pdf
│   ├── ic_diagram_labeled.png
│   ├── models_love_cf01.png
│   ├── models_love_cf02.png
│   ├── multi-module.graffle
│   ├── naive_bayes_graph.png
│   ├── old-string-memory.png
│   ├── precision-recall.png
│   ├── recursive_parse1.png
│   ├── three-layer-arch.png
│   ├── vocabulary-growth.png
│   ├── wordnet-hierarchy.png
│   ├── words-dispersion.png
│   ├── Binary_entropy_plot.pdf
│   ├── Binary_entropy_plot.png
│   ├── chart_intro_prodedge.png
│   ├── chart_intro_selfloop.png
│   ├── chart_td_match1_alt.png
│   ├── naive-bayes-triangle.png
│   ├── naive_bayes_bargraph.png
│   ├── parse_rdparsewindow.png
│   ├── chart_intro_dottededge.png
│   ├── chart_intro_incomplete.png
│   ├── chart_intro_parseedge.png
│   ├── naive_bayes_graph.graffle
│   ├── text-corpus-structure.png
│   ├── naive-bayes-triangle.graffle
│   ├── naive_bayes_bargraph.graffle
│   ├── supervised-classification.png
│   ├── text-corpus-structure.graffle
│   ├── supervised-classification.graffle
│   ├── ambig02.py
│   ├── ambig03.py
│   ├── ambig01-a.py
│   ├── ambig01-b.py
│   ├── lee-dog.py
│   ├── polish-utf8.py
│   ├── chart_td_init.dot
│   ├── avm1.tex
│   ├── chart_intro_empty.dot
│   ├── Makefile
│   ├── chart_useless_edge.dot
│   ├── chart_intro_dottededge.dot
│   ├── chart_intro_parseedge.dot
│   ├── chart_td_match2.dot
│   ├── chart_bu_init.dot
│   ├── chart_intro_selfloop.dot
│   ├── chart_intro_2edges.dot
│   ├── chart_td_expand2.dot
│   ├── chart_bu_predict1.dot
│   ├── chart_bu_predict2.dot
│   ├── chart_td_expand1.dot
│   ├── chart_td_match1.dot
│   ├── chart_intro_incomplete.dot
│   ├── chart_intro_prodedge.dot
│   ├── chart_intro_3edges.dot
│   ├── chart_fr2.dot
│   ├── chart_fr1.dot
│   ├── Binary_entropy_plot.tex
│   ├── precision-recall.fig
│   ├── are.fig
│   ├── chunk-tagrep.fig
│   ├── chunk-segmentation.fig
│   └── chunk-treerep.fig
├── LSA325
│   ├── engineer.pdf
│   ├── data-model1.pdf
│   ├── data-model2.pdf
│   ├── data-model3.pdf
│   ├── data-model4.pdf
│   ├── evaluations.xls
│   ├── lsa325_070907.pdf
│   ├── assignment3.txt
│   ├── assignment2.txt
│   ├── assignment5.txt
│   ├── log_lc_and_functions.txt
│   ├── assignment4.txt
│   ├── LSA325_3_handout.tex
│   ├── lsa325_5.tex
│   ├── LSA325_5_handout.tex
│   ├── log_fds.txt
│   ├── lsa110_1.tex
│   └── lsa110_2.tex
├── book-pl
│   ├── footer.rst
│   └── footer-pl.rst
├── book-jp
│   ├── fig_jpime_eq.png
│   ├── fig_jpma_lattice1.png
│   └── fig_jpma_lattice2.png
├── .gitignore
├── index.html
├── surveys
│   └── 2005-10.txt
├── latexhacks.py
├── pages.py
├── index.rst
├── howto
│   ├── coverage.txt
│   ├── Makefile
│   ├── show_coverage.py
│   └── update_list.py
├── examples.py
├── rsthacks.py
├── doctest_split.py
├── archives
│   └── sourceforge-dev.txt
├── xincluder.py
├── epydoc.diff
├── xelatexsymbols.tex
├── xmlpp.py
├── HouseStyle.txt
└── definitions.sty

/slides/book.bib:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/book/.gitignore:
--------------------------------------------------------------------------------
_build/**/*
--------------------------------------------------------------------------------
/pt-br/Makefile:
--------------------------------------------------------------------------------
include ../Makefile.doc
--------------------------------------------------------------------------------
/book/corpus.txt:
--------------------------------------------------------------------------------
Hello world. This is a test file.
--------------------------------------------------------------------------------
/book/revision.rst:
--------------------------------------------------------------------------------
This document was built on
Wed 4 Sep 2019 11:25:35 ACST
--------------------------------------------------------------------------------
/images/wordnet-hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/wordnet-hierarchy.png -------------------------------------------------------------------------------- /images/words-dispersion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/words-dispersion.png -------------------------------------------------------------------------------- /images/Binary_entropy_plot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/Binary_entropy_plot.pdf -------------------------------------------------------------------------------- /images/Binary_entropy_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/Binary_entropy_plot.png -------------------------------------------------------------------------------- /images/chart_intro_prodedge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_prodedge.png -------------------------------------------------------------------------------- /images/chart_intro_selfloop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_selfloop.png -------------------------------------------------------------------------------- /images/chart_td_match1_alt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_td_match1_alt.png -------------------------------------------------------------------------------- /images/naive-bayes-triangle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/naive-bayes-triangle.png -------------------------------------------------------------------------------- /images/naive_bayes_bargraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/naive_bayes_bargraph.png -------------------------------------------------------------------------------- /images/parse_rdparsewindow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/parse_rdparsewindow.png -------------------------------------------------------------------------------- /book/term_index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. index:: :extern: 3 | 4 | .. include:: ../definitions.rst 5 | .. 
include:: footer.rst 6 | -------------------------------------------------------------------------------- /images/chart_intro_dottededge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_dottededge.png -------------------------------------------------------------------------------- /images/chart_intro_incomplete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_incomplete.png -------------------------------------------------------------------------------- /images/chart_intro_parseedge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_parseedge.png -------------------------------------------------------------------------------- /images/naive_bayes_graph.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/naive_bayes_graph.graffle -------------------------------------------------------------------------------- /images/text-corpus-structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/text-corpus-structure.png -------------------------------------------------------------------------------- /images/naive-bayes-triangle.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/naive-bayes-triangle.graffle -------------------------------------------------------------------------------- /images/naive_bayes_bargraph.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/naive_bayes_bargraph.graffle -------------------------------------------------------------------------------- /images/supervised-classification.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/supervised-classification.png -------------------------------------------------------------------------------- /images/text-corpus-structure.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/text-corpus-structure.graffle -------------------------------------------------------------------------------- /book/footer-pt.rst: -------------------------------------------------------------------------------- 1 | .. admonition:: About this translation... 2 | 3 | This translation was contributed by Tiago Tresoldi. 4 | -------------------------------------------------------------------------------- /images/supervised-classification.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/supervised-classification.graffle -------------------------------------------------------------------------------- /book/definitions-pt.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | .. ifndef:: definitions-pt 4 | 5 | .. def:: definitions-pt 6 | 7 | .. 
|PLN| replace:: PLN 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Files and directories that get built automatically 2 | 3 | *.errs 4 | *.html 5 | *.ref 6 | *.rst2 7 | 8 | pylisting 9 | tree_images 10 | 11 | revision.rst 12 | 13 | -------------------------------------------------------------------------------- /images/ambig02.py: -------------------------------------------------------------------------------- 1 | from nltk.parse import bracket_parse 2 | sent = '(S (NP the policeman)(VP (V saw)(NP (NP the burglar)(PP with a gun))))' 3 | tree = bracket_parse(sent) 4 | tree.draw() 5 | -------------------------------------------------------------------------------- /images/ambig03.py: -------------------------------------------------------------------------------- 1 | from nltk.parse import bracket_parse 2 | sent = '(S (NP the policeman)(VP (V saw)(NP the burglar)(PP with a telescope)))' 3 | tree = bracket_parse(sent) 4 | tree.draw() 5 | -------------------------------------------------------------------------------- /slides/demos/parse.py: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | ## 3 | ## Chart Parsing Demo 4 | ## 5 | 6 | import nltk.draw.chart 7 | nltk.draw.chart.demo() 8 | -------------------------------------------------------------------------------- /book/dict.csv: -------------------------------------------------------------------------------- 1 | "sleep","sli:p","v.i","a condition of body and mind ..." 2 | "walk","wo:k","v.intr","progress by lifting and setting down each foot ..." 3 | "wake","weik","intrans","cease to sleep" 4 | -------------------------------------------------------------------------------- /images/ambig01-a.py: -------------------------------------------------------------------------------- 1 | from nltk.parse import bracket_parse 2 | sent = '(S (S Kim arrived) (conj or) (S (S Dana left) (conj and) (S everyone cheered)))' 3 | tree = bracket_parse(sent) 4 | tree.draw() 5 | -------------------------------------------------------------------------------- /images/ambig01-b.py: -------------------------------------------------------------------------------- 1 | from nltk.parse import bracket_parse 2 | sent = '(S (S (S Kim arrived) (conj or) (S Dana left)) (conj and) (S everyone cheered))' 3 | tree = bracket_parse(sent) 4 | tree.draw() 5 | -------------------------------------------------------------------------------- /images/lee-dog.py: -------------------------------------------------------------------------------- 1 | from nltk.parse import bracket_parse 2 | from pprint import pprint 3 | sent = '(S (NP Lee)(VP (V saw)(NP the dog)))' 4 | tree = bracket_parse(sent) 5 | pprint(tree.pp()) 6 | #tree.draw() 7 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /book/docbook-issues.txt: -------------------------------------------------------------------------------- 1 | 2 | * Linguistic examples in chapter 10 with literal linebreaks are not 3 | correctly formatted. They should be in a larger font size, vertically aligned with 4 | the example number, and indented further.
-------------------------------------------------------------------------------- /surveys/2005-10.txt: -------------------------------------------------------------------------------- 1 | NLTK-Lite 0.5 Survey 2 | -------------------- 3 | 4 | 5 | * questions about user, interests 6 | 7 | - is their subject homepage linked from the NLTK site? 8 | 9 | 10 | * questions about existing functionality and data 11 | 12 | 13 | * questions about desired functionality and data 14 | 15 | 16 | * questions about how the person would be able to contribute 17 | 18 | -------------------------------------------------------------------------------- /latexhacks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Post-process latex output in-place 3 | 4 | import sys 5 | import re 6 | 7 | # load the file 8 | file = open(sys.argv[1]) 9 | contents = file.read() 10 | file.close() 11 | 12 | # modify it 13 | contents = re.sub(r'subsection{', r'subsection*{', contents) 14 | 15 | # save the file 16 | file= open(sys.argv[1], 'w') 17 | file.write(contents) 18 | file.close() 19 | -------------------------------------------------------------------------------- /images/polish-utf8.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | sent = """ 5 | Przewiezione przez Niemców pod koniec II wojny światowej na Dolny 6 | Śląsk, zostały odnalezione po 1945 r. na terytorium Polski. 7 | """ 8 | 9 | u = sent.decode('utf8') 10 | u.lower() 11 | print u.encode('utf8') 12 | 13 | SACUTE = re.compile('ś|Ś') 14 | replaced = re.sub(SACUTE, '[sacute]', sent) 15 | print replaced 16 | 17 | 18 | -------------------------------------------------------------------------------- /book/ch6-fmla.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \pagestyle{empty} 3 | \usepackage[verbose=true,margin=0cm,ignoreheadfoot,ignoremp, 4 | paperwidth=370pt,paperheight=180pt]{geometry} 5 | \begin{document} 6 | 7 | \begin{description} 8 | \item[Precision:] $\frac{\mathit{TP}}{\mathit{TP} + \mathit{FP}}$ 9 | 10 | \item[Recall:] $\frac{\mathit{TP}}{\mathit{TP} + \mathit{FN}}$ 11 | 12 | \item[F-Measure:] $\frac{2 \times \mathit{Precision} \times \mathit{Recall}}{\mathit{Precision} + \mathit{Recall}}$ 13 | \end{description} 14 | \end{document} 15 | -------------------------------------------------------------------------------- /slides/demos/names.py: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | ## 3 | ## Guess an unseen name's gender! 4 | ## 5 | 6 | from nltk.classify.naivebayes import NaiveBayesClassifier 7 | from nltk.classify.util import names_demo 8 | 9 | # Feature Extraction: 10 | def name_features(name): 11 | features = {} 12 | return features 13 | 14 | # Test the classifier: 15 | classifier = names_demo(NaiveBayesClassifier.train, name_features) 16 | 17 | # Feature Analysis: 18 | #classifier.show_most_informative_features() 19 | -------------------------------------------------------------------------------- /LSA325/assignment3.txt: -------------------------------------------------------------------------------- 1 | 1. Explore what kind of sequences are annotated as VP in the CONLL2000 "train" corpus data. 2 | 3 | 2. Develop a chunk.Regexp grammar to capture the regularities. 4 | 5 | 3. 
Use the trace=1 setting of the chunk parser to examine the success of your VP chunking rules. 6 | 7 | 4. Once you are reasonably happy with your rules, try evaluating them against the CONLL2000 "test" corpus data (i.e., using the chunk.accuracy() function). 8 | 9 | 5. Briefly comment on how easy or difficult it was to develop an adequate rule set. 10 | -------------------------------------------------------------------------------- /images/chart_td_init.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled, height=0.1,width=0.1, fillcolor=cadetblue]; 8 | 1 [label="0"]; 9 | } 10 | 11 | /* The sentence */ 12 | { 13 | edge [style=invis, weight=100]; 14 | node [shape=plaintext]; 15 | 1->dots2; 16 | dots2 [label="…"]; 17 | } 18 | 19 | /* Edges */ 20 | { 21 | edge [fontname=LucidaGrande]; 22 | 1->1 [label="S → • α "]; 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /LSA325/assignment2.txt: -------------------------------------------------------------------------------- 1 | 1. Define a function find_tags(item, word) which takes a section of the Brown Corpus and a word as its arguments and returns a list of the tags that occur for that word, sorted in decreasing frequency (using the FreqDist.sorted() method). E.g. find_tags('a', 'present') should return ['jj', 'rb', 'vb', 'nn']. 2 | 3 | 2. Define a function test_tagger(item, sent) which trains a bigram tagger on the specified section of the Brown Corpus, and uses it to tag sent. Write comment lines to explain why the tagger performs badly and to suggest a way performance could be improved. 4 | -------------------------------------------------------------------------------- /book/ch12-extras.rst: -------------------------------------------------------------------------------- 1 | -------------- 2 | The Holy Grail 3 | -------------- 4 | 5 | * NLP-Complete Problems: SLDS, MT 6 | (cf AI-complete) 7 | 8 | * Why they are hard 9 | 10 | * The problem of grounding. Embodied conversational agents. 11 | 12 | * Approaches: "grammar engineering" (scaling up a rule-based approach 13 | with the help of engineering methods such as grammar test suites); 14 | "grammar inference" (training on manually-checked annotated data). 15 | 16 | * Even simple problems! 
17 | ``http://itre.cis.upenn.edu/~myl/languagelog/archives/001445.html`` 18 | 19 | 20 | -------------------------------------------------------------------------------- /images/avm1.tex: -------------------------------------------------------------------------------- 1 | \documentclass[12pt]{article} 2 | \usepackage{avm} 3 | \usepackage{helvet} 4 | 5 | \avmvalfont{\it} 6 | \avmfont{\sf} 7 | \avmoptions{active,unsorted} 8 | \pagestyle{empty} 9 | \usepackage[verbose=true,margin=0cm,ignoreheadfoot,ignoremp, 10 | paperwidth=370pt,paperheight=180pt]{geometry} 11 | \begin{document} 12 | {\Huge 13 | 14 | \fontfamily{phv}\selectfont 15 | \begin{avm} 16 | [ 17 | POS & N\\ 18 | AGR & [PER & 3\\ 19 | NUM & pl\\ 20 | GND & fem 21 | ] 22 | ] 23 | \end{avm} 24 | } 25 | \end{document} -------------------------------------------------------------------------------- /pages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Natural Language Toolkit: Page length extraction script 4 | # 5 | # Copyright (C) 2001-2006 NLTK Project 6 | # Author: Steven Bird 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | r""" 11 | 12 | This script extracts the pagecount from a latex log file. 13 | 14 | """ 15 | 16 | from sys import argv 17 | from re import search 18 | 19 | regexp = r'\[(\d+)\][^\[]*$' # last [nn] in file 20 | logfile = open(argv[1]).read() # latex logfile 21 | print search(regexp, logfile).group(1) 22 | -------------------------------------------------------------------------------- /images/chart_intro_empty.dot: -------------------------------------------------------------------------------- 1 | /* -*- coding:utf-8 -*- */ 2 | 3 | digraph x { 4 | rankdir=LR; 5 | ranksep=0.25; 6 | 7 | /* The nodes */ 8 | { 9 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4]; 10 | 0 [label=""]; 11 | 1 [label=""]; 12 | 2 [label=""]; 13 | 3 [label=""]; 14 | } 15 | 16 | /* The sentence */ 17 | { 18 | edge [style=invis, weight=100]; 19 | node [shape=plaintext,fontname="Times-BoldItalic"]; 20 | 0->John->1->likes->2->Mary->3; 21 | } 22 | 23 | /* Edges */ 24 | { 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /book/footer.rst: -------------------------------------------------------------------------------- 1 | .. Footer to be used in all chapters 2 | 3 | .. admonition:: About this document... 4 | 5 | UPDATED FOR NLTK 3.0. 6 | This is a chapter from *Natural Language Processing with Python*, 7 | by |StevenBird|, |EwanKlein| and |EdwardLoper|, 8 | Copyright |copy| 2019 the authors. 9 | It is distributed with the *Natural Language Toolkit* [|NLTK-URL|], 10 | Version |version|, under the terms of the 11 | *Creative Commons Attribution-Noncommercial-No Derivative Works 3.0 United States License* 12 | [http://creativecommons.org/licenses/by-nc-nd/3.0/us/]. 13 | 14 | .. 
include:: revision.rst 15 | -------------------------------------------------------------------------------- /images/Makefile: -------------------------------------------------------------------------------- 1 | PIX = chart_bottom_up.dot chart_bu_init.dot chart_bu_predict1.dot chart_bu_predict2.dot chart_fr1.dot chart_fr2.dot chart_intro_2edges.dot chart_intro_3edges.dot chart_intro_dottededge.dot chart_intro_empty.dot chart_intro_incomplete.dot chart_intro_parseedge.dot chart_intro_prodedge.dot chart_intro_selfloop.dot chart_td_expand1.dot chart_td_expand2.dot chart_td_init.dot chart_td_match1.dot chart_td_match2.dot chart_top_down.dot chart_useless_edge.dot 2 | 3 | PNG := $(PIX:.dot=.png) 4 | 5 | .SUFFIXES: .dot .png 6 | 7 | png: $(PNG) 8 | 9 | clean: 10 | rm -f $(PNG) 11 | 12 | .dot.png: 13 | dot -Tpng $< > $@ 14 | -------------------------------------------------------------------------------- /images/chart_useless_edge.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | 4 | /* The nodes */ 5 | { 6 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue]; 7 | x [label="" style=invis]; 8 | 0 [label=""]; 9 | 1 [label=""]; 10 | 2 [label=""]; 11 | 3 [label=""]; 12 | } 13 | 14 | /* The sentence */ 15 | { 16 | edge [style=invis, weight=100]; 17 | node [shape=plaintext]; 18 | x->0->John->1->likes->2->Mary->3; 19 | } 20 | 21 | /* Edges */ 22 | { 23 | edge [fontname=LucidaGrande]; 24 | 0->0 [label="VP → • V NP"]; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /images/chart_intro_dottededge.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4]; 8 | 0 [label=""]; 9 | 1 [label=""]; 10 | 2 [label=""]; 11 | 3 [label=""]; 12 | } 13 | 14 | /* The sentence */ 15 | { 16 | edge [style=invis, weight=100]; 17 | node [shape=plaintext,fontname="Times-BoldItalic"]; 18 | 0->John->1->likes->2->Mary->3; 19 | } 20 | 21 | /* Edges */ 22 | { 23 | edge [fontname=LucidaGrande]; 24 | 1->2 [label="VP → V • NP"]; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /images/chart_intro_parseedge.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4]; 8 | 0 [label=""]; 9 | 1 [label=""]; 10 | 2 [label=""]; 11 | 3 [label=""]; 12 | } 13 | 14 | /* The sentence */ 15 | { 16 | edge [style=invis, weight=100]; 17 | node [shape=plaintext,fontname="Times-BoldItalic"]; 18 | 0->John->1->likes->2->Mary->3; 19 | } 20 | 21 | /* Edges */ 22 | { 23 | edge [fontname=LucidaGrande]; 24 | 0->3 [label="S → NP VP •"]; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /images/chart_td_match2.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue]; 8 | 1 [label="i"]; 9 | 2 [label="j"]; 10 | } 11 | 12 | /* The sentence */ 13 | { 14 | edge [style=invis, weight=100]; 15 | node [shape=plaintext]; 16 | dots1->1->dots2->2->dots3; 17 | dots1 [label="..."]; 18 | dots2 [label="..."]; 19 | dots3 
[label="..."]; 20 | } 21 | 22 | /* Edges */ 23 | { 24 | edge [fontname=LucidaGrande]; 25 | 2->2 [label="w[j] → •"]; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /images/chart_bu_init.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled, height=0.1,width=0.1,fillcolor=cadetblue]; 8 | 1 [label="i"]; 9 | 2 [label="i+1"]; 10 | } 11 | 12 | /* The sentence */ 13 | { 14 | edge [style=invis, weight=100]; 15 | node [shape=plaintext]; 16 | wi [label="w[i]"] 17 | dots1->1->wi->2->dots2; 18 | dots1 [label="..."]; 19 | dots2 [label="..."]; 20 | } 21 | 22 | /* Edges */ 23 | { 24 | edge [fontname=LucidaGrande]; 25 | 1->2 [label="w[i] → •"]; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /images/chart_intro_selfloop.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4]; 8 | 0 [label=""]; 9 | 1 [label=""]; 10 | 2 [label=""]; 11 | 3 [label=""]; 12 | } 13 | 14 | /* The sentence */ 15 | { 16 | edge [style=invis, weight=100]; 17 | node [shape=plaintext,fontname="Times-BoldItalic"]; 18 | 0->John->1->likes->2->Mary->3; 19 | } 20 | 21 | /* Edges */ 22 | { 23 | edge [fontname=LucidaGrande]; 24 | 1->1 [label="VP → • V NP"]; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /book/bib_template.html: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | Natural Language Processing: Bibliography 8 | 9 | 10 | 11 | 12 | 13 |
Bibliography
14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /images/chart_intro_2edges.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4]; 8 | 0 [label=""]; 9 | 1 [label=""]; 10 | 2 [label=""]; 11 | 3 [label=""]; 12 | } 13 | 14 | /* The sentence */ 15 | { 16 | edge [style=invis, weight=100]; 17 | node [shape=plaintext,fontname="Times-BoldItalic"]; 18 | 0->John->1->likes->2->Mary->3; 19 | } 20 | 21 | /* Edges */ 22 | { 23 | edge [fontname=lucidagrande]; 24 | 1->2 [label="V"]; 25 | 2->3 [label="NP"]; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /images/chart_td_expand2.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue]; 8 | 1 [label="i"]; 9 | 2 [label="j"]; 10 | } 11 | 12 | /* The sentence */ 13 | { 14 | edge [style=invis, weight=100]; 15 | node [shape=plaintext]; 16 | dots1->1->dots2->2->dots3; 17 | dots1 [label="…"]; 18 | dots2 [label="…"]; 19 | dots3 [label="…"]; 20 | } 21 | 22 | /* Edges */ 23 | { 24 | edge [fontname=LucidaGrande]; 25 | 2->2 [label="B → • γ"]; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /index.rst: -------------------------------------------------------------------------------- 1 | .. NLTK documentation master file, created by 2 | sphinx-quickstart on Sat Oct 8 22:36:44 2011. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to NLTK's documentation! 7 | ================================ 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 1 13 | 14 | ch00 15 | ch01 16 | ch02 17 | ch03 18 | ch04 19 | ch05 20 | ch06 21 | ch07 22 | ch08 23 | ch09 24 | ch10 25 | ch11 26 | ch12 27 | 28 | 29 | Indices and tables 30 | ================== 31 | 32 | * :ref:`genindex` 33 | * :ref:`modindex` 34 | * :ref:`search` 35 | 36 | -------------------------------------------------------------------------------- /images/chart_bu_predict1.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled, height=0.1,width=0.1,fillcolor=cadetblue]; 8 | 1 [label="i"]; 9 | 2 [label="i+1"]; 10 | } 11 | 12 | /* The sentence */ 13 | { 14 | edge [style=invis, weight=100]; 15 | node [shape=plaintext]; 16 | dots1->1->dots2->2->dots3; 17 | dots1 [label="…"]; 18 | dots2 [label="…"]; 19 | dots3 [label="…"]; 20 | } 21 | 22 | /* Edges */ 23 | { 24 | edge [fontname=LucidaGrande]; 25 | 1->2 [label="A → α •"]; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /images/chart_bu_predict2.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled, height=0.1,width=0.1,fillcolor=cadetblue]; 8 | 1 [label="i"]; 9 | 2 [label="j"]; 10 | } 11 | 12 | /* The sentence */ 13 | { 14 | edge [style=invis, weight=100]; 15 | node [shape=plaintext]; 16 | dots1->1->dots2->2->dots3; 17 | dots1 [label="…"]; 18 | dots2 [label="…"]; 19 | dots3 [label="…"]; 20 | } 21 | 22 | /* Edges */ 23 | { 24 | edge [fontname=LucidaGrande]; 25 | 1->1 [label="B → • A β"]; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /images/chart_td_expand1.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue]; 8 | 1 [label="i"]; 9 | 2 [label="j"]; 10 | } 11 | 12 | /* The sentence */ 13 | { 14 | edge [style=invis, weight=100]; 15 | node [shape=plaintext]; 16 | dots1->1->dots2->2->dots3; 17 | dots1 [label="…"]; 18 | dots2 [label="…"]; 19 | dots3 [label="…"]; 20 | } 21 | 22 | /* Edges */ 23 | { 24 | edge [fontname=LucidaGrande]; 25 | 1->2 [label="A → α • β"]; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /images/chart_td_match1.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue]; 8 | 1 [label="i"]; 9 | 2 [label="j"]; 10 | } 11 | 12 | /* The sentence */ 13 | { 14 | edge [style=invis, weight=100]; 15 | node [shape=plaintext]; 16 | dots1->1->dots2->2->dots3; 17 | dots1 [label="…"]; 18 | dots2 [label="…"]; 19 | dots3 [label="…"]; 20 | } 21 | 22 | /* Edges */ 23 | { 24 | edge [fontname=LucidaGrande]; 25 | 1->2 [label="A → α • w[j] β"]; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /images/chart_intro_incomplete.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4]; 8 | 0 [label=""]; 9 | 1 [label=""]; 10 | 2 
[label=""]; 11 | 3 [label=""]; 12 | } 13 | 14 | /* The sentence */ 15 | { 16 | edge [style=invis, weight=100]; 17 | node [shape=plaintext,fontname="Times-BoldItalic"]; 18 | 0->John->1->likes->2->Mary->3; 19 | } 20 | 21 | /* Edges */ 22 | { 23 | edge [fontname=LucidaGrande]; 24 | 1->3 [label="VP → V NP •"]; 25 | 1->2 [label="VP → V • NP"]; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /howto/coverage.txt: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | ============================= 4 | NLTK Regression Test Coverage 5 | ============================= 6 | 7 | The following table lists each NLTK module, and indicates what 8 | percentage of the module's statements are currently covered by the 9 | regression test set. To see which functions and methods are covered 10 | in a given module, click on that module. You can then click on those 11 | functions and methods to see their source code, and to check what 12 | portion of them is covered by tests. 13 | 14 | .. include:: coverage-list.txt 15 | 16 | ---- 17 | 18 | `Return to the NLTK Regression Tests <../index.html>`__ 19 | 20 | `Return to the NLTK Homepage `__ 21 | -------------------------------------------------------------------------------- /images/chart_intro_prodedge.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4]; 8 | 0 [label=""]; 9 | 1 [label=""]; 10 | 2 [label=""]; 11 | 3 [label=""]; 12 | } 13 | 14 | /* The sentence */ 15 | { 16 | edge [style=invis, weight=100]; 17 | node [shape=plaintext,fontname="Times-BoldItalic"]; 18 | 0->John->1->likes->2->Mary->3; 19 | } 20 | 21 | /* Edges */ 22 | { 23 | edge [fontname=LucidaGrande]; 24 | 1->3 [label="VP → V NP"]; 25 | 1->2 [label="V → likes"]; 26 | 2->3 [label="NP → Mary"]; 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /images/chart_intro_3edges.dot: -------------------------------------------------------------------------------- 1 | /* -*- coding:utf-8 -*- */ 2 | 3 | digraph x { 4 | rankdir=LR; 5 | ranksep=0.25; 6 | 7 | /* The nodes */ 8 | { 9 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4]; 10 | 0 [label=""]; 11 | 1 [label=""]; 12 | 2 [label=""]; 13 | 3 [label=""]; 14 | } 15 | 16 | /* The sentence */ 17 | { 18 | edge [style=invis, weight=100]; 19 | node [shape=plaintext,fontname="Times-BoldItalic"]; 20 | 0->John->1->likes->2->Mary->3; 21 | } 22 | 23 | /* Edges */ 24 | { 25 | edge [fontname=lucidagrande]; 26 | edge [weight=0]; 27 | 1->3 [label="VP"]; 28 | 1->2 [label="V"]; 29 | 2->3 [label="NP"]; 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /images/chart_fr2.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue]; 8 | 1 [label="i"]; 9 | 2 [label="j"]; 10 | 3 [label="k"]; 11 | } 12 | 13 | /* The sentence */ 14 | { 15 | edge [style=invis, weight=100]; 16 | node [shape=plaintext]; 17 | dots1->1->dots2->2->dots3->3->dots4; 18 | dots1 [label="…"]; 19 | dots2 [label="…"]; 20 | dots3 [label="…"]; 21 | dots4 [label="…"]; 22 | } 23 | 24 | /* Edges */ 25 | { 26 | edge [fontname=LucidaGrande]; 27 | 
1->3 [label="A → α B • β "]; 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /book/book.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | Natural Language Processing 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | .. def:: book 8 | .. include:: ../definitions.rst 9 | 10 | :Authors: Steven Bird, Ewan Klein, Edward Loper 11 | :Version: |version| (draft only, please send feedback to authors) 12 | :Copyright: |copy| |copyrightinfo| 13 | :License: |license| 14 | :Revision: 15 | :Date: 16 | 17 | .. contents:: 18 | :depth: 2 19 | 20 | .. preface:: 21 | .. ch00.rst 22 | 23 | .. toctree:: 24 | :maxdepth: 2 25 | 26 | ch00 27 | ch01 28 | ch02 29 | ch03 30 | ch04 31 | ch05 32 | ch06 33 | ch07 34 | ch08 35 | ch09 36 | ch10 37 | ch11 38 | ch12 39 | 40 | .. index:: 41 | 42 | .. include:: term_index.rst 43 | 44 | -------------------------------------------------------------------------------- /book/ch02-extras.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. include:: ../definitions.rst 3 | 4 | .. standard global imports 5 | 6 | >>> import nltk, re, pprint 7 | 8 | ======================================================== 9 | 2. Accessing Text Corpora and Lexical Resources (Extras) 10 | ======================================================== 11 | 12 | 13 | ------------------------------------- 14 | Language Resource Listings on the Web 15 | ------------------------------------- 16 | 17 | * http://nlp.stanford.edu/links/statnlp.html 18 | 19 | Search OLAC, the `Open Language Archives Community` 20 | 21 | * http://www.language-archives.org/ 22 | 23 | Search the archives of the "Corpora List": 24 | 25 | * http://listserv.linguistlist.org/archives/corpora.html 26 | 27 | 28 | .. 
include:: footer.rst 29 | -------------------------------------------------------------------------------- /images/chart_fr1.dot: -------------------------------------------------------------------------------- 1 | digraph x { 2 | rankdir=LR; 3 | ranksep=0.25; 4 | 5 | /* The nodes */ 6 | { 7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue]; 8 | 1 [label="i"]; 9 | 2 [label="j"]; 10 | 3 [label="k"]; 11 | } 12 | 13 | /* The sentence */ 14 | { 15 | edge [style=invis, weight=100]; 16 | node [shape=plaintext]; 17 | dots1->1->dots2->2->dots3->3->dots4; 18 | dots1 [label="…"]; 19 | dots2 [label="…"]; 20 | dots3 [label="…"]; 21 | dots4 [label="…"]; 22 | } 23 | 24 | /* Edges */ 25 | { 26 | edge [fontname=LucidaGrande]; 27 | 1->2 [label="A → α • B β"]; 28 | 2->3 [label="B → γ"]; 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /examples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Natural Language Toolkit: Example generation script 4 | # 5 | # Copyright (C) 2001-2012 NLTK Project 6 | # Author: Steven Bird 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | """ 11 | Extract the code samples from a file in restructured text format 12 | """ 13 | 14 | import sys 15 | 16 | from epydoc.markup.doctest import DoctestColorizer 17 | PROMPT_RE = DoctestColorizer.PROMPT_RE 18 | 19 | for filename in sys.argv[1:]: 20 | in_code = False 21 | for line in open(filename).readlines(): 22 | if PROMPT_RE.match(line): 23 | in_code = True 24 | print PROMPT_RE.sub('', line), 25 | 26 | elif in_code: 27 | in_code = False 28 | print 29 | -------------------------------------------------------------------------------- /slides/beamer.tex: -------------------------------------------------------------------------------- 1 | \mode<presentation> 2 | { 3 | \usetheme{Pittsburgh} 4 | \setbeamercovered{transparent} 5 | \beamerdefaultoverlayspecification{<+->} 6 | } 7 | 8 | \mode<handout> 9 | { 10 | \usetheme{default} 11 | \usecolortheme{default} 12 | \useoutertheme{default} 13 | \usepackage{pgf} 14 | \usepackage{pgfpages} 15 | % \pgfpagesuselayout{4 on 1}[a4paper,landscape,scale=0.9] 16 | \setjobnamebeamerversion{handout.beamer} 17 | } 18 | 19 | \mode<article>
20 | { 21 | \usepackage{fullpage} 22 | \usepackage{pgf} 23 | \usepackage{hyperref} 24 | \setjobnamebeamerversion{notes.beamer} 25 | } 26 | 27 | \usepackage[english]{babel} 28 | \usepackage[latin1]{inputenc} 29 | \usepackage{times} 30 | \usepackage[T1]{fontenc} 31 | 32 | \date{\today} 33 | 34 | \subject{Natural Language Toolkit} 35 | 36 | % hack since pgfex is not defined 37 | \def\pgfex{ex} 38 | 39 | -------------------------------------------------------------------------------- /images/Binary_entropy_plot.tex: -------------------------------------------------------------------------------- 1 | %Plot of information entropy of a Bernoulli variable 2 | % 3 | %latex binary_entropy_plot; dvips binary_entropy_plot 4 | %open .ps file in gimp, choose strong antialias in both text and graphics, 5 | %resolution 500, color mode, crop, scale to 45%, save as .png 6 | \documentclass[12pt]{article} 7 | \usepackage{pst-plot} 8 | \begin{document} 9 | \psset{unit=4cm} 10 | \begin{pspicture}(0,0)(1.01,1) 11 | \psgrid[gridlabels=0pt,gridcolor=lightgray,subgriddiv=10,subgridcolor=lightgray](0,0)(0,0)(1,1) 12 | \newrgbcolor{myblue}{0 0 0.7} 13 | \psaxes[arrows=->,arrowsize=2pt 4,Dx=0.5,Dy=0.5](0,0)(0,0)(1.1,1.1) 14 | \psplot[plotstyle=curve,plotpoints=100,linewidth=1.8pt,linecolor=myblue]{0.0001}{0.9999}{-1 x x log 2 log div mul 1 x sub 1 x sub log 2 log div mul add mul} 15 | \rput(0.5,-0.22){$P(male)$} 16 | \rput{90}(-0.28,0.5){$H$} 17 | \end{pspicture} 18 | \end{document} 19 | -------------------------------------------------------------------------------- /book/print.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | Natural Language Processing 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | .. def:: book 8 | .. include:: ../definitions.rst 9 | 10 | :Authors: Steven Bird, Ewan Klein, Edward Loper 11 | :Version: |version| (draft only, please send feedback to authors) 12 | :Copyright: |copy| |copyrightinfo| 13 | :License: |license| 14 | :Revision: 15 | :Date: 16 | 17 | .. contents:: 18 | :depth: 2 19 | 20 | .. preface:: 21 | .. include:: ch00.rst 22 | 23 | .. body:: 24 | .. include:: ch01.rst 25 | .. include:: ch02.rst 26 | .. include:: ch03.rst 27 | .. include:: ch04.rst 28 | .. include:: ch05.rst 29 | .. include:: ch06.rst 30 | .. include:: ch07.rst 31 | .. include:: ch08.rst 32 | .. include:: ch09.rst 33 | .. include:: ch10.rst 34 | .. include:: ch11.rst 35 | .. include:: ch12.rst 36 | 37 | .. index:: 38 | 39 | .. 
include:: term_index.rst 40 | 41 | -------------------------------------------------------------------------------- /rsthacks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Pre-process rst source 3 | 4 | from optparse import OptionParser 5 | import re 6 | 7 | _SCALE_RE = b'(:scale:\s+)(\d+):(\d+):(\d+)' 8 | 9 | def process(file, format): 10 | contents = open(file, 'rb').read() 11 | if format == "html": 12 | contents = re.sub(_SCALE_RE, r'\1\2', contents) 13 | elif format == "latex": 14 | contents = re.sub(_SCALE_RE, r'\1\3', contents) 15 | elif format == "xml": 16 | contents = re.sub(_SCALE_RE, r'\1\4', contents) 17 | open(file + "2", 'wb').write(contents) 18 | 19 | parser = OptionParser() 20 | parser.add_option("-f", "--format", dest="format", 21 | help="output format (html, latex, xml)", metavar="FMT") 22 | 23 | o, a = parser.parse_args() 24 | 25 | if o.format and o.format in ["html", "latex", "xml"] and a and len(a) == 1: 26 | process(a[0], o.format) 27 | 28 | else: 29 | exit("Must specify a format (html, latex, xml) and a filename") 30 | -------------------------------------------------------------------------------- /LSA325/assignment5.txt: -------------------------------------------------------------------------------- 1 | Write a short review (2 to 3 pages) on some aspect of the NLTK book 2 | draft (http://nltk.org/index.php/Book). What we would appreciate is a 3 | thoughtful analysis of some specific portion of the book, rather than, 4 | say, a list of typos and errors over a large portion of the book. You 5 | can focus on anything you like: for example, the presentation of key 6 | ideas, the explanation of a piece of code, or the way in which some 7 | exercises are formulated. You could even critique a piece of code. You 8 | could reflect on any key concepts you have struggled with, or the most 9 | important thing you learned, or new ways to illustrate the ideas using 10 | examples from your favorite area of linguistics, and make concrete 11 | suggestions for improving the presentation. 12 | 13 | All suggestions that we use will be acknowledged in the book's 14 | preface. Please submit your work in a plain text file, named 15 | yoursurname_review.txt, via the file upload option. 16 | 17 | -------------------------------------------------------------------------------- /pt-br/index.txt: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. include:: ../definitions.txt 3 | 4 | ====================== 5 | Tutoriais do NLTK-Lite 6 | ====================== 7 | 8 | :Autores: Steven Bird, Ewan Klein, Edward Loper 9 | :Contato: sb@csse.unimelb.edu.au 10 | :Version: |version| 11 | :Copyright: |copy| |copyrightinfo| 12 | :Licença: |license| 13 | 14 | .. _Prefácio: preface.html 15 | .. _Introdução: introduction.html 16 | .. _Programação: programming.html 17 | .. _Toquenização: tokenize.html 18 | .. _Tag: tag.html 19 | .. _Parsing: parse.html 20 | .. _Chunk: chunk.html 21 | .. _Chart: chart.html 22 | .. _PCFG: pcfg.html 23 | .. _Field: field.html 24 | .. _Regexps: regexps.html 25 | .. _Projetos: projects.html 26 | 27 | ------ 28 | Índice 29 | ------ 30 | 31 | 0. Prefácio_ 32 | #. Introdução_ 33 | #. Programação_ 34 | #. Toquenização_ 35 | #. Tag_ 36 | #. Parsing_ 37 | #. Chunk_ 38 | #. Chart_ 39 | #. PCFG_ 40 | #. Field_ 41 | #. Regexps_ 42 | #. Projetos_ 43 | 44 | ---- 45 | 46 | NLTK_ 47 | 48 | .. 
_NLTK: http://nltk.sourceforge.net/ 49 | 50 | -------------------------------------------------------------------------------- /doctest_split.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Natural Language Toolkit: Split an RST file into sections for independent doctest checking 4 | # 5 | # Copyright (C) 2001-2012 NLTK Project 6 | # Author: Steven Bird 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import sys 11 | import re 12 | 13 | EXT = "doctest" # output filename extension 14 | SEC = r"\n(?=-+\n.+\n-+\n)" # pattern to match section heading 15 | 16 | # include this at the top of each output file 17 | HDR = """ 18 | >>> import nltk, re, pprint 19 | >>> from nltk import word_tokenize 20 | """ 21 | 22 | for filename in sys.argv[1:]: 23 | contents = open(filename).read() 24 | basename, suffix = filename.split('.') 25 | for count, section in enumerate(re.split(SEC, contents)): 26 | chunk_name = "%s-%d.%s" % (basename, count+1, EXT) 27 | chunk_file = open(chunk_name, "w") 28 | chunk_file.write(HDR + "\n") 29 | chunk_file.write(section) 30 | chunk_file.close() 31 | -------------------------------------------------------------------------------- /archives/sourceforge-dev.txt: -------------------------------------------------------------------------------- 1 | nltk 1.3 documentation 2 | By: Patrick Ye (jingy) - 2004-04-14 17:12 3 | Would it be possible to package the documentation for version 1.3 so it can be downloaded easily? This would 4 | be pretty useful in case we have no internet access. 5 | 6 | Thanks a lot. 7 | 8 | Patrick 9 | 10 | 11 | RE: nltk 1.3 documentation 12 | By: Edward Loper (edloper, Project Admin) - 2004-04-15 09:11 13 | I just added it to the files page: http://sourceforge.net/project/showfiles.php?group_id=30982 14 | 15 | Sorry for the omission. 16 | 17 | 18 | nltk_contrib for WordNet 19 | By: Patrick Ye (jingy) - 2004-04-14 17:04 20 | Hi, 21 | 22 | I created a python package (with a setup.py) for interfacing python/nltk with WordNet using the C code that comes with the WordNet library. I'm not sure how to integrate this new package into nltk_contrib, i.e., should I just use the python package or unpackage it and treat it as just a directory that contains a few python files? 23 | 24 | Any help would be appreciated. 25 | 26 | Patrick 27 | 28 | -------------------------------------------------------------------------------- /xincluder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Natural Language Toolkit: Process the XIncludes of an XML document 4 | # 5 | # Copyright (C) 2001-2012 NLTK Project 6 | # Author: Steven Bird 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | import sys 11 | import re 12 | 13 | EXT = "-flat" # output filename extension 14 | XI1 = r'<xi:include href="([^"]+)"\s*/>' # an XInclude element; group 1 is the included file 15 | DOC = r'(?s)<\?xml.*?\?>' # the XML declaration at the top of a document 16 | NAMESPACE = r' xmlns:xi="http://www.w3.org/2001/XInclude"' 17 | 18 | for filename in sys.argv[1:]: 19 | basename, suffix = filename.split('.') 20 | output_filename = basename + EXT + "." 
+ suffix 21 | output = open(output_filename, "w") 22 | for line in open(filename): 23 | m = re.search(XI1, line) 24 | if m: 25 | contents = open(m.group(1)).read() 26 | if re.search(DOC, contents): 27 | contents = re.split(DOC, contents)[1] 28 | output.write(contents) 29 | else: 30 | if NAMESPACE in line: 31 | line = re.sub(NAMESPACE, '', line) 32 | output.write(line) 33 | output.close() 34 | 35 | 36 | -------------------------------------------------------------------------------- /slides/Makefile: -------------------------------------------------------------------------------- 1 | # Presentation Slides Makefile 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Steven Bird 5 | # Edward Loper 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | WEB = $(USER)@shell.sourceforge.net:/home/groups/n/nl/nltk/htdocs 10 | 11 | TEX = preface.tex introduction.tex programming.tex tag.tex chunk.tex data.tex 12 | PDF := $(TEX:.tex=.pdf) 13 | 14 | RST2HTML = ../rst.py --html 15 | 16 | RSYNC_OPTS = -lrtvz -e ssh --relative --cvs-exclude 17 | 18 | .SUFFIXES: .rst .html .tex .pdf 19 | 20 | .PHONY: all clean 21 | 22 | all: $(PDF) index.html 23 | 24 | clean: clean_up 25 | rm -f $(PDF) index.html 26 | 27 | clean_up: 28 | rm -f *.log *.aux *.snm *.vrb *.out *.nav *.toc index*.html 29 | 30 | index.html: index.rst 31 | $(RST2HTML) index.rst > index.html 32 | 33 | .tex.pdf: 34 | pdflatex $< 35 | pdflatex $< 36 | mkdir -p handouts 37 | sed 's/documentclass/documentclass[handout]/' < $< > handouts/$< 38 | pdflatex -output-directory handouts handouts/$< 39 | pdflatex -output-directory handouts handouts/$< 40 | 41 | rsync: 42 | rsync $(RSYNC_OPTS) . $(WEB)/doc/slides/ 43 | -------------------------------------------------------------------------------- /book/reprint1-4.txt: -------------------------------------------------------------------------------- 1 | p115 first line after example 3-3: maximizes s/b minimizes 2 | (from OReilly errata) 3 | 4 | p234 example 6-6: 5 | 6up: for i, word in words: s/b for i, word in enumerate(words): 6 | 5up: classifier.classify(words, i) s/b classifier.classify(punct_features(words, i)) 7 | 0up: add final line of code, indented at same level as "if" statement: return sents 8 | 9 | p243 10up formula (1) please insert minus sign between equals and uppercase sigma 10 | 11 | p309 9up "through entire list" s/b "through the entire list" 12 | (from previous list) 13 | 14 | p336 Fig 9-1 should still be a bit smaller please. 
15 | 16 | p426 1d: add opening tag on new line, after closing tag: 17 | (from OReilly errata) 18 | 19 | 20 | whale 21 | noun 22 | 23 | any of the larger cetacean mammals having a streamlined 24 | body and breathing through a blowhole on the head 25 | whale.n.02 26 | 27 | <------------------------ NEW TAG HERE 28 | a very large person; impressive in size or qualities 29 | giant.n.04 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /images/precision-recall.fig: -------------------------------------------------------------------------------- 1 | #FIG 3.2 2 | Landscape 3 | Center 4 | Metric 5 | A4 6 | 100.00 7 | Single 8 | -2 9 | 1200 2 10 | 0 32 #808080 11 | 5 1 0 1 1 7 50 -1 -1 0.000 0 0 0 0 6726.731 4118.000 6750 6750 4095 4185 6705 1485 12 | 5 1 0 1 4 7 50 -1 -1 0.000 0 0 0 0 4522.494 6276.731 1890 6300 4455 3645 7155 6255 13 | 1 3 0 4 32 7 60 -1 -1 0.000 1 0.0000 4545 4050 2655 2655 4545 4050 7200 4095 14 | 4 0 1 60 -1 0 18 0.0000 4 180 1680 7875 4005 DOCUMENTS\001 15 | 4 0 1 60 -1 0 18 0.0000 4 180 1500 7875 3555 RETRIEVED\001 16 | 4 0 4 60 -1 0 18 0.0000 4 180 2685 3195 7245 INFORMATION NEED\001 17 | 4 0 0 60 -1 0 18 0.0000 4 180 1035 5265 2700 irrelevant\001 18 | 4 0 0 60 -1 0 18 0.0000 4 180 870 2835 4950 relevant\001 19 | 4 0 0 60 -1 0 18 0.0000 4 180 1380 2655 5400 not retrieved\001 20 | 4 0 0 60 -1 0 18 0.0000 4 180 345 3240 4500 FN\001 21 | 4 0 0 60 -1 0 18 0.0000 4 180 360 3465 2250 TN\001 22 | 4 0 0 60 -1 0 18 0.0000 4 180 1035 2835 2700 irrelevant\001 23 | 4 0 0 60 -1 0 18 0.0000 4 180 1380 2475 3150 not retrieved\001 24 | 4 0 0 60 -1 0 18 0.0000 4 180 960 5130 3150 retrieved\001 25 | 4 0 0 60 -1 0 18 0.0000 4 180 300 5580 2250 FP\001 26 | 4 0 0 60 -1 0 18 0.0000 4 180 315 4815 4500 TP\001 27 | 4 0 0 60 -1 0 18 0.0000 4 180 960 5130 5400 retrieved\001 28 | 4 0 0 60 -1 0 18 0.0000 4 180 870 4905 4950 relevant\001 29 | -------------------------------------------------------------------------------- /slides/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. include:: ../definitions.rst 3 | 4 | ======================== 5 | NLTK Presentation Slides 6 | ======================== 7 | 8 | :Authors: Steven Bird 9 | :Version: |version| 10 | :Copyright: |copy| |copyrightinfo| 11 | :License: |license| 12 | 13 | -------- 14 | Contents 15 | -------- 16 | 17 | These PDF slides were produced using the LaTeX Beamer package. 18 | The source materials are also made available here; please send 19 | any improvements to Steven Bird, for inclusion in future versions. 20 | (Note these are out-of-date.) 21 | 22 | 0. Preface 23 | #. [\ `PDF slides `__\ \|\ `LaTeX-Beamer source `__\ ] Preface 24 | #. [\ `PDF slides `__\ \|\ `LaTeX-Beamer source `__\ ] Introduction 25 | #. [\ `PDF slides `__\ \|\ `LaTeX-Beamer source `__\ ] Programming 26 | #. [\ `PDF slides `__\ \|\ `LaTeX-Beamer source `__\ ] Tagging 27 | #. [\ `PDF slides `__\ \|\ `LaTeX-Beamer source `__\ ] Chunking 28 | #. [\ `PDF slides `__\ \|\ `LateX-Beamer source `__\ ] Linguistic Data Management 29 | 30 | [`beamer.tex `__\ ] File used to build slides. 31 | 32 | ---- 33 | 34 | NLTK_ 35 | 36 | .. _NLTK: http://nltk.org/ 37 | 38 | 39 | -------------------------------------------------------------------------------- /book/ch05-extras.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. include:: ../definitions.rst 3 | 4 | .. 
standard global imports 5 | 6 | >>> import nltk, re, pprint 7 | 8 | ========================================== 9 | 5. Categorizing and Tagging Words (Extras) 10 | ========================================== 11 | 12 | Tagging exhibits several properties that are characteristic of natural 13 | language processing. First, tagging involves *classification*: words have 14 | properties; many words share the same property (e.g. ``cat`` and ``dog`` 15 | are both nouns), while some words can have multiple such properties 16 | (e.g. ``wind`` is a noun and a verb). Second, in tagging, disambiguation 17 | occurs via *representation*: we augment the representation of tokens with 18 | part-of-speech tags. Third, training a tagger involves *sequence learning 19 | from annotated corpora*. Finally, tagging uses *simple, general methods* 20 | such as conditional frequency distributions and transformation-based learning. 21 | 22 | 23 | List of available taggers: 24 | ``http://www-nlp.stanford.edu/links/statnlp.html`` 25 | 26 | NLTK's HMM tagger, ``nltk.HiddenMarkovModelTagger`` 27 | 28 | [Abney1996PST]_ 29 | 30 | ``http://en.wikipedia.org/wiki/Part-of-speech_tagging`` 31 | 32 | .. Dutch example: http://www.askoxford.com/pressroom/archive/odelaunch/ 33 | -------------------------------------------------------------------------------- /book/reprint1-1.txt: -------------------------------------------------------------------------------- 1 | Communicated to O'Reilly in July 2009. 2 | 3 | Issues with Figures 4 | 5 | Fig 1.1 -- more contrast (supplied image was color) 6 | Fig 1.3 -- smaller scale 7 | Fig 2.7 -- more contrast (supplied image was color) 8 | Fig 4.3 -- inconsistent arrow style, colliding arrow heads, inconsistent arrow origins 9 | Fig 6.5 -- higher resolution (we need to supply a better image) 10 | Fig 9.1 -- smaller scale (closer in size to example (18) same page), fix broken vbars 11 | Fig 10.3 -- fix horizontal alignment (subtrees rooted at greek letter variables) 12 | Fig 11.4 -- missing subscript on s_1 13 | 14 | Errata reported on O'Reilly site (first two already noted in QC2 annotations) 15 | 16 | p46 2d: file[:4] -> fileid[:4] 17 | p88 3up: print b -> print line 18 | p92 middle: s.titlecase() -> s.title() 19 | 20 | Ch 9: DAGs all scaled too big 21 | 22 | Everywhere: No trees should have boldfaced text (except 23 | the one on p167). (fyi Ch 9 (35) was still inconsistently boldfaced) 24 | 25 | p339 (23) incorrect diagram; it should be: 26 | http://nltk.googlecode.com/svn/trunk/doc/book/ch09.html#ex-dag04 27 | 28 | p355 -- remove box from code block 29 | 30 | p382 (28) -- smaller scale 31 | 32 | bad pagebreaks: xiv, 18, 43, 76, 103, 341, 343 33 | 34 | p395 8up "core", "store" s/b uc in the SEM value of VP 35 | 36 | p467: "deve-test" -> "dev-test" 37 | 38 | -------------------------------------------------------------------------------- /book/errata2.txt: -------------------------------------------------------------------------------- 1 | Errata still present in the second printing: 2 | 3 | * sec 3.12 exercise 16: should be "from test import monty; print monty" 4 | * sec 4.2: output missing from len(t) -- issue 500 5 | 6 | * sec 4.4: append['noun'] should be append('noun') 7 | 8 | * sec 4.2: Thus, zip() takes the items of two or more sequences and "zips" them together into a single list of pairs. 9 | s/b "list of tuples". We only get pairs when two sequences are zipped. 
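  For example (Python 2 session; zip() only yields pairs when exactly two
  sequences are zipped):

    >>> zip(['a', 'b'], [1, 2])
    [('a', 1), ('b', 2)]
    >>> zip(['a', 'b'], [1, 2], ['x', 'y'])
    [('a', 1, 'x'), ('b', 2, 'y')]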
10 | 11 | * ch8: the `a`:em: "are" of column `A`:em: 12 | s/b the `a`:em: "are" of column `I`:em: 13 | 14 | * ch4: trie = nltk.defaultdict(dict) 15 | s/b trie = {} 16 | 17 | * ch4: sort them according to their ``path_distance()`` 18 | s/b sort them according to their ``shortest_path_distance()`` 19 | 20 | * ch4: [len(w) for w in nltk.corpus.brown.sents(categories='news'))] 21 | s/b [len(w) for w in nltk.corpus.brown.sents(categories='news')] 22 | 23 | * ch4: random.randint(0, %d) in vocab" % vocab_size * 2 24 | s/b random.randint(0, %d) in vocab" % (vocab_size * 2) 25 | 26 | * ch1: if the thieves are sold, ... if the paintings are sold. 27 | s/b if the thieves are found, ... if the paintings are found. 28 | 29 | * ch5: print nltk.ConfusionMatrix(gold, test) 30 | s/b print nltk.ConfusionMatrix(gold_tags, test_tags) 31 | 32 | -- 33 | 34 | * ch1: correct translation to German would have used "nach" instead of "zu" (page 30) 35 | -------------------------------------------------------------------------------- /book/copy-edits.txt: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | ====================== 4 | Copy-Edit Categories 5 | ====================== 6 | 7 | ---------------------------- 8 | 1. To be entered by O'Reilly 9 | ---------------------------- 10 | 11 | - punctuation, including use of em dashes 12 | - 'like' -> 'such as' 13 | - 'which' -> 'that' 14 | - 'while' -> 'whereas' 15 | - alternatives to 'below', 'above' 16 | - capitalization of titles and figure legends 17 | - hyphenation conventions in body of text (e.g., 'multi-line' -> 'multiline') 18 | - spelling 19 | - numerals (e.g., '5' -> 'five') 20 | 21 | 22 | 23 | ---------------------------- 24 | 2. To be entered by Authors 25 | ---------------------------- 26 | 27 | ?? 28 | 29 | 30 | ------------ 31 | 3. No Change 32 | ------------ 33 | 34 | - boldface text retained for new and important terms 35 | - constant width retained for program names, menu names, filenames, etc. 36 | 37 | ------------------ 38 | 4. To be discussed 39 | ------------------ 40 | 41 | - Retain long captions 42 | - Don't double-quote emphasized text inside captions, but set it in roman. (Why 43 | would emphasis show up as italic inside plain text, and double-quoted inside a 44 | caption?) 45 | 46 | 47 | -------- 48 | 5. Other 49 | -------- 50 | 51 | Steven mentioned: 52 | 53 | * splitting a sentence-final example off into a new verb-less sentence. 54 | 55 | Not sure what the issue is here. 56 | -------------------------------------------------------------------------------- /book/reprint1-2.txt: -------------------------------------------------------------------------------- 1 | Communicated to O'Reilly on 2009-12-04 2 | 3 | p9 16d lexical diversity() s/b lexical_diversity() -- with underscore instead of space 4 | 5 | p18 Fig 1-4 or the code that creates it needs to be fixed (currently NLTK does counts, not percentages) 6 | 7 | p46 Fig 2.1 -- more contrast (supplied image was color) 8 | 9 | p132 9up "makes detection is easier" s/b "makes detection easier" 10 | 11 | p144 16up "an empty dictionary" s/b "an empty list" 12 | 13 | p153 3d add quotes around "in-place dictionary" 14 | add following sentence: (Dictionaries will be presented in Section 5.3.) 15 | 16 | p153 bottom and 154 top -- code block spanning page break: 17 | variable "trace" should be renamed to "verbose" x4 18 | 19 | p172 3d "dendogram" s/b "dendrogram" 20 | 21 | p177 ex 33 -- move to chapter 5 (new exercise 43).
Change reference "described in chapter 5" 22 | to "described in this chapter" 23 | 24 | p336 Fig 9.1 -- larger scale (closer in size to example (18) same page), fix broken vbars 25 | (reported as too big last time, but now it is too small.) 26 | 27 | p391 6d insert space before "yields" 28 | 29 | p393 8up -- semrel s/b semrep 30 | 31 | p393 5up 32 | exists z3.(ankle(z3) & bite(cyril,z3)) 33 | s/b 34 | all z4.(boy(z4) -> see(cyril,z4)) 35 | 36 | General: Some readers report that the program line annotations (numbered bullets) 37 | are confusing in their current position. Can they be placed to the left of the line? 38 | -------------------------------------------------------------------------------- /slides/demos/similar_words_2.py: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | ## 3 | ## What words tend to co-occur? 4 | ## 5 | 6 | from nltk.probability import ConditionalFreqDist 7 | from nltk.corpus import brown 8 | 9 | ###################################################################### 10 | def build_association_distribution(): 11 | assoc = ConditionalFreqDist() 12 | 13 | # For each document in the "Brown Corpus"... 14 | for document in brown.files(): 15 | words = brown.tagged_words(document) 16 | 17 | # For each word that's a noun... 18 | for index, (word, tag) in enumerate(words): 19 | if tag.startswith('N'): 20 | 21 | # Look at any nouns in the next four words... 22 | window = words[index+1:index+5] 23 | for (window_word, window_tag) in window: 24 | if window_tag.startswith('N'): 25 | 26 | # And add them to our freq. distribution 27 | assoc[word].inc(window_word.lower()) 28 | 29 | return assoc 30 | 31 | if 'associations' not in globals(): 32 | associations = build_association_distribution() 33 | 34 | ###################################################################### 35 | def assoc(word): 36 | print '%20s -> %s' % (word, associations[word].max()) 37 | 38 | ###################################################################### 39 | assoc('man') 40 | assoc('woman') 41 | assoc('level') 42 | 43 | 44 | -------------------------------------------------------------------------------- /epydoc.diff: -------------------------------------------------------------------------------- 1 | --- /Library/Python/2.7/site-packages/epydoc/markup/restructuredtext.py~ 2008-01-28 13:15:33.000000000 -0500 2 | +++ /Library/Python/2.7/site-packages/epydoc/markup/restructuredtext.py 2012-09-23 20:59:35.000000000 -0400 3 | @@ -304,10 +304,10 @@ 4 | # Extract the first sentence.
5 | for child in node: 6 | if isinstance(child, docutils.nodes.Text): 7 | - m = self._SUMMARY_RE.match(child.data) 8 | + m = self._SUMMARY_RE.match(child) 9 | if m: 10 | summary_pieces.append(docutils.nodes.Text(m.group(1))) 11 | - other = child.data[m.end():] 12 | + other = child[m.end():] 13 | if other and not other.isspace(): 14 | self.other_docs = True 15 | break 16 | @@ -489,10 +489,10 @@ 17 | if (len(fbody[0]) > 0 and 18 | isinstance(fbody[0][0], docutils.nodes.Text)): 19 | child = fbody[0][0] 20 | - if child.data[:1] in ':-': 21 | - child.data = child.data[1:].lstrip() 22 | + if child[:1] in ':-': 23 | + child = docutils.nodes.Text(child[1:].lstrip()) 24 | elif child.data[:2] in (' -', ' :'): 25 | - child.data = child.data[2:].lstrip() 26 | + child = docutils.nodes.Text(child[2:].lstrip()) 27 | 28 | # Wrap the field body, and add a new field 29 | self._add_field(tagname, arg, fbody) 30 | -------------------------------------------------------------------------------- /xelatexsymbols.tex: -------------------------------------------------------------------------------- 1 | %&program=xelatex 2 | %&encoding=UTF-8 Unicode 3 | 4 | \newcommand{\as}[1]{{\fontspec{Apple Symbols}#1}} 5 | \newcommand{\asb}[1]{{\fontspec[Scale=1.1]{Apple Symbols}#1}} 6 | \newcommand{\ls}[1]{{\fontspec[Scale=0.9]{Lucida Grande}#1}} 7 | \def\reflect#1{{\setbox0=\hbox{#1}\rlap{\kern0.5\wd0 8 | \special{x:gsave}\special{x:scale -1 1}}\box0 \special{x:grestore}}} 9 | \def\XeLaTeX{\leavevmode 10 | \setbox0=\hbox{X\lower.5ex\hbox{\kern-.15em\reflect{E}}\kern-.0833em \LaTeX}% 11 | \dp0=0pt\ht0=0pt\box0 } 12 | 13 | \documentclass[11pt]{article} 14 | \title{Math Symbols in \XeLaTeX} 15 | \author{Ewan Klein} 16 | \date{\today} 17 | \usepackage{fontspec} 18 | \setromanfont{Palatino} 19 | 20 | \begin{document} 21 | \maketitle 22 | \section{Introduction} 23 | 24 | This file tests direct insertion of unicode characters using cut and paste 25 | from the Mac OS X font book application. As far as I can tell, some of 26 | the characters are only available in the Apple Symbols font, and some 27 | which you might have expected to be in Apple Symbols aren't, but can 28 | be found instead in Lucida Grande, for example. 29 | 30 | I still haven't found a way of producing angle brackets. 31 | 32 | 33 | \begin{itemize} 34 | \item $P$ \as{≐} $R$ \as{∧} $Q$ 35 | \item Z ≠ X \as{⊆} Y and X \as{↦} Y 36 | \item X \as{⊑} Y 37 | \item a ≥ b ≤ c < d > e 38 | \item p \ls{→} \as{⊥} 39 | \item 3 \as{╳} 0.2 = (0.3)\as{⋅}2 40 | \item 3 — \as{〈}4, 5 \as{〉} 41 | \item A \as{⊓} B = A if \as{≡} \ls{¬} 42 | \item \asb{∃}x\asb{∀}y.R(x) \as{→} R(y) 43 | \item $x$ \as{∈} $Y$ 44 | \end{itemize} 45 | 46 | 47 | 48 | \end{document} 49 | -------------------------------------------------------------------------------- /book/book.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Natural Language Processing with Python 5 | 6 | 1 7 | 9780596516499 8 | 9 | StevenBird 10 | EwanKlein 11 | EdwardLoper 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | NLTK Index 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /slides/demos/similar_words.py: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | ## 3 | ## What words occur in similar contexts?
4 | ## 5 | 6 | from nltk import * 7 | from collections import defaultdict 8 | 9 | ###################################################################### 10 | def build_context_map(): 11 | """Build a dictionary mapping words in the brown corpus to lists 12 | of local lexical contexts, where a context is encoded as a tuple 13 | (prevword, nextword).""" 14 | context_map = defaultdict(list) 15 | for document in corpus.brown.files(): 16 | words = corpus.brown.words(document) 17 | words = [word.lower() for word in words] 18 | for i in range(1, len(words)-1): 19 | prevword, word, nextword = words[i-1:i+2] 20 | context_map[word].append( (prevword, nextword) ) 21 | return context_map 22 | 23 | if 'context_map' not in globals(): 24 | context_map = build_context_map() 25 | 26 | ###################################################################### 27 | def dist_sim(context_map, word, num=6): 28 | """Display words that appear in similar contexts to the given 29 | word, based on the given context map.""" 30 | contexts = set(context_map.get(word, ())) 31 | fd = FreqDist(w for w in context_map 32 | for c in context_map[w] 33 | if c in contexts and w!=word) 34 | 35 | print 'Words similar to %r:' % word 36 | print ' '.join('%10s' % wd for wd in fd.keys()[:num]) 37 | print ' '.join('%10s' % fd[wd] for wd in fd.keys()[:num]) 38 | 39 | ###################################################################### 40 | 41 | dist_sim(context_map, 'man') 42 | dist_sim(context_map, 'woman') 43 | dist_sim(context_map, 'walk') 44 | dist_sim(context_map, 'in') 45 | -------------------------------------------------------------------------------- /book/ch03-extras.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. include:: ../definitions.rst 3 | .. include:: regexp-defns.rst 4 | 5 | .. standard global imports 6 | 7 | >>> import nltk, re, pprint 8 | 9 | =============================== 10 | 3. Processing Raw Text (Extras) 11 | =============================== 12 | 13 | 14 | as x as y: ``http://acl.ldc.upenn.edu/P/P07/P07-1008.pdf`` 15 | 16 | ------------------- 17 | Regular Expressions 18 | ------------------- 19 | 20 | http://www.regular-expressions.info/ is a useful online resource, 21 | providing a tutorial and references to tools and other sources of 22 | information. 23 | 24 | Unicode Regular Expressions: 25 | http://www.unicode.org/reports/tr18/ 26 | 27 | Regex Library: 28 | http://regexlib.com/ 29 | 30 | 31 | 32 | #. The above example of extracting (name, domain) pairs from 33 | text does not work when there is more than one email address 34 | on a line, because the ``+`` operator is "greedy" and consumes 35 | too much of the input. 36 | 37 | a) Experiment with input text containing more than one email address 38 | per line, such as that shown below. What happens? 39 | #) Using ``re.findall()``, write another regular expression 40 | to extract email addresses, replacing the period character 41 | with a range or negated range, such as ``[a-z]+`` or ``[^ >]+``. 42 | #) Now try to match email addresses by changing the regular 43 | expression ``.+`` to its "non-greedy" counterpart, ``.+?``. 44 | 45 | >>> s = """ 46 | ... austen-emma.txt:hart@vmd.cso.uiuc.edu (internet) hart@uiucvmd (bitnet) 47 | ... austen-emma.txt:Internet (72600.2026@compuserve.com); TEL: (212-254-5093) 48 | ... austen-persuasion.txt:Editing by Martin Ward (Martin.Ward@uk.ac.durham) 49 | ... blake-songs.txt:Prepared by David Price, email ccx074@coventry.ac.uk 50 | ...
""" 51 | -------------------------------------------------------------------------------- /LSA325/log_lc_and_functions.txt: -------------------------------------------------------------------------------- 1 | List Comprehensions & Functions 2 | =============================== 3 | 4 | Find the first letter of each word: 5 | 6 | >>> words = 'this is a short sentence'.split() 7 | >>> [word[0] for word in words] 8 | ['t', 'i', 'a', 's', 's'] 9 | 10 | Convert each word in a list to lower case: 11 | 12 | >>> words = 'This sentence has some Capitalized words'.split() 13 | >>> [word.lower() for word in words] 14 | ['this', 'sentence', 'has', 'some', 'capitalized', 'words'] 15 | 16 | Define a function that counts the number of vowels in a word: 17 | 18 | >>> def vowels(word): 19 | ... v = 0 20 | ... for char in word: 21 | ... if char in 'aeiouAEIOU': 22 | ... v = v + 1 23 | ... return v 24 | 25 | Use the new 'vowels' function to find the number of vowels in each 26 | word from a word list: 27 | 28 | >>> [vowels(word) for word in words] 29 | [1, 3, 1, 2, 5, 1] 30 | 31 | Define a function that finds the average of a list of numbers. 32 | 33 | >>> def avg(numbers): 34 | ... return sum(numbers) / float(len(numbers)) # [1] 35 | 36 | [1] Note that we used 'float()' to convert the denominator from an 37 | integer to a real number, since dividing by integers rounds to 38 | the nearest value: 39 | 40 | >>> print 10/3 41 | 3 42 | 43 | Find the average number of vowels in the world list. 44 | 45 | >>> print avg([vowels(word) for word in words]) 46 | 2.16666666667 47 | 48 | Find the average length of words that begin with a vowel: 49 | 50 | >>> from nltk.corpus import brown 51 | >>> words = brown.tokenized('a', group_by_sent=False) 52 | >>> # Select only those words that start with a vowel: 53 | >>> words = [w for w in words if w[0].lower() in 'aeiouy'] 54 | >>> # Find the length of each word: 55 | >>> lengths = [len(w) for w in words] 56 | >>> # Get the average: 57 | >>> print avg(lengths) 58 | 4.00069249257 59 | -------------------------------------------------------------------------------- /book/guidelines.txt: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. include:: ../definitions.rst 3 | 4 | 5 | ========================= 6 | NLTK Developer Guidelines 7 | ========================= 8 | 9 | ----------------- 10 | Design Philosophy 11 | ----------------- 12 | 13 | http://en.wikipedia.org/wiki/Worse_is_better 14 | 15 | 16 | ------------------ 17 | Naming Conventions 18 | ------------------ 19 | 20 | #. Packages: These are typically verbs, all lowercase letters. When 21 | whole packages are imported, NLP processing tasks have very readable 22 | names, e.g. ``tokenize.whitespace()``, ``tag.ngram()``. 23 | 24 | #. Modules: These are lowercase words; multi-word names are joined 25 | without punctuation, e.g. ``parse.featurestructure``. 26 | 27 | #. Classes: These are uppercase-initial words, e.g. ``Chart``. 28 | Multiple words are joined together, with an uppercase letter to 29 | begin each new word, e.g. ``PorterStemmer``. 30 | 31 | #. Functions and Variables: These are all lowercase, with underscore 32 | to separate multiple words 33 | 34 | #. Constants: These are all-caps 35 | 36 | ------------------ 37 | Python Conventions 38 | ------------------ 39 | 40 | New-style Python classes (all ultimately inherit from ``object``). 41 | 42 | Each module contains demo code, and which can be run from the command line. 
43 | This demo code should also be self-contained (i.e. contain its own 44 | import statements, so that someone can cut and paste it into a new 45 | file and run it without modification). 46 | 47 | Each package's __init__.py file should import all the package's 48 | modules, so that everything in a package can be accessed by importing 49 | that package. 50 | 51 | Indentation: tab = 4 spaces 52 | 53 | ---------- 54 | Commenting 55 | ---------- 56 | 57 | Detailed module-level docstring; epydoc docstrings; ... 58 | 59 | ----------------- 60 | Repository Access 61 | ----------------- 62 | 63 | NLTK developers (people with write access to the repository) are 64 | welcome to contribute and maintain their own code in 65 | ``nltk.contrib``, but should not touch any other files. 66 | This is because the core developers need to be responsible for 67 | ensuring that everything works. 68 | 69 | NLTK core developers can modify any files. 70 | 71 | 72 | 73 | .. include:: footer.txt 74 | -------------------------------------------------------------------------------- /book/DOCUTILS: -------------------------------------------------------------------------------- 1 | HIGH PRIORITY 2 | 3 | (without these it's painful to continue with docutils) 4 | 5 | book 6 | - inclusions of individual chapters 7 | - single pagination across all chapters 8 | - evidence in repository of earlier effort on this, but currently broken 9 | 10 | xrefs 11 | - assign a symbolic label to any section 12 | - refer to this label anywhere in the chapter (or book) 13 | - should be able to do this for figures etc 14 | 15 | MEDIUM PRIORITY 16 | 17 | (without these we are forced to do some major rehabilitation on the 18 | latex output, or ugly scripts to hack it) 19 | 20 | bibliography 21 | - storage of bib data (bibtex?) 22 | - inline citation of key expands to human-readable citation 23 | - generation of chapter or book-level bibliography section 24 | 25 | index 26 | - construct an index of all "dt" (defined terms) 27 | - permit other terms to be indexed (e.g. text role "idx") 28 | - index topical terms embedded in text but not displayed (e.g. text role "topic") 29 | - index refers to pages on which these appeared 30 | 31 | feature structures 32 | - map the existing ReST syntax into Manning's avm.sty syntax 33 | 34 | text substitutions 35 | - we need to allow these to be interpreted inside non-literal roles 36 | such as :math: and :dt: 37 | 38 | example numbering 39 | - tweak the ex environment so that it will allow include-ed program 40 | fragments to be numbered and indented in the same way as trees 41 | 42 | logical paragraphs 43 | - when a paragraph contains an example, the text following the 44 | example is a continuation of the paragraph, not a new paragraph 45 | (cf HTML output for notes that contain doctest examples, as in 46 | example in section on conditional expressions here: 47 | http://nltk.sourceforge.net/lite/doc/en/programming.html ) 48 | 49 | LOW PRIORITY 50 | 51 | (without these we have to do a bit more work in the rst 52 | source, or do minor last-minute hacking before submitting CRC) 53 | 54 | table-of-contents 55 | - collate section and subsection headings 56 | - tabulate, with page numbers 57 | - some control over depth of subsections to include 58 | 59 | aligned glossed examples 60 | - design syntax which will map into an appropriate LaTeX macro 61 | (which one? Covington?)
62 | (and ideally into a specialized class of HTML table) 63 | 64 | low-level formatting issues 65 | - doctest-ignore directive causes following code block to be unindented 66 | 67 | -------------------------------------------------------------------------------- /book/regexp-defns.rst: -------------------------------------------------------------------------------- 1 | .. ifndef:: regexp_defns 2 | 3 | .. def:: regexp_defns 4 | 5 | .. |s.ng| replace:: |l|\ ``s.ng``\ |r| 6 | .. |.| replace:: |l|\ ``.``\ |r| 7 | .. |....zy| replace:: |l|\ ``....zy``\ |r| 8 | .. |....berry| replace:: |l|\ ``....berry``\ |r| 9 | .. |t...| replace:: |l|\ ``t...``\ |r| 10 | .. |colou?r| replace:: |l|\ ``colou?r``\ |r| 11 | .. |e-?mail| replace:: |l|\ ``e-?mail``\ |r| 12 | .. |patt| replace:: |l|\ ``patt``\ |r| 13 | .. |coo+l| replace:: |l|\ ``coo+l``\ |r| 14 | .. |f.+f| replace:: |l|\ ``f.+f``\ |r| 15 | .. |.+ed| replace:: |l|\ ``.+ed``\ |r| 16 | .. |.*gnt.*| replace:: |l|\ ``.*gnt.*``\ |r| 17 | .. |[aeiou]| replace:: |l|\ ``[aeiou]``\ |r| 18 | .. |[uoiea]| replace:: |l|\ ``[uoiea]``\ |r| 19 | .. |[^aeiou]| replace:: |l| ``[^aeiou]`` |r| 20 | .. |p[aeiou]t| replace:: |l|\ ``p[aeiou]t``\ |r| 21 | .. |p[aeiou]+t| replace:: |l|\ ``p[aeiou]+t``\ |r| 22 | .. |NN.*| replace:: |l|\ ``NN.*``\ |r| 23 | .. |.*| replace:: |l|\ ``.*``\ |r| 24 | .. |123|456| replace:: |l|\ ``123|456``\ |r| 25 | .. |12(3|4)56| replace:: |l|\ ``12(3|4)56``\ |r| 26 | .. |[a-z]| replace:: |l|\ ``[a-z]``\ |r| 27 | .. |[^a-z]| replace:: |l|\ ``[^a-z]``\ |r| 28 | .. |[a-zA-Z]| replace:: |l|\ ``[a-zA-Z]``\ |r| 29 | .. |t[a-z][a-z][a-z]| replace:: |l|\ ``t[a-z][a-z][a-z]``\ |r| 30 | .. |[A-Z][a-z]*| replace:: |l|\ ``[A-Z][a-z]*``\ |r| 31 | .. |20[0-4][0-9]| replace:: |l|\ ``20[0-4][0-9]``\ |r| 32 | .. |[b-df-hj-np-tv-z]+| replace:: |l|\ ``[b-df-hj-np-tv-z]+``\ |r| 33 | .. |.*| replace:: |l|\ ``.*``\ |r| 34 | .. |^[A-Za-z]+| replace:: |l|\ ``^[A-Za-z]+``\ |r| 35 | .. |^[^ ]+| replace:: |l|\ ``^[^ ]+``\ |r| 36 | .. |[a-z]*s$| replace:: |l|\ ``[a-z]*s$``\ |r| 37 | .. |^$| replace:: |l|\ ``^$``\ |r| 38 | .. |*| replace:: |l|\ ``*``\ |r| 39 | .. |*?| replace:: |l|\ ``*?``\ |r| 40 | .. |<.*>| replace:: |l|\ ``<.*>``\ |r| 41 | .. |NN.*|JJ.*|DT| replace:: |l|\ ``NN.*|JJ.*|DT``\ |r| 42 | .. |dwelling|domicile|abode| replace:: |l|\ ``dwelling|domicile|abode``\ |r| 43 | 44 | -------------------------------------------------------------------------------- /book/reprint1-3.txt: -------------------------------------------------------------------------------- 1 | p177 ex 33 -- move to chapter 5 (new exercise 43). Change reference "described in chapter 5" 2 | to "described in this chapter" 3 | 4 | p306 17up "The advantages of shift-reduce" s/b "The advantage of shift-reduce" 5 | 6 | p309 9up "through entire list" s/b "through the entire list" 7 | 8 | p309 13-14up "Det at wfst[0][1] and N at wfst[1][2], we can add NP to wfst[0][2]" s/b 9 | "Det at wfst[2][3] and N at wfst[3][4], we can add NP to wfst[2][4]" 10 | 11 | p334 10d Delete this whole line, viz "NP[NUM=?n] -> N[NUM=?n]", and close up space. 12 | 13 | p336 Fig 9-1 is too big in the latest pdf. Also, the feature labels shouldn't be bold. 14 | 15 | p340 ex 24 -- s/b smaller for consistency with the other DAGs (cf p339) 16 | 17 | p342 DAG (27a) is incorrect. It should look just like (27c) but *without* the middle arc 18 | labeled 'CITY'. (The online version of this chapter is correct, and uses dag04-1.png 19 | for this subfigure.) 
20 | 21 | p363 21d -- node['sem'] s/b node['SEM'] 22 | NB This is http://www.oreillynet.com/cs/nl/edit/errata/40392 23 | 24 | p389 17up "nltk.Variable('z')" s/b "nltk.sem.Variable('z')" 25 | 26 | p373 19d "such as or ." s/b 27 | "such as or >." 28 | NB This is http://www.oreillynet.com/cs/nl/edit/errata/39295 29 | 30 | p392 6d "nltk.ApplicationExpression(tvp, np)" s/b 31 | "nltk.sem.ApplicationExpression(tvp, np)" 32 | 33 | p396 20up "trees[0].node['sem']" s/b "trees[0].node['SEM']" 34 | 35 | p399 4d Det[NUM=sg,SEM=<\P Q.([x],[]) + P(x) + Q(x)>] -> 'a' 36 | s/b 37 | Det[NUM=sg,SEM=<\P Q.(([x],[]) + P(x) + Q(x))>] -> 'a' 38 | 39 | 40 | p400 20d "trees[0].node['sem'].simplify()" s/b 41 | "trees[0].node['SEM'].simplify()" 42 | 43 | p405-406 exs (5)-(7). Please replace all seven occurrences of 44 | "nltk.ApplicationExpression" with 45 | "nltk.sem.ApplicationExpression". 46 | 47 | p426 The error report at http://www.oreillynet.com/cs/nl/edit/errata/39424 48 | looks correct to me (EK). 49 | 50 | p429 11-12d sentence beginning with "Ignoring...", please replace with 51 | the following (and set "OTH" in cw): 52 | 53 | Ignoring the entries for exchanges between people 54 | other than the top 5 (labeled OTH), the largest value suggests 55 | that Portia and Bassanio have the most significant interactions. 56 | 57 | p444 7d can never been known s/b can never be known 58 | -------------------------------------------------------------------------------- /LSA325/assignment4.txt: -------------------------------------------------------------------------------- 1 | Grammar development 2 | 3 | For this assignment, you will be editing a file named 4 | 'assignment4.py', which you can download from: 5 | 6 | http://nltk.org/temp/assignment4.py 7 | 8 | Please rename this file to 9 | 'assignment4-.py' before submitting it. 10 | 11 | 12 | 1. Chose a linguistic phenomenon of interest that you would like to 13 | model using a grammar. Here are some ideas, or use your own: 14 | 15 | - Noun modifiers 16 | ("slow cat", "very slow cat") 17 | - Comparative expressions 18 | ("bigger than a breadbox", "less heavy than ...") 19 | - Sentential complements 20 | ("I think that you know that ...") 21 | - Quantifiers 22 | ("For every boy, some girl ...") 23 | 24 | 2. Choose 4-5 example sentences, and add them to the 'sent_corpus' 25 | variable. This variable contains a list of sentences, one per 26 | line. 27 | 28 | 3. Add grammar and lexical rules to the 'grammar' variable to cover 29 | your example sentences. 30 | 31 | 4. Run the program, and check the parse trees you get. Were there any 32 | extra parse trees you weren't expecting? Were there any sentences 33 | that failed to parse? 34 | 35 | 5. Refine your grammar until it covers your example sentences. If 36 | possible, your grammar should not produce extra unintended parse 37 | trees. (But for some linguistic phenomena, this might not be 38 | possible!) 39 | 40 | 6. Once you're happy with the output parse trees, copy them to the 41 | 'tree_corpus' variable. Do NOT copy the sentence strings -- just 42 | the tree expressions. (If your grammar generates extra unintended 43 | parse trees, don't include them.) You can then delete these 44 | sentences from 'sent_corpus'. Run the program again, and it will 45 | automatically check to make sure that the intended parse trees are 46 | getting generated. This way you won't have to keep checking them 47 | by hand if you choose to do the optional step (7). 48 | 49 | 7. Optional: return to step 2 (as many times as you like). 50 | 51 | 8. 
Use the 'comments' variable to write a short comment about grammar 52 | development -- was it easier or harder than you thought? How hard 53 | do you think it would be to merge the grammar that you developed 54 | with some of your classmates' grammars, which were designed to 55 | handle other linguistic phenomena? What problems might come up 56 | when merging grammars? 57 | -------------------------------------------------------------------------------- /book/intro-outline.txt: -------------------------------------------------------------------------------- 1 | 2 | NB. M-x outline-mode 3 | NB. [N] = new stuff, [?] = needs discussion, maybe omit or merge 4 | NB. Section titles may have been changed, or inserted to make 5 | structure clearer. 6 | 7 | ------------------------------------------------------ 8 | 9 | * Some things you can do with NLP 10 | 11 | ** Examples 12 | NLP for linguistic research 13 | NLP for applications 14 | 15 | ** What do we mean by NLP? 16 | terminology: CL/HLT etc 17 | 18 | ** Audience and Goals [N] 19 | 20 | 21 | * The Language Challenge 22 | 23 | ** NL is rich and complex and difficult 24 | 25 | *** Language is integral to culture 26 | 27 | *** Language is intertwined in modern technology [?] 28 | [not quite clear what the message is here] 29 | 30 | ** How NLP can make a difference 31 | 32 | *** Information overload for ordinary people 33 | 34 | Text and multimedia on WWW 35 | 36 | QA example 37 | 38 | *** Information overload for professionals [N] 39 | biomedical example 40 | 41 | *** But NLP is still limited in what it can do 42 | 43 | * Overview of NLP 44 | 45 | ** One ideal: Intelligence and Turing Test 46 | example of spoken dialogue 47 | 48 | ** More realistic: not so intelligent NLP 49 | 50 | Other examples of NLP applications 51 | 52 | ** Brief history of NLP 53 | [this seems a bit tricky -- what needs to be included here? so far, 54 | it seems a bit arbitrary] 55 | 56 | *** Formal language theory 57 | --> computational syntax 58 | 59 | *** Formal logic 60 | --> automated inference 61 | 62 | *** Formal semantics and compositionality [?] 63 | [not really at the same level; maybe merge in with preceding chunk] 64 | 65 | *** Domain dependence 66 | e.g. semantic grammars 67 | 68 | *** Machine learning [N] 69 | 70 | 71 | ** Philosophical Perspective 72 | [maybe push this up a level?] 73 | 74 | *** Rationalism vs Empiricism 75 | schools of linguistics 76 | 77 | *** Realism vs Idealism [?] 78 | God's truth vs.
hocus pocus 79 | [maybe drop this?] 80 | 81 | * Architecture of Linguistic and NLP Systems 82 | 83 | ** Modularity 84 | Generative grammars [link back to formal language theory] 85 | 86 | ** Competence and Performance 87 | 88 | *** NLP aspects 89 | simple parsing example 90 | 91 | *** Cognitive aspects [N] 92 | human sentence processing -- say that it's out of scope 93 | 94 | *** Spoken Dialogue system 95 | 96 | language resources / static 97 | processing tools / dynamic 98 | 99 | * Outline of book [N] 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /howto/Makefile: -------------------------------------------------------------------------------- 1 | # NLTK: Doctest Makefile 2 | # 3 | # Copyright (C) 2001-2012 NLTK Project 4 | # Author: Edward Loper 5 | # Steven Bird 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | 9 | # File locations: 10 | DOCTEST_SRC = ../../nltk/nltk/test 11 | DOCTESTS = $(wildcard $(DOCTEST_SRC)/*.doctest) 12 | PYSRC = $(shell find ../../nltk/nltk -name '*.py') 13 | HTML = $(DOCTESTS:$(DOCTEST_SRC)/%.doctest=%.html) 14 | ERRS = $(DOCTESTS:$(DOCTEST_SRC)/%.doctest=%.errs) 15 | COVERAGE = $(DOCTESTS:$(DOCTEST_SRC)/%.doctest=%.coverage) 16 | PYTHONPATH = ../../ 17 | export PYTHONPATH 18 | 19 | # Converting rst->html: 20 | RST = ../rst.py 21 | RST2HTML = $(RST) --html --css ../nltkdoc.css 22 | 23 | # Testing: 24 | PYTHON = python 25 | DOCTEST = $(PYTHON) ../../nltk/nltk/test/doctest_driver.py 26 | DOCTEST_FLAGS = --udiff 27 | 28 | PUBLISH = ../../nltk.github.com 29 | 30 | .PHONY: all html coverage 31 | 32 | #all: html errs coverage 33 | html: $(HTML) #index.html 34 | errs: $(ERRS) 35 | @echo Failed doctests: 36 | @grep 'FAILED (failures' *.errs |sed 's/\(.*\)\.errs:.*/ - \1/' 37 | coverage: coverage/index.html 38 | 39 | coverage/index.html: coverage-list.txt coverage.txt 40 | $(RST2HTML) coverage.txt -o $@ 41 | 42 | coverage-list.txt: $(COVERAGE) ./show_coverage.py 43 | python ./show_coverage.py $(COVERAGE) 44 | 45 | %.coverage: %.errs 46 | @true 47 | %.errs: $(DOCTEST_SRC)/%.doctest $(PYSRC) 48 | $(DOCTEST) $(DOCTEST_FLAGS) $(DOCTEST_SRC)/$*.doctest \ 49 | --coverage=$*.coverage > $*.errs 2>&1 50 | 51 | %.html: $(DOCTEST_SRC)/%.doctest 52 | $(RST2HTML) $< -o $@ 53 | 54 | $(DOCTEST_SRC)/%.doctest: $(DOCTEST_SRC)/%.doctest_latin1 55 | iconv -f iso8859-1 -t utf8 $< > $@ 56 | 57 | index.html: index.txt #test-list.txt 58 | $(RST2HTML) index.txt 59 | # cat index.txt |sed s/test-list.txt/test-list-sort-title.txt/ \ 60 | # >sort-title.txt 61 | # cat index.txt |sed s/test-list.txt/test-list-sort-lines.txt/ \ 62 | # >sort-lines.txt 63 | # cat index.txt |sed s/test-list.txt/test-list-sort-tests.txt/ \ 64 | # >sort-tests.txt 65 | # cat index.txt |sed s/test-list.txt/test-list-sort-outcome.txt/ \ 66 | # >sort-outcome.txt 67 | # $(RST2HTML) sort-title.txt 68 | # $(RST2HTML) sort-lines.txt 69 | # $(RST2HTML) sort-tests.txt 70 | # $(RST2HTML) sort-outcome.txt 71 | 72 | test-list.txt: update_list.py $(ERRS) 73 | python update_list.py 74 | 75 | clean: 76 | rm -f `find . -name '*.html'` 77 | rm -f `find . -name '*.errs'` 78 | 79 | clean_up: 80 | true # nothing to do.
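# The publish target below copies the freshly built HOWTO pages into the
# nltk.github.com checkout (the PUBLISH directory defined above), from
# where they can be pushed to the website.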
81 | 82 | publish: 83 | #cp *.html *.errs *.coverage $(PUBLISH)/howto/ 84 | cp *.html $(PUBLISH)/howto/ 85 | -------------------------------------------------------------------------------- /LSA325/LSA325_3_handout.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper]{article} 2 | 3 | \begin{document} 4 | 5 | \begin{center} 6 | {\Large LSA 325, Class 3, Thu 12th July} 7 | \end{center} 8 | \section*{Topics} 9 | 10 | \begin{itemize} 11 | \item Partial Parsing and Interpretation 12 | \item Chunking 13 | \begin{itemize} 14 | \item What is a chunk? 15 | \item \texttt{chunk.Regexp} 16 | \item Data-driven approaches 17 | \item Chunking as Tagging 18 | \end{itemize} 19 | \end{itemize} 20 | 21 | \subsection*{Class Materials} 22 | \begin{itemize} 23 | \item The second installment of the NLTK book will be available in the 24 | Stanford Bookstore soon, either tomorrow or Monday 16 July. 25 | \item We will also distribute two further chapters in class next Monday: 26 | \begin{enumerate} 27 | \item \textit{Linguistic Data Management} 28 | \item Updated version of the \textit{Chunking} chapter. We hope it 29 | will be slightly easier for you to read, but there is no significant 30 | difference in the code, so it is mainly relevant for the 31 | \textit{NLTK Book Review Assignment}. 32 | \end{enumerate} 33 | 34 | \end{itemize} 35 | 36 | \section*{Practical Stuff} 37 | 38 | \noindent 39 | Data Exploration: 40 | \begin{verbatim} 41 | >>> for tree in corpus.conll2000.read('train', chunk_types=('NP',))[:5]: 42 | ... print tree 43 | \end{verbatim} 44 | 45 | \noindent 46 | Write some rules (in a file): 47 | \begin{verbatim} 48 | grammar = r""" 49 | NP: {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns 50 | {<NNP>+} # chunk sequences of proper nouns 51 | """ 52 | cp = chunk.Regexp(grammar) 53 | \end{verbatim} 54 | 55 | \noindent 56 | Examine results: 57 | \begin{verbatim} 58 | for tree in corpus.conll2000.read('train', chunk_types=('NP',))[:2]: 59 | print cp.parse(tree.flatten(), trace=1) 60 | \end{verbatim} 61 | 62 | \noindent 63 | Evaluate your grammar: 64 | \begin{verbatim} 65 | print chunk.accuracy(cp, corpus.conll2000.chunked('test', chunk_types=('NP',))) 66 | \end{verbatim} 67 | 68 | \noindent 69 | Iterate until accuracy = 100\%. 70 | 71 | \subsection*{Chunking Assignment} 72 | 73 | \begin{enumerate} 74 | \item Explore what kind of sequences are annotated as VP in the 75 | CONLL2000 \texttt{train} data. 76 | \item Develop a \texttt{chunk.Regexp} grammar to capture the regularities. 77 | \item Use trace to examine the success of your rules. 78 | \item Once you are reasonably happy, try evaluating your rules against 79 | the CONLL2000 \texttt{test} data. 80 | \item Briefly comment on how easy or difficult it was to develop an 81 | adequate rule set. 82 | 83 | \end{enumerate} 84 | 85 | 86 | 87 | 88 | \end{document} 89 | 90 | %%% Local Variables: 91 | %%% mode: latex 92 | %%% TeX-master: t 93 | %%% End: 94 | -------------------------------------------------------------------------------- /book/introduction-code.txt: -------------------------------------------------------------------------------- 1 | 2 | >>> from nltk.corpora import cmudict 3 | >>> from string import join 4 | >>> for word, num, pron in cmudict.raw(): 5 | ... stress_pattern = join(c for c in join(pron) if c in "012") 6 | ... if stress_pattern.endswith("1 0 0 0 0"): 7 | ...
print word, "/", join(pron) 8 | 9 | 10 | 11 | 12 | >>> from nltk.corpora import shoebox 13 | >>> from nltk.utilities import MinimalSet 14 | >>> length, position, min = 4, 1, 3 15 | >>> lexemes = [field[1].lower() for entry in shoebox.raw('rotokas.dic') 16 | ... for field in entry if field[0] == 'lx'] 17 | >>> ms = MinimalSet() 18 | >>> for lex in lexemes: 19 | ... if len(lex) == length: 20 | ... context = lex[:position] + '_' + lex[position+1:] 21 | ... target = lex[position] 22 | ... ms.add(context, target, lex) 23 | >>> for context in ms.contexts(3): 24 | ... for target in ms.targets(): 25 | ... print "%-4s" % ms.display(context, target, "-"), 26 | ... print 27 | 28 | 29 | 30 | >>> from nltk.corpora import genesis 31 | >>> from nltk.probability import ConditionalFreqDist 32 | >>> from nltk.utilities import print_string 33 | >>> cfdist = ConditionalFreqDist() 34 | >>> prev = None 35 | >>> for word in genesis.raw(): 36 | ... word = word.lower() 37 | ... cfdist[prev].inc(word) 38 | ... prev = word 39 | >>> words = [] 40 | >>> prev = 'lo,' 41 | >>> for i in range(99): 42 | ... words.append(prev) 43 | ... for word in cfdist[prev].sorted(): 44 | ... if word not in words: 45 | ... break 46 | ... prev = word 47 | >>> print_string(join(words)) 48 | 49 | 50 | 51 | 52 | 53 | >>> from nltk.corpora import treebank 54 | >>> from string import join 55 | >>> def vp_conj(tree): 56 | ... if tree.node == 'VP' and len(tree) == 3 and tree[1].leaves() == ['but']: 57 | ... return True 58 | ... else: 59 | ... return False 60 | >>> for tree in treebank.parsed_sents(): 61 | ... for vp1,conj,vp2 in tree.subtrees(vp_conj): 62 | ... print join(child.node for child in vp1), "*BUT*", join(child.node for child in vp2) 63 | 64 | 65 | 66 | from nltk.corpora import treebank 67 | from string import join 68 | def vp_conj(tree): 69 | if tree.node == 'VP' and len(tree) == 3 and tree[1].leaves() == ['but']: 70 | return True 71 | else: 72 | return False 73 | 74 | def pr(subtree): 75 | return "(%s %s)" % (subtree.node, join(subtree.leaves())) 76 | 77 | for tree in treebank.parsed_sents(): 78 | for vp1,conj,vp2 in tree.subtrees(vp_conj): 79 | print join(pr(child) for child in vp1), "*BUT*", join(pr(child) for child in vp2) 80 | -------------------------------------------------------------------------------- /book/SCHEDULE: -------------------------------------------------------------------------------- 1 | $Id$ 2 | ---------------- 3 | WRITING SCHEDULE 4 | ---------------- 5 | 6 | 7 | 0. Preface 8 | 9 | 0. Python and NLTK 10 | 11 | 1. Introduction 12 | 13 | ------------------------------------------ 14 | 15 | PART I: Basics 16 | 17 | Part Intro 18 | 19 | 2. Programming 20 | + more exercises 21 | + checking for coverage 22 | + summary 23 | 24 | 3. Words 25 | + lexical resources 26 | + sentence tokenization? 27 | + morphological analysis 28 | + Multiword expressions 29 | + summary 30 | 31 | 4. Tagging 32 | + non-Latin tagging example 33 | + n-gram language modeling, smoothing 34 | + move Brill stuff elsewhere 35 | + summary 36 | 37 | 5. Chunk Parsing 38 | + [P] rule format 39 | + summary 40 | 41 | ------------------------------------------ 42 | 43 | PART II: Parsing 44 | 45 | Part Intro 46 | 47 | 6. Structured Programming 48 | + XML 49 | + collocations? 50 | + simple extractive summarization? 51 | 52 | 7. Grammars and Parsing 53 | + complete discussion of problems with parsing algorithms 54 | + material on dependencies, dependency grammar (+simple parser?) 55 | + discussion of generation 56 | 57 | 8. 
Advanced Parsing 58 | + Categorial grammar? 59 | 60 | 9. Feature Based Grammar 61 | + Describe feature structure module (done; but what about featurelite?) 62 | 63 | ------------------------------------------ 64 | 65 | PART III: Advanced Topics 66 | 67 | Part Intro 68 | 69 | 10. Advanced Programming 70 | + Unicode, character encoding, XML, web (urlopen), crawling? 71 | 72 | 11. Semantic Interpretation 73 | + feature-based semantics (requires update of parser) 74 | + theta roles, propbank 75 | + Cooper storage (requires list-valued features) 76 | 77 | 12. Language Engineering / Data-intensive NLP 78 | + language id problem? 79 | + language modelling (already some major components here, esp for estimation) 80 | + HMMs 81 | + other machine learning techniques (e.g., Transformation-based learning) 82 | + Naive Bayes classification, clustering 83 | [NER, text classification (& question classification), ontology extraction] 84 | + NLP on the Web 85 | [stuff on RDF?] 86 | 87 | 13. Managing linguistic data 88 | + corpus construction 89 | + OLAC, annotation 90 | 91 | 14. Lexicon and Morphology 92 | + representing lexical information, redundancy 93 | + lexical resources 94 | + comlex 95 | + framenet 96 | + lexical semantics, use of ontologies 97 | + morphology/lexicon interaction 98 | + grammar/lexicon interaction (Levin classes) 99 | + lexical rules, hierarchical lexicon 100 | + multiword expressions, collocations, idioms 101 | --> AT&T WFST toolkit; Python bindings? 102 | 103 | 15. Conclusion 104 | brief pointers on 'hot topics': MT, Spoken Dialogue, QA 105 | ------------------------------------------- 106 | 107 | APPENDIXES: 108 | 109 | * Regular Expressions 110 | * Cheat Sheet 111 | -------------------------------------------------------------------------------- /book/CheckList.txt: -------------------------------------------------------------------------------- 1 | CHAPTER TASK 2 | ----------------------------------------------------------------------------- 3 | 4 | 012345...9ABC Hellmann Review 5 | 0123456..9ABC Indurkya Review 6 | 0123456789ABC Munn Review 7 | 012345...9ABC Rhodes Review 8 | 012345.789ABC Schlansker Review 9 | 0123456789ABC Sproat Review 10 | 11 | 012345..89ABC Extended captions so figures and pylistings are self-contained 12 | 012345...9A.C Summary finalized 13 | 012345..8.A.C Further readings finalized 14 | 01..........C Exercises checked for coverage of chapter, sequence 15 | 012345...9ABC Book issues in issue tracker dealt with 16 | 012345...9A.C Doctests checked 17 | 012345......C Roughly uniform use of note blocks and |TRY| 18 | 012345....A.. Ensure no XXX annotations are commenting out a code block 19 | 012345...9A.. No writing left to do 20 | 21 | ----------------------------------------------------------------------------- 22 | 23 | 012345...9... Doctest callouts used for referring to locations in code 24 | 012345....... Overflowing lines fixed 25 | 012345.789ABC All numbered displays (figure, pylisting, table) referenced from text 26 | 01........... Typographic changes implemented throughout, e.g. ->, :gc: 27 | 0123456789ABC Consistent URL formatting 28 | 01........... Image scaling ok 29 | 012345.....B. US spell check done 30 | 0............ 
Check for any more index terms 31 | 32 | ----------------------------------------------------------------------------- 33 | 34 | General issues: 35 | * NLTK index and stoplist 36 | * Comments back to O'Reilly in docs/notes.txt 37 | * Code examples would be easier to read if the user input and the 38 | system output were in different fonts -- e.g., bold for user input. 39 | * We're inconsistent about whether to include a blank trailing prompt 40 | (">>>") in our code examples. 41 | * Ensure the URLs are in the following format: 42 | ``http://www.nltk.org`` including those inserted via rst replace such as |NLTK-URL| 43 | * n-gram vs *n*-gram markup 44 | 45 | Outstanding issues: 46 | 47 | ch06 has only one |TRY| exercise 48 | ch07 has a conclusion (non-standard) but no summary 49 | ch07 needs some non-chunking exercises 50 | ch07 could describe SRL in 7.1 as another shallow processing task 51 | ch07 should describe NLTK's off-the-shelf NE tagger 52 | ch07 typography should follow the simplified style of later chapters, e.g. with NP 53 | ch07 only has two |TRY| exercises 54 | ch08 language is more formal than necessary, less accessible than it should be 55 | ch08 typography should no longer use :gc: 56 | ch08 section 8.6 on grammar development is incomplete (incl PE08 discussion) 57 | ch08 assumes knowledge of "head" (did some content disappear?) [it got moved to ch09] 58 | ch09 uses :lex: role, not processed by docbook [`appear`:lex: also in ch03] 59 | ch09 could mention use of trees as source of features for ML 60 | ch09 includes contents of grammar files that have changed in data distribution 61 | ch09 has no |TRY| exercises 62 | ch11 has no |TRY| exercises 63 | 64 | 65 | ch07 -- reorder, put the stuff that we can't actually do (IE, etc) at the end 66 | - move the n-gram and ne classifier earlier, to establish the connection 67 | to chapters 5 and 6. 68 | -------------------------------------------------------------------------------- /howto/show_coverage.py: -------------------------------------------------------------------------------- 1 | 2 | import sys, os, re 3 | import nltk.test.coverage as coverage 4 | import color_coverage 5 | 6 | OUT_DIR = 'coverage' 7 | MODULE_RE = re.compile(r'nltk.*') 8 | 9 | HEAD = (".. ==========================================================\n" 10 | ".. AUTO-GENERATED LISTING -- DO NOT EDIT!:\n\n" 11 | ".. role:: red\n" 12 | " :class: red\n\n" 13 | ".. role:: yellow\n" 14 | " :class: yellow\n\n" 15 | ".. role:: green\n" 16 | " :class: green\n\n" 17 | ".. container:: doctest-list\n\n" 18 | " .. list-table::\n" 19 | " :class: doctest-list \n" 20 | " :widths: 80 20\n" 21 | " :header-rows: 1\n\n" 22 | " * - Module\n - Coverage\n") 23 | FOOT = (".. END AUTO-GENERATED LISTING\n" 24 | ".. ==========================================================\n") 25 | 26 | def report_coverage(module): 27 | sys.stdout.write(' %-40s ' % module.__name__) 28 | sys.stdout.flush() 29 | (fname, stmts, excluded, missing, fmt_missing, def_info) = ( 30 | coverage.analysis3(module)) 31 | out = open(os.path.join(OUT_DIR, module.__name__+'.html'), 'wb') 32 | color_coverage.colorize_file(fname, module.__name__, out, 33 | fmt_missing, def_info) 34 | out.close() 35 | if not missing: c = 100 36 | elif stmts: c = 100.*(len(stmts)-len(missing)) / len(stmts) 37 | else: c = 100 38 | sys.stdout.write('%3d%%\n' % c) 39 | return c 40 | 41 | def init_out_dir(): 42 | # Create the dir if it doesn't exist.
43 | if not os.path.exists(OUT_DIR): 44 | os.mkdir(OUT_DIR) 45 | 46 | # Make sure it's actually a dir. 47 | if not os.path.isdir(OUT_DIR): 48 | raise ValueError('%s is in the way' % OUT_DIR) 49 | 50 | # Clear its contents. 51 | for filename in os.listdir(OUT_DIR): 52 | os.remove(os.path.join(OUT_DIR, filename)) 53 | 54 | def main(filenames): 55 | # Collect the coverage data from the given files. 56 | for filename in filenames: 57 | cexecuted = coverage.the_coverage.restore_file(filename) 58 | coverage.the_coverage.merge_data(cexecuted) 59 | 60 | try: init_out_dir() 61 | except Exception, e: 62 | print 'Unable to create output directory %r: %s' % (OUT_DIR, e) 63 | return 64 | 65 | out = open('coverage-list.txt', 'wb') 66 | out.write(HEAD) 67 | 68 | # Construct a coverage file for each NLTK module. 69 | print '\nGenerating coverage summary files...\n' 70 | print ' %-40s %s' % ('Module', 'Coverage') 71 | print ' '+'-'*50 72 | for module_name, module in sorted(sys.modules.items()): 73 | if module is None: continue 74 | if MODULE_RE.match(module_name): 75 | cover = report_coverage(module) 76 | if cover == 100: color = 'green' 77 | elif cover > 50: color = 'yellow' 78 | else: color = 'red' 79 | out.write(' * - `%s <%s.html>`__\n' 80 | ' - `%d%%`:%s:\n' % 81 | (module_name, module_name, cover, color)) 82 | out.flush() 83 | 84 | out.write(FOOT) 85 | out.close() 86 | 87 | if __name__ == '__main__': 88 | main(sys.argv[1:]) 89 | -------------------------------------------------------------------------------- /book/image_scaling.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. include:: ../definitions.rst 3 | 4 | ============= 5 | Image Scaling 6 | ============= 7 | 8 | Here's a collection of images from the book and a summary of the scaling result for each. 9 | 10 | 11 | ============ ========== ========== ========== ========== ======= 12 | Figure LaTeX/PDF HTML Docbook Origin Scale 13 | ============ ========== ========== ========== ========== ======= 14 | authors_ too big ok too small photo 150 15 | ------------ ---------- ---------- ---------- ---------- ------- 16 | inaugural_ ok too big ok pylab 90 17 | ------------ ---------- ---------- ---------- ---------- ------- 18 | maps02_ ok too big ok graffle 22 19 | ------------ ---------- ---------- ---------- ---------- ------- 20 | dialogue_ ok too big ok graffle 32 21 | ------------ ---------- ---------- ---------- ---------- ------- 22 | structure_ ok too big ok graffle 150 23 | ------------ ---------- ---------- ---------- ---------- ------- 24 | tally_ too small too big ok graffle 30 25 | ------------ ---------- ---------- ---------- ---------- ------- 26 | lexicon_ too small too big ok graffle 50 27 | ------------ ---------- ---------- ---------- ---------- ------- 28 | pipeline_ too small too big ok graffle 40 29 | ------------ ---------- ---------- ---------- ---------- ------- 30 | triangle_ too small too big too big graffle 50 31 | ------------ ---------- ---------- ---------- ---------- ------- 32 | polish_ ok ok too big screenshot default 33 | ============ ========== ========== ========== ========== ======= 34 | 35 | 36 | .. _authors: 37 | .. figure:: ../images/authors.png 38 | :scale: 150 39 | 40 | Edward Loper, Ewan Klein, and Steven Bird, Stanford, July 2007 41 | 42 | .. _inaugural: 43 | .. figure:: ../images/inaugural.png 44 | :scale: 90 45 | 46 | Lexical Dispersion Plot for Words in US Presidential Inaugural Addresses 47 | 48 | .. _maps02: 49 | .. 
figure:: ../images/maps02.png 50 | :scale: 22 51 | 52 | Dictionary Look-up 53 | 54 | .. _dialogue: 55 | .. figure:: ../images/dialogue.png 56 | :scale: 32 57 | 58 | Simple Pipeline Architecture for a Spoken Dialogue System 59 | 60 | .. _structure: 61 | .. figure:: ../images/text-corpus-structure.png 62 | :scale: 150 63 | 64 | Common Structures for Text Corpora (one point per text) 65 | 66 | .. _tally: 67 | .. figure:: ../images/tally.png 68 | :scale: 30 69 | 70 | Counting Words Appearing in a Text (a frequency distribution) 71 | 72 | .. _lexicon: 73 | .. figure:: ../images/lexicon.png 74 | :scale: 50 75 | 76 | Lexicon Terminology 77 | 78 | .. _pipeline: 79 | .. figure:: ../images/pipeline1.png 80 | :scale: 40 81 | 82 | The Processing Pipeline 83 | 84 | .. _triangle: 85 | .. figure:: ../images/naive-bayes-triangle.png 86 | :scale: 50 87 | 88 | An abstract illustration of the procedure used by the Naive Bayes 89 | classifier to choose the topic for a document. 90 | 91 | .. _polish: 92 | .. figure:: ../images/polish-utf8.png 93 | 94 | Screenshot 95 | -------------------------------------------------------------------------------- /book/ch01-extras.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. include:: ../definitions.rst 3 | 4 | .. standard global imports 5 | 6 | >>> import nltk, re, pprint 7 | 8 | ========================================== 9 | 1. Language Processing and Python (Extras) 10 | ========================================== 11 | 12 | 13 | -------------------------------------------------------- 14 | Websites with Information on Natural Language Processing 15 | -------------------------------------------------------- 16 | 17 | Several websites have useful information about |NLP|, including 18 | conferences, resources, and special-interest groups, e.g. 19 | ``www.lt-world.org``, ``www.aclweb.org``, ``www.elsnet.org``. 20 | 21 | The website of the *Association for Computational Linguistics*, 22 | at ``www.aclweb.org``, contains an overview of computational linguistics, 23 | including copies of introductory chapters from recent textbooks. 24 | Wikipedia has entries for |NLP| and its subfields 25 | (but don't confuse natural language processing with 26 | the other |NLP|\ : neuro-linguistic programming). 27 | 28 | ``http://www.statmt.org/`` 29 | 30 | ``http://www.aclweb.org/aclwiki/index.php?title=Textual_Entailment_Resource_Pool`` 31 | 32 | ---------------------------------- 33 | NLP Systems with Online Interfaces 34 | ---------------------------------- 35 | 36 | Several |NLP| systems have online interfaces that you might like to 37 | experiment with, e.g.: 38 | 39 | * WordNet: ``http://wordnet.princeton.edu/`` 40 | * Translation: ``http://babelfish.yahoo.com/``, ``http://translate.google.com/`` 41 | * ChatterBots: ``http://www.loebner.net/Prizef/loebner-prize.html`` 42 | * Question Answering: ``http://www.answerbus.com/`` 43 | * Summarization: ``http://newsblaster.cs.columbia.edu/`` 44 | 45 | Online concordancing: 46 | 47 | * ``http://corpus.leeds.ac.uk/internet.html`` 48 | 49 | ------ 50 | Python 51 | ------ 52 | 53 | A good starting place: http://www.python.org/doc/intros/ 54 | 55 | [vanRossum2006IP]_ is a Python tutorial by Guido van 56 | Rossum, the inventor of Python, and Fred Drake, the official 57 | editor of the Python documentation. It is available online at 58 | ``http://docs.python.org/tut/tut.html``.
A more detailed but still 59 | introductory text is [Lutz2003LP]_, which covers the essential 60 | features of Python, and also provides an overview of the standard libraries. 61 | A more advanced text, [vanRossum2006PLR]_, is the official reference 62 | for the Python language itself, and describes the syntax of Python and 63 | its built-in datatypes in depth. It is also available online at 64 | ``http://docs.python.org/ref/ref.html``. 65 | [Beazley2006PER]_ is a succinct reference book; although not suitable 66 | as an introduction to Python, it is an excellent resource for 67 | intermediate and advanced programmers. 68 | Finally, it is always worth checking the official *Python 69 | Documentation* at http://docs.python.org/. 70 | 71 | Two freely available online texts are the following: 72 | 73 | * Josh Cogliati, *Non-Programmer's Tutorial for Python*, 74 | ``http://en.wikibooks.org/wiki/Non-Programmer's_Tutorial_for_Python/Contents`` 75 | 76 | * Jeffrey Elkner, Allen B. Downey and Chris Meyers, 77 | *How to Think Like a Computer Scientist: Learning with Python* (Second Edition), 78 | ``http://openbookproject.net/thinkCSpy/`` 79 | 80 | 81 | .. include:: footer.rst 82 | -------------------------------------------------------------------------------- /LSA325/lsa325_5.tex: -------------------------------------------------------------------------------- 1 | \documentclass[t]{beamer} % for slides 2 | %\documentclass[handout]{beamer} % for handout 3 | \input{beamer} 4 | 5 | \title{Introduction to Computational Linguistics\\LSA 325} 6 | 7 | \author{Steven Bird \and Ewan Klein \and Edward Loper} 8 | \institute{ 9 | University of Melbourne, AUSTRALIA 10 | \and 11 | University of Edinburgh, UK 12 | \and 13 | University of Pennsylvania, USA 14 | } 15 | 16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 17 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 18 | 19 | \begin{document} 20 | 21 | 22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | 24 | \begin{frame} 25 | \titlepage 26 | \end{frame} 27 | 28 | 29 | \begin{frame} 30 | 31 | \frametitle{Compositional Semantics} 32 | 33 | \begin{itemize} 34 | \item Contrast with lexical semantics 35 | \item Meaning of a phrase is a function of the meaning of its parts 36 | \item Truth-conditions: minimum hurdle for a theory of meaning 37 | \item Entailment ($\phi \Rightarrow \psi$): every situation that makes 38 | $\phi$ true also makes $\psi$ true 39 | 40 | 41 | \end{itemize} 42 | \end{frame} 43 | 44 | \begin{frame} 45 | 46 | \begin{exampleblock}{Entailment Examples} 47 | \begin{itemize} 48 | \item \textit{Kim eats toasted bagels} $\Rightarrow$ \textit{Kim eats 49 | bagels} 50 | \item \textit{Lee sings and dances} $\Rightarrow$ \textit{Lee sings} 51 | \item \textit{Lee sings songs to Kim} $\Rightarrow$ \textit{Lee sings 52 | songs to someone} 53 | \item \textit{Kim hates all green vegetables and calabrese is a green 54 | vegetable} $\Rightarrow$ \textit{Kim hates 55 | calabrese} 56 | \end{itemize} 57 | \end{exampleblock} 58 | 59 | \end{frame} 60 | 61 | \begin{frame} 62 | 63 | \frametitle{Truth in a model, version 1} 64 | 65 | \begin{itemize} 66 | \item A model is a pair $\langle D, V\rangle$ 67 | \item $V:$ Individual terms $\mapsto$ entities in $D$ 68 | \item $V:$ 1-place predicates $\mapsto$ sets of entities 69 | \item $V:$ 2-place predicates (relations) $\mapsto$ sets of pairs of entities 70 | \end{itemize} 71 | 72 | \end{frame} 73 | 74 |
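% Sketch (illustrative; standard model theory, notation as in the
% preceding frame): how V determines the truth of atomic formulas.
\begin{frame}
\frametitle{Truth in a model, version 1: atomic formulas}
\begin{itemize}
\item $boy(b1)$ is true in $\langle D, V\rangle$ iff $V(b1) \in V(boy)$
\item $chase(b1,g1)$ is true iff $\langle V(b1), V(g1)\rangle \in V(chase)$
\end{itemize}
\end{frame}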
\begin{frame}[fragile] 75 | 76 | \frametitle{Truth in a model, version 1} 77 | 78 | \begin{exampleblock}{N-ary Relations} 79 | \begin{verbatim} 80 | ('boy', set(['b1', 'b2'])), 81 | ('chase', set([('b1', 'g1'), ('b2', 'g1'), ('g1', 'd1'), ('g2', 'd2')])), 82 | \end{verbatim} 83 | \end{exampleblock} 84 | \end{frame} 85 | 86 | \begin{frame} 87 | 88 | \frametitle{Truth in a model, version 2} 89 | 90 | \begin{itemize} 91 | \item A model is a pair $\langle D, V\rangle$ 92 | \item Individual terms $\mapsto$ entities 93 | \item 1-place predicates $\mapsto$ mappings from entities to truth values 94 | \item 2-place predicates (relations) $\mapsto$ mappings from 95 | entities to the meanings of 1-place predicates 96 | \end{itemize} 97 | \end{frame} 98 | 99 | \begin{frame}[fragile] 100 | 101 | \frametitle{Truth in a model, version 2} 102 | 103 | \begin{exampleblock}{Characteristic Functions} 104 | \begin{verbatim} 105 | 'boy': {'b1': True, 'b2': True}, 106 | 'chase': {'d1': {'g1': True}, 107 | 'd2': {'g2': True}, 108 | 'g1': {'b1': True, 'b2': True}}, 109 | \end{verbatim} 110 | \end{exampleblock} 111 | \end{frame} 112 | 113 | 114 | 115 | \end{document} 116 | -------------------------------------------------------------------------------- /LSA325/LSA325_5_handout.tex: -------------------------------------------------------------------------------- 1 | \documentclass[12pt]{article} 2 | \usepackage{url,a4wide} 3 | 4 | \begin{document} 5 | 6 | \begin{center} 7 | {\Large LSA 325, Class 6, Mon 23rd July} 8 | \end{center} 9 | 10 | 11 | 12 | \section*{Exercises for class} 13 | 14 | \subsection*{Unification} 15 | 16 | Try the following, and some variations that you make up yourself: 17 | \begin{verbatim} 18 | >>> from nltk import FeatStruct, unify 19 | >>> fs1 = FeatStruct.parse('[AGR=[GND=masc]]') 20 | >>> fs2 = FeatStruct.parse('[AGR=[PER=3]]') 21 | >>> print unify(fs1, fs2) 22 | [ AGR = [ GND = 'masc' ] ] 23 | [ [ PER = 3 ] ] 24 | \end{verbatim} 25 | 26 | \noindent 27 | Now have a go at a few of the unification examples from exercise (2) 28 | in section 9.3.4 of Chapter 9. It's up to you whether you want to try 29 | them with paper and pencil, or dive straight into the Python interpreter. 30 | 31 | \subsection*{Feature-based Grammar} 32 | \noindent 33 | Toy grammar of English NPs: 34 | \begin{verbatim} 35 | % start NP 36 | NP[AGR=?a] -> Det[AGR=?a] N[AGR=?a] 37 | Det[AGR=[NUM='sg', PER=3]] -> 'this' | 'that' 38 | Det[AGR=[NUM='pl', PER=3]] -> 'these' | 'those' 39 | Det[AGR=[NUM='pl', PER=1]] -> 'we' 40 | Det[AGR=[PER=2]] -> 'you' 41 | N[AGR=[NUM='sg', GND='m']] -> 'boy' 42 | N[AGR=[NUM='pl', GND='m']] -> 'boys' 43 | N[AGR=[NUM='sg', GND='f']] -> 'girl' 44 | N[AGR=[NUM='pl', GND='f']] -> 'girls' 45 | N[AGR=[NUM='sg']] -> 'student' 46 | N[AGR=[NUM='pl']] -> 'students' 47 | \end{verbatim} 48 | 49 | \noindent 50 | This can be downloaded from 51 | \url{http://nltk.org/temp/np.cfg}. Save it to your current working 52 | directory. At 53 | the same time, download and save the following file to the same directory: 54 | \url{http://nltk.org/temp/fix_featureparser.py}. Start up IDLE in this 55 | directory.
Then you can run the 56 | grammar as follows: 57 | \begin{verbatim} 58 | >>> from nltk.book import * 59 | >>> import fix_featureparser 60 | >>> tokens = 'these girls'.split() 61 | >>> cp = parse.load_earley('np.cfg', trace=2) 62 | >>> trees = cp.get_parse(tokens) 63 | >>> for tree in trees: print tree 64 | \end{verbatim} 65 | Alternatively, download these instructions as 66 | \url{http://nltk.org/temp/quickstart.py} and give the command 67 | \begin{verbatim} 68 | import quickstart 69 | \end{verbatim} 70 | 71 | \noindent 72 | Play with the grammar a bit. Now try to write a similar grammar of 73 | your own. You might like to try working with the Spanish data in 74 | Chapter 9; i.e., Exercise 3 in section 9.2.4. 75 | 76 | \subsection*{Semantics} 77 | 78 | Download and save the following file to the same directory: 79 | \url{http://nltk.org/temp/model.py}. 80 | 81 | You can now inspect this first-order model, and play around with the 82 | valuations that it gives. Here are some starting suggestions: 83 | \begin{verbatim} 84 | >>> from model import * 85 | >>> print m 86 | >>> val['walk'] 87 | >>> val['walk'][val['john']] 88 | >>> m.evaluate('(walk john)', g) 89 | >>> m.evaluate('((walk john) and (walk fido))', g) 90 | >>> m.evaluate('(chase mary fido)', g) 91 | >>> m.evaluate('(chase fido mary)', g) 92 | >>> g.add('b1', 'x') 93 | >>> print g 94 | g[b1/x] 95 | >>> m.evaluate('(chase fido x)', g) 96 | >>> m.evaluate('(walk x)', g) 97 | \end{verbatim} 98 | Have a look at Chapter 11, and see if you can evaluate some formulas 99 | involving the quantifiers \texttt{some} and \texttt{all}. 100 | 101 | 102 | 103 | \end{document} 104 | 105 | %%% Local Variables: 106 | %%% mode: latex 107 | %%% TeX-master: t 108 | %%% End: 109 | -------------------------------------------------------------------------------- /book/second-edition.txt: -------------------------------------------------------------------------------- 1 | Natural Language Processing with Python 2 | Proposal for 2nd Edition (Draft 14 Nov 2013) 3 | 4 | We propose a second edition of the book with the following key changes: 5 | 6 | 1. Incorporate material on new developments in the field where they 7 | are sufficiently mature to form part of an introductory textbook, such 8 | as dependency parsing and machine translation. 9 | 10 | 2. Incorporate and systematize popular applications of the toolkit 11 | that have arisen since the first edition was published, such as 12 | sentiment analysis and the semantic web. 13 | 14 | 3. Add new sections on scaling up to several of the chapters, in order 15 | to show readers how to handle larger datasets and how to interface to 16 | specialized industry-strength tools; this will give readers a clearer 17 | pathway into R&D. 18 | 19 | 4. Incorporate feedback from people who have adopted the book for teaching 20 | (we have identified approximately 100 such courses). 21 | 22 | 5. Update program samples for consistency with Python 3 and NLTK 3; in 23 | many cases the surrounding discussion needs to be updated as well. 24 | 25 | 6. Tighten up existing content to save space; trim sections with the 26 | help of reader feedback; rescale or rework diagrams so they take less 27 | space; possibly hyperlinking from electronic versions to online code 28 | samples so that some can be omitted from the book. 29 | 30 | 31 | 32 | Table of Contents (additions marked with "+", deletions marked with "-") 33 | 34 | 0. Preface 35 | + converting NLTK 2 to NLTK 3 code 36 | 37 | 1. 
Language Processing and Python 38 | + Chatbots 39 | 40 | 2. Accessing Text Corpora and Lexical Resources 41 | + Google ngrams corpus 42 | - move lexical resources to lexicon chapter 43 | 44 | 3. Processing Raw Text 45 | + processing twitter feeds 46 | 47 | 4. Writing Structured Programs 48 | + string edit distance 49 | 50 | 5. Categorizing and Tagging Words 51 | + scaling up: interface to stanford tagger 52 | 53 | 6. Learning to Classify Text 54 | + clustering, semi-supervised approaches 55 | + scaling up: map-reduce and NLP in the cloud 56 | 57 | + The Lexicon 58 | + wordnet, framenet 59 | + ontologies and the semantic web 60 | + interface to finite-state morphology toolkit 61 | + word-sense disambiguation 62 | + distributional semantics 63 | + multilingual wordnet 64 | 65 | 7. Extracting Information from Text 66 | + semantic role labeling, VerbNet, and PropBank 67 | + sentiment analysis 68 | + resources for named-entity recognition 69 | + abbreviations 70 | + normalization and grounding of named entities 71 | + scaling up: interface to stanford NER system 72 | 73 | + Machine Translation [new chapter] 74 | + bilingual aligned text 75 | + bitext corpora 76 | + sentence alignment (Gale-Church algorithm) 77 | + word-alignment (IBM models 1-3) 78 | + evaluation 79 | 80 | 8. Analyzing Sentence Structure 81 | * grammar-development -> scaling up: grammar development 82 | 8.6 add material on training dependency grammar 83 | + scaling up: interfacing to stanford parser and maltparser 84 | 85 | 9. Building Feature Based Grammars 86 | 87 | 10. Analyzing the Meaning of Sentences 88 | + machine learning techniques for learning semantic representations 89 | - model theoretic semantics moved into free-standing HOWTO 90 | 91 | 11. Managing Linguistic Data 92 | 93 | 12. 
Afterword: Facing the Language Challenge 94 | 95 | Omitted: 96 | * textual entailment 97 | * summarisation 98 | * generation 99 | 100 | -------------------------------------------------------------------------------- /images/are.fig: -------------------------------------------------------------------------------- 1 | #FIG 3.2 2 | Landscape 3 | Center 4 | Metric 5 | A4 6 | 100.00 7 | Single 8 | -2 9 | 1200 2 10 | 6 1800 2700 2250 4050 11 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 12 | 1800 2700 2250 2700 2250 3150 1800 3150 1800 2700 13 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 14 | 1800 3150 2250 3150 2250 3600 1800 3600 1800 3150 15 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 16 | 1800 3600 2250 3600 2250 4050 1800 4050 1800 3600 17 | -6 18 | 6 2250 2700 2700 4050 19 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 20 | 2250 2700 2700 2700 2700 3150 2250 3150 2250 2700 21 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 22 | 2250 3150 2700 3150 2700 3600 2250 3600 2250 3150 23 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 24 | 2250 3600 2700 3600 2700 4050 2250 4050 2250 3600 25 | -6 26 | 6 2700 2700 3150 4050 27 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 28 | 2700 2700 3150 2700 3150 3150 2700 3150 2700 2700 29 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 30 | 2700 3150 3150 3150 3150 3600 2700 3600 2700 3150 31 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 32 | 2700 3600 3150 3600 3150 4050 2700 4050 2700 3600 33 | -6 34 | 6 3150 2700 3600 4050 35 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 36 | 3150 2700 3600 2700 3600 3150 3150 3150 3150 2700 37 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 38 | 3150 3150 3600 3150 3600 3600 3150 3600 3150 3150 39 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 40 | 3150 3600 3600 3600 3600 4050 3150 4050 3150 3600 41 | -6 42 | 6 3600 2700 4050 4050 43 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 44 | 3600 2700 4050 2700 4050 3150 3600 3150 3600 2700 45 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 46 | 3600 3150 4050 3150 4050 3600 3600 3600 3600 3150 47 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 48 | 3600 3600 4050 3600 4050 4050 3600 4050 3600 3600 49 | -6 50 | 6 4050 2700 4500 4050 51 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 52 | 4050 2700 4500 2700 4500 3150 4050 3150 4050 2700 53 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 54 | 4050 3150 4500 3150 4500 3600 4050 3600 4050 3150 55 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 56 | 4050 3600 4500 3600 4500 4050 4050 4050 4050 3600 57 | -6 58 | 6 4500 2700 4950 4050 59 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 60 | 4500 2700 4950 2700 4950 3150 4500 3150 4500 2700 61 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 62 | 4500 3150 4950 3150 4950 3600 4500 3600 4500 3150 63 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 64 | 4500 3600 4950 3600 4950 4050 4500 4050 4500 3600 65 | -6 66 | 6 4950 2700 5400 4050 67 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 68 | 4950 2700 5400 2700 5400 3150 4950 3150 4950 2700 69 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 70 | 4950 3150 5400 3150 5400 3600 4950 3600 4950 3150 71 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 72 | 4950 3600 5400 3600 5400 4050 4950 4050 4950 3600 73 | -6 74 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 75 | 5400 3150 5850 3150 5850 3600 5400 3600 5400 3150 76 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 77 | 5400 3600 5850 3600 5850 4050 5400 4050 5400 3600 78 | 2 2 0 1 0 0 50 -1 10 0.000 0 0 -1 0 0 5 79 | 5400 2700 5850 2700 5850 3150 5400 3150 5400 2700 80 | 4 1 0 50 -1 0 15 0.0000 4 150 150 2520 4320 B\001 81 | 4 1 0 50 -1 0 15 0.0000 4 150 165 2070 4320 A\001 82 | 4 1 0 50 -1 0 15 0.0000 4 150 150 2970 4320 
C\001 83 | 4 1 0 50 -1 0 15 0.0000 4 150 165 3420 4320 D\001 84 | 4 1 0 50 -1 0 15 0.0000 4 150 135 3870 4320 E\001 85 | 4 1 0 50 -1 0 15 0.0000 4 150 120 4320 4320 F\001 86 | 4 1 0 50 -1 0 15 0.0000 4 150 165 4770 4320 G\001 87 | 4 1 0 50 -1 0 15 0.0000 4 150 165 5220 4320 H\001 88 | 4 1 0 50 -1 0 15 0.0000 4 150 75 5580 4320 I\001 89 | 4 1 0 50 -1 0 15 0.0000 4 105 105 1530 2970 a\001 90 | 4 1 0 50 -1 0 15 0.0000 4 150 120 1530 3420 b\001 91 | 4 1 0 50 -1 0 15 0.0000 4 105 105 1530 3870 c\001 92 | -------------------------------------------------------------------------------- /book/ch01-notes.rst: -------------------------------------------------------------------------------- 1 | 2 | ======================= 3 | Computing with Language 4 | ======================= 5 | 6 | * not a conventional introduction to programming where we work 7 | through language features one by one 8 | (in fact, features will be introduced in a rather unusual order) 9 | * plenty of such books exist already (incl for Python) 10 | * instead, a problem-oriented approach: a series of tasks each requiring some programming, 11 | each building on what has come before (so getting more difficult) 12 | * starting point: we have lots of text and lots of computing cycles: what can we do? 13 | * no prior programming ability assumed, just retyping examples 14 | 15 | ---------------------------------- 16 | Searching large quantities of text 17 | ---------------------------------- 18 | 19 | * most obvious: searching large amounts of text 20 | * includes functionality for generating random text in this style 21 | * first-hand experience with scale and diversity of corpora 22 | 23 | Questions coming out of this: 24 | * what makes texts different? 25 | * what is a text? seq of characters on a page (does page matter?) 26 | seq of words? seq of chapters made up of seq of paras ... 27 | * our simplification: text = sequence of words (plus punctuation 'words'): "tokens" 28 | * explicit notation: ["the", "cat", "sat", "on", "the", "mat"] 29 | * key concept: TEXT = LIST OF WORDS 30 | * reuse material from 2.4.1 31 | 32 | IDLE session: 33 | * getting started with IDLE 34 | * lists, str.split(), len() 35 | * variables 36 | 37 | ------------------- 38 | Counting vocabulary 39 | ------------------- 40 | 41 | * one thing that makes texts different is the set of words used (vocabulary) 42 | * vocabulary richness 43 | * defining functions -- allows us to explain what the () are everywhere 44 | and gives inkling of the power of programming 45 | * key concept: VOCABULARY = SET OF WORDS 46 | 47 | IDLE session: 48 | * str.lower() 49 | * defining simple functions (diagram of unary function) 50 | 51 | 52 | >>> sorted(set(word for word in text3 if word.endswith("eth"))) 53 | ['Hazarmaveth', 'Heth', 'Japheth', 'Jetheth', 'Seth', 'aileth', 'asketh', 'biteth', 'blesseth', 'breaketh', 'cometh', 'compasseth', 'creepeth', 'crieth', 'curseth', 'divineth', 'doeth', 'drinketh', 'faileth', 'findeth', 'giveth', 'goeth', 'knoweth', 'lieth', 'liveth', 'longeth', 'loveth', 'meeteth', 'moveth', 'needeth', 'pleaseth', 'proceedeth', 'remaineth', 'repenteth', 'seeth', 'sheddeth', 'sheweth', 'slayeth', 'speaketh', 'teeth', 'togeth', 'toucheth', 'twentieth', 'walketh', 'wotteth'] 54 | 55 | 56 | 57 | ------- 58 | Corpora 59 | ------- 60 | 61 | * definition 62 | * accessing 63 | 64 | 65 | 66 | -------------- 67 | Changing Tense 68 | -------------- 69 | 70 | * convert a verb into past tense (perfect) 71 | * motivation? 
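* a minimal sketch of such a function (assumes regular verbs only; the
  real version will need more conditions for irregular cases):

>>> def past(word):
...     if word.endswith('e'):
...         return word + 'd'
...     return word + 'ed'
>>> past('walk')
'walked'
>>> past('love')
'loved'
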
72 | * key concepts: CONDITIONAL EXPRESSIONS, STRINGS 73 | 74 | IDLE session: 75 | * string concatenation 76 | * string indexing 77 | * conditional expressions 78 | * function past(word) -> past-tense form 79 | 80 | -------------- 81 | Classification 82 | -------------- 83 | 84 | * informal study of how texts differ 85 | * genre, author, language 86 | * FreqDists initialized with list comprehensions 87 | * key concept: COMPREHENSIONS (ITERATION) 88 | 89 | IDLE session: 90 | * word length distribution plot: FreqDist(len(word) for word in text).plot() 91 | (comparing languages, text difficulties) 92 | (need to permit >1 plot to be overlaid) 93 | * character distribution plot: FreqDist(char for word in text for char in word).plot() 94 | (comparing languages) 95 | * relative frequency of modals: FreqDist(word for word in text if word in modals).plot() 96 | (comparing Brown corpus genres) 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /LSA325/log_fds.txt: -------------------------------------------------------------------------------- 1 | Python 2.5 (r25:51918, Sep 19 2006, 08:49:13) 2 | [GCC 4.0.1 (Apple Computer, Inc. build 5341)] on darwin 3 | Type "copyright", "credits" or "license()" for more information. 4 | 5 | **************************************************************** 6 | Personal firewall software may warn about the connection IDLE 7 | makes to its subprocess using this computer's internal loopback 8 | interface. This connection is not visible on any external 9 | interface and no data is sent to or received from the Internet. 10 | **************************************************************** 11 | 12 | IDLE 1.2 13 | >>> 14 | >>> 15 | >>> from nltk.book import * 16 | >>> text = '''Hello. 
Isn't this fun?''' 17 | >>> list(tokenize.regexp(text, r'[a-z]')) 18 | ['e', 'l', 'l', 'o', 's', 'n', 't', 't', 'h', 'i', 's', 'f', 'u', 'n'] 19 | >>> list(tokenize.regexp(text, r'[a-z]+')) 20 | ['ello', 'sn', 't', 'this', 'fun'] 21 | >>> list(tokenize.regexp(text, r'[A-Za-z]+')) 22 | ['Hello', 'Isn', 't', 'this', 'fun'] 23 | >>> list(tokenize.regexp(text, r'[A-Za-z]+|[.?!;]')) 24 | ['Hello', '.', 'Isn', 't', 'this', 'fun', '?'] 25 | >>> list(tokenize.regexp(text, r'[A-Za-z]+|[.?!;']')) 26 | SyntaxError: invalid syntax 27 | >>> 28 | >>> list(tokenize.regexp(text, r"[A-Za-z]+|[.?!;']")) 29 | ['Hello', '.', 'Isn', "'", 't', 'this', 'fun', '?'] 30 | >>> list(tokenize.regexp(text, r"\w+[.?!;']\w+|[.?!;']")) 31 | ['.', "Isn't", '?'] 32 | >>> list(tokenize.regexp(text, r"\w+([.?!;']\w+)?|[.?!;']")) 33 | ['Hello', '.', "Isn't", 'this', 'fun', '?'] 34 | >>> list(tokenize.whitespace(text)) 35 | ['Hello.', "Isn't", 'this', 'fun?'] 36 | >>> list(tokenize.wordpunct(text)) 37 | ['Hello', '.', 'Isn', "'", 't', 'this', 'fun', '?'] 38 | >>> nltk.FreqDist 39 | 40 | >>> sentence = "the cat sat on the mat" 41 | >>> words = sentence.split() 42 | >>> words 43 | ['the', 'cat', 'sat', 'on', 'the', 'mat'] 44 | >>> fd = nltk.FreqDist(words) 45 | >>> 46 | >>> fd['the'] 47 | 2 48 | >>> fd['sat'] 49 | 1 50 | >>> fd2 = nltk.FreqDist(sentence) 51 | >>> fd2.keys() 52 | ['a', ' ', 'c', 'e', 'h', 'm', 'o', 'n', 's', 't'] 53 | >>> fd2['c'] 54 | 1 55 | >>> corpus.inaugural.items 56 | ['1789-Washington', '1793-Washington', '1797-Adams', '1801-Jefferson', '1805-Jefferson', '1809-Madison', '1813-Madison', '1817-Monroe', '1821-Monroe', '1825-Adams', '1829-Jackson', '1833-Jackson', '1837-VanBuren', '1841-Harrison', '1845-Polk', '1849-Taylor', '1853-Pierce', '1857-Buchanan', '1861-Lincoln', '1865-Lincoln', '1869-Grant', '1873-Grant', '1877-Hayes', '1881-Garfield', '1885-Cleveland', '1889-Harrison', '1893-Cleveland', '1897-McKinley', '1901-McKinley', '1905-Roosevelt', '1909-Taft', '1913-Wilson', '1917-Wilson', '1921-Harding', '1925-Coolidge', '1929-Hoover', '1933-Roosevelt', '1937-Roosevelt', '1941-Roosevelt', '1945-Roosevelt', '1949-Truman', '1953-Eisenhower', '1957-Eisenhower', '1961-Kennedy', '1965-Johnson', '1969-Nixon', '1973-Nixon', '1977-Carter', '1981-Reagan', '1985-Reagan', '1989-Bush', '1993-Clinton', '1997-Clinton', '2001-Bush', '2005-Bush'] 57 | >>> for word in corpus.inaugural.tokenized('2005-Bush'): 58 | if word in ['he', 'him', 'she', 'her', 'man', 'woman']: 59 | print word, 60 | man woman her 61 | >>> fd = nltk.FreqDist() 62 | >>> fd 63 | 64 | >>> fd.inc('male') 65 | >>> fd 66 | 67 | >>> 68 | >>> fd.inc('female') 69 | >>> fd.inc('female') 70 | >>> fd.inc('female') 71 | >>> fd.inc('female') 72 | >>> fd 73 | 74 | >>> 75 | >>> fd['male'] 76 | 1 77 | >>> fd['female'] 78 | 4 79 | >>> if word in ['..', '..', '..']: 80 | fd.inc('???') 81 | 82 | >>> for word in corpus.inaugural.tokenized('2005-Bush'): 83 | fd.inc(word) 84 | >>> fd 85 | 86 | >>> fd['President'] 87 | 4 88 | >>> fd['man'] 89 | 1 90 | >>> 91 | 92 | -------------------------------------------------------------------------------- /images/chunk-tagrep.fig: -------------------------------------------------------------------------------- 1 | #FIG 3.2 2 | Landscape 3 | Center 4 | Metric 5 | A4 6 | 100.00 7 | Single 8 | -2 9 | 1200 2 10 | 0 32 #aeaeae 11 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 12 | 540 1170 720 1170 720 1350 540 1350 540 1170 13 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 14 | 720 1170 900 1170 900 1350 720 1350 720 1170 15 | 2 2 0 1 32 7 50 0 -1 
0.000 0 0 -1 0 0 5 16 | 900 1170 1080 1170 1080 1350 900 1350 900 1170 17 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 18 | 1080 1170 1260 1170 1260 1350 1080 1350 1080 1170 19 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 20 | 1260 1170 1440 1170 1440 1350 1260 1350 1260 1170 21 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 22 | 1440 1170 1620 1170 1620 1350 1440 1350 1440 1170 23 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 24 | 1620 1170 1800 1170 1800 1350 1620 1350 1620 1170 25 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 26 | 1800 1170 1980 1170 1980 1350 1800 1350 1800 1170 27 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 28 | 1980 1170 2160 1170 2160 1350 1980 1350 1980 1170 29 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 30 | 2160 1170 2340 1170 2340 1350 2160 1350 2160 1170 31 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 32 | 2340 1170 2520 1170 2520 1350 2340 1350 2340 1170 33 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 34 | 2700 1170 2880 1170 2880 1350 2700 1350 2700 1170 35 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 36 | 2520 1170 2700 1170 2700 1350 2520 1350 2520 1170 37 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 38 | 3060 1170 3240 1170 3240 1350 3060 1350 3060 1170 39 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 40 | 2880 1170 3060 1170 3060 1350 2880 1350 2880 1170 41 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 42 | 3240 1170 3420 1170 3420 1350 3240 1350 3240 1170 43 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 44 | 3420 1170 3600 1170 3600 1350 3420 1350 3420 1170 45 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 46 | 3600 1170 3780 1170 3780 1350 3600 1350 3600 1170 47 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 48 | 3780 1170 3960 1170 3960 1350 3780 1350 3780 1170 49 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 50 | 495 1125 945 1125 945 1800 495 1800 495 1125 51 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 52 | 1035 1125 1665 1125 1665 1800 1035 1800 1035 1125 53 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 54 | 1755 1125 2385 1125 2385 1800 1755 1800 1755 1125 55 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 56 | 2475 1125 3105 1125 3105 1800 2475 1800 2475 1125 57 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 58 | 3195 1125 3825 1125 3825 1800 3195 1800 3195 1125 59 | 4 1 0 50 0 14 12 0.0000 4 75 135 1170 1305 s\001 60 | 4 1 0 50 0 14 12 0.0000 4 75 135 1350 1305 a\001 61 | 4 1 0 50 0 14 12 0.0000 4 75 135 1530 1305 w\001 62 | 4 1 0 50 0 14 12 0.0000 4 105 135 1890 1305 t\001 63 | 4 1 0 50 0 14 12 0.0000 4 105 135 2070 1305 h\001 64 | 4 1 0 50 0 14 12 0.0000 4 75 135 2250 1305 e\001 65 | 4 1 0 50 0 14 12 0.0000 4 105 135 2610 1305 b\001 66 | 4 1 0 50 0 14 12 0.0000 4 105 135 2790 1305 i\001 67 | 4 1 0 50 0 14 12 0.0000 4 105 135 2970 1305 g\001 68 | 4 1 0 50 0 14 12 0.0000 4 105 135 3330 1305 d\001 69 | 4 1 0 50 0 14 12 0.0000 4 75 135 3510 1305 o\001 70 | 4 1 0 50 0 14 12 0.0000 4 105 135 3690 1305 g\001 71 | 4 1 0 50 0 14 12 0.0000 4 30 135 3870 1305 .\001 72 | 4 1 0 50 0 14 12 0.0000 4 105 135 630 1305 H\001 73 | 4 1 0 50 0 14 12 0.0000 4 75 135 810 1305 e\001 74 | 4 1 0 50 0 14 12 0.0000 4 105 405 720 1575 PRP\001 75 | 4 1 0 50 0 14 12 0.0000 4 105 405 1350 1575 VBD\001 76 | 4 1 0 50 0 14 12 0.0000 4 105 270 2070 1575 DT\001 77 | 4 1 0 50 0 14 12 0.0000 4 105 270 2790 1575 JJ\001 78 | 4 1 0 50 0 14 12 0.0000 4 105 270 3510 1575 NN\001 79 | 4 1 0 50 0 14 9 0.0000 4 75 420 720 1755 B-NP\001 80 | 4 1 0 50 0 14 9 0.0000 4 75 105 1350 1755 O\001 81 | 4 1 0 50 0 14 9 0.0000 4 75 420 2070 1755 B-NP\001 82 | 4 1 0 50 0 14 9 0.0000 4 75 420 2790 1755 I-NP\001 83 | 4 1 0 50 0 14 9 0.0000 4 75 420 3510 1755 I-NP\001 84 | 
-------------------------------------------------------------------------------- /xmlpp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2008, Fredrik Ekholdt 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | * Neither the name of None nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | 15 | (Minor modifications by Steven Bird) 16 | """ 17 | import sys 18 | 19 | def usage(this_file): 20 | return """SYNOPSIS: pretty print an XML document 21 | USAGE: python %s <file> or use stdin as input\n""" % this_file 22 | 23 | def pprint(indent_level, line): 24 | if line.strip(): 25 | sys.stdout.write(" " * indent_level + line + "\n") 26 | 27 | def get_next_elem(data): 28 | start_pos = data.find("<") 29 | end_pos = data.find(">") + 1 30 | retval = data[start_pos:end_pos] 31 | stopper = retval.find("/") 32 | single = (stopper > -1 and ((retval.find(">") - stopper) < (stopper - retval.find("<")))) 33 | 34 | ignore_excl = retval.find("<!") > -1 35 | ignore_question = retval.find("<?") > -1 36 | 37 | if ignore_excl: 38 | cdata = retval.find("<![CDATA[") > -1 39 | if cdata: 40 | end_pos = data.find("]]>") 41 | if end_pos > -1: 42 | end_pos = end_pos + len("]]>") 43 | 44 | elif ignore_question: 45 | end_pos = data.find("?>") + len("?>") 46 | ignore = ignore_excl or ignore_question 47 | 48 | no_indent = ignore or single 49 | 50 | #print retval, end_pos, start_pos, no_indent 51 | return start_pos, \ 52 | end_pos, \ 53 | stopper > -1, \ 54 | no_indent 55 | 56 | 57 | if __name__ == "__main__": 58 | if "-h" in sys.argv or "--help" in sys.argv: 59 | sys.stderr.write(usage(sys.argv[0])) 60 | sys.exit(1) 61 | if len(sys.argv) < 2: 62 | data = sys.stdin.read() 63 | else: 64 | filename = sys.argv[1] 65 | data = open(filename).read() 66 | 67 | INDENT = 2 68 | 69 | indent_level = 0 70 | 71 | start_pos, end_pos, is_stop, no_indent = get_next_elem(data) 72 | while ((start_pos > -1 and end_pos > -1)): 73 | pprint(indent_level, data[:start_pos].strip()) 74 | data = data[start_pos:] 75 | if is_stop and not no_indent: 76 | indent_level = indent_level - INDENT 77 | pprint(indent_level, data[:end_pos - start_pos]) 78 | data = data[end_pos - start_pos:] 79 | if not is_stop and
not no_indent : 80 | indent_level = indent_level + INDENT 81 | 82 | if not data: 83 | break 84 | else: 85 | start_pos, end_pos, is_stop, no_indent = get_next_elem(data) 86 | -------------------------------------------------------------------------------- /images/chunk-segmentation.fig: -------------------------------------------------------------------------------- 1 | #FIG 3.2 2 | Landscape 3 | Center 4 | Metric 5 | A4 6 | 100.00 7 | Single 8 | -2 9 | 1200 2 10 | 0 32 #aeaeae 11 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 12 | 540 1170 720 1170 720 1350 540 1350 540 1170 13 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 14 | 720 1170 900 1170 900 1350 720 1350 720 1170 15 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 16 | 900 1170 1080 1170 1080 1350 900 1350 900 1170 17 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 18 | 1080 1170 1260 1170 1260 1350 1080 1350 1080 1170 19 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 20 | 1260 1170 1440 1170 1440 1350 1260 1350 1260 1170 21 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 22 | 1440 1170 1620 1170 1620 1350 1440 1350 1440 1170 23 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 24 | 1620 1170 1800 1170 1800 1350 1620 1350 1620 1170 25 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 26 | 1800 1170 1980 1170 1980 1350 1800 1350 1800 1170 27 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 28 | 1980 1170 2160 1170 2160 1350 1980 1350 1980 1170 29 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 30 | 2160 1170 2340 1170 2340 1350 2160 1350 2160 1170 31 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 32 | 2340 1170 2520 1170 2520 1350 2340 1350 2340 1170 33 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 34 | 2700 1170 2880 1170 2880 1350 2700 1350 2700 1170 35 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 36 | 2520 1170 2700 1170 2700 1350 2520 1350 2520 1170 37 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 38 | 3060 1170 3240 1170 3240 1350 3060 1350 3060 1170 39 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 40 | 2880 1170 3060 1170 3060 1350 2880 1350 2880 1170 41 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 42 | 3240 1170 3420 1170 3420 1350 3240 1350 3240 1170 43 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 44 | 3420 1170 3600 1170 3600 1350 3420 1350 3420 1170 45 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 46 | 3600 1170 3780 1170 3780 1350 3600 1350 3600 1170 47 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 48 | 3780 1170 3960 1170 3960 1350 3780 1350 3780 1170 49 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 50 | 495 1125 945 1125 945 1620 495 1620 495 1125 51 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 52 | 1035 1125 1665 1125 1665 1620 1035 1620 1035 1125 53 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 54 | 2475 1125 3105 1125 3105 1620 2475 1620 2475 1125 55 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 56 | 3195 1125 3825 1125 3825 1620 3195 1620 3195 1125 57 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 58 | 1755 1125 2385 1125 2385 1620 1755 1620 1755 1125 59 | 2 2 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 5 60 | 1710 1080 3870 1080 3870 1890 1710 1890 1710 1080 61 | 2 2 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 5 62 | 450 1080 990 1080 990 1890 450 1890 450 1080 63 | 4 1 0 50 0 14 12 0.0000 4 75 105 1170 1305 s\001 64 | 4 1 0 50 0 14 12 0.0000 4 75 105 1350 1305 a\001 65 | 4 1 0 50 0 14 12 0.0000 4 75 105 1530 1305 w\001 66 | 4 1 0 50 0 14 12 0.0000 4 105 105 1890 1305 t\001 67 | 4 1 0 50 0 14 12 0.0000 4 105 105 2070 1305 h\001 68 | 4 1 0 50 0 14 12 0.0000 4 75 105 2250 1305 e\001 69 | 4 1 0 50 0 14 12 0.0000 4 105 105 2610 1305 b\001 70 | 4 1 0 50 0 14 12 0.0000 4 105 105 2790 1305 i\001 71 | 4 1 0 50 0 14 12 0.0000 4 120 105 2970 1305 g\001 72 | 4 1 0 
50 0 14 12 0.0000 4 105 105 3330 1305 d\001 73 | 4 1 0 50 0 14 12 0.0000 4 75 105 3510 1305 o\001 74 | 4 1 0 50 0 14 12 0.0000 4 120 105 3690 1305 g\001 75 | 4 1 0 50 0 14 12 0.0000 4 30 105 3870 1305 .\001 76 | 4 1 0 50 0 14 12 0.0000 4 105 105 630 1305 H\001 77 | 4 1 0 50 0 14 12 0.0000 4 75 105 810 1305 e\001 78 | 4 1 0 50 0 14 12 0.0000 4 105 315 720 1575 PRP\001 79 | 4 1 0 50 0 14 12 0.0000 4 105 315 1350 1575 VBD\001 80 | 4 1 0 50 0 14 12 0.0000 4 105 210 2070 1575 DT\001 81 | 4 1 0 50 0 14 12 0.0000 4 105 210 2790 1575 JJ\001 82 | 4 1 0 50 0 14 12 0.0000 4 105 210 3510 1575 NN\001 83 | 4 1 0 50 0 14 12 0.0000 4 105 210 2790 1845 NP\001 84 | 4 1 0 50 0 14 12 0.0000 4 105 210 720 1845 NP\001 85 | -------------------------------------------------------------------------------- /HouseStyle.txt: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. include:: definitions.rst 3 | 4 | ========================================== 5 | NLTK Book House Style: Restructured Text 6 | ========================================== 7 | 8 | ------------------ 9 | ReST Inline Markup 10 | ------------------ 11 | 12 | 13 | Double backquotes are used for code:: 14 | 15 | e.g., the ``tokenize.whitespace`` function 16 | 17 | Double quotes used for quoted speech:: 18 | 19 | e.g., a noun is "the name of a person, place or thing" 20 | 21 | Single quotes used for scare quotes:: 22 | 23 | e.g., there is no one 'right way' to assign tags 24 | 25 | ---------- 26 | Text Roles 27 | ---------- 28 | 29 | LX: Linguistic eXample -- cited form in running text:: 30 | 31 | e.g., the verb `walks`:lx: 32 | 33 | DT: Defined Term -- first or canonical use of technical term:: 34 | 35 | e.g., the process of `parsing`:dt: 36 | 37 | GC: Grammatical Category:: 38 | 39 | e.g., `NP`:gc: and `verb`:gc: as technical terms 40 | 41 | EM: Emphasis:: 42 | 43 | e.g., this word is `emphasised`:em: here. 44 | 45 | ----------------------------- 46 | Examples and Cross-references 47 | ----------------------------- 48 | 49 | Write examples using the custom 'example' directive (or 'ex' for short):: 50 | 51 | .. ex:: John went to the store. 52 | .. ex:: John bought some bread. 53 | 54 | The example directive may be nested, to define groups of related examples:: 55 | 56 | .. ex:: 57 | .. ex:: John went to the store. 58 | .. ex:: \* John went from the store. 59 | 60 | (Note that \* must be backslashed when used to indicate grammaticality 61 | judgements.) 62 | 63 | To refer to an example, mark it with a crossreference target, and 64 | refer to it with a crossreference link:: 65 | 66 | .. _johntostore: 67 | .. ex:: John went to the store. 68 | 69 | In example johntostore_, `John`:lx: is the subject. 70 | 71 | .. Old instructions: 72 | 73 | When the text above an example refers to the example by number, the 74 | text ends with a period. When the text does not refer to the example 75 | by number, it ends with a colon. 76 | In text references, place letters referring to subexample numbers 77 | inside parentheses. (2a), (5a – b), (6a,c), (6a – f), 78 | (8a,c,d), 79 | 80 | 81 | ------ 82 | Titles 83 | ------ 84 | 85 | Section titles and captions should have initial capitals on non-function words. 86 | 87 | ----- 88 | Trees 89 | ----- 90 | 91 | Write trees using the custom 'tree' directive:: 92 | 93 | .. 
tree:: (S (NP John) (VP (V saw) (NP Mary))) 94 | 95 | Constituents that should be drawn with a 'roof' (i.e., a triangle 96 | between the node & its children, rather than individual lines) 97 | are marked using angle brackets:: 98 | 99 | .. tree:: (S (NP John) <VP saw Mary>) 100 | 101 | Subscripting is done using underscore (similar to latex). If 102 | the subscripted string is more than one character long, it should 103 | be enclosed in brackets:: 104 | 105 | .. tree:: (S (NP Mary_i) (VP was (VP seen t_i))) 106 | 107 | Substrings can be italicized by using '*...*':: 108 | 109 | .. tree:: (S (NP *Mary_i*) (VP was (VP seen *t_i*))) 110 | 111 | Backslash can be used to escape characters that would otherwise 112 | be treated as markup (i.e., any of C{'<>()_* '}). Note that this 113 | list includes space:: 114 | 115 | .. tree:: (S (NP Mary) (VP went (PP to (NP New\ York)))) 116 | 117 | Typically, trees will be included as the single element of an example:: 118 | 119 | .. ex:: 120 | .. tree:: (S (NP Mary) (VP went (PP to (NP New\ York)))) 121 | 122 | ------------------ 123 | Dashes and Hyphens 124 | ------------------ 125 | 126 | Use an em-dash between words:: 127 | 128 | e.g., you should try yourself |mdash| in fact, we insist! 129 | 130 | Use an en-dash between numerals:: 131 | 132 | e.g., during the period 1900\ |ndash|\ 1950 133 | -------------------------------------------------------------------------------- /LSA325/lsa110_1.tex: -------------------------------------------------------------------------------- 1 | %\documentclass{beamer} % for slides 2 | \documentclass[handout]{beamer} % for handout 3 | \input{beamer} 4 | 5 | \title{Python Programming for Linguists\\LSA 100 Presession} 6 | 7 | % \author{Steven Bird \and Ewan Klein \and Edward Loper} 8 | % \institute{ 9 | % University of Melbourne, AUSTRALIA 10 | % \and 11 | % University of Edinburgh, UK 12 | % \and 13 | % University of Pennsylvania, USA 14 | % } 15 | 16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 17 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 18 | 19 | \begin{document} 20 | 21 | 22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | 24 | \begin{frame} 25 | \titlepage 26 | \end{frame} 27 | 28 | 29 | 30 | \begin{frame} 31 | \frametitle{Introduction} 32 | \begin{itemize} 33 | \item Who we are 34 | \item Python and NLTK 35 | \item Materials and Resources 36 | \item Goals 37 | \item Syllabus 38 | \end{itemize} 39 | \end{frame} 40 | 41 | \begin{frame} 42 | \frametitle{Who we are} 43 | 44 | Instructors: 45 | \begin{itemize} 46 | \item Steven Bird 47 | \item Ewan Klein 48 | \item Edward Loper (here tomorrow) 49 | \end{itemize} 50 | 51 | TAs: 52 | \begin{itemize} 53 | \item David Hall 54 | \item Yaron Greif 55 | \item Yun-Hsuan Sung 56 | % \item Jette Viethen 57 | \end{itemize} 58 | 59 | \end{frame} 60 | 61 | 62 | \begin{frame} 63 | \frametitle{Python and NLTK} 64 | \begin{itemize} 65 | \item Pre-session for \textit{Introduction to 66 | Computational Linguistics} (LSA 325) 67 | \item First steps in using Python and Natural Language Toolkit (NLTK) 68 | \item Why Python? 69 | \begin{itemize} 70 | \item designed to be easy to learn; 71 | \item good for processing linguistic data; 72 | \item good for interactive experiments.
73 | \end{itemize} 74 | \item Many online tutorials (see \url{www.python.org}) 75 | \end{itemize} 76 | \end{frame} 77 | 78 | 79 | \begin{frame} 80 | \frametitle{Materials and Resources} 81 | \begin{itemize} 82 | \item Chapter 2, \textit{Programming Fundamentals and Python} in the NLTK Book 83 | (\url{http://nltk.org/index.php/Book}) 84 | \item \textbf{Su07-LSA-110} page on \url{http://coursework.stanford.edu} 85 | \item Main NLTK page: \url{http://nltk.org} 86 | \begin{itemize} 87 | \item Chatroom 88 | \item \texttt{nltk-users} Mailing List 89 | \end{itemize} 90 | 91 | \end{itemize} 92 | \end{frame} 93 | 94 | \begin{frame} 95 | \frametitle{Audience and Goals} 96 | \begin{itemize} 97 | \item We are assuming you have not done programming before. 98 | \item So, getting you to a point where: 99 | \begin{itemize} 100 | \item you have got some confidence in using basic Python commands; 101 | \item you can use Python for carrying out simple operations on text; 102 | \item you can do all the easy and intermediate exercises in 103 | Chapter 2; 104 | \item you have found out where to get more information (fellow 105 | students, the web, textbooks) 106 | \end{itemize} 107 | \end{itemize} 108 | \end{frame} 109 | 110 | \begin{frame} 111 | \frametitle{Syllabus} 112 | \begin{description} 113 | \item[Class 1] Manipulating strings, lists and other sequences. 114 | \item[Class 2] Conditionals, dictionaries, functions and regular 115 | expressions. 116 | \item[Class 3] Preview of NLTK chapters on Words and Tagging 117 | \end{description} 118 | \end{frame} 119 | 120 | \begin{frame} 121 | \frametitle{Almost there \ldots} 122 | \begin{itemize} 123 | \item Installation CDs 124 | \item Today: at least Python 125 | \item Tomorrow: full NLTK installation 126 | \item Homework: catch up on exercises and reading 127 | \end{itemize} 128 | \end{frame} 129 | 130 | 131 | 132 | 133 | \end{document} 134 | -------------------------------------------------------------------------------- /slides/lsa110_1.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} % for slides 2 | % \documentclass[handout]{beamer} % for handout 3 | \input{beamer} 4 | 5 | \title{Python Programming for Linguists\\LSA 100 Presession} 6 | 7 | % \author{Steven Bird \and Ewan Klein \and Edward Loper} 8 | % \institute{ 9 | % University of Melbourne, AUSTRALIA 10 | % \and 11 | % University of Edinburgh, UK 12 | % \and 13 | % University of Pennsylvania, USA 14 | % } 15 | 16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 17 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 18 | 19 | \begin{document} 20 | 21 | 22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | 24 | \begin{frame} 25 | \titlepage 26 | \end{frame} 27 | 28 | 29 | 30 | \begin{frame} 31 | \frametitle{Introduction} 32 | \begin{itemize} 33 | \item Who we are 34 | \item Python and NLTK 35 | \item Materials and Resources 36 | \item Goals 37 | \item Syllabus 38 | \end{itemize} 39 | \end{frame} 40 | 41 | \begin{frame} 42 | \frametitle{Who we are} 43 | 44 | Instructors: 45 | \begin{itemize} 46 | \item Steven Bird 47 | \item Ewan Klein 48 | \item Edward Loper (here tomorrow) 49 | \end{itemize} 50 | 51 | TAs: 52 | \begin{itemize} 53 | \item David Hall 54 | \item Yaron Greif 55 | \item Yun-Hsuan Sung 56 | % \item Jette Viethen 57 | \end{itemize} 58 | 59 | \end{frame} 60 | 61 | 62 | \begin{frame} 63 | \frametitle{Python and NLTK} 64 | 
\begin{itemize} 65 | \item Pre-session for \textit{Introduction to 66 | Computational Linguistics} (LSA 325) 67 | \item First steps in using Python and Natural Language Toolkit (NLTK) 68 | \item Why Python? 69 | \begin{itemize} 70 | \item designed to be easy to learn; 71 | \item good for processing linguistic data; 72 | \item good for interactive experiments. 73 | \end{itemize} 74 | \item Many online tutorials (see \url{www.python.org}) 75 | \end{itemize} 76 | \end{frame} 77 | 78 | 79 | \begin{frame} 80 | \frametitle{Materials and Resources} 81 | \begin{itemize} 82 | \item Chapter 2, \textit{Programming Fundamentals and Python} in the NLTK Book 83 | (\url{http://nltk.org/index.php/Book}) 84 | \item \textbf{Su07-LSA-110} page on \url{http://coursework.stanford.edu} 85 | \item Main NLTK page: \url{http://nltk.org} 86 | \begin{itemize} 87 | \item Chatroom 88 | \item \texttt{nltk-users} Mailing List 89 | \end{itemize} 90 | 91 | \end{itemize} 92 | \end{frame} 93 | 94 | \begin{frame} 95 | \frametitle{Audience and Goals} 96 | \begin{itemize} 97 | \item We are assuming you have not done programming before. 98 | \item So, getting you to a point where: 99 | \begin{itemize} 100 | \item you have got some confidence in using basic Python commands; 101 | \item you can use Python for carrying out simple operations on text; 102 | \item you can do all the easy and intermediate exercises in 103 | Chapter 2; 104 | \item you have found out where to get more information (fellow 105 | students, the web, textbooks) 106 | \end{itemize} 107 | \end{itemize} 108 | \end{frame} 109 | 110 | \begin{frame} 111 | \frametitle{Syllabus} 112 | \begin{description} 113 | \item[Class 1] Manipulating strings, lists and other sequences. 114 | \item[Class 2] Conditionals, dictionaries, functions and regular 115 | expressions. 
116 | \item[Class 3] Preview of NLTK chapters on Words and Tagging 117 | \end{description} 118 | \end{frame} 119 | 120 | \begin{frame} 121 | \frametitle{Almost there \ldots} 122 | \begin{itemize} 123 | \item Installation CDs 124 | \item Today: at least Python 125 | \item Tomorrow: full NLTK installation 126 | \item Homework: catch up on exercises and reading 127 | \end{itemize} 128 | \end{frame} 129 | 130 | 131 | 132 | 133 | \end{document} 134 | -------------------------------------------------------------------------------- /images/chunk-treerep.fig: -------------------------------------------------------------------------------- 1 | #FIG 3.2 2 | Landscape 3 | Center 4 | Metric 5 | A4 6 | 100.00 7 | Single 8 | -2 9 | 1200 2 10 | 0 32 #aeaeae 11 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 12 | 540 1170 720 1170 720 1350 540 1350 540 1170 13 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 14 | 720 1170 900 1170 900 1350 720 1350 720 1170 15 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 16 | 900 1170 1080 1170 1080 1350 900 1350 900 1170 17 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 18 | 1080 1170 1260 1170 1260 1350 1080 1350 1080 1170 19 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 20 | 1260 1170 1440 1170 1440 1350 1260 1350 1260 1170 21 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 22 | 1440 1170 1620 1170 1620 1350 1440 1350 1440 1170 23 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 24 | 1620 1170 1800 1170 1800 1350 1620 1350 1620 1170 25 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 26 | 1800 1170 1980 1170 1980 1350 1800 1350 1800 1170 27 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 28 | 1980 1170 2160 1170 2160 1350 1980 1350 1980 1170 29 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 30 | 2160 1170 2340 1170 2340 1350 2160 1350 2160 1170 31 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 32 | 2340 1170 2520 1170 2520 1350 2340 1350 2340 1170 33 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 34 | 2700 1170 2880 1170 2880 1350 2700 1350 2700 1170 35 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 36 | 2520 1170 2700 1170 2700 1350 2520 1350 2520 1170 37 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 38 | 3060 1170 3240 1170 3240 1350 3060 1350 3060 1170 39 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 40 | 2880 1170 3060 1170 3060 1350 2880 1350 2880 1170 41 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 42 | 3240 1170 3420 1170 3420 1350 3240 1350 3240 1170 43 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 44 | 3420 1170 3600 1170 3600 1350 3420 1350 3420 1170 45 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 46 | 3600 1170 3780 1170 3780 1350 3600 1350 3600 1170 47 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5 48 | 3780 1170 3960 1170 3960 1350 3780 1350 3780 1170 49 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 50 | 495 1125 945 1125 945 1620 495 1620 495 1125 51 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 52 | 1035 1125 1665 1125 1665 1620 1035 1620 1035 1125 53 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 54 | 2475 1125 3105 1125 3105 1620 2475 1620 2475 1125 55 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 56 | 3195 1125 3825 1125 3825 1620 3195 1620 3195 1125 57 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 58 | 1755 1125 2385 1125 2385 1620 1755 1620 1755 1125 59 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 60 | 720 1620 720 1800 61 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 62 | 2790 1620 2790 1800 63 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 64 | 2250 1620 2700 1800 65 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 66 | 3330 1620 2880 1800 67 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 68 | 2790 1980 1845 2205 69 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 70 | 720 1980 1665 2205 71 | 2 1 0 1 
0 7 50 0 -1 0.000 0 0 -1 0 0 2 72 | 1350 1620 1755 2205 73 | 4 1 0 50 0 14 12 0.0000 4 75 105 1170 1305 s\001 74 | 4 1 0 50 0 14 12 0.0000 4 75 105 1350 1305 a\001 75 | 4 1 0 50 0 14 12 0.0000 4 75 105 1530 1305 w\001 76 | 4 1 0 50 0 14 12 0.0000 4 105 105 1890 1305 t\001 77 | 4 1 0 50 0 14 12 0.0000 4 105 105 2070 1305 h\001 78 | 4 1 0 50 0 14 12 0.0000 4 75 105 2250 1305 e\001 79 | 4 1 0 50 0 14 12 0.0000 4 105 105 2610 1305 b\001 80 | 4 1 0 50 0 14 12 0.0000 4 105 105 2790 1305 i\001 81 | 4 1 0 50 0 14 12 0.0000 4 120 105 2970 1305 g\001 82 | 4 1 0 50 0 14 12 0.0000 4 105 105 3330 1305 d\001 83 | 4 1 0 50 0 14 12 0.0000 4 75 105 3510 1305 o\001 84 | 4 1 0 50 0 14 12 0.0000 4 120 105 3690 1305 g\001 85 | 4 1 0 50 0 14 12 0.0000 4 30 105 3870 1305 .\001 86 | 4 1 0 50 0 14 12 0.0000 4 105 105 630 1305 H\001 87 | 4 1 0 50 0 14 12 0.0000 4 75 105 810 1305 e\001 88 | 4 1 0 50 0 14 12 0.0000 4 105 315 720 1575 PRP\001 89 | 4 1 0 50 0 14 12 0.0000 4 105 315 1350 1575 VBD\001 90 | 4 1 0 50 0 14 12 0.0000 4 105 210 2070 1575 DT\001 91 | 4 1 0 50 0 14 12 0.0000 4 105 210 2790 1575 JJ\001 92 | 4 1 0 50 0 14 12 0.0000 4 105 210 3510 1575 NN\001 93 | 4 1 0 50 0 14 12 0.0000 4 105 210 720 1935 NP\001 94 | 4 1 0 50 0 14 12 0.0000 4 105 210 2790 1935 NP\001 95 | 4 1 0 50 0 14 12 0.0000 4 105 105 1755 2340 S\001 96 | -------------------------------------------------------------------------------- /definitions.sty: -------------------------------------------------------------------------------- 1 | 2 | \usepackage{times} 3 | \usepackage{boxedminipage} 4 | \setlength{\parindent}{0pt} 5 | \setlength{\parskip}{1ex} 6 | 7 | %%%%%%%% UNICODE SUPPORT %%%%%%%% 8 | 9 | \usepackage{ucs} 10 | \usepackage{pdffonts} 11 | \usepackage{color} 12 | \providecommand{\textalpha}{{\usefont{OML}{hlcm}{m}{n} \ensuremath{\alpha}}} 13 | \providecommand{\textbeta}{{\usefont{OML}{hlcm}{m}{n} \ensuremath{\beta}}} 14 | \providecommand{\textgamma}{{\usefont{OML}{hlcm}{m}{n} \ensuremath{\gamma}}} 15 | \providecommand{\textmu}{{\usefont{OML}{hlcm}{m}{n} \ensuremath{\mu}}} 16 | 17 | \renewcommand{\labelitemi}{$\blacksquare$} 18 | 19 | %%%%%%%% ATTRIBUTE VALUE MATRICES %%%%%%%% 20 | 21 | \usepackage{avm} 22 | \avmfont{\sc} 23 | \avmvalfont{\it} 24 | 25 | %%%%%%%% HEADERS AND FOOTERS %%%%%%%% 26 | 27 | \usepackage{fancyheadings} 28 | \pagestyle{fancy} 29 | \setlength{\headrulewidth}{0.5pt} 30 | \setlength{\footrulewidth}{0.5pt} 31 | 32 | \newcommand{\authors}{\small \emph{Bird, Klein \& Loper}} 33 | \newcommand{\booktitle}{\small \emph{Natural Language Processing (DRAFT)}} 34 | \newcommand{\thedate}{\small \emph{\today}} 35 | \renewcommand{\chaptermark}[1]{\markboth{\emph{\thechapter.\ #1}}{}} 36 | \renewcommand{\sectionmark}[1]{\markright{\emph{\thesection.\ {#1}}}} 37 | 38 | \lhead [] {\leftmark} 39 | \chead [] {} 40 | \rhead [\rightmark] {\booktitle} 41 | \rfoot [\authors] {\thedate} 42 | \lfoot [\thedate] {\authors} 43 | \cfoot [\thepage] {\thepage} 44 | 45 | 46 | %%%%%%%% CUSTOM INLINE ROLES %%%%%%%% 47 | 48 | % Placeholder -- to be replaced by some actual value in a program 49 | \newcommand{\docutilsroleplaceholder}[1]{\texttt{\textit{#1}}} 50 | % Linguistic example - cited form in running text 51 | \newcommand{\docutilsroleexample}[1]{\textit{#1}} 52 | % Emphasized text 53 | \newcommand{\docutilsroleemphasis}[1]{\emph{#1}} 54 | % Defined term - first or canonical use of technical term 55 | \newcommand{\docutilsroleterm}[1]{\textbf{#1}} 56 | % Grammatical category - e.g.
NP and verb as technical terms 57 | \newcommand{\docutilsrolecategory}[1]{\textsc{#1}} 58 | % Math symbols 59 | \newcommand{\docutilsrolemath}[1]{${#1}$} 60 | % Text in math env 61 | % Currently implemented as \textit since we can't do embedded text 62 | % roles in RST 63 | \newcommand{\docutilsrolemathit}[1]{\textsf{#1}} 64 | % Features and values 65 | \newcommand{\docutilsrolefeature}[1]{\textsc{#1}} 66 | \newcommand{\docutilsrolefval}[1]{\textit{#1}} 67 | % Lexemes 68 | \newcommand{\docutilsrolelex}[1]{\textsc{#1}} 69 | 70 | \newcommand{\docutilsroleNone}[1]{#1} 71 | 72 | %%%%%%%% PYTHON SOURCE CODE MARKUP %%%%%%%% 73 | 74 | % Note -- there is no bold tt font, so currently most of these commands 75 | % don't really do anything. :-/ 76 | 77 | \definecolor{py@keywordcolour}{rgb}{1,0.45882,0} 78 | \definecolor{py@stringcolour}{rgb}{0,0.666666,0} 79 | \definecolor{py@commentcolour}{rgb}{1,0,0} 80 | \definecolor{py@ps1colour}{rgb}{0.60784,0,0} 81 | \definecolor{py@ps2colour}{rgb}{0.60784,0,1} 82 | \definecolor{py@inputcolour}{rgb}{0,0,0} 83 | \definecolor{py@outputcolour}{rgb}{0,0,1} 84 | \definecolor{py@exceptcolour}{rgb}{1,0,0} 85 | \definecolor{py@builtincolour}{rgb}{0.58039,0,0.58039} 86 | \definecolor{py@identifiercolour}{rgb}{0,0,0} 87 | \definecolor{py@linenumcolour}{rgb}{0.4,0.4,0.4} 88 | \definecolor{py@inputcolour}{rgb}{0,0,0} 89 | \definecolor{py@defnamecolour}{rgb}{0,0.5,0.5} 90 | 91 | % Prompt 92 | \renewcommand{\pysrcprompt}[1]{\textcolor{py@ps1colour}{#1}} 93 | \renewcommand{\pysrcmore}[1]{\textcolor{py@ps2colour}{#1}} 94 | % Source code 95 | \renewcommand{\pysrckeyword}[1]{\textcolor{py@keywordcolour}{#1}} 96 | \renewcommand{\pysrcbuiltin}[1]{\textcolor{py@builtincolour}{#1}} 97 | \renewcommand{\pysrcstring}[1]{\textcolor{py@stringcolour}{#1}} 98 | \renewcommand{\pysrcdefname}[1]{\textcolor{py@defnamecolour}{#1}} 99 | \renewcommand{\pysrcother}[1]{#1} 100 | % Comments 101 | \renewcommand{\pysrccomment}[1]{\textcolor{py@commentcolour}{#1}} 102 | % Output 103 | \renewcommand{\pysrcoutput}[1]{\textcolor{py@outputcolour}{#1}} 104 | % Exceptions 105 | \renewcommand{\pysrcexcept}[1]{\textcolor{py@exceptcolour}{#1}} 106 | 107 | %%%%%%%% HYPHENATION CONTROL %%%%%%%% 108 | 109 | \pretolerance 250 110 | \tolerance 500 111 | % \hyphenpenalty 250 112 | \hyphenpenalty 200 113 | \exhyphenpenalty 100 114 | \doublehyphendemerits 7500 115 | \finalhyphendemerits 7500 116 | \brokenpenalty 10000 117 | \lefthyphenmin 3 118 | \righthyphenmin 3 119 | \widowpenalty 10000 120 | \clubpenalty 10000 121 | \displaywidowpenalty 10000 122 | \looseness 1 123 | -------------------------------------------------------------------------------- /LSA325/lsa110_2.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} % for slides 2 | %\documentclass[handout]{beamer} % for handout 3 | \input{beamer} 4 | 5 | \title{Python Programming for Linguists\\LSA 100 Presession} 6 | 7 | % \author{Steven Bird \and Ewan Klein \and Edward Loper} 8 | % \institute{ 9 | % University of Melbourne, AUSTRALIA 10 | % \and 11 | % University of Edinburgh, UK 12 | % \and 13 | % University of Pennsylvania, USA 14 | % } 15 | 16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 17 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 18 | 19 | \begin{document} 20 | 21 | 22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | 24 | \begin{frame} 25 | \titlepage 26 | \end{frame} 27 | 28 
| 29 | 30 | \begin{frame} 31 | \frametitle{Exercise for the Break} 32 | \begin{enumerate} 33 | \item Get together in groups of around 5 or 6 34 | \item Try to agree on: 35 | \begin{itemize} 36 | \item a question; or 37 | \item something that puzzles you; or 38 | \item a gripe; or 39 | \item a suggestion for improving this course. 40 | \end{itemize} 41 | 42 | \item Write it down on a piece of paper, and give it to us. 43 | \item We will respond, either in today's session or on the web. 44 | \end{enumerate} 45 | \end{frame} 46 | 47 | \begin{frame} 48 | \frametitle{Keeping in Touch} 49 | 50 | \begin{itemize} 51 | \item We circulated the class list earlier in today's session. 52 | \item If you haven't already done so, please add your email address. 53 | \item We will make sure you are subscribed to \texttt{nltk-announce} 54 | \end{itemize} 55 | 56 | \end{frame} 57 | 58 | \begin{frame} 59 | \frametitle{LSA 325: Introduction to Computational Linguistics} 60 | 61 | \begin{itemize} 62 | \item We can accept new participants: 63 | \item Cordura 100 64 | \item Mon/Thu 1:30-3:15 65 | \end{itemize} 66 | 67 | \end{frame} 68 | 69 | 70 | % \begin{frame} 71 | % \frametitle{Python and NLTK} 72 | % \begin{itemize} 73 | % \item Pre-session for \textit{Introduction to 74 | % Computational Linguistics} (LSA 325) 75 | % \item First steps in using Python and Natural Language Toolkit (NLTK) 76 | % \item Why Python? 77 | % \begin{itemize} 78 | % \item designed to be easy to learn; 79 | % \item good for processing linguistic data; 80 | % \item good for interactive experiments. 81 | % \end{itemize} 82 | % \item Many online tutorials (see \url{www.python.org}) 83 | % \end{itemize} 84 | % \end{frame} 85 | 86 | 87 | % \begin{frame} 88 | % \frametitle{Materials and Resources} 89 | % \begin{itemize} 90 | % \item Chapter 2, \textit{Programming Fundamentals and Python} in the NLTK Book 91 | % (\url{http://nltk.org/index.php/Book}) 92 | % \item \textbf{Su07-LSA-110} page on \url{http://coursework.stanford.edu} 93 | % \item Main NLTK page: \url{http://nltk.org} 94 | % \begin{itemize} 95 | % \item Chatroom 96 | % \item \texttt{nltk-users} Mailing List 97 | % \end{itemize} 98 | 99 | % \end{itemize} 100 | % \end{frame} 101 | 102 | % \begin{frame} 103 | % \frametitle{Audience and Goals} 104 | % \begin{itemize} 105 | % \item We are assuming you have not done programming before. 106 | % \item So, getting you to a point where: 107 | % \begin{itemize} 108 | % \item you have got some confidence in using basic Python commands; 109 | % \item you can use Python for carrying out simple operations on text; 110 | % \item you can do all the easy and intermediate exercises in 111 | % Chapter 2; 112 | % \item you have found out where to get more information (fellow 113 | % students, the web, textbooks) 114 | % \end{itemize} 115 | % \end{itemize} 116 | % \end{frame} 117 | 118 | % \begin{frame} 119 | % \frametitle{Syllabus} 120 | % \begin{description} 121 | % \item[Class 1] Manipulating strings, lists and other sequences. 122 | % \item[Class 2] Conditionals, dictionaries, functions and regular 123 | % expressions. 
% \item[Class 3] Preview of NLTK chapters on Words and Tagging
% \end{description}
% \end{frame}

% \begin{frame}
% \frametitle{Almost there \ldots}
% \begin{itemize}
% \item Installation CDs
% \item Today: at least Python
% \item Tomorrow: full NLTK installation
% \item Homework: catch up on exercises and reading
% \end{itemize}
% \end{frame}




\end{document}
--------------------------------------------------------------------------------
/howto/update_list.py:
--------------------------------------------------------------------------------
#
# Script that updates test-list.txt
#

import os, os.path, re

DOCTEST_SRC = '../../nltk/test'

HEAD = (".. ==========================================================\n"
        ".. AUTO-GENERATED LISTING -- DO NOT EDIT!:\n\n"
        ".. role:: passed\n"
        "   :class: doctest-passed\n\n"
        ".. role:: failed\n"
        "   :class: doctest-failed\n\n"
        ".. role:: guide-linecount\n"
        "   :class: guide-linecount\n\n"
        ".. container:: doctest-list\n\n"
        "   .. list-table::\n"
        "      :class: doctest-list\n"
        "      :widths: 60 10 10 20\n"
        "      :header-rows: 1\n\n"
        "      * - `Topic <test-list-sort-title.html>`__\n"
        "        - `Lines <test-list-sort-lines.html>`__\n"
        "        - `Tests <test-list-sort-tests.html>`__\n"
        "        - `Test Outcome <test-list-sort-outcome.html>`__\n")
FOOT = (".. END AUTO-GENERATED LISTING\n"
        ".. ==========================================================\n")

# Title styles recognized at the top of a doctest file: overlined or
# underlined, with '-' or '=' rules.
TITLE_REGEXPS = (
    r'\s*----+[ ]*\n(.*)\n----+[ ]*\n',
    r'\s*====+[ ]*\n(.*)\n====+[ ]*\n',
    r'\s*(.*)\n====+[ ]*\n',
    r'\s*(.*)\n----+[ ]*\n')

def find_title(basename):
    """Return the reST title from the head of the given doctest file,
    or the basename itself if no title is found."""
    filename = os.path.join(DOCTEST_SRC, basename + '.doctest')
    head = open(filename).read(800)
    for regexp in TITLE_REGEXPS:
        # Skip any comment lines that precede the title.
        regexp = r'\A\s*(?:\.\..*\n)*' + regexp
        m = re.match(regexp, head)
        if m: return m.group(1).strip().replace('`', "'")
    print 'Warning: no title found for %s' % basename
    return basename

def linecount(basename):
    """Return (number of doctest examples, number of lines) for the file."""
    filename = os.path.join(DOCTEST_SRC, basename + '.doctest')
    s = open(filename).read()
    return len(re.findall(r'(?m)^\s*>>>', s)), s.count('\n')

def fmt_num(n):
    """Round large counts down for display, and insert a thousands comma."""
    if n > 50:
        n = n - n%10
    if n > 500:
        n = n - n%100
    if n >= 1000:
        n = str(n)[:-3]+','+str(n)[-3:]
    return n

def doctest_listing(sortkey=None):
    listing = ''

    files = [f for f in os.listdir(DOCTEST_SRC) if f.endswith('.doctest')]
    err_refs = []
    lines = []
    for filename in files:
        basename = filename.replace('.doctest', '')
        if basename == 'temp': continue

        result = '`Passed!`:passed:'
        num_failed = 0
        if os.path.exists(basename+'.errs'):
            s = open(basename+'.errs').read()
            if not re.search(r'OK\s*\Z', s):
                num_failed = len(re.findall(r'(?m)^Failed [Ee]xample:', s))
                result = '|%s|_' % basename
                err_refs.append( (basename, num_failed) )
                if sortkey is None:
                    print ('test %s failed (%d examples)' %
                           (basename, num_failed))

        title = find_title(basename)
        numtests, numlines = linecount(basename)
        lines.append([title, basename, numtests, numlines, result, num_failed])

    # Each entry is [title, basename, numtests, numlines, result, num_failed].
    if sortkey in ('title', None):
        lines.sort(key=lambda v: v[0])
    if sortkey == 'basename':
        lines.sort(key=lambda v: v[1])
    if sortkey == 'lines':
        lines.sort(key=lambda v: (-v[3], v[0]))
    if sortkey == 'tests':
        lines.sort(key=lambda v: (-v[2], v[0]))
    if sortkey == 'outcome':
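        # Most failures first, ties broken by title (v[5] is num_failed).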
        lines.sort(key=lambda v: (-v[5], v[0]))

    for (title, basename, numtests, numlines, result, num_failed) in lines:
        numlines = fmt_num(numlines)
        numtests = fmt_num(numtests)
        listing += ('      * - `%s <%s.html>`__\n' % (title, basename) +
                    '        - :guide-linecount:`%s`\n' % numlines +
                    '        - :guide-linecount:`%s`\n' % numtests +
                    '        - %s\n' % result)

    # Substitution definitions for the "N tests failed!" links used in
    # the Test Outcome column.
    for (basename, num_failed) in err_refs:
        plural = (num_failed != 1 and 's' or '')
        listing += ('\n.. |%s| replace:: `%d test%s failed!`:failed:'
                    '\n.. _%s: %s.errs\n' %
                    (basename, num_failed, plural, basename, basename))

    return listing

def main():
    out = open('test-list.txt', 'w')
    out.write('%s\n%s\n%s' % (HEAD, doctest_listing(), FOOT))
    out.close()

    for sortkey in ('title', 'basename', 'lines', 'tests', 'outcome'):
        out = open('test-list-sort-%s.txt' % sortkey, 'w')
        out.write('%s\n%s\n%s' % (HEAD, doctest_listing(sortkey), FOOT))
        out.close()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
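A minimal smoke test for update_list.py, as a sketch only: all file names
below are hypothetical, and it assumes the script is run from the howto/
directory under Python 2 (matching the script's syntax), with the
DOCTEST_SRC directory creatable and no *.errs files lying around.

    import os
    # Create the doctest source tree and one tiny guide (hypothetical file).
    if not os.path.isdir('../../nltk/test'):
        os.makedirs('../../nltk/test')
    f = open('../../nltk/test/sample.doctest', 'w')
    f.write('Sample Guide\n'
            '============\n'
            '\n'
            '    >>> 2 + 2\n'
            '    4\n')
    f.close()

    import update_list
    update_list.main()                  # writes test-list.txt + sorted variants
    print open('test-list.txt').read()  # the generated reST list-table

With no sample.errs file present, the guide is listed with the :passed:
role; dropping a failure report named sample.errs beside the script would
exercise the |sample| substitution and the :failed: role instead.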