├── slides
│ ├── book.bib
│ ├── demos
│ │ ├── parse.py
│ │ ├── names.py
│ │ ├── similar_words_2.py
│ │ └── similar_words.py
│ ├── beamer.tex
│ ├── Makefile
│ ├── index.rst
│ └── lsa110_1.tex
├── book
│ ├── .gitignore
│ ├── corpus.txt
│ ├── dict.htm
│ ├── ch00-pt.rst
│ ├── revision.rst
│ ├── feedback.txt
│ ├── callouts
│ │ ├── callout1.gif
│ │ ├── callout2.gif
│ │ ├── callout3.gif
│ │ ├── callout4.gif
│ │ ├── callout5.gif
│ │ ├── callout6.gif
│ │ ├── callout7.gif
│ │ ├── callout8.gif
│ │ ├── callout9.gif
│ │ ├── callout10.gif
│ │ ├── callout11.gif
│ │ ├── callout12.gif
│ │ ├── callout13.gif
│ │ ├── callout14.gif
│ │ └── callout15.gif
│ ├── term_index.rst
│ ├── footer-pt.rst
│ ├── definitions-pt.rst
│ ├── dict.csv
│ ├── docbook-issues.txt
│ ├── ch6-fmla.tex
│ ├── ch12-extras.rst
│ ├── footer.rst
│ ├── bib_template.html
│ ├── book.rst
│ ├── ch02-extras.rst
│ ├── print.rst
│ ├── reprint1-4.txt
│ ├── ch05-extras.rst
│ ├── reprint1-1.txt
│ ├── errata2.txt
│ ├── copy-edits.txt
│ ├── reprint1-2.txt
│ ├── book.xml
│ ├── ch03-extras.rst
│ ├── guidelines.txt
│ ├── DOCUTILS
│ ├── regexp-defns.rst
│ ├── reprint1-3.txt
│ ├── intro-outline.txt
│ ├── introduction-code.txt
│ ├── SCHEDULE
│ ├── CheckList.txt
│ ├── image_scaling.rst
│ ├── ch01-extras.rst
│ ├── second-edition.txt
│ └── ch01-notes.rst
├── pt-br
│ ├── Makefile
│ └── index.txt
├── nltk.ppt
├── images
│ ├── T9.png
│ ├── are.png
│ ├── xp.png
│ ├── avm1.pdf
│ ├── avm1.png
│ ├── blank.png
│ ├── brent.png
│ ├── dag01.png
│ ├── dag02.png
│ ├── dag03.png
│ ├── dag04.png
│ ├── drs1.png
│ ├── drs2.png
│ ├── empty.png
│ ├── maps.png
│ ├── tally.png
│ ├── timit.png
│ ├── utf8.png
│ ├── ambig02.png
│ ├── ambig03.png
│ ├── authors.png
│ ├── cup-test.png
│ ├── dag04-1.png
│ ├── dag04-2.png
│ ├── dag04-3.png
│ ├── dialogue.png
│ ├── drs1.graffle
│ ├── drs2.graffle
│ ├── iu-mien.png
│ ├── jigsaw.png
│ ├── lee-dog.png
│ ├── lexicon.png
│ ├── maps01.png
│ ├── maps02.png
│ ├── semint.png
│ ├── tally2.png
│ ├── target.png
│ ├── unicode.png
│ ├── xp-mod.png
│ ├── ambig01-a.png
│ ├── ambig01-b.png
│ ├── cfd-gender.png
│ ├── chart_fr1.png
│ ├── chart_fr2.png
│ ├── chart_np0.png
│ ├── chart_np1.png
│ ├── chart_np2.png
│ ├── chunk-muc.png
│ ├── corpus-org.pdf
│ ├── corpus-org.png
│ ├── datatypes.png
│ ├── depgraph0.png
│ ├── dog-graph.png
│ ├── evolution.png
│ ├── fdist-moby.png
│ ├── ic_diagram.pdf
│ ├── ic_diagram.png
│ ├── inaugural.png
│ ├── inaugural2.png
│ ├── indexing01.png
│ ├── indexing02.png
│ ├── locations.png
│ ├── mergesort.png
│ ├── model_kids.png
│ ├── nl_to_fol.png
│ ├── parse_draw.png
│ ├── parse_tree.png
│ ├── pipeline1.png
│ ├── rdparser1.png
│ ├── rdparser2.png
│ ├── rdparser3.png
│ ├── rdparser4.png
│ ├── rdparser5.png
│ ├── rdparser6.png
│ ├── srparser1.png
│ ├── srparser2.png
│ ├── srparser3.png
│ ├── srparser4.png
│ ├── srparser5.png
│ ├── srparser6.png
│ ├── srparser7.png
│ ├── srparser8.png
│ ├── srparser9.png
│ ├── tag-indian.png
│ ├── tag-lookup.png
│ ├── trees_tree.png
│ ├── windowdiff.png
│ ├── 2nd_ed_plan.png
│ ├── array-memory.png
│ ├── chart_bu_ex1.png
│ ├── chart_bu_ex2.png
│ ├── chart_bu_ex3.png
│ ├── chart_bu_fr.png
│ ├── chart_bu_init.png
│ ├── chart_demo1.png
│ ├── chart_demo1.tiff
│ ├── chart_demo2.png
│ ├── chart_demo2.tiff
│ ├── chart_init0.png
│ ├── chart_init1.png
│ ├── chart_prods.png
│ ├── chart_td_ex1.png
│ ├── chart_td_ex2.png
│ ├── chart_td_ex3.png
│ ├── chart_td_ex4.png
│ ├── chart_td_init.png
│ ├── chart_wfst1.png
│ ├── chart_wfst2.png
│ ├── chunk-coref.png
│ ├── chunk-tagrep.png
│ ├── chunk-treerep.png
│ ├── decision-tree.png
│ ├── exploration.png
│ ├── findtheblock1.png
│ ├── findtheblock2.png
│ ├── findtheblock3.png
│ ├── findtheblock4.png
│ ├── mod_relation.png
│ ├── modal_genre.png
│ ├── models_admire.png
│ ├── multi-module.png
│ ├── partialtree.png
│ ├── polish-utf8.png
│ ├── quant-ambig.png
│ ├── rdparser1-6.png
│ ├── sensibility.png
│ ├── sinica-tree.png
│ ├── srparser1-6.png
│ ├── stack-queue.png
│ ├── string-memory.png
│ ├── syntax-tree.png
│ ├── tag-context.png
│ ├── word-len-dist.png
│ ├── 2nd_ed_plan.graffle
│ ├── chart_bottom_up.png
│ ├── chart_td_match1.png
│ ├── chart_td_match2.png
│ ├── chart_top_down.png
│ ├── classification.png
│ ├── corpus-org.graffle
│ ├── dialogue-90dpi.png
│ ├── drs_screenshot0.png
│ ├── drs_screenshot1.png
│ ├── findtheblock1.tiff
│ ├── findtheblock2.tiff
│ ├── findtheblock3.tiff
│ ├── findtheblock4.tiff
│ ├── ie-architecture.png
│ ├── mimo-and-bruno.jpg
│ ├── mimo-and-bruno.png
│ ├── models_walk_cf.png
│ ├── nltk-downloader.png
│ ├── string-slicing.png
│ ├── timit-structure.png
│ ├── chart_bu_predict1.png
│ ├── chart_bu_predict2.png
│ ├── chart_fundamental.png
│ ├── chart_intro_2edges.png
│ ├── chart_intro_3edges.png
│ ├── chart_intro_empty.png
│ ├── chart_positions1.png
│ ├── chart_positions2.png
│ ├── chart_td_expand1.png
│ ├── chart_td_expand2.png
│ ├── chart_useless_edge.png
│ ├── chunk-segmentation.png
│ ├── decision-tree.graffle
│ ├── feature-extraction.png
│ ├── ic_diagram_labeled.pdf
│ ├── ic_diagram_labeled.png
│ ├── models_love_cf01.png
│ ├── models_love_cf02.png
│ ├── multi-module.graffle
│ ├── naive_bayes_graph.png
│ ├── old-string-memory.png
│ ├── precision-recall.png
│ ├── recursive_parse1.png
│ ├── three-layer-arch.png
│ ├── vocabulary-growth.png
│ ├── wordnet-hierarchy.png
│ ├── words-dispersion.png
│ ├── Binary_entropy_plot.pdf
│ ├── Binary_entropy_plot.png
│ ├── chart_intro_prodedge.png
│ ├── chart_intro_selfloop.png
│ ├── chart_td_match1_alt.png
│ ├── naive-bayes-triangle.png
│ ├── naive_bayes_bargraph.png
│ ├── parse_rdparsewindow.png
│ ├── chart_intro_dottededge.png
│ ├── chart_intro_incomplete.png
│ ├── chart_intro_parseedge.png
│ ├── naive_bayes_graph.graffle
│ ├── text-corpus-structure.png
│ ├── naive-bayes-triangle.graffle
│ ├── naive_bayes_bargraph.graffle
│ ├── supervised-classification.png
│ ├── text-corpus-structure.graffle
│ ├── supervised-classification.graffle
│ ├── ambig02.py
│ ├── ambig03.py
│ ├── ambig01-a.py
│ ├── ambig01-b.py
│ ├── lee-dog.py
│ ├── polish-utf8.py
│ ├── chart_td_init.dot
│ ├── avm1.tex
│ ├── chart_intro_empty.dot
│ ├── Makefile
│ ├── chart_useless_edge.dot
│ ├── chart_intro_dottededge.dot
│ ├── chart_intro_parseedge.dot
│ ├── chart_td_match2.dot
│ ├── chart_bu_init.dot
│ ├── chart_intro_selfloop.dot
│ ├── chart_intro_2edges.dot
│ ├── chart_td_expand2.dot
│ ├── chart_bu_predict1.dot
│ ├── chart_bu_predict2.dot
│ ├── chart_td_expand1.dot
│ ├── chart_td_match1.dot
│ ├── chart_intro_incomplete.dot
│ ├── chart_intro_prodedge.dot
│ ├── chart_intro_3edges.dot
│ ├── chart_fr2.dot
│ ├── chart_fr1.dot
│ ├── Binary_entropy_plot.tex
│ ├── precision-recall.fig
│ ├── are.fig
│ ├── chunk-tagrep.fig
│ ├── chunk-segmentation.fig
│ └── chunk-treerep.fig
├── LSA325
│ ├── engineer.pdf
│ ├── data-model1.pdf
│ ├── data-model2.pdf
│ ├── data-model3.pdf
│ ├── data-model4.pdf
│ ├── evaluations.xls
│ ├── lsa325_070907.pdf
│ ├── assignment3.txt
│ ├── assignment2.txt
│ ├── assignment5.txt
│ ├── log_lc_and_functions.txt
│ ├── assignment4.txt
│ ├── LSA325_3_handout.tex
│ ├── lsa325_5.tex
│ ├── LSA325_5_handout.tex
│ ├── log_fds.txt
│ ├── lsa110_1.tex
│ └── lsa110_2.tex
├── book-pl
│ ├── footer.rst
│ └── footer-pl.rst
├── book-jp
│ ├── fig_jpime_eq.png
│ ├── fig_jpma_lattice1.png
│ └── fig_jpma_lattice2.png
├── .gitignore
├── index.html
├── surveys
│ └── 2005-10.txt
├── latexhacks.py
├── pages.py
├── index.rst
├── howto
│ ├── coverage.txt
│ ├── Makefile
│ ├── show_coverage.py
│ └── update_list.py
├── examples.py
├── rsthacks.py
├── doctest_split.py
├── archives
│ └── sourceforge-dev.txt
├── xincluder.py
├── epydoc.diff
├── xelatexsymbols.tex
├── xmlpp.py
├── HouseStyle.txt
└── definitions.sty
/slides/book.bib:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/book/.gitignore:
--------------------------------------------------------------------------------
1 | _build/**/*
2 |
--------------------------------------------------------------------------------
/pt-br/Makefile:
--------------------------------------------------------------------------------
1 | include ../Makefile.doc
2 |
--------------------------------------------------------------------------------
/book/corpus.txt:
--------------------------------------------------------------------------------
1 | Hello world. This is a test file.
2 |
--------------------------------------------------------------------------------
/nltk.ppt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/nltk.ppt
--------------------------------------------------------------------------------
/book/dict.htm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/dict.htm
--------------------------------------------------------------------------------
/images/T9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/T9.png
--------------------------------------------------------------------------------
/images/are.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/are.png
--------------------------------------------------------------------------------
/images/xp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/xp.png
--------------------------------------------------------------------------------
/book/ch00-pt.rst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/ch00-pt.rst
--------------------------------------------------------------------------------
/book/revision.rst:
--------------------------------------------------------------------------------
1 | This document was built on
2 | Wed 4 Sep 2019 11:25:35 ACST
3 |
--------------------------------------------------------------------------------
/images/avm1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/avm1.pdf
--------------------------------------------------------------------------------
/images/avm1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/avm1.png
--------------------------------------------------------------------------------
/images/blank.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/blank.png
--------------------------------------------------------------------------------
/images/brent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/brent.png
--------------------------------------------------------------------------------
/images/dag01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/dag01.png
--------------------------------------------------------------------------------
/images/dag02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/dag02.png
--------------------------------------------------------------------------------
/images/dag03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/dag03.png
--------------------------------------------------------------------------------
/images/dag04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/dag04.png
--------------------------------------------------------------------------------
/images/drs1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/drs1.png
--------------------------------------------------------------------------------
/images/drs2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/drs2.png
--------------------------------------------------------------------------------
/images/empty.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/empty.png
--------------------------------------------------------------------------------
/images/maps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/maps.png
--------------------------------------------------------------------------------
/images/tally.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/tally.png
--------------------------------------------------------------------------------
/images/timit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/timit.png
--------------------------------------------------------------------------------
/images/utf8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/utf8.png
--------------------------------------------------------------------------------
/LSA325/engineer.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/LSA325/engineer.pdf
--------------------------------------------------------------------------------
/book-pl/footer.rst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book-pl/footer.rst
--------------------------------------------------------------------------------
/book/feedback.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/feedback.txt
--------------------------------------------------------------------------------
/images/ambig02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/ambig02.png
--------------------------------------------------------------------------------
/images/ambig03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/ambig03.png
--------------------------------------------------------------------------------
/images/authors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/authors.png
--------------------------------------------------------------------------------
/images/cup-test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/cup-test.png
--------------------------------------------------------------------------------
/images/dag04-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/dag04-1.png
--------------------------------------------------------------------------------
/images/dag04-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/dag04-2.png
--------------------------------------------------------------------------------
/images/dag04-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/dag04-3.png
--------------------------------------------------------------------------------
/images/dialogue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/dialogue.png
--------------------------------------------------------------------------------
/images/drs1.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/drs1.graffle
--------------------------------------------------------------------------------
/images/drs2.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/drs2.graffle
--------------------------------------------------------------------------------
/images/iu-mien.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/iu-mien.png
--------------------------------------------------------------------------------
/images/jigsaw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/jigsaw.png
--------------------------------------------------------------------------------
/images/lee-dog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/lee-dog.png
--------------------------------------------------------------------------------
/images/lexicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/lexicon.png
--------------------------------------------------------------------------------
/images/maps01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/maps01.png
--------------------------------------------------------------------------------
/images/maps02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/maps02.png
--------------------------------------------------------------------------------
/images/semint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/semint.png
--------------------------------------------------------------------------------
/images/tally2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/tally2.png
--------------------------------------------------------------------------------
/images/target.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/target.png
--------------------------------------------------------------------------------
/images/unicode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/unicode.png
--------------------------------------------------------------------------------
/images/xp-mod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/xp-mod.png
--------------------------------------------------------------------------------
/book-pl/footer-pl.rst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book-pl/footer-pl.rst
--------------------------------------------------------------------------------
/images/ambig01-a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/ambig01-a.png
--------------------------------------------------------------------------------
/images/ambig01-b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/ambig01-b.png
--------------------------------------------------------------------------------
/images/cfd-gender.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/cfd-gender.png
--------------------------------------------------------------------------------
/images/chart_fr1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_fr1.png
--------------------------------------------------------------------------------
/images/chart_fr2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_fr2.png
--------------------------------------------------------------------------------
/images/chart_np0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_np0.png
--------------------------------------------------------------------------------
/images/chart_np1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_np1.png
--------------------------------------------------------------------------------
/images/chart_np2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_np2.png
--------------------------------------------------------------------------------
/images/chunk-muc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chunk-muc.png
--------------------------------------------------------------------------------
/images/corpus-org.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/corpus-org.pdf
--------------------------------------------------------------------------------
/images/corpus-org.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/corpus-org.png
--------------------------------------------------------------------------------
/images/datatypes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/datatypes.png
--------------------------------------------------------------------------------
/images/depgraph0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/depgraph0.png
--------------------------------------------------------------------------------
/images/dog-graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/dog-graph.png
--------------------------------------------------------------------------------
/images/evolution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/evolution.png
--------------------------------------------------------------------------------
/images/fdist-moby.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/fdist-moby.png
--------------------------------------------------------------------------------
/images/ic_diagram.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/ic_diagram.pdf
--------------------------------------------------------------------------------
/images/ic_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/ic_diagram.png
--------------------------------------------------------------------------------
/images/inaugural.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/inaugural.png
--------------------------------------------------------------------------------
/images/inaugural2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/inaugural2.png
--------------------------------------------------------------------------------
/images/indexing01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/indexing01.png
--------------------------------------------------------------------------------
/images/indexing02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/indexing02.png
--------------------------------------------------------------------------------
/images/locations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/locations.png
--------------------------------------------------------------------------------
/images/mergesort.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/mergesort.png
--------------------------------------------------------------------------------
/images/model_kids.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/model_kids.png
--------------------------------------------------------------------------------
/images/nl_to_fol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/nl_to_fol.png
--------------------------------------------------------------------------------
/images/parse_draw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/parse_draw.png
--------------------------------------------------------------------------------
/images/parse_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/parse_tree.png
--------------------------------------------------------------------------------
/images/pipeline1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/pipeline1.png
--------------------------------------------------------------------------------
/images/rdparser1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/rdparser1.png
--------------------------------------------------------------------------------
/images/rdparser2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/rdparser2.png
--------------------------------------------------------------------------------
/images/rdparser3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/rdparser3.png
--------------------------------------------------------------------------------
/images/rdparser4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/rdparser4.png
--------------------------------------------------------------------------------
/images/rdparser5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/rdparser5.png
--------------------------------------------------------------------------------
/images/rdparser6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/rdparser6.png
--------------------------------------------------------------------------------
/images/srparser1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/srparser1.png
--------------------------------------------------------------------------------
/images/srparser2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/srparser2.png
--------------------------------------------------------------------------------
/images/srparser3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/srparser3.png
--------------------------------------------------------------------------------
/images/srparser4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/srparser4.png
--------------------------------------------------------------------------------
/images/srparser5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/srparser5.png
--------------------------------------------------------------------------------
/images/srparser6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/srparser6.png
--------------------------------------------------------------------------------
/images/srparser7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/srparser7.png
--------------------------------------------------------------------------------
/images/srparser8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/srparser8.png
--------------------------------------------------------------------------------
/images/srparser9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/srparser9.png
--------------------------------------------------------------------------------
/images/tag-indian.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/tag-indian.png
--------------------------------------------------------------------------------
/images/tag-lookup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/tag-lookup.png
--------------------------------------------------------------------------------
/images/trees_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/trees_tree.png
--------------------------------------------------------------------------------
/images/windowdiff.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/windowdiff.png
--------------------------------------------------------------------------------
/LSA325/data-model1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/LSA325/data-model1.pdf
--------------------------------------------------------------------------------
/LSA325/data-model2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/LSA325/data-model2.pdf
--------------------------------------------------------------------------------
/LSA325/data-model3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/LSA325/data-model3.pdf
--------------------------------------------------------------------------------
/LSA325/data-model4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/LSA325/data-model4.pdf
--------------------------------------------------------------------------------
/LSA325/evaluations.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/LSA325/evaluations.xls
--------------------------------------------------------------------------------
/LSA325/lsa325_070907.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/LSA325/lsa325_070907.pdf
--------------------------------------------------------------------------------
/book-jp/fig_jpime_eq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book-jp/fig_jpime_eq.png
--------------------------------------------------------------------------------
/images/2nd_ed_plan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/2nd_ed_plan.png
--------------------------------------------------------------------------------
/images/array-memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/array-memory.png
--------------------------------------------------------------------------------
/images/chart_bu_ex1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_bu_ex1.png
--------------------------------------------------------------------------------
/images/chart_bu_ex2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_bu_ex2.png
--------------------------------------------------------------------------------
/images/chart_bu_ex3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_bu_ex3.png
--------------------------------------------------------------------------------
/images/chart_bu_fr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_bu_fr.png
--------------------------------------------------------------------------------
/images/chart_bu_init.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_bu_init.png
--------------------------------------------------------------------------------
/images/chart_demo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_demo1.png
--------------------------------------------------------------------------------
/images/chart_demo1.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_demo1.tiff
--------------------------------------------------------------------------------
/images/chart_demo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_demo2.png
--------------------------------------------------------------------------------
/images/chart_demo2.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_demo2.tiff
--------------------------------------------------------------------------------
/images/chart_init0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_init0.png
--------------------------------------------------------------------------------
/images/chart_init1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_init1.png
--------------------------------------------------------------------------------
/images/chart_prods.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_prods.png
--------------------------------------------------------------------------------
/images/chart_td_ex1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_td_ex1.png
--------------------------------------------------------------------------------
/images/chart_td_ex2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_td_ex2.png
--------------------------------------------------------------------------------
/images/chart_td_ex3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_td_ex3.png
--------------------------------------------------------------------------------
/images/chart_td_ex4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_td_ex4.png
--------------------------------------------------------------------------------
/images/chart_td_init.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_td_init.png
--------------------------------------------------------------------------------
/images/chart_wfst1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_wfst1.png
--------------------------------------------------------------------------------
/images/chart_wfst2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_wfst2.png
--------------------------------------------------------------------------------
/images/chunk-coref.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chunk-coref.png
--------------------------------------------------------------------------------
/images/chunk-tagrep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chunk-tagrep.png
--------------------------------------------------------------------------------
/images/chunk-treerep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chunk-treerep.png
--------------------------------------------------------------------------------
/images/decision-tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/decision-tree.png
--------------------------------------------------------------------------------
/images/exploration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/exploration.png
--------------------------------------------------------------------------------
/images/findtheblock1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/findtheblock1.png
--------------------------------------------------------------------------------
/images/findtheblock2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/findtheblock2.png
--------------------------------------------------------------------------------
/images/findtheblock3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/findtheblock3.png
--------------------------------------------------------------------------------
/images/findtheblock4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/findtheblock4.png
--------------------------------------------------------------------------------
/images/mod_relation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/mod_relation.png
--------------------------------------------------------------------------------
/images/modal_genre.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/modal_genre.png
--------------------------------------------------------------------------------
/images/models_admire.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/models_admire.png
--------------------------------------------------------------------------------
/images/multi-module.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/multi-module.png
--------------------------------------------------------------------------------
/images/partialtree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/partialtree.png
--------------------------------------------------------------------------------
/images/polish-utf8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/polish-utf8.png
--------------------------------------------------------------------------------
/images/quant-ambig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/quant-ambig.png
--------------------------------------------------------------------------------
/images/rdparser1-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/rdparser1-6.png
--------------------------------------------------------------------------------
/images/sensibility.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/sensibility.png
--------------------------------------------------------------------------------
/images/sinica-tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/sinica-tree.png
--------------------------------------------------------------------------------
/images/srparser1-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/srparser1-6.png
--------------------------------------------------------------------------------
/images/stack-queue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/stack-queue.png
--------------------------------------------------------------------------------
/images/string-memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/string-memory.png
--------------------------------------------------------------------------------
/images/syntax-tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/syntax-tree.png
--------------------------------------------------------------------------------
/images/tag-context.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/tag-context.png
--------------------------------------------------------------------------------
/images/word-len-dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/word-len-dist.png
--------------------------------------------------------------------------------
/book/callouts/callout1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout1.gif
--------------------------------------------------------------------------------
/book/callouts/callout2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout2.gif
--------------------------------------------------------------------------------
/book/callouts/callout3.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout3.gif
--------------------------------------------------------------------------------
/book/callouts/callout4.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout4.gif
--------------------------------------------------------------------------------
/book/callouts/callout5.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout5.gif
--------------------------------------------------------------------------------
/book/callouts/callout6.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout6.gif
--------------------------------------------------------------------------------
/book/callouts/callout7.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout7.gif
--------------------------------------------------------------------------------
/book/callouts/callout8.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout8.gif
--------------------------------------------------------------------------------
/book/callouts/callout9.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout9.gif
--------------------------------------------------------------------------------
/images/2nd_ed_plan.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/2nd_ed_plan.graffle
--------------------------------------------------------------------------------
/images/chart_bottom_up.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_bottom_up.png
--------------------------------------------------------------------------------
/images/chart_td_match1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_td_match1.png
--------------------------------------------------------------------------------
/images/chart_td_match2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_td_match2.png
--------------------------------------------------------------------------------
/images/chart_top_down.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_top_down.png
--------------------------------------------------------------------------------
/images/classification.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/classification.png
--------------------------------------------------------------------------------
/images/corpus-org.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/corpus-org.graffle
--------------------------------------------------------------------------------
/images/dialogue-90dpi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/dialogue-90dpi.png
--------------------------------------------------------------------------------
/images/drs_screenshot0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/drs_screenshot0.png
--------------------------------------------------------------------------------
/images/drs_screenshot1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/drs_screenshot1.png
--------------------------------------------------------------------------------
/images/findtheblock1.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/findtheblock1.tiff
--------------------------------------------------------------------------------
/images/findtheblock2.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/findtheblock2.tiff
--------------------------------------------------------------------------------
/images/findtheblock3.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/findtheblock3.tiff
--------------------------------------------------------------------------------
/images/findtheblock4.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/findtheblock4.tiff
--------------------------------------------------------------------------------
/images/ie-architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/ie-architecture.png
--------------------------------------------------------------------------------
/images/mimo-and-bruno.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/mimo-and-bruno.jpg
--------------------------------------------------------------------------------
/images/mimo-and-bruno.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/mimo-and-bruno.png
--------------------------------------------------------------------------------
/images/models_walk_cf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/models_walk_cf.png
--------------------------------------------------------------------------------
/images/nltk-downloader.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/nltk-downloader.png
--------------------------------------------------------------------------------
/images/string-slicing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/string-slicing.png
--------------------------------------------------------------------------------
/images/timit-structure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/timit-structure.png
--------------------------------------------------------------------------------
/book-jp/fig_jpma_lattice1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book-jp/fig_jpma_lattice1.png
--------------------------------------------------------------------------------
/book-jp/fig_jpma_lattice2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book-jp/fig_jpma_lattice2.png
--------------------------------------------------------------------------------
/book/callouts/callout10.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout10.gif
--------------------------------------------------------------------------------
/book/callouts/callout11.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout11.gif
--------------------------------------------------------------------------------
/book/callouts/callout12.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout12.gif
--------------------------------------------------------------------------------
/book/callouts/callout13.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout13.gif
--------------------------------------------------------------------------------
/book/callouts/callout14.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout14.gif
--------------------------------------------------------------------------------
/book/callouts/callout15.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/book/callouts/callout15.gif
--------------------------------------------------------------------------------
/images/chart_bu_predict1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_bu_predict1.png
--------------------------------------------------------------------------------
/images/chart_bu_predict2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_bu_predict2.png
--------------------------------------------------------------------------------
/images/chart_fundamental.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_fundamental.png
--------------------------------------------------------------------------------
/images/chart_intro_2edges.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_2edges.png
--------------------------------------------------------------------------------
/images/chart_intro_3edges.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_3edges.png
--------------------------------------------------------------------------------
/images/chart_intro_empty.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_empty.png
--------------------------------------------------------------------------------
/images/chart_positions1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_positions1.png
--------------------------------------------------------------------------------
/images/chart_positions2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_positions2.png
--------------------------------------------------------------------------------
/images/chart_td_expand1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_td_expand1.png
--------------------------------------------------------------------------------
/images/chart_td_expand2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_td_expand2.png
--------------------------------------------------------------------------------
/images/chart_useless_edge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_useless_edge.png
--------------------------------------------------------------------------------
/images/chunk-segmentation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chunk-segmentation.png
--------------------------------------------------------------------------------
/images/decision-tree.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/decision-tree.graffle
--------------------------------------------------------------------------------
/images/feature-extraction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/feature-extraction.png
--------------------------------------------------------------------------------
/images/ic_diagram_labeled.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/ic_diagram_labeled.pdf
--------------------------------------------------------------------------------
/images/ic_diagram_labeled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/ic_diagram_labeled.png
--------------------------------------------------------------------------------
/images/models_love_cf01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/models_love_cf01.png
--------------------------------------------------------------------------------
/images/models_love_cf02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/models_love_cf02.png
--------------------------------------------------------------------------------
/images/multi-module.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/multi-module.graffle
--------------------------------------------------------------------------------
/images/naive_bayes_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/naive_bayes_graph.png
--------------------------------------------------------------------------------
/images/old-string-memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/old-string-memory.png
--------------------------------------------------------------------------------
/images/precision-recall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/precision-recall.png
--------------------------------------------------------------------------------
/images/recursive_parse1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/recursive_parse1.png
--------------------------------------------------------------------------------
/images/three-layer-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/three-layer-arch.png
--------------------------------------------------------------------------------
/images/vocabulary-growth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/vocabulary-growth.png
--------------------------------------------------------------------------------
/images/wordnet-hierarchy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/wordnet-hierarchy.png
--------------------------------------------------------------------------------
/images/words-dispersion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/words-dispersion.png
--------------------------------------------------------------------------------
/images/Binary_entropy_plot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/Binary_entropy_plot.pdf
--------------------------------------------------------------------------------
/images/Binary_entropy_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/Binary_entropy_plot.png
--------------------------------------------------------------------------------
/images/chart_intro_prodedge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_prodedge.png
--------------------------------------------------------------------------------
/images/chart_intro_selfloop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_selfloop.png
--------------------------------------------------------------------------------
/images/chart_td_match1_alt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_td_match1_alt.png
--------------------------------------------------------------------------------
/images/naive-bayes-triangle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/naive-bayes-triangle.png
--------------------------------------------------------------------------------
/images/naive_bayes_bargraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/naive_bayes_bargraph.png
--------------------------------------------------------------------------------
/images/parse_rdparsewindow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/parse_rdparsewindow.png
--------------------------------------------------------------------------------
/book/term_index.rst:
--------------------------------------------------------------------------------
1 |
2 | .. index:: :extern:
3 |
4 | .. include:: ../definitions.rst
5 | .. include:: footer.rst
6 |
--------------------------------------------------------------------------------
/images/chart_intro_dottededge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_dottededge.png
--------------------------------------------------------------------------------
/images/chart_intro_incomplete.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_incomplete.png
--------------------------------------------------------------------------------
/images/chart_intro_parseedge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/chart_intro_parseedge.png
--------------------------------------------------------------------------------
/images/naive_bayes_graph.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/naive_bayes_graph.graffle
--------------------------------------------------------------------------------
/images/text-corpus-structure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/text-corpus-structure.png
--------------------------------------------------------------------------------
/images/naive-bayes-triangle.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/naive-bayes-triangle.graffle
--------------------------------------------------------------------------------
/images/naive_bayes_bargraph.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/naive_bayes_bargraph.graffle
--------------------------------------------------------------------------------
/images/supervised-classification.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/supervised-classification.png
--------------------------------------------------------------------------------
/images/text-corpus-structure.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/text-corpus-structure.graffle
--------------------------------------------------------------------------------
/book/footer-pt.rst:
--------------------------------------------------------------------------------
1 | .. admonition:: About this translation...
2 |
3 | This translation was contributed by Tiago Tresoldi.
4 |
--------------------------------------------------------------------------------
/images/supervised-classification.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nltk/nltk_book/HEAD/images/supervised-classification.graffle
--------------------------------------------------------------------------------
/book/definitions-pt.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | .. ifndef:: definitions-pt
4 |
5 | .. def:: definitions-pt
6 |
7 | .. |PLN| replace:: PLN
8 |
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Files and directories that get built automatically
2 |
3 | *.errs
4 | *.html
5 | *.ref
6 | *.rst2
7 |
8 | pylisting
9 | tree_images
10 |
11 | revision.rst
12 |
13 |
--------------------------------------------------------------------------------
/images/ambig02.py:
--------------------------------------------------------------------------------
1 | from nltk.parse import bracket_parse
2 | sent = '(S (NP the policeman)(VP (V saw)(NP (NP the burglar)(PP with a gun))))'
3 | tree = bracket_parse(sent)
4 | tree.draw()
5 |
--------------------------------------------------------------------------------
/images/ambig03.py:
--------------------------------------------------------------------------------
1 | from nltk.parse import bracket_parse
2 | sent = '(S (NP the policeman)(VP (V saw)(NP the burglar)(PP with a telescope)))'
3 | tree = bracket_parse(sent)
4 | tree.draw()
5 |
--------------------------------------------------------------------------------
/slides/demos/parse.py:
--------------------------------------------------------------------------------
1 | ######################################################################
2 | ##
3 | ## Chart Parsing Demo
4 | ##
5 |
6 | import nltk.draw.chart
7 | nltk.draw.chart.demo()
8 |
--------------------------------------------------------------------------------
/book/dict.csv:
--------------------------------------------------------------------------------
1 | "sleep","sli:p","v.i","a condition of body and mind ..."
2 | "walk","wo:k","v.intr","progress by lifting and setting down each foot ..."
3 | "wake","weik","intrans","cease to sleep"
4 |
--------------------------------------------------------------------------------
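
A minimal reading sketch for the file above, assuming Python 3's csv module and
dict.csv in the current directory; note the three inconsistent verb codes
("v.i", "v.intr", "intrans"), which make the file a handy example of data that
needs normalization:

    import csv

    with open('dict.csv') as f:
        for lexeme, pron, pos, gloss in csv.reader(f):
            print(lexeme, pos)   # sleep v.i / walk v.intr / wake intrans
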
/images/ambig01-a.py:
--------------------------------------------------------------------------------
1 | from nltk.parse import bracket_parse
2 | sent = '(S (S Kim arrived) (conj or) (S (S Dana left) (conj and) (S everyone cheered)))'
3 | tree = bracket_parse(sent)
4 | tree.draw()
5 |
--------------------------------------------------------------------------------
/images/ambig01-b.py:
--------------------------------------------------------------------------------
1 | from nltk.parse import bracket_parse
2 | sent = '(S (S (S Kim arrived) (conj or) (S Dana left)) (conj and) (S everyone cheered))'
3 | tree = bracket_parse(sent)
4 | tree.draw()
5 |
--------------------------------------------------------------------------------
/images/lee-dog.py:
--------------------------------------------------------------------------------
1 | from nltk_lite.parse import bracket_parse
2 | from pprint import pprint
3 | sent = '(S (NP Lee)(VP (V saw)(NP the dog)))'
4 | tree = bracket_parse(sent)
5 | pprint(tree.pp())
6 | #tree.draw()
7 |
--------------------------------------------------------------------------------
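
The figure scripts above target the pre-3.0 APIs (bracket_parse, nltk_lite).
A minimal sketch of the NLTK 3 equivalent, using Tree.fromstring:

    from nltk.tree import Tree

    tree = Tree.fromstring('(S (NP Lee) (VP (V saw) (NP the dog)))')
    print(tree.pformat())   # indented bracketing, replacing the old pp()
    tree.draw()             # opens the graphical tree viewer
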
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/book/docbook-issues.txt:
--------------------------------------------------------------------------------
1 |
2 | * Linguistic examples in chapter 10 with literal line breaks are not
3 | correctly formatted. They should be in a larger font size, vertically aligned with
4 | the example number, and indented further.
--------------------------------------------------------------------------------
/surveys/2005-10.txt:
--------------------------------------------------------------------------------
1 | NLTK-Lite 0.5 Survey
2 | --------------------
3 |
4 |
5 | * questions about user, interests
6 |
7 | - is their subject homepage linked from the NLTK site?
8 |
9 |
10 | * questions about existing functionality and data
11 |
12 |
13 | * questions about desired functionality and data
14 |
15 |
16 | * questions about how the person would be able to contribute
17 |
18 |
--------------------------------------------------------------------------------
/latexhacks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Post-process latex output in-place
3 |
4 | import sys
5 | import re
6 |
7 | # load the file
8 | file = open(sys.argv[1])
9 | contents = file.read()
10 | file.close()
11 |
12 | # modify it
13 | contents = re.sub(r'subsection{', r'subsection*{', contents)
14 |
15 | # save the file
16 | file = open(sys.argv[1], 'w')
17 | file.write(contents)
18 | file.close()
19 |
--------------------------------------------------------------------------------
/images/polish-utf8.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 | sent = """
5 | Przewiezione przez Niemców pod koniec II wojny światowej na Dolny
6 | Śląsk, zostały odnalezione po 1945 r. na terytorium Polski.
7 | """
8 |
9 | u = sent.decode('utf8')
10 | u.lower()
11 | print u.encode('utf8')
12 |
13 | SACUTE = re.compile('ś|Ś')
14 | replaced = re.sub(SACUTE, '[sacute]', sent)
15 | print replaced
16 |
17 |
18 |
--------------------------------------------------------------------------------
/book/ch6-fmla.tex:
--------------------------------------------------------------------------------
1 | \documentclass{article}
2 | \pagestyle{empty}
3 | \usepackage[verbose=true,margin=0cm,ignoreheadfoot,ignoremp,
4 | paperwidth=370pt,paperheight=180pt]{geometry}
5 | \begin{document}
6 |
7 | \begin{description}
8 | \item[Precision:] $\frac{\mathit{TP}}{\mathit{TP} + \mathit{FP}}$
9 |
10 | \item[Recall:] $\frac{\mathit{TP}}{\mathit{TP} + \mathit{FN}}$
11 |
12 | \item[F-Measure:] $\frac{2 \times \mathit{Precision} \times \mathit{Recall}}{\mathit{Precision} + \mathit{Recall}}$
13 | \end{description}
14 | \end{document}
15 |
--------------------------------------------------------------------------------
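
As a check on the three formulas above, with hypothetical counts TP = 8,
FP = 2, FN = 4:

    \mathit{Precision} = \frac{8}{8+2} = 0.8, \qquad
    \mathit{Recall} = \frac{8}{8+4} \approx 0.667, \qquad
    \mathit{F} = \frac{2 \times 0.8 \times 0.667}{0.8 + 0.667} \approx 0.727
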
/slides/demos/names.py:
--------------------------------------------------------------------------------
1 | ######################################################################
2 | ##
3 | ## Guess an unseen name's gender!
4 | ##
5 |
6 | from nltk.classify.naivebayes import NaiveBayesClassifier
7 | from nltk.classify.util import names_demo
8 |
9 | # Feature Extraction:
10 | def name_features(name):
11 | features = {}
12 | return features
13 |
14 | # Test the classifier:
15 | classifier = names_demo(NaiveBayesClassifier.train, name_features)
16 |
17 | # Feature Analysis:
18 | #classifier.show_most_informative_features()
19 |
--------------------------------------------------------------------------------
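
The empty name_features above is the skeleton that gets filled in during the
demo. A minimal sketch of one classic feature for this task (the name's final
letter, a strong gender cue in the names corpus):

    def name_features(name):
        # a single suffix character is already surprisingly informative
        return {'last_letter': name[-1].lower()}
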
/LSA325/assignment3.txt:
--------------------------------------------------------------------------------
1 | 1. Explore what kind of sequences are annotated as VP in the CONLL2000 "train" corpus data.
2 |
3 | 2. Develop a chunk.Regexp grammar to capture the regularities.
4 |
5 | 3. Use the trace=1 setting of the chunk parser to examine the success of your VP chunking rules.
6 |
7 | 4. Once you are reasonably happy with your rules, try evaluating them against the CONLL2000 "test" corpus data (i.e., using the chunk.accuracy() function).
8 |
9 | 5. Briefly comment on how easy or difficult it was to develop an adequate rule set.
10 |
--------------------------------------------------------------------------------
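
A sketch of steps 1-4 under NLTK 3, assuming the conll2000 corpus data has been
downloaded; the grammar is a deliberately crude first cut, and evaluate() stands
in for the older nltk_lite chunk.accuracy() named in the assignment:

    import nltk
    from nltk.corpus import conll2000

    # 1. inspect how VPs are annotated in the training data
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['VP'])
    print(train_sents[0])

    # 2-3. a first-cut grammar (optional modal, adverbs, verbs), with tracing
    cp = nltk.RegexpParser(r"VP: {<MD>?<RB.*>*<VB.*>+}", trace=1)
    cp.parse(conll2000.tagged_sents('train.txt')[0])

    # 4. score the rules against the held-out data
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['VP'])
    print(cp.evaluate(test_sents))
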
/images/chart_td_init.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled, height=0.1,width=0.1, fillcolor=cadetblue];
8 | 1 [label="0"];
9 | }
10 |
11 | /* The sentence */
12 | {
13 | edge [style=invis, weight=100];
14 | node [shape=plaintext];
15 | 1->dots2;
16 | dots2 [label="…"];
17 | }
18 |
19 | /* Edges */
20 | {
21 | edge [fontname=LucidaGrande];
22 | 1->1 [label="S → • α "];
23 | }
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/LSA325/assignment2.txt:
--------------------------------------------------------------------------------
1 | 1. Define a function find_tags(item, word) which takes a section of the Brown Corpus and a word as its arguments and returns a list of the tags that occur for that word, sorted in decreasing frequency (using the FreqDist.sorted() method). E.g. find_tags('a', 'present') should return ['jj', 'rb', 'vb', 'nn'].
2 |
3 | 2. Define a function test_tagger(item, sent) which trains a bigram tagger on the specified section of the Brown Corpus, and uses it to tag sent. Write comment lines to explain why the tagger performs badly and to suggest a way performance could be improved.
4 |
--------------------------------------------------------------------------------
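
A sketch against the NLTK 3 Brown API: old section 'a' corresponds to the
'news' category, NLTK 3's Brown tags are uppercase ('JJ' rather than 'jj'),
and FreqDist.most_common() replaces the retired FreqDist.sorted():

    import nltk
    from nltk.corpus import brown

    def find_tags(category, word):
        # tags observed for `word` in the given category, most frequent first
        fd = nltk.FreqDist(tag for (w, tag) in brown.tagged_words(categories=category)
                           if w.lower() == word)
        return [tag for tag, _ in fd.most_common()]

    def test_tagger(category, sent):
        tagger = nltk.BigramTagger(brown.tagged_sents(categories=category))
        # a bigram tagger with no backoff returns None for any bigram context
        # it never saw in training, so it degrades badly on unseen text;
        # chaining unigram and default backoff taggers would improve it
        return tagger.tag(sent)
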
/book/ch12-extras.rst:
--------------------------------------------------------------------------------
1 | --------------
2 | The Holy Grail
3 | --------------
4 |
5 | * NLP-Complete Problems: SLDS, MT
6 | (cf AI-complete)
7 |
8 | * Why they are hard
9 |
10 | * The problem of grounding. Embodied conversational agents.
11 |
12 | * Approaches: "grammar engineering" (scaling up a rule-based approach
13 | with the help of engineering methods such as grammar test suites);
14 | "grammar inference" (training on manually-checked annotated data).
15 |
16 | * Even simple problems!
17 | ``http://itre.cis.upenn.edu/~myl/languagelog/archives/001445.html``
18 |
19 |
20 |
--------------------------------------------------------------------------------
/images/avm1.tex:
--------------------------------------------------------------------------------
1 | \documentclass[12pt]{article}
2 | \usepackage{avm}
3 | \usepackage{helvet}
4 |
5 | \avmvalfont{\it}
6 | \avmfont{\sf}
7 | \avmoptions{active,unsorted}
8 | \pagestyle{empty}
9 | \usepackage[verbose=true,margin=0cm,ignoreheadfoot,ignoremp,
10 | paperwidth=370pt,paperheight=180pt]{geometry}
11 | \begin{document}
12 | {\Huge
13 |
14 | \fontfamily{phv}\selectfont
15 | \begin{avm}
16 | [
17 | POS & N\\
18 | AGR & [PER & 3\\
19 | NUM & pl\\
20 | GND & fem
21 | ]
22 | ]
23 | \end{avm}
24 | }
25 | \end{document}
--------------------------------------------------------------------------------
/pages.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Natural Language Toolkit: Page length extraction script
4 | #
5 | # Copyright (C) 2001-2006 NLTK Project
6 | # Author: Steven Bird
7 | # URL:
8 | # For license information, see LICENSE.TXT
9 |
10 | r"""
11 |
12 | This script extracts the pagecount from a latex log file.
13 |
14 | """
15 |
16 | from sys import argv
17 | from re import search
18 |
19 | regexp = r'\[(\d+)\][^\[]*$' # last [nn] in file
20 | logfile = open(argv[1]).read() # latex logfile
21 | print search(regexp, logfile).group(1)
22 |
--------------------------------------------------------------------------------
/images/chart_intro_empty.dot:
--------------------------------------------------------------------------------
1 | /* -*- coding:utf-8 -*- */
2 |
3 | digraph x {
4 | rankdir=LR;
5 | ranksep=0.25;
6 |
7 | /* The nodes */
8 | {
9 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4];
10 | 0 [label=""];
11 | 1 [label=""];
12 | 2 [label=""];
13 | 3 [label=""];
14 | }
15 |
16 | /* The sentence */
17 | {
18 | edge [style=invis, weight=100];
19 | node [shape=plaintext,fontname="Times-BoldItalic"];
20 | 0->John->1->likes->2->Mary->3;
21 | }
22 |
23 | /* Edges */
24 | {
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/book/footer.rst:
--------------------------------------------------------------------------------
1 | .. Footer to be used in all chapters
2 |
3 | .. admonition:: About this document...
4 |
5 | UPDATED FOR NLTK 3.0.
6 | This is a chapter from *Natural Language Processing with Python*,
7 | by |StevenBird|, |EwanKlein| and |EdwardLoper|,
8 | Copyright |copy| 2019 the authors.
9 | It is distributed with the *Natural Language Toolkit* [|NLTK-URL|],
10 | Version |version|, under the terms of the
11 | *Creative Commons Attribution-Noncommercial-No Derivative Works 3.0 United States License*
12 | [http://creativecommons.org/licenses/by-nc-nd/3.0/us/].
13 |
14 | .. include:: revision.rst
15 |
--------------------------------------------------------------------------------
/images/Makefile:
--------------------------------------------------------------------------------
1 | PIX = chart_bottom_up.dot chart_bu_init.dot chart_bu_predict1.dot chart_bu_predict2.dot chart_fr1.dot chart_fr2.dot chart_intro_2edges.dot chart_intro_3edges.dot chart_intro_dottededge.dot chart_intro_empty.dot chart_intro_incomplete.dot chart_intro_parseedge.dot chart_intro_prodedge.dot chart_intro_selfloop.dot chart_td_expand1.dot chart_td_expand2.dot chart_td_init.dot chart_td_match1.dot chart_td_match2.dot chart_top_down.dot chart_useless_edge.dot
2 |
3 | PNG := $(PIX:.dot=.png)
4 |
5 | .SUFFIXES: .dot .png
6 |
7 | png: $(PNG)
8 |
9 | clean:
10 | rm -f $(PNG)
11 |
12 | .dot.png:
13 | dot -Tpng $< > $@
14 |
--------------------------------------------------------------------------------
/images/chart_useless_edge.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 |
4 | /* The nodes */
5 | {
6 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue];
7 | x [label="" style=invis];
8 | 0 [label=""];
9 | 1 [label=""];
10 | 2 [label=""];
11 | 3 [label=""];
12 | }
13 |
14 | /* The sentence */
15 | {
16 | edge [style=invis, weight=100];
17 | node [shape=plaintext];
18 | x->0->John->1->likes->2->Mary->3;
19 | }
20 |
21 | /* Edges */
22 | {
23 | edge [fontname=LucidaGrande];
24 | 0->0 [label="VP → • V NP"];
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/images/chart_intro_dottededge.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4];
8 | 0 [label=""];
9 | 1 [label=""];
10 | 2 [label=""];
11 | 3 [label=""];
12 | }
13 |
14 | /* The sentence */
15 | {
16 | edge [style=invis, weight=100];
17 | node [shape=plaintext,fontname="Times-BoldItalic"];
18 | 0->John->1->likes->2->Mary->3;
19 | }
20 |
21 | /* Edges */
22 | {
23 | edge [fontname=LucidaGrande];
24 | 1->2 [label="VP → V • NP"];
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/images/chart_intro_parseedge.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4];
8 | 0 [label=""];
9 | 1 [label=""];
10 | 2 [label=""];
11 | 3 [label=""];
12 | }
13 |
14 | /* The sentence */
15 | {
16 | edge [style=invis, weight=100];
17 | node [shape=plaintext,fontname="Times-BoldItalic"];
18 | 0->John->1->likes->2->Mary->3;
19 | }
20 |
21 | /* Edges */
22 | {
23 | edge [fontname=LucidaGrande];
24 | 0->3 [label="S → NP V •"];
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/images/chart_td_match2.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue];
8 | 1 [label="i"];
9 | 2 [label="j"];
10 | }
11 |
12 | /* The sentence */
13 | {
14 | edge [style=invis, weight=100];
15 | node [shape=plaintext];
16 | dots1->1->dots2->2->dots3;
17 | dots1 [label="..."];
18 | dots2 [label="..."];
19 | dots3 [label="..."];
20 | }
21 |
22 | /* Edges */
23 | {
24 | edge [fontname=LucidaGrande];
25 | 2->2 [label="w[j] → •"];
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/images/chart_bu_init.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled, height=0.1,width=0.1,fillcolor=cadetblue];
8 | 1 [label="i"];
9 | 2 [label="i+1"];
10 | }
11 |
12 | /* The sentence */
13 | {
14 | edge [style=invis, weight=100];
15 | node [shape=plaintext];
16 | wi [label="w[i]"]
17 | dots1->1->wi->2->dots2;
18 | dots1 [label="..."];
19 | dots2 [label="..."];
20 | }
21 |
22 | /* Edges */
23 | {
24 | edge [fontname=LucidaGrande];
25 | 1->2 [label="w[i] → •"];
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/images/chart_intro_selfloop.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4];
8 | 0 [label=""];
9 | 1 [label=""];
10 | 2 [label=""];
11 | 3 [label=""];
12 | }
13 |
14 | /* The sentence */
15 | {
16 | edge [style=invis, weight=100];
17 | node [shape=plaintext,fontname="Times-BoldItalic"];
18 | 0->John->1->likes->2->Mary->3;
19 | }
20 |
21 | /* Edges */
22 | {
23 | edge [fontname=LucidaGrande];
24 | 1->1 [label="VP → • V NP"];
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/book/bib_template.html:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 |
7 | Natural Language Processing: Bibliography
8 |
9 |
10 |
11 |
12 |
13 | Bibliography
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/images/chart_intro_2edges.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4];
8 | 0 [label=""];
9 | 1 [label=""];
10 | 2 [label=""];
11 | 3 [label=""];
12 | }
13 |
14 | /* The sentence */
15 | {
16 | edge [style=invis, weight=100];
17 | node [shape=plaintext,fontname="Times-BoldItalic"];
18 | 0->John->1->likes->2->Mary->3;
19 | }
20 |
21 | /* Edges */
22 | {
23 | edge [fontname=lucidagrande];
24 | 1->2 [label="V"];
25 | 2->3 [label="NP"];
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/images/chart_td_expand2.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue];
8 | 1 [label="i"];
9 | 2 [label="j"];
10 | }
11 |
12 | /* The sentence */
13 | {
14 | edge [style=invis, weight=100];
15 | node [shape=plaintext];
16 | dots1->1->dots2->2->dots3;
17 | dots1 [label="…"];
18 | dots2 [label="…"];
19 | dots3 [label="…"];
20 | }
21 |
22 | /* Edges */
23 | {
24 | edge [fontname=LucidaGrande];
25 | 2->2 [label="B → • γ"];
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/index.rst:
--------------------------------------------------------------------------------
1 | .. NLTK documentation master file, created by
2 | sphinx-quickstart on Sat Oct 8 22:36:44 2011.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to NLTK's documentation!
7 | ================================
8 |
9 | Contents:
10 |
11 | .. toctree::
12 | :maxdepth: 1
13 |
14 | ch00
15 | ch01
16 | ch02
17 | ch03
18 | ch04
19 | ch05
20 | ch06
21 | ch07
22 | ch08
23 | ch09
24 | ch10
25 | ch11
26 | ch12
27 |
28 |
29 | Indices and tables
30 | ==================
31 |
32 | * :ref:`genindex`
33 | * :ref:`modindex`
34 | * :ref:`search`
35 |
36 |
--------------------------------------------------------------------------------
/images/chart_bu_predict1.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled, height=0.1,width=0.1,fillcolor=cadetblue];
8 | 1 [label="i"];
9 | 2 [label="i+1"];
10 | }
11 |
12 | /* The sentence */
13 | {
14 | edge [style=invis, weight=100];
15 | node [shape=plaintext];
16 | dots1->1->dots2->2->dots3;
17 | dots1 [label="…"];
18 | dots2 [label="…"];
19 | dots3 [label="…"];
20 | }
21 |
22 | /* Edges */
23 | {
24 | edge [fontname=LucidaGrande];
25 | 1->2 [label="A → α •"];
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/images/chart_bu_predict2.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled, height=0.1,width=0.1,fillcolor=cadetblue];
8 | 1 [label="i"];
9 | 2 [label="j"];
10 | }
11 |
12 | /* The sentence */
13 | {
14 | edge [style=invis, weight=100];
15 | node [shape=plaintext];
16 | dots1->1->dots2->2->dots3;
17 | dots1 [label="…"];
18 | dots2 [label="…"];
19 | dots3 [label="…"];
20 | }
21 |
22 | /* Edges */
23 | {
24 | edge [fontname=LucidaGrande];
25 | 1->1 [label="B → • A β"];
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/images/chart_td_expand1.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue];
8 | 1 [label="i"];
9 | 2 [label="j"];
10 | }
11 |
12 | /* The sentence */
13 | {
14 | edge [style=invis, weight=100];
15 | node [shape=plaintext];
16 | dots1->1->dots2->2->dots3;
17 | dots1 [label="…"];
18 | dots2 [label="…"];
19 | dots3 [label="…"];
20 | }
21 |
22 | /* Edges */
23 | {
24 | edge [fontname=LucidaGrande];
25 | 1->2 [label="A → α • β"];
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/images/chart_td_match1.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue];
8 | 1 [label="i"];
9 | 2 [label="j"];
10 | }
11 |
12 | /* The sentence */
13 | {
14 | edge [style=invis, weight=100];
15 | node [shape=plaintext];
16 | dots1->1->dots2->2->dots3;
17 | dots1 [label="…"];
18 | dots2 [label="…"];
19 | dots3 [label="…"];
20 | }
21 |
22 | /* Edges */
23 | {
24 | edge [fontname=LucidaGrande];
25 | 1->2 [label="A → α • w[j] β"];
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/images/chart_intro_incomplete.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4];
8 | 0 [label=""];
9 | 1 [label=""];
10 | 2 [label=""];
11 | 3 [label=""];
12 | }
13 |
14 | /* The sentence */
15 | {
16 | edge [style=invis, weight=100];
17 | node [shape=plaintext,fontname="Times-BoldItalic"];
18 | 0->John->1->likes->2->Mary->3;
19 | }
20 |
21 | /* Edges */
22 | {
23 | edge [fontname=LucidaGrande];
24 | 1->3 [label="VP → V NP •"];
25 | 1->2 [label="VP → V • NP"];
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/howto/coverage.txt:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | =============================
4 | NLTK Regression Test Coverage
5 | =============================
6 |
7 | The following table lists each NLTK module, and indicates what
8 | percentage of the module's statements are currently covered by the
9 | regression test set. To see which functions and methods are covered
10 | in a given module, click on that module. You can then click on those
11 | functions and methods to see their source code, and to check what
12 | portion of them is covered by tests.
13 |
14 | .. include:: coverage-list.txt
15 |
16 | ----
17 |
18 | `Return to the NLTK Regression Tests <../index.html>`__
19 |
20 | `Return to the NLTK Homepage `__
21 |
--------------------------------------------------------------------------------
/images/chart_intro_prodedge.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4];
8 | 0 [label=""];
9 | 1 [label=""];
10 | 2 [label=""];
11 | 3 [label=""];
12 | }
13 |
14 | /* The sentence */
15 | {
16 | edge [style=invis, weight=100];
17 | node [shape=plaintext,fontname="Times-BoldItalic"];
18 | 0->John->1->likes->2->Mary->3;
19 | }
20 |
21 | /* Edges */
22 | {
23 | edge [fontname=LucidaGrande];
24 | 1->3 [label="VP → V NP"];
25 | 1->2 [label="V → likes"];
26 | 2->3 [label="NP → Mary"];
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/images/chart_intro_3edges.dot:
--------------------------------------------------------------------------------
1 | /* -*- coding:utf-8 -*- */
2 |
3 | digraph x {
4 | rankdir=LR;
5 | ranksep=0.25;
6 |
7 | /* The nodes */
8 | {
9 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue4];
10 | 0 [label=""];
11 | 1 [label=""];
12 | 2 [label=""];
13 | 3 [label=""];
14 | }
15 |
16 | /* The sentence */
17 | {
18 | edge [style=invis, weight=100];
19 | node [shape=plaintext,fontname="Times-BoldItalic"];
20 | 0->John->1->likes->2->Mary->3;
21 | }
22 |
23 | /* Edges */
24 | {
25 | edge [fontname=lucidagrande];
26 | edge [weight=0];
27 | 1->3 [label="VP"];
28 | 1->2 [label="V"];
29 | 2->3 [label="NP"];
30 | }
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/images/chart_fr2.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue];
8 | 1 [label="i"];
9 | 2 [label="j"];
10 | 3 [label="k"];
11 | }
12 |
13 | /* The sentence */
14 | {
15 | edge [style=invis, weight=100];
16 | node [shape=plaintext];
17 | dots1->1->dots2->2->dots3->3->dots4;
18 | dots1 [label="…"];
19 | dots2 [label="…"];
20 | dots3 [label="…"];
21 | dots4 [label="…"];
22 | }
23 |
24 | /* Edges */
25 | {
26 | edge [fontname=LucidaGrande];
27 | 1->3 [label="A → α B • β "];
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/book/book.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 | Natural Language Processing
5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 |
7 | .. def:: book
8 | .. include:: ../definitions.rst
9 |
10 | :Authors: Steven Bird, Ewan Klein, Edward Loper
11 | :Version: |version| (draft only, please send feedback to authors)
12 | :Copyright: |copy| |copyrightinfo|
13 | :License: |license|
14 | :Revision:
15 | :Date:
16 |
17 | .. contents::
18 | :depth: 2
19 |
20 | .. preface::
21 | .. ch00.rst
22 |
23 | .. toctree::
24 | :maxdepth: 2
25 |
26 | ch00
27 | ch01
28 | ch02
29 | ch03
30 | ch04
31 | ch05
32 | ch06
33 | ch07
34 | ch08
35 | ch09
36 | ch10
37 | ch11
38 | ch12
39 |
40 | .. index::
41 |
42 | .. include:: term_index.rst
43 |
44 |
--------------------------------------------------------------------------------
/book/ch02-extras.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. include:: ../definitions.rst
3 |
4 | .. standard global imports
5 |
6 | >>> import nltk, re, pprint
7 |
8 | ========================================================
9 | 2. Accessing Text Corpora and Lexical Resources (Extras)
10 | ========================================================
11 |
12 |
13 | -------------------------------------
14 | Language Resource Listings on the Web
15 | -------------------------------------
16 |
17 | * http://nlp.stanford.edu/links/statnlp.html
18 |
19 | Search OLAC, the `Open Language Archives Community`
20 |
21 | * http://www.language-archives.org/
22 |
23 | Search the archives of the "Corpora List":
24 |
25 | * http://listserv.linguistlist.org/archives/corpora.html
26 |
27 |
28 | .. include:: footer.rst
29 |
--------------------------------------------------------------------------------
/images/chart_fr1.dot:
--------------------------------------------------------------------------------
1 | digraph x {
2 | rankdir=LR;
3 | ranksep=0.25;
4 |
5 | /* The nodes */
6 | {
7 | node [style=filled,height=0.1,width=0.1,fillcolor=cadetblue];
8 | 1 [label="i"];
9 | 2 [label="j"];
10 | 3 [label="k"];
11 | }
12 |
13 | /* The sentence */
14 | {
15 | edge [style=invis, weight=100];
16 | node [shape=plaintext];
17 | dots1->1->dots2->2->dots3->3->dots4;
18 | dots1 [label="…"];
19 | dots2 [label="…"];
20 | dots3 [label="…"];
21 | dots4 [label="…"];
22 | }
23 |
24 | /* Edges */
25 | {
26 | edge [fontname=LucidaGrande];
27 | 1->2 [label="A → α • B β"];
28 | 2->3 [label="B → γ"];
29 | }
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/examples.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Natural Language Toolkit: Example generation script
4 | #
5 | # Copyright (C) 2001-2012 NLTK Project
6 | # Author: Steven Bird
7 | # URL:
8 | # For license information, see LICENSE.TXT
9 |
10 | """
11 | Extract the code samples from a file in restructured text format
12 | """
13 |
14 | import sys
15 |
16 | from epydoc.markup.doctest import DoctestColorizer
17 | PROMPT_RE = DoctestColorizer.PROMPT_RE
18 |
19 | for filename in sys.argv[1:]:
20 | in_code = False
21 | for line in open(filename).readlines():
22 | if PROMPT_RE.match(line):
23 | in_code = True
24 | print PROMPT_RE.sub('', line),
25 |
26 | elif in_code:
27 | in_code = False
28 | print
29 |
--------------------------------------------------------------------------------
/slides/beamer.tex:
--------------------------------------------------------------------------------
1 | \mode<presentation>
2 | {
3 | \usetheme{Pittsburgh}
4 | \setbeamercovered{transparent}
5 | \beamerdefaultoverlayspecification{<+->}
6 | }
7 |
8 | \mode<handout>
9 | {
10 | \usetheme{default}
11 | \usecolortheme{default}
12 | \useoutertheme{default}
13 | \usepackage{pgf}
14 | \usepackage{pgfpages}
15 | % \pgfpagesuselayout{4 on 1}[a4paper,landscape,scale=0.9]
16 | \setjobnamebeamerversion{handout.beamer}
17 | }
18 |
19 | \mode<article>
20 | {
21 | \usepackage{fullpage}
22 | \usepackage{pgf}
23 | \usepackage{hyperref}
24 | \setjobnamebeamerversion{notes.beamer}
25 | }
26 |
27 | \usepackage[english]{babel}
28 | \usepackage[latin1]{inputenc}
29 | \usepackage{times}
30 | \usepackage[T1]{fontenc}
31 |
32 | \date{\today}
33 |
34 | \subject{Natural Language Toolkit}
35 |
36 | % hack since pgfex is not defined
37 | \def\pgfex{ex}
38 |
39 |
--------------------------------------------------------------------------------
/images/Binary_entropy_plot.tex:
--------------------------------------------------------------------------------
1 | %Plot of information entropy of Bernoulli variable
2 | %
3 | %latex binary_entropy_plot; dvips binary_entropy_plot
4 | %open .ps file in gimp, choose strong antialias in both text and graphics,
5 | %resolution 500, color mode, crop, scale to 45%, save as .png
6 | \documentclass[12pt]{article}
7 | \usepackage{pst-plot}
8 | \begin{document}
9 | \psset{unit=4cm}
10 | \begin{pspicture}(0,0)(1.01,1)
11 | \psgrid[gridlabels=0pt,gridcolor=lightgray,subgriddiv=10,subgridcolor=lightgray](0,0)(0,0)(1,1)
12 | \newrgbcolor{myblue}{0 0 0.7}
13 | \psaxes[arrows=->,arrowsize=2pt 4,Dx=0.5,Dy=0.5](0,0)(0,0)(1.1,1.1)
14 | \psplot[plotstyle=curve,plotpoints=100,linewidth=1.8pt,linecolor=myblue]{0.0001}{0.9999}{-1 x x log 2 log div mul 1 x sub 1 x sub log 2 log div mul add mul}
15 | \rput(0.5,-0.22){$P(male)$}
16 | \rput{90}(-0.28,0.5){$H$}
17 | \end{pspicture}
18 | \end{document}
19 |
--------------------------------------------------------------------------------
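
The PostScript expression in the \psplot line above encodes the binary entropy
function plotted against p = P(male); in conventional notation:

    H(p) = -p \log_2 p - (1-p) \log_2 (1-p)
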
/book/print.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 | Natural Language Processing
5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 |
7 | .. def:: book
8 | .. include:: ../definitions.rst
9 |
10 | :Authors: Steven Bird, Ewan Klein, Edward Loper
11 | :Version: |version| (draft only, please send feedback to authors)
12 | :Copyright: |copy| |copyrightinfo|
13 | :License: |license|
14 | :Revision:
15 | :Date:
16 |
17 | .. contents::
18 | :depth: 2
19 |
20 | .. preface::
21 | .. include:: ch00.rst
22 |
23 | .. body::
24 | .. include:: ch01.rst
25 | .. include:: ch02.rst
26 | .. include:: ch03.rst
27 | .. include:: ch04.rst
28 | .. include:: ch05.rst
29 | .. include:: ch06.rst
30 | .. include:: ch07.rst
31 | .. include:: ch08.rst
32 | .. include:: ch09.rst
33 | .. include:: ch10.rst
34 | .. include:: ch11.rst
35 | .. include:: ch12.rst
36 |
37 | .. index::
38 |
39 | .. include:: term_index.rst
40 |
41 |
--------------------------------------------------------------------------------
/rsthacks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Pre-process rst source
3 |
4 | from optparse import OptionParser
5 | import re
6 |
7 | _SCALE_RE = rb'(:scale:\s+)(\d+):(\d+):(\d+)'   # html:latex:xml scale values
8 |
9 | def process(file, format):
10 | contents = open(file, 'rb').read()
11 | if format == "html":
12 |         contents = re.sub(_SCALE_RE, rb'\1\2', contents)
13 |     elif format == "latex":
14 |         contents = re.sub(_SCALE_RE, rb'\1\3', contents)
15 |     elif format == "xml":
16 |         contents = re.sub(_SCALE_RE, rb'\1\4', contents)
17 | open(file + "2", 'wb').write(contents)
18 |
19 | parser = OptionParser()
20 | parser.add_option("-f", "--format", dest="format",
21 | help="output format (html, latex, xml)", metavar="FMT")
22 |
23 | o, a = parser.parse_args()
24 |
25 | if o.format and o.format in ["html", "latex", "xml"] and a and len(a) == 1:
26 | process(a[0], o.format)
27 |
28 | else:
29 | exit("Must specify a format (html, latex, xml) and a filename")
30 |
--------------------------------------------------------------------------------
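
To make the _SCALE_RE convention concrete, a sketch of the rewrite on one
hypothetical directive line (40:30:50 stands for html:latex:xml values):

    import re

    _SCALE_RE = rb'(:scale:\s+)(\d+):(\d+):(\d+)'
    line = b'   :scale: 40:30:50'
    print(re.sub(_SCALE_RE, rb'\1\2', line))   # b'   :scale: 40'  (html)
    print(re.sub(_SCALE_RE, rb'\1\3', line))   # b'   :scale: 30'  (latex)
    print(re.sub(_SCALE_RE, rb'\1\4', line))   # b'   :scale: 50'  (xml)
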
/LSA325/assignment5.txt:
--------------------------------------------------------------------------------
1 | Write a short review (2 to 3 pages) on some aspect of the NLTK book
2 | draft (http://nltk.org/index.php/Book). What we would appreciate is a
3 | thoughtful analysis of some specific portion of the book, rather than,
4 | say, a list of typos and errors over a large portion of the book. You
5 | can focus on anything you like: for example, the presentation of key
6 | ideas, the explanation of a piece of code, or the way in which some
7 | exercises are formulated. You could even critique a piece of code. You
8 | could reflect on any key concepts you have struggled with, or the most
9 | important thing you learned, or new ways to illustrate the ideas using
10 | examples from your favorite area of linguistics, and make concrete
11 | suggestions for improving the presentation.
12 |
13 | All suggestions that we use will be acknowledged in the book's
14 | preface. Please submit your work in a plain text file, named
15 | yoursurname_review.txt, via the file upload option.
16 |
17 |
--------------------------------------------------------------------------------
/pt-br/index.txt:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. include:: ../definitions.txt
3 |
4 | ======================
5 | Tutoriais do NLTK-Lite
6 | ======================
7 |
8 | :Autores: Steven Bird, Ewan Klein, Edward Loper
9 | :Contato: sb@csse.unimelb.edu.au
10 | :Version: |version|
11 | :Copyright: |copy| |copyrightinfo|
12 | :Licença: |license|
13 |
14 | .. _Prefácio: preface.html
15 | .. _Introdução: introduction.html
16 | .. _Programação: programming.html
17 | .. _Toquenização: tokenize.html
18 | .. _Tag: tag.html
19 | .. _Parsing: parse.html
20 | .. _Chunk: chunk.html
21 | .. _Chart: chart.html
22 | .. _PCFG: pcfg.html
23 | .. _Field: field.html
24 | .. _Regexps: regexps.html
25 | .. _Projetos: projects.html
26 |
27 | ------
28 | Índice
29 | ------
30 |
31 | 0. Prefácio_
32 | #. Introdução_
33 | #. Programação_
34 | #. Toquenização_
35 | #. Tag_
36 | #. Parsing_
37 | #. Chunk_
38 | #. Chart_
39 | #. PCFG_
40 | #. Field_
41 | #. Regexps_
42 | #. Projetos_
43 |
44 | ----
45 |
46 | NLTK_
47 |
48 | .. _NLTK: http://nltk.sourceforge.net/
49 |
50 |
--------------------------------------------------------------------------------
/doctest_split.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Natural Language Toolkit: Split an RST file into sections for independent doctest checking
4 | #
5 | # Copyright (C) 2001-2012 NLTK Project
6 | # Author: Steven Bird
7 | # URL:
8 | # For license information, see LICENSE.TXT
9 |
10 | import sys
11 | import re
12 |
13 | EXT = "doctest" # output filename extension
14 | SEC = r"\n(?=-+\n.+\n-+\n)" # pattern to match section heading
15 |
16 | # include this at the top of each output file
17 | HDR = """
18 | >>> import nltk, re, pprint
19 | >>> from nltk import word_tokenize
20 | """
21 |
22 | for filename in sys.argv[1:]:
23 | contents = open(filename).read()
24 | basename, suffix = filename.split('.')
25 | for count, section in enumerate(re.split(SEC, contents)):
26 | chunk_name = "%s-%d.%s" % (basename, count+1, EXT)
27 | chunk_file = open(chunk_name, "w")
28 | chunk_file.write(HDR + "\n")
29 | chunk_file.write(section)
30 | chunk_file.close()
31 |
--------------------------------------------------------------------------------
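
The SEC pattern splits just before each overlined-and-underlined reST section
heading; a self-contained sketch of its behavior on hypothetical input:

    import re

    SEC = r"\n(?=-+\n.+\n-+\n)"
    text = "intro text\n\n-------\nTagging\n-------\nbody\n"
    print(re.split(SEC, text))
    # ['intro text\n', '-------\nTagging\n-------\nbody\n']
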
/archives/sourceforge-dev.txt:
--------------------------------------------------------------------------------
1 | nltk 1.3 documentation
2 | By: Patrick Ye (jingy) - 2004-04-14 17:12
3 | Would it be possible to package the documentation for version 1.3 so it can be downloaded easily? This would
4 | be pretty useful in case we have no internet access.
5 |
6 | Thanks a lot.
7 |
8 | Patrick
9 |
10 |
11 | RE: nltk 1.3 documentation
12 | By: Edward Loper (edloperProject Admin) - 2004-04-15 09:11
13 | I just added it to the files page: http://sourceforge.net/project/showfiles.php?group_id=30982
14 |
15 | Sorry for the omission.
16 |
17 |
18 | nltk_contrib for WordNet
19 | By: Patrick Ye (jingy) - 2004-04-14 17:04
20 | Hi,
21 |
22 | I created a python package (with a setup.py) for interfacing python/nltk with WordNet using the C code that comes with the WordNet library. I'm not sure how to integrate this new package into nltk_contrib, i.e, should I just use the python package or unpackage it and treat it as just a directory that contains a few python files?
23 |
24 | Any help would be appreciated.
25 |
26 | Patrick
27 |
28 |
--------------------------------------------------------------------------------
/xincluder.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Natural Language Toolkit: Process the XIncludes of an XML document
4 | #
5 | # Copyright (C) 2001-2012 NLTK Project
6 | # Author: Steven Bird
7 | # URL:
8 | # For license information, see LICENSE.TXT
9 |
10 | import sys
11 | import re
12 |
13 | EXT = "-flat" # output filename extension
14 | XI1 = r'<xi:include\s+href="([^"]*)"\s*/?>'   # an XInclude element; group 1 is the target file
15 | DOC = r'(?s)<\?xml.*?\?>\s*'                  # the XML prolog of an included document
16 | NAMESPACE = r' xmlns:xi="http://www.w3.org/2001/XInclude"'
17 |
18 | for filename in sys.argv[1:]:
19 | basename, suffix = filename.split('.')
20 | output_filename = basename + EXT + "." + suffix
21 | output = open(output_filename, "w")
22 | for line in open(filename):
23 | m = re.search(XI1, line)
24 | if m:
25 | contents = open(m.group(1)).read()
26 | if re.search(DOC, contents):
27 | contents = re.split(DOC, contents)[1]
28 | output.write(contents)
29 | else:
30 | if NAMESPACE in line:
31 | line = re.sub(NAMESPACE, '', line)
32 | output.write(line)
33 | output.close()
34 |
35 |
36 |
--------------------------------------------------------------------------------
/slides/Makefile:
--------------------------------------------------------------------------------
1 | # Presentation Slides Makefile
2 | #
3 | # Copyright (C) 2001-2012 NLTK Project
4 | # Author: Steven Bird
5 | # Edward Loper
6 | # URL:
7 | # For license information, see LICENSE.TXT
8 |
9 | WEB = $(USER)@shell.sourceforge.net:/home/groups/n/nl/nltk/htdocs
10 |
11 | TEX = preface.tex introduction.tex programming.tex tag.tex chunk.tex data.tex
12 | PDF := $(TEX:.tex=.pdf)
13 |
14 | RST2HTML = ../rst.py --html
15 |
16 | RSYNC_OPTS = -lrtvz -e ssh --relative --cvs-exclude
17 |
18 | .SUFFIXES: .rst .html .tex .pdf
19 |
20 | .PHONY: all clean
21 |
22 | all: $(PDF) index.html
23 |
24 | clean: clean_up
25 | rm -f $(PDF) index.html
26 |
27 | clean_up:
28 | rm -f *.log *.aux *.snm *.vrb *.out *.nav *.toc index*.html
29 |
30 | index.html: index.rst
31 | $(RST2HTML) index.rst > index.html
32 |
33 | .tex.pdf:
34 | pdflatex $<
35 | pdflatex $<
36 | mkdir -p handouts
37 | sed 's/documentclass/documentclass[handout]/' < $< > handouts/$<
38 | pdflatex -output-directory handouts handouts/$<
39 | pdflatex -output-directory handouts handouts/$<
40 |
41 | rsync:
42 | rsync $(RSYNC_OPTS) . $(WEB)/doc/slides/
43 |
--------------------------------------------------------------------------------
/book/reprint1-4.txt:
--------------------------------------------------------------------------------
1 | p115 first line after example 3-3: maximizes s/b minimizes
2 | (from OReilly errata)
3 |
4 | p234 example 6-6:
5 | 6up: for i, word in words: s/b for i, word in enumerate(words):
6 | 5up: classifier.classify(words, i) s/b classifier.classify(punct_features(words, i))
7 | 0up: add final line of code, indented at same level as "if" statement: return sents
8 |
9 | p243 10up formula (1) please insert minus sign between equals and uppercase sigma
10 |
11 | p309 9up "through entire list" s/b "through the entire list"
12 | (from previous list)
13 |
14 | p336 Fig 9-1 should still be a bit smaller please.
15 |
16 | p426 1d: add opening tag on new line, after closing tag:
17 | (from OReilly errata)
18 |
19 |
20 | whale
21 | noun
22 |
23 | any of the larger cetacean mammals having a streamlined
24 | body and breathing through a blowhole on the head
25 | whale.n.02
26 |
27 | <------------------------ NEW TAG HERE
28 | a very large person; impressive in size or qualities
29 | giant.n.04
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/images/precision-recall.fig:
--------------------------------------------------------------------------------
1 | #FIG 3.2
2 | Landscape
3 | Center
4 | Metric
5 | A4
6 | 100.00
7 | Single
8 | -2
9 | 1200 2
10 | 0 32 #808080
11 | 5 1 0 1 1 7 50 -1 -1 0.000 0 0 0 0 6726.731 4118.000 6750 6750 4095 4185 6705 1485
12 | 5 1 0 1 4 7 50 -1 -1 0.000 0 0 0 0 4522.494 6276.731 1890 6300 4455 3645 7155 6255
13 | 1 3 0 4 32 7 60 -1 -1 0.000 1 0.0000 4545 4050 2655 2655 4545 4050 7200 4095
14 | 4 0 1 60 -1 0 18 0.0000 4 180 1680 7875 4005 DOCUMENTS\001
15 | 4 0 1 60 -1 0 18 0.0000 4 180 1500 7875 3555 RETRIEVED\001
16 | 4 0 4 60 -1 0 18 0.0000 4 180 2685 3195 7245 INFORMATION NEED\001
17 | 4 0 0 60 -1 0 18 0.0000 4 180 1035 5265 2700 irrelevant\001
18 | 4 0 0 60 -1 0 18 0.0000 4 180 870 2835 4950 relevant\001
19 | 4 0 0 60 -1 0 18 0.0000 4 180 1380 2655 5400 not retrieved\001
20 | 4 0 0 60 -1 0 18 0.0000 4 180 345 3240 4500 FN\001
21 | 4 0 0 60 -1 0 18 0.0000 4 180 360 3465 2250 TN\001
22 | 4 0 0 60 -1 0 18 0.0000 4 180 1035 2835 2700 irrelevant\001
23 | 4 0 0 60 -1 0 18 0.0000 4 180 1380 2475 3150 not retrieved\001
24 | 4 0 0 60 -1 0 18 0.0000 4 180 960 5130 3150 retrieved\001
25 | 4 0 0 60 -1 0 18 0.0000 4 180 300 5580 2250 FP\001
26 | 4 0 0 60 -1 0 18 0.0000 4 180 315 4815 4500 TP\001
27 | 4 0 0 60 -1 0 18 0.0000 4 180 960 5130 5400 retrieved\001
28 | 4 0 0 60 -1 0 18 0.0000 4 180 870 4905 4950 relevant\001
29 |
--------------------------------------------------------------------------------
/slides/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. include:: ../definitions.rst
3 |
4 | ========================
5 | NLTK Presentation Slides
6 | ========================
7 |
8 | :Authors: Steven Bird
9 | :Version: |version|
10 | :Copyright: |copy| |copyrightinfo|
11 | :License: |license|
12 |
13 | --------
14 | Contents
15 | --------
16 |
17 | These PDF slides were produced using the LaTeX Beamer package.
18 | The source materials are also made available here; please send
19 | any improvements to Steven Bird, for inclusion in future versions.
20 | (Note these are out-of-date.)
21 |
24 | #. [\ `PDF slides <preface.pdf>`__\ \|\ `LaTeX-Beamer source <preface.tex>`__\ ] Preface
25 | #. [\ `PDF slides <introduction.pdf>`__\ \|\ `LaTeX-Beamer source <introduction.tex>`__\ ] Introduction
26 | #. [\ `PDF slides <programming.pdf>`__\ \|\ `LaTeX-Beamer source <programming.tex>`__\ ] Programming
27 | #. [\ `PDF slides <tag.pdf>`__\ \|\ `LaTeX-Beamer source <tag.tex>`__\ ] Tagging
28 | #. [\ `PDF slides <chunk.pdf>`__\ \|\ `LaTeX-Beamer source <chunk.tex>`__\ ] Chunking
29 | #. [\ `PDF slides <data.pdf>`__\ \|\ `LaTeX-Beamer source <data.tex>`__\ ] Linguistic Data Management
29 |
30 | [`beamer.tex <beamer.tex>`__\ ] File used to build slides.
31 |
32 | ----
33 |
34 | NLTK_
35 |
36 | .. _NLTK: http://nltk.org/
37 |
38 |
39 |
--------------------------------------------------------------------------------
/book/ch05-extras.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. include:: ../definitions.rst
3 |
4 | .. standard global imports
5 |
6 | >>> import nltk, re, pprint
7 |
8 | ==========================================
9 | 5. Categorizing and Tagging Words (Extras)
10 | ==========================================
11 |
12 | Tagging exhibits several properties that are characteristic of natural
13 | language processing. First, tagging involves *classification*: words have
14 | properties; many words share the same property (e.g. ``cat`` and ``dog``
15 | are both nouns), while some words can have multiple such properties
16 | (e.g. ``wind`` is a noun and a verb). Second, in tagging, disambiguation
17 | occurs via *representation*: we augment the representation of tokens with
18 | part-of-speech tags. Third, training a tagger involves *sequence learning
19 | from annotated corpora*. Finally, tagging uses *simple, general methods*
20 | such as conditional frequency distributions and transformation-based learning.
21 |
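These simple, general methods can be made concrete with a short sketch, assuming
the book-era NLTK API (``nltk.ConditionalFreqDist``, and ``nltk.UnigramTagger``
with its ``model`` argument): a conditional frequency distribution over
(word, tag) pairs gives each word's most frequent tag, which is enough to build
a baseline lookup tagger.

    >>> import nltk
    >>> from nltk.corpus import brown
    >>> cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    >>> likely_tags = dict((w, cfd[w].max()) for w in cfd.conditions())
    >>> baseline = nltk.UnigramTagger(model=likely_tags)
    >>> tags = baseline.tag('the cat saw the dog'.split())

A tagger built this way assigns ``None`` to unseen words, so in practice it is
backed off to a default tagger.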
22 |
23 | List of available taggers:
24 | ``http://www-nlp.stanford.edu/links/statnlp.html``
25 |
26 | NLTK's HMM tagger, ``nltk.HiddenMarkovModelTagger``
27 |
28 | [Abney1996PST]_
29 |
30 | ``http://en.wikipedia.org/wiki/Part-of-speech_tagging``
31 |
32 | .. Dutch example: http://www.askoxford.com/pressroom/archive/odelaunch/
33 |
--------------------------------------------------------------------------------
/book/reprint1-1.txt:
--------------------------------------------------------------------------------
1 | Communicated to O'Reilly in July 2009.
2 |
3 | Issues with Figures
4 |
5 | Fig 1.1 -- more contrast (supplied image was color)
6 | Fig 1.3 -- smaller scale
7 | Fig 2.7 -- more contrast (supplied image was color)
8 | Fig 4.3 -- inconsistent arrow style, colliding arrow heads, inconsistent arrow origins
9 | Fig 6.5 -- higher resolution (we need to supply a better image)
10 | Fig 9.1 -- smaller scale (closer in size to example (18) same page), fix broken vbars
11 | Fig 10.3 -- fix horizontal alignment (subtrees rooted at greek letter variables)
12 | Fig 11.4 -- missing subscript on s_1
13 |
14 | Errata reported on O'Reilly site (first two already noted in QC2 annotations)
15 |
16 | p46 2d: file[:4] -> fileid[:4]
17 | p88 3up: print b -> print line
18 | p92 middle: s.titlecase() -> s.title()
19 |
20 | Ch 9: DAGs all scaled too big
21 |
22 | Everywhere: No trees should have boldfaced text (except
23 | the one on p167). (fyi Ch 9 (35) was still inconsistently boldfaced)
24 |
25 | p339 (23) incorrect diagram; it should be:
26 | http://nltk.googlecode.com/svn/trunk/doc/book/ch09.html#ex-dag04
27 |
28 | p355 -- remove box from code block
29 |
30 | p382 (28) -- smaller scale
31 |
32 | bad pagebreaks: xiv, 18, 43, 76, 103, 341, 343
33 |
34 | p395 8up "core", "store" s/b uc in the SEM value of VP
35 |
36 | p467: "deve-test" -> "dev-test"
37 |
38 |
--------------------------------------------------------------------------------
/book/errata2.txt:
--------------------------------------------------------------------------------
1 | Errata still present in the second printing:
2 |
3 | * sec 3.12 exercise 16: should be "from test import monty; print monty"
4 | * sec 4.2: output missing from len(t) -- issue 500
5 |
6 | * sec 4.4: append['noun'] should be append('noun')
7 |
8 | * sec 4.2: Thus, zip() takes the items of two or more sequences and "zips" them together into a single list of pairs.
9 |   s/b "list of tuples". We only get pairs when two sequences are zipped (see the sketch at the end of this file).
10 |
11 | * ch8: the `a`:em: "are" of column `A`:em:
12 | s/b the `a`:em: "are" of column `I`:em:
13 |
14 | * ch4: trie = nltk.defaultdict(dict)
15 | s/b trie = {}
16 |
17 | * ch4: sort them according to their ``path_distance()``
18 | s/b sort them according to their ``shortest_path_distance()``
19 |
20 | * ch4: [len(w) for w in nltk.corpus.brown.sents(categories='news'))]
21 | s/b [len(w) for w in nltk.corpus.brown.sents(categories='news')]
22 |
23 | * ch4: random.randint(0, %d) in vocab" % vocab_size * 2
24 | s/b random.randint(0, %d) in vocab" % (vocab_size * 2)
25 |
26 | * ch1: if the thieves are sold, ... if the paintings are sold.
27 | s/b if the thieves are found, ... if the paintings are found.
28 |
29 | * ch5: print nltk.ConfusionMatrix(gold, test)
30 | s/b print nltk.ConfusionMatrix(gold_tags, test_tags)
31 |
32 | --
33 |
34 | * ch1: correct translation to German would have used "nach" instead of "zu" (page 30)
35 |
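The zip() item above, as a quick doctest (Python 2, where zip() returns a list):

    >>> zip('abc', '123')
    [('a', '1'), ('b', '2'), ('c', '3')]
    >>> zip('abc', '123', 'xyz')
    [('a', '1', 'x'), ('b', '2', 'y'), ('c', '3', 'z')]

With two sequences the tuples happen to be pairs; with three or more they are
not, hence "list of tuples".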
--------------------------------------------------------------------------------
/book/copy-edits.txt:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | ======================
4 | Copy-Edit Categories
5 | ======================
6 |
7 | ----------------------------
8 | 1. To be entered by O'Reilly
9 | ----------------------------
10 |
11 | - punctuation, including use of em dashes
12 | - 'like' -> 'such as'
13 | - 'which' -> 'that'
14 | - 'while' -> 'whereas'
15 | - alternatives to 'below', 'above'
16 | - capitalization of titles and figure legends
17 | - hyphenation conventions in body of text (e.g., 'multi-line' -> 'multiline')
18 | - spelling
19 | - numerals (e.g., '5' -> 'five')
20 |
21 |
22 |
23 | ----------------------------
24 | 2. To be entered by Authors
25 | ----------------------------
26 |
27 | ??
28 |
29 |
30 | ------------
31 | 3. No Change
32 | ------------
33 |
34 | - boldface text retained for new and important terms
35 | - constant width retained for program names, menu names, filenames, etc.
36 |
37 | ------------------
38 | 4. To be discussed
39 | ------------------
40 |
41 | - Retain long captions
42 | - Don't double-quote emphasized text inside captions, but set them in roman. (Why
43 | would emphasis show up as italic inside plain text, and double-quoted inside a
44 | caption?)
45 |
46 |
47 | --------
48 | 5. Other
49 | --------
50 |
51 | Steven mentioned:
52 |
53 | * splitting a sentence-final example off into a new verb-less sentence.
54 |
55 | Not sure what the issue is here.
56 |
--------------------------------------------------------------------------------
/book/reprint1-2.txt:
--------------------------------------------------------------------------------
1 | Communicated to O'Reilly on 2009-12-04
2 |
3 | p9 16d lexical diversity() s/b lexical_diversity() -- with underscore instead of space
4 |
5 | p18 Fig 1-4 or the code that creates it needs to be fixed (currently NLTK does counts, not percentages)
6 |
7 | p46 Fig 2.1 -- more contrast (supplied image was color)
8 |
9 | p132 9up "makes detection is easier" s/b "makes detection easier"
10 |
11 | p144 16up "an empty dictionary" s/b "an empty list"
12 |
13 | p153 3d add quotes around "in-place dictionary"
14 | add following sentence: (Dictionaries will be presented in Section 5.3.)
15 |
16 | p153 bottom and 154 top -- code block spanning page break:
17 | variable "trace" should be renamed to "verbose" x4
18 |
19 | p172 3d "dendogram" s/b "dendrogram"
20 |
21 | p177 ex 33 -- move to chapter 5 (new exercise 43). Change reference "described in chapter 5"
22 | to "described in this chapter"
23 |
24 | p336 Fig 9.1 -- larger scale (closer in size to example (18) same page), fix broken vbars
25 | (reported as too big last time, but now it is too small.)
26 |
27 | p391 6d insert space before "yields"
28 |
29 | p393 8up -- semrel s/b semrep
30 |
31 | p393 5up
32 | exists z3.(ankle(z3) & bite(cyril,z3))
33 | s/b
34 | all z4.(boy(z4) -> see(cyril,z4))
35 |
36 | General: Some readers report that the program line annotations (numbered bullets)
37 | are confusing in their current position. Can they be placed to the left of the line?
38 |
--------------------------------------------------------------------------------
/slides/demos/similar_words_2.py:
--------------------------------------------------------------------------------
1 | ######################################################################
2 | ##
3 | ## What words tend to co-occur?
4 | ##
5 |
6 | from nltk.probability import ConditionalFreqDist
7 | from nltk.corpus import brown
8 |
9 | ######################################################################
10 | def build_association_distribution():
11 | assoc = ConditionalFreqDist()
12 |
13 | # For each document in the "Brown Corpus"...
14 | for document in brown.files():
15 | words = brown.tagged_words(document)
16 |
17 | # For each word that's a noun...
18 | for index, (word, tag) in enumerate(words):
19 | if tag.startswith('N'):
20 |
21 |                 # Look at any nouns in the next 4 words...
22 | window = words[index+1:index+5]
23 | for (window_word, window_tag) in window:
24 | if window_tag.startswith('N'):
25 |
26 | # And add them to our freq. distribution
27 | assoc[word].inc(window_word.lower())
28 |
29 | return assoc
30 |
31 | if 'associations' not in globals():
32 | associations = build_association_distribution()
33 |
34 | ######################################################################
35 | def assoc(word):
36 | print '%20s -> %s' % (word, associations[word].max())
37 |
38 | ######################################################################
39 | assoc('man')
40 | assoc('woman')
41 | assoc('level')
42 |
43 |
44 |
--------------------------------------------------------------------------------
/epydoc.diff:
--------------------------------------------------------------------------------
1 | --- /Library/Python/2.7/site-packages/epydoc/markup/restructuredtext.py~ 2008-01-28 13:15:33.000000000 -0500
2 | +++ /Library/Python/2.7/site-packages/epydoc/markup/restructuredtext.py 2012-09-23 20:59:35.000000000 -0400
3 | @@ -304,10 +304,10 @@
4 | # Extract the first sentence.
5 | for child in node:
6 | if isinstance(child, docutils.nodes.Text):
7 | - m = self._SUMMARY_RE.match(child.data)
8 | + m = self._SUMMARY_RE.match(child)
9 | if m:
10 | summary_pieces.append(docutils.nodes.Text(m.group(1)))
11 | - other = child.data[m.end():]
12 | + other = child[m.end():]
13 | if other and not other.isspace():
14 | self.other_docs = True
15 | break
16 | @@ -489,10 +489,10 @@
17 | if (len(fbody[0]) > 0 and
18 | isinstance(fbody[0][0], docutils.nodes.Text)):
19 | child = fbody[0][0]
20 | - if child.data[:1] in ':-':
21 | - child.data = child.data[1:].lstrip()
22 | + if child[:1] in ':-':
23 | +                child = docutils.nodes.Text(child[1:].lstrip())
24 | elif child.data[:2] in (' -', ' :'):
25 | - child.data = child.data[2:].lstrip()
26 | +                child = docutils.nodes.Text(child[2:].lstrip())
27 |
28 | # Wrap the field body, and add a new field
29 | self._add_field(tagname, arg, fbody)
30 |
--------------------------------------------------------------------------------
/xelatexsymbols.tex:
--------------------------------------------------------------------------------
1 | %&program=xelatex
2 | %&encoding=UTF-8 Unicode
3 |
4 | \newcommand{\as}[1]{{\fontspec{Apple Symbols}#1}}
5 | \newcommand{\asb}[1]{{\fontspec[Scale=1.1]{Apple Symbols}#1}}
6 | \newcommand{\ls}[1]{{\fontspec[Scale=0.9]{Lucida Grande}#1}}
7 | \def\reflect#1{{\setbox0=\hbox{#1}\rlap{\kern0.5\wd0
8 | \special{x:gsave}\special{x:scale -1 1}}\box0 \special{x:grestore}}}
9 | \def\XeLaTeX{\leavevmode
10 | \setbox0=\hbox{X\lower.5ex\hbox{\kern-.15em\reflect{E}}\kern-.0833em \LaTeX}%
11 | \dp0=0pt\ht0=0pt\box0 }
12 |
13 | \documentclass[11pt]{article}
14 | \title{Math Symbols in \XeLaTeX}
15 | \author{Ewan Klein}
16 | \date{\today}
17 | \usepackage{fontspec}
18 | \setromanfont{Palatino}
19 |
20 | \begin{document}
21 | \maketitle
22 | \section{Introduction}
23 |
24 | This file tests direct insertion of unicode characters using cut and paste
25 | from the Mac OS X font book application. As far as I can tell, some of
26 | the characters are only available in the Apple Symbols font, and some
27 | that you might have expected to be in Apple Symbols aren't, but can
28 | be found in Lucida Grande instead.
29 |
30 | I still haven't found a way of producing angle brackets.
31 |
32 |
33 | \begin{itemize}
34 | \item $P$ \as{≐} $R$ \as{∧} $Q$
35 | \item Z ≠ X \as{⊆} Y and X \as{↦} Y
36 | \item X \as{⊑} Y
37 | \item a ≥ b ≤ c < d > e
38 | \item p \ls{→} \as{⊥}
39 | \item 3 \as{╳} 0.2 = (0.3)\as{⋅}2
40 | \item 3 — \as{〈}4, 5 \as{〉}
41 | \item A \as{⊓} B = A if \as{≡} \ls{¬}
42 | \item \asb{∃}x\asb{∀}y.R(x) \as{→} R(y)
43 | \item $x$ \as{∈} $Y$
44 | \end{itemize}
45 |
46 |
47 |
48 | \end{document}
49 |
--------------------------------------------------------------------------------
/book/book.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Natural Language Processing with Python
5 |
6 | 1
7 | 9780596516499
8 |
9 | Steven Bird
10 | Ewan Klein
11 | Edward Loper
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 | NLTK Index
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/slides/demos/similar_words.py:
--------------------------------------------------------------------------------
1 | ######################################################################
2 | ##
3 | ## What words occur in similar contexts?
4 | ##
5 |
6 | from nltk import *
7 | from collections import defaultdict
8 |
9 | ######################################################################
10 | def build_context_map():
11 | """Build a dictionary mapping words in the brown corpus to lists
12 | of local lexical contexts, where a context is encoded as a tuple
13 | (prevword, nextword)."""
14 | context_map = defaultdict(list)
15 | for document in corpus.brown.files():
16 | words = corpus.brown.words(document)
17 | words = [word.lower() for word in words]
18 | for i in range(1, len(words)-1):
19 | prevword, word, nextword = words[i-1:i+2]
20 | context_map[word].append( (prevword, nextword) )
21 | return context_map
22 |
23 | if 'context_map' not in globals():
24 | context_map = build_context_map()
25 |
26 | ######################################################################
27 | def dist_sim(context_map, word, num=6):
28 | """Display words that appear in similar contexts to the given
29 | word, based on the given context map."""
30 | contexts = set(context_map.get(word, ()))
31 |     fd = FreqDist(w for w in context_map
32 | for c in context_map[w]
33 | if c in contexts and w!=word)
34 |
35 | print 'Words similar to %r:' % word
36 | print ' '.join('%10s' % wd for wd in fd.keys()[:num])
37 | print ' '.join('%10s' % fd[wd] for wd in fd.keys()[:num])
38 |
39 | ######################################################################
40 |
41 | dist_sim(context_map, 'man')
42 | dist_sim(context_map, 'woman')
43 | dist_sim(context_map, 'walk')
44 | dist_sim(context_map, 'in')
45 |
--------------------------------------------------------------------------------
/book/ch03-extras.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. include:: ../definitions.rst
3 | .. include:: regexp-defns.rst
4 |
5 | .. standard global imports
6 |
7 | >>> import nltk, re, pprint
8 |
9 | ===============================
10 | 3. Processing Raw Text (Extras)
11 | ===============================
12 |
13 |
14 | as x as y: ``http://acl.ldc.upenn.edu/P/P07/P07-1008.pdf``
15 |
16 | -------------------
17 | Regular Expressions
18 | -------------------
19 |
20 | http://www.regular-expressions.info/ is a useful online resource,
21 | providing a tutorial and references to tools and other sources of
22 | information.
23 |
24 | Unicode Regular Expressions:
25 | http://www.unicode.org/reports/tr18/
26 |
27 | Regex Library:
28 | http://regexlib.com/
29 |
30 |
31 |
32 | #. The above example of extracting (name, domain) pairs from
33 | text does not work when there is more than one email address
34 | on a line, because the ``+`` operator is "greedy" and consumes
35 | too much of the input.
36 |
37 | a) Experiment with input text containing more than one email address
38 | per line, such as that shown below. What happens?
39 | #) Using ``re.findall()``, write another regular expression
40 | to extract email addresses, replacing the period character
41 | with a range or negated range, such as ``[a-z]+`` or ``[^ >]+``.
42 | #) Now try to match email addresses by changing the regular
44 |    expression ``.+`` to its "non-greedy" counterpart, ``.+?``.
44 |
45 | >>> s = """
46 | ... austen-emma.txt:hart@vmd.cso.uiuc.edu (internet) hart@uiucvmd (bitnet)
47 | ... austen-emma.txt:Internet (72600.2026@compuserve.com); TEL: (212-254-5093)
48 | ... austen-persuasion.txt:Editing by Martin Ward (Martin.Ward@uk.ac.durham)
49 | ... blake-songs.txt:Prepared by David Price, email ccx074@coventry.ac.uk
50 | ... """
51 |
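A minimal sketch of what parts (a) and (b) look like on the first line of the
sample above; ``\w``-based character classes stand in here for the ranges
suggested in (b), and the exact pattern from the chapter is not reproduced:

    >>> import re
    >>> line = 'austen-emma.txt:hart@vmd.cso.uiuc.edu (internet) hart@uiucvmd (bitnet)'
    >>> re.findall(r'(.+)@(.+)', line)      # greedy: a single bogus match
    [('austen-emma.txt:hart@vmd.cso.uiuc.edu (internet) hart', 'uiucvmd (bitnet)')]
    >>> re.findall(r'(\w+)@([\w.]+)', line) # narrowed classes: both addresses
    [('hart', 'vmd.cso.uiuc.edu'), ('hart', 'uiucvmd')]

The greedy ``.+`` consumes everything up to the last ``@`` on the line;
restricting what may appear on either side of ``@`` keeps each match local.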
--------------------------------------------------------------------------------
/LSA325/log_lc_and_functions.txt:
--------------------------------------------------------------------------------
1 | List Comprehensions & Functions
2 | ===============================
3 |
4 | Find the first letter of each word:
5 |
6 | >>> words = 'this is a short sentence'.split()
7 | >>> [word[0] for word in words]
8 | ['t', 'i', 'a', 's', 's']
9 |
10 | Convert each word in a list to lower case:
11 |
12 | >>> words = 'This sentence has some Capitalized words'.split()
13 | >>> [word.lower() for word in words]
14 | ['this', 'sentence', 'has', 'some', 'capitalized', 'words']
15 |
16 | Define a function that counts the number of vowels in a word:
17 |
18 | >>> def vowels(word):
19 | ... v = 0
20 | ... for char in word:
21 | ... if char in 'aeiouAEIOU':
22 | ... v = v + 1
23 | ... return v
24 |
25 | Use the new 'vowels' function to find the number of vowels in each
26 | word from a word list:
27 |
28 | >>> [vowels(word) for word in words]
29 | [1, 3, 1, 2, 5, 1]
30 |
31 | Define a function that finds the average of a list of numbers.
32 |
33 | >>> def avg(numbers):
34 | ... return sum(numbers) / float(len(numbers)) # [1]
35 |
36 | [1] Note that we used 'float()' to convert the denominator from an
37 |     integer to a real number, since dividing one integer by another
38 |     discards the fractional part rather than rounding:
39 |
40 | >>> print 10/3
41 | 3
42 |
43 | Find the average number of vowels in the word list.
44 |
45 | >>> print avg([vowels(word) for word in words])
46 | 2.16666666667
47 |
48 | Find the average length of words that begin with a vowel:
49 |
50 | >>> from nltk.corpus import brown
51 | >>> words = brown.tokenized('a', group_by_sent=False)
52 | >>> # Select only those words that start with a vowel:
53 | >>> words = [w for w in words if w[0].lower() in 'aeiouy']
54 | >>> # Find the length of each word:
55 | >>> lengths = [len(w) for w in words]
56 | >>> # Get the average:
57 | >>> print avg(lengths)
58 | 4.00069249257
59 |
--------------------------------------------------------------------------------
/book/guidelines.txt:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. include:: ../definitions.rst
3 |
4 |
5 | =========================
6 | NLTK Developer Guidelines
7 | =========================
8 |
9 | -----------------
10 | Design Philosophy
11 | -----------------
12 |
13 | http://en.wikipedia.org/wiki/Worse_is_better
14 |
15 |
16 | ------------------
17 | Naming Conventions
18 | ------------------
19 |
20 | #. Packages: These are typically verbs, all lowercase letters. When
21 | whole packages are imported, NLP processing tasks have very readable
22 | names, e.g. ``tokenize.whitespace()``, ``tag.ngram()``.
23 |
24 | #. Modules: These are lowercase words; multi-word names are joined
25 | without punctuation, e.g. ``parse.featurestructure``.
26 |
27 | #. Classes: These are uppercase-initial words, e.g. ``Chart``.
28 | Multiple words are joined together, with an uppercase letter to
29 | begin each new word, e.g. ``PorterStemmer``.
30 |
31 | #. Functions and Variables: These are all lowercase, with underscores
32 |    to separate multiple words.
33 |
34 | #. Constants: These are all-caps.
35 |
36 | ------------------
37 | Python Conventions
38 | ------------------
39 |
40 | New-style Python classes (all ultimately inherit from ``object``).
41 |
42 | Each module contains demo code, which can be run from the command line.
43 | This demo code should also be self-contained (i.e. contain its own
44 | import statements), so that someone can cut and paste it into a new
45 | file and run it without modification.
46 |
47 | Each package's __init__.py file should import all the package's
48 | modules, so that everything in a package can be accessed by importing
49 | that package.
50 |
51 | Indentation: tab = 4 spaces
52 |
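A sketch of the demo-code convention described above (the module name and its
contents are hypothetical; only the shape matters)::

    # mymodule.py
    def demo():
        # Self-contained: does its own imports, so the body can be cut
        # and pasted into a fresh file and run unchanged.
        import nltk
        fd = nltk.FreqDist('to be or not to be'.split())
        print fd.max()

    if __name__ == '__main__':
        demo()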
53 | ----------
54 | Commenting
55 | ----------
56 |
57 | Detailed module-level docstring; epydoc docstrings; ...
58 |
59 | -----------------
60 | Repository Access
61 | -----------------
62 |
63 | NLTK developers (people with write access to the repository) are
64 | welcome to contribute and maintain their own code in
65 | ``nltk.contrib``, but should not touch any other files.
66 | This is because the core developers need to be responsible for
67 | ensuring that everything works.
68 |
69 | NLTK core developers can modify any files.
70 |
71 |
72 |
73 | .. include:: footer.rst
74 |
--------------------------------------------------------------------------------
/book/DOCUTILS:
--------------------------------------------------------------------------------
1 | HIGH PRIORITY
2 |
3 | (without these it's painful to continue with docutils)
4 |
5 | book
6 | - inclusions of individual chapters
7 | - single pagination across all chapters
8 | - evidence in repository of earlier effort on this, but currently broken
9 |
10 | xrefs
11 | - assign a symbolic label to any section
12 | - refer to this label anywhere in the chapter (or book)
13 | - should be able to do this for figures etc
14 |
15 | MEDIUM PRIORITY
16 |
17 | (without these we are forced to do some major rehabilitation on the
18 | latex output, or ugly scripts to hack it)
19 |
20 | bibliography
21 | - storage of bib data (bibtex?)
22 | - inline citation of key expands to human-readable citation
23 | - generation of chapter or book-level bibliography section
24 |
25 | index
26 | - construct an index of all "dt" (defined terms)
27 | - permit other terms to be indexed (e.g. text role "idx")
28 | - index topical terms embedded in text but not displayed (e.g. text role "topic")
29 | - index refers to pages on which these appeared
30 |
31 | feature structures
32 | - map the existing ReST syntax into Manning's avm.sty syntax
33 |
34 | text substitutions
35 | - we need to allow these to be interpreted inside non-literal roles
36 | such as :math: and :dt:
37 |
38 | example numbering
39 | - tweak the ex environment so that it will allow include-ed program
40 | fragments to be numbered and indented in the same way as trees
41 |
42 | logical paragraphs
43 | - when a paragraph contains an example, the text following the
44 | example is a continuation of the paragraph, not a new paragraph
45 | (cf HTML output for notes that contain doctest examples, as in
46 | example in section on conditional expressions here:
47 | http://nltk.sourceforge.net/lite/doc/en/programming.html )
48 |
49 | LOW PRIORITY
50 |
51 | (without these we have to do a bit more work in the rst
52 | source, or do minor last-minute hacking before submitting CRC)
53 |
54 | table-of-contents
55 | - collate section and subsection headings
56 | - tabulate, with page numbers
57 | - some control over depth of subsections to include
58 |
59 | aligned glossed examples
60 | - design syntax which will map into an appropriate LaTeX macro
61 | (which one? Covington?)
62 | (and ideally into a specialized class of HTML table)
63 |
64 | low-level formatting issues
65 | - doctest-ignore directive causes following code block to be unindented
66 |
67 |
--------------------------------------------------------------------------------
/book/regexp-defns.rst:
--------------------------------------------------------------------------------
1 | .. ifndef:: regexp_defns
2 |
3 | .. def:: regexp_defns
4 |
5 | .. |s.ng| replace:: |l|\ ``s.ng``\ |r|
6 | .. |.| replace:: |l|\ ``.``\ |r|
7 | .. |....zy| replace:: |l|\ ``....zy``\ |r|
8 | .. |....berry| replace:: |l|\ ``....berry``\ |r|
9 | .. |t...| replace:: |l|\ ``t...``\ |r|
10 | .. |colou?r| replace:: |l|\ ``colou?r``\ |r|
11 | .. |e-?mail| replace:: |l|\ ``e-?mail``\ |r|
12 | .. |patt| replace:: |l|\ ``patt``\ |r|
13 | .. |coo+l| replace:: |l|\ ``coo+l``\ |r|
14 | .. |f.+f| replace:: |l|\ ``f.+f``\ |r|
15 | .. |.+ed| replace:: |l|\ ``.+ed``\ |r|
16 | .. |.*gnt.*| replace:: |l|\ ``.*gnt.*``\ |r|
17 | .. |[aeiou]| replace:: |l|\ ``[aeiou]``\ |r|
18 | .. |[uoiea]| replace:: |l|\ ``[uoiea]``\ |r|
19 | .. |[^aeiou]| replace:: |l|\ ``[^aeiou]``\ |r|
20 | .. |p[aeiou]t| replace:: |l|\ ``p[aeiou]t``\ |r|
21 | .. |p[aeiou]+t| replace:: |l|\ ``p[aeiou]+t``\ |r|
22 | .. |NN.*| replace:: |l|\ ``NN.*``\ |r|
23 | .. |.*| replace:: |l|\ ``.*``\ |r|
24 | .. |123|456| replace:: |l|\ ``123|456``\ |r|
25 | .. |12(3|4)56| replace:: |l|\ ``12(3|4)56``\ |r|
26 | .. |[a-z]| replace:: |l|\ ``[a-z]``\ |r|
27 | .. |[^a-z]| replace:: |l|\ ``[^a-z]``\ |r|
28 | .. |[a-zA-Z]| replace:: |l|\ ``[a-zA-Z]``\ |r|
29 | .. |t[a-z][a-z][a-z]| replace:: |l|\ ``t[a-z][a-z][a-z]``\ |r|
30 | .. |[A-Z][a-z]*| replace:: |l|\ ``[A-Z][a-z]*``\ |r|
31 | .. |20[0-4][0-9]| replace:: |l|\ ``20[0-4][0-9]``\ |r|
32 | .. |[b-df-hj-np-tv-z]+| replace:: |l|\ ``[b-df-hj-np-tv-z]+``\ |r|
34 | .. |^[A-Za-z]+| replace:: |l|\ ``^[A-Za-z]+``\ |r|
35 | .. |^[^ ]+| replace:: |l|\ ``^[^ ]+``\ |r|
36 | .. |[a-z]*s$| replace:: |l|\ ``[a-z]*s$``\ |r|
37 | .. |^$| replace:: |l|\ ``^$``\ |r|
38 | .. |*| replace:: |l|\ ``*``\ |r|
39 | .. |*?| replace:: |l|\ ``*?``\ |r|
40 | .. |<.*>| replace:: |l|\ ``<.*>``\ |r|
41 | .. |NN.*|JJ.*|DT| replace:: |l|\ ``NN.*|JJ.*|DT``\ |r|
42 | .. |dwelling|domicile|abode| replace:: |l|\ ``dwelling|domicile|abode``\ |r|
43 |
44 |
--------------------------------------------------------------------------------
/book/reprint1-3.txt:
--------------------------------------------------------------------------------
1 | p177 ex 33 -- move to chapter 5 (new exercise 43). Change reference "described in chapter 5"
2 | to "described in this chapter"
3 |
4 | p306 17up "The advantages of shift-reduce" s/b "The advantage of shift-reduce"
5 |
6 | p309 9up "through entire list" s/b "through the entire list"
7 |
8 | p309 13-14up "Det at wfst[0][1] and N at wfst[1][2], we can add NP to wfst[0][2]" s/b
9 | "Det at wfst[2][3] and N at wfst[3][4], we can add NP to wfst[2][4]"
10 |
11 | p334 10d Delete this whole line, viz "NP[NUM=?n] -> N[NUM=?n]", and close up space.
12 |
13 | p336 Fig 9-1 is too big in the latest pdf. Also, the feature labels shouldn't be bold.
14 |
15 | p340 ex 24 -- s/b smaller for consistency with the other DAGs (cf p339)
16 |
17 | p342 DAG (27a) is incorrect. It should look just like (27c) but *without* the middle arc
18 | labeled 'CITY'. (The online version of this chapter is correct, and uses dag04-1.png
19 | for this subfigure.)
20 |
21 | p363 21d -- node['sem'] s/b node['SEM']
22 | NB This is http://www.oreillynet.com/cs/nl/edit/errata/40392
23 |
24 | p389 17up "nltk.Variable('z')" s/b "nltk.sem.Variable('z')"
25 |
26 | p373 19d "such as or ." s/b
27 | "such as or >."
28 | NB This is http://www.oreillynet.com/cs/nl/edit/errata/39295
29 |
30 | p392 6d "nltk.ApplicationExpression(tvp, np)" s/b
31 | "nltk.sem.ApplicationExpression(tvp, np)"
32 |
33 | p396 20up "trees[0].node['sem']" s/b "trees[0].node['SEM']"
34 |
35 | p399 4d Det[NUM=sg,SEM=<\P Q.([x],[]) + P(x) + Q(x)>] -> 'a'
36 | s/b
37 | Det[NUM=sg,SEM=<\P Q.(([x],[]) + P(x) + Q(x))>] -> 'a'
38 |
39 |
40 | p400 20d "trees[0].node['sem'].simplify()" s/b
41 | "trees[0].node['SEM'].simplify()"
42 |
43 | p405-406 exs (5)-(7). Please replace all seven occurrences of
44 | "nltk.ApplicationExpression" with
45 | "nltk.sem.ApplicationExpression".
46 |
47 | p426 The error report at http://www.oreillynet.com/cs/nl/edit/errata/39424
48 | looks correct to me (EK).
49 |
50 | p429 11-12d sentence beginning with "Ignoring...", please replace with
51 | the following (and set "OTH" in cw):
52 |
53 | Ignoring the entries for exchanges between people
54 | other than the top 5 (labeled OTH), the largest value suggests
55 | that Portia and Bassanio have the most significant interactions.
56 |
57 | p444 7d can never been known s/b can never be known
58 |
--------------------------------------------------------------------------------
/LSA325/assignment4.txt:
--------------------------------------------------------------------------------
1 | Grammar development
2 |
3 | For this assignment, you will be editing a file named
4 | 'assignment4.py', which you can download from:
5 |
6 | http://nltk.org/temp/assignment4.py
7 |
8 | Please rename this file to
9 | 'assignment4-.py' before submitting it.
10 |
11 |
12 | 1. Choose a linguistic phenomenon of interest that you would like to
13 | model using a grammar. Here are some ideas, or use your own:
14 |
15 | - Noun modifiers
16 | ("slow cat", "very slow cat")
17 | - Comparative expressions
18 | ("bigger than a breadbox", "less heavy than ...")
19 | - Sentential complements
20 | ("I think that you know that ...")
21 | - Quantifiers
22 | ("For every boy, some girl ...")
23 |
24 | 2. Choose 4-5 example sentences, and add them to the 'sent_corpus'
25 | variable. This variable contains a list of sentences, one per
26 | line.
27 |
28 | 3. Add grammar and lexical rules to the 'grammar' variable to cover
29 | your example sentences.
30 |
31 | 4. Run the program, and check the parse trees you get. Were there any
32 | extra parse trees you weren't expecting? Were there any sentences
33 | that failed to parse?
34 |
35 | 5. Refine your grammar until it covers your example sentences. If
36 | possible, your grammar should not produce extra unintended parse
37 | trees. (But for some linguistic phenomena, this might not be
38 | possible!)
39 |
40 | 6. Once you're happy with the output parse trees, copy them to the
41 | 'tree_corpus' variable. Do NOT copy the sentence strings -- just
42 | the tree expressions. (If your grammar generates extra unintended
43 | parse trees, don't include them.) You can then delete these
44 | sentences from 'sent_corpus'. Run the program again, and it will
45 | automatically check to make sure that the intended parse trees are
46 | getting generated. This way you won't have to keep checking them
47 | by hand if you choose to do the optional step (7).
48 |
49 | 7. Optional: return to step 2 (as many times as you like).
50 |
51 | 8. Use the 'comments' variable to write a short comment about grammar
52 | development -- was it easier or harder than you thought? How hard
53 | do you think it would be to merge the grammar that you developed
54 | with some of your classmates' grammars, that were designed to
55 | handle other linguistic phenomena? What problems might come up
56 | when merging grammars?
57 |
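For orientation, here is a rough, self-contained sketch of the kind of grammar
step 3 asks for, modelling noun modifiers, in the book-era API
(``nltk.parse_cfg`` and ``nltk.ChartParser``; the actual 'grammar' and
'sent_corpus' variables live in assignment4.py, which is not reproduced here):

    >>> import nltk
    >>> grammar = nltk.parse_cfg("""
    ...     S -> NP VP
    ...     NP -> Det Nom
    ...     Nom -> AdjP Nom | N
    ...     AdjP -> Deg AdjP | Adj
    ...     VP -> V NP
    ...     Det -> 'the' | 'a'
    ...     Deg -> 'very'
    ...     Adj -> 'slow'
    ...     N -> 'cat' | 'dog'
    ...     V -> 'chased'
    ... """)
    >>> parser = nltk.ChartParser(grammar)
    >>> trees = parser.nbest_parse('the very slow cat chased a dog'.split())
    >>> len(trees)    # one parse, no unintended extras
    1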
--------------------------------------------------------------------------------
/book/intro-outline.txt:
--------------------------------------------------------------------------------
1 |
2 | NB. M-x outline-mode
3 | NB. [N] = new stuff, [?] = needs discussion, maybe omit or merge
4 | NB. Section titles may have been changed, or inserted to make
5 | structure clearer.
6 |
7 | ------------------------------------------------------
8 |
9 | * Some things you can do with NLP
10 |
11 | ** Examples
12 | NLP for linguistic research
13 | NLP for applications
14 |
15 | ** What do we mean by NLP?
16 | terminology: CL/HLT etc
17 |
18 | ** Audience and Goals [N]
19 |
20 |
21 | * The Language Challenge
22 |
23 | ** NL is rich and complex and difficult
24 |
25 | *** Language is integral to culture
26 |
27 | *** Language is intertwined in modern technology [?]
28 | [not quite clear what the message is here]
29 |
30 | ** How NLP can make a difference
31 |
32 | *** Information overload for ordinary people
33 |
34 | Text and multimedia on WWW
35 |
36 | QA example
37 |
38 | *** Information overload for professionals [N]
39 | biomedical example
40 |
41 | *** But NLP is still limited in what it can do
42 |
43 | * Overview of NLP
44 |
45 | ** One ideal: Intelligence and Turing Test
46 | example of spoken dialogue
47 |
48 | ** More realistic: not so intelligent NLP
49 |
50 | Other examples of NLP applications
51 |
52 | ** Brief history of NLP
53 | [this seems a bit tricky -- what needs to be included here? so far,
54 | it seems a bit arbitrary]
55 |
56 | *** Formal language theory
57 | --> computational syntax
58 |
59 | *** Formal logic
60 | --> automated inference
61 |
62 | *** Formal semantics and compositionality [?]
63 | [not really at the same level; maybe merge in with preceding chunk]
64 |
65 | *** Domain dependence
66 | e.g. semantic grammars
67 |
68 | *** Machine learning [N]
69 |
70 |
71 | ** Philosophical Perspective
72 | [maybe push this up a level?]
73 |
74 | *** Rationalism vs Empiricism
75 | schools of linguistics
76 |
77 | *** Realism vs Idealism [?]
78 | God's truth vs. hocus pocus
79 | [maybe drop this?]
80 |
81 | * Architecture of Linguistic and NLP Systems
82 |
83 | ** Modularity
84 | Generative grammars [link back to formal language theory]
85 |
86 | ** Competence and Performance
87 |
88 | *** NLP aspects
89 | simple parsing example
90 |
91 | *** Cognitive aspects [N]
92 | human sentence processing -- say that it's out of scope
93 |
94 | *** Spoken Dialogue system
95 |
96 | language resources / static
97 | processing tools / dynamic
98 |
99 | * Outline of book [N]
100 |
--------------------------------------------------------------------------------
/howto/Makefile:
--------------------------------------------------------------------------------
1 | # NLTK: Doctest Makefile
2 | #
3 | # Copyright (C) 2001-2012 NLTK Project
4 | # Author: Edward Loper
5 | # Steven Bird
6 | # URL:
7 | # For license information, see LICENSE.TXT
8 |
9 | # File locations:
10 | DOCTEST_SRC = ../../nltk/nltk/test
11 | DOCTESTS = $(wildcard $(DOCTEST_SRC)/*.doctest)
12 | PYSRC = $(shell find ../../nltk/nltk -name '*.py')
13 | HTML = $(DOCTESTS:$(DOCTEST_SRC)/%.doctest=%.html)
14 | ERRS = $(DOCTESTS:$(DOCTEST_SRC)/%.doctest=%.errs)
15 | COVERAGE = $(DOCTESTS:$(DOCTEST_SRC)/%.doctest=%.coverage)
16 | PYTHONPATH = ../../
17 | export PYTHONPATH
18 |
19 | # Converting rst->html:
20 | RST = ../rst.py
21 | RST2HTML = $(RST) --html --css ../nltkdoc.css
22 |
23 | # Testing:
24 | PYTHON = python
25 | DOCTEST = $(PYTHON) ../../nltk/nltk/test/doctest_driver.py
26 | DOCTEST_FLAGS = --udiff
27 |
28 | PUBLISH = ../../nltk.github.com
29 |
30 | .PHONY: all html coverage
31 |
32 | #all: html errs coverage
33 | html: $(HTML) #index.html
34 | errs: $(ERRS)
35 | @echo Failed doctests:
36 | @grep 'FAILED (failures' *.errs |sed 's/\(.*\)\.errs:.*/ - \1/'
37 | coverage: coverage/index.html
38 |
39 | coverage/index.html: coverage-list.txt coverage.txt
40 | $(RST2HTML) coverage.txt -o $@
41 |
42 | coverage-list.txt: $(COVERAGE) ./show_coverage.py
43 | python ./show_coverage.py $(COVERAGE)
44 |
45 | %.coverage: %.errs
46 | @true
47 | %.errs: $(DOCTEST_SRC)/%.doctest $(PYSRC)
48 | $(DOCTEST) $(DOCTEST_FLAGS) $(DOCTEST_SRC)/$*.doctest \
49 | --coverage=$*.coverage > $*.errs 2>&1
50 |
51 | %.html: $(DOCTEST_SRC)/%.doctest
52 | $(RST2HTML) $< -o $@
53 |
54 | $(DOCTEST_SRC)/%.doctest: $(DOCTEST_SRC)/%.doctest_latin1
55 | iconv -f iso8859-1 -t utf8 $< > $@
56 |
57 | index.html: index.txt #test-list.txt
58 | $(RST2HTML) index.txt
59 | # cat index.txt |sed s/test-list.txt/test-list-sort-title.txt/ \
60 | # >sort-title.txt
61 | # cat index.txt |sed s/test-list.txt/test-list-sort-lines.txt/ \
62 | # >sort-lines.txt
63 | # cat index.txt |sed s/test-list.txt/test-list-sort-tests.txt/ \
64 | # >sort-tests.txt
65 | # cat index.txt |sed s/test-list.txt/test-list-sort-outcome.txt/ \
66 | # >sort-outcome.txt
67 | # $(RST2HTML) sort-title.txt
68 | # $(RST2HTML) sort-lines.txt
69 | # $(RST2HTML) sort-tests.txt
70 | # $(RST2HTML) sort-outcome.txt
71 |
72 | test-list.txt: update_list.py $(ERRS)
73 | python update_list.py
74 |
75 | clean:
76 | rm -f `find . -name '*.html'`
77 | rm -f `find . -name '*.errs'`
78 |
79 | clean_up:
80 | true # nothing to do.
81 |
82 | publish:
83 | #cp *.html *.errs *.coverage $(PUBLISH)/howto/
84 | cp *.html $(PUBLISH)/howto/
85 |
--------------------------------------------------------------------------------
/LSA325/LSA325_3_handout.tex:
--------------------------------------------------------------------------------
1 | \documentclass[a4paper]{article}
2 |
3 | \begin{document}
4 |
5 | \begin{center}
6 | {\Large LSA 325, Class 3, Thu 12th July}
7 | \end{center}
8 | \section*{Topics}
9 |
10 | \begin{itemize}
11 | \item Partial Parsing and Interpretation
12 | \item Chunking
13 | \begin{itemize}
14 | \item What is a chunk?
15 | \item \texttt{chunk.Regexp}
16 | \item Data-driven approaches
17 | \item Chunking as Tagging
18 | \end{itemize}
19 | \end{itemize}
20 |
21 | \subsection*{Class Materials}
22 | \begin{itemize}
23 | \item The second installment of the NLTK book will be available in the
24 | Stanford Bookstore soon, either tomorrow or Monday 16 July.
25 | \item We will also distribute two further chapters in class next Monday:
26 | \begin{enumerate}
27 | \item \textit{Linguistic Data Management}
28 | \item Updated version of the \textit{Chunking} chapter. We hope it
29 |   will be slightly easier for you to read, but there is no significant
30 | difference in the code; so it is mainly relevant for the
31 | \textit{NLTK Book Review Assignment}.
32 | \end{enumerate}
33 |
34 | \end{itemize}
35 |
36 | \section*{Practical Stuff}
37 |
38 | \noindent
39 | Data Exploration:
40 | \begin{verbatim}
41 | >>> for tree in corpus.conll2000.read('train', chunk_types=('NP',))[:5]:
42 | ... print tree
43 | \end{verbatim}
44 |
45 | \noindent
46 | Write some rules (in a file):
47 | \begin{verbatim}
48 | grammar = r"""
49 | NP: {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns
50 |     {<NNP>+}         # chunk sequences of proper nouns
51 | """
52 | cp = chunk.Regexp(grammar)
53 | \end{verbatim}
54 |
55 | \noindent
56 | Examine results:
57 | \begin{verbatim}
58 | for tree in corpus.conll2000.read('train', chunk_types=('NP',))[:2]:
59 | print cp.parse(tree.flatten(), trace=1)
60 | \end{verbatim}
61 |
62 | \noindent
63 | Evaluate your grammar:
64 | \begin{verbatim}
65 | print chunk.accuracy(cp, corpus.conll2000.chunked('test', chunk_types=('NP',)))
66 | \end{verbatim}
67 |
68 | \noindent
69 | Iterate until accuracy = 100\%.
70 |
71 | \subsection*{Chunking Assignment}
72 |
73 | \begin{enumerate}
74 | \item Explore what kind of sequences are annotated as VP in the
75 | CONLL2000 \texttt{train} data.
76 | \item Develop a \texttt{chunk.Regexp} grammar to capture the regularities.
77 | \item Use trace to examine the success of your rules.
78 | \item Once you are reasonably happy, try evaluating your rules against
79 | the CONLL2000 \texttt{test} data.
80 | \item Briefly comment on how easy or difficult it was to develop an
81 | adequate rule set.
82 |
83 | \end{enumerate}
84 |
85 |
86 |
87 |
88 | \end{document}
89 |
90 | %%% Local Variables:
91 | %%% mode: latex
92 | %%% TeX-master: t
93 | %%% End:
94 |
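The handout's ``chunk.Regexp`` and ``corpus.conll2000.read`` calls use the
2007-era API; for reference, a sketch of the same workflow in the later,
book-era API (not a solution to the assignment):

    >>> import nltk
    >>> grammar = r"""
    ...   NP: {<DT>?<JJ>*<NN>}   # chunk determiners, adjectives and nouns
    ...       {<NNP>+}           # chunk sequences of proper nouns
    ... """
    >>> cp = nltk.RegexpParser(grammar)
    >>> test_sents = nltk.corpus.conll2000.chunked_sents('test.txt',
    ...     chunk_types=['NP'])
    >>> print cp.evaluate(test_sents)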
--------------------------------------------------------------------------------
/book/introduction-code.txt:
--------------------------------------------------------------------------------
1 |
2 | >>> from nltk.corpora import cmudict
3 | >>> from string import join
4 | >>> for word, num, pron in cmudict.raw():
5 | ... stress_pattern = join(c for c in join(pron) if c in "012")
6 | ... if stress_pattern.endswith("1 0 0 0 0"):
7 | ... print word, "/", join(pron)
8 |
9 |
10 |
11 |
12 | >>> from nltk.corpora import shoebox
13 | >>> from nltk.utilities import MinimalSet
14 | >>> length, position, min = 4, 1, 3
15 | >>> lexemes = [field[1].lower() for entry in shoebox.raw('rotokas.dic')
16 | ... for field in entry if field[0] == 'lx']
17 | >>> ms = MinimalSet()
18 | >>> for lex in lexemes:
19 | ... if len(lex) == length:
20 | ... context = lex[:position] + '_' + lex[position+1:]
21 | ... target = lex[position]
22 | ... ms.add(context, target, lex)
23 | >>> for context in ms.contexts(3):
24 | ... for target in ms.targets():
25 | ... print "%-4s" % ms.display(context, target, "-"),
26 | ... print
27 |
28 |
29 |
30 | >>> from nltk.corpora import genesis
31 | >>> from nltk.probability import ConditionalFreqDist
32 | >>> from nltk.utilities import print_string
33 | >>> cfdist = ConditionalFreqDist()
34 | >>> prev = None
35 | >>> for word in genesis.raw():
36 | ... word = word.lower()
37 | ... cfdist[prev].inc(word)
38 | ... prev = word
39 | >>> words = []
40 | >>> prev = 'lo,'
41 | >>> for i in range(99):
42 | ... words.append(prev)
43 | ... for word in cfdist[prev].sorted():
44 | ... if word not in words:
45 | ... break
46 | ... prev = word
47 | >>> print_string(' '.join(words))
48 |
49 |
50 |
51 |
52 |
53 | >>> from nltk.corpora import treebank
54 | >>> from string import join
55 | >>> def vp_conj(tree):
56 | ... if tree.node == 'VP' and len(tree) == 3 and tree[1].leaves() == ['but']:
57 | ... return True
58 | ... else:
59 | ... return False
60 | >>> for tree in treebank.parsed_sents():
61 | ... for vp1,conj,vp2 in tree.subtrees(vp_conj):
62 | ... print join(child.node for child in vp1), "*BUT*", join(child.node for child in vp2)
63 |
64 |
65 |
66 | from nltk.corpora import treebank
67 | from string import join
68 | def vp_conj(tree):
69 | if tree.node == 'VP' and len(tree) == 3 and tree[1].leaves() == ['but']:
70 | return True
71 | else:
72 | return False
73 |
74 | def pr(subtree):
75 | return "(%s %s)" % (subtree.node, join(subtree.leaves()))
76 |
77 | for tree in treebank.parsed_sents():
78 | for vp1,conj,vp2 in tree.subtrees(vp_conj):
79 | print join(pr(child) for child in vp1), "*BUT*", join(pr(child) for child in vp2)
80 |
--------------------------------------------------------------------------------
/book/SCHEDULE:
--------------------------------------------------------------------------------
1 | $Id$
2 | ----------------
3 | WRITING SCHEDULE
4 | ----------------
5 |
6 |
7 | 0. Preface
8 |
9 | 0. Python and NLTK
10 |
11 | 1. Introduction
12 |
13 | ------------------------------------------
14 |
15 | PART I: Basics
16 |
17 | Part Intro
18 |
19 | 2. Programming
20 | + more exercises
21 | + checking for coverage
22 | + summary
23 |
24 | 3. Words
25 | + lexical resources
26 | + sentence tokenization?
27 | + morphological analysis
28 | + Multiword expressions
29 | + summary
30 |
31 | 4. Tagging
32 | + non-Latin tagging example
33 | + n-gram language modeling, smoothing
34 | + move Brill stuff elsewhere
35 | + summary
36 |
37 | 5. Chunk Parsing
38 | + [P] rule format
39 | + summary
40 |
41 | ------------------------------------------
42 |
43 | PART II: Parsing
44 |
45 | Part Intro
46 |
47 | 6. Structured Programming
48 | + XML
49 | + collocations?
50 | + simple extractive summarization?
51 |
52 | 7. Grammars and Parsing
53 | + complete discussion of problems with parsing algorithms
54 | + material on dependencies, dependency grammar (+simple parser?)
55 | + discussion of generation
56 |
57 | 8. Advanced Parsing
58 | + Categorial grammar?
59 |
60 | 9. Feature Based Grammar
61 | + Describe feature structure module (done; but what about featurelite?)
62 |
63 | ------------------------------------------
64 |
65 | PART III: Advanced Topics
66 |
67 | Part Intro
68 |
69 | 10. Advanced Programming
70 | + Unicode, character encoding, XML, web (urlopen), crawling?
71 |
72 | 11. Semantic Interpretation
73 | + feature-based semantics (requires update of parser)
74 | + theta roles, propbank
75 | + Cooper storage (requires list-valued features)
76 |
77 | 12. Language Engineering / Data-intensive NLP
78 | + language id problem?
79 | + language modelling (already some major components here, esp for estimation)
80 | + HMMs
81 | + other machine learning techniques (e.g., Transformation-based learning)
82 | + Naive Bayes classification, clustering
83 | [NER, text classification (& question classification), ontology extraction]
84 | + NLP on the Web
85 | [stuff on RDF?]
86 |
87 | 13. Managing linguistic data
88 | + corpus construction
89 | + OLAC, annotation
90 |
91 | 14. Lexicon and Morphology
92 | + representing lexical information, redundancy
93 | + lexical resources
94 | + comlex
95 | + framenet
96 | + lexical semantics, use of ontologies
97 | + morphology/lexicon interaction
98 | + grammar/lexicon interaction (Levin classes)
99 | + lexical rules, hierarchical lexicon
100 | + multiword expressions, collocations, idioms
101 | --> AT&T WFST toolkit; Python bindings?
102 |
103 | 15. Conclusion
104 | brief pointers on 'hot topics': MT, Spoken Dialogue, QA
105 | -------------------------------------------
106 |
107 | APPENDIXES:
108 |
109 | * Regular Expressions
110 | * Cheat Sheet
111 |
--------------------------------------------------------------------------------
/book/CheckList.txt:
--------------------------------------------------------------------------------
1 | CHAPTER TASK
2 | -----------------------------------------------------------------------------
3 |
4 | 012345...9ABC Hellmann Review
5 | 0123456..9ABC Indurkya Review
6 | 0123456789ABC Munn Review
7 | 012345...9ABC Rhodes Review
8 | 012345.789ABC Schlansker Review
9 | 0123456789ABC Sproat Review
10 |
11 | 012345..89ABC Extended captions so figures and pylistings are self-contained
12 | 012345...9A.C Summary finalized
13 | 012345..8.A.C Further readings finalized
14 | 01..........C Exercises checked for coverage of chapter, sequence
15 | 012345...9ABC Book issues in issue tracker dealt with
16 | 012345...9A.C Doctests checked
17 | 012345......C Roughly uniform use of note blocks and |TRY|
18 | 012345....A.. Ensure no XXX annotations are commenting out a code block
19 | 012345...9A.. No writing left to do
20 |
21 | -----------------------------------------------------------------------------
22 |
23 | 012345...9... Doctest callouts used for referring to locations in code
24 | 012345....... Overflowing lines fixed
25 | 012345.789ABC All numbered displays (figure, pylisting, table) referenced from text
26 | 01........... Typographic changes implemented throughout, e.g. ->, :gc:
27 | 0123456789ABC Consistent URL formatting
28 | 01........... Image scaling ok
29 | 012345.....B. US spell check done
30 | 0............ Check for any more index terms
31 |
32 | -----------------------------------------------------------------------------
33 |
34 | General issues:
35 | * NLTK index and stoplist
36 | * Comments back to O'Reilly in docs/notes.txt
37 | * Code examples would be easier to read if the user input and the
38 | system output were in different fonts -- e.g., bold for user input.
39 | * We're inconsistent about whether to include a blank trailing prompt
40 | (">>>") in our code examples.
41 | * Ensure the URLs are in the following format:
42 | ``http://www.nltk.org`` including those inserted via rst replace such as |NLTK-URL|
43 | * n-gram vs *n*-gram markup
44 |
45 | Outstanding issues:
46 |
47 | ch06 has only one |TRY| exercise
48 | ch07 has a conclusion (non-standard) but no summary
49 | ch07 needs some non-chunking exercises
50 | ch07 could describe SRL in 7.1 as another shallow processing task
51 | ch07 should describe NLTK's off-the-shelf NE tagger
52 | ch07 typography should follow the simplified style of later chapters, e.g. with NP
53 | ch07 only has two |TRY| exercises
54 | ch08 language is more formal than necessary, less accessible than it should be
55 | ch08 typography should no longer use :gc:
56 | ch08 section 8.6 on grammar development is incomplete (incl PE08 discussion)
57 | ch08 assumes knowledge of "head" (did some content disappear?) [it got moved to ch09]
58 | ch09 uses :lex: role, not processed by docbook [`appear`:lex: also in ch03]
59 | ch09 could mention use of trees as source of features for ML
60 | ch09 includes contents of grammar files that have changed in data distribution
61 | ch09 has no |TRY| exercises
62 | ch11 has no |TRY| exercises
63 |
64 |
65 | ch07 -- reorder, put the stuff that we can't actually do (IE, etc) at the end
66 | - move the n-gram and ne classifier earlier, to establish the connection
67 | to chapters 5 and 6.
68 |
--------------------------------------------------------------------------------
/howto/show_coverage.py:
--------------------------------------------------------------------------------
1 |
2 | import sys, os, re
3 | import nltk.test.coverage as coverage
4 | import color_coverage
5 |
6 | OUT_DIR = 'coverage'
7 | MODULE_RE = re.compile(r'nltk.*')
8 |
9 | HEAD = (".. ==========================================================\n"
10 | ".. AUTO-GENERATED LISTING -- DO NOT EDIT!:\n\n"
11 | ".. role:: red\n"
12 | " :class: red\n\n"
13 | ".. role:: yellow\n"
14 | " :class: yellow\n\n"
15 | ".. role:: green\n"
16 | " :class: green\n\n"
17 | ".. container:: doctest-list\n\n"
18 | " .. list-table::\n"
19 | " :class: doctest-list \n"
20 | " :widths: 80 20\n"
21 | " :header-rows: 1\n\n"
22 | " * - Module\n - Coverage\n")
23 | FOOT = (".. END AUTO-GENERATED LISTING\n"
24 | ".. ==========================================================\n")
25 |
26 | def report_coverage(module):
27 | sys.stdout.write(' %-40s ' % module.__name__)
28 | sys.stdout.flush()
29 | (fname, stmts, excluded, missing, fmt_missing, def_info) = (
30 | coverage.analysis3(module))
31 | out = open(os.path.join(OUT_DIR, module.__name__+'.html'), 'wb')
32 | color_coverage.colorize_file(fname, module.__name__, out,
33 | fmt_missing, def_info)
34 | out.close()
35 | if not missing: c = 100
36 | elif stmts: c = 100.*(len(stmts)-len(missing)) / len(stmts)
37 | else: c = 100
38 | sys.stdout.write('%3d%%\n' % c)
39 | return c
40 |
41 | def init_out_dir():
42 | # Create the dir if it doesn't exist.
43 | if not os.path.exists(OUT_DIR):
44 | os.mkdir(OUT_DIR)
45 |
46 | # Make sure it's actually a dir.
47 | if not os.path.isdir(OUT_DIR):
48 | raise ValueError('%s is in the way' % OUT_DIR)
49 |
50 | # Clear its contents.
51 | for filename in os.listdir(OUT_DIR):
52 | os.remove(os.path.join(OUT_DIR, filename))
53 |
54 | def main(filenames):
55 | # Collect the coverage data from the given files.
56 | for filename in filenames:
57 | cexecuted = coverage.the_coverage.restore_file(filename)
58 | coverage.the_coverage.merge_data(cexecuted)
59 |
60 | try: init_out_dir()
61 | except Exception, e:
62 | print 'Unable to create output directory %r: %s' % (OUT_DIR, e)
63 | return
64 |
65 | out = open('coverage-list.txt', 'wb')
66 | out.write(HEAD)
67 |
68 | # Construct a coverage file for each NLTK module.
69 | print '\nGenerating coverage summary files...\n'
70 | print ' %-40s %s' % ('Module', 'Coverage')
71 | print ' '+'-'*50
72 | for module_name, module in sorted(sys.modules.items()):
73 | if module is None: continue
74 | if MODULE_RE.match(module_name):
75 | cover = report_coverage(module)
76 | if cover == 100: color = 'green'
77 | elif cover > 50: color = 'yellow'
78 | else: color = 'red'
79 | out.write(' * - `%s <%s.html>`__\n'
80 | ' - `%d%%`:%s:\n' %
81 | (module_name, module_name, cover, color))
82 | out.flush()
83 |
84 | out.write(FOOT)
85 | out.close()
86 |
87 | if __name__ == '__main__':
88 | main(sys.argv[1:])
89 |
--------------------------------------------------------------------------------
/book/image_scaling.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. include:: ../definitions.rst
3 |
4 | =============
5 | Image Scaling
6 | =============
7 |
8 | Here's a collection of images from the book and a summary of the scaling result for each.
9 |
10 |
11 | ============ ========== ========== ========== ========== =======
12 | Figure LaTeX/PDF HTML Docbook Origin Scale
13 | ============ ========== ========== ========== ========== =======
14 | authors_ too big ok too small photo 150
15 | ------------ ---------- ---------- ---------- ---------- -------
16 | inaugural_ ok too big ok pylab 90
17 | ------------ ---------- ---------- ---------- ---------- -------
18 | maps02_ ok too big ok graffle 22
19 | ------------ ---------- ---------- ---------- ---------- -------
20 | dialogue_ ok too big ok graffle 32
21 | ------------ ---------- ---------- ---------- ---------- -------
22 | structure_ ok too big ok graffle 150
23 | ------------ ---------- ---------- ---------- ---------- -------
24 | tally_ too small too big ok graffle 30
25 | ------------ ---------- ---------- ---------- ---------- -------
26 | lexicon_ too small too big ok graffle 50
27 | ------------ ---------- ---------- ---------- ---------- -------
28 | pipeline_ too small too big ok graffle 40
29 | ------------ ---------- ---------- ---------- ---------- -------
30 | triangle_ too small too big too big graffle 50
31 | ------------ ---------- ---------- ---------- ---------- -------
32 | polish_ ok ok too big screenshot default
33 | ============ ========== ========== ========== ========== =======
34 |
35 |
36 | .. _authors:
37 | .. figure:: ../images/authors.png
38 | :scale: 150
39 |
40 | Edward Loper, Ewan Klein, and Steven Bird, Stanford, July 2007
41 |
42 | .. _inaugural:
43 | .. figure:: ../images/inaugural.png
44 | :scale: 90
45 |
46 | Lexical Dispersion Plot for Words in US Presidential Inaugural Addresses
47 |
48 | .. _maps02:
49 | .. figure:: ../images/maps02.png
50 | :scale: 22
51 |
52 | Dictionary Look-up
53 |
54 | .. _dialogue:
55 | .. figure:: ../images/dialogue.png
56 | :scale: 32
57 |
58 | Simple Pipeline Architecture for a Spoken Dialogue System
59 |
60 | .. _structure:
61 | .. figure:: ../images/text-corpus-structure.png
62 | :scale: 150
63 |
64 | Common Structures for Text Corpora (one point per text)
65 |
66 | .. _tally:
67 | .. figure:: ../images/tally.png
68 | :scale: 30
69 |
70 | Counting Words Appearing in a Text (a frequency distribution)
71 |
72 | .. _lexicon:
73 | .. figure:: ../images/lexicon.png
74 | :scale: 50
75 |
76 | Lexicon Terminology
77 |
78 | .. _pipeline:
79 | .. figure:: ../images/pipeline1.png
80 | :scale: 40
81 |
82 | The Processing Pipeline
83 |
84 | .. _triangle:
85 | .. figure:: ../images/naive-bayes-triangle.png
86 | :scale: 50
87 |
88 | An abstract illustration of the procedure used by the Naive Bayes
89 | classifier to choose the topic for a document.
90 |
91 | .. _polish:
92 | .. figure:: ../images/polish-utf8.png
93 |
94 | Screenshot
95 |
--------------------------------------------------------------------------------
/book/ch01-extras.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. include:: ../definitions.rst
3 |
4 | .. standard global imports
5 |
6 | >>> import nltk, re, pprint
7 |
8 | ==========================================
9 | 1. Language Processing and Python (Extras)
10 | ==========================================
11 |
12 |
13 | --------------------------------------------------------
14 | Websites with Information on Natural Language Processing
15 | --------------------------------------------------------
16 |
17 | Several websites have useful information about |NLP|, including
18 | conferences, resources, and special-interest groups, e.g.
19 | ``www.lt-world.org``, ``www.aclweb.org``, ``www.elsnet.org``.
20 |
21 | The website of the *Association for Computational Linguistics*,
22 | at ``www.aclweb.org``, contains an overview of computational linguistics,
23 | including copies of introductory chapters from recent textbooks.
24 | Wikipedia has entries for |NLP| and its subfields
25 | (but don't confuse natural language processing with
26 | the other |NLP|\ : neuro-linguistic programming).
27 |
28 | ``http://www.statmt.org/``
29 |
30 | ``http://www.aclweb.org/aclwiki/index.php?title=Textual_Entailment_Resource_Pool``
31 |
32 | ----------------------------------
33 | NLP Systems with Online Interfaces
34 | ----------------------------------
35 |
36 | Several |NLP| systems have online interfaces that you might like to
37 | experiment with, e.g.:
38 |
39 | * WordNet: ``http://wordnet.princeton.edu/``
40 | * Translation: ``http://babelfish.yahoo.com/``, ``http://translate.google.com/``
41 | * ChatterBots: ``http://www.loebner.net/Prizef/loebner-prize.html``
42 | * Question Answering: ``http://www.answerbus.com/``
43 | * Summarization: ``http://newsblaster.cs.columbia.edu/``
44 |
45 | Online concordancing:
46 |
47 | * ``http://corpus.leeds.ac.uk/internet.html``
48 |
49 | ------
50 | Python
51 | ------
52 |
53 | A good starting place: http://www.python.org/doc/intros/
54 |
55 | [vanRossum2006IP]_ is a Python tutorial by Guido van
56 | Rossum, the inventor of Python, and Fred Drake, the official
57 | editor of the Python documentation. It is available online at
58 | ``http://docs.python.org/tut/tut.html``. A more detailed but still
59 | introductory text is [Lutz2003LP]_, which covers the essential
60 | features of Python, and also provides an overview of the standard libraries.
61 | A more advanced text, [vanRossum2006PLR]_ is the official reference
62 | for the Python language itself, and describes the syntax of Python and
63 | its built-in datatypes in depth. It is also available online at
64 | ``http://docs.python.org/ref/ref.html``.
65 | [Beazley2006PER]_ is a succinct reference book; although not suitable
66 | as an introduction to Python, it is an excellent resource for
67 | intermediate and advanced programmers.
68 | Finally, it is always worth checking the official *Python
69 | Documentation* at http://docs.python.org/.
70 |
71 | Two freely available online texts are the following:
72 |
73 | * Josh Cogliati, *Non-Programmer's Tutorial for Python*,
74 | ``http://en.wikibooks.org/wiki/Non-Programmer's_Tutorial_for_Python/Contents``
75 |
76 | * Jeffrey Elkner, Allen B. Downey and Chris Meyers,
77 | *How to Think Like a Computer Scientist: Learning with Python* (Second Edition),
78 | ``http://openbookproject.net/thinkCSpy/``
79 |
80 |
81 | .. include:: footer.rst
82 |
--------------------------------------------------------------------------------
/LSA325/lsa325_5.tex:
--------------------------------------------------------------------------------
1 | \documentclass[t]{beamer} % for slides
2 | %\documentclass[handout]{beamer} % for handout
3 | \input{beamer}
4 |
5 | \title{Introduction to Computational Linguistics\\LSA 325}
6 |
7 | \author{Steven Bird \and Ewan Klein \and Edward Loper}
8 | \institute{
9 | University of Melbourne, AUSTRALIA
10 | \and
11 | University of Edinburgh, UK
12 | \and
13 | University of Pennsylvania, USA
14 | }
15 |
16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
17 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
18 |
19 | \begin{document}
20 |
21 |
22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
23 |
24 | \begin{frame}
25 | \titlepage
26 | \end{frame}
27 |
28 |
29 | \begin{frame}
30 |
31 | \frametitle{Compositional Semantics}
32 |
33 | \begin{itemize}
34 | \item Contrast with lexical semantics
35 | \item Meaning of a phrase is a function of the meaning of its parts
36 | \item Truth-conditions: minimum hurdle for a theory of meaning
37 | \item Entailment ($\phi \Rightarrow \psi$): every situation that makes
38 | $\phi$ true also makes $\psi$ true
39 |
40 |
41 | \end{itemize}
42 | \end{frame}
43 |
44 | \begin{frame}
45 |
46 | \begin{exampleblock}{Entailment Examples}
47 | \begin{itemize}
48 | \item \textit{Kim eats toasted bagels} $\Rightarrow$ \textit{Kim eats
49 | bagels}
50 | \item \textit{Lee sings and dances} $\Rightarrow$ \textit{Lee sings}
51 | \item \textit{Lee sings songs to Kim} $\Rightarrow$ \textit{Lee sings
52 | songs to someone}
53 | \item \textit{Kim hates all green vegetables and calabrese is a green
54 | vegetable} $\Rightarrow$ \textit{Kim hates
55 | calabrese}
56 | \end{itemize}
57 | \end{exampleblock}
58 |
59 | \end{frame}
60 |
61 | \begin{frame}
62 |
63 | \frametitle{Truth in a model, version 1}
64 |
65 | \begin{itemize}
66 | \item A model is a pair $\langle D, V\rangle$
67 | \item $V:$ Individual terms $\mapsto$ entities in $D$
68 | \item $V:$ 1-place predicates $\mapsto$ sets of entities
69 | \item $V:$ 2-place predicates (relations) $\mapsto$ sets of pairs of entities
70 | \end{itemize}
71 |
72 | \end{frame}
73 |
74 | \begin{frame}[fragile]
75 |
76 | \frametitle{Truth in a model, version 1}
77 |
78 | \begin{exampleblock}{N-ary Relations}
79 | \begin{verbatim}
80 | ('boy', set(['b1', 'b2'])),
81 | ('chase', set([('b1', 'g1'), ('b2', 'g1'), ('g1', 'd1'), ('g2', 'd2')])),
82 | \end{verbatim}
83 | \end{exampleblock}
84 | \end{frame}
85 |
86 | \begin{frame}
87 |
88 | \frametitle{Truth in a model, version 2}
89 |
90 | \begin{itemize}
91 | \item A model is a pair $\langle D, V\rangle$
92 | \item Individual terms $\mapsto$ entities
93 | \item 1-place predicates $\mapsto$ mappings from entities to truth values
94 | \item 2-place predicates (relations) $\mapsto$ mappings from
95 | entities to the meanings of 1-place predicates
96 | \end{itemize}
97 | \end{frame}
98 |
99 | \begin{frame}[fragile]
100 |
101 | \frametitle{Truth in a model, version 2}
102 |
103 | \begin{exampleblock}{Characteristic Functions}
104 | \begin{verbatim}
105 | 'boy': {'b1': True, 'b2': True},
106 | 'chase': {'d1': {'g1': True},
107 | 'd2': {'g2': True},
108 | 'g1': {'b1': True, 'b2': True}},
109 | \end{verbatim}
110 | \end{exampleblock}
111 | \end{frame}
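111a |
111b | % Editor's sketch, not part of the original deck: the two encodings of
111c | % 'boy' above can be queried the same way in plain Python.
111d | \begin{frame}[fragile]
111e |
111f | \frametitle{Querying the two encodings in Python}
111g |
111h | A minimal sketch, assuming the set and dictionary representations above:
111i |
111j | \begin{exampleblock}{Sets vs.\ Characteristic Functions}
111k | \begin{verbatim}
111l | >>> boy_v1 = set(['b1', 'b2'])          # version 1
111m | >>> boy_v2 = {'b1': True, 'b2': True}   # version 2
111n | >>> 'b1' in boy_v1
111o | True
111p | >>> boy_v2.get('b1', False)
111q | True
111r | \end{verbatim}
111s | \end{exampleblock}
111t | \end{frame}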
112 |
113 |
114 |
115 | \end{document}
116 |
--------------------------------------------------------------------------------
/LSA325/LSA325_5_handout.tex:
--------------------------------------------------------------------------------
1 | \documentclass[12pt]{article}
2 | \usepackage{url,a4wide}
3 |
4 | \begin{document}
5 |
6 | \begin{center}
7 | {\Large LSA 325, Class 6, Mon 23rd July}
8 | \end{center}
9 |
10 |
11 |
12 | \section*{Exercises for class}
13 |
14 | \subsection*{Unification}
15 |
16 | Try the following, and some variations that you make up yourself:
17 | \begin{verbatim}
18 | >>> from nltk import FeatStruct, unify
19 | >>> fs1 = FeatStruct.parse('[AGR=[GND=masc]]')
20 | >>> fs2 = FeatStruct.parse('[AGR=[PER=3]]')
21 | >>> print unify(fs1, fs2)
22 | [ AGR = [ GND = 'masc' ] ]
23 | [ [ PER = 3 ] ]
24 | \end{verbatim}
25 |
26 | \noindent
27 | Now have a go at a few of the unification examples from exercise (2)
28 | in section 9.3.4 of Chapter 9. It's up to you whether you want to try
29 | them with paper and pencil, or dive straight in the Python interpreter
30 |
31 | \subsection*{Feature-based Grammar}
32 | \noindent
33 | Toy grammar of English NPs:
34 | \begin{verbatim}
35 | % start NP
36 | NP[AGR=?a] -> Det[AGR=?a] N[AGR=?a]
37 | Det[AGR=[NUM='sg', PER=3]] -> 'this' | 'that'
38 | Det[AGR=[NUM='pl', PER=3]] -> 'these' | 'those'
39 | Det[AGR=[NUM='pl', PER=1]] -> 'we'
40 | Det[AGR=[PER=2]] -> 'you'
41 | N[AGR=[NUM='sg', GND='m']] -> 'boy'
42 | N[AGR=[NUM='pl', GND='m']] -> 'boys'
43 | N[AGR=[NUM='sg', GND='f']] -> 'girl'
44 | N[AGR=[NUM='pl', GND='f']] -> 'girls'
45 | N[AGR=[NUM='sg']] -> 'student'
46 | N[AGR=[NUM='pl']] -> 'students'
47 | \end{verbatim}
48 |
49 | \noindent
50 | This can be downloaded from
51 | \url{http://nltk.org/temp/np.cfg}. Save it to your current working
52 | directory. At
53 | the same time, download and save the following file to the same directory:
54 | \url{http://nltk.org/temp/fix_featureparser.py}. Start up IDLE in this
55 | directory. Then you can run the
56 | grammar as follows:
57 | \begin{verbatim}
58 | >>> from nltk.book import *
59 | >>> import fix_featureparser
60 | >>> tokens = 'these girls'.split()
61 | >>> cp = parse.load_earley('np.cfg', trace=2)
62 | >>> trees = cp.get_parse(tokens)
63 | >>> for tree in trees: print tree
64 | \end{verbatim}
65 | Alternatively, download these instructions as
66 | \url{http://nltk.org/temp/quickstart.py} and give the command
67 | \begin{verbatim}
68 | import quickstart
69 | \end{verbatim}
70 |
71 | \noindent
72 | Play with the grammar a bit. Now try to write a similar grammar of
73 | your own. You might like to try working with the Spanish data in
74 | Chapter 9; i.e., Exercise 3 in section 9.2.4.
75 |
76 | \subsection*{Semantics}
77 |
78 | Download and save the following file to the same directory:
79 | \url{http://nltk.org/temp/model.py}.
80 |
81 | You can now inspect this first-order model, and play around with the
82 | valuations that it gives. Here are some starting suggestions:
83 | \begin{verbatim}
84 | >>> from model import *
85 | >>> print m
86 | >>> val['walk']
87 | >>> val['walk'][val['john']]
88 | >>> m.evaluate('(walk john)', g)
89 | >>> m.evaluate('((walk john) and (walk fido))', g)
90 | >>> m.evaluate('(chase mary fido)', g)
91 | >>> m.evaluate('(chase fido mary)', g)
92 | >>> g.add('b1', 'x')
93 | >>> print g
94 | g[b1/x]
95 | >>> m.evaluate('(chase fido x)', g)
96 | >>> m.evaluate('(walk x)', g)
97 | \end{verbatim}
98 | Have a look at Chapter 11, and see if you can evaluate some formulas
99 | involving the quantifiers \texttt{some} and \texttt{all}.
100 |
101 |
102 |
103 | \end{document}
104 |
105 | %%% Local Variables:
106 | %%% mode: latex
107 | %%% TeX-master: t
108 | %%% End:
109 |
--------------------------------------------------------------------------------
/book/second-edition.txt:
--------------------------------------------------------------------------------
1 | Natural Language Processing with Python
2 | Proposal for 2nd Edition (Draft 14 Nov 2013)
3 |
4 | We propose a second edition of the book with the following key changes:
5 |
6 | 1. Incorporate material on new developments in the field where they
7 | are sufficiently mature to form part of an introductory textbook, such
8 | as dependency parsing and machine translation.
9 |
10 | 2. Incorporate and systematize popular applications of the toolkit
11 | that have arisen since the first edition was published, such as
12 | sentiment analysis and the semantic web.
13 |
14 | 3. Add new sections on scaling up to several of the chapters, in order
15 | to show readers how to handle larger datasets and how to interface to
16 | specialized industrial-strength tools; this will give readers a clearer
17 | pathway into R&D.
18 |
19 | 4. Incorporate feedback from people who have adopted the book for teaching
20 | (we have identified approximately 100 such courses).
21 |
22 | 5. Update program samples for consistency with Python 3 and NLTK 3; in
23 | many cases the surrounding discussion needs to be updated as well.
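23a |
23b | For example (an illustrative pair, not taken from the book), NLTK 2
23c | code such as
23d |
23e |     >>> fd = nltk.FreqDist()
23f |     >>> for word in text: fd.inc(word)
23g |     >>> print fd['the']
23h |
23i | becomes, under Python 3 and NLTK 3:
23j |
23k |     >>> fd = nltk.FreqDist()
23l |     >>> for word in text: fd[word] += 1
23m |     >>> print(fd['the'])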
24 |
25 | 6. Tighten up existing content to save space; trim sections with the
26 | help of reader feedback; rescale or rework diagrams so they take less
27 | space; possibly hyperlink from electronic versions to online code
28 | samples so that some can be omitted from the book.
29 |
30 |
31 |
32 | Table of Contents (additions marked with "+", deletions with "-", renamings with "*")
33 |
34 | 0. Preface
35 | + converting NLTK 2 to NLTK 3 code
36 |
37 | 1. Language Processing and Python
38 | + Chatbots
39 |
40 | 2. Accessing Text Corpora and Lexical Resources
41 | + Google ngrams corpus
42 | - move lexical resources to lexicon chapter
43 |
44 | 3. Processing Raw Text
45 | + processing twitter feeds
46 |
47 | 4. Writing Structured Programs
48 | + string edit distance
49 |
50 | 5. Categorizing and Tagging Words
51 | + scaling up: interface to stanford tagger
52 |
53 | 6. Learning to Classify Text
54 | + clustering, semi-supervised approaches
55 | + scaling up: map-reduce and NLP in the cloud
56 |
57 | + The Lexicon
58 | + wordnet, framenet
59 | + ontologies and the semantic web
60 | + interface to finite-state morphology toolkit
61 | + word-sense disambiguation
62 | + distributional semantics
63 | + multilingual wordnet
64 |
65 | 7. Extracting Information from Text
66 | + semantic role labeling, VerbNet, and PropBank
67 | + sentiment analysis
68 | + resources for named-entity recognition
69 | + abbreviations
70 | + normalization and grounding of named entities
71 | + scaling up: interface to stanford NER system
72 |
73 | + Machine Translation [new chapter]
74 | + bilingual aligned text
75 | + bitext corpora
76 | + sentence alignment (Gale-Church algorithm)
77 | + word-alignment (IBM models 1-3)
78 | + evaluation
79 |
80 | 8. Analyzing Sentence Structure
81 | * grammar-development -> scaling up: grammar development
82 | 8.6 add material on training dependency grammar
83 | + scaling up: interfacing to stanford parser and maltparser
84 |
85 | 9. Building Feature Based Grammars
86 |
87 | 10. Analyzing the Meaning of Sentences
88 | + machine learning techniques for learning semantic representations
89 | - model theoretic semantics moved into free-standing HOWTO
90 |
91 | 11. Managing Linguistic Data
92 |
93 | 12. Afterword: Facing the Language Challenge
94 |
95 | Omitted:
96 | * textual entailment
97 | * summarization
98 | * generation
99 |
100 |
--------------------------------------------------------------------------------
/images/are.fig:
--------------------------------------------------------------------------------
1 | #FIG 3.2
2 | Landscape
3 | Center
4 | Metric
5 | A4
6 | 100.00
7 | Single
8 | -2
9 | 1200 2
10 | 6 1800 2700 2250 4050
11 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
12 | 1800 2700 2250 2700 2250 3150 1800 3150 1800 2700
13 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
14 | 1800 3150 2250 3150 2250 3600 1800 3600 1800 3150
15 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
16 | 1800 3600 2250 3600 2250 4050 1800 4050 1800 3600
17 | -6
18 | 6 2250 2700 2700 4050
19 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
20 | 2250 2700 2700 2700 2700 3150 2250 3150 2250 2700
21 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
22 | 2250 3150 2700 3150 2700 3600 2250 3600 2250 3150
23 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
24 | 2250 3600 2700 3600 2700 4050 2250 4050 2250 3600
25 | -6
26 | 6 2700 2700 3150 4050
27 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
28 | 2700 2700 3150 2700 3150 3150 2700 3150 2700 2700
29 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
30 | 2700 3150 3150 3150 3150 3600 2700 3600 2700 3150
31 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
32 | 2700 3600 3150 3600 3150 4050 2700 4050 2700 3600
33 | -6
34 | 6 3150 2700 3600 4050
35 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
36 | 3150 2700 3600 2700 3600 3150 3150 3150 3150 2700
37 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
38 | 3150 3150 3600 3150 3600 3600 3150 3600 3150 3150
39 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
40 | 3150 3600 3600 3600 3600 4050 3150 4050 3150 3600
41 | -6
42 | 6 3600 2700 4050 4050
43 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
44 | 3600 2700 4050 2700 4050 3150 3600 3150 3600 2700
45 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
46 | 3600 3150 4050 3150 4050 3600 3600 3600 3600 3150
47 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
48 | 3600 3600 4050 3600 4050 4050 3600 4050 3600 3600
49 | -6
50 | 6 4050 2700 4500 4050
51 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
52 | 4050 2700 4500 2700 4500 3150 4050 3150 4050 2700
53 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
54 | 4050 3150 4500 3150 4500 3600 4050 3600 4050 3150
55 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
56 | 4050 3600 4500 3600 4500 4050 4050 4050 4050 3600
57 | -6
58 | 6 4500 2700 4950 4050
59 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
60 | 4500 2700 4950 2700 4950 3150 4500 3150 4500 2700
61 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
62 | 4500 3150 4950 3150 4950 3600 4500 3600 4500 3150
63 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
64 | 4500 3600 4950 3600 4950 4050 4500 4050 4500 3600
65 | -6
66 | 6 4950 2700 5400 4050
67 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
68 | 4950 2700 5400 2700 5400 3150 4950 3150 4950 2700
69 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
70 | 4950 3150 5400 3150 5400 3600 4950 3600 4950 3150
71 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
72 | 4950 3600 5400 3600 5400 4050 4950 4050 4950 3600
73 | -6
74 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
75 | 5400 3150 5850 3150 5850 3600 5400 3600 5400 3150
76 | 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
77 | 5400 3600 5850 3600 5850 4050 5400 4050 5400 3600
78 | 2 2 0 1 0 0 50 -1 10 0.000 0 0 -1 0 0 5
79 | 5400 2700 5850 2700 5850 3150 5400 3150 5400 2700
80 | 4 1 0 50 -1 0 15 0.0000 4 150 150 2520 4320 B\001
81 | 4 1 0 50 -1 0 15 0.0000 4 150 165 2070 4320 A\001
82 | 4 1 0 50 -1 0 15 0.0000 4 150 150 2970 4320 C\001
83 | 4 1 0 50 -1 0 15 0.0000 4 150 165 3420 4320 D\001
84 | 4 1 0 50 -1 0 15 0.0000 4 150 135 3870 4320 E\001
85 | 4 1 0 50 -1 0 15 0.0000 4 150 120 4320 4320 F\001
86 | 4 1 0 50 -1 0 15 0.0000 4 150 165 4770 4320 G\001
87 | 4 1 0 50 -1 0 15 0.0000 4 150 165 5220 4320 H\001
88 | 4 1 0 50 -1 0 15 0.0000 4 150 75 5580 4320 I\001
89 | 4 1 0 50 -1 0 15 0.0000 4 105 105 1530 2970 a\001
90 | 4 1 0 50 -1 0 15 0.0000 4 150 120 1530 3420 b\001
91 | 4 1 0 50 -1 0 15 0.0000 4 105 105 1530 3870 c\001
92 |
--------------------------------------------------------------------------------
/book/ch01-notes.rst:
--------------------------------------------------------------------------------
1 |
2 | =======================
3 | Computing with Language
4 | =======================
5 |
6 | * not a conventional introduction to programming where we work
7 | through language features one by one
8 | (in fact, features will be introduced in a rather unusual order)
9 | * plenty of such books exist already (including for Python)
10 | * instead, a problem-oriented approach: a series of tasks each requiring some programming,
11 | each building on what has come before (so getting more difficult)
12 | * starting point: we have lots of text and lots of computing cycles: what can we do?
13 | * no prior programming ability assumed, just retyping examples
14 |
15 | ----------------------------------
16 | Searching large quantities of text
17 | ----------------------------------
18 |
19 | * most obvious: searching large amounts of text
20 | * includes functionality for generating random text in this style
21 | * first-hand experience with scale and diversity of corpora
22 |
23 | Questions coming out of this:
24 | * what makes texts different?
25 | * what is a text? seq of characters on a page (does page matter?)
26 | seq of words? seq of chapters made up of seq of paras ...
27 | * our simplification: text = sequence of words (plus punctuation 'words'): "tokens"
28 | * explicit notation: ["the", "cat", "sat", "on", "the", "mat"]
29 | * key concept: TEXT = LIST OF WORDS
30 | * reuse material from 2.4.1
31 |
32 | IDLE session:
33 | * getting started with IDLE
34 | * lists, str.split(), len()
35 | * variables
36 |
37 | -------------------
38 | Counting vocabulary
39 | -------------------
40 |
41 | * one thing that makes texts different is the set of words used (vocabulary)
42 | * vocabulary richness
43 | * defining functions -- allows us to explain what the () are everywhere
44 | and gives inkling of the power of programming
45 | * key concept: VOCABULARY = SET OF WORDS
46 |
47 | IDLE session:
48 | * str.lower()
49 | * defining simple functions (diagram of unary function)
50 |
51 |
52 | >>> sorted(set(word for word in text3 if word.endswith("eth")))
53 | ['Hazarmaveth', 'Heth', 'Japheth', 'Jetheth', 'Seth', 'aileth', 'asketh', 'biteth', 'blesseth', 'breaketh', 'cometh', 'compasseth', 'creepeth', 'crieth', 'curseth', 'divineth', 'doeth', 'drinketh', 'faileth', 'findeth', 'giveth', 'goeth', 'knoweth', 'lieth', 'liveth', 'longeth', 'loveth', 'meeteth', 'moveth', 'needeth', 'pleaseth', 'proceedeth', 'remaineth', 'repenteth', 'seeth', 'sheddeth', 'sheweth', 'slayeth', 'speaketh', 'teeth', 'togeth', 'toucheth', 'twentieth', 'walketh', 'wotteth']
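53a |
53b | Possible target for this session (a sketch; the name vocab_size is
53c | ours, not the chapter's):
53d |
53e | >>> def vocab_size(text):
53f | ...     return len(set(word.lower() for word in text))
53g | >>> vocab_size(["the", "cat", "sat", "on", "the", "mat"])
53h | 5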
54 |
55 |
56 |
57 | -------
58 | Corpora
59 | -------
60 |
61 | * definition
62 | * accessing
63 |
64 |
65 |
66 | --------------
67 | Changing Tense
68 | --------------
69 |
70 | * convert a verb into past tense (perfect)
71 | * motivation?
72 | * key concepts: CONDITIONAL EXPRESSIONS, STRINGS
73 |
74 | IDLE session:
75 | * string concatenation
76 | * string indexing
77 | * conditional expressions
78 | * function past(word) -> past-tense form
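78a |
78b | Sketch of past() for regular verbs only (the e-final rule is
78c | illustrative, not exhaustive):
78d |
78e | >>> def past(word):
78f | ...     if word.endswith('e'):
78g | ...         return word + 'd'
78h | ...     return word + 'ed'
78i | >>> past('walk')
78j | 'walked'
78k | >>> past('chase')
78l | 'chased'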
79 |
80 | --------------
81 | Classification
82 | --------------
83 |
84 | * informal study of how texts differ
85 | * genre, author, language
86 | * FreqDists initialized with list comprehensions
87 | * key concept: COMPREHENSIONS (ITERATION)
88 |
89 | IDLE session:
90 | * word length distribution plot: FreqDist(len(word) for word in text).plot()
91 | (comparing languages, text difficulties)
92 | (need to permit >1 plot to be overlaid)
93 | * character distribution plot: FreqDist(char for word in text for char in word).plot()
94 | (comparing languages)
95 | * relative frequency of modals: FreqDist(word for word in text if word in modals).plot()
96 | (comparing Brown corpus genres)
97 |
98 |
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/LSA325/log_fds.txt:
--------------------------------------------------------------------------------
1 | Python 2.5 (r25:51918, Sep 19 2006, 08:49:13)
2 | [GCC 4.0.1 (Apple Computer, Inc. build 5341)] on darwin
3 | Type "copyright", "credits" or "license()" for more information.
4 |
5 | ****************************************************************
6 | Personal firewall software may warn about the connection IDLE
7 | makes to its subprocess using this computer's internal loopback
8 | interface. This connection is not visible on any external
9 | interface and no data is sent to or received from the Internet.
10 | ****************************************************************
11 |
12 | IDLE 1.2
13 | >>>
14 | >>>
15 | >>> from nltk.book import *
16 | >>> text = '''Hello. Isn't this fun?'''
17 | >>> list(tokenize.regexp(text, r'[a-z]'))
18 | ['e', 'l', 'l', 'o', 's', 'n', 't', 't', 'h', 'i', 's', 'f', 'u', 'n']
19 | >>> list(tokenize.regexp(text, r'[a-z]+'))
20 | ['ello', 'sn', 't', 'this', 'fun']
21 | >>> list(tokenize.regexp(text, r'[A-Za-z]+'))
22 | ['Hello', 'Isn', 't', 'this', 'fun']
23 | >>> list(tokenize.regexp(text, r'[A-Za-z]+|[.?!;]'))
24 | ['Hello', '.', 'Isn', 't', 'this', 'fun', '?']
25 | >>> list(tokenize.regexp(text, r'[A-Za-z]+|[.?!;']'))
26 | SyntaxError: invalid syntax
27 | >>>
28 | >>> list(tokenize.regexp(text, r"[A-Za-z]+|[.?!;']"))
29 | ['Hello', '.', 'Isn', "'", 't', 'this', 'fun', '?']
30 | >>> list(tokenize.regexp(text, r"\w+[.?!;']\w+|[.?!;']"))
31 | ['.', "Isn't", '?']
32 | >>> list(tokenize.regexp(text, r"\w+([.?!;']\w+)?|[.?!;']"))
33 | ['Hello', '.', "Isn't", 'this', 'fun', '?']
34 | >>> list(tokenize.whitespace(text))
35 | ['Hello.', "Isn't", 'this', 'fun?']
36 | >>> list(tokenize.wordpunct(text))
37 | ['Hello', '.', 'Isn', "'", 't', 'this', 'fun', '?']
38 | >>> nltk.FreqDist
39 |
40 | >>> sentence = "the cat sat on the mat"
41 | >>> words = sentence.split()
42 | >>> words
43 | ['the', 'cat', 'sat', 'on', 'the', 'mat']
44 | >>> fd = nltk.FreqDist(words)
45 | >>>
46 | >>> fd['the']
47 | 2
48 | >>> fd['sat']
49 | 1
50 | >>> fd2 = nltk.FreqDist(sentence)
51 | >>> fd2.keys()
52 | ['a', ' ', 'c', 'e', 'h', 'm', 'o', 'n', 's', 't']
53 | >>> fd2['c']
54 | 1
55 | >>> corpus.inaugural.items
56 | ['1789-Washington', '1793-Washington', '1797-Adams', '1801-Jefferson', '1805-Jefferson', '1809-Madison', '1813-Madison', '1817-Monroe', '1821-Monroe', '1825-Adams', '1829-Jackson', '1833-Jackson', '1837-VanBuren', '1841-Harrison', '1845-Polk', '1849-Taylor', '1853-Pierce', '1857-Buchanan', '1861-Lincoln', '1865-Lincoln', '1869-Grant', '1873-Grant', '1877-Hayes', '1881-Garfield', '1885-Cleveland', '1889-Harrison', '1893-Cleveland', '1897-McKinley', '1901-McKinley', '1905-Roosevelt', '1909-Taft', '1913-Wilson', '1917-Wilson', '1921-Harding', '1925-Coolidge', '1929-Hoover', '1933-Roosevelt', '1937-Roosevelt', '1941-Roosevelt', '1945-Roosevelt', '1949-Truman', '1953-Eisenhower', '1957-Eisenhower', '1961-Kennedy', '1965-Johnson', '1969-Nixon', '1973-Nixon', '1977-Carter', '1981-Reagan', '1985-Reagan', '1989-Bush', '1993-Clinton', '1997-Clinton', '2001-Bush', '2005-Bush']
57 | >>> for word in corpus.inaugural.tokenized('2005-Bush'):
58 | if word in ['he', 'him', 'she', 'her', 'man', 'woman']:
59 | print word,
60 | man woman her
61 | >>> fd = nltk.FreqDist()
62 | >>> fd
63 |
64 | >>> fd.inc('male')
65 | >>> fd
66 |
67 | >>>
68 | >>> fd.inc('female')
69 | >>> fd.inc('female')
70 | >>> fd.inc('female')
71 | >>> fd.inc('female')
72 | >>> fd
73 |
74 | >>>
75 | >>> fd['male']
76 | 1
77 | >>> fd['female']
78 | 4
79 | >>> if word in ['..', '..', '..']:
80 | fd.inc('???')
81 |
82 | >>> for word in corpus.inaugural.tokenized('2005-Bush'):
83 | fd.inc(word)
84 | >>> fd
85 |
86 | >>> fd['President']
87 | 4
88 | >>> fd['man']
89 | 1
90 | >>>
91 |
92 |
--------------------------------------------------------------------------------
/images/chunk-tagrep.fig:
--------------------------------------------------------------------------------
1 | #FIG 3.2
2 | Landscape
3 | Center
4 | Metric
5 | A4
6 | 100.00
7 | Single
8 | -2
9 | 1200 2
10 | 0 32 #aeaeae
11 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
12 | 540 1170 720 1170 720 1350 540 1350 540 1170
13 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
14 | 720 1170 900 1170 900 1350 720 1350 720 1170
15 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
16 | 900 1170 1080 1170 1080 1350 900 1350 900 1170
17 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
18 | 1080 1170 1260 1170 1260 1350 1080 1350 1080 1170
19 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
20 | 1260 1170 1440 1170 1440 1350 1260 1350 1260 1170
21 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
22 | 1440 1170 1620 1170 1620 1350 1440 1350 1440 1170
23 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
24 | 1620 1170 1800 1170 1800 1350 1620 1350 1620 1170
25 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
26 | 1800 1170 1980 1170 1980 1350 1800 1350 1800 1170
27 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
28 | 1980 1170 2160 1170 2160 1350 1980 1350 1980 1170
29 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
30 | 2160 1170 2340 1170 2340 1350 2160 1350 2160 1170
31 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
32 | 2340 1170 2520 1170 2520 1350 2340 1350 2340 1170
33 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
34 | 2700 1170 2880 1170 2880 1350 2700 1350 2700 1170
35 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
36 | 2520 1170 2700 1170 2700 1350 2520 1350 2520 1170
37 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
38 | 3060 1170 3240 1170 3240 1350 3060 1350 3060 1170
39 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
40 | 2880 1170 3060 1170 3060 1350 2880 1350 2880 1170
41 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
42 | 3240 1170 3420 1170 3420 1350 3240 1350 3240 1170
43 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
44 | 3420 1170 3600 1170 3600 1350 3420 1350 3420 1170
45 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
46 | 3600 1170 3780 1170 3780 1350 3600 1350 3600 1170
47 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
48 | 3780 1170 3960 1170 3960 1350 3780 1350 3780 1170
49 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
50 | 495 1125 945 1125 945 1800 495 1800 495 1125
51 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
52 | 1035 1125 1665 1125 1665 1800 1035 1800 1035 1125
53 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
54 | 1755 1125 2385 1125 2385 1800 1755 1800 1755 1125
55 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
56 | 2475 1125 3105 1125 3105 1800 2475 1800 2475 1125
57 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
58 | 3195 1125 3825 1125 3825 1800 3195 1800 3195 1125
59 | 4 1 0 50 0 14 12 0.0000 4 75 135 1170 1305 s\001
60 | 4 1 0 50 0 14 12 0.0000 4 75 135 1350 1305 a\001
61 | 4 1 0 50 0 14 12 0.0000 4 75 135 1530 1305 w\001
62 | 4 1 0 50 0 14 12 0.0000 4 105 135 1890 1305 t\001
63 | 4 1 0 50 0 14 12 0.0000 4 105 135 2070 1305 h\001
64 | 4 1 0 50 0 14 12 0.0000 4 75 135 2250 1305 e\001
65 | 4 1 0 50 0 14 12 0.0000 4 105 135 2610 1305 b\001
66 | 4 1 0 50 0 14 12 0.0000 4 105 135 2790 1305 i\001
67 | 4 1 0 50 0 14 12 0.0000 4 105 135 2970 1305 g\001
68 | 4 1 0 50 0 14 12 0.0000 4 105 135 3330 1305 d\001
69 | 4 1 0 50 0 14 12 0.0000 4 75 135 3510 1305 o\001
70 | 4 1 0 50 0 14 12 0.0000 4 105 135 3690 1305 g\001
71 | 4 1 0 50 0 14 12 0.0000 4 30 135 3870 1305 .\001
72 | 4 1 0 50 0 14 12 0.0000 4 105 135 630 1305 H\001
73 | 4 1 0 50 0 14 12 0.0000 4 75 135 810 1305 e\001
74 | 4 1 0 50 0 14 12 0.0000 4 105 405 720 1575 PRP\001
75 | 4 1 0 50 0 14 12 0.0000 4 105 405 1350 1575 VBD\001
76 | 4 1 0 50 0 14 12 0.0000 4 105 270 2070 1575 DT\001
77 | 4 1 0 50 0 14 12 0.0000 4 105 270 2790 1575 JJ\001
78 | 4 1 0 50 0 14 12 0.0000 4 105 270 3510 1575 NN\001
79 | 4 1 0 50 0 14 9 0.0000 4 75 420 720 1755 B-NP\001
80 | 4 1 0 50 0 14 9 0.0000 4 75 105 1350 1755 O\001
81 | 4 1 0 50 0 14 9 0.0000 4 75 420 2070 1755 B-NP\001
82 | 4 1 0 50 0 14 9 0.0000 4 75 420 2790 1755 I-NP\001
83 | 4 1 0 50 0 14 9 0.0000 4 75 420 3510 1755 I-NP\001
84 |
--------------------------------------------------------------------------------
/xmlpp.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2008, Fredrik Ekholdt
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
8 |
9 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
10 |
11 | * Neither the name of None nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
12 |
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
14 |
15 | (Minor modifications by Steven Bird)
16 | """
17 | import sys
18 |
19 | def usage(this_file):
20 | return """SYNOPSIS: pretty print an XML document
21 | USAGE: python %s <xml-file> or use stdin as input\n""" % this_file
22 |
23 | def pprint(indent_level, line):
24 | if line.strip():
25 | sys.stdout.write(" " * indent_level + line + "\n")
26 |
27 | def get_next_elem(data):
28 | start_pos = data.find("<")
29 | end_pos = data.find(">") + 1
30 | retval = data[start_pos:end_pos]
31 | stopper = retval.find("/")
32 | single = (stopper > -1 and ((retval.find(">") - stopper) < (stopper - retval.find("<"))))
33 |
34 | ignore_excl = retval.find(" -1
35 | ignore_question = retval.find("") > -1
36 |
37 | if ignore_excl:
38 | cdata = retval.find(" -1
39 | if cdata:
40 | end_pos = data.find("]]>")
41 | if end_pos > -1:
42 | end_pos = end_pos + len("]]>")
43 |
44 | elif ignore_question:
45 | end_pos = data.find("?>") + len("?>")
46 | ignore = ignore_excl or ignore_question
47 |
48 | no_indent = ignore or single
49 |
50 | #print retval, end_pos, start_pos, no_indent
51 | return start_pos, \
52 | end_pos, \
53 | stopper > -1, \
54 | no_indent
55 |
56 |
57 | if __name__ == "__main__":
58 | if "-h" in sys.argv or "--help" in sys.argv:
59 | sys.stderr.write(usage(sys.argv[0]))
60 | sys.exit(1)
61 | if len(sys.argv) < 2:
62 | data = sys.stdin.read()
63 | else:
64 | filename = sys.argv[1]
65 | data = open(filename).read()
66 |
67 | INDENT = 2
68 |
69 | indent_level = 0
70 |
71 | start_pos, end_pos, is_stop, no_indent = get_next_elem(data)
72 | while ((start_pos > -1 and end_pos > -1)):
73 | pprint(indent_level, data[:start_pos].strip())
74 | data = data[start_pos:]
75 | if is_stop and not no_indent:
76 | indent_level = indent_level - INDENT
77 | pprint(indent_level, data[:end_pos - start_pos])
78 | data = data[end_pos - start_pos:]
79 | if not is_stop and not no_indent :
80 | indent_level = indent_level + INDENT
81 |
82 | if not data:
83 | break
84 | else:
85 | start_pos, end_pos, is_stop, no_indent = get_next_elem(data)
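85a |
85b | # Example run (input is hypothetical), showing the 2-space indent:
85c | #   $ echo '<a><b>hi</b></a>' | python xmlpp.py
85d | #   <a>
85e | #     <b>
85f | #       hi
85g | #     </b>
85h | #   </a>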
86 |
--------------------------------------------------------------------------------
/images/chunk-segmentation.fig:
--------------------------------------------------------------------------------
1 | #FIG 3.2
2 | Landscape
3 | Center
4 | Metric
5 | A4
6 | 100.00
7 | Single
8 | -2
9 | 1200 2
10 | 0 32 #aeaeae
11 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
12 | 540 1170 720 1170 720 1350 540 1350 540 1170
13 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
14 | 720 1170 900 1170 900 1350 720 1350 720 1170
15 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
16 | 900 1170 1080 1170 1080 1350 900 1350 900 1170
17 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
18 | 1080 1170 1260 1170 1260 1350 1080 1350 1080 1170
19 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
20 | 1260 1170 1440 1170 1440 1350 1260 1350 1260 1170
21 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
22 | 1440 1170 1620 1170 1620 1350 1440 1350 1440 1170
23 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
24 | 1620 1170 1800 1170 1800 1350 1620 1350 1620 1170
25 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
26 | 1800 1170 1980 1170 1980 1350 1800 1350 1800 1170
27 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
28 | 1980 1170 2160 1170 2160 1350 1980 1350 1980 1170
29 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
30 | 2160 1170 2340 1170 2340 1350 2160 1350 2160 1170
31 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
32 | 2340 1170 2520 1170 2520 1350 2340 1350 2340 1170
33 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
34 | 2700 1170 2880 1170 2880 1350 2700 1350 2700 1170
35 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
36 | 2520 1170 2700 1170 2700 1350 2520 1350 2520 1170
37 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
38 | 3060 1170 3240 1170 3240 1350 3060 1350 3060 1170
39 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
40 | 2880 1170 3060 1170 3060 1350 2880 1350 2880 1170
41 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
42 | 3240 1170 3420 1170 3420 1350 3240 1350 3240 1170
43 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
44 | 3420 1170 3600 1170 3600 1350 3420 1350 3420 1170
45 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
46 | 3600 1170 3780 1170 3780 1350 3600 1350 3600 1170
47 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
48 | 3780 1170 3960 1170 3960 1350 3780 1350 3780 1170
49 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
50 | 495 1125 945 1125 945 1620 495 1620 495 1125
51 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
52 | 1035 1125 1665 1125 1665 1620 1035 1620 1035 1125
53 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
54 | 2475 1125 3105 1125 3105 1620 2475 1620 2475 1125
55 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
56 | 3195 1125 3825 1125 3825 1620 3195 1620 3195 1125
57 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
58 | 1755 1125 2385 1125 2385 1620 1755 1620 1755 1125
59 | 2 2 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 5
60 | 1710 1080 3870 1080 3870 1890 1710 1890 1710 1080
61 | 2 2 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 5
62 | 450 1080 990 1080 990 1890 450 1890 450 1080
63 | 4 1 0 50 0 14 12 0.0000 4 75 105 1170 1305 s\001
64 | 4 1 0 50 0 14 12 0.0000 4 75 105 1350 1305 a\001
65 | 4 1 0 50 0 14 12 0.0000 4 75 105 1530 1305 w\001
66 | 4 1 0 50 0 14 12 0.0000 4 105 105 1890 1305 t\001
67 | 4 1 0 50 0 14 12 0.0000 4 105 105 2070 1305 h\001
68 | 4 1 0 50 0 14 12 0.0000 4 75 105 2250 1305 e\001
69 | 4 1 0 50 0 14 12 0.0000 4 105 105 2610 1305 b\001
70 | 4 1 0 50 0 14 12 0.0000 4 105 105 2790 1305 i\001
71 | 4 1 0 50 0 14 12 0.0000 4 120 105 2970 1305 g\001
72 | 4 1 0 50 0 14 12 0.0000 4 105 105 3330 1305 d\001
73 | 4 1 0 50 0 14 12 0.0000 4 75 105 3510 1305 o\001
74 | 4 1 0 50 0 14 12 0.0000 4 120 105 3690 1305 g\001
75 | 4 1 0 50 0 14 12 0.0000 4 30 105 3870 1305 .\001
76 | 4 1 0 50 0 14 12 0.0000 4 105 105 630 1305 H\001
77 | 4 1 0 50 0 14 12 0.0000 4 75 105 810 1305 e\001
78 | 4 1 0 50 0 14 12 0.0000 4 105 315 720 1575 PRP\001
79 | 4 1 0 50 0 14 12 0.0000 4 105 315 1350 1575 VBD\001
80 | 4 1 0 50 0 14 12 0.0000 4 105 210 2070 1575 DT\001
81 | 4 1 0 50 0 14 12 0.0000 4 105 210 2790 1575 JJ\001
82 | 4 1 0 50 0 14 12 0.0000 4 105 210 3510 1575 NN\001
83 | 4 1 0 50 0 14 12 0.0000 4 105 210 2790 1845 NP\001
84 | 4 1 0 50 0 14 12 0.0000 4 105 210 720 1845 NP\001
85 |
--------------------------------------------------------------------------------
/HouseStyle.txt:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. include:: definitions.rst
3 |
4 | ==========================================
5 | NLTK Book House Style: Restructured Text
6 | ==========================================
7 |
8 | ------------------
9 | ReST Inline Markup
10 | ------------------
11 |
12 |
13 | Double backquotes are used for code::
14 |
15 | e.g., the ``tokenize.whitespace`` function
16 |
17 | Double quotes used for quoted speech::
18 |
19 | e.g., a noun is "the name of a person, place or thing"
20 |
21 | Single quotes used for scare quotes::
22 |
23 | e.g., there is no one 'right way' to assign tags
24 |
25 | ----------
26 | Text Roles
27 | ----------
28 |
29 | LX: Linguistic eXample -- cited form in running text::
30 |
31 | e.g., the verb `walks`:lx:
32 |
33 | DT: Defined Term -- first or canonical use of technical term::
34 |
35 | e.g., the process of `parsing`:dt:
36 |
37 | GC: Grammatical Category::
38 |
39 | e.g., `NP`:gc: and `verb`:gc: as technical terms
40 |
41 | EM: Emphasis::
42 |
43 | e.g., this word is `emphasised`:em: here.
44 |
45 | -----------------------------
46 | Examples and Cross-references
47 | -----------------------------
48 |
49 | Write examples using the custom 'example' directive (or 'ex' for short)::
50 |
51 | .. ex:: John went to the store.
52 | .. ex:: John bought some bread.
53 |
54 | The example directive may be nested, to define groups of related examples::
55 |
56 | .. ex::
57 | .. ex:: John went to the store.
58 | .. ex:: \* John went from the store.
59 |
60 | (Note that \* must be backslashed when used to indicate grammaticality
61 | judgements.)
62 |
63 | To refer to an example, mark it with a crossreference target, and
64 | refer to it with a crossreference link::
65 |
66 | .. _johntostore:
67 | .. ex:: John went to the store.
68 |
69 | In example johntostore_, `John`:lx: is the subject.
70 |
71 | .. Old instructions:
72 |
73 | When the text above an example refers to the example by number, the
74 | text ends with a period. When the text does not refer to the example
75 | by number, it ends with a colon.
76 | In text references, place letters referring to subexample numbers
77 | inside parentheses: (2a), (5a–b), (6a,c), (6a–f),
78 | (8a,c,d).
79 |
80 |
81 | ------
82 | Titles
83 | ------
84 |
85 | Section titles and captions should have initial capitals on non-function words.
86 |
87 | -----
88 | Trees
89 | -----
90 |
91 | Write trees using the custom 'tree' directive::
92 |
93 | .. tree:: (S (NP John) (VP (V saw) (NP Mary)))
94 |
95 | Constituents that should be drawn with a 'roof' (i.e., a triangle
96 | between the node & its children, rather than individual lines)
97 | are marked using angle brackets::
98 |
99 | .. tree:: (S (NP John) <VP went to the store>)
100 |
101 | Subscripting is done using underscore (similar to latex). If
102 | the subscripted string is more than one character long, it should
103 | be enclosed in brackets::
104 |
105 | .. tree:: (S (NP Mary_i) (VP was (VP seen t_i)))
106 |
107 | Substrings can be italicized by using '*...*'::
108 |
109 | .. tree:: (S (NP *Mary_i*) (VP was (VP seen *t_i*)))
110 |
111 | Backslash can be used to escape characters that would otherwise
112 | be treated as markup (i.e., any of ``'<>()_* '``). Note that this
113 | list includes space::
114 |
115 | .. tree:: (S (NP Mary) (VP went (PP to (NP New\ York))))
116 |
117 | Typically, trees will be included as the single element of an example::
118 |
119 | .. ex::
120 | .. tree:: (S (NP Mary) (VP went (PP to (NP New\ York))))
121 |
122 | ------------------
123 | Dashes and Hyphens
124 | ------------------
125 |
126 | Use an em-dash between words::
127 |
128 | e.g., you should try yourself |mdash| in fact, we insist!
129 |
130 | Use an en-dash between numerals::
131 |
132 | e.g., during the period 1900\ |ndash|\ 1950
133 |
--------------------------------------------------------------------------------
/LSA325/lsa110_1.tex:
--------------------------------------------------------------------------------
1 | %\documentclass{beamer} % for slides
2 | \documentclass[handout]{beamer} % for handout
3 | \input{beamer}
4 |
5 | \title{Python Programming for Linguists\\LSA 100 Presession}
6 |
7 | % \author{Steven Bird \and Ewan Klein \and Edward Loper}
8 | % \institute{
9 | % University of Melbourne, AUSTRALIA
10 | % \and
11 | % University of Edinburgh, UK
12 | % \and
13 | % University of Pennsylvania, USA
14 | % }
15 |
16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
17 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
18 |
19 | \begin{document}
20 |
21 |
22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
23 |
24 | \begin{frame}
25 | \titlepage
26 | \end{frame}
27 |
28 |
29 |
30 | \begin{frame}
31 | \frametitle{Introduction}
32 | \begin{itemize}
33 | \item Who we are
34 | \item Python and NLTK
35 | \item Materials and Resources
36 | \item Goals
37 | \item Syllabus
38 | \end{itemize}
39 | \end{frame}
40 |
41 | \begin{frame}
42 | \frametitle{Who we are}
43 |
44 | Instructors:
45 | \begin{itemize}
46 | \item Steven Bird
47 | \item Ewan Klein
48 | \item Edward Loper (here tomorrow)
49 | \end{itemize}
50 |
51 | TAs:
52 | \begin{itemize}
53 | \item David Hall
54 | \item Yaron Greif
55 | \item Yun-Hsuan Sung
56 | % \item Jette Viethen
57 | \end{itemize}
58 |
59 | \end{frame}
60 |
61 |
62 | \begin{frame}
63 | \frametitle{Python and NLTK}
64 | \begin{itemize}
65 | \item Pre-session for \textit{Introduction to
66 | Computational Linguistics} (LSA 325)
67 | \item First steps in using Python and Natural Language Toolkit (NLTK)
68 | \item Why Python?
69 | \begin{itemize}
70 | \item designed to be easy to learn;
71 | \item good for processing linguistic data;
72 | \item good for interactive experiments.
73 | \end{itemize}
74 | \item Many online tutorials (see \url{www.python.org})
75 | \end{itemize}
76 | \end{frame}
77 |
78 |
79 | \begin{frame}
80 | \frametitle{Materials and Resources}
81 | \begin{itemize}
82 | \item Chapter 2, \textit{Programming Fundamentals and Python} in the NLTK Book
83 | (\url{http://nltk.org/index.php/Book})
84 | \item \textbf{Su07-LSA-110} page on \url{http://coursework.stanford.edu}
85 | \item Main NLTK page: \url{http://nltk.org}
86 | \begin{itemize}
87 | \item Chatroom
88 | \item \texttt{nltk-users} Mailing List
89 | \end{itemize}
90 |
91 | \end{itemize}
92 | \end{frame}
93 |
94 | \begin{frame}
95 | \frametitle{Audience and Goals}
96 | \begin{itemize}
97 | \item We are assuming you have not done programming before.
98 | \item So, getting you to a point where:
99 | \begin{itemize}
100 | \item you have got some confidence in using basic Python commands;
101 | \item you can use Python for carrying out simple operations on text;
102 | \item you can do all the easy and intermediate exercises in
103 | Chapter 2;
104 | \item you have found out where to get more information (fellow
105 | students, the web, textbooks)
106 | \end{itemize}
107 | \end{itemize}
108 | \end{frame}
109 |
110 | \begin{frame}
111 | \frametitle{Syllabus}
112 | \begin{description}
113 | \item[Class 1] Manipulating strings, lists and other sequences.
114 | \item[Class 2] Conditionals, dictionaries, functions and regular
115 | expressions.
116 | \item[Class 3] Preview of NLTK chapters on Words and Tagging
117 | \end{description}
118 | \end{frame}
119 |
120 | \begin{frame}
121 | \frametitle{Almost there \ldots}
122 | \begin{itemize}
123 | \item Installation CDs
124 | \item Today: at least Python
125 | \item Tomorrow: full NLTK installation
126 | \item Homework: catch up on exercises and reading
127 | \end{itemize}
128 | \end{frame}
129 |
130 |
131 |
132 |
133 | \end{document}
134 |
--------------------------------------------------------------------------------
/slides/lsa110_1.tex:
--------------------------------------------------------------------------------
1 | \documentclass{beamer} % for slides
2 | % \documentclass[handout]{beamer} % for handout
3 | \input{beamer}
4 |
5 | \title{Python Programming for Linguists\\LSA 100 Presession}
6 |
7 | % \author{Steven Bird \and Ewan Klein \and Edward Loper}
8 | % \institute{
9 | % University of Melbourne, AUSTRALIA
10 | % \and
11 | % University of Edinburgh, UK
12 | % \and
13 | % University of Pennsylvania, USA
14 | % }
15 |
16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
17 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
18 |
19 | \begin{document}
20 |
21 |
22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
23 |
24 | \begin{frame}
25 | \titlepage
26 | \end{frame}
27 |
28 |
29 |
30 | \begin{frame}
31 | \frametitle{Introduction}
32 | \begin{itemize}
33 | \item Who we are
34 | \item Python and NLTK
35 | \item Materials and Resources
36 | \item Goals
37 | \item Syllabus
38 | \end{itemize}
39 | \end{frame}
40 |
41 | \begin{frame}
42 | \frametitle{Who we are}
43 |
44 | Instructors:
45 | \begin{itemize}
46 | \item Steven Bird
47 | \item Ewan Klein
48 | \item Edward Loper (here tomorrow)
49 | \end{itemize}
50 |
51 | TAs:
52 | \begin{itemize}
53 | \item David Hall
54 | \item Yaron Greif
55 | \item Yun-Hsuan Sung
56 | % \item Jette Viethen
57 | \end{itemize}
58 |
59 | \end{frame}
60 |
61 |
62 | \begin{frame}
63 | \frametitle{Python and NLTK}
64 | \begin{itemize}
65 | \item Pre-session for \textit{Introduction to
66 | Computational Linguistics} (LSA 325)
67 | \item First steps in using Python and Natural Language Toolkit (NLTK)
68 | \item Why Python?
69 | \begin{itemize}
70 | \item designed to be easy to learn;
71 | \item good for processing linguistic data;
72 | \item good for interactive experiments.
73 | \end{itemize}
74 | \item Many online tutorials (see \url{www.python.org})
75 | \end{itemize}
76 | \end{frame}
77 |
78 |
79 | \begin{frame}
80 | \frametitle{Materials and Resources}
81 | \begin{itemize}
82 | \item Chapter 2, \textit{Programming Fundamentals and Python} in the NLTK Book
83 | (\url{http://nltk.org/index.php/Book})
84 | \item \textbf{Su07-LSA-110} page on \url{http://coursework.stanford.edu}
85 | \item Main NLTK page: \url{http://nltk.org}
86 | \begin{itemize}
87 | \item Chatroom
88 | \item \texttt{nltk-users} Mailing List
89 | \end{itemize}
90 |
91 | \end{itemize}
92 | \end{frame}
93 |
94 | \begin{frame}
95 | \frametitle{Audience and Goals}
96 | \begin{itemize}
97 | \item We are assuming you have not done programming before.
98 | \item So, getting you to a point where:
99 | \begin{itemize}
100 | \item you have got some confidence in using basic Python commands;
101 | \item you can use Python for carrying out simple operations on text;
102 | \item you can do all the easy and intermediate exercises in
103 | Chapter 2;
104 | \item you have found out where to get more information (fellow
105 | students, the web, textbooks)
106 | \end{itemize}
107 | \end{itemize}
108 | \end{frame}
109 |
110 | \begin{frame}
111 | \frametitle{Syllabus}
112 | \begin{description}
113 | \item[Class 1] Manipulating strings, lists and other sequences.
114 | \item[Class 2] Conditionals, dictionaries, functions and regular
115 | expressions.
116 | \item[Class 3] Preview of NLTK chapters on Words and Tagging
117 | \end{description}
118 | \end{frame}
119 |
120 | \begin{frame}
121 | \frametitle{Almost there \ldots}
122 | \begin{itemize}
123 | \item Installation CDs
124 | \item Today: at least Python
125 | \item Tomorrow: full NLTK installation
126 | \item Homework: catch up on exercises and reading
127 | \end{itemize}
128 | \end{frame}
129 |
130 |
131 |
132 |
133 | \end{document}
134 |
--------------------------------------------------------------------------------
/images/chunk-treerep.fig:
--------------------------------------------------------------------------------
1 | #FIG 3.2
2 | Landscape
3 | Center
4 | Metric
5 | A4
6 | 100.00
7 | Single
8 | -2
9 | 1200 2
10 | 0 32 #aeaeae
11 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
12 | 540 1170 720 1170 720 1350 540 1350 540 1170
13 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
14 | 720 1170 900 1170 900 1350 720 1350 720 1170
15 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
16 | 900 1170 1080 1170 1080 1350 900 1350 900 1170
17 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
18 | 1080 1170 1260 1170 1260 1350 1080 1350 1080 1170
19 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
20 | 1260 1170 1440 1170 1440 1350 1260 1350 1260 1170
21 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
22 | 1440 1170 1620 1170 1620 1350 1440 1350 1440 1170
23 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
24 | 1620 1170 1800 1170 1800 1350 1620 1350 1620 1170
25 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
26 | 1800 1170 1980 1170 1980 1350 1800 1350 1800 1170
27 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
28 | 1980 1170 2160 1170 2160 1350 1980 1350 1980 1170
29 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
30 | 2160 1170 2340 1170 2340 1350 2160 1350 2160 1170
31 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
32 | 2340 1170 2520 1170 2520 1350 2340 1350 2340 1170
33 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
34 | 2700 1170 2880 1170 2880 1350 2700 1350 2700 1170
35 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
36 | 2520 1170 2700 1170 2700 1350 2520 1350 2520 1170
37 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
38 | 3060 1170 3240 1170 3240 1350 3060 1350 3060 1170
39 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
40 | 2880 1170 3060 1170 3060 1350 2880 1350 2880 1170
41 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
42 | 3240 1170 3420 1170 3420 1350 3240 1350 3240 1170
43 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
44 | 3420 1170 3600 1170 3600 1350 3420 1350 3420 1170
45 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
46 | 3600 1170 3780 1170 3780 1350 3600 1350 3600 1170
47 | 2 2 0 1 32 7 50 0 -1 0.000 0 0 -1 0 0 5
48 | 3780 1170 3960 1170 3960 1350 3780 1350 3780 1170
49 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
50 | 495 1125 945 1125 945 1620 495 1620 495 1125
51 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
52 | 1035 1125 1665 1125 1665 1620 1035 1620 1035 1125
53 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
54 | 2475 1125 3105 1125 3105 1620 2475 1620 2475 1125
55 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
56 | 3195 1125 3825 1125 3825 1620 3195 1620 3195 1125
57 | 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
58 | 1755 1125 2385 1125 2385 1620 1755 1620 1755 1125
59 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2
60 | 720 1620 720 1800
61 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2
62 | 2790 1620 2790 1800
63 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2
64 | 2250 1620 2700 1800
65 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2
66 | 3330 1620 2880 1800
67 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2
68 | 2790 1980 1845 2205
69 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2
70 | 720 1980 1665 2205
71 | 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2
72 | 1350 1620 1755 2205
73 | 4 1 0 50 0 14 12 0.0000 4 75 105 1170 1305 s\001
74 | 4 1 0 50 0 14 12 0.0000 4 75 105 1350 1305 a\001
75 | 4 1 0 50 0 14 12 0.0000 4 75 105 1530 1305 w\001
76 | 4 1 0 50 0 14 12 0.0000 4 105 105 1890 1305 t\001
77 | 4 1 0 50 0 14 12 0.0000 4 105 105 2070 1305 h\001
78 | 4 1 0 50 0 14 12 0.0000 4 75 105 2250 1305 e\001
79 | 4 1 0 50 0 14 12 0.0000 4 105 105 2610 1305 b\001
80 | 4 1 0 50 0 14 12 0.0000 4 105 105 2790 1305 i\001
81 | 4 1 0 50 0 14 12 0.0000 4 120 105 2970 1305 g\001
82 | 4 1 0 50 0 14 12 0.0000 4 105 105 3330 1305 d\001
83 | 4 1 0 50 0 14 12 0.0000 4 75 105 3510 1305 o\001
84 | 4 1 0 50 0 14 12 0.0000 4 120 105 3690 1305 g\001
85 | 4 1 0 50 0 14 12 0.0000 4 30 105 3870 1305 .\001
86 | 4 1 0 50 0 14 12 0.0000 4 105 105 630 1305 H\001
87 | 4 1 0 50 0 14 12 0.0000 4 75 105 810 1305 e\001
88 | 4 1 0 50 0 14 12 0.0000 4 105 315 720 1575 PRP\001
89 | 4 1 0 50 0 14 12 0.0000 4 105 315 1350 1575 VBD\001
90 | 4 1 0 50 0 14 12 0.0000 4 105 210 2070 1575 DT\001
91 | 4 1 0 50 0 14 12 0.0000 4 105 210 2790 1575 JJ\001
92 | 4 1 0 50 0 14 12 0.0000 4 105 210 3510 1575 NN\001
93 | 4 1 0 50 0 14 12 0.0000 4 105 210 720 1935 NP\001
94 | 4 1 0 50 0 14 12 0.0000 4 105 210 2790 1935 NP\001
95 | 4 1 0 50 0 14 12 0.0000 4 105 105 1755 2340 S\001
96 |
--------------------------------------------------------------------------------
/definitions.sty:
--------------------------------------------------------------------------------
1 |
2 | \usepackage{times}
3 | \usepackage{boxedminipage}
4 | \setlength{\parindent}{0pt}
5 | \setlength{\parskip}{1ex}
6 |
7 | %%%%%%%% UNICODE SUPPORT %%%%%%%%
8 |
9 | \usepackage{ucs}
10 | \usepackage{pdffonts}
11 | \usepackage{color}
12 | \providecommand{\textalpha}{{\usefont{OML}{hlcm}{m}{n} \ensuremath{\alpha}}}
13 | \providecommand{\textbeta}{{\usefont{OML}{hlcm}{m}{n} \ensuremath{\beta}}}
14 | \providecommand{\textgamma}{{\usefont{OML}{hlcm}{m}{n} \ensuremath{\gamma}}}
15 | \providecommand{\textmu}{{\usefont{OML}{hlcm}{m}{n} \ensuremath{\mu}}}
16 |
17 | \renewcommand{\labelitemi}{$\blacksquare$}
18 |
19 | %%%%%%%% ATTRIBUTE VALUE MATRICES %%%%%%%%
20 |
21 | \usepackage{avm}
22 | \avmfont{\sc}
23 | \avmvalfont{\it}
24 |
25 | %%%%%%%% HEADERS AND FOOTERS %%%%%%%%
26 |
27 | \usepackage{fancyheadings}
28 | \pagestyle{fancy}
29 | \setlength{\headrulewidth}{0.5pt}
30 | \setlength{\footrulewidth}{0.5pt}
31 |
32 | \newcommand{\authors}{\small \emph{Bird, Klein \& Loper}}
33 | \newcommand{\booktitle}{\small \emph{Natural Language Processing (DRAFT)}}
34 | \newcommand{\thedate}{\small \emph{\today}}
35 | \renewcommand{\chaptermark}[1]{\markboth{\emph{\thechapter.\ #1}}{}}
36 | \renewcommand{\sectionmark}[1]{\markright{\emph{\thesection.\ {#1}}}}
37 |
38 | \lhead [] {\leftmark}
39 | \chead [] {}
40 | \rhead [\rightmark] {\booktitle}
41 | \rfoot [\authors] {\thedate}
42 | \lfoot [\thedate] {\authors}
43 | \cfoot [\thepage] {\thepage}
44 |
45 |
46 | %%%%%%%% CUSTOM INLINE ROLES %%%%%%%%
47 |
48 | % Placeholder -- to be replaced by some actual value in a program
49 | \newcommand{\docutilsroleplaceholder}[1]{\texttt{\textit{#1}}}
50 | % Linguistic example - cited form in running text
51 | \newcommand{\docutilsroleexample}[1]{\textit{#1}}
52 | % Emphasized text
53 | \newcommand{\docutilsroleemphasis}[1]{\emph{#1}}
54 | % Defined term - first or canonical use of technical term
55 | \newcommand{\docutilsroleterm}[1]{\textbf{#1}}
56 | % Grammatical category - e.g. NP and verb as technical terms
57 | \newcommand{\docutilsrolecategory}[1]{\textsc{#1}}
58 | % Math symbols
59 | \newcommand{\docutilsrolemath}[1]{${#1}$}
60 | % Text in math env
61 | % Currently implemented as \textit since we can't do embedded text
62 | % roles in RST
63 | \newcommand{\docutilsrolemathit}[1]{\textsf{#1}}
64 | % Features and values
65 | \newcommand{\docutilsrolefeature}[1]{\textsc{#1}}
66 | \newcommand{\docutilsrolefval}[1]{\textit{#1}}
67 | % Lexemes
68 | \newcommand{\docutilsrolelex}[1]{\textsc{#1}}
69 |
70 | \newcommand{\docutilsroleNone}[1]{#1}
71 |
72 | %%%%%%%% PYTHON SOURCE CODE MARKUP %%%%%%%%
73 |
74 | % Note -- there is no bold tt font, so currently most of these commands
75 | % don't really do anything. :-/
76 |
77 | \definecolor{py@keywordcolour}{rgb}{1,0.45882,0}
78 | \definecolor{py@stringcolour}{rgb}{0,0.666666,0}
79 | \definecolor{py@commentcolour}{rgb}{1,0,0}
80 | \definecolor{py@ps1colour}{rgb}{0.60784,0,0}
81 | \definecolor{py@ps2colour}{rgb}{0.60784,0,1}
82 | \definecolor{py@inputcolour}{rgb}{0,0,0}
83 | \definecolor{py@outputcolour}{rgb}{0,0,1}
84 | \definecolor{py@exceptcolour}{rgb}{1,0,0}
85 | \definecolor{py@builtincolour}{rgb}{0.58039,0,0.58039}
86 | \definecolor{py@identifiercolour}{rgb}{0,0,0}
87 | \definecolor{py@linenumcolour}{rgb}{0.4,0.4,0.4}
88 | % (duplicate \definecolor of py@inputcolour removed; already defined above)
89 | \definecolor{py@defnamecolour}{rgb}{0,0.5,0.5}
90 |
91 | % Prompt
92 | \renewcommand{\pysrcprompt}[1]{\textcolor{py@ps1colour}{#1}}
93 | \renewcommand{\pysrcmore}[1]{\textcolor{py@ps2colour}{#1}}
94 | % Source code
95 | \renewcommand{\pysrckeyword}[1]{\textcolor{py@keywordcolour}{#1}}
96 | \renewcommand{\pysrcbuiltin}[1]{\textcolor{py@builtincolour}{#1}}
97 | \renewcommand{\pysrcstring}[1]{\textcolor{py@stringcolour}{#1}}
98 | \renewcommand{\pysrcdefname}[1]{\textcolor{py@defnamecolour}{#1}}
99 | \renewcommand{\pysrcother}[1]{#1}
100 | % Comments
101 | \renewcommand{\pysrccomment}[1]{\textcolor{py@commentcolour}{#1}}
102 | % Output
103 | \renewcommand{\pysrcoutput}[1]{\textcolor{py@outputcolour}{#1}}
104 | % Exceptions
105 | \renewcommand{\pysrcexcept}[1]{\textcolor{py@exceptcolour}{#1}}
106 |
107 | %%%%%%%% HYPHENATION CONTROL %%%%%%%%
108 | % Prefer unhyphenated line breaking; forbid widows, orphans, and page breaks at hyphens.
109 | \pretolerance 250
110 | \tolerance 500
111 | % \hyphenpenalty 250
112 | \hyphenpenalty 200
113 | \exhyphenpenalty 100
114 | \doublehyphendemerits 7500
115 | \finalhyphendemerits 7500
116 | \brokenpenalty 10000
117 | \lefthyphenmin 3
118 | \righthyphenmin 3
119 | \widowpenalty 10000
120 | \clubpenalty 10000
121 | \displaywidowpenalty 10000
122 | \looseness 1
123 |
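124 | %%%%%%%% USAGE SKETCH (EDITOR'S NOTE) %%%%%%%%
125 | %
126 | % A minimal sketch of how the commands above are exercised, assuming
127 | % the Docutils convention (used by this book's build) of rendering an
128 | % RST role :name:`text` as \docutilsrolename{text}.  The role names
129 | % below are illustrative only:
130 | %
131 | %   RST:    the :term:`lexeme` :lex:`walk` heads the :category:`VP`
132 | %   LaTeX:  the \docutilsroleterm{lexeme} \docutilsrolelex{walk}
133 | %           heads the \docutilsrolecategory{VP}
134 | %
135 | % Highlighted interactive sessions arrive as calls to the \pysrc...
136 | % commands, roughly:
137 | %
138 | %   \pysrcprompt{>>> }\pysrckeyword{print}\pysrcother{ }\pysrcstring{'dog'}
139 | %   \pysrcoutput{dog}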
--------------------------------------------------------------------------------
/LSA325/lsa110_2.tex:
--------------------------------------------------------------------------------
1 | \documentclass{beamer} % for slides
2 | %\documentclass[handout]{beamer} % for handout
3 | \input{beamer}
4 |
5 | \title{Python Programming for Linguists\\LSA 110 Presession}
6 |
7 | % \author{Steven Bird \and Ewan Klein \and Edward Loper}
8 | % \institute{
9 | % University of Melbourne, AUSTRALIA
10 | % \and
11 | % University of Edinburgh, UK
12 | % \and
13 | % University of Pennsylvania, USA
14 | % }
15 |
16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
17 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
18 |
19 | \begin{document}
20 |
21 |
22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
23 |
24 | \begin{frame}
25 | \titlepage
26 | \end{frame}
27 |
28 |
29 |
30 | \begin{frame}
31 | \frametitle{Exercise for the Break}
32 | \begin{enumerate}
33 | \item Get together in groups of around 5 or 6
34 | \item Try to agree on:
35 | \begin{itemize}
36 | \item a question; or
37 | \item something that puzzles you; or
38 | \item a gripe; or
39 | \item a suggestion for improving this course.
40 | \end{itemize}
41 |
42 | \item Write it down on a piece of paper, and give it to us.
43 | \item We will respond, either in today's session or on the web.
44 | \end{enumerate}
45 | \end{frame}
46 |
47 | \begin{frame}
48 | \frametitle{Keeping in Touch}
49 |
50 | \begin{itemize}
51 | \item We circulated the class list earlier in today's session.
52 | \item If you haven't already done so, please add your email address.
53 | \item We will make sure you are subscribed to \texttt{nltk-announce}.
54 | \end{itemize}
55 |
56 | \end{frame}
57 |
58 | \begin{frame}
59 | \frametitle{LSA 325: Introduction to Computational Linguistics}
60 |
61 | \begin{itemize}
62 | \item We can accept new participants:
63 | \begin{itemize}
64 | \item Cordura 100, Mon/Thu 1:30--3:15
65 | \end{itemize}
66 | \end{itemize}
67 | \end{frame}
68 |
69 |
70 | % \begin{frame}
71 | % \frametitle{Python and NLTK}
72 | % \begin{itemize}
73 | % \item Pre-session for \textit{Introduction to
74 | % Computational Linguistics} (LSA 325)
75 | % \item First steps in using Python and Natural Language Toolkit (NLTK)
76 | % \item Why Python?
77 | % \begin{itemize}
78 | % \item designed to be easy to learn;
79 | % \item good for processing linguistic data;
80 | % \item good for interactive experiments.
81 | % \end{itemize}
82 | % \item Many online tutorials (see \url{www.python.org})
83 | % \end{itemize}
84 | % \end{frame}
85 |
86 |
87 | % \begin{frame}
88 | % \frametitle{Materials and Resources}
89 | % \begin{itemize}
90 | % \item Chapter 2, \textit{Programming Fundamentals and Python} in the NLTK Book
91 | % (\url{http://nltk.org/index.php/Book})
92 | % \item \textbf{Su07-LSA-110} page on \url{http://coursework.stanford.edu}
93 | % \item Main NLTK page: \url{http://nltk.org}
94 | % \begin{itemize}
95 | % \item Chatroom
96 | % \item \texttt{nltk-users} Mailing List
97 | % \end{itemize}
98 |
99 | % \end{itemize}
100 | % \end{frame}
101 |
102 | % \begin{frame}
103 | % \frametitle{Audience and Goals}
104 | % \begin{itemize}
105 | % \item We are assuming you have not done programming before.
106 | % \item So, getting you to a point where:
107 | % \begin{itemize}
108 | % \item you have got some confidence in using basic Python commands;
109 | % \item you can use Python for carrying out simple operations on text;
110 | % \item you can do all the easy and intermediate exercises in
111 | % Chapter 2;
112 | % \item you have found out where to get more information (fellow
113 | % students, the web, textbooks)
114 | % \end{itemize}
115 | % \end{itemize}
116 | % \end{frame}
117 |
118 | % \begin{frame}
119 | % \frametitle{Syllabus}
120 | % \begin{description}
121 | % \item[Class 1] Manipulating strings, lists and other sequences.
122 | % \item[Class 2] Conditionals, dictionaries, functions and regular
123 | % expressions.
124 | % \item[Class 3] Preview of NLTK chapters on Words and Tagging
125 | % \end{description}
126 | % \end{frame}
127 |
128 | % \begin{frame}
129 | % \frametitle{Almost there \ldots}
130 | % \begin{itemize}
131 | % \item Installation CDs
132 | % \item Today: at least Python
133 | % \item Tomorrow: full NLTK installation
134 | % \item Homework: catch up on exercises and reading
135 | % \end{itemize}
136 | % \end{frame}
137 |
138 |
139 |
140 |
141 | \end{document}
142 |
--------------------------------------------------------------------------------
/howto/update_list.py:
--------------------------------------------------------------------------------
1 | #
2 | # Script that regenerates test-list.txt and its sorted variants
3 | # (test-list-sort-<key>.txt) from the doctest files in nltk/test.
4 |
5 | import os, os.path, re, sys
6 |
7 | DOCTEST_SRC = '../../nltk/test'
8 |
9 | HEAD = (".. ==========================================================\n"
10 | ".. AUTO-GENERATED LISTING -- DO NOT EDIT!:\n\n"
11 | ".. role:: passed\n"
12 | " :class: doctest-passed\n\n"
13 | ".. role:: failed\n"
14 | " :class: doctest-failed\n\n"
15 | ".. role:: guide-linecount\n"
16 | " :class: guide-linecount\n\n"
17 | ".. container:: doctest-list\n\n"
18 | " .. list-table::\n"
19 | " :class: doctest-list \n"
20 | " :widths: 60 10 10 20\n"
21 | " :header-rows: 1\n\n"
22 | " * - `Topic `__\n"
23 | " - `Lines `__\n"
24 | " - `Tests `__\n"
25 | " - `Test Outcome `__\n")
26 | FOOT = (".. END AUTO-GENERATED LISTING\n"
27 | ".. ==========================================================\n")
28 | # reST title styles, tried in order: over+underlined, then underlined-only.
29 | TITLE_REGEXPS = (
30 |     r'\s*----+[ ]*\n(.*)\n----+[ ]*\n',
31 |     r'\s*====+[ ]*\n(.*)\n====+[ ]*\n',
32 |     r'\s*(.*)\n====+[ ]*\n',
33 |     r'\s*(.*)\n----+[ ]*\n')
34 |
35 | def find_title(basename):
36 |     filename = os.path.join(DOCTEST_SRC, basename + '.doctest')
37 |     head = open(filename).read(800)  # the title always sits near the top
38 |     for regexp in TITLE_REGEXPS:
39 |         regexp = r'\A\s*(?:\.\..*\n)*' + regexp  # skip leading '..' comment lines
40 |         m = re.match(regexp, head)
41 |         if m: return m.group(1).strip().replace('`', "'")
42 |     print('Warning: no title found for %s' % basename)
43 |     return basename
44 |
45 | def linecount(basename):
46 | filename = os.path.join(DOCTEST_SRC, basename + '.doctest')
47 | s = open(filename).read()
48 |     return len(re.findall(r'(?m)^\s*>>>', s)), s.count('\n')  # (tests, lines)
49 |
50 | def fmt_num(n):
51 |     if n > 50:
52 |         n = n - n%10    # round down to a multiple of 10
53 |     if n > 500:
54 |         n = n - n%100   # round down to a multiple of 100
55 |     if n >= 1000:
56 |         n = str(n)[:-3]+','+str(n)[-3:]  # insert a thousands separator
57 |     return n
58 |
59 | def doctest_listing(sortkey=None):
60 | listing = ''
61 |
62 | files = [f for f in os.listdir(DOCTEST_SRC) if f.endswith('.doctest')]
63 | err_refs = []
64 | lines = []
65 | for filename in files:
66 | basename = filename.replace('.doctest', '')
67 | if basename == 'temp': continue
68 |
69 |         result = '`Passed!`:passed:'
70 |         num_failed = 0  # bug fix: used below even when no .errs file exists
71 |         if os.path.exists(basename+'.errs'):
72 |             s = open(basename+'.errs').read()
73 | if not re.search(r'OK\s*\Z', s):
74 | num_failed = len(re.findall(r'(?m)^Failed [Ee]xample:', s))
75 | result = '|%s|_' % basename
76 | err_refs.append( (basename, num_failed) )
77 | if sortkey is None:
78 | print ('test %s failed (%d examples)' %
79 | (basename, num_failed))
80 |
81 | title = find_title(basename)
82 | numtests, numlines = linecount(basename)
83 | lines.append([title, basename, numtests, numlines, result, num_failed])
84 |
85 |     if sortkey in ('title', None): lines.sort(key=lambda v: v[0])
86 |     if sortkey == 'basename': lines.sort(key=lambda v: v[1])  # bug fix: main() requests this key
87 |     if sortkey == 'lines':
88 |         lines.sort(key=lambda v: (-v[3], v[0]))  # bug fix: index 3 is numlines
89 |     if sortkey == 'tests':
90 |         lines.sort(key=lambda v: (-v[2], v[0]))  # bug fix: index 2 is numtests
91 |     if sortkey == 'outcome':
92 |         lines.sort(key=lambda v: (-v[5], v[0]))
93 |
94 | for (title, basename, numtests, numlines, result, num_failed) in lines:
95 | numlines = fmt_num(numlines)
96 | numtests = fmt_num(numtests)
97 | listing += (' * - `%s <%s.html>`__\n' % (title,basename) +
98 | ' - :guide-linecount:`%s`\n' % numlines +
99 | ' - :guide-linecount:`%s`\n' % numtests +
100 | ' - %s\n' % result)
101 |
102 | for (basename, num_failed) in err_refs:
103 |         plural = 's' if num_failed != 1 else ''
104 | listing += ('\n.. |%s| replace:: `%d test%s failed!`:failed:'
105 | '\n.. _%s: %s.errs\n' %
106 | (basename, num_failed, plural, basename, basename))
107 |
108 | return listing
109 |
110 | def main():
111 | out = open('test-list.txt', 'w')
112 | out.write('%s\n%s\n%s' % (HEAD, doctest_listing(), FOOT))
113 | out.close()
114 |
115 | for sortkey in ('title', 'basename', 'lines', 'tests', 'outcome'):
116 | out = open('test-list-sort-%s.txt' % sortkey, 'w')
117 | out.write('%s\n%s\n%s' % (HEAD, doctest_listing(sortkey), FOOT))
118 | out.close()
119 |
120 | if __name__ == '__main__':
121 | main()
122 |
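123 | # ----------------------------------------------------------------------
124 | # Editor's sketch (not part of the original script): doctest-style
125 | # sanity checks for fmt_num, and the shape of one generated table row.
126 | #
127 | # >>> fmt_num(42)     # 50 or less: returned unchanged
128 | # 42
129 | # >>> fmt_num(73)     # above 50: rounded down to a multiple of 10
130 | # 70
131 | # >>> fmt_num(1234)   # above 500: multiple of 100; 1000+: comma added
132 | # '1,200'
133 | #
134 | # Each doctest file becomes one list-table row of test-list.txt,
135 | # roughly of this form (values illustrative):
136 | #
137 | #   * - `Chunking <chunk.html>`__
138 | #     - :guide-linecount:`1,200`
139 | #     - :guide-linecount:`180`
140 | #     - `Passed!`:passed: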
--------------------------------------------------------------------------------