├── images ├── info.png ├── titanic.pdf ├── autofill.png ├── c_argmax.png ├── freqplot.png ├── inner-join.png ├── lab5-plot1.png ├── lab5-plot2.png ├── lab5-plot3.png ├── decisiontree.png ├── french-words.png ├── lab6-output.png ├── english-words.png ├── lsa_2dim_tfidf.png ├── lab6-sawyer-plot.png ├── lab8-group-column.png ├── thestand-valence.png └── titanic-zoomed-out.png ├── data ├── hvd.hn6ltf.json.bz2 ├── mdp.49015002392919.json.bz2 ├── uc2.ark13960t1xd0sc6x.json.bz2 ├── classification │ ├── test │ │ ├── pst.000062491532.json.bz2 │ │ ├── mdp.39015004295880.json.bz2 │ │ ├── mdp.39015005725919.json.bz2 │ │ ├── mdp.39015008815865.json.bz2 │ │ ├── mdp.39015066049530.json.bz2 │ │ └── mdp.39076002736721.json.bz2 │ ├── train │ │ ├── pst.000029579440.json.bz2 │ │ ├── wu.89104415476.json.bz2 │ │ ├── hvd.32044014292023.json.bz2 │ │ ├── hvd.32044102860673.json.bz2 │ │ ├── mdp.39015038910694.json.bz2 │ │ └── uiug.30112037882914.json.bz2 │ └── english_french_class_labels.csv └── contemporary_books │ ├── contemporary-pages.csv.gz │ ├── dataset_files │ ├── pst.000023498051.json.bz2 │ ├── pst.000026748658.json.bz2 │ ├── pst.000026751405.json.bz2 │ ├── pst.000044406462.json.bz2 │ ├── pst.000050069378.json.bz2 │ ├── mdp.39015005028686.json.bz2 │ ├── mdp.39015010763418.json.bz2 │ ├── mdp.39015027242315.json.bz2 │ ├── mdp.39015029244657.json.bz2 │ ├── mdp.39015031703609.json.bz2 │ ├── mdp.39015038148048.json.bz2 │ ├── mdp.39015040702071.json.bz2 │ ├── mdp.39015043780249.json.bz2 │ ├── mdp.39015043798936.json.bz2 │ ├── mdp.39015046381565.json.bz2 │ ├── mdp.39015046788223.json.bz2 │ ├── mdp.39015046835560.json.bz2 │ ├── mdp.39015054084192.json.bz2 │ ├── mdp.39015054263903.json.bz2 │ ├── mdp.39015055831070.json.bz2 │ ├── mdp.39015058207492.json.bz2 │ ├── mdp.39015060663583.json.bz2 │ ├── mdp.39015062842383.json.bz2 │ ├── mdp.39015063682309.json.bz2 │ ├── mdp.39015066084040.json.bz2 │ ├── mdp.39015070756609.json.bz2 │ ├── mdp.39015073669205.json.bz2 │ ├── mdp.39015073669312.json.bz2 │ ├── uc1.32106011612402.json.bz2 │ ├── uc1.32106012198112.json.bz2 │ └── uc1.32106017944551.json.bz2 │ └── contemporary_labels.csv ├── writing.md ├── README.md ├── labs ├── Lab 04.ipynb ├── Lab 03.ipynb ├── Lab 05 - Part of Speech Tagging, Starting with Pandas.ipynb ├── Lab 02.ipynb └── Lab 06 - More Pandas and Intro to Classification.ipynb ├── assignments.md ├── examples ├── French-English Classification.ipynb ├── Topic Modelling Trump Tweets.ipynb └── Pivot Example.ipynb └── syllabus.md /images/info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/info.png -------------------------------------------------------------------------------- /images/titanic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/titanic.pdf -------------------------------------------------------------------------------- /images/autofill.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/autofill.png -------------------------------------------------------------------------------- /images/c_argmax.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/c_argmax.png 
-------------------------------------------------------------------------------- /images/freqplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/freqplot.png -------------------------------------------------------------------------------- /images/inner-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/inner-join.png -------------------------------------------------------------------------------- /images/lab5-plot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab5-plot1.png -------------------------------------------------------------------------------- /images/lab5-plot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab5-plot2.png -------------------------------------------------------------------------------- /images/lab5-plot3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab5-plot3.png -------------------------------------------------------------------------------- /images/decisiontree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/decisiontree.png -------------------------------------------------------------------------------- /images/french-words.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/french-words.png -------------------------------------------------------------------------------- /images/lab6-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab6-output.png -------------------------------------------------------------------------------- /data/hvd.hn6ltf.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/hvd.hn6ltf.json.bz2 -------------------------------------------------------------------------------- /images/english-words.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/english-words.png -------------------------------------------------------------------------------- /images/lsa_2dim_tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lsa_2dim_tfidf.png -------------------------------------------------------------------------------- /images/lab6-sawyer-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab6-sawyer-plot.png -------------------------------------------------------------------------------- /images/lab8-group-column.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab8-group-column.png -------------------------------------------------------------------------------- /images/thestand-valence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/thestand-valence.png -------------------------------------------------------------------------------- /images/titanic-zoomed-out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/titanic-zoomed-out.png -------------------------------------------------------------------------------- /data/mdp.49015002392919.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/mdp.49015002392919.json.bz2 -------------------------------------------------------------------------------- /data/uc2.ark13960t1xd0sc6x.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/uc2.ark13960t1xd0sc6x.json.bz2 -------------------------------------------------------------------------------- /data/classification/test/pst.000062491532.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/pst.000062491532.json.bz2 -------------------------------------------------------------------------------- /data/classification/train/pst.000029579440.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/pst.000029579440.json.bz2 -------------------------------------------------------------------------------- /data/classification/train/wu.89104415476.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/wu.89104415476.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/contemporary-pages.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/contemporary-pages.csv.gz -------------------------------------------------------------------------------- /data/classification/test/mdp.39015004295880.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/mdp.39015004295880.json.bz2 -------------------------------------------------------------------------------- /data/classification/test/mdp.39015005725919.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/mdp.39015005725919.json.bz2 -------------------------------------------------------------------------------- /data/classification/test/mdp.39015008815865.json.bz2: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/mdp.39015008815865.json.bz2 -------------------------------------------------------------------------------- /data/classification/test/mdp.39015066049530.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/mdp.39015066049530.json.bz2 -------------------------------------------------------------------------------- /data/classification/test/mdp.39076002736721.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/mdp.39076002736721.json.bz2 -------------------------------------------------------------------------------- /data/classification/train/hvd.32044014292023.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/hvd.32044014292023.json.bz2 -------------------------------------------------------------------------------- /data/classification/train/hvd.32044102860673.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/hvd.32044102860673.json.bz2 -------------------------------------------------------------------------------- /data/classification/train/mdp.39015038910694.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/mdp.39015038910694.json.bz2 -------------------------------------------------------------------------------- /data/classification/train/uiug.30112037882914.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/uiug.30112037882914.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/pst.000023498051.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/pst.000023498051.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/pst.000026748658.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/pst.000026748658.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/pst.000026751405.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/pst.000026751405.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/pst.000044406462.json.bz2: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/pst.000044406462.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/pst.000050069378.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/pst.000050069378.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015005028686.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015005028686.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015010763418.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015010763418.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015027242315.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015027242315.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015029244657.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015029244657.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015031703609.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015031703609.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015038148048.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015038148048.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015040702071.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015040702071.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015043780249.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015043780249.json.bz2 
-------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015043798936.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015043798936.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015046381565.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015046381565.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015046788223.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015046788223.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015046835560.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015046835560.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015054084192.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015054084192.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015054263903.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015054263903.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015055831070.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015055831070.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015058207492.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015058207492.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015060663583.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015060663583.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015062842383.json.bz2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015062842383.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015063682309.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015063682309.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015066084040.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015066084040.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015070756609.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015070756609.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015073669205.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015073669205.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/mdp.39015073669312.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015073669312.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/uc1.32106011612402.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/uc1.32106011612402.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/uc1.32106012198112.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/uc1.32106012198112.json.bz2 -------------------------------------------------------------------------------- /data/contemporary_books/dataset_files/uc1.32106017944551.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/uc1.32106017944551.json.bz2 -------------------------------------------------------------------------------- /data/classification/english_french_class_labels.csv: -------------------------------------------------------------------------------- 1 | book,title,language 2 | hvd.32044014292023,"Alice's adventures in Wonderland ; and, Through the looking-glass / by Lewis Carroll ; with ninety-two illustrations by John Tenniel.",eng 3 | hvd.32044102860673,"Notre Dame de Paris. Abridged and edited, with introd. 
and notes, by John R. Wightman.",fre 4 | mdp.39015038910694,"Moby Dick,",eng 5 | pst.000029579440,The adventures of Huckleberry Finn / by Mark Twain.,eng 6 | uiug.30112037882914,Candide ou L'optimisme.,fre 7 | wu.89104415476,Les liaisons dangereuses / Choderlos de Laclos ;édition publiée d'après le texte original précédée d'une étude sur Choderlos de Laclos et suivie d'une bibliographie par Ad. Van Berver.,fre 8 | mdp.39015004295880,"Les caves du Vatican,",fre 9 | mdp.39015005725919,Madame Bovary de Gustave Flaubert; étude et analyse.,fre 10 | mdp.39015008815865,Jean Barois ...,fre 11 | mdp.39015066049530,The catcher in the rye / J. D. Salinger.,eng 12 | mdp.39076002736721,Catch-22 / Joseph Heller.,eng 13 | pst.000062491532,The lord of the rings / J.R.R. Tolkien.,eng 14 | -------------------------------------------------------------------------------- /data/contemporary_books/contemporary_labels.csv: -------------------------------------------------------------------------------- 1 | book,author,title 2 | mdp.39015005028686,King,The stand 3 | mdp.39015010763418,Atwood,Lady oracle; 4 | mdp.39015027242315,Atwood,The robber bride 5 | mdp.39015029244657,Grisham,The pelican brief 6 | mdp.39015031703609,Grisham,The rainmaker 7 | mdp.39015038148048,King,Desperation 8 | mdp.39015040702071,Atwood,Alias Grace 9 | mdp.39015043780249,King,The girl who loved Tom Gordon 10 | mdp.39015043798936,King,Bag of bones 11 | mdp.39015046381565,Grisham,A time to kill 12 | mdp.39015046788223,Grisham,The rainmaker 13 | mdp.39015046835560,Grisham,The partner 14 | mdp.39015054084192,Grisham,The testament 15 | mdp.39015054263903,King,Everything's eventual : 14 dark tales 16 | mdp.39015055831070,King,From a Buick 8 : a novel 17 | mdp.39015058207492,Grisham,The last juror 18 | mdp.39015060663583,Atwood,The handmaid's tale 19 | mdp.39015062842383,Atwood,The Penelopiad 20 | mdp.39015063682309,King,Cell : a novel 21 | mdp.39015066084040,Grisham,The summons 22 | mdp.39015070756609,Grisham,Playing for pizza 23 | mdp.39015073669205,King,Duma Key 24 | mdp.39015073669312,Grisham,The appeal 25 | pst.000023498051,King,Carrie 26 | pst.000026748658,Atwood,Bodily harm 27 | pst.000026751405,Atwood,Cat's eye 28 | pst.000044406462,Atwood,Life before man 29 | pst.000050069378,Grisham,The king of torts (large print) 30 | uc1.32106011612402,King,The dark half 31 | uc1.32106012198112,King,Stephen King's Danse macabre 32 | uc1.32106017944551,King,Cujo 33 | -------------------------------------------------------------------------------- /writing.md: -------------------------------------------------------------------------------- 1 | # Writing about Text Mining 2 | 3 | The tools and concepts that we learn in this class ultimately serve to answer a research question. 4 | Here, I've collected online writing that uses text mining well to investigate and tell a good story. 5 | Use these as inspiration for your final projects. 6 | 7 | ## Posts / Articles 8 | 9 | - [The Language of the State of the Union](https://www.theatlantic.com/politics/archive/2015/01/the-language-of-the-state-of-the-union/384575/). Benjamin Schmidt and Mitch Fraas. 10 | - [Text analysis of Trump's tweets confirms he writes only the (angrier) Android half](http://varianceexplained.org/r/trump-tweets/). David Robinson. 11 | - [The Birdy Pulpit — Analyzing Trump's Twitter Account](http://www.thecrosstab.com/2017/03/07/analysing-trump-tweets/). George Elliott Morris. 12 | - [Does sentiment analysis work? 
A tidy analysis of Yelp reviews](http://varianceexplained.org/r/yelp-sentiment/). David Robinson. 13 | - [Screen Time!](http://sappingattention.blogspot.com/2014/09/screen-time.html#more). Benjamin Schmidt. 14 | - [How Reddit Talked About The 2016 Presidential Campaign, From ‘Basket Of Deplorables’ To ‘Yuge’](http://fivethirtyeight.com/features/how-reddit-talked-about-the-2016-presidential-campaign-from-basket-of-deplorables-to-yuge/). Ritchie King. 15 | - [The Foreign Language of 'Mad Men'](https://www.theatlantic.com/entertainment/archive/2012/03/the-foreign-language-of-mad-men/254668/). Benjamin Schmidt. 16 | - [The instability of gender](https://tedunderwood.com/2016/01/09/the-instability-of-gender/). Ted Underwood. 17 | 18 | ## Websites / Blogs 19 | 20 | - [Language Log](http://languagelog.ldc.upenn.edu/nll/) 21 | - [Sapping Attention](http://sappingattention.blogspot.com) - Benjamin Schmidt's blog. 22 | - [Variance Explained](http://varianceexplained.org) - Blog by David Robinson. 23 | - [The Crosstab](http://www.thecrosstab.com) - Political data blog. 24 | - [FiveThirtyEight](https://fivethirtyeight.com) - News organization focused on data reporting. 25 | - [The Stone and The Shell](https://tedunderwood.com/) - Blog by Ted Underwood. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text Mining 2 | 3 | [Assignments](assignments.md) | [Lab Worksheets](https://github.com/organisciak/Text-Mining-Course/tree/master/labs) | [Syllabus](syllabus.md) 4 | 5 | ## Overview 6 | 7 | This course introduces students to the knowledge discovery process and methods used to mine patterns from a collection of text. We will critically review text mining methods developed in the knowledge discovery in databases, information science, and computational linguistics communities. Students will develop proficiency with modeling text through individual projects. 8 | 9 | How can computers read? When we look at a paragraph of text, we have a set of skills to understand and interpret it: what is the message? Is it an argument? What is the sentiment? Computers don't have the same context or literacy. Their language is quantitative. Through text mining, this course will equip you with the skills to understand text through computing. 10 | 11 | Text mining is most useful for the new affordances that it allows. In most cases, the tools of text mining aren't meant to replace 'close reading'; they give us new ways to ask questions - about literature, news, scholarship, correspondence, etc. - and are best applied in service of that novelty. Computing allows for: 12 | 13 | - Scale: Computers compare poorly to us in their ability to interpret meaning, but the things they can do may be applied at enormous scales. If you're interested in hundreds of books, thousands of web pages, or millions of tweets, simply reading them is infeasible. 14 | - Re-contextualization: With text mining, you take apart texts and put them together in new ways. This gives you new ways to understand information in a text or appreciate a book. Likewise, breaking text down into data also provides new comparative or critical tools. For example, we can understand what makes Jane Austen's books different from her contemporaries, or attribute authorship for anonymous or pseudonymous writing. 15 | - Summarization: Aggregation, extraction, and visualization all serve to report patterns to you.
For example, text summarization models can extract the takeaway points from a set of medical literature. 16 | A few final notes on course philosophy. 17 | 18 | First, the broad view of text mining can encompass many disciplinary approaches. This course hews closely to the sub-area referred to as text analysis, intended to treat text mining in the service of qualitative questions. This is closest to the treatments in the digital humanities and computational social sciences. 19 | 20 | For this course, you will be expected to learn new programming skills. Note that this is not a programming course. We will cover a subset of skills in Python that pertain to data science. Most of the time, your needs will be served by tinkering with and modifying code examples that I provide for you. 21 | 22 | I understand the time constraints of being a student. To account for the time you will spend in this course learning new tools and writing code, I have tried to keep reading and writing loads reasonable. 23 | 24 | Succeeding in this course is a matter of many little steps. The assignments are small but frequent. If you look at the entire outline of ideas and skills in this course, it may seem overwhelming. However, going one step at a time, learning the language of text mining won't be scary. 25 | 26 | ## Pre- and Co-requisites 27 | An introductory-level database and programming course, or permission of the instructor. 28 | 29 | ## Required Texts 30 | This course incorporates readings from a variety of sources. Readings will be openly accessible and posted on/linked from the course website. In addition to individual essays and papers, we will also return repeatedly to the following texts: 31 | 32 | - [Art of Literary Text Analysis](https://github.com/sgsinclair/alta/tree/master/ipynb) - Stefan Sinclair, 2015- 33 | - [Introduction to Information Retrieval](http://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Manning, Raghavan, and Schütze, 2008 34 | - [Speech and Language Processing](https://web.stanford.edu/~jurafsky/slp3/) 3rd edition. Dan Jurafsky and James H. Martin. 2017. 35 | - [Search Engines: Information Retrieval in Practice](http://ciir.cs.umass.edu/irbook/) - Croft, Metzler and Strohman. 2009. 36 | 37 | ## Schedule 38 | 39 | - Week 1: Introduction 40 | - Week 2: Fundamentals 41 | - Week 3: Features 42 | - Week 4: Text Mining for Art and Criticism 43 | - Week 5: Documentation Access; Natural Language Processing 1 - Part of Speech Tagging 44 | - Week 6: Natural Language Processing 2 - Information Extraction and Dependency Parsing 45 | - Week 7: Classification 1 46 | - Week 8: Classification 2 47 | - Week 9: Clustering 48 | - Week 10: Topic Modelling and Dimensionality Reduction 1 49 | - Week 11: Topic Modelling 2; Sentiment Analysis 50 | - Week 12: Visualization 51 | - Week 13: Word Embeddings 52 | - Week 14: What's Next: Remainder Notes from Text Mining 53 | 54 | The week-to-week syllabus, with readings, slides, and schedule notes, is on the [Syllabus page](syllabus.md). 55 | 56 | ## Assignments 57 | 58 | - 30% Lab Tasks - Due Weekly 59 | - 20% Small Assignments 60 | - 10% - Twitter Bot Assignment 61 | - 10% - Topic Modelling Assignment 62 | - 35% Text Mining Project 63 | - 5% Problem Statement 64 | - 5% Literature review + 5% Data collection 65 | - 20% Final report 66 | - 15% Participation 67 | - 5% Attendance 68 | - 10% Forum posts, comments, class engagement 69 | 70 | Details are on the [Assignments page](assignments.md).
71 | -------------------------------------------------------------------------------- /labs/Lab 04.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Hopefully by this week, you're comfortable with lists (i.e. [a, b, c, ..]) and growing to understand list comprehensions.\n", 8 | "\n", 9 | "Two fundamental Python skills to be aware of. First, there's a general purpose method called `len()` that returns the length of an object, like \"how many items in this list\" or \"how many characters in this string\". e.g." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "3" 23 | ] 24 | }, 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "l = ['hello', 'text', 'mining']\n", 32 | "len(l)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "11" 46 | ] 47 | }, 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "len(\"Text Mining\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "There is also an object called a `set`, which is like a list, but without an ordering and only allowing unique elements. This is useful for us, because it gives a quick way to see just the unique words of a list: the vocabulary." 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "List: ['Buffalo', 'buffalo', 'Buffalo', 'buffalo', 'buffalo', 'Buffalo', 'buffalo', 'buffalo']\n", 76 | "Set: {'Buffalo', 'buffalo'}\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "l = ['Buffalo', 'buffalo', 'Buffalo', 'buffalo', 'buffalo', 'Buffalo', 'buffalo', 'buffalo']\n", 82 | "s = set(l)\n", 83 | "print(\"List:\", l)\n", 84 | "print(\"Set:\", s)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "# Normalization" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "This week, follow along with [Searching for Meaning](https://github.com/sgsinclair/alta/blob/41f389f3d9708573c44c883bcd95fd16bad54a24/ipynb/SearchingMeaning.ipynb) from the Art of Literary Text Analysis.\n", 99 | "\n", 100 | "Use the trimmed version of Frankenstein from last week to try some of the concepts in the chapter. This should get you up to speed." 
101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "85440" 114 | ] 115 | }, 116 | "execution_count": 1, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "import nltk\n", 123 | "with open('../data/frankenstein.txt') as f:\n", 124 | " frankensteinString = f.read()\n", 125 | "frankensteinTokens = nltk.word_tokenize(frankensteinString)\n", 126 | "cleanedTokens = [word.lower() for word in frankensteinTokens if word[0].isalpha()]\n", 127 | "len(frankensteinTokens)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Note, that there are 85440 tokens in the text. If we count just the _unique_ words (the _vocabulary size_), we find 7038:" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": false, 142 | "scrolled": true 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "7038" 149 | ] 150 | }, 151 | "execution_count": 2, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "len(set(cleanedTokens))" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "**Questions**\n", 165 | "\n", 166 | "- 1) Create a lemmatized version of cleanedTokens and count the unique lemmas. Share the code to do this: the answer that it gives you should be 6416.\n", 167 | "- 2) Re-do the lemmatization after stopping words against the default NLTK stoplist, and tabulate the top ten words. Paste the code and output.\n", 168 | "- 3) How does the tabulation of lemmas differ from the tabulation of the non-lemmatized (but still stopped and case-folded) tokens?\n", 169 | "- 4) What are the WordNet synsets for 'monster'?\n", 170 | "- 5) A synset has a method called `definition()`. Noting that the code for Q4 resulted in a list, write a list comprehension to extracts all the definitions for each synset. Share the code and output.\n", 171 | "- 6) Each synset is a child of a more general synset. For example, `crab` is an example of a `decapod_crustacean`, which is more generally a `crustacean`, and so on. You can get at the paths to the root of this tree with `hypernym_paths()`. Paste the code and hypernym path for `freak.n.01`.\n", 172 | "- 7) (for 2 points): We've already seen some corpora that NLTK can pull in, from the complex WordNet information to a basic stoplist. Using the NLTK information on male/female names, determine and paste in the unique female names in Frankenstein. This isn't in the ALTA book, but searching Google sometimes helps ;)\n", 173 | "\n", 174 | "Our copy of Frankenstein is from Project Gutenberg, a collection of transcriptions of public domain (i.e. legally shareable) books. 
NLTK offers a small selection of those books through `nltk.corpus.gutenberg`.\n", 175 | "\n", 176 | "Load the gutenberg corpus and convert it to what NLTK calls a TextCollection:" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "from nltk.text import TextCollection\n", 188 | "gutenberg_docs = nltk.corpus.gutenberg\n", 189 | "gutenberg_collection = TextCollection(gutenberg_docs)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "Using a method of `gutenberg_collection` (remember auto-complete!), answer the final two-part question.\n", 197 | "\n", 198 | "Questions:\n", 199 | " \n", 200 | " - 8) For 2 points:\n", 201 | " - What is the TFIDF for 'monster' in Frankenstein? You'll need the original (unnormalized) tokens.\n", 202 | " - What word has the highest TF-IDF for the following: 'miserable', 'horror', 'monster'? If you need it, you can compare numbers in python with > (greater than) or < (less than)." 203 | ] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.5.1" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 0 227 | } 228 | -------------------------------------------------------------------------------- /labs/Lab 03.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Week 3 Lab Task\n", 8 | "\n", 9 | "## More Jupyter Tips\n", 10 | "\n", 11 | "Hopefully by this week, you are growing more comfortable with starting Jupyter Notebooks and adding/editing cells. Remember that the keyboard shortcuts are invaluable: running a cell with `Ctrl+Enter`, or adding a new cell below with `B` (in command mode).\n", 12 | "\n", 13 | "Two tricks to try this week: autocompletion and retrieving documentation.\n", 14 | "\n", 15 | "**Autocomplete**\n", 16 | "\n", 17 | "If you start typing a known object or function into Jupyter, you can press `TAB` to finish it. This is especially useful for seeing what functions are available." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "test = \"this is a string\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Above, I've set a string to `test`. If I type `te` then press tab, it will complete the word. This is especially useful for long variable names that you don't want to keep typing. Note that it only completed because there no other options: in that case, there's a scrollable list of candidates for what you might be looking for.\n", 36 | "\n", 37 | "The `test` variable is a string. Last week, we saw a two functions that can be performed on strings: `split()` and `join()`. If you would like to see what other options there are for strings, try typing `test.` then press TAB. 
Magic!\n", 38 | "\n", 39 | "![Auto-fill](../images/autofill.png)\n", 40 | "\n", 41 | "** Documentation reference **\n", 42 | "\n", 43 | "If you want to look up information about a function, you can precede the code running that function with a `?`. For example, if I want to learn how I would use `split()` on `test`, I can type:" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "?test.split()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "This will open a panel that looks like this in Jupyter:\n", 62 | "\n", 63 | "![Info](../images/info.png)\n", 64 | "\n", 65 | "The documentation is only as good as what the library is documented, so some libraries might be more or less detailed in this feature.\n", 66 | "\n", 67 | "*Questions*\n", 68 | "\n", 69 | "- 1) What does `test.isalpha()` do? Copy the documentation string.\n", 70 | "- 2) Strings have access to a function (whose name starts with a `ce`) that will let you change \"HEADING\" to \"====HEADING====\" (that is, padding with `=` to make the string 15 characters wide). What's the code to do that? (tip: this is an auto-fill question!)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Intro to the NLTK\n", 78 | "\n", 79 | "This week we'll start using the Natural Language toolkit. For the remaining questions, follow along with:\n", 80 | "\n", 81 | "- [Getting NLTK for Text Processing](https://github.com/sgsinclair/alta/blob/2acb6ed09f298f631e4025d33f062f980758a1ce/ipynb/GettingNltk.ipynb), Art of Literary Text Analysis\n", 82 | "\n", 83 | "Two notes. First, the tutorial suggests downloading \"all\" packages. However, install the packages from 'book' should be sufficient for now.\n", 84 | "\n", 85 | "Also, skip the text processing section, which deals with automatically downloading and cleaning a book. Instead, download this [already-cleaned version of Mary Shelley's Frankenstein](https://raw.githubusercontent.com/organisciak/Text-Mining-Course/master/data/frankenstein.txt), put it into the same folder as your notebook, and load it as follows:" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "with open(\"../data/frankenstein.txt\") as f:\n", 97 | " frankensteinString = f.read()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Here's a quick way of viewing part of our string: the first 250 characters. Notice that you can select subsets of strings like you select subsets of lists. " 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "'Letter 1\\n\\nSt. Petersburgh, Dec. 11th, 17--\\n\\nTO Mrs. Saville, England\\n\\nYou will rejoice to hear that no disaster has accompanied the\\ncommencement of an enterprise which you have regarded with such evil\\nforebodings. 
I arrived here yesterday, and my'" 118 | ] 119 | }, 120 | "execution_count": 114, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "frankensteinString[0:250]" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | " > Side-note for the Python novice: you don't actually need the zero in [0:250]. If left blank, like '[:250]`, Python will assume \"from the very start\", which is the same as using a 0. If you leave the second part blank, Python will assume \"until the very end\".\n", 134 | " \n", 135 | "For the rest of the ALTA chapter, follow along using `frankensteinString` string instead of `goldBugString`." 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "__Questions__\n", 143 | "\n", 144 | " - 3) Use the `word_tokenize` function on Frankenstein, as shown in ALTA. What are tokens 39:67? Hint: this is a full sentence. Include your code.\n", 145 | " - 4) Create a sample of only the tokens where the first character is an alphabetical character. In this sample, what are tokens 1215:1221? Again, this will be a sentence, but won't include punctuation as tokens. Include your code.\n", 146 | " \n", 147 | " _For the next questions use the list of tokens that start with an alphabetical character._\n", 148 | " \n", 149 | " \n", 150 | " - 5) What are the ten most frequent words in this book? Create a frequency distribution of the words from question 4, then tabulate the top 10 words. Include your code.\n", 151 | " - 6) After case-folding, what are the ten most frequent words in this book? Include your code.\n", 152 | " - 7) Rewrite this list comprehension as a `for` loop (what ALTA called technique 1): `[word for word in listOfWords if word.find('-') >= 0]`. No output necessary, just the code, but feel free to test it out.\n", 153 | " - 8) We're going to use a customized stoplist. First, load the NLTK stoplist, and add the words 'could', 'would', 'upon', and 'yet' to the stoplist. What are the top ten case-folded words when stopping against the stoplist. Include your code and paste the tabulated output.\n", 154 | " \n", 155 | "Using the autocomplete in Jupyter, you may notice that a list of tokens converted to a `FreqDist` object has more methods than just `tabulate()`. One really cool one is `plot()`.\n", 156 | "\n", 157 | "`plot` gives you a visualization of the top frequency words. However, you may notice that if you try to run it, the visualization doesn't show up.\n", 158 | "\n", 159 | "It _is_ created, but Jupyter just doesn't know that you want the visualization shown _within_ the notebook. To turn that option on, run the following line of code:" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "%matplotlib inline" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "This is only necessary once: it tells Jupyter to show plots 'inline' (ie. inside the notebook).\n", 178 | "\n", 179 | "**Questions**\n", 180 | "\n", 181 | "- 9) Write the code to plot the top forty stoplisted, lowercase words (from question 8). And again, remember the docs! 
The output will look similar to this:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "![test](../images/freqplot.png)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "- 10) Enter the first 5 concordances for the word \"monster\" in the original token list - the list straight from word_tokenize that included punctuation and numbers - narrowing the search to a 49-characters window. Include the code. Tip: See the docs for the concordance tool in Jupyter." 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.5.1" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 0 220 | } 221 | -------------------------------------------------------------------------------- /labs/Lab 05 - Part of Speech Tagging, Starting with Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lab 5" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Part of Speech Tagging\n", 15 | "\n", 16 | "And tuples!\n", 17 | "\n", 18 | "Here is how you tag parts-of-speech with NLTK:" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "[('And', 'CC'),\n", 32 | " ('now', 'RB'),\n", 33 | " ('for', 'IN'),\n", 34 | " ('something', 'NN'),\n", 35 | " ('completely', 'RB'),\n", 36 | " ('different', 'JJ')]" 37 | ] 38 | }, 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "import nltk\n", 46 | "text = \"And now for something completely different\"\n", 47 | "tokens = nltk.word_tokenize(text)\n", 48 | "tagged = nltk.pos_tag(tokens)\n", 49 | "tagged" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "The output of `pos_tag` is a list of objects called 'tuples'. 
You can access a tuple by index or you can easily expand it into multiple variables:" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "('England', 'PRP')" 70 | ] 71 | }, 72 | "execution_count": 16, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "test_tuple = ('England', 'PRP')\n", 79 | "test_tuple" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": false 87 | }, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "'England'" 93 | ] 94 | }, 95 | "execution_count": 17, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "test_tuple[0]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "'England'" 115 | ] 116 | }, 117 | "execution_count": 18, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "word, pos = test_tuple\n", 124 | "word" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Since you can expand tuples easily, you can name the parts of a tuple in a list comprehension. Note in the following example that we follow a `for x, y in list_of_tuples` pattern instead of `for x in list_of_tuples` as we've encountered before: " 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "['CC', 'RB', 'IN', 'NN', 'RB', 'JJ']" 145 | ] 146 | }, 147 | "execution_count": 5, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "[tag for word, tag in tagged]" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "The `for x,y in list_of_tuples` approach also works for in for loops.\n", 161 | "\n", 162 | "Tuples don't need to have only two values in Python, but that's the most common. " 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "**Q1**: How do you get a list of all the singular proper nouns tagged by NLTK in Frankenstein? Share just the code.\n", 170 | "\n", 171 | "Part of Speech tag definitions are at [Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html). To double check, the output of your code should start with `['St.', 'Petersburgh', 'Dec.', 'TO', 'Mrs.']`. For Python beginners, note that comparing strings is done with `==`, as in `string == string2`." 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "# Getting started\n", 183 | "with open(\"../data/frankenstein.txt\") as f:\n", 184 | " frank_string = f.read()\n", 185 | "frank_tokens = nltk.word_tokenize(frank_string)\n", 186 | "\n", 187 | "# ... what's next?" 
188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "# Getting into Pandas and the HTRC Extracted Features\n", 195 | "\n", 196 | "For the rest of the lab, follow along with [Text Mining in Python with the HTRC Feature Reader](http://programminghistorian.org/lessons/text-mining-with-extracted-features) up to and including \"Selecting Subsets of a DataFrame by a Condition\" (i.e. stop when you see 'Slicing DataFrames'). This tutorial will introduce you to two things:\n", 197 | " 1. The HTRC Extracted Features Dataset, which we discussed last week.\n", 198 | " 2. A library called Pandas, an important part of our toolkit moving forward.\n", 199 | "\n", 200 | "You'll be able to skim many of the early parts of the tutorial, since you've already learned those skills. Don't overlook \"Installing the HTRC Feature Reader\", though." 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "*Questions*\n", 208 | "\n", 209 | "I've posted an HTRC Extracted Features file: [mdp.49015002392919.json.bz2]( https://github.com/organisciak/Text-Mining-Course/blob/master/data/mdp.49015002392919.json.bz2). Use the Feature Reader library to answer the following questions about that file:\n", 210 | "\n", 211 | "**Q2**: What is the title of the book?\n", 212 | "\n", 213 | " 1. 'The adventures of Tom Sawyer, by Mark Twain (Samuel L. Clemens)...'\n", 214 | " 2. 'Frankenstein : or, The modern Prometheus.'\n", 215 | " 3. 'June / by Edith Barnard Delano ; with illustrations.'\n", 216 | " 4. 'Anne of Green Gables / L.M. Montgomery.'\n", 217 | " 5. None of the above." 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "**Q3**: What is the URL to read this book online at the HathiTrust Digital Library?" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "**Q4**: Which of these charts is the plot of tokens/page across the entire book?\n", 232 | "\n", 233 | "1. ![](../images/lab5-plot1.png)\n", 234 | "2. ![](../images/lab5-plot2.png)\n", 235 | "3. ![](../images/lab5-plot3.png)\n", 236 | "5. None of the above." 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "source": [ 245 | "**Q5**: How do you get the word frequencies for the header throughout the book? (Not changing any of the other default arguments)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "**Q6**: How do you get the count of each word in the body of the text for the entire book, not worrying about pages or parts of speech? Share your code. *Hint: the length of the correct output (i.e. `len(object_from_your_answer)`) is `9267`.*" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "**Q7**: In the output from Q6, which of the following parts are indexes or columns?\n", 260 | "\n", 261 | "*section*: index, column, or doesn't exist\n", 262 | "\n", 263 | "*word*: index, column, or doesn't exist\n", 264 | "\n", 265 | "*token*: index, column, or doesn't exist\n", 266 | "\n", 267 | "*count*: index, column, or doesn't exist" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "**Q8**: Setting the output to Q6 to a variable called `tl`, what is the line of code to sort values in descending order? 
To figure out the answer, you can try searching online about sorting in Pandas, or try auto-complete and documentation lookup in Jupyter to see what `tl` can do and how.\n", 275 | "\n", 276 | "If it is ordered correctly, the top words will be '`,`' (4934 occurrences), '`.`' (3866), and '`the`' (3320)." 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "**Q9**: Here is a list of words that show up 64 times in `tl`: [can, face, seemed, where]. What other words with a count of 64 are missing?\n", 284 | "\n", 285 | " - than\n", 286 | " - hand\n", 287 | " - want\n", 288 | " - heart" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "What if we wanted to work with our text in a DataFrame? Here's how you would convert the list of part-of-speech tagged tuples into a DataFrame, where I called my initial list `frank_tagged`:" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": false 303 | }, 304 | "outputs": [ 305 | { 306 | "data": { 307 | "text/html": [ 308 | "
\n", 309 | "\n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | "
\n", 340 | "
" 341 | ], 342 | "text/plain": [ 343 | " word pos\n", 344 | "0 Letter NN\n", 345 | "1 1 CD\n", 346 | "2 St. NNP\n", 347 | "3 Petersburgh NNP" 348 | ] 349 | }, 350 | "execution_count": 96, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "import pandas as pd\n", 357 | "frank_df = pd.DataFrame(frank_tagged, columns=['word', 'pos'])\n", 358 | "frank_df.head(4)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "Pandas is - by convention, not rule - imported with the name `pd`. Note that we gave the columns names.\n", 366 | "\n", 367 | "**Q10**: What code would you use on `frank_df` to get the singular proper nouns? It should give you 1371 rows." 368 | ] 369 | } 370 | ], 371 | "metadata": { 372 | "kernelspec": { 373 | "display_name": "Python 3", 374 | "language": "python", 375 | "name": "python3" 376 | }, 377 | "language_info": { 378 | "codemirror_mode": { 379 | "name": "ipython", 380 | "version": 3 381 | }, 382 | "file_extension": ".py", 383 | "mimetype": "text/x-python", 384 | "name": "python", 385 | "nbconvert_exporter": "python", 386 | "pygments_lexer": "ipython3", 387 | "version": "3.5.1" 388 | } 389 | }, 390 | "nbformat": 4, 391 | "nbformat_minor": 0 392 | } 393 | -------------------------------------------------------------------------------- /assignments.md: -------------------------------------------------------------------------------- 1 | # Assignments 2 | 3 | ## Lab Tasks 4 | 5 | _30% - Weekly_ 6 | 7 | Lab tasks are meant to exercise a particular practical skill from our lectures. 8 | 9 | There are 9 lab exercises, evenly marked (each 3% of mark). Labs are marked on a scale of 0-10. Sometimes, just doing the task is a 10/10; other times the mark is divided across sub-tasks. 10 | 11 | **Due**: 1 hour before the following week's class. 12 | 13 | ## Small Assignments 14 | 15 | - _20% - Two Assignments_ 16 | - _10% - Twitter Bot Assignment_ 17 | - _10% - Topic Modelling Assignment_ 18 | 19 | The small assignments are in lieu of lab tasks for their weeks. They differ in that you have more than 1 week to complete them, and their value is slightly higher. 20 | 21 | ### Twitter Bot Assignment 22 | 23 | This project asks you to create a simple, rule-based Twitter Bot. There is no need to actually put it online: this is optional. 24 | 25 | We'll use Cheap Bots Done Quick, powered by Tracery. To get started, follow along with [Your First Twitter Bot, in 20 minutes](https://sense.porganized.com/your-first-twitter-bot-in-20-minutes-35b2c610482d#.uz41sqy0k). Note that you can use any sources for data that you like; Corpora is merely one suggestion from the tutorial. 26 | 27 | Bot topic: Have fun with this. Your bot can be silly, avant-garde, activist. You may notice that there are limitations to Tracery: some things are really easy to do, but complex programming logic is not possible. How can you work within these restrictions? 28 | 29 | One possibility to make a good bot is to apply what you've learned so far to prepare good source material. For example, can you use concordances or bigrams from a book to learn phrases that represent a writer that you want your bot to mimic? If you want to create realistic-sounding but nonsensical sentences, look ahead to next week's Natural Language for Programmers reading by Liza Daly, which introduces Context Free Grammar, and the lab reading on identifying Parts of Speech with NLTK.
30 | 31 | __Week 5: Post your idea for a Twitter bot on the Twitter Assignment forum.__ 32 | 33 | This can be brief: we want you thinking what can be done, and seeing what your colleagues are thinking about. 34 | 35 | __Week 6: Twitter bot due - describe it on the forum at least 1 hour before class.__ 36 | 37 | Post a short write-up (up to 600 words) about your bot and what motivated it. What issues did you run into? Include at least 10 randomly generated messages. If you put the bot up on Twitter (optional) share the handle*. 38 | 39 | Finally, attach a text file with the full JSON for your bot. 40 | 41 | The assignment will be marked out of 10, on the quality of the idea, the depth of the execution, and the clarity and quality of the written post. 1 point is automatically assigned for submitting the draft. 42 | 43 | Engaging with colleagues in discussing their draft ideas will contribute to your participation mark. 44 | 45 | *Also, make sure to identify it as a bot in the bio, a convention that many botmakers follow for ethical reasons. 46 | 47 | ### Topic Modeling Assignment 48 | For the topic modeling assignment, you'll use MALLET to perform text analysis on a collection of your choice. 49 | 50 | MALLET is a Java-based toolkit for machine learning, including a module for Topic Modeling with Latent Dirichlet Allocation. To install and learn to use MALLET, follow along with [Getting Started with Topic Modeling and MALLET](http://programminghistorian.org/lessons/topic-modeling-and-mallet). As with the Twitter Bot Assignment, there is some self-directed learning with this assignment. 51 | 52 | 1. Find a good research question 53 | 54 | Decide on an interesting set of texts to learn from. You can try something new, but you're also welcome to build on an idea from earlier in the course (e.g. from the Voyant lab) or do a preliminary version of something from your final lab. 55 | 56 | What do you hope to explore? A better grade is rewarded for an idea that is appropriate for unsupervised learning, and for which learning topics can be insightful. 57 | 58 | Are you stuck? Think about the genres of texts that we've seen in this class and in your colleagues' assignments: books, emails, tweets, lyrics, scripts, letters. Many of these can lead to an interesting idea. Think also about the examples we discussed in class. 59 | 60 | Topic modeling is well suited for cases with many short texts. Since it learns from co-occurrence, you want your training texts to conceivably be about the same thing at the start as at the end. For example, modeling pages will give you better topics than modeling books. 61 | 62 | Tip: When deciding on a research question, think ahead to the data collection step. Choose something that won't make this short assignment into a long one; remember that you'll have a final project to work on a more complex project if you want to return to topic modeling. 63 | 64 | 2. Find and prepare your source texts 65 | 66 | Getting data is hard! You'll need to be resourceful. 67 | 68 | MALLET needs input files that are structured either as a set of text files or one long text file, with each document on a different line. How do you collect your data? Can you find it already prepared? Do you have to do any cleaning? 69 | 70 | Note that topic modeling is a bag-of-words approach, so if you want to use books from the HTRC Extracted Features dataset, you can write out tokens to files in a random order. 71 | 72 | 3. 
Build a topic model and write about it 73 | 74 | Build a model and post about it on the Topic Modeling submission forum. Discuss both what you see in the output and what your process for building the models was. 75 | 76 | Some possible questions to answer: 77 | 78 | - What was your goal? Did topic modeling help? 79 | - What is interesting about your topics? Do they match what you expect? If not, what looks peculiar? 80 | - Which topics stand out? Which topics seem to be junk? 81 | - How many topics did you choose? Why? Did you try alternate parameterizations? 82 | - How did you collect your data? Were there headaches or necessary workarounds? 83 | - What new research questions does this assignment inspire? If you had time for a bigger project, what would it be? 84 | 85 | _Due: Week 12._ 86 | 87 | Grading - out of 10 88 | 89 | - /2 - The Research question 90 | - /2 - Data collection 91 | - /6 - Forum post: quality of analysis and discussion of process 92 | 93 | ## Text Mining project 94 | 95 | The final project is a culmination of your text mining expertise. You'll be putting your text mining skills to work. Up to now, the assignments have been method-based, where we tell you what to use and you find a problem for it. This time, you choose a problem to explore, formulate it as a research question, determine the methods to address it, and use it as part of a larger narrative. 96 | 97 | The project has 3 components: 98 | 99 | - 5% Problem statement: Due Week 12. 100 | - 5% Literature review + 5% Data collection: Due Week 13. 101 | - 20% Final report: Due one week after final class. 102 | 103 | We'll still be learning methods that you may want to use throughout April, though the most prominent ones have already been covered: classification, clustering, topic modeling, stoplisting, concordances, part of speech tagging, document similarity etc. 104 | 105 | A few more that we'll learn: feature selection - identifying the most discriminatory words in a collection; word embeddings - understanding the contexts of words and the relationships between them, removed from the document context; more document similarity methods; visualization, for better understanding what is going on in an analysis and for communicating it to others; sentiment analysis, for mining the opinions of texts. 106 | 107 | Tell a compelling story. Remember that the complexity of the tool is not as important as its appropriateness. For example, sometimes the top word frequencies are enough to make your point. 108 | 109 | Here are some random research questions, alongside ways to ask them. 110 | 111 | - RQ: Is there a specific language that belies conservative or liberal partisan media? Possible approaches: sentiment mining; term-weighted top frequencies. 112 | - RQ: What characterizes my style in writing email? Possible approaches: topic modeling sent messages; building a classifier for time of day and seeing which words are notable for each class; visualizing sent email lengths. 113 | - RQ: How do Alec Baldwin and Stephen Baldwin use Twitter differently? Possible approaches: Term frequencies and concordances; classification (notable features via decision trees?); topic modeling; dimensionality reduction to 2 dimensions and observing outliers via scatter plot. 114 | - RQ: Are there underlying trends motivating baby naming? Possible methods: classification or logistic regression using character patterns as features (e.g. last letter of the name, second letter, etc.). 115 | - RQ: What do characters on Game of Thrones talk about?
Possible methods: TF-IDF over scripts, compared to a [general language reference for IDF](https://www.ideals.illinois.edu/handle/2142/89691). 116 | - RQ: How do people talk about food? Possible methods: identify types of food in a [dataset of food reviews](https://www.kaggle.com/snap/amazon-fine-food-reviews), and look at terms that co-occur with them; topic model reviews and see which topics are most prominent for different words. 117 | Good luck. We've been very impressed by the quality of your project ideas this term, and look forward to seeing what you come up with. 118 | 119 | ### Details 120 | 121 | #### Problem Statement (Week 12): 122 | 123 | Develop your idea and share a description of what you hope to do, what methods you hope to use, and early ideas for getting the data. Post it on the Problem Statement forum. Max 400 words. 124 | 125 | #### Literature Review and Data Collection (Week 13): 126 | 127 | Find examples of other people pursuing similar questions or using similar methods, and tell us about them. This will require some self-directed reading and searching. This isn't a lofty academic literature review, so you do not need to worry about how formal the literature is: it can include forum posts, blogs, news articles. (Tips: finding information online can be tricky - don't be afraid to share cool sources with your classmates or ask for advice on the forums). 128 | 129 | For the data collection section, we want to see that you've started trying to compile your data. If it's been easy, tell us more about the data. If it's been hard, tell us about the problems that you're running into, and whether you've had to adapt from your original problem. 130 | 131 | Post these parts on the Final Project Lit + Data forum. 132 | 133 | #### Final Report (One week after final class) 134 | 135 | For the final report, write about your findings. Structure a narrative about what you hoped to do, how you pursued it, and what you found. 136 | 137 | Think of the report as a piece for the portfolio: to show your text mining skills to future employers while demonstrating your ability to communicate the results. Tell us about how you addressed your problem. You want to catch the reader's interest by having the most intriguing points summarized at the start, then give us the details: what analyses you ran, what subquestions you asked, what you saw. When appropriate, use tables or visualizations. 138 | 139 | Below is our suggested structure. This is not a research paper, so you do not need to use these headings or structure. Rather, it's just a set of guidelines. Foremost, structure your report so it is easy to read for a non-expert. 140 | 141 | 142 | 143 | 1. Introduction 144 | 145 | Provide a high-level explanation of what you did and the main interesting points. This is the section that convinces us to read further. 146 | 147 | 148 | For the next three parts, reuse your text from the Problem Statement, Literature Review, Data Collection, editing it as you might see fit. 149 | 150 | 2. Problem 151 | 152 | 3. Related Work 153 | 154 | 4. Data 155 | 156 | 5. Findings 157 | 158 | Tell your data's story! 159 | 160 | 6. Conclusions and Next Steps 161 | 162 | What was most salient or intriguing? What interesting new questions came out of your project? What else can be done? 163 | 164 | __How to submit Final Report__ 165 | 166 | Post the final report on the submission forums, in one of the following ways: 167 | 168 | - Written directly in Moodle, as a post 169 | - Linked to a blog post (e.g.
on Medium) 170 | - Linking to a Jupyter notebook (e.g. on Github) 171 | - Attaching a Jupyter Notebook (make sure you check that any images are included) 172 | 173 | __Inspiration for writing__ 174 | 175 | Here's a list of good [writing about text or data mining](https://github.com/organisciak/Text-Mining-Course/blob/master/writing.md) for inspiration. Share your own examples on the forums. 176 | 177 | ## Participation 178 | 179 | - 15% of mark 180 | - 5% Attendance 181 | - 10% Forum posts, comments, class engagement 182 | 183 | ## Late Policy 184 | 185 | - Lose 10% per day, up to 50%. Late is better than never. 186 | - 2 late 'freebies': We won't count late marks for two labs, because sometimes life gets in the way. 187 | 188 | - Last day for late assignments: 189 | - Labs: Turn in by May 3rd 190 | - Anything else: May 8th 191 | -------------------------------------------------------------------------------- /examples/French-English Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from htrc_features import FeatureReader\n", 12 | "import glob\n", 13 | "from nltk import word_tokenize\n", 14 | "import pandas as pd" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "hvd.32044014292023 \t eng \t http://hdl.handle.net/2027/hvd.32044014292023 \t Alice's adventures in Wonderland ; and, \n", 29 | "hvd.32044102860673 \t fre \t http://hdl.handle.net/2027/hvd.32044102860673 \t Notre Dame de Paris. Abridged and edited\n", 30 | "mdp.39015038910694 \t eng \t http://hdl.handle.net/2027/mdp.39015038910694 \t Moby Dick,\n", 31 | "pst.000029579440 \t eng \t http://hdl.handle.net/2027/pst.000029579440 \t The adventures of Huckleberry Finn / by \n", 32 | "uiug.30112037882914 \t fre \t http://hdl.handle.net/2027/uiug.30112037882914 \t Candide ou L'optimisme.\n", 33 | "wu.89104415476 \t fre \t http://hdl.handle.net/2027/wu.89104415476 \t Les liaisons dangereuses / Choderlos de \n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "paths = glob.glob('../data/classification/*bz2')\n", 39 | "fr = FeatureReader(paths)\n", 40 | "\n", 41 | "for vol in fr.volumes():\n", 42 | " print(vol.id, '\\t', vol.language, '\\t', vol.handle_url, '\\t', vol.title[:40])" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Collect the token counts for French and English separately." 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/html": [ 62 | "
\n", 63 | "\n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | "
\n", 105 | "
" 106 | ], 107 | "text/plain": [ 108 | " token count language\n", 109 | "0 ! 573 fre\n", 110 | "1 !..i 1 fre\n", 111 | "2 !je 1 fre\n", 112 | "3 \" 12 fre\n", 113 | "4 \"de 1 fre" 114 | ] 115 | }, 116 | "execution_count": 24, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "tl = vol.tokenlist(pages=False, pos=False).head().reset_index()\n", 123 | "tl['language'] = vol.language\n", 124 | "tl[['token', 'count', 'language']].head()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "book_dfs = []\n", 136 | "classes_count = {'eng': 0, 'fre': 0}\n", 137 | "\n", 138 | "for vol in fr.volumes():\n", 139 | " tl = vol.tokenlist(pages=False, pos=False, case=False).reset_index()\n", 140 | " classes_count[vol.language] += 1\n", 141 | " tl['language'] = vol.language\n", 142 | " book_dfs.append(tl[['lowercase', 'count', 'language']])" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "corpus = (pd.concat(book_dfs)\n", 154 | " .groupby(by=['language', 'lowercase']).sum()\n", 155 | " )" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": false 163 | }, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "eng 0.5\n", 169 | "fre 0.5\n", 170 | "dtype: float64" 171 | ] 172 | }, 173 | "execution_count": 4, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "# P(c)\n", 180 | "p_c = pd.Series(classes_count) / len(paths)\n", 181 | "p_c" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "Next, we want to sum up the counts for the entire class, so each language x word only has one, total sum:" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": false, 196 | "scrolled": true 197 | }, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/html": [ 202 | "
\n", 203 | "\n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | "
\n", 247 | "
" 248 | ], 249 | "text/plain": [ 250 | " count P(w|c)\n", 251 | "language lowercase \n", 252 | "eng ! 2230 0.006570\n", 253 | " !' 1 0.000003\n", 254 | " !1 1 0.000003\n", 255 | " !33 1 0.000003\n", 256 | " !«lm 1 0.000003" 257 | ] 258 | }, 259 | "execution_count": 32, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "corpus['P(w|c)'] = corpus.groupby(level='language').transform(lambda word: word / word.sum())['count']\n", 266 | "corpus.head()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "collapsed": false 274 | }, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/html": [ 279 | "
\n", 280 | "\n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | "
\n", 396 | "
" 397 | ], 398 | "text/plain": [ 399 | " count P(w|c)\n", 400 | "lowercase \n", 401 | ", 22737 0.066985\n", 402 | "the 15962 0.047025\n", 403 | "and 11356 0.033456\n", 404 | ". 10732 0.031617\n", 405 | "\" 7446 0.021936\n", 406 | "to 6611 0.019476\n", 407 | "a 6529 0.019235\n", 408 | "of 5857 0.017255\n", 409 | "i 5581 0.016442\n", 410 | "it 5077 0.014957\n", 411 | "; 4559 0.013431\n", 412 | "in 4312 0.012703\n", 413 | "was 3596 0.010594\n", 414 | "that 3392 0.009993\n", 415 | "he 3048 0.008980\n", 416 | "you 2959 0.008717\n", 417 | "'s 2609 0.007686\n", 418 | "n't 2608 0.007683\n", 419 | "but 2294 0.006758\n", 420 | "! 2230 0.006570" 421 | ] 422 | }, 423 | "execution_count": 15, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "corpus.loc[('eng')].sort_values('count', ascending=False).head(20)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "For estimating P(w|c), divide each per-class count by the total words in that class." 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": { 443 | "collapsed": false 444 | }, 445 | "outputs": [ 446 | { 447 | "data": { 448 | "text/plain": [ 449 | "['bonjour']" 450 | ] 451 | }, 452 | "execution_count": 35, 453 | "metadata": {}, 454 | "output_type": "execute_result" 455 | } 456 | ], 457 | "source": [ 458 | "string_to_classify = \"bonjour\"\n", 459 | "relevant_tokens = word_tokenize(string_to_classify.lower())\n", 460 | "relevant_tokens" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": { 467 | "collapsed": false 468 | }, 469 | "outputs": [ 470 | { 471 | "data": { 472 | "text/plain": [ 473 | "language\n", 474 | "eng 7.588625e-19\n", 475 | "fre 1.814643e-22\n", 476 | "Name: P(w|c), dtype: float64" 477 | ] 478 | }, 479 | "execution_count": 8, 480 | "metadata": {}, 481 | "output_type": "execute_result" 482 | } 483 | ], 484 | "source": [ 485 | "classified = (corpus.loc[(slice(None), relevant_tokens),]\n", 486 | " .groupby(level='language')['P(w|c)'].prod()\n", 487 | " )\n", 488 | "classified" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "Now, though it doesn't matter when the same classes were seen equally, remember to multiple by P(c):" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": { 502 | "collapsed": false 503 | }, 504 | "outputs": [ 505 | { 506 | "data": { 507 | "text/plain": [ 508 | "language\n", 509 | "eng 3.794313e-19\n", 510 | "fre 9.073217e-23\n", 511 | "dtype: float64" 512 | ] 513 | }, 514 | "execution_count": 9, 515 | "metadata": {}, 516 | "output_type": "execute_result" 517 | } 518 | ], 519 | "source": [ 520 | "classified * p_c" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "Sort that, to make it more clear:" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "metadata": { 534 | "collapsed": false 535 | }, 536 | "outputs": [ 537 | { 538 | "data": { 539 | "text/plain": [ 540 | "language\n", 541 | "eng 3.794313e-19\n", 542 | "fre 9.073217e-23\n", 543 | "dtype: float64" 544 | ] 545 | }, 546 | "execution_count": 10, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "(classified * p_c).sort_values(ascending=False)" 553 | ] 554 | } 555 | ], 556 | "metadata": { 557 | "kernelspec": { 558 | 
"display_name": "Python 3", 559 | "language": "python", 560 | "name": "python3" 561 | }, 562 | "language_info": { 563 | "codemirror_mode": { 564 | "name": "ipython", 565 | "version": 3 566 | }, 567 | "file_extension": ".py", 568 | "mimetype": "text/x-python", 569 | "name": "python", 570 | "nbconvert_exporter": "python", 571 | "pygments_lexer": "ipython3", 572 | "version": "3.5.1" 573 | } 574 | }, 575 | "nbformat": 4, 576 | "nbformat_minor": 0 577 | } 578 | -------------------------------------------------------------------------------- /syllabus.md: -------------------------------------------------------------------------------- 1 | ## Week 1: Introduction 2 | 3 | ### Readings 4 | 5 | - Marti Hearst. 1999. [Untangling Text Data 6 | Mining](http://people.ischool.berkeley.edu/~hearst/papers/acl99/acl99-tdm.html). 7 | 8 | ### Slides 9 | 10 | - [Introduction](https://docs.google.com/presentation/d/1cBc9yX2wRSmQoOiG2viUHkZAsKI_dAB2VO3powLBt74/edit?usp=sharing) 11 | 12 | ### For Next Week 13 | 14 | #### Homework 15 | 16 | Post a little bit about yourself in the Introductions forum, following the instructions there. 17 | 18 | #### Lab Task 19 | 20 | This week's lab task is mostly to play! It is intended to get 21 | you comfortable with out-of-the-box text analysis tools. 22 | 23 | Use [Voyant](https://voyant-tools.org) 24 | to visualize a text or set of texts. It can be anything you want: a 25 | book, a set of lyrics, scripts from a show you like, news articles. 26 | Try out the various features in Voyant: phrases, keywords in 27 | contexts, etc. 28 | 29 | Once you've had a chance to play with Voyant, *post a short response 30 | to the __lab task forum__ 31 | (no more than 300 words) about your experience. Some possible things 32 | to post about: What was interesting or confusing about the tool? Did 33 | you find anything intriguing about your text or texts? Did it find 34 | any recurring patterns or phrases? Did you find any visualisations 35 | beyond the word cloud to be interesting? Any other thoughts? Don't 36 | forget to tell us what text you used with Voyant. 37 | 38 | ## Week 2: Fundamentals 39 | 40 | *Just a reminder that 'readings' refer to the readings you should have done by the lecture, while lab tasks are done by next week. The intention is that they are both related to the current week's theme: readings prepare you for the lecture, and the lecture lets you practice that learning.* 41 | 42 | ### Readings 43 | 44 | - Sections 4.1, 4.3, and 4.4 of [Search 45 | Engines: Information Retrieval in 46 | Practice](http://ciir.cs.umass.edu/irbook/) (Croft, 47 | Metzler and Strohman). Starts on page 72. 48 | - Parts of Chapter 2, Introduction to Information Retrieval 49 | (Manning, Raghavan, 50 | Schütze): [Intro](http://nlp.stanford.edu/IR-book/html/htmledition/the-term-vocabulary-and-postings-lists-1.html), [Tokenization](http://nlp.stanford.edu/IR-book/html/htmledition/determining-the-vocabulary-of-terms-1.html), [Stop 51 | lists](http://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html) 52 | 53 | ### Slides 54 | 55 | - [Week 2: 56 | Fundamentals](https://docs.google.com/presentation/d/18R7pWmc49PemCgAJ4020lgibNO1Hp1KdST08kd_a-d4/edit?usp=sharing) 57 | 58 | ### For Next Week 59 | 60 | This week's lab task is about getting started with powerful tools that 61 | will underlie many of the skills you learn in the course. The lab task is posted in a [Jupyter 62 | notebook](labs/Lab%202.ipynb) 63 | format on Github. 
64 | 65 | ## Week 3: _Treating Text as Data_ - Features 66 | 67 | ### Readings 68 | 69 | - [2.2.3](http://nlp.stanford.edu/IR-book/pdf/02voc.pdf) of Intro to IR: Normalization. If you missed 2.2.1 and 2.2.2 last week, catch up on those also. 70 | - [Term Weighting for 71 | Humanists](https://sense.porganized.com/term-weighting-for-humanists-bf2ed42628c8). 72 | Peter Organisciak. 73 | 74 | Supplemental: 75 | - [Term frequency and 76 | weighting](http://nlp.stanford.edu/IR-book/html/htmledition/term-frequency-and-weighting-1.html). 77 | Intro to IR. 78 | 79 | ### Slides 80 | 81 | - [Week 3: 82 | Features](https://docs.google.com/presentation/d/16jZxqi7zpZrOUA2z14aSpg8BPGJxSO9Qkb6rx4bZLLw/edit?usp=sharing) 83 | 84 | ### For Next Week 85 | 86 | #### Lab Task 87 | 88 | This week's lab task is again a series of questions, following along 89 | with a worksheet. Find it 90 | [here](labs/Lab%203.ipynb). 91 | 92 | ## Week 4: Text Mining for Art and Criticism 93 | 94 | ### Readings 95 | 96 | - [Liza Daly's Generative Blackout 97 | Poetry](http://waxy.org/2016/11/liza-dalys-generative-blackout-poetry/) - 98 | This work uses some simple language rules that will be useful in 99 | the future. 100 | 101 | The following three readings are web articles related to Twitter bots: for activism, for recontextualization, and a roundup of interesting bots. Not all of these are text related, but serve as a good overview. 102 | 103 | - [How Twitter Bots Turn Tweeters into 104 | Activists](https://www.technologyreview.com/s/544851/how-twitter-bots-turn-tweeters-into-activists/) 105 | - [Introducing censusAmericans, A Twitter Bot For 106 | America](https://fivethirtyeight.com/datalab/introducing-censusamericans-a-twitter-bot-for-america/) 107 | - [12 108 | Weird, Excellent Twitter Bots Chosen by Twitter’s Best 109 | Bot-Makers](http://nymag.com/selectall/2015/11/12-weirdest-funniest-smartest-twitter-bots.html) 110 | - Optional: [The Rise of Twitter 111 | Bots](http://www.newyorker.com/tech/elements/the-rise-of-twitter-bots) 112 | 113 | Slides 114 | 115 | - [3.5 - 116 | Features Cont.](https://docs.google.com/presentation/d/1dljGL0QmjY-QJ9O-wpXeVgqk8lO12Klrm6EfgaEQsDg/edit?usp=sharing) 117 | - [4.0 - Text Mining for Art and 118 | Criticism](https://docs.google.com/presentation/d/1FZmIQdS5cEuJEG7pudzHCI5iWxrWZb5yus4eT24tW_Y/edit?usp=sharing) 119 | 120 | Assignments 121 | ----------- 122 | 123 | The Twitter Bot assignment is posted on the 124 | [Assignments](assignments.md) page. 125 | There is a draft posting next week (post about your plans) and the 126 | final is due in two weeks. 127 | 128 | ### For Next Week 129 | 130 | - Submit Twitter bot draft 131 | - [Lab 4 132 | Worksheet](labs/Lab%204.ipynb). 133 | 134 | ## Week 5.1: Document Access 135 | 136 | ### Readings 137 | 138 | [Against 139 | Cleaning](http://curatingmenus.org/articles/against-cleaning/) - 140 | Katie Rawson, Trevor Muñoz 141 | 142 | ## Week 5.2: _Understanding Words_ - Natural Language Processing 1, Part of Speech Tagging 143 | 144 | ### Readings 145 | 146 | - [Natural Language Processing for 147 | programmers](https://worldwritable.com/natural-language-processing-for-programmers-90c4e04dc6de#.dhfapdhxv) part 148 | 2 - Liza Daly 149 | - This talks about an old concept, but is written from a 150 | beginner perspective and is useful for your assignment. 151 | - [Part of Speech 152 | Tagging ](https://web.stanford.edu/~jurafsky/slp3/10.pdf)- Chapter 153 | 10 (up to 10.4) of Speech and Language Processing (3rd ed. 
154 | draft) 155 | - [Chapter 5.7 of the NLTK 156 | Book](http://www.nltk.org/book/ch05.html) - Bird et al. 157 | - Just section 7, but sections 1-2, 4-6 are useful as 158 | supplements to the SLP reading if you need more info or 159 | simply find it interesting. Section 7 is the conclusion of 160 | the chapter, which succinctly describes the ways that we 161 | understand a part of speech. 162 | 163 | ### Slides 164 | 165 | [05 - Getting Data](https://docs.google.com/presentation/d/1N7qvqvTTxldbTiZ2tqx8OQBUq4dtD3PoUAEdwjD6FGc/edit?usp=sharing) 166 | 167 | ### For Next Week 168 | 169 | Twitter bot: Post to the Twitter Bot Final forum. 170 | 171 | No lab task. Complete your bot! 172 | 173 | ## Week 6: _Understanding Words_ - Natural Language Processing 2, Information Extraction and Dependency Parsing 174 | 175 | ### Readings 176 | 177 | - *Information 178 | Extraction*. Section 4.6 of [Search Engines: Information 179 | Retrieval in Practice](http://ciir.cs.umass.edu/irbook/) (Croft, 180 | Metzler and Strohman). Starts on page 113. 181 | - [Information 182 | Extraction](https://web.stanford.edu/~jurafsky/slp3/21.pdf) 183 | (up to and including section 21.2.3). Speech and Language 184 | Processing (3rd ed. draft). 185 | 186 | **Optional 187 | Reading** 188 | - [SyntaxNet Detailed 189 | Tutorial](https://github.com/tensorflow/models/tree/master/syntaxnet#detailed-tutorial-building-an-nlp-pipeline-with-syntaxnet) 190 | - 191 | 192 | Google's approach for dependency parsing, SyntaxNet, and 193 | their model trained on it - Parsey McParseFace - are the 194 | current state of the art. This tutorial, while optional, 195 | offers a look at Part of Speech tagging using feed-forward 196 | neural networks and has a nicely written description of 197 | transition-based dependency parsing. 198 | 199 | ### Slides 200 | 201 | [06 - Natural Language Processing 1 - Part of Speech 202 | Tagging](https://docs.google.com/presentation/d/17psGonrrwj0R2DT-Nu34D5kpTP-jBEmthbIQKSeZG2Q/edit?usp=sharing) 203 | 204 | ### For Next Week 205 | 206 | - [Worksheet for the Lab Task 207 | 05](labs/Lab%2005%20-%20Part%20of%20Speech%20Tagging%2C%20Starting%20with%20Pandas.ipynb). 208 | 209 | ## Week 7: Classification 1 210 | 211 | ### Readings 212 | 213 | [Naive Bayes Classification and 214 | Sentiment](https://web.stanford.edu/~jurafsky/slp3/6.pdf), Speech 215 | and Language Processing (3rd edition). Dan Jurafsky and James H. 216 | Martin. 217 | 218 | **Notation** 219 | 220 | We're getting to the point of the term where some mathematical notation 221 | is necessary for our readings to communicate the underlying theory. 222 | 223 | If you are unfamiliar with Bayesian inference, the description on 224 | the 3rd page of this chapter might not satisfy your curiosity. 225 | The [introduction to Bayes' Theorem from Khan 226 | Academy](https://www.khanacademy.org/partner-content/wi-phi/wiphi-critical-thinking/wiphi-fundamentals/v/bayes-theorem) 227 | can help equip you with some more background about what we use 228 | Bayes' Theorem for. 229 | 230 | Since we're looking at classes, you'll start seeing set theory, 231 | like c ∈ C. This means 'c' is an element of 'C', or in the context 232 | of our reading, this *class (c) is part of a set of all the possible 233 | classes (C)*. Why is that something we'd want to state? Because 234 | for Naive Bayes classification, we'll be choosing the class *c* with 235 | the highest probability given the evidence.
The equations simply 236 | need a way to state "consider P(c|d) for all possible classes and 237 | choose the class with the highest value", which they do 238 | with ![](images/c_argmax.png). 239 | 240 | ### Slides 241 | 242 | - [07 - 243 | Classification](https://docs.google.com/presentation/d/1u_VZgEK45u4zbbfxZKo_G-uztNvR0As_gfc5X2c2nU0/edit?usp=sharing) 244 | - Includes material from: SLP v.3 slides (Jurafsky and Martin ) 245 | 246 | ### For Next Week 247 | 248 | - [Lab Task 06 249 | Worksheet](labs/Lab%2006%20-%20More%20Pandas%20and%20Intro%20to%20Classification.ipynb) 250 | 251 | ### Week 8.1: Classification 2 252 | 253 | ### Week 8.2 Ethics in Text Mining 254 | 255 | ### Readings 256 | 257 | No required readings this week, focus on the lab task! 258 | 259 | **Optional Reading** 260 | 261 | - Brent Daniel Mittelstadt, Patrick Allo, Mariarosaria Taddeo, 262 | Sandra Wachter, Luciano Floridi. 2016. "[The ethics of 263 | algorithms: Mapping the 264 | debate](http://journals.sagepub.com/doi/abs/10.1177/2053951716679679)". *Big 265 | Data & Society. *Vol 3, Issue 2. 266 | - Recent BBC2 Story (audio): [Controlling the Unaccountable 267 | Algorithm](http://www.bbc.co.uk/programmes/b085wj18) 268 | 269 | As with our class on art and criticism, some of the most accessible work on ethics is from the bot-making community. 270 | 271 | - [Bots Should Punch Up](https://www.crummy.com/2013/11/27/0) 272 | - [Ethical Bot Making](http://mewo2.com/notes/bot-ethics/) 273 | - [How to Make a Bot that Isn't 274 | Racist](https://motherboard.vice.com/en_us/article/how-to-make-a-not-racist-bot) 275 | 276 | ### Slides 277 | 278 | - [Week 08 - Classification 279 | 2 and Ethics in Text Mining](https://docs.google.com/presentation/d/1TL4a0SGRcOHXmq4cKXs4dRY6ASDbr-V3vFHh0c_Nj-c/edit#slide=id.g1edffbd9d5_0_177) 280 | - Includes material from: SLP v.3 slides (Jurafsky and Martin) 281 | 282 | ### For Next Week 283 | 284 | - [Lab Task 7 285 | Worksheet](labs/Lab%2007%20-%20Classification.ipynb) 286 | 287 | ## Week 9: Clustering 288 | 289 | ### Readings 290 | 291 | - [Textual 292 | Analysis](http://www.digitalhumanities.org/companion/view?docId=blackwell/9781405103213/9781405103213.xml&chunk.id=ss1-4-4&toc.depth=1&toc.id=ss1-4-4&brand=default) - 293 | John Burrows, A Companion to Digital Humanities 294 | - [Clustering](http://scikit-learn.org/stable/modules/clustering.html) - 295 | Sci-Kit Learn Documentation: Read *Overview* and the intros to 296 | 2.3.2 (K-Means) and 2.3.6 (Hierarchical clustering) 297 | 298 | Supplemental Readings 299 | 300 | - [Cluster 301 | Analysis](http://www-users.cs.umn.edu/~kumar/dmbook/ch8.pdf) - 302 | Pang-Ning Tan, Michael Steinbach, Vipin Kumar. *Introduction to 303 | Data Mining* 304 | - [Beyond tokens: what character counts say about a 305 | page](https://sense.porganized.com/beyond-tokens-what-character-counts-say-about-a-page-278d0ccea34c#.nmrtloz6i). 306 | Peter Organisciak 307 | 308 | ### Slides 309 | 310 | [Week 9 - 311 | Clustering](https://docs.google.com/presentation/d/1UnHbclWT--wxOPwEB5U9uqQ8GPKfliEEpVhQYsDrtJA/edit?usp=sharing) 312 | 313 | ### For the next two weeks 314 | 315 | [Lab 08 316 | Worksheet](labs/Lab%2008%20-%20Clustering.ipynb) 317 | 318 | ## Spring Break Week 319 | 320 | Spring Break. No class. 
321 | 322 | ## Week 10: Topic Modeling and Dimensionality Reduction 1 323 | 324 | ### Readings 325 | 326 | [](https://tedunderwood.com/2012/04/07/topic-modeling-made-just-simple-enough/)[Topic 327 | modeling made just 328 | simple enough.](https://tedunderwood.com/2012/04/07/topic-modeling-made-just-simple-enough/) 2012. 329 | Ted Underwood. 330 | 331 | [Probabilistic Topic 332 | Models](http://dl.acm.org/citation.cfm?id=2133826). 2012. 333 | David Blei. 334 | 335 | **Supplemental** 336 | 337 | [Introduction to Latent Dirichlet 338 | Allocation](http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/). 2011. 339 | Edwin Chen. 340 | 341 | ### Slides 342 | 343 | [Topic Modeling 344 | Slides](https://docs.google.com/presentation/d/1X5NvF-CvTQk0jwhL74eUSu8u2QblastwkF6jTzzTkeM/edit?usp=sharing) 345 | 346 | ### For Next Week 347 | 348 | [Lab task 09 - Dimensionality Reduction and Sentiment 349 | Analysis](labs/Lab%209%20-%20Dimensionality%20Reduction%20and%20Sentiment%20Analysis.ipynb) 350 | 351 | *Recommended*: Get started on your topic modeling assignment. Make 352 | sure you can get MALLET running on your system. 353 | 354 | ### For Two Weeks from Now 355 | 356 | Topic Modeling Assignment Due. See description on the 357 | [Assignments](assignments.md) page. 358 | 359 | Post the Problem Statement for your Text Mining Project. See description on the [Assignments](assignments.md) page. 360 | 361 | 362 | ## Week 11.1 Topic Modelling 2 363 | 364 | ## Week 11.2 Sentiment Analysis 365 | 366 | ### Readings 367 | 368 | [Narrative framing of consumer sentiment in online restaurant 369 | reviews](http://journals.uic.edu/ojs/index.php/fm/article/view/4944). 370 | Dan Jurafsky, Victor Chahuneau, Bryan R. Routledge, Noah A. Smith. 371 | 372 | **Optional but Recommended** 373 | 374 | [Indexing by Latent Semantic 375 | Analysis](http://lsa.colorado.edu/papers/JASIS.lsi.90.pdf). 376 | Deerwester, Dumais, Furnas, Landauer, Harshman. 377 | 378 | *This is one of our core papers in Library and Information Science - 13k citations can't be wrong. You'll notice that these famous papers are particularly easy to read - Chengzheng Zhai's smoothing paper is 379 | another example - a good reminder that being clever is only useful if you can communicate it.* 380 | 381 | ### Slides 382 | 383 | [Topic Modelling II and Sentiment 384 | Analysis](https://docs.google.com/presentation/d/1aRo0-Ho9auR751MDKYIE4HIedmnYzqyELFignJN05Yk/edit?usp=sharing) 385 | 386 | ### For Next Week 387 | 388 | Topic Modeling Assignment Due. See description on 389 | the [Assignments](assignments.md) page. 390 | 391 | Post the Problem Statement for your Text Mining Project. See description on the [Assignments](assignments.md) page. 392 | 393 | ## Week 12: Visualization 394 | 395 | ### Readings 396 | 397 | It's a busy time, no readings this week! 398 | 399 | ### Slides 400 | 401 | [Week 13 - 402 | Visualization](https://docs.google.com/presentation/d/1R72aBkSYzqZlvtOVm9q-8-_Ogmc0QW9XQ7cZ2JKEm4Y/edit?usp=sharing) 403 | 404 | ### For Next Week 405 | 406 | - Literature Review and Data Collection for your final project. 407 | 408 | ## Week 13: Word Embeddings 409 | 410 | ### Readings 411 | 412 | - [Word Embeddings for the digital 413 | humanities](http://bookworm.benschmidt.org/posts/2015-10-25-Word-Embeddings.html). 2015. 414 | Benjamin Schmidt. 415 | 416 | - [Vector Representations of 417 | Words](https://www.tensorflow.org/tutorials/word2vec) (stop at 418 | 'Building the Graph'). Tensorflow Tutorials. 
419 | 420 | **Supplemental (Optional)** 421 | 422 | - [Distributed Representations of Words and Phrases and their Compositionality](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf). 423 | Mikolov et. al. 424 | 425 | **Bonus** 426 | 427 | Something to play with: [the "Bonus App" at the bottom of Radim Řehůřek's Word2Vec 428 | tutorial](https://rare-technologies.com/word2vec-tutorial/). 429 | 430 | ## Week 14: What's Next: Remainder Notes from Text Mining 431 | 432 | ### Slides 433 | 434 | [Week 15 - What's 435 | Next](https://docs.google.com/presentation/d/1GwGK3b4U_Z3xt_fFZiWB86jRWreGn00p9arfi2oPvYg/edit?usp=sharing) 436 | 437 | ### Reminders 438 | 439 | May 3rd is the last day to turn in late lab tasks! Get them in! 440 | -------------------------------------------------------------------------------- /labs/Lab 02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Week 2 Lab Task\n", 8 | "This week is about getting started with powerful tools that will underlie many of the skills you learn in the course. Much of the effort is in setting up your programming environment: the lab questions will ensure that it is done correctly and help you grow familiar with it.\n", 9 | "\n", 10 | "In this course we'll be using the Python programming language, using an innovative environment called Jupyter Notebooks.\n", 11 | "\n", 12 | "Your _environment_ is similar to your local workspace. Look at your desk: how do you organize your pens, paper, mouse, monitor? Or maybe you have a barebones workspace, working at a coffee shop or kitchen table with only a cup of coffee. In the same way, you can have many different environments for how you work with Python: working on a command line, or running scripts. Jupyter Notebooks is an environment that gives you an interactive, browser based version of Python. It allows you to play with code in a way that gives you immediate feedback, and allows you to break, tinker, and retry.\n", 13 | "\n", 14 | "Jupyter Notebooks will be installed through Anaconda.\n", 15 | "\n", 16 | "When programming, you're usually not writing everything from scratch. Some code is needed by many other people, so most languages have a concept of a _library_: code written and distributed by other people that you can easily use in your own work. \n", 17 | "\n", 18 | "Anaconda is a scientific distribution of Python, which installs Python on your system alongside a great deal of libraries that scientists use. To be clear: it is possible to install Python in other ways and individually install the libraries, but Anaconda puts it all into a tidy package. As scientists want complicated mathematical algorithms, installing some scientific libraries can be very difficult: Anaconda makes it easy!" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## 1. Installing Jupyter Notebooks through Anaconda\n", 26 | "\n", 27 | "Install Jupyter Notebooks following the instructions in the Art of Literary Text Analysis, following the [Getting Setup](https://github.com/sgsinclair/alta/blob/master/ipynb/GettingSetup.ipynb) and [Getting Started](https://github.com/sgsinclair/alta/blob/master/ipynb/GettingStarted.ipynb) (you can stop before the Printing Dynamic Content section). Make sure you install the Python 3 version. 
Because this is our first introduction to ALTA, it's worth reading the [short introductory text](https://github.com/sgsinclair/alta/blob/master/ipynb/ArtOfLiteraryTextAnalysis.ipynb). If you have trouble with installation, start a discussion in the Open Discussion forum.\n", 28 | "\n", 29 | "After you're done installation, start a new notebook and follow along with the tour at Help > User Interface Tour.\n", 30 | "\n", 31 | "_Questions_\n", 32 | "\n", 33 | "- 1) What are the two modes of a notebook?\n", 34 | "- 2) What do you press to leave edit mode while in a cell?\n", 35 | "- 3) What are the Keyboard Shortcuts for:\n", 36 | " - a) insert cell below\n", 37 | " - b) insert cell above\n", 38 | " - c) run selected cells" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## 2. A Little bit of code\n", 46 | "\n", 47 | "Create a new cell in your notebook with the '+' button in the toolbar (or one of the keyboard shortcuts from the previous question). We're going to try two simple Python commands: setting a variable, and splitting it by whitespace. In the process, we'll encounter two types of data that Python can hold: a string, and a list.\n", 48 | "\n", 49 | "Add the following code to the cell and 'run' it. If it runs properly, it should look like below, with the 'In' and 'Out' information." 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "'Hello world.'" 63 | ] 64 | }, 65 | "execution_count": 9, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "sentence = \"Hello world.\"\n", 72 | "sentence" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "Here, we set a string to a variable, then we called that variable.\n", 80 | "\n", 81 | "_Questions_\n", 82 | "- 4) What output is there if you run the cell without the second line (which simply says `sentence`)?" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "A string is a type of data in Python. By setting it to the variable `sentence`, everywhere you use `sentence` is the exact same as simply writing `\"Hello world.\"` Consider the following examples, or even try them out, which show that the way of joining two strings works the same with a variable or directly with a string:" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "'Hello world. Hello moon.'" 103 | ] 104 | }, 105 | "execution_count": 12, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "\"Hello world.\" + \" Hello moon.\"" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "'Hello world. 
Hello moon.'" 125 | ] 126 | }, 127 | "execution_count": 14, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "sentence + \" Hello moon.\"" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "'Hello world.Hello world.'" 147 | ] 148 | }, 149 | "execution_count": 15, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "sentence + sentence" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "We can even see the datatype of a variable with `type()`:" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": false, 170 | "scrolled": true 171 | }, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "str" 177 | ] 178 | }, 179 | "execution_count": 80, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "type(sentence)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "If you have a really long string that needs to go across lines, you can use `\\` before the line break to tell Python that _this line of code is not done yet_. Set this famously long sentence from _Paul Clifford_ to the variable `paragraph` in your notebook:" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "'It was a dark and stormy night; the rain fell in torrents — except at occasional intervals, when it was checked by a violent gust of wind which swept up the streets (for it is in London that our scene lies), rattling along the housetops, and fiercely agitating the scanty flame of the lamps that struggled against the darkness.'" 206 | ] 207 | }, 208 | "execution_count": 30, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "paragraph = \"It was a dark and stormy night; the rain fell in torrents — except at occasional intervals, when it was \" + \\\n", 215 | " \"checked by a violent gust of wind which swept up the streets (for it is in London that our scene lies), rattling \" + \\\n", 216 | " \"along the housetops, and fiercely agitating the scanty flame of the lamps that struggled against the darkness.\"\n", 217 | "paragraph" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "_Questions_ \n", 225 | "- 5) For the code block above, \n", 226 | " - a) Are the indents necessary for the code to run?\n", 227 | " - b) Are the pluses (+) necessary for the code to run?\n", 228 | " - c) Are the backslashes (\\\\) necessary for the code to run?\n", 229 | " \n", 230 | "_tinker with the code and re-run as necessary_" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "Another important datatype in Python is the `list`. This is a way to hold multiple things together: strings, numbers, etc.
For example:" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "['Never', 'gonna', 'give', 'you', 'up']" 251 | ] 252 | }, 253 | "execution_count": 57, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "list_of_strings = [\"Never\", \"gonna\", \"give\", \"you\", \"up\"]\n", 260 | "list_of_strings" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "[4, 8, 15, 16, 23, 42]" 274 | ] 275 | }, 276 | "execution_count": 38, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "list_of_numbers = [ 4, 8, 15, 16, 23, 42]\n", 283 | "list_of_numbers" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "Individual objects from a list can be retrieved using a square bracket referencing the place in the list (starting with 0):" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "'Hello'" 304 | ] 305 | }, 306 | "execution_count": 36, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "list_of_strings[0]" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/plain": [ 325 | "8" 326 | ] 327 | }, 328 | "execution_count": 40, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "list_of_numbers[1]" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "You can select a list range by specify two numbers in the square brackets with a colon in-between:" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "['gonna', 'give', 'you']" 355 | ] 356 | }, 357 | "execution_count": 63, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "list_of_strings[1:4]" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "Using the colon without a number means _from the very start_ or _until the very end_:" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "collapsed": false 378 | }, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "text/plain": [ 383 | "['Never', 'gonna', 'give', 'you']" 384 | ] 385 | }, 386 | "execution_count": 64, 387 | "metadata": {}, 388 | "output_type": "execute_result" 389 | } 390 | ], 391 | "source": [ 392 | "list_of_strings[:4]" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "collapsed": false 400 | }, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "['gonna', 'give', 'you', 'up']" 406 | ] 407 | }, 408 | "execution_count": 65, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | 
"list_of_strings[1:]" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "You can add to a list with `list.append()`:" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/plain": [ 434 | "['Hello', 'world', 'Word', 'Word']" 435 | ] 436 | }, 437 | "execution_count": 42, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "list_of_strings.append(\"Word\")\n", 444 | "list_of_strings" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "_Questions_\n", 452 | "\n", 453 | "- 6) Can a list have a mix of numbers and strings?\n", 454 | "- 7) We joined strings with '+'. What happens if you try to use '+' on two lists?" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "# 3. Splitting a string to a list" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "A string can be split into a list using a splitting character. In the (useless) example below, we tell Python that everywhere there is an 'o' should be considered a place to split the string into a list:" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": { 475 | "collapsed": false 476 | }, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/plain": [ 481 | "['Hell', ' w', 'rld.']" 482 | ] 483 | }, 484 | "execution_count": 49, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "sentence.split(\"o\")" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "This can be used for a simple word tokenization by space characters:" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": { 504 | "collapsed": false 505 | }, 506 | "outputs": [ 507 | { 508 | "data": { 509 | "text/plain": [ 510 | "['Hello', 'world.']" 511 | ] 512 | }, 513 | "execution_count": 54, 514 | "metadata": {}, 515 | "output_type": "execute_result" 516 | } 517 | ], 518 | "source": [ 519 | "words = sentence.split(\" \")\n", 520 | "words" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": { 526 | "collapsed": false, 527 | "scrolled": true 528 | }, 529 | "source": [ 530 | "_Questions:_\n", 531 | "\n", 532 | " - 8) How would you select a list with the first seven words in the `paragraph` variable? This will require two steps. Show your code and the output.\n", 533 | " - 9) The opposite of `split` is possible with `\"string_to_join_list_items_by\".join(your_list)`. Set the list from question 8 to a variable and join it into a single string. The output will be 'It was a dark and stormy night;': write your code.\n", 534 | " - 10) Split the following text into a list of *sentences*. Don't worry if one of your sentences is an empty string (''). Show the code and output.\n", 535 | " > The shows opens at Duckburg. After Donald Duck enlists in the navy, Uncle Scrooge has to take care of grand-nephews Huey, Dewey, and Louie. Uncle Scrooge brings the boys to the McDuck's mansion where they are presented to Duckworth, the butler. The nephews are forced to sleep in the attic." 
536 | ] 537 | } 538 | ], 539 | "metadata": { 540 | "kernelspec": { 541 | "display_name": "Python 3", 542 | "language": "python", 543 | "name": "python3" 544 | }, 545 | "language_info": { 546 | "codemirror_mode": { 547 | "name": "ipython", 548 | "version": 3 549 | }, 550 | "file_extension": ".py", 551 | "mimetype": "text/x-python", 552 | "name": "python", 553 | "nbconvert_exporter": "python", 554 | "pygments_lexer": "ipython3", 555 | "version": "3.5.1" 556 | } 557 | }, 558 | "nbformat": 4, 559 | "nbformat_minor": 0 560 | } 561 | -------------------------------------------------------------------------------- /examples/Topic Modelling Trump Tweets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "C:\\Users\\organis2\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\gensim\\utils.py:855: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n", 15 | " warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "import gensim\n", 21 | "import os\n", 22 | "import pandas as pd\n", 23 | "from gensim.corpora.dictionary import Dictionary " 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/html": [ 36 | "
\n", 37 | "\n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | "
TextDateFavoritesRetweetsTweet ID
0Nielson Media Research final numbers on ACCEPT...2016-07-30 23:32:40138504130759592590106849280
1Thank you to all of the television viewers tha...2016-07-30 19:00:07276596842759524001613918208
2Can you imagine if I had the small crowds that...2016-07-30 18:28:22199686488759516008272932864
3NATO commander agrees members should pay up vi...2016-07-30 18:24:40116244668759515080010719232
4Wow, NATO's top commander just announced that ...2016-07-30 18:18:58239227819759513644258525184
\n", 91 | "
" 92 | ], 93 | "text/plain": [ 94 | " Text Date \\\n", 95 | "0 Nielson Media Research final numbers on ACCEPT... 2016-07-30 23:32:40 \n", 96 | "1 Thank you to all of the television viewers tha... 2016-07-30 19:00:07 \n", 97 | "2 Can you imagine if I had the small crowds that... 2016-07-30 18:28:22 \n", 98 | "3 NATO commander agrees members should pay up vi... 2016-07-30 18:24:40 \n", 99 | "4 Wow, NATO's top commander just announced that ... 2016-07-30 18:18:58 \n", 100 | "\n", 101 | " Favorites Retweets Tweet ID \n", 102 | "0 13850 4130 759592590106849280 \n", 103 | "1 27659 6842 759524001613918208 \n", 104 | "2 19968 6488 759516008272932864 \n", 105 | "3 11624 4668 759515080010719232 \n", 106 | "4 23922 7819 759513644258525184 " 107 | ] 108 | }, 109 | "execution_count": 3, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "tweets = pd.read_csv(\"https://raw.githubusercontent.com/sashaperigo/Trump-Tweets/master/data.csv\").dropna()\n", 116 | "tweets.head()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "0 #a\n", 130 | "1 test\n", 131 | "dtype: object" 132 | ] 133 | }, 134 | "execution_count": 4, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "# Tokenize tweets, while stoplisting, case-folding, and filtering\n", 141 | "from nltk import word_tokenize\n", 142 | "from nltk.corpus import stopwords\n", 143 | "stoplist = stopwords.words('english')\n", 144 | "\n", 145 | "def clean_tweet(tweet):\n", 146 | " lower = tweet.lower()\n", 147 | " # Small hack to keep hashtags without modifying tokenizer:\n", 148 | " # replace # with text, then replace back later\n", 149 | " terms = word_tokenize(lower.replace(\"#\", \"HASH_\"))\n", 150 | " terms_stopped = [term for term in terms if term not in stoplist]\n", 151 | " terms_alpha = [term for term in terms_stopped if (term.isalpha() or \"HASH_\" in term)]\n", 152 | " if len(terms_alpha) == 0:\n", 153 | " return pd.Series()\n", 154 | " else:\n", 155 | " return pd.Series(terms_alpha).str.replace(\"HASH_\", \"#\")\n", 156 | " \n", 157 | "clean_tweet(\"This is #a test\")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/html": [ 170 | "
\n", 171 | "\n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | "
word##1#2#2016#2a#alsicebucketchallenge#america#americafirst#apprentice#autism...yrsyuanzerozimmermanzogbyzonezoneszuckerzuckermanzuker
Tweet ID
16983089350.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
17014611820.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
17374799870.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
17411607160.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
17735613380.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", 345 | "

5 rows × 5445 columns

\n", 346 | "
" 347 | ], 348 | "text/plain": [ 349 | "word # #1 #2 #2016 #2a #alsicebucketchallenge #america \\\n", 350 | "Tweet ID \n", 351 | "1698308935 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 352 | "1701461182 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 353 | "1737479987 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 354 | "1741160716 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 355 | "1773561338 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 356 | "\n", 357 | "word #americafirst #apprentice #autism ... yrs yuan zero \\\n", 358 | "Tweet ID ... \n", 359 | "1698308935 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", 360 | "1701461182 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", 361 | "1737479987 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", 362 | "1741160716 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", 363 | "1773561338 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n", 364 | "\n", 365 | "word zimmerman zogby zone zones zucker zuckerman zuker \n", 366 | "Tweet ID \n", 367 | "1698308935 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 368 | "1701461182 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 369 | "1737479987 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 370 | "1741160716 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 371 | "1773561338 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 372 | "\n", 373 | "[5 rows x 5445 columns]" 374 | ] 375 | }, 376 | "execution_count": 5, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "# Create a 'long' dataframe of term counts\n", 383 | "tweet_words = tweets['Text'].str.lower().apply(clean_tweet)\n", 384 | "tweet_words.index = tweets['Tweet ID']\n", 385 | "\n", 386 | "word_counts = (tweet_words.stack().to_frame()\n", 387 | " .reset_index()\n", 388 | " .rename(columns={0:'word', 'level_1':'count'})\n", 389 | " .groupby(['Tweet ID', 'word'], as_index=False).count()\n", 390 | " )\n", 391 | "\n", 392 | "# Filter to words that have been used 5 or more times\n", 393 | "words_filtered = word_counts.groupby('word').filter(lambda x: x['count'].sum() >= 5)\n", 394 | "\n", 395 | "# Make 'wide' dataframe, i.e. 
a document-term matrix\n", 396 | "trump_counts = words_filtered.pivot(index='Tweet ID', columns='word', values='count').fillna(0)\n", 397 | "trump_counts.head()" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "The size of our document-term matrix, `count(tweets) x count(unique_words)`:" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": { 411 | "collapsed": false 412 | }, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "901 \"@NathanDWilsonFL: @MariaBartiromo you had a g...\n", 418 | "2821 \"@AniesiODaniels: #DemDebate Q: Who are you vo...\n", 419 | "3646 \"@TradingStreetCo:Donald Trump Is Ratings ‘Gol...\n", 420 | "4359 \"@moshe_mkmdca: @realDonaldTrump @007lLisav @C...\n", 421 | "4981 \"@jimlibertarian: @SlwStdySque Donald has alr...\n", 422 | "Name: Text, dtype: object" 423 | ] 424 | }, 425 | "execution_count": 29, 426 | "metadata": {}, 427 | "output_type": "execute_result" 428 | } 429 | ], 430 | "source": [ 431 | "q = trump_counts.loc[:,[\"donald\"]].query('donald > 1').index.values\n", 432 | "tweets[tweets[\"Tweet ID\"].isin(q)]['Text'].head()" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": { 439 | "collapsed": true 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "# Number all the columns and create a gensim dictionary\n", 444 | "dictionary = Dictionary()\n", 445 | "dictionary.token2id = dict(zip(trump_counts.columns, range(0, trump_counts.shape[1])))" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "collapsed": false 453 | }, 454 | "outputs": [], 455 | "source": [ 456 | "# If I haven't already trained and saved a model, train it now\n", 457 | "if not os.path.exists('trump-tweets.pickle'):\n", 458 | " # Train a model\n", 459 | " # Gensim has a way to read numpy arrays, but they use columns for documents - so rotate ('transpose') the DataFrame\n", 460 | " corpus = gensim.matutils.Dense2Corpus(trump_counts.values.T)\n", 461 | " lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,\n", 462 | " num_topics=20, update_every=1, chunksize=1000, passes=6, alpha='auto')\n", 463 | " lda.save('trump-tweets.pickle')\n", 464 | "else:\n", 465 | " # Load a model\n", 466 | " lda = gensim.models.ldamodel.LdaModel.load('trump-tweets.pickle')" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": { 473 | "collapsed": false, 474 | "scrolled": false 475 | }, 476 | "outputs": [ 477 | { 478 | "name": "stdout", 479 | "output_type": "stream", 480 | "text": [ 481 | "0\t0.060*\"nice\" + 0.054*\"got\" + 0.052*\"wow\" + 0.050*\"say\" + 0.038*\"nothing\" + 0.032*\"wonderful\"\n", 482 | "1\t0.070*\"cnn\" + 0.068*\"poll\" + 0.046*\"think\" + 0.041*\"true\" + 0.037*\"day\" + 0.036*\"man\"\n", 483 | "2\t0.279*\"thank\" + 0.090*\"vote\" + 0.087*\"big\" + 0.036*\"crowd\" + 0.035*\"needs\" + 0.018*\"apprentice\"\n", 484 | "3\t0.092*\"clinton\" + 0.043*\"megynkelly\" + 0.042*\"ever\" + 0.029*\"presidential\" + 0.024*\"women\" + 0.021*\"truth\"\n", 485 | "4\t0.098*\"people\" + 0.072*\"get\" + 0.048*\"cruz\" + 0.047*\"many\" + 0.040*\"bad\" + 0.036*\"really\"\n", 486 | "5\t0.181*\"http\" + 0.139*\"trump\" + 0.103*\"donald\" + 0.045*\"via\" + 0.022*\"morning\" + 0.020*\"hampshire\"\n", 487 | "6\t0.090*\"make\" + 0.087*\"foxnews\" + 0.063*\"win\" + 0.040*\"gop\" + 0.039*\"interview\" + 0.038*\"foxandfriends\"\n", 488 
| "7\t0.043*\"hope\" + 0.038*\"watching\" + 0.032*\"person\" + 0.031*\"far\" + 0.028*\"year\" + 0.027*\"party\"\n", 489 | "8\t0.070*\"see\" + 0.050*\"know\" + 0.047*\"tomorrow\" + 0.045*\"speech\" + 0.037*\"let\" + 0.037*\"years\"\n", 490 | "9\t0.119*\"#makeamericagreatagain\" + 0.043*\"support\" + 0.040*\"campaign\" + 0.040*\"jobs\" + 0.035*\"american\" + 0.034*\"join\"\n", 491 | "10\t0.201*\"great\" + 0.044*\"thanks\" + 0.038*\"tonight\" + 0.031*\"today\" + 0.030*\"show\" + 0.029*\"last\"\n", 492 | "11\t0.099*\"president\" + 0.071*\"would\" + 0.049*\"good\" + 0.043*\"obama\" + 0.042*\"never\" + 0.042*\"need\"\n", 493 | "12\t0.043*\"work\" + 0.037*\"national\" + 0.032*\"oreillyfactor\" + 0.029*\"golf\" + 0.027*\"hard\" + 0.027*\"place\"\n", 494 | "13\t0.220*\"realdonaldtrump\" + 0.076*\"trump\" + 0.052*\"hillary\" + 0.051*\"america\" + 0.047*\"#trump2016\" + 0.032*\"like\"\n", 495 | "14\t0.180*\"https\" + 0.123*\"new\" + 0.079*\"crooked\" + 0.022*\"york\" + 0.020*\"rally\" + 0.019*\"politicians\"\n", 496 | "15\t0.063*\"much\" + 0.039*\"republican\" + 0.038*\"better\" + 0.037*\"money\" + 0.037*\"bernie\" + 0.033*\"deal\"\n", 497 | "16\t0.038*\"happy\" + 0.038*\"jebbush\" + 0.036*\"change\" + 0.032*\"florida\" + 0.026*\"endorsement\" + 0.025*\"ready\"\n", 498 | "17\t0.046*\"keep\" + 0.037*\"donaldtrump\" + 0.035*\"soon\" + 0.033*\"wants\" + 0.026*\"agree\" + 0.026*\"sanders\"\n", 499 | "18\t0.094*\"love\" + 0.058*\"ted\" + 0.035*\"isis\" + 0.026*\"immigration\" + 0.021*\"terrible\" + 0.020*\"wo\"\n", 500 | "19\t0.083*\"country\" + 0.042*\"even\" + 0.039*\"right\" + 0.038*\"debate\" + 0.037*\"media\" + 0.035*\"must\"\n" 501 | ] 502 | } 503 | ], 504 | "source": [ 505 | "print(\"\\n\".join([\"%d\\t%s\" % info for info in lda.show_topics(num_topics=20, num_words=6)]))" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": { 512 | "collapsed": false 513 | }, 514 | "outputs": [ 515 | { 516 | "data": { 517 | "text/plain": [ 518 | "(20, 5445)" 519 | ] 520 | }, 521 | "execution_count": 56, 522 | "metadata": {}, 523 | "output_type": "execute_result" 524 | } 525 | ], 526 | "source": [ 527 | "lda.state.get_lambda().shape" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "metadata": { 534 | "collapsed": false 535 | }, 536 | "outputs": [ 537 | { 538 | "ename": "ValueError", 539 | "evalue": "too many values to unpack (expected 2)", 540 | "output_type": "error", 541 | "traceback": [ 542 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 543 | "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", 544 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0ma\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlda\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_document_topics\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdense\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtolist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 545 | "\u001b[1;32mC:\\Users\\organis2\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\gensim\\models\\ldamodel.py\u001b[0m in \u001b[0;36mget_document_topics\u001b[1;34m(self, bow, minimum_probability, minimum_phi_value, per_word_topics)\u001b[0m\n\u001b[0;32m 913\u001b[0m \u001b[1;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 914\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 915\u001b[1;33m \u001b[0mgamma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mphis\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minference\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mbow\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcollect_sstats\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mper_word_topics\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 916\u001b[0m \u001b[0mtopic_dist\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgamma\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgamma\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# normalize distribution\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 917\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 546 | "\u001b[1;32mC:\\Users\\organis2\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\gensim\\models\\ldamodel.py\u001b[0m in \u001b[0;36minference\u001b[1;34m(self, chunk, collect_sstats)\u001b[0m\n\u001b[0;32m 428\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdoc\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msix\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minteger_types\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 429\u001b[0m \u001b[1;31m# make sure the term IDs are ints, otherwise np will get upset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 430\u001b[1;33m \u001b[0mids\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mid\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 431\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 432\u001b[0m \u001b[0mids\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mid\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 547 | "\u001b[1;32mC:\\Users\\organis2\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\gensim\\models\\ldamodel.py\u001b[0m in \u001b[0;36m\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 428\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdoc\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0msix\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minteger_types\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 429\u001b[0m \u001b[1;31m# make sure the term IDs are ints, otherwise np will get upset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 430\u001b[1;33m \u001b[0mids\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mid\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 431\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 432\u001b[0m \u001b[0mids\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mid\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 548 | "\u001b[1;31mValueError\u001b[0m: too many values to unpack (expected 2)" 549 | ] 550 | } 551 | ], 552 | "source": [ 553 | "a = lda.get_document_topics(corpus.dense.tolist())" 554 | ] 555 | } 556 | ], 557 | "metadata": { 558 | "kernelspec": { 559 | "display_name": "Python 3", 560 | "language": "python", 561 | "name": "python3" 562 | }, 563 | "language_info": { 564 | "codemirror_mode": { 565 | "name": "ipython", 566 | "version": 3 567 | }, 568 | "file_extension": ".py", 569 | "mimetype": "text/x-python", 570 | "name": "python", 571 | "nbconvert_exporter": "python", 572 | "pygments_lexer": "ipython3", 573 | "version": "3.5.1" 574 | } 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 0 578 | } 579 | -------------------------------------------------------------------------------- /examples/Pivot Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "collapsed": false, 20 | "scrolled": true 21 | }, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/html": [ 26 | "
\n", 27 | "\n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | "
classcountdocumentword
1class 117doc 1word 1
2class 13doc 1word 2
3class 110doc 1word 3
4class 13doc 1word 4
5class 11doc 1word 5
6class 110doc 1word 6
7class 14doc 1word 7
8class 15doc 1word 8
9class 17doc 1word 9
10class 212doc 2word 0
11class 29doc 2word 1
12class 26doc 2word 2
13class 22doc 2word 3
14class 27doc 2word 4
15class 21doc 2word 5
16class 24doc 2word 6
17class 25doc 2word 7
18class 217doc 2word 8
\n", 166 | "
" 167 | ], 168 | "text/plain": [ 169 | " class count document word\n", 170 | "1 class 1 17 doc 1 word 1\n", 171 | "2 class 1 3 doc 1 word 2\n", 172 | "3 class 1 10 doc 1 word 3\n", 173 | "4 class 1 3 doc 1 word 4\n", 174 | "5 class 1 1 doc 1 word 5\n", 175 | "6 class 1 10 doc 1 word 6\n", 176 | "7 class 1 4 doc 1 word 7\n", 177 | "8 class 1 5 doc 1 word 8\n", 178 | "9 class 1 7 doc 1 word 9\n", 179 | "10 class 2 12 doc 2 word 0\n", 180 | "11 class 2 9 doc 2 word 1\n", 181 | "12 class 2 6 doc 2 word 2\n", 182 | "13 class 2 2 doc 2 word 3\n", 183 | "14 class 2 7 doc 2 word 4\n", 184 | "15 class 2 1 doc 2 word 5\n", 185 | "16 class 2 4 doc 2 word 6\n", 186 | "17 class 2 5 doc 2 word 7\n", 187 | "18 class 2 17 doc 2 word 8" 188 | ] 189 | }, 190 | "execution_count": 24, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "#Creating fake data for the example\n", 197 | "words = [\"word \" + str(number) for number in np.arange(0,10)] * 2\n", 198 | "documents = [\"doc 1\"] * 10 + [\"doc 2\"] * 10\n", 199 | "classes = [\"class 1\"] * 10 + [\"class 2\"] * 10\n", 200 | "counts = np.random.randint(1, 20, 20)\n", 201 | "# Create dataframe, and deliberately at missing data by select 1:-1 (this drops the first and last row)\n", 202 | "df = pd.DataFrame({'document':documents, 'word':words, 'class':classes, 'count': counts}).iloc[1:-1]\n", 203 | "df" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "The example DataFrame, above, is a \"long\" dataframe with each row representing the count for a word for a document. It is expect that there each document/word has one row; if it doesn't, do a `groupby` with a `sum` for the column.\n", 211 | "\n", 212 | "To make it `wide`, here is one example:" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/html": [ 225 | "
\n", 226 | "\n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | "
wordword 0word 1word 2word 3word 4word 5word 6word 7word 8word 9
document
doc 1NaN17.03.010.03.01.010.04.05.07.0
doc 212.09.06.02.07.01.04.05.017.0NaN
\n", 284 | "
" 285 | ], 286 | "text/plain": [ 287 | "word word 0 word 1 word 2 word 3 word 4 word 5 word 6 word 7 \\\n", 288 | "document \n", 289 | "doc 1 NaN 17.0 3.0 10.0 3.0 1.0 10.0 4.0 \n", 290 | "doc 2 12.0 9.0 6.0 2.0 7.0 1.0 4.0 5.0 \n", 291 | "\n", 292 | "word word 8 word 9 \n", 293 | "document \n", 294 | "doc 1 5.0 7.0 \n", 295 | "doc 2 17.0 NaN " 296 | ] 297 | }, 298 | "execution_count": 25, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "wide_df = df.pivot(index='document', columns='word', values='count')\n", 305 | "wide_df" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Note that doc1 didn't have word 0, and doc2 didn't have word 9, so they have NaN (Not a Number) values. We can fill these in with fillna(0). Redoing the previous step in a better way:" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/html": [ 325 | "
\n", 326 | "\n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | "
wordword 0word 1word 2word 3word 4word 5word 6word 7word 8word 9
document
doc 10.017.03.010.03.01.010.04.05.07.0
doc 212.09.06.02.07.01.04.05.017.00.0
\n", 384 | "
" 385 | ], 386 | "text/plain": [ 387 | "word word 0 word 1 word 2 word 3 word 4 word 5 word 6 word 7 \\\n", 388 | "document \n", 389 | "doc 1 0.0 17.0 3.0 10.0 3.0 1.0 10.0 4.0 \n", 390 | "doc 2 12.0 9.0 6.0 2.0 7.0 1.0 4.0 5.0 \n", 391 | "\n", 392 | "word word 8 word 9 \n", 393 | "document \n", 394 | "doc 1 5.0 7.0 \n", 395 | "doc 2 17.0 0.0 " 396 | ] 397 | }, 398 | "execution_count": 26, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "wide_df = df.pivot(index='document', columns='word', values='count').fillna(0)\n", 405 | "wide_df" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": { 412 | "collapsed": false 413 | }, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/html": [ 418 | "
\n", 419 | "\n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | "
documentwordcount
1doc 1word 117
2doc 1word 23
3doc 1word 310
4doc 1word 43
5doc 1word 51
6doc 1word 610
7doc 1word 74
8doc 1word 85
9doc 1word 97
10doc 2word 012
11doc 2word 19
12doc 2word 26
13doc 2word 32
14doc 2word 47
15doc 2word 51
16doc 2word 64
17doc 2word 75
18doc 2word 817
\n", 539 | "
" 540 | ], 541 | "text/plain": [ 542 | " document word count\n", 543 | "1 doc 1 word 1 17\n", 544 | "2 doc 1 word 2 3\n", 545 | "3 doc 1 word 3 10\n", 546 | "4 doc 1 word 4 3\n", 547 | "5 doc 1 word 5 1\n", 548 | "6 doc 1 word 6 10\n", 549 | "7 doc 1 word 7 4\n", 550 | "8 doc 1 word 8 5\n", 551 | "9 doc 1 word 9 7\n", 552 | "10 doc 2 word 0 12\n", 553 | "11 doc 2 word 1 9\n", 554 | "12 doc 2 word 2 6\n", 555 | "13 doc 2 word 3 2\n", 556 | "14 doc 2 word 4 7\n", 557 | "15 doc 2 word 5 1\n", 558 | "16 doc 2 word 6 4\n", 559 | "17 doc 2 word 7 5\n", 560 | "18 doc 2 word 8 17" 561 | ] 562 | }, 563 | "execution_count": 28, 564 | "metadata": {}, 565 | "output_type": "execute_result" 566 | } 567 | ], 568 | "source": [ 569 | "df[['document', 'word', 'count']]" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": { 576 | "collapsed": false 577 | }, 578 | "outputs": [ 579 | { 580 | "data": { 581 | "text/html": [ 582 | "
\n", 583 | "\n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | "
count
documentword
doc 1word 117
word 23
word 310
word 43
word 51
word 610
word 74
word 85
word 97
doc 2word 012
word 19
word 26
word 32
word 47
word 51
word 64
word 75
word 817
\n", 673 | "
" 674 | ], 675 | "text/plain": [ 676 | " count\n", 677 | "document word \n", 678 | "doc 1 word 1 17\n", 679 | " word 2 3\n", 680 | " word 3 10\n", 681 | " word 4 3\n", 682 | " word 5 1\n", 683 | " word 6 10\n", 684 | " word 7 4\n", 685 | " word 8 5\n", 686 | " word 9 7\n", 687 | "doc 2 word 0 12\n", 688 | " word 1 9\n", 689 | " word 2 6\n", 690 | " word 3 2\n", 691 | " word 4 7\n", 692 | " word 5 1\n", 693 | " word 6 4\n", 694 | " word 7 5\n", 695 | " word 8 17" 696 | ] 697 | }, 698 | "execution_count": 27, 699 | "metadata": {}, 700 | "output_type": "execute_result" 701 | } 702 | ], 703 | "source": [ 704 | "summed_counts = df.groupby(['document', 'word'])[['count']].sum()\n", 705 | "summed_counts" 706 | ] 707 | }, 708 | { 709 | "cell_type": "markdown", 710 | "metadata": {}, 711 | "source": [ 712 | "Note also that we only kept the document information as the index. The class labels are still in the long DataFrame.\n", 713 | "\n", 714 | "Here, I \n", 715 | " 1. select just those two columns\n", 716 | " 2. only look at the unique combinations\n", 717 | " 3. set the index to document so it mimics `wide_df`. This is optional, but helps consistency." 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": { 724 | "collapsed": false 725 | }, 726 | "outputs": [ 727 | { 728 | "data": { 729 | "text/html": [ 730 | "
\n", 731 | "\n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | "
class
document
doc 1class 1
doc 2class 2
\n", 753 | "
" 754 | ], 755 | "text/plain": [ 756 | " class\n", 757 | "document \n", 758 | "doc 1 class 1\n", 759 | "doc 2 class 2" 760 | ] 761 | }, 762 | "execution_count": 7, 763 | "metadata": {}, 764 | "output_type": "execute_result" 765 | } 766 | ], 767 | "source": [ 768 | "labels = (df[['document', 'class']]\n", 769 | " .drop_duplicates()\n", 770 | " .set_index('document')\n", 771 | " )\n", 772 | "labels" 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "*Important*: when sending things to SciKit Learn, make sure the rows on the training data and labels are in the same order! Here, they are correct (e.g. doc1 is the first row both times, doc2 is the second row both times).\n", 780 | "\n", 781 | "If they were incorrect, you can take the index from the data (`wide_df.index`) and select the rows in labels to match that order, like this:" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "metadata": { 788 | "collapsed": false 789 | }, 790 | "outputs": [ 791 | { 792 | "data": { 793 | "text/html": [ 794 | "
\n", 795 | "\n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | "
class
document
doc 1class 1
doc 2class 2
\n", 817 | "
" 818 | ], 819 | "text/plain": [ 820 | " class\n", 821 | "document \n", 822 | "doc 1 class 1\n", 823 | "doc 2 class 2" 824 | ] 825 | }, 826 | "execution_count": 8, 827 | "metadata": {}, 828 | "output_type": "execute_result" 829 | } 830 | ], 831 | "source": [ 832 | "labels.loc[wide_df.index]" 833 | ] 834 | } 835 | ], 836 | "metadata": { 837 | "kernelspec": { 838 | "display_name": "Python 3", 839 | "language": "python", 840 | "name": "python3" 841 | }, 842 | "language_info": { 843 | "codemirror_mode": { 844 | "name": "ipython", 845 | "version": 3 846 | }, 847 | "file_extension": ".py", 848 | "mimetype": "text/x-python", 849 | "name": "python", 850 | "nbconvert_exporter": "python", 851 | "pygments_lexer": "ipython3", 852 | "version": "3.5.1" 853 | } 854 | }, 855 | "nbformat": 4, 856 | "nbformat_minor": 0 857 | } 858 | -------------------------------------------------------------------------------- /labs/Lab 06 - More Pandas and Intro to Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lab 06\n", 8 | "\n", 9 | "This week, we'll continue with the tutorial on using the HTRC Extracted Features Dataset, through Python. Last week was the preparation, this week is the fun stuff!\n", 10 | "\n", 11 | "## Pandas and the Extracted Features Dataset, continued" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Method Chaining\n", 30 | "\n", 31 | "In Pandas, you may find yourself combining a number of Dataframe methods in a row. When the output of each step is a DataFrame, you don't have to save each step to a variable: you can 'chain' the commands. So, if you want to transfer a DataFrame called `original`:\n", 32 | "\n", 33 | "```python\n", 34 | "df1 = original.do_something()\n", 35 | "df2 = df1.do_something_else()\n", 36 | "df3 = df2.do_more()\n", 37 | "```\n", 38 | ", you can get the same result as follows:\n", 39 | "\n", 40 | "```python\n", 41 | "df3 = original.do_something().do_something_else().do_more()\n", 42 | "```\n", 43 | "\n", 44 | "You may see the benefit and the downside of method chaining above.\n", 45 | "\n", 46 | "The benefit: you're not saving intermediate DataFrames to variables. `df1` and `df2` were only necessary to get you to `df3`, so why even save them?\n", 47 | "\n", 48 | "The downside is less readability: yuck! This is fine for short chains, but for longer ones you still want the line breaks. That way, when you return to your code in the future, you can make sense of it (and so I can read it when marking!).\n", 49 | "\n", 50 | "To format chained methods better, you can wrap everything in braces, which tells Python that the current line of code isn't done until the braces end:\n", 51 | "\n", 52 | "```python\n", 53 | "(df3 = original.do_something()\n", 54 | " .do_something_else()\n", 55 | " .do_more()\n", 56 | ")\n", 57 | "```\n", 58 | "\n", 59 | "Much prettier. This style will be useful once things get more complex. Remember that you're not forced to use chaining: saving intermediate variables is fine, and can be helpful if you find a bug somewhere in the chain. However, you'll see it occasionally in example code, so it is good to understand what is happening." 
60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### Slicing\n", 67 | "\n", 68 | "Following from last week's reading on [Text Mining in Python through the HTRC Feature Reader](http://programminghistorian.org/lessons/text-mining-with-extracted-features), we'll be continuing from the 'Slicing DataFrames' section to the end.\n", 69 | "\n", 70 | "First, lets reload the volume from last lab task." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "" 84 | ] 85 | }, 86 | "execution_count": 2, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "from htrc_features import FeatureReader\n", 93 | "fr = FeatureReader('../data/mdp.49015002392919.json.bz2')\n", 94 | "vol = fr.first()\n", 95 | "vol" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "**Q1**: Fill in the blanks to produce the output show in the image below:\n", 103 | "\n", 104 | "```\n", 105 | "(vol.tokenlist(pages=**BLANK1**, pos=**BLANK2**, case=False)\n", 106 | " .loc[(\"body\", slice(None), \"**BLANK3**\"),]\n", 107 | " .sort_values(\"count\", ascending=**BLANK4**)\n", 108 | " .head(**BLANK5**)\n", 109 | ")\n", 110 | "```\n", 111 | "\n", 112 | "![](../images/lab6-output.png)\n", 113 | "\n", 114 | "_Multiple Choice_\n", 115 | "1. True, False\n", 116 | "2. True, False\n", 117 | "3. slice(None), \"body\", \"RB\", \"NNP\"\n", 118 | "4. True, False\n", 119 | "5. 3, 5, 7" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "**Q2**: What is the code to get the token frequencies for page 39 of the book? You'll start with `tl = vol.tokenlist()`, what's next?" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "**Q3**: How would you get the five most frequent words tagged as a proper noun or a plural proper noun? Since the question doesn't involve page-level counts, you'll want to start with `tl = vol.tokenlist(pages=False)`." 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "### Grouping" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "**Q4**: What does the following code do?\n", 148 | "\n", 149 | "```python\n", 150 | "tl = vol.tokenlist()\n", 151 | "tl.groupby(level='page').count().sort_values('count', ascending=False)\n", 152 | "```\n", 153 | "\n", 154 | "How does it differ from the following?\n", 155 | "\n", 156 | "```python\n", 157 | "tl = vol.tokenlist()\n", 158 | "tl.groupby(level='page').sum().sort_values('count', ascending=False)\n", 159 | "```" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "**Q5** (2pts): Set a new variable to `vol.tokenlist().reset_index()`.\n", 167 | "\n", 168 | "**a)** What did `reset_index` do?\n", 169 | "**b)** How would you get run the summing code from above (i.e. the second example in Q4)?" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "**Q6**: Using the DataFrame from Q5, how would you select the rows with counts for the word `Tom`? Remember from the reading that 'slicing' is something done only on indexes - you learned to select based on a column value last week." 
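For Q4 through Q6, it can help to rehearse the moves on a tiny invented DataFrame before touching the real volume. The toy data below is not from the book; it only mirrors the page/token/count shape of a tokenlist with its index reset:

```python
import pandas as pd

# Toy data in the same shape as a reset-index tokenlist.
toy = pd.DataFrame({'page':  [1, 1, 2],
                    'token': ['hello', 'world', 'hello'],
                    'count': [2, 1, 5]})

# count(): how many token rows each page has (2 rows for page 1, 1 for page 2)
print(toy.groupby('page')[['count']].count())

# sum(): the total number of tokens on each page (3 for page 1, 5 for page 2)
print(toy.groupby('page')[['count']].sum())

# Selecting rows by a column value, the Lab 5 style of selection Q6 points back to.
print(toy[toy['token'] == 'hello'])
```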
177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "**Q7**: Using the result from Q6, figure out how to plot the counts of 'Tom' by page. The plot method for DataFrames takes `x` and `y` arguments. Share the code to produce this:\n", 184 | "\n", 185 | "![](../images/lab6-sawyer-plot.png)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "### Pandas Series\n", 193 | "\n", 194 | "Where a Pandas DataFrame object is like a spreadsheet, with rows and columns, a Pandas Series object is like just one column: it is a sequence of just one value at a time. You can think of it as a supercharged list.\n", 195 | "\n", 196 | "To pull out a single column of a DataFrame as a Series, use square brackets to reference the column by name. Using the DataFrame from Q7, where the index has been reset to columns, here's an example:" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "31992 of\n", 210 | "35613 Least\n", 211 | "16341 them\n", 212 | "4477 bear\n", 213 | "15935 finally\n", 214 | "Name: token, dtype: object" 215 | ] 216 | }, 217 | "execution_count": 186, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "token_series = tl['token']\n", 224 | "\n", 225 | "# Show five random items from the series\n", 226 | "token_series.sample(5)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "If you want to add a series to a DataFrame as a column, you can do the same in reverse:" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/html": [ 246 | "
\n", 247 | "\n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | "
pagesectiontokenposcountnew_column
1149889bodythenRB2then
32915221bodyNoUH2No
22816158bodysatVBD1sat
45905297bodytowardIN1toward
1193493body73CD173
\n", 307 | "
" 308 | ], 309 | "text/plain": [ 310 | " page section token pos count new_column\n", 311 | "11498 89 body then RB 2 then\n", 312 | "32915 221 body No UH 2 No\n", 313 | "22816 158 body sat VBD 1 sat\n", 314 | "45905 297 body toward IN 1 toward\n", 315 | "11934 93 body 73 CD 1 73" 316 | ] 317 | }, 318 | "execution_count": 187, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "tl['new_column'] = token_series\n", 325 | "tl.sample(5)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "Tada!" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "A series has a couple of useful features. For example, you can apply a function against each item with `apply`. If we wanted to get the length of every string (like we manually would do with `len('string')`, it's possible in this way:" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": false, 347 | "scrolled": true 348 | }, 349 | "outputs": [ 350 | { 351 | "data": { 352 | "text/plain": [ 353 | "0 1\n", 354 | "1 1\n", 355 | "2 1\n", 356 | "3 4\n", 357 | "4 6\n", 358 | "Name: token, dtype: int64" 359 | ] 360 | }, 361 | "execution_count": 188, 362 | "metadata": {}, 363 | "output_type": "execute_result" 364 | } 365 | ], 366 | "source": [ 367 | "token_series.apply(len).head()" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "Is it clear what happened there? `apply` took the function we gave it, `len`, and for each value in the Series applied `len(value)`.\n", 375 | "\n", 376 | "If this was a list instead of a Series, the equivalent would be `[len(string) for string in list_of_strings]`.\n", 377 | "\n", 378 | "Just to be more clear, I'll add it as a column:" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "collapsed": false 386 | }, 387 | "outputs": [ 388 | { 389 | "data": { 390 | "text/html": [ 391 | "
\n", 392 | "\n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | "
pagesectiontokenposcountnew_columntoken_length
42327276bodypockets—yetNN1pockets—yet11
19755140bodyatIN1at2
455446bodyotherJJ1other5
13279100bodytenCD1ten3
1266797bodycarefullyRB1carefully9
\n", 458 | "
" 459 | ], 460 | "text/plain": [ 461 | " page section token pos count new_column token_length\n", 462 | "42327 276 body pockets—yet NN 1 pockets—yet 11\n", 463 | "19755 140 body at IN 1 at 2\n", 464 | "4554 46 body other JJ 1 other 5\n", 465 | "13279 100 body ten CD 1 ten 3\n", 466 | "12667 97 body carefully RB 1 carefully 9" 467 | ] 468 | }, 469 | "execution_count": 189, 470 | "metadata": {}, 471 | "output_type": "execute_result" 472 | } 473 | ], 474 | "source": [ 475 | "tl['token_length'] = token_series.apply(len)\n", 476 | "tl.sample(5)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "Looks right!\n", 484 | "\n", 485 | "Another useful method of a Series is `value_counts`, which simply counts how often each value occurs:" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": { 492 | "collapsed": false 493 | }, 494 | "outputs": [ 495 | { 496 | "data": { 497 | "text/plain": [ 498 | "that 502\n", 499 | "\" 485\n", 500 | "'s 364\n", 501 | ". 297\n", 502 | "the 296\n", 503 | "Name: token, dtype: int64" 504 | ] 505 | }, 506 | "execution_count": 190, 507 | "metadata": {}, 508 | "output_type": "execute_result" 509 | } 510 | ], 511 | "source": [ 512 | "token_series.value_counts().head()" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "Finally, for a Series that specifically has strings, there are string methods. Try `token_series.str.` to see the autofill of what is possible.\n", 520 | "\n", 521 | "Going back to our ALTA filtering for `isalpha()`, we can quickly do the same here:" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": { 528 | "collapsed": false 529 | }, 530 | "outputs": [ 531 | { 532 | "data": { 533 | "text/plain": [ 534 | "0 False\n", 535 | "1 False\n", 536 | "2 False\n", 537 | "3 False\n", 538 | "4 False\n", 539 | "5 False\n", 540 | "6 True\n", 541 | "7 True\n", 542 | "8 True\n", 543 | "9 True\n", 544 | "Name: token, dtype: bool" 545 | ] 546 | }, 547 | "execution_count": 191, 548 | "metadata": {}, 549 | "output_type": "execute_result" 550 | } 551 | ], 552 | "source": [ 553 | "is_alpha_matches = token_series.str.isalpha()\n", 554 | "is_alpha_matches.head(10)" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "We saw in Lab 5 that supplying a set of True or False values to a DataFrame allows us to select rows. lets try it with the above Series:" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "metadata": { 568 | "collapsed": false 569 | }, 570 | "outputs": [ 571 | { 572 | "data": { 573 | "text/html": [ 574 | "
\n", 575 | "\n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | "
pagesectiontokenposcountnew_columntoken_length
03body..1.1
13body0CD101
23body1CD111
33body2003CD120034
43body38-297CD138-2976
53body4CD141
63bodyDEMCONNP1DEMCO5
73bodyMNNP1M1
87bodyLEATHERNNP1LEATHER7
97bodyLIMPNNP1LIMP4
\n", 691 | "
" 692 | ], 693 | "text/plain": [ 694 | " page section token pos count new_column token_length\n", 695 | "0 3 body . . 1 . 1\n", 696 | "1 3 body 0 CD 1 0 1\n", 697 | "2 3 body 1 CD 1 1 1\n", 698 | "3 3 body 2003 CD 1 2003 4\n", 699 | "4 3 body 38-297 CD 1 38-297 6\n", 700 | "5 3 body 4 CD 1 4 1\n", 701 | "6 3 body DEMCO NNP 1 DEMCO 5\n", 702 | "7 3 body M NNP 1 M 1\n", 703 | "8 7 body LEATHER NNP 1 LEATHER 7\n", 704 | "9 7 body LIMP NNP 1 LIMP 4" 705 | ] 706 | }, 707 | "execution_count": 192, 708 | "metadata": {}, 709 | "output_type": "execute_result" 710 | } 711 | ], 712 | "source": [ 713 | "tl.head(10)" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": { 720 | "collapsed": false 721 | }, 722 | "outputs": [ 723 | { 724 | "data": { 725 | "text/html": [ 726 | "
\n", 727 | "\n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | "
pagesectiontokenposcountnew_columntoken_length
63bodyDEMCONNP1DEMCO5
73bodyMNNP1M1
87bodyLEATHERNNP1LEATHER7
97bodyLIMPNNP1LIMP4
107bodyMARKNNP1MARK4
\n", 793 | "
" 794 | ], 795 | "text/plain": [ 796 | " page section token pos count new_column token_length\n", 797 | "6 3 body DEMCO NNP 1 DEMCO 5\n", 798 | "7 3 body M NNP 1 M 1\n", 799 | "8 7 body LEATHER NNP 1 LEATHER 7\n", 800 | "9 7 body LIMP NNP 1 LIMP 4\n", 801 | "10 7 body MARK NNP 1 MARK 4" 802 | ] 803 | }, 804 | "execution_count": 193, 805 | "metadata": {}, 806 | "output_type": "execute_result" 807 | } 808 | ], 809 | "source": [ 810 | "tl[is_alpha_matches].head()" 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "metadata": {}, 816 | "source": [ 817 | "It worked! Of the top ten rows, the only ones that are selected are solely alphabetical. Remember that `is_alpha_matches` is simple `tl['token'].str.isalpha()`, which could have been used for selection.\n", 818 | "\n", 819 | "Finally, one more string method, `lower()`:" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": null, 825 | "metadata": { 826 | "collapsed": false 827 | }, 828 | "outputs": [ 829 | { 830 | "data": { 831 | "text/html": [ 832 | "
\n", 833 | "\n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | "
pagesectiontokenposcountnew_columntoken_lengthlowercase
44538289bodytimeNN1time4time
20868146bodytopNN1top3top
16134118bodymatterNN1matter6matter
42279276bodyhealingNN1healing7healing
23931165bodyeffortNN1effort6effort
\n", 905 | "
" 906 | ], 907 | "text/plain": [ 908 | " page section token pos count new_column token_length lowercase\n", 909 | "44538 289 body time NN 1 time 4 time\n", 910 | "20868 146 body top NN 1 top 3 top\n", 911 | "16134 118 body matter NN 1 matter 6 matter\n", 912 | "42279 276 body healing NN 1 healing 7 healing\n", 913 | "23931 165 body effort NN 1 effort 6 effort" 914 | ] 915 | }, 916 | "execution_count": 194, 917 | "metadata": {}, 918 | "output_type": "execute_result" 919 | } 920 | ], 921 | "source": [ 922 | "tl['lowercase'] = token_series.str.lower()\n", 923 | "tl.sample(5)" 924 | ] 925 | }, 926 | { 927 | "cell_type": "markdown", 928 | "metadata": {}, 929 | "source": [ 930 | "**Q8**: How is `token_series.str.istitle()` different from `token_series.str.isupper()`?" 931 | ] 932 | }, 933 | { 934 | "cell_type": "markdown", 935 | "metadata": {}, 936 | "source": [ 937 | "**Q9**: Which of the following options returns the tokens that have a hyphen in them?\n", 938 | "\n", 939 | " 1. `tl[tl['token'].str.has('-')]`\n", 940 | " 2. `tl[tl['token'].str.contains('-')]`\n", 941 | " 3. `tl[tl['token'].contains('-')]`\n", 942 | " 4. `tl[tl['token'] == '-']`\n", 943 | " 5. None of the above" 944 | ] 945 | } 946 | ], 947 | "metadata": { 948 | "kernelspec": { 949 | "display_name": "Python 3", 950 | "language": "python", 951 | "name": "python3" 952 | }, 953 | "language_info": { 954 | "codemirror_mode": { 955 | "name": "ipython", 956 | "version": 3 957 | }, 958 | "file_extension": ".py", 959 | "mimetype": "text/x-python", 960 | "name": "python", 961 | "nbconvert_exporter": "python", 962 | "pygments_lexer": "ipython3", 963 | "version": "3.5.1" 964 | } 965 | }, 966 | "nbformat": 4, 967 | "nbformat_minor": 0 968 | } 969 | --------------------------------------------------------------------------------