├── images
│   ├── info.png
│   ├── titanic.pdf
│   ├── autofill.png
│   ├── c_argmax.png
│   ├── freqplot.png
│   ├── inner-join.png
│   ├── lab5-plot1.png
│   ├── lab5-plot2.png
│   ├── lab5-plot3.png
│   ├── decisiontree.png
│   ├── french-words.png
│   ├── lab6-output.png
│   ├── english-words.png
│   ├── lsa_2dim_tfidf.png
│   ├── lab6-sawyer-plot.png
│   ├── lab8-group-column.png
│   ├── thestand-valence.png
│   └── titanic-zoomed-out.png
├── data
│   ├── hvd.hn6ltf.json.bz2
│   ├── mdp.49015002392919.json.bz2
│   ├── uc2.ark13960t1xd0sc6x.json.bz2
│   ├── classification
│   │   ├── test
│   │   │   ├── pst.000062491532.json.bz2
│   │   │   ├── mdp.39015004295880.json.bz2
│   │   │   ├── mdp.39015005725919.json.bz2
│   │   │   ├── mdp.39015008815865.json.bz2
│   │   │   ├── mdp.39015066049530.json.bz2
│   │   │   └── mdp.39076002736721.json.bz2
│   │   ├── train
│   │   │   ├── pst.000029579440.json.bz2
│   │   │   ├── wu.89104415476.json.bz2
│   │   │   ├── hvd.32044014292023.json.bz2
│   │   │   ├── hvd.32044102860673.json.bz2
│   │   │   ├── mdp.39015038910694.json.bz2
│   │   │   └── uiug.30112037882914.json.bz2
│   │   └── english_french_class_labels.csv
│   └── contemporary_books
│       ├── contemporary-pages.csv.gz
│       ├── dataset_files
│       │   ├── pst.000023498051.json.bz2
│       │   ├── pst.000026748658.json.bz2
│       │   ├── pst.000026751405.json.bz2
│       │   ├── pst.000044406462.json.bz2
│       │   ├── pst.000050069378.json.bz2
│       │   ├── mdp.39015005028686.json.bz2
│       │   ├── mdp.39015010763418.json.bz2
│       │   ├── mdp.39015027242315.json.bz2
│       │   ├── mdp.39015029244657.json.bz2
│       │   ├── mdp.39015031703609.json.bz2
│       │   ├── mdp.39015038148048.json.bz2
│       │   ├── mdp.39015040702071.json.bz2
│       │   ├── mdp.39015043780249.json.bz2
│       │   ├── mdp.39015043798936.json.bz2
│       │   ├── mdp.39015046381565.json.bz2
│       │   ├── mdp.39015046788223.json.bz2
│       │   ├── mdp.39015046835560.json.bz2
│       │   ├── mdp.39015054084192.json.bz2
│       │   ├── mdp.39015054263903.json.bz2
│       │   ├── mdp.39015055831070.json.bz2
│       │   ├── mdp.39015058207492.json.bz2
│       │   ├── mdp.39015060663583.json.bz2
│       │   ├── mdp.39015062842383.json.bz2
│       │   ├── mdp.39015063682309.json.bz2
│       │   ├── mdp.39015066084040.json.bz2
│       │   ├── mdp.39015070756609.json.bz2
│       │   ├── mdp.39015073669205.json.bz2
│       │   ├── mdp.39015073669312.json.bz2
│       │   ├── uc1.32106011612402.json.bz2
│       │   ├── uc1.32106012198112.json.bz2
│       │   └── uc1.32106017944551.json.bz2
│       └── contemporary_labels.csv
├── writing.md
├── README.md
├── labs
│   ├── Lab 04.ipynb
│   ├── Lab 03.ipynb
│   ├── Lab 05 - Part of Speech Tagging, Starting with Pandas.ipynb
│   ├── Lab 02.ipynb
│   └── Lab 06 - More Pandas and Intro to Classification.ipynb
├── assignments.md
├── examples
│   ├── French-English Classification.ipynb
│   ├── Topic Modelling Trump Tweets.ipynb
│   └── Pivot Example.ipynb
└── syllabus.md
/images/info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/info.png
--------------------------------------------------------------------------------
/images/titanic.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/titanic.pdf
--------------------------------------------------------------------------------
/images/autofill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/autofill.png
--------------------------------------------------------------------------------
/images/c_argmax.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/c_argmax.png
--------------------------------------------------------------------------------
/images/freqplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/freqplot.png
--------------------------------------------------------------------------------
/images/inner-join.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/inner-join.png
--------------------------------------------------------------------------------
/images/lab5-plot1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab5-plot1.png
--------------------------------------------------------------------------------
/images/lab5-plot2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab5-plot2.png
--------------------------------------------------------------------------------
/images/lab5-plot3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab5-plot3.png
--------------------------------------------------------------------------------
/images/decisiontree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/decisiontree.png
--------------------------------------------------------------------------------
/images/french-words.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/french-words.png
--------------------------------------------------------------------------------
/images/lab6-output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab6-output.png
--------------------------------------------------------------------------------
/data/hvd.hn6ltf.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/hvd.hn6ltf.json.bz2
--------------------------------------------------------------------------------
/images/english-words.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/english-words.png
--------------------------------------------------------------------------------
/images/lsa_2dim_tfidf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lsa_2dim_tfidf.png
--------------------------------------------------------------------------------
/images/lab6-sawyer-plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab6-sawyer-plot.png
--------------------------------------------------------------------------------
/images/lab8-group-column.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/lab8-group-column.png
--------------------------------------------------------------------------------
/images/thestand-valence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/thestand-valence.png
--------------------------------------------------------------------------------
/images/titanic-zoomed-out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/images/titanic-zoomed-out.png
--------------------------------------------------------------------------------
/data/mdp.49015002392919.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/mdp.49015002392919.json.bz2
--------------------------------------------------------------------------------
/data/uc2.ark13960t1xd0sc6x.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/uc2.ark13960t1xd0sc6x.json.bz2
--------------------------------------------------------------------------------
/data/classification/test/pst.000062491532.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/pst.000062491532.json.bz2
--------------------------------------------------------------------------------
/data/classification/train/pst.000029579440.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/pst.000029579440.json.bz2
--------------------------------------------------------------------------------
/data/classification/train/wu.89104415476.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/wu.89104415476.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/contemporary-pages.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/contemporary-pages.csv.gz
--------------------------------------------------------------------------------
/data/classification/test/mdp.39015004295880.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/mdp.39015004295880.json.bz2
--------------------------------------------------------------------------------
/data/classification/test/mdp.39015005725919.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/mdp.39015005725919.json.bz2
--------------------------------------------------------------------------------
/data/classification/test/mdp.39015008815865.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/mdp.39015008815865.json.bz2
--------------------------------------------------------------------------------
/data/classification/test/mdp.39015066049530.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/mdp.39015066049530.json.bz2
--------------------------------------------------------------------------------
/data/classification/test/mdp.39076002736721.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/test/mdp.39076002736721.json.bz2
--------------------------------------------------------------------------------
/data/classification/train/hvd.32044014292023.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/hvd.32044014292023.json.bz2
--------------------------------------------------------------------------------
/data/classification/train/hvd.32044102860673.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/hvd.32044102860673.json.bz2
--------------------------------------------------------------------------------
/data/classification/train/mdp.39015038910694.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/mdp.39015038910694.json.bz2
--------------------------------------------------------------------------------
/data/classification/train/uiug.30112037882914.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/classification/train/uiug.30112037882914.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/pst.000023498051.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/pst.000023498051.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/pst.000026748658.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/pst.000026748658.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/pst.000026751405.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/pst.000026751405.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/pst.000044406462.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/pst.000044406462.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/pst.000050069378.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/pst.000050069378.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015005028686.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015005028686.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015010763418.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015010763418.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015027242315.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015027242315.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015029244657.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015029244657.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015031703609.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015031703609.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015038148048.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015038148048.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015040702071.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015040702071.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015043780249.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015043780249.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015043798936.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015043798936.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015046381565.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015046381565.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015046788223.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015046788223.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015046835560.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015046835560.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015054084192.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015054084192.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015054263903.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015054263903.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015055831070.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015055831070.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015058207492.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015058207492.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015060663583.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015060663583.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015062842383.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015062842383.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015063682309.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015063682309.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015066084040.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015066084040.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015070756609.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015070756609.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015073669205.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015073669205.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/mdp.39015073669312.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/mdp.39015073669312.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/uc1.32106011612402.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/uc1.32106011612402.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/uc1.32106012198112.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/uc1.32106012198112.json.bz2
--------------------------------------------------------------------------------
/data/contemporary_books/dataset_files/uc1.32106017944551.json.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/organisciak/Text-Mining-Course/HEAD/data/contemporary_books/dataset_files/uc1.32106017944551.json.bz2
--------------------------------------------------------------------------------
/data/classification/english_french_class_labels.csv:
--------------------------------------------------------------------------------
1 | book,title,language
2 | hvd.32044014292023,"Alice's adventures in Wonderland ; and, Through the looking-glass / by Lewis Carroll ; with ninety-two illustrations by John Tenniel.",eng
3 | hvd.32044102860673,"Notre Dame de Paris. Abridged and edited, with introd. and notes, by John R. Wightman.",fre
4 | mdp.39015038910694,"Moby Dick,",eng
5 | pst.000029579440,The adventures of Huckleberry Finn / by Mark Twain.,eng
6 | uiug.30112037882914,Candide ou L'optimisme.,fre
7 | wu.89104415476,Les liaisons dangereuses / Choderlos de Laclos ;édition publiée d'après le texte original précédée d'une étude sur Choderlos de Laclos et suivie d'une bibliographie par Ad. Van Berver.,fre
8 | mdp.39015004295880,"Les caves du Vatican,",fre
9 | mdp.39015005725919,Madame Bovary de Gustave Flaubert; étude et analyse.,fre
10 | mdp.39015008815865,Jean Barois ...,fre
11 | mdp.39015066049530,The catcher in the rye / J. D. Salinger.,eng
12 | mdp.39076002736721,Catch-22 / Joseph Heller.,eng
13 | pst.000062491532,The lord of the rings / J.R.R. Tolkien.,eng
14 |
--------------------------------------------------------------------------------
/data/contemporary_books/contemporary_labels.csv:
--------------------------------------------------------------------------------
1 | book,author,title
2 | mdp.39015005028686,King,The stand
3 | mdp.39015010763418,Atwood,Lady oracle;
4 | mdp.39015027242315,Atwood,The robber bride
5 | mdp.39015029244657,Grisham,The pelican brief
6 | mdp.39015031703609,Grisham,The rainmaker
7 | mdp.39015038148048,King,Desperation
8 | mdp.39015040702071,Atwood,Alias Grace
9 | mdp.39015043780249,King,The girl who loved Tom Gordon
10 | mdp.39015043798936,King,Bag of bones
11 | mdp.39015046381565,Grisham,A time to kill
12 | mdp.39015046788223,Grisham,The rainmaker
13 | mdp.39015046835560,Grisham,The partner
14 | mdp.39015054084192,Grisham,The testament
15 | mdp.39015054263903,King,Everything's eventual : 14 dark tales
16 | mdp.39015055831070,King,From a Buick 8 : a novel
17 | mdp.39015058207492,Grisham,The last juror
18 | mdp.39015060663583,Atwood,The handmaid's tale
19 | mdp.39015062842383,Atwood,The Penelopiad
20 | mdp.39015063682309,King,Cell : a novel
21 | mdp.39015066084040,Grisham,The summons
22 | mdp.39015070756609,Grisham,Playing for pizza
23 | mdp.39015073669205,King,Duma Key
24 | mdp.39015073669312,Grisham,The appeal
25 | pst.000023498051,King,Carrie
26 | pst.000026748658,Atwood,Bodily harm
27 | pst.000026751405,Atwood,Cat's eye
28 | pst.000044406462,Atwood,Life before man
29 | pst.000050069378,Grisham,The king of torts (large print)
30 | uc1.32106011612402,King,The dark half
31 | uc1.32106012198112,King,Stephen King's Danse macabre
32 | uc1.32106017944551,King,Cujo
33 |
--------------------------------------------------------------------------------
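
Both label files key HathiTrust volume IDs to their labels: language for the English/French classification set, and author for the contemporary books set. As a minimal sketch of loading them with pandas (the paths are the repository-relative ones from the directory tree above; pandas itself comes from the later labs that use it):

```python
import pandas as pd

# Repository-relative paths, as listed in the directory tree above.
lang_labels = pd.read_csv("data/classification/english_french_class_labels.csv")
book_labels = pd.read_csv("data/contemporary_books/contemporary_labels.csv")

# Each row maps a HathiTrust volume id ("book") to its label.
print(lang_labels[["book", "language"]].head())
print(book_labels["author"].value_counts())
```

Matching these labels to the per-volume Extracted Features files is then a matter of joining on the volume id.
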
/writing.md:
--------------------------------------------------------------------------------
1 | # Writing about Text Mining
2 |
3 | The tools and concepts that we learn in this class ultimately serve to answer a research question.
4 | Here, I've collected online writing that uses text mining well to investigate and tell a good story.
5 | Use these as inspiration for your final projects.
6 |
7 | ## Posts / Articles
8 |
9 | - [The Language of the State of the Union](https://www.theatlantic.com/politics/archive/2015/01/the-language-of-the-state-of-the-union/384575/). Benjamin Schmidt and Mitch Fraas.
10 | - [Text analysis of Trump's tweets confirms he writes only the (angrier) Android half](http://varianceexplained.org/r/trump-tweets/). David Robinson.
11 | - [The Birdy Pulpit — Analyzing Trump's Twitter Account](http://www.thecrosstab.com/2017/03/07/analysing-trump-tweets/). George Elliott Morris.
12 | - [Does sentiment analysis work? A tidy analysis of Yelp reviews](http://varianceexplained.org/r/yelp-sentiment/). David Robinson.
13 | - [Screen Time!](http://sappingattention.blogspot.com/2014/09/screen-time.html#more). Benjamin Schmidt.
14 | - [How Reddit Talked About The 2016 Presidential Campaign, From ‘Basket Of Deplorables’ To ‘Yuge’](http://fivethirtyeight.com/features/how-reddit-talked-about-the-2016-presidential-campaign-from-basket-of-deplorables-to-yuge/). Ritchie King.
15 | - [The Foreign Language of 'Mad Men'](https://www.theatlantic.com/entertainment/archive/2012/03/the-foreign-language-of-mad-men/254668/). Benjamin Schmidt.
16 | - [The instability of gender](https://tedunderwood.com/2016/01/09/the-instability-of-gender/). Ted Underwood.
17 |
18 | ## Websites / Blogs
19 |
20 | - [Language Log](http://languagelog.ldc.upenn.edu/nll/)
21 | - [Sapping Attention](http://sappingattention.blogspot.com) - Benjamin Schmidt's blog.
22 | - [Variance Explained](http://varianceexplained.org) - Blog by David Robinson.
23 | - [The Crosstab](http://www.thecrosstab.com) - Political data blog.
24 | - [FiveThirtyEight](http://fivethirtyeight.com) - News organization focused on data reporting.
25 | - [The Stone and The Shell](https://tedunderwood.com/) - Blog by Ted Underwood.
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text Mining
2 |
3 | [Assignments](assignments.md) | [Lab Worksheets](https://github.com/organisciak/Text-Mining-Course/tree/master/labs) | [Syllabus](syllabus.md)
4 |
5 | ## Overview
6 |
7 | This course introduces students to the knowledge discovery process and methods used to mine patterns from a collection of text. We will critically review text mining methods developed in the knowledge discovery in databases, information science, and computational linguistics communities. Students will develop proficiency with modeling text through individual projects.
8 |
9 | How can computers read? When we look at a paragraph of text, we have a set of skills to understand and interpret it: what is the message? Is it an argument? What is the sentiment? Computers don't have the same context or literacy. Their language is quantitative. Through text mining, this course will equip you with the skills to understand text through computing.
10 |
11 | Text mining is most useful for the new affordances it allows. In most cases, the tools of text mining aren't meant to replace 'close reading'; they give us new ways to ask questions - about literature, news, scholarship, correspondence, etc. - and are best applied in service of that novelty. Computing allows for:
12 |
13 | - Scale: Computers compare poorly to us in their ability to interpret meaning, but what they can do, they can do at enormous scale. If you're interested in hundreds of books, thousands of web pages, or millions of tweets, simply reading them is infeasible.
14 | - Re-contextualization: With text mining, you take apart texts and put them together in new ways. This gives you new ways to understand the information in a text or appreciate a book. Likewise, breaking text down into data provides new comparative and critical tools. For example, we can understand what makes Jane Austen's books different from those of her contemporaries, or attribute authorship for anonymous or pseudonymous writing.
15 | - Summarization: Aggregation, extraction, and visualization all serve to report patterns to you. For example, text summarization models can extract the takeaway points from a set of medical literature.
16 | A few final notes on course philosophy.
17 |
18 | First, the broad view of text mining can encompass many disciplinary approaches. This course hews closely to the sub-area referred to as text analysis, intended to treat text mining in the service of qualitative questions. This is closest to the treatments in the digital humanities and computational social sciences.
19 |
20 | For this course, you will be expected to learn new programming skills. Note that this is not a programming course. We will cover a subset of skills in Python that pertain to data science. Most of the time, your needs will be served by tinkering with and modifying code examples that I provide for you.
21 |
22 | I understand the time constraints of being a student. To account for the time you will spend in this course learning new tools and writing code, I have tried to keep reading and writing loads reasonable.
23 |
24 | Succeeding in this course happens through many little steps. The assignments are small but frequent. If you look at the entire outline of ideas and skills in this course, it may seem overwhelming. However, taken one step at a time, learning the language of text mining won't be scary.
25 |
26 | ## Pre- and Co-requisites
27 | An introductory level database and programming course or permission of the instructor.
28 |
29 | ## Required Texts
30 | This course incorporates readings from a variety of sources. Readings will be openly accessible and posted on or linked from the course website. In addition to individual essays and papers, we will also return repeatedly to the following texts:
31 |
32 | - [Art of Literary Text Analysis](https://github.com/sgsinclair/alta/tree/master/ipynb) - Stefan Sinclair, 2015-
33 | - [Introduction to Information Retrieval](http://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Manning, Raghavan, and Schütze, 2008
34 | - [Speech and Language Processing](https://web.stanford.edu/~jurafsky/slp3/) 3rd edition. Dan Jurafsky and James H. Martin. 2017.
35 | - [Search Engines: Information Retrieval in Practice](http://ciir.cs.umass.edu/irbook/) - Croft, Metzler and Strohman. 2009.
36 |
37 | ## Schedule
38 |
39 | - Week 1: Introduction
40 | - Week 2: Fundamentals
41 | - Week 3: Features
42 | - Week 4: Text Mining for Art and Criticism
43 | - Week 5: Documentation Access; Natural Language Processing 1 - Part of Speech Tagging
44 | - Week 6: Natural Language Processing 2 - Information Extraction and Dependency Parsing
45 | - Week 7: Classification 1
46 | - Week 8: Classification 2
47 | - Week 9: Clustering
48 | - Week 10: Topic Modeling and Dimensionality Reduction 1
49 | - Week 11: Topic Modelling 2; Sentiment Analysis
50 | - Week 12: Visualization
51 | - Week 13: Word Embeddings
52 | - Week 14: What's Next: Remainder Notes from Text Mining
53 |
54 | The week-to-week syllabus, with readings, slides, and schedule notes is on the [Syllabus page](syllabus.md).
55 |
56 | ## Assignments
57 |
58 | - 30% Lab Tasks - Due Weekly
59 | - 20% Small Assignments
60 |   - 10% Twitter Bot Assignment
61 |   - 10% Topic Modelling Assignment
62 | - 35% Text Mining Project
63 |   - 5% Problem Statement
64 |   - 5% Literature review + 5% Data collection
65 |   - 20% Final report
66 | - 15% Participation
67 |   - 5% Attendance
68 |   - 10% Forum posts, comments, class engagement
69 |
70 | Details are on the [Assignments page](assignments.md).
71 |
--------------------------------------------------------------------------------
/labs/Lab 04.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Hopefully by this week, you're comfortable with lists (i.e. [a, b, c, ..]) and growing to understand list comprehensions.\n",
8 | "\n",
9 | "Two fundamental Python skills to be aware of. First, there's a general purpose method called `len()` that returns the length of an object, like \"how many items in this list\" or \"how many characters in this string\". e.g."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {
16 | "collapsed": false
17 | },
18 | "outputs": [
19 | {
20 | "data": {
21 | "text/plain": [
22 | "3"
23 | ]
24 | },
25 | "execution_count": 1,
26 | "metadata": {},
27 | "output_type": "execute_result"
28 | }
29 | ],
30 | "source": [
31 | "l = ['hello', 'text', 'mining']\n",
32 | "len(l)"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [
42 | {
43 | "data": {
44 | "text/plain": [
45 | "11"
46 | ]
47 | },
48 | "execution_count": 2,
49 | "metadata": {},
50 | "output_type": "execute_result"
51 | }
52 | ],
53 | "source": [
54 | "len(\"Text Mining\")"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "There is also an object called a `set`, which is like a list, but without an ordering and only allowing unique elements. This is useful for us, because it gives a quick way to see just the unique words of a list: the vocabulary."
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "List: ['Buffalo', 'buffalo', 'Buffalo', 'buffalo', 'buffalo', 'Buffalo', 'buffalo', 'buffalo']\n",
76 | "Set: {'Buffalo', 'buffalo'}\n"
77 | ]
78 | }
79 | ],
80 | "source": [
81 | "l = ['Buffalo', 'buffalo', 'Buffalo', 'buffalo', 'buffalo', 'Buffalo', 'buffalo', 'buffalo']\n",
82 | "s = set(l)\n",
83 | "print(\"List:\", l)\n",
84 | "print(\"Set:\", s)"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "# Normalization"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "This week, follow along with [Searching for Meaning](https://github.com/sgsinclair/alta/blob/41f389f3d9708573c44c883bcd95fd16bad54a24/ipynb/SearchingMeaning.ipynb) from the Art of Literary Text Analysis.\n",
99 | "\n",
100 | "Use the trimmed version of Frankenstein from last week to try some of the concepts in the chapter. This should get you up to speed."
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "collapsed": false
108 | },
109 | "outputs": [
110 | {
111 | "data": {
112 | "text/plain": [
113 | "85440"
114 | ]
115 | },
116 | "execution_count": 1,
117 | "metadata": {},
118 | "output_type": "execute_result"
119 | }
120 | ],
121 | "source": [
122 | "import nltk\n",
123 | "with open('../data/frankenstein.txt') as f:\n",
124 | " frankensteinString = f.read()\n",
125 | "frankensteinTokens = nltk.word_tokenize(frankensteinString)\n",
126 | "cleanedTokens = [word.lower() for word in frankensteinTokens if word[0].isalpha()]\n",
127 | "len(frankensteinTokens)"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "Note, that there are 85440 tokens in the text. If we count just the _unique_ words (the _vocabulary size_), we find 7038:"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {
141 | "collapsed": false,
142 | "scrolled": true
143 | },
144 | "outputs": [
145 | {
146 | "data": {
147 | "text/plain": [
148 | "7038"
149 | ]
150 | },
151 | "execution_count": 2,
152 | "metadata": {},
153 | "output_type": "execute_result"
154 | }
155 | ],
156 | "source": [
157 | "len(set(cleanedTokens))"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "**Questions**\n",
165 | "\n",
166 | "- 1) Create a lemmatized version of cleanedTokens and count the unique lemmas. Share the code to do this: the answer that it gives you should be 6416.\n",
167 | "- 2) Re-do the lemmatization after stopping words against the default NLTK stoplist, and tabulate the top ten words. Paste the code and output.\n",
168 | "- 3) How does the tabulation of lemmas differ from the tabulation of the non-lemmatized (but still stopped and case-folded) tokens?\n",
169 | "- 4) What are the WordNet synsets for 'monster'?\n",
170 | "- 5) A synset has a method called `definition()`. Noting that the code for Q4 resulted in a list, write a list comprehension to extracts all the definitions for each synset. Share the code and output.\n",
171 | "- 6) Each synset is a child of a more general synset. For example, `crab` is an example of a `decapod_crustacean`, which is more generally a `crustacean`, and so on. You can get at the paths to the root of this tree with `hypernym_paths()`. Paste the code and hypernym path for `freak.n.01`.\n",
172 | "- 7) (for 2 points): We've already seen some corpora that NLTK can pull in, from the complex WordNet information to a basic stoplist. Using the NLTK information on male/female names, determine and paste in the unique female names in Frankenstein. This isn't in the ALTA book, but searching Google sometimes helps ;)\n",
173 | "\n",
174 | "Our copy of Frankenstein is from Project Gutenberg, a collection of transcriptions of public domain (i.e. legally shareable) books. NLTK offers a small selection of those books through `nltk.corpus.gutenberg`.\n",
175 | "\n",
176 | "Load the gutenberg corpus and convert it to what NLTK calls a TextCollection:"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {
183 | "collapsed": false
184 | },
185 | "outputs": [],
186 | "source": [
187 | "from nltk.text import TextCollection\n",
188 | "gutenberg_docs = nltk.corpus.gutenberg\n",
189 | "gutenberg_collection = TextCollection(gutenberg_docs)"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "Using a method of `gutenberg_collection` (remember auto-complete!), answer the final two-part question.\n",
197 | "\n",
198 | "Questions:\n",
199 | " \n",
200 | " - 8) For 2 points:\n",
201 | " - What is the TFIDF for 'monster' in Frankenstein? You'll need the original (unnormalized) tokens.\n",
202 | " - What word has the highest TF-IDF for the following: 'miserable', 'horror', 'monster'? If you need it, you can compare numbers in python with > (greater than) or < (less than)."
203 | ]
204 | }
205 | ],
206 | "metadata": {
207 | "kernelspec": {
208 | "display_name": "Python 3",
209 | "language": "python",
210 | "name": "python3"
211 | },
212 | "language_info": {
213 | "codemirror_mode": {
214 | "name": "ipython",
215 | "version": 3
216 | },
217 | "file_extension": ".py",
218 | "mimetype": "text/x-python",
219 | "name": "python",
220 | "nbconvert_exporter": "python",
221 | "pygments_lexer": "ipython3",
222 | "version": "3.5.1"
223 | }
224 | },
225 | "nbformat": 4,
226 | "nbformat_minor": 0
227 | }
228 |
--------------------------------------------------------------------------------
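
Lab 04 leaves the lemmatization and WordNet steps as exercises. The sketch below shows one possible shape for them, assuming NLTK's WordNetLemmatizer (the lab doesn't mandate a particular lemmatizer) and the ../data/frankenstein.txt path used in the lab's own cells:

```python
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Same loading and cleaning steps as the lab's own cells.
with open('../data/frankenstein.txt') as f:
    frankensteinString = f.read()
frankensteinTokens = nltk.word_tokenize(frankensteinString)
cleanedTokens = [word.lower() for word in frankensteinTokens if word[0].isalpha()]

# Lemmatize every cleaned token, then count the unique lemmas with set().
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(word) for word in cleanedTokens]
print(len(set(lemmas)))

# WordNet lookups: the synsets for a word, and each synset's definition.
synsets = wordnet.synsets('monster')
print([s.definition() for s in synsets])
```
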
/labs/Lab 03.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Week 3 Lab Task\n",
8 | "\n",
9 | "## More Jupyter Tips\n",
10 | "\n",
11 | "Hopefully by this week, you are growing more comfortable with starting Jupyter Notebooks and adding/editing cells. Remember that the keyboard shortcuts are invaluable: running a cell with `Ctrl+Enter`, or adding a new cell below with `B` (in command mode).\n",
12 | "\n",
13 | "Two tricks to try this week: autocompletion and retrieving documentation.\n",
14 | "\n",
15 | "**Autocomplete**\n",
16 | "\n",
17 | "If you start typing a known object or function into Jupyter, you can press `TAB` to finish it. This is especially useful for seeing what functions are available."
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {
24 | "collapsed": true
25 | },
26 | "outputs": [],
27 | "source": [
28 | "test = \"this is a string\""
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "Above, I've set a string to `test`. If I type `te` then press tab, it will complete the word. This is especially useful for long variable names that you don't want to keep typing. Note that it only completed because there no other options: in that case, there's a scrollable list of candidates for what you might be looking for.\n",
36 | "\n",
37 | "The `test` variable is a string. Last week, we saw a two functions that can be performed on strings: `split()` and `join()`. If you would like to see what other options there are for strings, try typing `test.` then press TAB. Magic!\n",
38 | "\n",
39 | "\n",
40 | "\n",
41 | "** Documentation reference **\n",
42 | "\n",
43 | "If you want to look up information about a function, you can precede the code running that function with a `?`. For example, if I want to learn how I would use `split()` on `test`, I can type:"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {
50 | "collapsed": true
51 | },
52 | "outputs": [],
53 | "source": [
54 | "?test.split()"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "This will open a panel that looks like this in Jupyter:\n",
62 | "\n",
63 | "\n",
64 | "\n",
65 | "The documentation is only as good as what the library is documented, so some libraries might be more or less detailed in this feature.\n",
66 | "\n",
67 | "*Questions*\n",
68 | "\n",
69 | "- 1) What does `test.isalpha()` do? Copy the documentation string.\n",
70 | "- 2) Strings have access to a function (whose name starts with a `ce`) that will let you change \"HEADING\" to \"====HEADING====\" (that is, padding with `=` to make the string 15 characters wide). What's the code to do that? (tip: this is an auto-fill question!)"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "## Intro to the NLTK\n",
78 | "\n",
79 | "This week we'll start using the Natural Language toolkit. For the remaining questions, follow along with:\n",
80 | "\n",
81 | "- [Getting NLTK for Text Processing](https://github.com/sgsinclair/alta/blob/2acb6ed09f298f631e4025d33f062f980758a1ce/ipynb/GettingNltk.ipynb), Art of Literary Text Analysis\n",
82 | "\n",
83 | "Two notes. First, the tutorial suggests downloading \"all\" packages. However, install the packages from 'book' should be sufficient for now.\n",
84 | "\n",
85 | "Also, skip the text processing section, which deals with automatically downloading and cleaning a book. Instead, download this [already-cleaned version of Mary Shelley's Frankenstein](https://raw.githubusercontent.com/organisciak/Text-Mining-Course/master/data/frankenstein.txt), put it into the same folder as your notebook, and load it as follows:"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {
92 | "collapsed": false
93 | },
94 | "outputs": [],
95 | "source": [
96 | "with open(\"../data/frankenstein.txt\") as f:\n",
97 | " frankensteinString = f.read()"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "Here's a quick way of viewing part of our string: the first 250 characters. Notice that you can select subsets of strings like you select subsets of lists. "
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [
114 | {
115 | "data": {
116 | "text/plain": [
117 | "'Letter 1\\n\\nSt. Petersburgh, Dec. 11th, 17--\\n\\nTO Mrs. Saville, England\\n\\nYou will rejoice to hear that no disaster has accompanied the\\ncommencement of an enterprise which you have regarded with such evil\\nforebodings. I arrived here yesterday, and my'"
118 | ]
119 | },
120 | "execution_count": 114,
121 | "metadata": {},
122 | "output_type": "execute_result"
123 | }
124 | ],
125 | "source": [
126 | "frankensteinString[0:250]"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | " > Side-note for the Python novice: you don't actually need the zero in [0:250]. If left blank, like '[:250]`, Python will assume \"from the very start\", which is the same as using a 0. If you leave the second part blank, Python will assume \"until the very end\".\n",
134 | " \n",
135 | "For the rest of the ALTA chapter, follow along using `frankensteinString` string instead of `goldBugString`."
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "__Questions__\n",
143 | "\n",
144 | " - 3) Use the `word_tokenize` function on Frankenstein, as shown in ALTA. What are tokens 39:67? Hint: this is a full sentence. Include your code.\n",
145 | " - 4) Create a sample of only the tokens where the first character is an alphabetical character. In this sample, what are tokens 1215:1221? Again, this will be a sentence, but won't include punctuation as tokens. Include your code.\n",
146 | " \n",
147 | " _For the next questions use the list of tokens that start with an alphabetical character._\n",
148 | " \n",
149 | " \n",
150 | " - 5) What are the ten most frequent words in this book? Create a frequency distribution of the words from question 4, then tabulate the top 10 words. Include your code.\n",
151 | " - 6) After case-folding, what are the ten most frequent words in this book? Include your code.\n",
152 | " - 7) Rewrite this list comprehension as a `for` loop (what ALTA called technique 1): `[word for word in listOfWords if word.find('-') >= 0]`. No output necessary, just the code, but feel free to test it out.\n",
153 | " - 8) We're going to use a customized stoplist. First, load the NLTK stoplist, and add the words 'could', 'would', 'upon', and 'yet' to the stoplist. What are the top ten case-folded words when stopping against the stoplist. Include your code and paste the tabulated output.\n",
154 | " \n",
155 | "Using the autocomplete in Jupyter, you may notice that a list of tokens converted to a `FreqDist` object has more methods than just `tabulate()`. One really cool one is `plot()`.\n",
156 | "\n",
157 | "`plot` gives you a visualization of the top frequency words. However, you may notice that if you try to run it, the visualization doesn't show up.\n",
158 | "\n",
159 | "It _is_ created, but Jupyter just doesn't know that you want the visualization shown _within_ the notebook. To turn that option on, run the following line of code:"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {
166 | "collapsed": true
167 | },
168 | "outputs": [],
169 | "source": [
170 | "%matplotlib inline"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "This is only necessary once: it tells Jupyter to show plots 'inline' (ie. inside the notebook).\n",
178 | "\n",
179 | "**Questions**\n",
180 | "\n",
181 | "- 9) Write the code to plot the top forty stoplisted, lowercase words (from question 8). And again, remember the docs! The output will look similar to this:"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | ""
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "- 10) Enter the first 5 concordances for the word \"monster\" in the original token list - the list straight from word_tokenize that included punctuation and numbers - narrowing the search to a 49-characters window. Include the code. Tip: See the docs for the concordance tool in Jupyter."
196 | ]
197 | }
198 | ],
199 | "metadata": {
200 | "kernelspec": {
201 | "display_name": "Python 3",
202 | "language": "python",
203 | "name": "python3"
204 | },
205 | "language_info": {
206 | "codemirror_mode": {
207 | "name": "ipython",
208 | "version": 3
209 | },
210 | "file_extension": ".py",
211 | "mimetype": "text/x-python",
212 | "name": "python",
213 | "nbconvert_exporter": "python",
214 | "pygments_lexer": "ipython3",
215 | "version": "3.5.1"
216 | }
217 | },
218 | "nbformat": 4,
219 | "nbformat_minor": 0
220 | }
221 |
--------------------------------------------------------------------------------
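
The tokenizing, stoplisting, and plotting steps that Lab 03 walks through chain together roughly like this — a minimal sketch, assuming the same ../data/frankenstein.txt file and that NLTK's 'stopwords' corpus has been downloaded:

```python
import nltk
from nltk.corpus import stopwords

with open('../data/frankenstein.txt') as f:
    frankensteinString = f.read()

# Tokenize, keep only tokens that start with a letter, and case-fold.
tokens = nltk.word_tokenize(frankensteinString)
words = [w.lower() for w in tokens if w[0].isalpha()]

# Customized stoplist: NLTK's English list plus the lab's four extra words.
stoplist = set(stopwords.words('english')) | {'could', 'would', 'upon', 'yet'}
stopped = [w for w in words if w not in stoplist]

# Frequency distribution: tabulate the top ten, plot the top forty.
fdist = nltk.FreqDist(stopped)
fdist.tabulate(10)
fdist.plot(40)
```

In a notebook, remember to run %matplotlib inline once first so the plot renders inside the page.
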
/labs/Lab 05 - Part of Speech Tagging, Starting with Pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Lab 5"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Part of Speech Tagging\n",
15 | "\n",
16 | "And tuples!\n",
17 | "\n",
18 | "Here is how you tag parts-of-speech with NLTK:"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [
28 | {
29 | "data": {
30 | "text/plain": [
31 | "[('And', 'CC'),\n",
32 | " ('now', 'RB'),\n",
33 | " ('for', 'IN'),\n",
34 | " ('something', 'NN'),\n",
35 | " ('completely', 'RB'),\n",
36 | " ('different', 'JJ')]"
37 | ]
38 | },
39 | "execution_count": 3,
40 | "metadata": {},
41 | "output_type": "execute_result"
42 | }
43 | ],
44 | "source": [
45 | "import nltk\n",
46 | "text = \"And now for something completely different\"\n",
47 | "tokens = nltk.word_tokenize(text)\n",
48 | "tagged = nltk.pos_tag(tokens)\n",
49 | "tagged"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "The output of `pos_tag` is a list of objects called 'tuples'. You can access a tuple by index or you can easily expand it into multiple variables:"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {
63 | "collapsed": false
64 | },
65 | "outputs": [
66 | {
67 | "data": {
68 | "text/plain": [
69 | "('England', 'PRP')"
70 | ]
71 | },
72 | "execution_count": 16,
73 | "metadata": {},
74 | "output_type": "execute_result"
75 | }
76 | ],
77 | "source": [
78 | "test_tuple = ('England', 'PRP')\n",
79 | "test_tuple"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {
86 | "collapsed": false
87 | },
88 | "outputs": [
89 | {
90 | "data": {
91 | "text/plain": [
92 | "'England'"
93 | ]
94 | },
95 | "execution_count": 17,
96 | "metadata": {},
97 | "output_type": "execute_result"
98 | }
99 | ],
100 | "source": [
101 | "test_tuple[0]"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {
108 | "collapsed": false
109 | },
110 | "outputs": [
111 | {
112 | "data": {
113 | "text/plain": [
114 | "'England'"
115 | ]
116 | },
117 | "execution_count": 18,
118 | "metadata": {},
119 | "output_type": "execute_result"
120 | }
121 | ],
122 | "source": [
123 | "word, pos = test_tuple\n",
124 | "word"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "Since you can expand tuples easily, you can name the parts of a tuple in a list comprehension. Note in the following example that we follow a `for x, y in list_of_tuples` pattern instead of `for x in list_of_tuples` as we've encountered before: "
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "['CC', 'RB', 'IN', 'NN', 'RB', 'JJ']"
145 | ]
146 | },
147 | "execution_count": 5,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "[tag for word, tag in tagged]"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "The `for x,y in list_of_tuples` approach also works for in for loops.\n",
161 | "\n",
162 | "Tuples don't need to have only two values in Python, but that's the most common. "
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "**Q1**: How do you get a list of all the singular proper nouns tagged by NLTK in Frankenstein? Share just the code.\n",
170 | "\n",
171 | "Part of Speech tag definitions are at [Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html). To double check, the output of your code should start with `['St.', 'Petersburgh', 'Dec.', 'TO', 'Mrs.']`. For Python beginners, note that comparing strings is done with `==`, as in `string == string2`."
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {
178 | "collapsed": false
179 | },
180 | "outputs": [],
181 | "source": [
182 | "# Getting started\n",
183 | "with open(\"../data/frankenstein.txt\") as f:\n",
184 | " frank_string = f.read()\n",
185 | "frank_tokens = nltk.word_tokenize(frank_string)\n",
186 | "\n",
187 | "# ... what's next?"
188 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "metadata": {},
193 | "source": [
194 | "# Getting into Pandas and the HTRC Extracted Features\n",
195 | "\n",
196 | "For the rest of the lab, follow along with [Text Mining in Python with the HTRC Feature Reader](http://programminghistorian.org/lessons/text-mining-with-extracted-features) up to and including \"Selecting Subsets of a DataFrame by a Condition\" (i.e. stop when you see 'Slicing DataFrames'). This tutorial will introduce you to two things:\n",
197 | " 1. The HTRC Extracted Features Dataset, which we discussed last week.\n",
198 | " 2. A library called Pandas, an important part of our toolkit moving forward.\n",
199 | "\n",
200 | "You'll be able to skim many of the early parts of the tutorial, since you've already learned those skills. Don't overlook \"Installing the HTRC Feature Reader\", though."
201 | ]
202 | },
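{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "As a minimal sketch of the pattern the tutorial teaches (assuming you've installed `htrc_features` and downloaded an Extracted Features file; the path below is a placeholder for wherever you saved yours):"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "collapsed": false
 },
 "outputs": [],
 "source": [
  "# A sketch, not part of the tutorial: open a single Extracted Features volume.\n",
  "# Replace the placeholder path with the file you downloaded.\n",
  "from htrc_features import FeatureReader\n",
  "\n",
  "fr = FeatureReader(['../data/your-file.json.bz2'])\n",
  "vol = next(fr.volumes())\n",
  "vol"
 ]
},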
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "*Questions*\n",
208 | "\n",
209 | "I've posted an HTRC Extracted Features file: [mdp.49015002392919.json.bz2]( https://github.com/organisciak/Text-Mining-Course/blob/master/data/mdp.49015002392919.json.bz2). Use the Feature Reader library to answer the following questions about that file:\n",
210 | "\n",
211 | "**Q2**: What is the title of the book?\n",
212 | "\n",
213 | " 1. 'The adventures of Tom Sawyer, by Mark Twain (Samuel L. Clemens)...'\n",
214 | " 2. 'Frankenstein : or, The modern Prometheus.'\n",
215 | " 3. 'June / by Edith Barnard Delano ; with illustrations.'\n",
216 | " 4. 'Anne of Green Gables / L.M. Montgomery.'\n",
217 | " 5. None of the above."
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "**Q3**: What is the URL to read this book online at the HathiTrust Digital Library?"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "**Q4**: Which of these charts is the plot of tokens/page across the entire book?\n",
232 | "\n",
233 | "1. \n",
234 | "2. \n",
235 | "3. \n",
236 | "5. None of the above."
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {
242 | "collapsed": true
243 | },
244 | "source": [
245 | "**Q5**: How do you get the word frequencies for the header throughout the book? (Not changing any of the other default arguments)"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "**Q6**: How do you get the count of each word in the body of the text for the entire book, not worrying about pages or parts of speech? Share your code. *Hint: the length of the correct output (i.e. `len(object_from_your_answer)`) is `9267`.*"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "**Q7**: In the output from Q6, which of the following parts are indexes or columns?\n",
260 | "\n",
261 | "*section*: index, column, or doesn't exist\n",
262 | "\n",
263 | "*word*: index, column, or doesn't exist\n",
264 | "\n",
265 | "*token*: index, column, or doesn't exist\n",
266 | "\n",
267 | "*count*: index, column, or doesn't exist"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 | "**Q8**: Setting the output to Q6 to a variable called `tl`, what is the line of code to sort values in descending order? To figure out the answer, you can try searching online about sorting in Pandas, or try auto-complete and documentation lookup in Jupyter to see what `tl` can do and how.\n",
275 | "\n",
276 | "If it is ordered correctly, the top words will be '`,`' (4934 occurrences), '`.`' (3866), and '`the`' (3320)."
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "**Q9**: Here is a list of words that show up 64 times in `tl`: [can, face, seemed, where]. What other words with a count of 64 are missing?\n",
284 | "\n",
285 | " - than\n",
286 | " - hand\n",
287 | " - want\n",
288 | " - heart"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "What if we wanted to work with our text in a DataFrame? Here's how you would convert the list of part-of-speech tagged tuples into a DataFrame, where I called my initial list `frank_tagged`:"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "metadata": {
302 | "collapsed": false
303 | },
304 | "outputs": [
305 | {
306 | "data": {
307 | "text/html": [
308 | "
\n",
309 | "
\n",
310 | " \n",
311 | " \n",
312 | " | \n",
313 | " word | \n",
314 | " pos | \n",
315 | "
\n",
316 | " \n",
317 | " \n",
318 | " \n",
319 | " | 0 | \n",
320 | " Letter | \n",
321 | " NN | \n",
322 | "
\n",
323 | " \n",
324 | " | 1 | \n",
325 | " 1 | \n",
326 | " CD | \n",
327 | "
\n",
328 | " \n",
329 | " | 2 | \n",
330 | " St. | \n",
331 | " NNP | \n",
332 | "
\n",
333 | " \n",
334 | " | 3 | \n",
335 | " Petersburgh | \n",
336 | " NNP | \n",
337 | "
\n",
338 | " \n",
339 | "
\n",
340 | "
"
341 | ],
342 | "text/plain": [
343 | " word pos\n",
344 | "0 Letter NN\n",
345 | "1 1 CD\n",
346 | "2 St. NNP\n",
347 | "3 Petersburgh NNP"
348 | ]
349 | },
350 | "execution_count": 96,
351 | "metadata": {},
352 | "output_type": "execute_result"
353 | }
354 | ],
355 | "source": [
356 | "import pandas as pd\n",
357 | "frank_df = pd.DataFrame(frank_tagged, columns=['word', 'pos'])\n",
358 | "frank_df.head(4)"
359 | ]
360 | },
361 | {
362 | "cell_type": "markdown",
363 | "metadata": {},
364 | "source": [
365 | "Pandas is - by convention, not rule - imported with the name `pd`. Note that gave the columns names.\n",
366 | "\n",
367 | "**Q10**: What code would you use on `frank_df` to get the singular proper nouns? It should give you 1371 rows."
368 | ]
369 | }
370 | ],
371 | "metadata": {
372 | "kernelspec": {
373 | "display_name": "Python 3",
374 | "language": "python",
375 | "name": "python3"
376 | },
377 | "language_info": {
378 | "codemirror_mode": {
379 | "name": "ipython",
380 | "version": 3
381 | },
382 | "file_extension": ".py",
383 | "mimetype": "text/x-python",
384 | "name": "python",
385 | "nbconvert_exporter": "python",
386 | "pygments_lexer": "ipython3",
387 | "version": "3.5.1"
388 | }
389 | },
390 | "nbformat": 4,
391 | "nbformat_minor": 0
392 | }
393 |
--------------------------------------------------------------------------------
/assignments.md:
--------------------------------------------------------------------------------
1 | # Assignments
2 |
3 | ## Lab Tasks
4 |
5 | _30% - Weekly_
6 |
7 | Lab tasks are meant to exercise a particular practical skill from our lectures.
8 |
9 | There are 9 lab exercises, each weighted evenly (3% of your mark). Labs are marked on a scale of 0-10. Sometimes just completing the task earns a 10/10; other times the mark is divided across the individual questions.
10 |
11 | **Due**: 1 hour before the following week's class.
12 |
13 | ## Small Assignments
14 |
15 | - _20% - Two Assignments_
16 | - _10% - Twitter Bot Assignment_
17 | - _10% - Topic Modelling Assignment_
18 |
19 | The small assignments are in lieu of lab tasks for their weeks. They differ in that you have more than 1 week to complete them, and their value is slightly higher.
20 |
21 | ### Twitter Bot Assignment
22 |
23 | This project asks you to create a simple, rule-based Twitter Bot. There is no need to actually put it online: this is optional.
24 |
25 | We'll use Cheap Bots Done Quick, powered by Tracery. To get started, follow along with [Your First Twitter Bot, in 20 minutes](https://sense.porganized.com/your-first-twitter-bot-in-20-minutes-35b2c610482d#.uz41sqy0k). Note that you can use any sources for data that you like; Corpora is merely one suggestion from the tutorial.
26 |
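If you haven't seen Tracery before, here is a rough sketch in plain Python (not the actual Tracery library) of the core idea: a grammar is a set of rules, and every `#symbol#` placeholder gets replaced by a random option for that symbol. The rules below are made-up examples, not part of the assignment; in Cheap Bots Done Quick you express the same kind of rules as JSON and the site handles the expanding and posting.

```python
import random

# Made-up example rules, in the spirit of a Tracery grammar
rules = {
    "origin": ["The #adjective# #animal# #verb#."],
    "adjective": ["sleepy", "curious", "stubborn"],
    "animal": ["heron", "walrus", "badger"],
    "verb": ["waits", "sings", "wanders off"],
}

def expand(symbol):
    """Pick one option for a symbol and expand any #symbol# placeholders inside it."""
    text = random.choice(rules[symbol])
    while "#" in text:
        start = text.index("#")
        end = text.index("#", start + 1)
        text = text[:start] + expand(text[start + 1:end]) + text[end + 1:]
    return text

for _ in range(3):
    print(expand("origin"))
```
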
27 | Bot topic: Have fun with this. Your bot can be silly, avant-garde, or activist. You may notice that there are limitations to Tracery: some things are really easy to do, but complex programming logic is not possible. How can you work within these restrictions?
28 |
29 | One way to make a good bot is to apply what you've learned so far to prepare good source material. For example, can you use concordances or bigrams from a book to learn phrases that represent a writer that you want your bot to mimic? If you want to create realistic-sounding but nonsensical sentences, look ahead to next week's Natural Language for Programmers reading by Liza Daly, which introduces Context Free Grammar, and the lab reading on identifying Parts of Speech with NLTK.
30 |
31 | __Week 5: Post your idea for a Twitter bot on the Twitter Assignment forum.__
32 |
33 | This can be brief: we want you thinking what can be done, and seeing what your colleagues are thinking about.
34 |
35 | __Week 6: Twitter bot due - describe it on the forum at least 1 hour before class.__
36 |
37 | Post a short write-up (up to 600 words) about your bot and what motivated it. What issues did you run into? Include at least 10 randomly generated messages. If you put the bot up on Twitter (optional), share the handle*.
38 |
39 | Finally, attach a text file with the full JSON for your bot.
40 |
41 | The assignment will be marked out of 10, on the quality of the idea, the depth of the execution, and the clarity and quality of the written post. 1 point is automatically assigned for submitting the draft.
42 |
43 | Engaging with colleagues in discussing their draft ideas will contribute to your participation mark.
44 |
45 | *Also, make sure to identify it as a bot in the bio, a convention that many botmakers follow for ethical reasons.
46 |
47 | ### Topic Modeling Assignment
48 | For the topic modeling assignment, you'll use MALLET to perform text analysis on a collection of your choice.
49 |
50 | MALLET is a Java-based toolkit for machine learning, including a module for Topic Modeling with Latent Dirichlet Allocation. To install and learn to use MALLET, follow along with [Getting Started with Topic Modeling and MALLET](http://programminghistorian.org/lessons/topic-modeling-and-mallet). As with the Twitter Bot Assignment, there is some self-directed learning with this assignment.
51 |
52 | 1. Find a good research question
53 |
54 | Decide on an interesting set of texts to learn from. You can try something new, but you're also welcome to build on an idea from earlier in the course (e.g. from the Voyant lab) or do a preliminary version of something from your final project.
55 |
56 | What do you hope to explore? A better grade is awarded for an idea that is appropriate for unsupervised learning, and for which learning topics can be insightful.
57 |
58 | Are you stuck? Think about the genres of texts that we've seen in this class and in your colleagues' assignments: books, emails, tweets, lyrics, scripts, letters. Many of these can lead to an interesting idea. Think also about the examples we discussed in class.
59 |
60 | Topic modeling is well suited for cases with many short texts. Since it learns from co-occurrence, you want your training texts to conceivably be about the same thing at the start as at the end. For example, modeling pages will give you better topics than modeling books.
61 |
62 | Tip: When deciding on a research question, think ahead to the data collection step. Choose something that won't make this short assignment into a long one; remember that you'll have the final project if you want to return to topic modeling with something more ambitious.
63 |
64 | 2. Find and prepare your source texts
65 |
66 | Getting data is hard! You'll need to be resourceful.
67 |
68 | MALLET needs input files that are structured either as a set of text files or one long text file, with each document on a different line. How do you collect your data? Can you find it already prepared? Do you have to do any cleaning?
69 |
70 | Note that topic modeling is a bag-of-words approach, so if you want to use books from the HTRC Extracted Features dataset, you can write out tokens to files in a random order.
71 |
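For instance, here is a rough sketch (reusing the Feature Reader library from the labs; the file path is just a placeholder) of writing one volume's tokens out as a shuffled plain-text file that MALLET can import:

```python
import random
from htrc_features import FeatureReader

# Placeholder path: point this at whichever Extracted Features files you are using
fr = FeatureReader(['../data/your-file.json.bz2'])

for vol in fr.volumes():
    # One row per lowercased token, with its count for the whole book
    tl = vol.tokenlist(pages=False, pos=False, case=False).reset_index()
    tokens = []
    for word, count in zip(tl['lowercase'], tl['count']):
        tokens += [word] * int(count)
    random.shuffle(tokens)  # order doesn't matter for a bag-of-words model
    with open(vol.id + '.txt', 'w', encoding='utf-8') as f:
        f.write(' '.join(tokens))
```
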
72 | 3. Build a topic model and write about it
73 |
74 | Build a model and post about it on the Topic Modeling submission forum. Discuss both what you see in the output and what your process for building the models was.
75 |
76 | Some possible questions to answer:
77 |
78 | - What was your goal? Did topic modeling help?
79 | - What is interesting about your topics? Do they match what you expect? If not, what looks peculiar?
80 | - Which topics stand out? Which topics seem to be junk?
81 | - How many topics did you choose? Why? Did you try alternate parameterizations?
82 | - How did you collect your data? Were there headaches or necessary workarounds?
83 | - What new research questions does this assignment inspire? If you had time for a bigger project, what would it be?
84 |
85 | _Due: Week 12._
86 |
87 | Grading - out of 10
88 |
89 | - /2 - The Research question
90 | - /2 - Data collection
91 | - /6 - Forum post: quality of analysis and discussion of process
92 |
93 | ## Text Mining project
94 |
95 | The final project is a culmination of your text mining expertise. You'll be putting your text mining skills to work. Up to now, the assignments have been method-based, where we tell you what to use and you find a problem for it. This time, you choose a problem to explore, formulate it as a research question, determine the methods to address it, and use it as part of a larger narrative.
96 |
97 | The project has 3 components:
98 |
99 | - 5% Problem statement: Due Week 12.
100 | - 5% Literature review + 5% Data collection: Due Week 13.
101 | - 20% Final report: Due one week after final class.
102 |
103 | We'll still be learning methods that you may want to use throughout April, though the most prominent ones have already been covered: classification, clustering, topic modeling, stoplisting, concordances, part of speech tagging, document similarity etc.
104 |
105 | A few more that we'll learn: feature selection - identifying the most discriminatory words in a collection; word embeddings - understanding the contexts of words and the relationships between them, removed from the document context; more document similarity methods; visualization, for better understanding what is going on in an analysis and for communicating it to others; sentiment analysis, for mining the opinions of texts.
106 |
107 | Tell a compelling story. Remember that the complexity of the tool is not as important as its appropriateness. For example, sometimes the top word frequencies alone make your point.
108 |
109 | Here are some example research questions, alongside possible ways to approach them.
110 |
111 | - RQ: Is there a specific language that belies conservative or liberal partisan media? Possible approaches: sentiment mining; term-weighted top frequencies.
112 | - RQ: What characterizes my style in writing email? Possible approaches: topic modeling sent messages; building a classifier for time of day and seeing which words are notable for each class; visualizing sent email lengths.
113 | - RQ: How do Alec Baldwin and Stephen Baldwin use Twitter differently? Possible approaches: Term frequencies and concordances; classification (notable features via decision trees?); topic modeling; dimensionality reduction to 2 dimensions and observing outliers via scatter plot.
114 | - RQ: Are there underlying trends motivating baby naming? Possible methods: classification or logistic regression using character patterns as features (e.g. last letter of the name, second letter, etc.).
115 | - RQ: What do characters on Game of Thrones talk about? Possible methods: TF-IDF over scripts, compared to a [general language reference for IDF](https://www.ideals.illinois.edu/handle/2142/89691).
116 | - RQ: How do people talk about food? Possible methods: identify types of food in a [dataset of food reviews](https://www.kaggle.com/snap/amazon-fine-food-reviews), and look at terms that co-occur with them; topic model reviews and see which topics are most prominent for different words.

117 | Good luck. We've been very impressed by the quality of your project ideas this term, and look forward to seeing what you come up with.
118 |
119 | ### Details
120 |
121 | #### Problem Statement (Week 12):
122 |
123 | Develop your idea and share a description of what you hope to do, what methods you hope to use, and early ideas for getting the data. Post it on the Problem Statement forum. Max 400 words.
124 |
125 | #### Literature Review and Data Collection (Week 13):
126 |
127 | Find examples of other people pursuing similar questions or using similar methods, and tell us about them. This will require some self-directed reading and searching. This isn't a lofty academic literature review, so you do not need to worry about how formal the literature is: it can include forum posts, blogs, and news articles. (Tip: finding information online can be tricky - don't be afraid to share cool sources with your classmates or ask for advice on the forums.)
128 |
129 | For the data collection section, we want to see that you've started trying to compile your data. If it's been easy, tell us more about the data. If it's been hard, tell us about the problems that you're running into, and whether you've had to adapt from your original problem.
130 |
131 | Post these parts on the Final Project Lit + Data forum.
132 |
133 | #### Final Report (One week after final class)
134 |
135 | For the final report, write about your findings. Structure a narrative about what you hoped to do, how you pursued it, and what you found.
136 |
137 | Think of the report as a piece for your portfolio: it should show your text mining skills to future employers while demonstrating your ability to communicate the results. Tell us how you addressed your problem. Hook the reader by summarizing the most intriguing points at the start, then give us the details: what analyses you ran, what subquestions you asked, and what you found. When appropriate, use tables or visualizations.
138 |
139 | Below is our suggested structure. This is not a research paper, so you do not need to use these exact headings or this exact structure; it's just a set of guidelines. Foremost, structure your report so it is easy to read for a non-expert.
140 |
141 |
142 |
143 | 1. Introduction
144 |
145 | Provide a high-level explanation of what you did and the main interesting points. This is the section that convinces us to read further.
146 |
147 |
148 | For the next three parts, reuse your text from the Problem Statement, Literature Review, Data Collection, editing it as you might see fit.
149 |
150 | 2. Problem
151 |
152 | 3. Related Work
153 |
154 | 4. Data
155 |
156 | 5. Findings
157 |
158 | Tell your data's story!
159 |
160 | 6. Conclusions and Next Steps
161 |
162 | What was most salient or intriguing? What interesting new questions came out of your project? What else can be done?
163 |
164 | __How to submit Final Report__
165 |
166 | Post the final report on the submission forums, in one of the following ways:
167 |
168 | - Written directly in Moodle, as a post
169 | - Linked to a blog post (e.g. on Medium)
170 | - Linking to a Jupyter notebook (e.g. on Github)
171 | - Attaching a Jupyter Notebook (make sure you check that any images are included)
172 |
173 | __Inspiration for writing__
174 |
175 | Here's a list of good [writing about text or data mining](https://github.com/organisciak/Text-Mining-Course/blob/master/writing.md) for inspiration. Share your own examples on the forums.
176 |
177 | ## Participation
178 |
179 | - 15% of mark
180 | - 5% Attendance
181 | - 10% Forum posts, comments, class engagement
182 |
183 | ## Late Policy
184 |
185 | - Lose 10% per day, up to 50%. Late is better than never.
186 | - 2 late 'freebies': We won't count late marks for two labs, because sometimes life gets in the way.
187 |
188 | - Last day for late assignments:
189 | - Labs: Turn in by May 3rd
190 | - Anything else: May 8th
191 |
--------------------------------------------------------------------------------
/examples/French-English Classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from htrc_features import FeatureReader\n",
12 | "import glob\n",
13 | "from nltk import word_tokenize\n",
14 | "import pandas as pd"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "name": "stdout",
26 | "output_type": "stream",
27 | "text": [
28 | "hvd.32044014292023 \t eng \t http://hdl.handle.net/2027/hvd.32044014292023 \t Alice's adventures in Wonderland ; and, \n",
29 | "hvd.32044102860673 \t fre \t http://hdl.handle.net/2027/hvd.32044102860673 \t Notre Dame de Paris. Abridged and edited\n",
30 | "mdp.39015038910694 \t eng \t http://hdl.handle.net/2027/mdp.39015038910694 \t Moby Dick,\n",
31 | "pst.000029579440 \t eng \t http://hdl.handle.net/2027/pst.000029579440 \t The adventures of Huckleberry Finn / by \n",
32 | "uiug.30112037882914 \t fre \t http://hdl.handle.net/2027/uiug.30112037882914 \t Candide ou L'optimisme.\n",
33 | "wu.89104415476 \t fre \t http://hdl.handle.net/2027/wu.89104415476 \t Les liaisons dangereuses / Choderlos de \n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "paths = glob.glob('../data/classification/*bz2')\n",
39 | "fr = FeatureReader(paths)\n",
40 | "\n",
41 | "for vol in fr.volumes():\n",
42 | " print(vol.id, '\\t', vol.language, '\\t', vol.handle_url, '\\t', vol.title[:40])"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "Collect the token counts for french and english separately."
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {
56 | "collapsed": false
57 | },
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/html": [
62 | "\n",
63 | "
\n",
64 | " \n",
65 | " \n",
66 | " | \n",
67 | " token | \n",
68 | " count | \n",
69 | " language | \n",
70 | "
\n",
71 | " \n",
72 | " \n",
73 | " \n",
74 | " | 0 | \n",
75 | " ! | \n",
76 | " 573 | \n",
77 | " fre | \n",
78 | "
\n",
79 | " \n",
80 | " | 1 | \n",
81 | " !..i | \n",
82 | " 1 | \n",
83 | " fre | \n",
84 | "
\n",
85 | " \n",
86 | " | 2 | \n",
87 | " !je | \n",
88 | " 1 | \n",
89 | " fre | \n",
90 | "
\n",
91 | " \n",
92 | " | 3 | \n",
93 | " \" | \n",
94 | " 12 | \n",
95 | " fre | \n",
96 | "
\n",
97 | " \n",
98 | " | 4 | \n",
99 | " \"de | \n",
100 | " 1 | \n",
101 | " fre | \n",
102 | "
\n",
103 | " \n",
104 | "
\n",
105 | "
"
106 | ],
107 | "text/plain": [
108 | " token count language\n",
109 | "0 ! 573 fre\n",
110 | "1 !..i 1 fre\n",
111 | "2 !je 1 fre\n",
112 | "3 \" 12 fre\n",
113 | "4 \"de 1 fre"
114 | ]
115 | },
116 | "execution_count": 24,
117 | "metadata": {},
118 | "output_type": "execute_result"
119 | }
120 | ],
121 | "source": [
122 | "tl = vol.tokenlist(pages=False, pos=False).head().reset_index()\n",
123 | "tl['language'] = vol.language\n",
124 | "tl[['token', 'count', 'language']].head()"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {
131 | "collapsed": false
132 | },
133 | "outputs": [],
134 | "source": [
135 | "book_dfs = []\n",
136 | "classes_count = {'eng': 0, 'fre': 0}\n",
137 | "\n",
138 | "for vol in fr.volumes():\n",
139 | " tl = vol.tokenlist(pages=False, pos=False, case=False).reset_index()\n",
140 | " classes_count[vol.language] += 1\n",
141 | " tl['language'] = vol.language\n",
142 | " book_dfs.append(tl[['lowercase', 'count', 'language']])"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {
149 | "collapsed": false
150 | },
151 | "outputs": [],
152 | "source": [
153 | "corpus = (pd.concat(book_dfs)\n",
154 | " .groupby(by=['language', 'lowercase']).sum()\n",
155 | " )"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {
162 | "collapsed": false
163 | },
164 | "outputs": [
165 | {
166 | "data": {
167 | "text/plain": [
168 | "eng 0.5\n",
169 | "fre 0.5\n",
170 | "dtype: float64"
171 | ]
172 | },
173 | "execution_count": 4,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "# P(c)\n",
180 | "p_c = pd.Series(classes_count) / len(paths)\n",
181 | "p_c"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "Next, we want to sum up the counts for the entire class, so each language x word only has one, total sum:"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {
195 | "collapsed": false,
196 | "scrolled": true
197 | },
198 | "outputs": [
199 | {
200 | "data": {
201 | "text/html": [
202 | "\n",
203 | "
\n",
204 | " \n",
205 | " \n",
206 | " | \n",
207 | " | \n",
208 | " count | \n",
209 | " P(w|c) | \n",
210 | "
\n",
211 | " \n",
212 | " | language | \n",
213 | " lowercase | \n",
214 | " | \n",
215 | " | \n",
216 | "
\n",
217 | " \n",
218 | " \n",
219 | " \n",
220 | " | eng | \n",
221 | " ! | \n",
222 | " 2230 | \n",
223 | " 0.006570 | \n",
224 | "
\n",
225 | " \n",
226 | " | !' | \n",
227 | " 1 | \n",
228 | " 0.000003 | \n",
229 | "
\n",
230 | " \n",
231 | " | !1 | \n",
232 | " 1 | \n",
233 | " 0.000003 | \n",
234 | "
\n",
235 | " \n",
236 | " | !33 | \n",
237 | " 1 | \n",
238 | " 0.000003 | \n",
239 | "
\n",
240 | " \n",
241 | " | !«lm | \n",
242 | " 1 | \n",
243 | " 0.000003 | \n",
244 | "
\n",
245 | " \n",
246 | "
\n",
247 | "
"
248 | ],
249 | "text/plain": [
250 | " count P(w|c)\n",
251 | "language lowercase \n",
252 | "eng ! 2230 0.006570\n",
253 | " !' 1 0.000003\n",
254 | " !1 1 0.000003\n",
255 | " !33 1 0.000003\n",
256 | " !«lm 1 0.000003"
257 | ]
258 | },
259 | "execution_count": 32,
260 | "metadata": {},
261 | "output_type": "execute_result"
262 | }
263 | ],
264 | "source": [
265 | "corpus['P(w|c)'] = corpus.groupby(level='language').transform(lambda word: word / word.sum())['count']\n",
266 | "corpus.head()"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {
273 | "collapsed": false
274 | },
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/html": [
279 | "\n",
280 | "
\n",
281 | " \n",
282 | " \n",
283 | " | \n",
284 | " count | \n",
285 | " P(w|c) | \n",
286 | "
\n",
287 | " \n",
288 | " | lowercase | \n",
289 | " | \n",
290 | " | \n",
291 | "
\n",
292 | " \n",
293 | " \n",
294 | " \n",
295 | " | , | \n",
296 | " 22737 | \n",
297 | " 0.066985 | \n",
298 | "
\n",
299 | " \n",
300 | " | the | \n",
301 | " 15962 | \n",
302 | " 0.047025 | \n",
303 | "
\n",
304 | " \n",
305 | " | and | \n",
306 | " 11356 | \n",
307 | " 0.033456 | \n",
308 | "
\n",
309 | " \n",
310 | " | . | \n",
311 | " 10732 | \n",
312 | " 0.031617 | \n",
313 | "
\n",
314 | " \n",
315 | " | \" | \n",
316 | " 7446 | \n",
317 | " 0.021936 | \n",
318 | "
\n",
319 | " \n",
320 | " | to | \n",
321 | " 6611 | \n",
322 | " 0.019476 | \n",
323 | "
\n",
324 | " \n",
325 | " | a | \n",
326 | " 6529 | \n",
327 | " 0.019235 | \n",
328 | "
\n",
329 | " \n",
330 | " | of | \n",
331 | " 5857 | \n",
332 | " 0.017255 | \n",
333 | "
\n",
334 | " \n",
335 | " | i | \n",
336 | " 5581 | \n",
337 | " 0.016442 | \n",
338 | "
\n",
339 | " \n",
340 | " | it | \n",
341 | " 5077 | \n",
342 | " 0.014957 | \n",
343 | "
\n",
344 | " \n",
345 | " | ; | \n",
346 | " 4559 | \n",
347 | " 0.013431 | \n",
348 | "
\n",
349 | " \n",
350 | " | in | \n",
351 | " 4312 | \n",
352 | " 0.012703 | \n",
353 | "
\n",
354 | " \n",
355 | " | was | \n",
356 | " 3596 | \n",
357 | " 0.010594 | \n",
358 | "
\n",
359 | " \n",
360 | " | that | \n",
361 | " 3392 | \n",
362 | " 0.009993 | \n",
363 | "
\n",
364 | " \n",
365 | " | he | \n",
366 | " 3048 | \n",
367 | " 0.008980 | \n",
368 | "
\n",
369 | " \n",
370 | " | you | \n",
371 | " 2959 | \n",
372 | " 0.008717 | \n",
373 | "
\n",
374 | " \n",
375 | " | 's | \n",
376 | " 2609 | \n",
377 | " 0.007686 | \n",
378 | "
\n",
379 | " \n",
380 | " | n't | \n",
381 | " 2608 | \n",
382 | " 0.007683 | \n",
383 | "
\n",
384 | " \n",
385 | " | but | \n",
386 | " 2294 | \n",
387 | " 0.006758 | \n",
388 | "
\n",
389 | " \n",
390 | " | ! | \n",
391 | " 2230 | \n",
392 | " 0.006570 | \n",
393 | "
\n",
394 | " \n",
395 | "
\n",
396 | "
"
397 | ],
398 | "text/plain": [
399 | " count P(w|c)\n",
400 | "lowercase \n",
401 | ", 22737 0.066985\n",
402 | "the 15962 0.047025\n",
403 | "and 11356 0.033456\n",
404 | ". 10732 0.031617\n",
405 | "\" 7446 0.021936\n",
406 | "to 6611 0.019476\n",
407 | "a 6529 0.019235\n",
408 | "of 5857 0.017255\n",
409 | "i 5581 0.016442\n",
410 | "it 5077 0.014957\n",
411 | "; 4559 0.013431\n",
412 | "in 4312 0.012703\n",
413 | "was 3596 0.010594\n",
414 | "that 3392 0.009993\n",
415 | "he 3048 0.008980\n",
416 | "you 2959 0.008717\n",
417 | "'s 2609 0.007686\n",
418 | "n't 2608 0.007683\n",
419 | "but 2294 0.006758\n",
420 | "! 2230 0.006570"
421 | ]
422 | },
423 | "execution_count": 15,
424 | "metadata": {},
425 | "output_type": "execute_result"
426 | }
427 | ],
428 | "source": [
429 | "corpus.loc[('eng')].sort_values('count', ascending=False).head(20)"
430 | ]
431 | },
432 | {
433 | "cell_type": "markdown",
434 | "metadata": {},
435 | "source": [
436 | "For estimating P(w|c), divide each per-class count by the total words in that class."
437 | ]
438 | },
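{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "As a quick sanity check (just a sketch, not a required step), the P(w|c) values within each class should sum to 1:"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "collapsed": false
 },
 "outputs": [],
 "source": [
  "# Each class's conditional probabilities should add up to 1\n",
  "corpus.groupby(level='language')['P(w|c)'].sum()"
 ]
},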
439 | {
440 | "cell_type": "code",
441 | "execution_count": null,
442 | "metadata": {
443 | "collapsed": false
444 | },
445 | "outputs": [
446 | {
447 | "data": {
448 | "text/plain": [
449 | "['bonjour']"
450 | ]
451 | },
452 | "execution_count": 35,
453 | "metadata": {},
454 | "output_type": "execute_result"
455 | }
456 | ],
457 | "source": [
458 | "string_to_classify = \"bonjour\"\n",
459 | "relevant_tokens = word_tokenize(string_to_classify.lower())\n",
460 | "relevant_tokens"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": null,
466 | "metadata": {
467 | "collapsed": false
468 | },
469 | "outputs": [
470 | {
471 | "data": {
472 | "text/plain": [
473 | "language\n",
474 | "eng 7.588625e-19\n",
475 | "fre 1.814643e-22\n",
476 | "Name: P(w|c), dtype: float64"
477 | ]
478 | },
479 | "execution_count": 8,
480 | "metadata": {},
481 | "output_type": "execute_result"
482 | }
483 | ],
484 | "source": [
485 | "classified = (corpus.loc[(slice(None), relevant_tokens),]\n",
486 | " .groupby(level='language')['P(w|c)'].prod()\n",
487 | " )\n",
488 | "classified"
489 | ]
490 | },
491 | {
492 | "cell_type": "markdown",
493 | "metadata": {},
494 | "source": [
495 | "Now, though it doesn't matter when the same classes were seen equally, remember to multiple by P(c):"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": null,
501 | "metadata": {
502 | "collapsed": false
503 | },
504 | "outputs": [
505 | {
506 | "data": {
507 | "text/plain": [
508 | "language\n",
509 | "eng 3.794313e-19\n",
510 | "fre 9.073217e-23\n",
511 | "dtype: float64"
512 | ]
513 | },
514 | "execution_count": 9,
515 | "metadata": {},
516 | "output_type": "execute_result"
517 | }
518 | ],
519 | "source": [
520 | "classified * p_c"
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "metadata": {},
526 | "source": [
527 | "Sort that, to make it more clear:"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": null,
533 | "metadata": {
534 | "collapsed": false
535 | },
536 | "outputs": [
537 | {
538 | "data": {
539 | "text/plain": [
540 | "language\n",
541 | "eng 3.794313e-19\n",
542 | "fre 9.073217e-23\n",
543 | "dtype: float64"
544 | ]
545 | },
546 | "execution_count": 10,
547 | "metadata": {},
548 | "output_type": "execute_result"
549 | }
550 | ],
551 | "source": [
552 | "(classified * p_c).sort_values(ascending=False)"
553 | ]
554 | },
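{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Wrapping those steps into a small helper makes it easy to try other strings. This is just a sketch that reuses `corpus` and `p_c` from above; tokens the model has never seen are simply dropped, which a real classifier would instead handle with smoothing."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "collapsed": false
 },
 "outputs": [],
 "source": [
  "def classify(text):\n",
  "    # Same steps as above: P(c) times the product of P(w|c) for known tokens\n",
  "    tokens = word_tokenize(text.lower())\n",
  "    known = [t for t in tokens if t in corpus.index.get_level_values('lowercase')]\n",
  "    scores = (corpus.loc[(slice(None), known), 'P(w|c)']\n",
  "              .groupby(level='language').prod() * p_c)\n",
  "    return scores.sort_values(ascending=False)\n",
  "\n",
  "classify(\"where is the library\")"
 ]
}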
555 | ],
556 | "metadata": {
557 | "kernelspec": {
558 | "display_name": "Python 3",
559 | "language": "python",
560 | "name": "python3"
561 | },
562 | "language_info": {
563 | "codemirror_mode": {
564 | "name": "ipython",
565 | "version": 3
566 | },
567 | "file_extension": ".py",
568 | "mimetype": "text/x-python",
569 | "name": "python",
570 | "nbconvert_exporter": "python",
571 | "pygments_lexer": "ipython3",
572 | "version": "3.5.1"
573 | }
574 | },
575 | "nbformat": 4,
576 | "nbformat_minor": 0
577 | }
578 |
--------------------------------------------------------------------------------
/syllabus.md:
--------------------------------------------------------------------------------
1 | ## Week 1: Introduction
2 |
3 | ### Readings
4 |
5 | - Marti Hearst. 1999. [Untangling Text Data
6 | Mining](http://people.ischool.berkeley.edu/~hearst/papers/acl99/acl99-tdm.html).
7 |
8 | ### Slides
9 |
10 | - [Introduction](https://docs.google.com/presentation/d/1cBc9yX2wRSmQoOiG2viUHkZAsKI_dAB2VO3powLBt74/edit?usp=sharing)
11 |
12 | ### For Next Week
13 |
14 | #### Homework
15 |
16 | Post a little bit about yourself in the Introductions forum, following the instructions there.
17 |
18 | #### Lab Task
19 |
20 | This week's lab task is mostly to play! It is intended to get
21 | you comfortable with out-of-the-box text analysis tools.
22 |
23 | Use [Voyant](https://voyant-tools.org)
24 | to visualize a text or set of texts. It can be anything you want: a
25 | book, a set of lyrics, scripts from a show you like, news articles.
26 | Try out the various features in Voyant: phrases, keywords in
27 | contexts, etc.
28 |
29 | Once you've had a chance to play with Voyant, post a short response
30 | to the __lab task forum__
31 | (no more than 300 words) about your experience. Some possible things
32 | to post about: What was interesting or confusing about the tool? Did
33 | you find anything intriguing about your text or texts? Did it find
34 | any recurring patterns or phrases? Did you find any visualisations
35 | beyond the word cloud to be interesting? Any other thoughts? Don't
36 | forget to tell us what text you used with Voyant.
37 |
38 | ## Week 2: Fundamentals
39 |
40 | *Just a reminder that 'readings' refer to the readings you should have done by the lecture, while lab tasks are done by next week. The intention is that they are both related to the current week's theme: readings prepare you for the lecture, and the lecture lets you practice that learning.*
41 |
42 | ### Readings
43 |
44 | - Sections 4.1, 4.3, and 4.4 of [Search
45 | Engines: Information Retrieval in
46 | Practice](http://ciir.cs.umass.edu/irbook/) (Croft,
47 | Metzler and Strohman). Starts on page 72.
48 | - Parts of Chapter 2, Introduction to Information Retrieval
49 | (Manning, Raghavan,
50 | Schütze): [Intro](http://nlp.stanford.edu/IR-book/html/htmledition/the-term-vocabulary-and-postings-lists-1.html), [Tokenization](http://nlp.stanford.edu/IR-book/html/htmledition/determining-the-vocabulary-of-terms-1.html), [Stop
51 | lists](http://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html)
52 |
53 | ### Slides
54 |
55 | - [Week 2:
56 | Fundamentals](https://docs.google.com/presentation/d/18R7pWmc49PemCgAJ4020lgibNO1Hp1KdST08kd_a-d4/edit?usp=sharing)
57 |
58 | ### For Next Week
59 |
60 | This week's lab task is about getting started with powerful tools that
61 | will underlie many of the skills you learn in the course. The lab task is posted in a [Jupyter
62 | notebook](labs/Lab%202.ipynb)
63 | format on Github.
64 |
65 | ## Week 3: _Treating Text as Data_ - Features
66 |
67 | ### Readings
68 |
69 | - [2.2.3](http://nlp.stanford.edu/IR-book/pdf/02voc.pdf) of Intro to IR: Normalization. If you missed 2.2.1 and 2.2.2 last week, catch up on those also.
70 | - [Term Weighting for
71 | Humanists](https://sense.porganized.com/term-weighting-for-humanists-bf2ed42628c8).
72 | Peter Organisciak.
73 |
74 | Supplemental:
75 | - [Term frequency and
76 | weighting](http://nlp.stanford.edu/IR-book/html/htmledition/term-frequency-and-weighting-1.html).
77 | Intro to IR.
78 |
79 | ### Slides
80 |
81 | - [Week 3:
82 | Features](https://docs.google.com/presentation/d/16jZxqi7zpZrOUA2z14aSpg8BPGJxSO9Qkb6rx4bZLLw/edit?usp=sharing)
83 |
84 | ### For Next Week
85 |
86 | #### Lab Task
87 |
88 | This week's lab task is again a series of questions, following along
89 | with a worksheet. Find it
90 | [here](labs/Lab%203.ipynb).
91 |
92 | ## Week 4: Text Mining for Art and Criticism
93 |
94 | ### Readings
95 |
96 | - [Liza Daly's Generative Blackout
97 | Poetry](http://waxy.org/2016/11/liza-dalys-generative-blackout-poetry/) -
98 | This work uses some simple language rules that will be useful in
99 | the future.
100 |
101 | The following three readings are web articles related to Twitter bots: for activism, for recontextualization, and a roundup of interesting bots. Not all of these are text related, but serve as a good overview.
102 |
103 | - [How Twitter Bots Turn Tweeters into
104 | Activists](https://www.technologyreview.com/s/544851/how-twitter-bots-turn-tweeters-into-activists/)
105 | - [Introducing censusAmericans, A Twitter Bot For
106 | America](https://fivethirtyeight.com/datalab/introducing-censusamericans-a-twitter-bot-for-america/)
107 | - [12
108 | Weird, Excellent Twitter Bots Chosen by Twitter’s Best
109 | Bot-Makers](http://nymag.com/selectall/2015/11/12-weirdest-funniest-smartest-twitter-bots.html)
110 | - Optional: [The Rise of Twitter
111 | Bots](http://www.newyorker.com/tech/elements/the-rise-of-twitter-bots)
112 |
113 | ### Slides
114 |
115 | - [3.5 -
116 | Features Cont.](https://docs.google.com/presentation/d/1dljGL0QmjY-QJ9O-wpXeVgqk8lO12Klrm6EfgaEQsDg/edit?usp=sharing)
117 | - [4.0 - Text Mining for Art and
118 | Criticism](https://docs.google.com/presentation/d/1FZmIQdS5cEuJEG7pudzHCI5iWxrWZb5yus4eT24tW_Y/edit?usp=sharing)
119 |
120 | ### Assignments
121 |
122 |
123 | The Twitter Bot assignment is posted on the
124 | [Assignments](assignments.md) page.
125 | There is a draft posting next week (post about your plans) and the
126 | final is due in two weeks.
127 |
128 | ### For Next Week
129 |
130 | - Submit Twitter bot draft
131 | - [Lab 4
132 | Worksheet](labs/Lab%204.ipynb).
133 |
134 | ## Week 5.1: Document Access
135 |
136 | ### Readings
137 |
138 | [Against
139 | Cleaning](http://curatingmenus.org/articles/against-cleaning/) -
140 | Katie Rawson, Trevor Muñoz
141 |
142 | ## Week 5.2: _Understanding Words_ - Natural Language Processing 1, Part of Speech Tagging
143 |
144 | ### Readings
145 |
146 | - [Natural Language Processing for
147 | programmers](https://worldwritable.com/natural-language-processing-for-programmers-90c4e04dc6de#.dhfapdhxv) part
148 | 2 - Liza Daly
149 | - This talks about an old concept, but is written from a
150 | beginner perspective and is useful for your assignment.
151 | - [Part of Speech
152 | Tagging ](https://web.stanford.edu/~jurafsky/slp3/10.pdf)- Chapter
153 | 10 (up to 10.4) of Speech and Language Processing (3rd ed.
154 | draft)
155 | - [Chapter 5.7 of the NLTK
156 | Book](http://www.nltk.org/book/ch05.html) - Bird et. al
157 | - Just section 7, but sections 1-2, 4-6 are useful as
158 | supplements to the SLP reading if you need more info or
159 | simply find it interesting. Section 7 is the conclusion of
160 | the chapter, which succinctly describes the ways that we
161 | understand a part of speech.
162 |
163 | ### Slides
164 |
165 | [05 - Getting Data](https://docs.google.com/presentation/d/1N7qvqvTTxldbTiZ2tqx8OQBUq4dtD3PoUAEdwjD6FGc/edit?usp=sharing)
166 |
167 | ### For Next Week
168 |
169 | Twitter bot: Post to the Twitter Bot Final forum.
170 |
171 | No lab task. Complete your bot!
172 |
173 | ## Week 6: _Understanding Words_ - Natural Language Processing 2, Information Extraction and Dependency Parsing
174 |
175 | ### Readings
176 |
177 | - *Information
178 | Extraction*. Section 4.6 of [Search Engines: Information
179 | Retrieval in Practice](http://ciir.cs.umass.edu/irbook/) (Croft,
180 | Metzler and Strohman). Starts on page 113.
181 | - [Information
182 | Extraction](https://web.stanford.edu/~jurafsky/slp3/21.pdf)
183 | (up to and including section 21.2.3). Speech and Language
184 | Processing (3rd ed. draft).
185 |
186 | **Optional
187 | Reading**
188 | - [SyntaxNet Detailed
189 | Tutorial](https://github.com/tensorflow/models/tree/master/syntaxnet#detailed-tutorial-building-an-nlp-pipeline-with-syntaxnet)
191 |
192 | Google's approach for dependency parsing, SyntaxNet, and
193 | their model trained on it - Parsey McParseFace - are the
194 | current state of the art. This tutorial, while optional,
195 | offers a look at Part of Speech tagging using feed-forward
196 | neural networks and has a nice description of
197 | transition-based dependency parsing.
198 |
199 | ### Slides
200 |
201 | [06 - Natural Language Processing 1 - Part of Speech
202 | Tagging](https://docs.google.com/presentation/d/17psGonrrwj0R2DT-Nu34D5kpTP-jBEmthbIQKSeZG2Q/edit?usp=sharing)
203 |
204 | ### For Next Week
205 |
206 | - [Worksheet for the Lab Task
207 | 05](labs/Lab%2005%20-%20Part%20of%20Speech%20Tagging%2C%20Starting%20with%20Pandas.ipynb).
208 |
209 | ## Week 7: Classification 1
210 |
211 | ### Readings
212 |
213 | [Naive Bayes Classification and
214 | Sentiment](https://web.stanford.edu/~jurafsky/slp3/6.pdf), Speech
215 | and Language Processing (3rd edition). Dan Jurafsky and James H.
216 | Martin.
217 |
218 | **Notation**
219 |
220 | We're getting to the point of the term where some mathematical notation
221 | is necessary for our readings to communicate the underlying theory.
222 |
223 | If you are unfamiliar with Bayesian inference, the description on
224 | the 3rd page of this chapter might not satisfy your curiosity.
225 | The [introduction to Bayes' Theorem from Khan
226 | Academy](https://www.khanacademy.org/partner-content/wi-phi/wiphi-critical-thinking/wiphi-fundamentals/v/bayes-theorem)
227 | can help equip you with some more background about what we use
228 | Bayes' Theorem for.
229 |
230 | Since we're looking at classes, you'll start seeing set theory,
231 | like c ∈ C. This means 'c' is an element of 'C', or in the context of
232 | our reading, this *class (c) is part of the set of all possible
233 | classes (C)*. Why is that something we'd want to state? Because
234 | for Naive Bayes classification, we'll be choosing the class *c* with
235 | the highest probability given the evidence. The equations simply
236 | need a way to state "consider P(c|d) for all possible classes and
237 | choose the class with the highest value", which they do
238 | with the argmax notation: ĉ = argmax over c ∈ C of P(c|d).
239 |
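If it helps, here is a tiny sketch in plain Python (with made-up numbers, not tied to any reading) of what "choose the class with the highest value" means in code:

```python
# Made-up P(c|d) values for three sentiment classes
p_c_given_d = {'positive': 0.2, 'negative': 0.7, 'neutral': 0.1}

# argmax: keep the class whose probability is highest
best_class = max(p_c_given_d, key=p_c_given_d.get)
print(best_class)  # 'negative'
```
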
240 | ### Slides
241 |
242 | - [07 -
243 | Classification](https://docs.google.com/presentation/d/1u_VZgEK45u4zbbfxZKo_G-uztNvR0As_gfc5X2c2nU0/edit?usp=sharing)
244 | - Includes material from: SLP v.3 slides (Jurafsky and Martin )
245 |
246 | ### For Next Week
247 |
248 | - [Lab Task 06
249 | Worksheet](labs/Lab%2006%20-%20More%20Pandas%20and%20Intro%20to%20Classification.ipynb)
250 |
251 | ## Week 8.1: Classification 2
252 |
253 | ## Week 8.2: Ethics in Text Mining
254 |
255 | ### Readings
256 |
257 | No required readings this week, focus on the lab task!
258 |
259 | **Optional Reading**
260 |
261 | - Brent Daniel Mittelstadt, Patrick Allo, Mariarosaria Taddeo,
262 | Sandra Wachter, Luciano Floridi. 2016. "[The ethics of
263 | algorithms: Mapping the
264 | debate](http://journals.sagepub.com/doi/abs/10.1177/2053951716679679)". *Big
265 | Data & Society. *Vol 3, Issue 2.
266 | - Recent BBC2 Story (audio): [Controlling the Unaccountable
267 | Algorithm](http://www.bbc.co.uk/programmes/b085wj18)
268 |
269 | As with our class on art and criticism, some of the most accessible work on ethics is from the bot-making community.
270 |
271 | - [Bots Should Punch Up](https://www.crummy.com/2013/11/27/0)
272 | - [Ethical Bot Making](http://mewo2.com/notes/bot-ethics/)
273 | - [How to Make a Bot that Isn't
274 | Racist](https://motherboard.vice.com/en_us/article/how-to-make-a-not-racist-bot)
275 |
276 | ### Slides
277 |
278 | - [Week 08 - Classification
279 | 2 and Ethics in Text Mining](https://docs.google.com/presentation/d/1TL4a0SGRcOHXmq4cKXs4dRY6ASDbr-V3vFHh0c_Nj-c/edit#slide=id.g1edffbd9d5_0_177)
280 | - Includes material from: SLP v.3 slides (Jurafsky and Martin)
281 |
282 | ### For Next Week
283 |
284 | - [Lab Task 7
285 | Worksheet](labs/Lab%2007%20-%20Classification.ipynb)
286 |
287 | ## Week 9: Clustering
288 |
289 | ### Readings
290 |
291 | - [Textual
292 | Analysis](http://www.digitalhumanities.org/companion/view?docId=blackwell/9781405103213/9781405103213.xml&chunk.id=ss1-4-4&toc.depth=1&toc.id=ss1-4-4&brand=default) -
293 | John Burrows, A Companion to Digital Humanities
294 | - [Clustering](http://scikit-learn.org/stable/modules/clustering.html) -
295 | Sci-Kit Learn Documentation: Read *Overview* and the intros to
296 | 2.3.2 (K-Means) and 2.3.6 (Hierarchical clustering)
297 |
298 | Supplemental Readings
299 |
300 | - [Cluster
301 | Analysis](http://www-users.cs.umn.edu/~kumar/dmbook/ch8.pdf) -
302 | Pang-Ning Tan, Michael Steinbach, Vipin Kumar. *Introduction to
303 | Data Mining*
304 | - [Beyond tokens: what character counts say about a
305 | page](https://sense.porganized.com/beyond-tokens-what-character-counts-say-about-a-page-278d0ccea34c#.nmrtloz6i).
306 | Peter Organisciak
307 |
308 | ### Slides
309 |
310 | [Week 9 -
311 | Clustering](https://docs.google.com/presentation/d/1UnHbclWT--wxOPwEB5U9uqQ8GPKfliEEpVhQYsDrtJA/edit?usp=sharing)
312 |
313 | ### For the next two weeks
314 |
315 | [Lab 08
316 | Worksheet](labs/Lab%2008%20-%20Clustering.ipynb)
317 |
318 | ## Spring Break Week
319 |
320 | Spring Break. No class.
321 |
322 | ## Week 10: Topic Modeling and Dimensionality Reduction 1
323 |
324 | ### Readings
325 |
326 | [Topic
327 | modeling made just
328 | simple enough.](https://tedunderwood.com/2012/04/07/topic-modeling-made-just-simple-enough/) 2012.
329 | Ted Underwood.
330 |
331 | [Probabilistic Topic
332 | Models](http://dl.acm.org/citation.cfm?id=2133826). 2012.
333 | David Blei.
334 |
335 | **Supplemental**
336 |
337 | [Introduction to Latent Dirichlet
338 | Allocation](http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/). 2011.
339 | Edwin Chen.
340 |
341 | ### Slides
342 |
343 | [Topic Modeling
344 | Slides](https://docs.google.com/presentation/d/1X5NvF-CvTQk0jwhL74eUSu8u2QblastwkF6jTzzTkeM/edit?usp=sharing)
345 |
346 | ### For Next Week
347 |
348 | [Lab task 09 - Dimensionality Reduction and Sentiment
349 | Analysis](labs/Lab%209%20-%20Dimensionality%20Reduction%20and%20Sentiment%20Analysis.ipynb)
350 |
351 | *Recommended*: Get started on your topic modeling assignment. Make
352 | sure you can get MALLET running on your system.
353 |
354 | ### For Two Weeks from Now
355 |
356 | Topic Modeling Assignment Due. See description on the
357 | [Assignments](assignments.md) page.
358 |
359 | Post the Problem Statement for your Text Mining Project. See description on the [Assignments](assignments.md) page.
360 |
361 |
362 | ## Week 11.1 Topic Modelling 2
363 |
364 | ## Week 11.2 Sentiment Analysis
365 |
366 | ### Readings
367 |
368 | [Narrative framing of consumer sentiment in online restaurant
369 | reviews](http://journals.uic.edu/ojs/index.php/fm/article/view/4944).
370 | Dan Jurafsky, Victor Chahuneau, Bryan R. Routledge, Noah A. Smith.
371 |
372 | **Optional but Recommended**
373 |
374 | [Indexing by Latent Semantic
375 | Analysis](http://lsa.colorado.edu/papers/JASIS.lsi.90.pdf).
376 | Deerwester, Dumais, Furnas, Landauer, Harshman.
377 |
378 | *This is one of our core papers in Library and Information Science - 13k citations can't be wrong. You'll notice that these famous papers are particularly easy to read - ChengXiang Zhai's smoothing paper is
379 | another example - a good reminder that being clever is only useful if you can communicate it.*
380 |
381 | ### Slides
382 |
383 | [Topic Modelling II and Sentiment
384 | Analysis](https://docs.google.com/presentation/d/1aRo0-Ho9auR751MDKYIE4HIedmnYzqyELFignJN05Yk/edit?usp=sharing)
385 |
386 | ### For Next Week
387 |
388 | Topic Modeling Assignment Due. See description on
389 | the [Assignments](assignments.md) page.
390 |
391 | Post the Problem Statement for your Text Mining Project. See description on the [Assignments](assignments.md) page.
392 |
393 | ## Week 12: Visualization
394 |
395 | ### Readings
396 |
397 | It's a busy time, no readings this week!
398 |
399 | ### Slides
400 |
401 | [Week 13 -
402 | Visualization](https://docs.google.com/presentation/d/1R72aBkSYzqZlvtOVm9q-8-_Ogmc0QW9XQ7cZ2JKEm4Y/edit?usp=sharing)
403 |
404 | ### For Next Week
405 |
406 | - Literature Review and Data Collection for your final project.
407 |
408 | ## Week 13: Word Embeddings
409 |
410 | ### Readings
411 |
412 | - [Word Embeddings for the digital
413 | humanities](http://bookworm.benschmidt.org/posts/2015-10-25-Word-Embeddings.html). 2015.
414 | Benjamin Schmidt.
415 |
416 | - [Vector Representations of
417 | Words](https://www.tensorflow.org/tutorials/word2vec) (stop at
418 | 'Building the Graph'). Tensorflow Tutorials.
419 |
420 | **Supplemental (Optional)**
421 |
422 | - [Distributed Representations of Words and Phrases and their Compositionality](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf).
423 | Mikolov et. al.
424 |
425 | **Bonus**
426 |
427 | Something to play with: [the "Bonus App" at the bottom of Radim Řehůřek's Word2Vec
428 | tutorial](https://rare-technologies.com/word2vec-tutorial/).
429 |
430 | ## Week 14: What's Next: Remainder Notes from Text Mining
431 |
432 | ### Slides
433 |
434 | [Week 15 - What's
435 | Next](https://docs.google.com/presentation/d/1GwGK3b4U_Z3xt_fFZiWB86jRWreGn00p9arfi2oPvYg/edit?usp=sharing)
436 |
437 | ### Reminders
438 |
439 | May 3rd is the last day to turn in late lab tasks! Get them in!
440 |
--------------------------------------------------------------------------------
/labs/Lab 02.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Week 2 Lab Task\n",
8 | "This week is about getting started with powerful tools that will underlie many of the skills you learn in the course. Much of the effort is in setting up your programming environment: the lab questions will ensure that it is done correctly and help you grow familiar with it.\n",
9 | "\n",
10 | "In this course we'll be using the Python programming language, using an innovative environment called Jupyter Notebooks.\n",
11 | "\n",
12 | "Your _environment_ is similar to your local workspace. Look at your desk: how do you organize your pens, paper, mouse, monitor? Or maybe you have a barebones workspace, working at a coffee shop or kitchen table with only a cup of coffee. In the same way, you can have many different environments for how you work with Python: working on a command line, or running scripts. Jupyter Notebooks is an environment that gives you an interactive, browser based version of Python. It allows you to play with code in a way that gives you immediate feedback, and allows you to break, tinker, and retry.\n",
13 | "\n",
14 | "Jupyter Notebooks will be installed through Anaconda.\n",
15 | "\n",
16 | "When programming, you're usually not writing everything from scratch. Some code is needed by many other people, so most languages have a concept of a _library_: code written and distributed by other people that you can easily use in your own work. \n",
17 | "\n",
18 | "Anaconda is a scientific distribution of Python, which installs Python on your system alongside a great deal of libraries that scientists use. To be clear: it is possible to install Python in other ways and individually install the libraries, but Anaconda puts it all into a tidy package. As scientists want complicated mathematical algorithms, installing some scientific libraries can be very difficult: Anaconda makes it easy!"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## 1. Installing Jupyter Notebooks through Anaconda\n",
26 | "\n",
27 | "Install Jupyter Notebooks following the instructions in the Art of Literary Text Analysis, following the [Getting Setup](https://github.com/sgsinclair/alta/blob/master/ipynb/GettingSetup.ipynb) and [Getting Started](https://github.com/sgsinclair/alta/blob/master/ipynb/GettingStarted.ipynb) (you can stop before the Printing Dynamic Content section). Make sure you install the Python 3 version. Because this is our first introduction to ALTA, it's worth reading the [short introductory text](https://github.com/sgsinclair/alta/blob/master/ipynb/ArtOfLiteraryTextAnalysis.ipynb). If you have trouble with installation, start a discussion in the Open Discussion forum.\n",
28 | "\n",
29 | "After you're done installation, start a new notebook and follow along with the tour at Help > User Interface Tour.\n",
30 | "\n",
31 | "_Questions_\n",
32 | "\n",
33 | "- 1) What are the two modes of a notebook?\n",
34 | "- 2) What do you press to leave edit mode while in a cell?\n",
35 | "- 3) What are the Keyboard Shortcuts for:\n",
36 | " - a) insert cell below\n",
37 | " - b) insert cell above\n",
38 | " - c) run selected cells"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## 2. A Little bit of code\n",
46 | "\n",
47 | "Create a new cell in your notebook with the '+' button in the toolbar (or one of the keyboard shortcuts from the previous question). We're going to try two simple Python commands: setting a variable, and splitting it by whitespace. In the process, we'll encounter two types of data that Python can hold: a string, and a list.\n",
48 | "\n",
49 | "Add the following code to the cell and 'run' it. If it runs properly, it should look like below, with the 'In' and 'Out' information."
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {
56 | "collapsed": false
57 | },
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/plain": [
62 | "'Hello world.'"
63 | ]
64 | },
65 | "execution_count": 9,
66 | "metadata": {},
67 | "output_type": "execute_result"
68 | }
69 | ],
70 | "source": [
71 | "sentence = \"Hello world.\"\n",
72 | "sentence"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "Here, we set a string to a variable, then we called that variable.\n",
80 | "\n",
81 | "_Questions_\n",
82 | "- 4) What output is there if you run the cell without the second line (which simply says `sentence`)?"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "A string is a type of data in Python. By setting it to the variable `sentence`, everywhere you use `sentence` is the exact same as simply writing `\"Hello world.\"` Consider the following examples, or even try them out, which show that the way of joining two strings works the same with a variable or directly with a string:"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "collapsed": false
97 | },
98 | "outputs": [
99 | {
100 | "data": {
101 | "text/plain": [
102 | "'Hello world. Hello moon.'"
103 | ]
104 | },
105 | "execution_count": 12,
106 | "metadata": {},
107 | "output_type": "execute_result"
108 | }
109 | ],
110 | "source": [
111 | "\"Hello world.\" + \" Hello moon.\""
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [
121 | {
122 | "data": {
123 | "text/plain": [
124 | "'Hello world. Hello moon.'"
125 | ]
126 | },
127 | "execution_count": 14,
128 | "metadata": {},
129 | "output_type": "execute_result"
130 | }
131 | ],
132 | "source": [
133 | "sentence + \" Hello moon.\""
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {
140 | "collapsed": false
141 | },
142 | "outputs": [
143 | {
144 | "data": {
145 | "text/plain": [
146 | "'Hello world.Hello world.'"
147 | ]
148 | },
149 | "execution_count": 15,
150 | "metadata": {},
151 | "output_type": "execute_result"
152 | }
153 | ],
154 | "source": [
155 | "sentence + sentence"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "We can even see the datatype of a variable with `type()`:"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {
169 | "collapsed": false,
170 | "scrolled": true
171 | },
172 | "outputs": [
173 | {
174 | "data": {
175 | "text/plain": [
176 | "str"
177 | ]
178 | },
179 | "execution_count": 80,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "type(sentence)"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "If you have a really long string that needs to go across lines, you can use `\\` before the line break to tell Python that _this line of code is not done yet_. Set this famously long sentence from _Paul Clifton_ to the variable `paragraph` in your notebook:"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {
199 | "collapsed": false
200 | },
201 | "outputs": [
202 | {
203 | "data": {
204 | "text/plain": [
205 | "'It was a dark and stormy night; the rain fell in torrents — except at occasional intervals, when it was checked by a violent gust of wind which swept up the streets (for it is in London that our scene lies), rattling along the housetops, and fiercely agitating the scanty flame of the lamps that struggled against the darkness.'"
206 | ]
207 | },
208 | "execution_count": 30,
209 | "metadata": {},
210 | "output_type": "execute_result"
211 | }
212 | ],
213 | "source": [
214 | "paragraph = \"It was a dark and stormy night; the rain fell in torrents — except at occasional intervals, when it was \" + \\\n",
215 | " \"checked by a violent gust of wind which swept up the streets (for it is in London that our scene lies), rattling \" + \\\n",
216 | " \"along the housetops, and fiercely agitating the scanty flame of the lamps that struggled against the darkness.\"\n",
217 | "paragraph"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "_Questions_ \n",
225 | "- 5) For the code block above, \n",
226 | " - a) Are the indents necessary for the code to run?\n",
227 | " - b) Are the pluses (+) necessary for the code to run?\n",
228 | " - c) Are the backslashes (\\\\) necessary for the code to run?\n",
229 | " \n",
230 | "_tinker with the code and re-run as necessary_"
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "metadata": {},
236 | "source": [
237 | "Another important datatype in Python is the `list`. This is a way to hold multiple things together: strings, numbers, etc. For example:"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {
244 | "collapsed": false
245 | },
246 | "outputs": [
247 | {
248 | "data": {
249 | "text/plain": [
250 | "['Never', 'gonna', 'give', 'you', 'up']"
251 | ]
252 | },
253 | "execution_count": 57,
254 | "metadata": {},
255 | "output_type": "execute_result"
256 | }
257 | ],
258 | "source": [
259 | "list_of_strings = [\"Never\", \"gonna\", \"give\", \"you\", \"up\"]\n",
260 | "list_of_strings"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {
267 | "collapsed": false
268 | },
269 | "outputs": [
270 | {
271 | "data": {
272 | "text/plain": [
273 | "[4, 8, 15, 16, 23, 42]"
274 | ]
275 | },
276 | "execution_count": 38,
277 | "metadata": {},
278 | "output_type": "execute_result"
279 | }
280 | ],
281 | "source": [
282 | "list_of_numbers = [ 4, 8, 15, 16, 23, 42]\n",
283 | "list_of_numbers"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {},
289 | "source": [
290 | "Individual objects from a list can be retrieved using a square bracket referencing the place in the list (starting with 0):"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "metadata": {
297 | "collapsed": false
298 | },
299 | "outputs": [
300 | {
301 | "data": {
302 | "text/plain": [
303 | "'Hello'"
304 | ]
305 | },
306 | "execution_count": 36,
307 | "metadata": {},
308 | "output_type": "execute_result"
309 | }
310 | ],
311 | "source": [
312 | "list_of_strings[0]"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {
319 | "collapsed": false
320 | },
321 | "outputs": [
322 | {
323 | "data": {
324 | "text/plain": [
325 | "8"
326 | ]
327 | },
328 | "execution_count": 40,
329 | "metadata": {},
330 | "output_type": "execute_result"
331 | }
332 | ],
333 | "source": [
334 | "list_of_numbers[1]"
335 | ]
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "You can select a list range by specify two numbers in the square brackets with a colon in-between:"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": null,
347 | "metadata": {
348 | "collapsed": false
349 | },
350 | "outputs": [
351 | {
352 | "data": {
353 | "text/plain": [
354 | "['gonna', 'give', 'you']"
355 | ]
356 | },
357 | "execution_count": 63,
358 | "metadata": {},
359 | "output_type": "execute_result"
360 | }
361 | ],
362 | "source": [
363 | "list_of_strings[1:4]"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "Using the colon without a number means _from the very start_ or _until the very end_:"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {
377 | "collapsed": false
378 | },
379 | "outputs": [
380 | {
381 | "data": {
382 | "text/plain": [
383 | "['Never', 'gonna', 'give', 'you']"
384 | ]
385 | },
386 | "execution_count": 64,
387 | "metadata": {},
388 | "output_type": "execute_result"
389 | }
390 | ],
391 | "source": [
392 | "list_of_strings[:4]"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "metadata": {
399 | "collapsed": false
400 | },
401 | "outputs": [
402 | {
403 | "data": {
404 | "text/plain": [
405 | "['gonna', 'give', 'you', 'up']"
406 | ]
407 | },
408 | "execution_count": 65,
409 | "metadata": {},
410 | "output_type": "execute_result"
411 | }
412 | ],
413 | "source": [
414 | "list_of_strings[1:]"
415 | ]
416 | },
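Negative numbers count from the end of the list, which the lab does not cover but which is handy to know and turns up in later course examples (e.g. `.iloc[1:-1]`); a quick sketch using the same `list_of_strings`:

```python
list_of_strings[-1]    # 'up' - the last item
list_of_strings[:-1]   # ['Never', 'gonna', 'give', 'you'] - everything but the last item
list_of_strings[-2:]   # ['you', 'up'] - the last two items
```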
417 | {
418 | "cell_type": "markdown",
419 | "metadata": {},
420 | "source": [
421 | "You can add to a list with `list.append()`:"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": null,
427 | "metadata": {
428 | "collapsed": false
429 | },
430 | "outputs": [
431 | {
432 | "data": {
433 | "text/plain": [
434 | "['Hello', 'world', 'Word', 'Word']"
435 | ]
436 | },
437 | "execution_count": 42,
438 | "metadata": {},
439 | "output_type": "execute_result"
440 | }
441 | ],
442 | "source": [
443 | "list_of_strings.append(\"Word\")\n",
444 | "list_of_strings"
445 | ]
446 | },
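A side note worth adding here: `append` modifies the list in place and returns `None`, so assigning its result to a variable is a common beginner mistake. A minimal sketch (the variable names are illustrative, not from the lab):

```python
words = ["Never", "gonna"]
result = words.append("give")  # changes `words` itself...
print(result)                  # ...and returns None
print(words)                   # ['Never', 'gonna', 'give']
```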
447 | {
448 | "cell_type": "markdown",
449 | "metadata": {},
450 | "source": [
451 | "_Questions_\n",
452 | "\n",
453 | "- 6) Can a list have a mix of numbers and strings?\n",
454 | "- 7) We joined strings with '+'. What happens if you try to use '+' on two lists?"
455 | ]
456 | },
457 | {
458 | "cell_type": "markdown",
459 | "metadata": {},
460 | "source": [
461 | "# 3. Splitting a string to a list"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "metadata": {},
467 | "source": [
468 | "A string can be split into a list using a splitting character. In the (useless) example below, we tell Python that everywhere there is an 'o' should be considered a place to split the string into a list:"
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": null,
474 | "metadata": {
475 | "collapsed": false
476 | },
477 | "outputs": [
478 | {
479 | "data": {
480 | "text/plain": [
481 | "['Hell', ' w', 'rld.']"
482 | ]
483 | },
484 | "execution_count": 49,
485 | "metadata": {},
486 | "output_type": "execute_result"
487 | }
488 | ],
489 | "source": [
490 | "sentence.split(\"o\")"
491 | ]
492 | },
493 | {
494 | "cell_type": "markdown",
495 | "metadata": {},
496 | "source": [
497 | "This can be used for a simple word tokenization by space characters:"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": null,
503 | "metadata": {
504 | "collapsed": false
505 | },
506 | "outputs": [
507 | {
508 | "data": {
509 | "text/plain": [
510 | "['Hello', 'world.']"
511 | ]
512 | },
513 | "execution_count": 54,
514 | "metadata": {},
515 | "output_type": "execute_result"
516 | }
517 | ],
518 | "source": [
519 | "words = sentence.split(\" \")\n",
520 | "words"
521 | ]
522 | },
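A related detail, added as a note: calling `split()` with no argument splits on any run of whitespace (spaces, tabs, newlines) and drops the empty strings, which is often what you want for rough tokenization. A small sketch:

```python
messy = "Hello   world.\nHello moon."
messy.split(" ")   # ['Hello', '', '', 'world.\nHello', 'moon.']
messy.split()      # ['Hello', 'world.', 'Hello', 'moon.']
```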
523 | {
524 | "cell_type": "markdown",
525 | "metadata": {
526 | "collapsed": false,
527 | "scrolled": true
528 | },
529 | "source": [
530 | "_Questions:_\n",
531 | "\n",
532 | " - 8) How would you select a list with the first seven words in the `paragraph` variable? This will require two steps. Show your code and the output.\n",
533 | " - 9) The opposite of `split` is possible with `\"string_to_join_list_items_by\".join(your_list)`. Set the list from question 8 to a variable and join it into a single string. The output will be 'It was a dark and stormy night;': write your code.\n",
534 | " - 10) Split the following text into a list of *sentences*. Don't worry if one of your sentences is an empty string (''). Show the code and output.\n",
535 | " > The shows opens at Duckburg. After Donald Duck enlists in the navy, Uncle Scrooge has to take care of grand-nephews Huey, Dewey, and Louie. Uncle Scrooge brings the boys to the McDuck's mansion where they are presented to Duckworth, the butler. The nephews are forced to sleep in the attic."
536 | ]
537 | }
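As a generic illustration of the `join` pattern mentioned in question 9, using a different list so the exercise answer is not given away, a minimal sketch:

```python
parts = ["never", "gonna", "let", "you", "down"]
"-".join(parts)    # 'never-gonna-let-you-down'
" ".join(parts)    # 'never gonna let you down'
```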
538 | ],
539 | "metadata": {
540 | "kernelspec": {
541 | "display_name": "Python 3",
542 | "language": "python",
543 | "name": "python3"
544 | },
545 | "language_info": {
546 | "codemirror_mode": {
547 | "name": "ipython",
548 | "version": 3
549 | },
550 | "file_extension": ".py",
551 | "mimetype": "text/x-python",
552 | "name": "python",
553 | "nbconvert_exporter": "python",
554 | "pygments_lexer": "ipython3",
555 | "version": "3.5.1"
556 | }
557 | },
558 | "nbformat": 4,
559 | "nbformat_minor": 0
560 | }
561 |
--------------------------------------------------------------------------------
/examples/Topic Modelling Trump Tweets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [
10 | {
11 | "name": "stderr",
12 | "output_type": "stream",
13 | "text": [
14 | "C:\\Users\\organis2\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\gensim\\utils.py:855: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n",
15 | " warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n"
16 | ]
17 | }
18 | ],
19 | "source": [
20 | "import gensim\n",
21 | "import os\n",
22 | "import pandas as pd\n",
23 | "from gensim.corpora.dictionary import Dictionary "
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "collapsed": false
31 | },
32 | "outputs": [
33 | {
34 | "data": {
35 | "text/html": [
36 | "\n",
37 | "
\n",
38 | " \n",
39 | " \n",
40 | " | \n",
41 | " Text | \n",
42 | " Date | \n",
43 | " Favorites | \n",
44 | " Retweets | \n",
45 | " Tweet ID | \n",
46 | "
\n",
47 | " \n",
48 | " \n",
49 | " \n",
50 | " | 0 | \n",
51 | " Nielson Media Research final numbers on ACCEPT... | \n",
52 | " 2016-07-30 23:32:40 | \n",
53 | " 13850 | \n",
54 | " 4130 | \n",
55 | " 759592590106849280 | \n",
56 | "
\n",
57 | " \n",
58 | " | 1 | \n",
59 | " Thank you to all of the television viewers tha... | \n",
60 | " 2016-07-30 19:00:07 | \n",
61 | " 27659 | \n",
62 | " 6842 | \n",
63 | " 759524001613918208 | \n",
64 | "
\n",
65 | " \n",
66 | " | 2 | \n",
67 | " Can you imagine if I had the small crowds that... | \n",
68 | " 2016-07-30 18:28:22 | \n",
69 | " 19968 | \n",
70 | " 6488 | \n",
71 | " 759516008272932864 | \n",
72 | "
\n",
73 | " \n",
74 | " | 3 | \n",
75 | " NATO commander agrees members should pay up vi... | \n",
76 | " 2016-07-30 18:24:40 | \n",
77 | " 11624 | \n",
78 | " 4668 | \n",
79 | " 759515080010719232 | \n",
80 | "
\n",
81 | " \n",
82 | " | 4 | \n",
83 | " Wow, NATO's top commander just announced that ... | \n",
84 | " 2016-07-30 18:18:58 | \n",
85 | " 23922 | \n",
86 | " 7819 | \n",
87 | " 759513644258525184 | \n",
88 | "
\n",
89 | " \n",
90 | "
\n",
91 | "
"
92 | ],
93 | "text/plain": [
94 | " Text Date \\\n",
95 | "0 Nielson Media Research final numbers on ACCEPT... 2016-07-30 23:32:40 \n",
96 | "1 Thank you to all of the television viewers tha... 2016-07-30 19:00:07 \n",
97 | "2 Can you imagine if I had the small crowds that... 2016-07-30 18:28:22 \n",
98 | "3 NATO commander agrees members should pay up vi... 2016-07-30 18:24:40 \n",
99 | "4 Wow, NATO's top commander just announced that ... 2016-07-30 18:18:58 \n",
100 | "\n",
101 | " Favorites Retweets Tweet ID \n",
102 | "0 13850 4130 759592590106849280 \n",
103 | "1 27659 6842 759524001613918208 \n",
104 | "2 19968 6488 759516008272932864 \n",
105 | "3 11624 4668 759515080010719232 \n",
106 | "4 23922 7819 759513644258525184 "
107 | ]
108 | },
109 | "execution_count": 3,
110 | "metadata": {},
111 | "output_type": "execute_result"
112 | }
113 | ],
114 | "source": [
115 | "tweets = pd.read_csv(\"https://raw.githubusercontent.com/sashaperigo/Trump-Tweets/master/data.csv\").dropna()\n",
116 | "tweets.head()"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {
123 | "collapsed": false
124 | },
125 | "outputs": [
126 | {
127 | "data": {
128 | "text/plain": [
129 | "0 #a\n",
130 | "1 test\n",
131 | "dtype: object"
132 | ]
133 | },
134 | "execution_count": 4,
135 | "metadata": {},
136 | "output_type": "execute_result"
137 | }
138 | ],
139 | "source": [
140 | "# Tokenize tweets, while stoplisting, case-folding, and filtering\n",
141 | "from nltk import word_tokenize\n",
142 | "from nltk.corpus import stopwords\n",
143 | "stoplist = stopwords.words('english')\n",
144 | "\n",
145 | "def clean_tweet(tweet):\n",
146 | " lower = tweet.lower()\n",
147 | " # Small hack to keep hashtags without modifying tokenizer:\n",
148 | " # replace # with text, then replace back later\n",
149 | " terms = word_tokenize(lower.replace(\"#\", \"HASH_\"))\n",
150 | " terms_stopped = [term for term in terms if term not in stoplist]\n",
151 | " terms_alpha = [term for term in terms_stopped if (term.isalpha() or \"HASH_\" in term)]\n",
152 | " if len(terms_alpha) == 0:\n",
153 | " return pd.Series()\n",
154 | " else:\n",
155 | " return pd.Series(terms_alpha).str.replace(\"HASH_\", \"#\")\n",
156 | " \n",
157 | "clean_tweet(\"This is #a test\")"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {
164 | "collapsed": false
165 | },
166 | "outputs": [
167 | {
168 | "data": {
169 | "text/html": [
170 | "\n",
171 | "
\n",
172 | " \n",
173 | " \n",
174 | " | word | \n",
175 | " # | \n",
176 | " #1 | \n",
177 | " #2 | \n",
178 | " #2016 | \n",
179 | " #2a | \n",
180 | " #alsicebucketchallenge | \n",
181 | " #america | \n",
182 | " #americafirst | \n",
183 | " #apprentice | \n",
184 | " #autism | \n",
185 | " ... | \n",
186 | " yrs | \n",
187 | " yuan | \n",
188 | " zero | \n",
189 | " zimmerman | \n",
190 | " zogby | \n",
191 | " zone | \n",
192 | " zones | \n",
193 | " zucker | \n",
194 | " zuckerman | \n",
195 | " zuker | \n",
196 | "
\n",
197 | " \n",
198 | " | Tweet ID | \n",
199 | " | \n",
200 | " | \n",
201 | " | \n",
202 | " | \n",
203 | " | \n",
204 | " | \n",
205 | " | \n",
206 | " | \n",
207 | " | \n",
208 | " | \n",
209 | " | \n",
210 | " | \n",
211 | " | \n",
212 | " | \n",
213 | " | \n",
214 | " | \n",
215 | " | \n",
216 | " | \n",
217 | " | \n",
218 | " | \n",
219 | " | \n",
220 | "
\n",
221 | " \n",
222 | " \n",
223 | " \n",
224 | " | 1698308935 | \n",
225 | " 0.0 | \n",
226 | " 0.0 | \n",
227 | " 0.0 | \n",
228 | " 0.0 | \n",
229 | " 0.0 | \n",
230 | " 0.0 | \n",
231 | " 0.0 | \n",
232 | " 0.0 | \n",
233 | " 0.0 | \n",
234 | " 0.0 | \n",
235 | " ... | \n",
236 | " 0.0 | \n",
237 | " 0.0 | \n",
238 | " 0.0 | \n",
239 | " 0.0 | \n",
240 | " 0.0 | \n",
241 | " 0.0 | \n",
242 | " 0.0 | \n",
243 | " 0.0 | \n",
244 | " 0.0 | \n",
245 | " 0.0 | \n",
246 | "
\n",
247 | " \n",
248 | " | 1701461182 | \n",
249 | " 0.0 | \n",
250 | " 0.0 | \n",
251 | " 0.0 | \n",
252 | " 0.0 | \n",
253 | " 0.0 | \n",
254 | " 0.0 | \n",
255 | " 0.0 | \n",
256 | " 0.0 | \n",
257 | " 0.0 | \n",
258 | " 0.0 | \n",
259 | " ... | \n",
260 | " 0.0 | \n",
261 | " 0.0 | \n",
262 | " 0.0 | \n",
263 | " 0.0 | \n",
264 | " 0.0 | \n",
265 | " 0.0 | \n",
266 | " 0.0 | \n",
267 | " 0.0 | \n",
268 | " 0.0 | \n",
269 | " 0.0 | \n",
270 | "
\n",
271 | " \n",
272 | " | 1737479987 | \n",
273 | " 0.0 | \n",
274 | " 0.0 | \n",
275 | " 0.0 | \n",
276 | " 0.0 | \n",
277 | " 0.0 | \n",
278 | " 0.0 | \n",
279 | " 0.0 | \n",
280 | " 0.0 | \n",
281 | " 0.0 | \n",
282 | " 0.0 | \n",
283 | " ... | \n",
284 | " 0.0 | \n",
285 | " 0.0 | \n",
286 | " 0.0 | \n",
287 | " 0.0 | \n",
288 | " 0.0 | \n",
289 | " 0.0 | \n",
290 | " 0.0 | \n",
291 | " 0.0 | \n",
292 | " 0.0 | \n",
293 | " 0.0 | \n",
294 | "
\n",
295 | " \n",
296 | " | 1741160716 | \n",
297 | " 0.0 | \n",
298 | " 0.0 | \n",
299 | " 0.0 | \n",
300 | " 0.0 | \n",
301 | " 0.0 | \n",
302 | " 0.0 | \n",
303 | " 0.0 | \n",
304 | " 0.0 | \n",
305 | " 0.0 | \n",
306 | " 0.0 | \n",
307 | " ... | \n",
308 | " 0.0 | \n",
309 | " 0.0 | \n",
310 | " 0.0 | \n",
311 | " 0.0 | \n",
312 | " 0.0 | \n",
313 | " 0.0 | \n",
314 | " 0.0 | \n",
315 | " 0.0 | \n",
316 | " 0.0 | \n",
317 | " 0.0 | \n",
318 | "
\n",
319 | " \n",
320 | " | 1773561338 | \n",
321 | " 0.0 | \n",
322 | " 0.0 | \n",
323 | " 0.0 | \n",
324 | " 0.0 | \n",
325 | " 0.0 | \n",
326 | " 0.0 | \n",
327 | " 0.0 | \n",
328 | " 0.0 | \n",
329 | " 0.0 | \n",
330 | " 0.0 | \n",
331 | " ... | \n",
332 | " 0.0 | \n",
333 | " 0.0 | \n",
334 | " 0.0 | \n",
335 | " 0.0 | \n",
336 | " 0.0 | \n",
337 | " 0.0 | \n",
338 | " 0.0 | \n",
339 | " 0.0 | \n",
340 | " 0.0 | \n",
341 | " 0.0 | \n",
342 | "
\n",
343 | " \n",
344 | "
\n",
345 | "
5 rows × 5445 columns
\n",
346 | "
"
347 | ],
348 | "text/plain": [
349 | "word # #1 #2 #2016 #2a #alsicebucketchallenge #america \\\n",
350 | "Tweet ID \n",
351 | "1698308935 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
352 | "1701461182 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
353 | "1737479987 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
354 | "1741160716 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
355 | "1773561338 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
356 | "\n",
357 | "word #americafirst #apprentice #autism ... yrs yuan zero \\\n",
358 | "Tweet ID ... \n",
359 | "1698308935 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
360 | "1701461182 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
361 | "1737479987 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
362 | "1741160716 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
363 | "1773561338 0.0 0.0 0.0 ... 0.0 0.0 0.0 \n",
364 | "\n",
365 | "word zimmerman zogby zone zones zucker zuckerman zuker \n",
366 | "Tweet ID \n",
367 | "1698308935 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
368 | "1701461182 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
369 | "1737479987 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
370 | "1741160716 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
371 | "1773561338 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
372 | "\n",
373 | "[5 rows x 5445 columns]"
374 | ]
375 | },
376 | "execution_count": 5,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "# Create a 'long' dataframe of term counts\n",
383 | "tweet_words = tweets['Text'].str.lower().apply(clean_tweet)\n",
384 | "tweet_words.index = tweets['Tweet ID']\n",
385 | "\n",
386 | "word_counts = (tweet_words.stack().to_frame()\n",
387 | " .reset_index()\n",
388 | " .rename(columns={0:'word', 'level_1':'count'})\n",
389 | " .groupby(['Tweet ID', 'word'], as_index=False).count()\n",
390 | " )\n",
391 | "\n",
392 | "# Filter to words that have been used 5 or more times\n",
393 | "words_filtered = word_counts.groupby('word').filter(lambda x: x['count'].sum() >= 5)\n",
394 | "\n",
395 | "# Make 'wide' dataframe, i.e. a document-term matrix\n",
396 | "trump_counts = words_filtered.pivot(index='Tweet ID', columns='word', values='count').fillna(0)\n",
397 | "trump_counts.head()"
398 | ]
399 | },
400 | {
401 | "cell_type": "markdown",
402 | "metadata": {},
403 | "source": [
404 | "The size of our document-term matrix, `count(tweets) x count(unique_words)`:"
405 | ]
406 | },
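The next cell jumps ahead to querying the matrix; to check the size that the note above refers to, a one-line sketch like this would do (assuming `trump_counts` from the earlier cell is in memory; the exact numbers depend on the downloaded tweet data):

```python
# (rows, columns) = (number of tweets kept, number of distinct words kept)
trump_counts.shape
```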
407 | {
408 | "cell_type": "code",
409 | "execution_count": null,
410 | "metadata": {
411 | "collapsed": false
412 | },
413 | "outputs": [
414 | {
415 | "data": {
416 | "text/plain": [
417 | "901 \"@NathanDWilsonFL: @MariaBartiromo you had a g...\n",
418 | "2821 \"@AniesiODaniels: #DemDebate Q: Who are you vo...\n",
419 | "3646 \"@TradingStreetCo:Donald Trump Is Ratings ‘Gol...\n",
420 | "4359 \"@moshe_mkmdca: @realDonaldTrump @007lLisav @C...\n",
421 | "4981 \"@jimlibertarian: @SlwStdySque Donald has alr...\n",
422 | "Name: Text, dtype: object"
423 | ]
424 | },
425 | "execution_count": 29,
426 | "metadata": {},
427 | "output_type": "execute_result"
428 | }
429 | ],
430 | "source": [
431 | "q = trump_counts.loc[:,[\"donald\"]].query('donald > 1').index.values\n",
432 | "tweets[tweets[\"Tweet ID\"].isin(q)]['Text'].head()"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": null,
438 | "metadata": {
439 | "collapsed": true
440 | },
441 | "outputs": [],
442 | "source": [
443 | "# Number all the columns and create a gensim dictionary\n",
444 | "dictionary = Dictionary()\n",
445 | "dictionary.token2id = dict(zip(trump_counts.columns, range(0, trump_counts.shape[1])))"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": null,
451 | "metadata": {
452 | "collapsed": false
453 | },
454 | "outputs": [],
455 | "source": [
456 | "# If I haven't already trained and saved a model, train it now\n",
457 | "if not os.path.exists('trump-tweets.pickle'):\n",
458 | " # Train a model\n",
459 | " # Gensim has a way to read numpy arrays, but they use columns for documents - so rotate ('transpose') the DataFrame\n",
460 | " corpus = gensim.matutils.Dense2Corpus(trump_counts.values.T)\n",
461 | " lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,\n",
462 | " num_topics=20, update_every=1, chunksize=1000, passes=6, alpha='auto')\n",
463 | " lda.save('trump-tweets.pickle')\n",
464 | "else:\n",
465 | " # Load a model\n",
466 | " lda = gensim.models.ldamodel.LdaModel.load('trump-tweets.pickle')"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": null,
472 | "metadata": {
473 | "collapsed": false,
474 | "scrolled": false
475 | },
476 | "outputs": [
477 | {
478 | "name": "stdout",
479 | "output_type": "stream",
480 | "text": [
481 | "0\t0.060*\"nice\" + 0.054*\"got\" + 0.052*\"wow\" + 0.050*\"say\" + 0.038*\"nothing\" + 0.032*\"wonderful\"\n",
482 | "1\t0.070*\"cnn\" + 0.068*\"poll\" + 0.046*\"think\" + 0.041*\"true\" + 0.037*\"day\" + 0.036*\"man\"\n",
483 | "2\t0.279*\"thank\" + 0.090*\"vote\" + 0.087*\"big\" + 0.036*\"crowd\" + 0.035*\"needs\" + 0.018*\"apprentice\"\n",
484 | "3\t0.092*\"clinton\" + 0.043*\"megynkelly\" + 0.042*\"ever\" + 0.029*\"presidential\" + 0.024*\"women\" + 0.021*\"truth\"\n",
485 | "4\t0.098*\"people\" + 0.072*\"get\" + 0.048*\"cruz\" + 0.047*\"many\" + 0.040*\"bad\" + 0.036*\"really\"\n",
486 | "5\t0.181*\"http\" + 0.139*\"trump\" + 0.103*\"donald\" + 0.045*\"via\" + 0.022*\"morning\" + 0.020*\"hampshire\"\n",
487 | "6\t0.090*\"make\" + 0.087*\"foxnews\" + 0.063*\"win\" + 0.040*\"gop\" + 0.039*\"interview\" + 0.038*\"foxandfriends\"\n",
488 | "7\t0.043*\"hope\" + 0.038*\"watching\" + 0.032*\"person\" + 0.031*\"far\" + 0.028*\"year\" + 0.027*\"party\"\n",
489 | "8\t0.070*\"see\" + 0.050*\"know\" + 0.047*\"tomorrow\" + 0.045*\"speech\" + 0.037*\"let\" + 0.037*\"years\"\n",
490 | "9\t0.119*\"#makeamericagreatagain\" + 0.043*\"support\" + 0.040*\"campaign\" + 0.040*\"jobs\" + 0.035*\"american\" + 0.034*\"join\"\n",
491 | "10\t0.201*\"great\" + 0.044*\"thanks\" + 0.038*\"tonight\" + 0.031*\"today\" + 0.030*\"show\" + 0.029*\"last\"\n",
492 | "11\t0.099*\"president\" + 0.071*\"would\" + 0.049*\"good\" + 0.043*\"obama\" + 0.042*\"never\" + 0.042*\"need\"\n",
493 | "12\t0.043*\"work\" + 0.037*\"national\" + 0.032*\"oreillyfactor\" + 0.029*\"golf\" + 0.027*\"hard\" + 0.027*\"place\"\n",
494 | "13\t0.220*\"realdonaldtrump\" + 0.076*\"trump\" + 0.052*\"hillary\" + 0.051*\"america\" + 0.047*\"#trump2016\" + 0.032*\"like\"\n",
495 | "14\t0.180*\"https\" + 0.123*\"new\" + 0.079*\"crooked\" + 0.022*\"york\" + 0.020*\"rally\" + 0.019*\"politicians\"\n",
496 | "15\t0.063*\"much\" + 0.039*\"republican\" + 0.038*\"better\" + 0.037*\"money\" + 0.037*\"bernie\" + 0.033*\"deal\"\n",
497 | "16\t0.038*\"happy\" + 0.038*\"jebbush\" + 0.036*\"change\" + 0.032*\"florida\" + 0.026*\"endorsement\" + 0.025*\"ready\"\n",
498 | "17\t0.046*\"keep\" + 0.037*\"donaldtrump\" + 0.035*\"soon\" + 0.033*\"wants\" + 0.026*\"agree\" + 0.026*\"sanders\"\n",
499 | "18\t0.094*\"love\" + 0.058*\"ted\" + 0.035*\"isis\" + 0.026*\"immigration\" + 0.021*\"terrible\" + 0.020*\"wo\"\n",
500 | "19\t0.083*\"country\" + 0.042*\"even\" + 0.039*\"right\" + 0.038*\"debate\" + 0.037*\"media\" + 0.035*\"must\"\n"
501 | ]
502 | }
503 | ],
504 | "source": [
505 | "print(\"\\n\".join([\"%d\\t%s\" % info for info in lda.show_topics(num_topics=20, num_words=6)]))"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": null,
511 | "metadata": {
512 | "collapsed": false
513 | },
514 | "outputs": [
515 | {
516 | "data": {
517 | "text/plain": [
518 | "(20, 5445)"
519 | ]
520 | },
521 | "execution_count": 56,
522 | "metadata": {},
523 | "output_type": "execute_result"
524 | }
525 | ],
526 | "source": [
527 | "lda.state.get_lambda().shape"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": null,
533 | "metadata": {
534 | "collapsed": false
535 | },
536 | "outputs": [
537 | {
538 | "ename": "ValueError",
539 | "evalue": "too many values to unpack (expected 2)",
540 | "output_type": "error",
541 | "traceback": [
542 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
543 | "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
544 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0ma\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlda\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_document_topics\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdense\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtolist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
545 | "\u001b[1;32mC:\\Users\\organis2\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\gensim\\models\\ldamodel.py\u001b[0m in \u001b[0;36mget_document_topics\u001b[1;34m(self, bow, minimum_probability, minimum_phi_value, per_word_topics)\u001b[0m\n\u001b[0;32m 913\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 914\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 915\u001b[1;33m \u001b[0mgamma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mphis\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minference\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mbow\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcollect_sstats\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mper_word_topics\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 916\u001b[0m \u001b[0mtopic_dist\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgamma\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgamma\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# normalize distribution\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 917\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
546 | "\u001b[1;32mC:\\Users\\organis2\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\gensim\\models\\ldamodel.py\u001b[0m in \u001b[0;36minference\u001b[1;34m(self, chunk, collect_sstats)\u001b[0m\n\u001b[0;32m 428\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdoc\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msix\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minteger_types\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 429\u001b[0m \u001b[1;31m# make sure the term IDs are ints, otherwise np will get upset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 430\u001b[1;33m \u001b[0mids\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mid\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 431\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 432\u001b[0m \u001b[0mids\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mid\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
547 | "\u001b[1;32mC:\\Users\\organis2\\AppData\\Local\\Continuum\\Anaconda3\\lib\\site-packages\\gensim\\models\\ldamodel.py\u001b[0m in \u001b[0;36m\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 428\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdoc\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msix\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minteger_types\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 429\u001b[0m \u001b[1;31m# make sure the term IDs are ints, otherwise np will get upset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 430\u001b[1;33m \u001b[0mids\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mid\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 431\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 432\u001b[0m \u001b[0mids\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mid\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
548 | "\u001b[1;31mValueError\u001b[0m: too many values to unpack (expected 2)"
549 | ]
550 | }
551 | ],
552 | "source": [
553 | "a = lda.get_document_topics(corpus.dense.tolist())"
554 | ]
555 | }
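The traceback above comes from handing `get_document_topics` a dense matrix, while gensim expects one bag-of-words document (a list of `(term_id, count)` tuples) at a time. A hedged sketch of one way this could be done, assuming the `corpus` and `lda` objects from the training cell are still in memory:

```python
# Iterating a Dense2Corpus yields one bag-of-words per document,
# which is the format get_document_topics expects.
doc_topics = [lda.get_document_topics(bow) for bow in corpus]
doc_topics[0]  # a list of (topic_id, probability) pairs for the first tweet
```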
556 | ],
557 | "metadata": {
558 | "kernelspec": {
559 | "display_name": "Python 3",
560 | "language": "python",
561 | "name": "python3"
562 | },
563 | "language_info": {
564 | "codemirror_mode": {
565 | "name": "ipython",
566 | "version": 3
567 | },
568 | "file_extension": ".py",
569 | "mimetype": "text/x-python",
570 | "name": "python",
571 | "nbconvert_exporter": "python",
572 | "pygments_lexer": "ipython3",
573 | "version": "3.5.1"
574 | }
575 | },
576 | "nbformat": 4,
577 | "nbformat_minor": 0
578 | }
579 |
--------------------------------------------------------------------------------
/examples/Pivot Example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {
19 | "collapsed": false,
20 | "scrolled": true
21 | },
22 | "outputs": [
23 | {
24 | "data": {
25 | "text/html": [
26 | "\n",
27 | "
\n",
28 | " \n",
29 | " \n",
30 | " | \n",
31 | " class | \n",
32 | " count | \n",
33 | " document | \n",
34 | " word | \n",
35 | "
\n",
36 | " \n",
37 | " \n",
38 | " \n",
39 | " | 1 | \n",
40 | " class 1 | \n",
41 | " 17 | \n",
42 | " doc 1 | \n",
43 | " word 1 | \n",
44 | "
\n",
45 | " \n",
46 | " | 2 | \n",
47 | " class 1 | \n",
48 | " 3 | \n",
49 | " doc 1 | \n",
50 | " word 2 | \n",
51 | "
\n",
52 | " \n",
53 | " | 3 | \n",
54 | " class 1 | \n",
55 | " 10 | \n",
56 | " doc 1 | \n",
57 | " word 3 | \n",
58 | "
\n",
59 | " \n",
60 | " | 4 | \n",
61 | " class 1 | \n",
62 | " 3 | \n",
63 | " doc 1 | \n",
64 | " word 4 | \n",
65 | "
\n",
66 | " \n",
67 | " | 5 | \n",
68 | " class 1 | \n",
69 | " 1 | \n",
70 | " doc 1 | \n",
71 | " word 5 | \n",
72 | "
\n",
73 | " \n",
74 | " | 6 | \n",
75 | " class 1 | \n",
76 | " 10 | \n",
77 | " doc 1 | \n",
78 | " word 6 | \n",
79 | "
\n",
80 | " \n",
81 | " | 7 | \n",
82 | " class 1 | \n",
83 | " 4 | \n",
84 | " doc 1 | \n",
85 | " word 7 | \n",
86 | "
\n",
87 | " \n",
88 | " | 8 | \n",
89 | " class 1 | \n",
90 | " 5 | \n",
91 | " doc 1 | \n",
92 | " word 8 | \n",
93 | "
\n",
94 | " \n",
95 | " | 9 | \n",
96 | " class 1 | \n",
97 | " 7 | \n",
98 | " doc 1 | \n",
99 | " word 9 | \n",
100 | "
\n",
101 | " \n",
102 | " | 10 | \n",
103 | " class 2 | \n",
104 | " 12 | \n",
105 | " doc 2 | \n",
106 | " word 0 | \n",
107 | "
\n",
108 | " \n",
109 | " | 11 | \n",
110 | " class 2 | \n",
111 | " 9 | \n",
112 | " doc 2 | \n",
113 | " word 1 | \n",
114 | "
\n",
115 | " \n",
116 | " | 12 | \n",
117 | " class 2 | \n",
118 | " 6 | \n",
119 | " doc 2 | \n",
120 | " word 2 | \n",
121 | "
\n",
122 | " \n",
123 | " | 13 | \n",
124 | " class 2 | \n",
125 | " 2 | \n",
126 | " doc 2 | \n",
127 | " word 3 | \n",
128 | "
\n",
129 | " \n",
130 | " | 14 | \n",
131 | " class 2 | \n",
132 | " 7 | \n",
133 | " doc 2 | \n",
134 | " word 4 | \n",
135 | "
\n",
136 | " \n",
137 | " | 15 | \n",
138 | " class 2 | \n",
139 | " 1 | \n",
140 | " doc 2 | \n",
141 | " word 5 | \n",
142 | "
\n",
143 | " \n",
144 | " | 16 | \n",
145 | " class 2 | \n",
146 | " 4 | \n",
147 | " doc 2 | \n",
148 | " word 6 | \n",
149 | "
\n",
150 | " \n",
151 | " | 17 | \n",
152 | " class 2 | \n",
153 | " 5 | \n",
154 | " doc 2 | \n",
155 | " word 7 | \n",
156 | "
\n",
157 | " \n",
158 | " | 18 | \n",
159 | " class 2 | \n",
160 | " 17 | \n",
161 | " doc 2 | \n",
162 | " word 8 | \n",
163 | "
\n",
164 | " \n",
165 | "
\n",
166 | "
"
167 | ],
168 | "text/plain": [
169 | " class count document word\n",
170 | "1 class 1 17 doc 1 word 1\n",
171 | "2 class 1 3 doc 1 word 2\n",
172 | "3 class 1 10 doc 1 word 3\n",
173 | "4 class 1 3 doc 1 word 4\n",
174 | "5 class 1 1 doc 1 word 5\n",
175 | "6 class 1 10 doc 1 word 6\n",
176 | "7 class 1 4 doc 1 word 7\n",
177 | "8 class 1 5 doc 1 word 8\n",
178 | "9 class 1 7 doc 1 word 9\n",
179 | "10 class 2 12 doc 2 word 0\n",
180 | "11 class 2 9 doc 2 word 1\n",
181 | "12 class 2 6 doc 2 word 2\n",
182 | "13 class 2 2 doc 2 word 3\n",
183 | "14 class 2 7 doc 2 word 4\n",
184 | "15 class 2 1 doc 2 word 5\n",
185 | "16 class 2 4 doc 2 word 6\n",
186 | "17 class 2 5 doc 2 word 7\n",
187 | "18 class 2 17 doc 2 word 8"
188 | ]
189 | },
190 | "execution_count": 24,
191 | "metadata": {},
192 | "output_type": "execute_result"
193 | }
194 | ],
195 | "source": [
196 | "#Creating fake data for the example\n",
197 | "words = [\"word \" + str(number) for number in np.arange(0,10)] * 2\n",
198 | "documents = [\"doc 1\"] * 10 + [\"doc 2\"] * 10\n",
199 | "classes = [\"class 1\"] * 10 + [\"class 2\"] * 10\n",
200 | "counts = np.random.randint(1, 20, 20)\n",
201 | "# Create dataframe, and deliberately at missing data by select 1:-1 (this drops the first and last row)\n",
202 | "df = pd.DataFrame({'document':documents, 'word':words, 'class':classes, 'count': counts}).iloc[1:-1]\n",
203 | "df"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "The example DataFrame, above, is a \"long\" dataframe with each row representing the count for a word for a document. It is expect that there each document/word has one row; if it doesn't, do a `groupby` with a `sum` for the column.\n",
211 | "\n",
212 | "To make it `wide`, here is one example:"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {
219 | "collapsed": false
220 | },
221 | "outputs": [
222 | {
223 | "data": {
224 | "text/html": [
225 | "\n",
226 | "
\n",
227 | " \n",
228 | " \n",
229 | " | word | \n",
230 | " word 0 | \n",
231 | " word 1 | \n",
232 | " word 2 | \n",
233 | " word 3 | \n",
234 | " word 4 | \n",
235 | " word 5 | \n",
236 | " word 6 | \n",
237 | " word 7 | \n",
238 | " word 8 | \n",
239 | " word 9 | \n",
240 | "
\n",
241 | " \n",
242 | " | document | \n",
243 | " | \n",
244 | " | \n",
245 | " | \n",
246 | " | \n",
247 | " | \n",
248 | " | \n",
249 | " | \n",
250 | " | \n",
251 | " | \n",
252 | " | \n",
253 | "
\n",
254 | " \n",
255 | " \n",
256 | " \n",
257 | " | doc 1 | \n",
258 | " NaN | \n",
259 | " 17.0 | \n",
260 | " 3.0 | \n",
261 | " 10.0 | \n",
262 | " 3.0 | \n",
263 | " 1.0 | \n",
264 | " 10.0 | \n",
265 | " 4.0 | \n",
266 | " 5.0 | \n",
267 | " 7.0 | \n",
268 | "
\n",
269 | " \n",
270 | " | doc 2 | \n",
271 | " 12.0 | \n",
272 | " 9.0 | \n",
273 | " 6.0 | \n",
274 | " 2.0 | \n",
275 | " 7.0 | \n",
276 | " 1.0 | \n",
277 | " 4.0 | \n",
278 | " 5.0 | \n",
279 | " 17.0 | \n",
280 | " NaN | \n",
281 | "
\n",
282 | " \n",
283 | "
\n",
284 | "
"
285 | ],
286 | "text/plain": [
287 | "word word 0 word 1 word 2 word 3 word 4 word 5 word 6 word 7 \\\n",
288 | "document \n",
289 | "doc 1 NaN 17.0 3.0 10.0 3.0 1.0 10.0 4.0 \n",
290 | "doc 2 12.0 9.0 6.0 2.0 7.0 1.0 4.0 5.0 \n",
291 | "\n",
292 | "word word 8 word 9 \n",
293 | "document \n",
294 | "doc 1 5.0 7.0 \n",
295 | "doc 2 17.0 NaN "
296 | ]
297 | },
298 | "execution_count": 25,
299 | "metadata": {},
300 | "output_type": "execute_result"
301 | }
302 | ],
303 | "source": [
304 | "wide_df = df.pivot(index='document', columns='word', values='count')\n",
305 | "wide_df"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "Note that doc1 didn't have word 0, and doc2 didn't have word 9, so they have NaN (Not a Number) values. We can fill these in with fillna(0). Redoing the previous step in a better way:"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {
319 | "collapsed": false
320 | },
321 | "outputs": [
322 | {
323 | "data": {
324 | "text/html": [
325 | "\n",
326 | "
\n",
327 | " \n",
328 | " \n",
329 | " | word | \n",
330 | " word 0 | \n",
331 | " word 1 | \n",
332 | " word 2 | \n",
333 | " word 3 | \n",
334 | " word 4 | \n",
335 | " word 5 | \n",
336 | " word 6 | \n",
337 | " word 7 | \n",
338 | " word 8 | \n",
339 | " word 9 | \n",
340 | "
\n",
341 | " \n",
342 | " | document | \n",
343 | " | \n",
344 | " | \n",
345 | " | \n",
346 | " | \n",
347 | " | \n",
348 | " | \n",
349 | " | \n",
350 | " | \n",
351 | " | \n",
352 | " | \n",
353 | "
\n",
354 | " \n",
355 | " \n",
356 | " \n",
357 | " | doc 1 | \n",
358 | " 0.0 | \n",
359 | " 17.0 | \n",
360 | " 3.0 | \n",
361 | " 10.0 | \n",
362 | " 3.0 | \n",
363 | " 1.0 | \n",
364 | " 10.0 | \n",
365 | " 4.0 | \n",
366 | " 5.0 | \n",
367 | " 7.0 | \n",
368 | "
\n",
369 | " \n",
370 | " | doc 2 | \n",
371 | " 12.0 | \n",
372 | " 9.0 | \n",
373 | " 6.0 | \n",
374 | " 2.0 | \n",
375 | " 7.0 | \n",
376 | " 1.0 | \n",
377 | " 4.0 | \n",
378 | " 5.0 | \n",
379 | " 17.0 | \n",
380 | " 0.0 | \n",
381 | "
\n",
382 | " \n",
383 | "
\n",
384 | "
"
385 | ],
386 | "text/plain": [
387 | "word word 0 word 1 word 2 word 3 word 4 word 5 word 6 word 7 \\\n",
388 | "document \n",
389 | "doc 1 0.0 17.0 3.0 10.0 3.0 1.0 10.0 4.0 \n",
390 | "doc 2 12.0 9.0 6.0 2.0 7.0 1.0 4.0 5.0 \n",
391 | "\n",
392 | "word word 8 word 9 \n",
393 | "document \n",
394 | "doc 1 5.0 7.0 \n",
395 | "doc 2 17.0 0.0 "
396 | ]
397 | },
398 | "execution_count": 26,
399 | "metadata": {},
400 | "output_type": "execute_result"
401 | }
402 | ],
403 | "source": [
404 | "wide_df = df.pivot(index='document', columns='word', values='count').fillna(0)\n",
405 | "wide_df"
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": null,
411 | "metadata": {
412 | "collapsed": false
413 | },
414 | "outputs": [
415 | {
416 | "data": {
417 | "text/html": [
418 | "\n",
419 | "
\n",
420 | " \n",
421 | " \n",
422 | " | \n",
423 | " document | \n",
424 | " word | \n",
425 | " count | \n",
426 | "
\n",
427 | " \n",
428 | " \n",
429 | " \n",
430 | " | 1 | \n",
431 | " doc 1 | \n",
432 | " word 1 | \n",
433 | " 17 | \n",
434 | "
\n",
435 | " \n",
436 | " | 2 | \n",
437 | " doc 1 | \n",
438 | " word 2 | \n",
439 | " 3 | \n",
440 | "
\n",
441 | " \n",
442 | " | 3 | \n",
443 | " doc 1 | \n",
444 | " word 3 | \n",
445 | " 10 | \n",
446 | "
\n",
447 | " \n",
448 | " | 4 | \n",
449 | " doc 1 | \n",
450 | " word 4 | \n",
451 | " 3 | \n",
452 | "
\n",
453 | " \n",
454 | " | 5 | \n",
455 | " doc 1 | \n",
456 | " word 5 | \n",
457 | " 1 | \n",
458 | "
\n",
459 | " \n",
460 | " | 6 | \n",
461 | " doc 1 | \n",
462 | " word 6 | \n",
463 | " 10 | \n",
464 | "
\n",
465 | " \n",
466 | " | 7 | \n",
467 | " doc 1 | \n",
468 | " word 7 | \n",
469 | " 4 | \n",
470 | "
\n",
471 | " \n",
472 | " | 8 | \n",
473 | " doc 1 | \n",
474 | " word 8 | \n",
475 | " 5 | \n",
476 | "
\n",
477 | " \n",
478 | " | 9 | \n",
479 | " doc 1 | \n",
480 | " word 9 | \n",
481 | " 7 | \n",
482 | "
\n",
483 | " \n",
484 | " | 10 | \n",
485 | " doc 2 | \n",
486 | " word 0 | \n",
487 | " 12 | \n",
488 | "
\n",
489 | " \n",
490 | " | 11 | \n",
491 | " doc 2 | \n",
492 | " word 1 | \n",
493 | " 9 | \n",
494 | "
\n",
495 | " \n",
496 | " | 12 | \n",
497 | " doc 2 | \n",
498 | " word 2 | \n",
499 | " 6 | \n",
500 | "
\n",
501 | " \n",
502 | " | 13 | \n",
503 | " doc 2 | \n",
504 | " word 3 | \n",
505 | " 2 | \n",
506 | "
\n",
507 | " \n",
508 | " | 14 | \n",
509 | " doc 2 | \n",
510 | " word 4 | \n",
511 | " 7 | \n",
512 | "
\n",
513 | " \n",
514 | " | 15 | \n",
515 | " doc 2 | \n",
516 | " word 5 | \n",
517 | " 1 | \n",
518 | "
\n",
519 | " \n",
520 | " | 16 | \n",
521 | " doc 2 | \n",
522 | " word 6 | \n",
523 | " 4 | \n",
524 | "
\n",
525 | " \n",
526 | " | 17 | \n",
527 | " doc 2 | \n",
528 | " word 7 | \n",
529 | " 5 | \n",
530 | "
\n",
531 | " \n",
532 | " | 18 | \n",
533 | " doc 2 | \n",
534 | " word 8 | \n",
535 | " 17 | \n",
536 | "
\n",
537 | " \n",
538 | "
\n",
539 | "
"
540 | ],
541 | "text/plain": [
542 | " document word count\n",
543 | "1 doc 1 word 1 17\n",
544 | "2 doc 1 word 2 3\n",
545 | "3 doc 1 word 3 10\n",
546 | "4 doc 1 word 4 3\n",
547 | "5 doc 1 word 5 1\n",
548 | "6 doc 1 word 6 10\n",
549 | "7 doc 1 word 7 4\n",
550 | "8 doc 1 word 8 5\n",
551 | "9 doc 1 word 9 7\n",
552 | "10 doc 2 word 0 12\n",
553 | "11 doc 2 word 1 9\n",
554 | "12 doc 2 word 2 6\n",
555 | "13 doc 2 word 3 2\n",
556 | "14 doc 2 word 4 7\n",
557 | "15 doc 2 word 5 1\n",
558 | "16 doc 2 word 6 4\n",
559 | "17 doc 2 word 7 5\n",
560 | "18 doc 2 word 8 17"
561 | ]
562 | },
563 | "execution_count": 28,
564 | "metadata": {},
565 | "output_type": "execute_result"
566 | }
567 | ],
568 | "source": [
569 | "df[['document', 'word', 'count']]"
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": null,
575 | "metadata": {
576 | "collapsed": false
577 | },
578 | "outputs": [
579 | {
580 | "data": {
581 | "text/html": [
582 | "\n",
583 | "
\n",
584 | " \n",
585 | " \n",
586 | " | \n",
587 | " | \n",
588 | " count | \n",
589 | "
\n",
590 | " \n",
591 | " | document | \n",
592 | " word | \n",
593 | " | \n",
594 | "
\n",
595 | " \n",
596 | " \n",
597 | " \n",
598 | " | doc 1 | \n",
599 | " word 1 | \n",
600 | " 17 | \n",
601 | "
\n",
602 | " \n",
603 | " | word 2 | \n",
604 | " 3 | \n",
605 | "
\n",
606 | " \n",
607 | " | word 3 | \n",
608 | " 10 | \n",
609 | "
\n",
610 | " \n",
611 | " | word 4 | \n",
612 | " 3 | \n",
613 | "
\n",
614 | " \n",
615 | " | word 5 | \n",
616 | " 1 | \n",
617 | "
\n",
618 | " \n",
619 | " | word 6 | \n",
620 | " 10 | \n",
621 | "
\n",
622 | " \n",
623 | " | word 7 | \n",
624 | " 4 | \n",
625 | "
\n",
626 | " \n",
627 | " | word 8 | \n",
628 | " 5 | \n",
629 | "
\n",
630 | " \n",
631 | " | word 9 | \n",
632 | " 7 | \n",
633 | "
\n",
634 | " \n",
635 | " | doc 2 | \n",
636 | " word 0 | \n",
637 | " 12 | \n",
638 | "
\n",
639 | " \n",
640 | " | word 1 | \n",
641 | " 9 | \n",
642 | "
\n",
643 | " \n",
644 | " | word 2 | \n",
645 | " 6 | \n",
646 | "
\n",
647 | " \n",
648 | " | word 3 | \n",
649 | " 2 | \n",
650 | "
\n",
651 | " \n",
652 | " | word 4 | \n",
653 | " 7 | \n",
654 | "
\n",
655 | " \n",
656 | " | word 5 | \n",
657 | " 1 | \n",
658 | "
\n",
659 | " \n",
660 | " | word 6 | \n",
661 | " 4 | \n",
662 | "
\n",
663 | " \n",
664 | " | word 7 | \n",
665 | " 5 | \n",
666 | "
\n",
667 | " \n",
668 | " | word 8 | \n",
669 | " 17 | \n",
670 | "
\n",
671 | " \n",
672 | "
\n",
673 | "
"
674 | ],
675 | "text/plain": [
676 | " count\n",
677 | "document word \n",
678 | "doc 1 word 1 17\n",
679 | " word 2 3\n",
680 | " word 3 10\n",
681 | " word 4 3\n",
682 | " word 5 1\n",
683 | " word 6 10\n",
684 | " word 7 4\n",
685 | " word 8 5\n",
686 | " word 9 7\n",
687 | "doc 2 word 0 12\n",
688 | " word 1 9\n",
689 | " word 2 6\n",
690 | " word 3 2\n",
691 | " word 4 7\n",
692 | " word 5 1\n",
693 | " word 6 4\n",
694 | " word 7 5\n",
695 | " word 8 17"
696 | ]
697 | },
698 | "execution_count": 27,
699 | "metadata": {},
700 | "output_type": "execute_result"
701 | }
702 | ],
703 | "source": [
704 | "summed_counts = df.groupby(['document', 'word'])[['count']].sum()\n",
705 | "summed_counts"
706 | ]
707 | },
708 | {
709 | "cell_type": "markdown",
710 | "metadata": {},
711 | "source": [
712 | "Note also that we only kept the document information as the index. The class labels are still in the long DataFrame.\n",
713 | "\n",
714 | "Here, I \n",
715 | " 1. select just those two columns\n",
716 | " 2. only look at the unique combinations\n",
717 | " 3. set the index to document so it mimics `wide_df`. This is optional, but helps consistency."
718 | ]
719 | },
720 | {
721 | "cell_type": "code",
722 | "execution_count": null,
723 | "metadata": {
724 | "collapsed": false
725 | },
726 | "outputs": [
727 | {
728 | "data": {
729 | "text/html": [
730 | "\n",
731 | "
\n",
732 | " \n",
733 | " \n",
734 | " | \n",
735 | " class | \n",
736 | "
\n",
737 | " \n",
738 | " | document | \n",
739 | " | \n",
740 | "
\n",
741 | " \n",
742 | " \n",
743 | " \n",
744 | " | doc 1 | \n",
745 | " class 1 | \n",
746 | "
\n",
747 | " \n",
748 | " | doc 2 | \n",
749 | " class 2 | \n",
750 | "
\n",
751 | " \n",
752 | "
\n",
753 | "
"
754 | ],
755 | "text/plain": [
756 | " class\n",
757 | "document \n",
758 | "doc 1 class 1\n",
759 | "doc 2 class 2"
760 | ]
761 | },
762 | "execution_count": 7,
763 | "metadata": {},
764 | "output_type": "execute_result"
765 | }
766 | ],
767 | "source": [
768 | "labels = (df[['document', 'class']]\n",
769 | " .drop_duplicates()\n",
770 | " .set_index('document')\n",
771 | " )\n",
772 | "labels"
773 | ]
774 | },
775 | {
776 | "cell_type": "markdown",
777 | "metadata": {},
778 | "source": [
779 | "*Important*: when sending things to SciKit Learn, make sure the rows on the training data and labels are in the same order! Here, they are correct (e.g. doc1 is the first row both times, doc2 is the second row both times).\n",
780 | "\n",
781 | "If they were incorrect, you can take the index from the data (`wide_df.index`) and select the rows in labels to match that order, like this:"
782 | ]
783 | },
784 | {
785 | "cell_type": "code",
786 | "execution_count": null,
787 | "metadata": {
788 | "collapsed": false
789 | },
790 | "outputs": [
791 | {
792 | "data": {
793 | "text/html": [
794 | "\n",
795 | "
\n",
796 | " \n",
797 | " \n",
798 | " | \n",
799 | " class | \n",
800 | "
\n",
801 | " \n",
802 | " | document | \n",
803 | " | \n",
804 | "
\n",
805 | " \n",
806 | " \n",
807 | " \n",
808 | " | doc 1 | \n",
809 | " class 1 | \n",
810 | "
\n",
811 | " \n",
812 | " | doc 2 | \n",
813 | " class 2 | \n",
814 | "
\n",
815 | " \n",
816 | "
\n",
817 | "
"
818 | ],
819 | "text/plain": [
820 | " class\n",
821 | "document \n",
822 | "doc 1 class 1\n",
823 | "doc 2 class 2"
824 | ]
825 | },
826 | "execution_count": 8,
827 | "metadata": {},
828 | "output_type": "execute_result"
829 | }
830 | ],
831 | "source": [
832 | "labels.loc[wide_df.index]"
833 | ]
834 | }
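To make the ordering point concrete, a minimal sketch of handing the wide matrix and the aligned labels to a classifier (assuming scikit-learn is installed; with only two documents this only demonstrates that the shapes line up, not a meaningful model):

```python
from sklearn.naive_bayes import MultinomialNB

X = wide_df                              # documents x words, rows indexed by document
y = labels.loc[wide_df.index]['class']   # labels reordered to match X's rows

clf = MultinomialNB()
clf.fit(X, y)
clf.predict(X)   # predicts a class for each of the two documents
```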
835 | ],
836 | "metadata": {
837 | "kernelspec": {
838 | "display_name": "Python 3",
839 | "language": "python",
840 | "name": "python3"
841 | },
842 | "language_info": {
843 | "codemirror_mode": {
844 | "name": "ipython",
845 | "version": 3
846 | },
847 | "file_extension": ".py",
848 | "mimetype": "text/x-python",
849 | "name": "python",
850 | "nbconvert_exporter": "python",
851 | "pygments_lexer": "ipython3",
852 | "version": "3.5.1"
853 | }
854 | },
855 | "nbformat": 4,
856 | "nbformat_minor": 0
857 | }
858 |
--------------------------------------------------------------------------------
/labs/Lab 06 - More Pandas and Intro to Classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Lab 06\n",
8 | "\n",
9 | "This week, we'll continue with the tutorial on using the HTRC Extracted Features Dataset, through Python. Last week was the preparation, this week is the fun stuff!\n",
10 | "\n",
11 | "## Pandas and the Extracted Features Dataset, continued"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {
18 | "collapsed": true
19 | },
20 | "outputs": [],
21 | "source": [
22 | "import pandas as pd"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "### Method Chaining\n",
30 | "\n",
31 | "In Pandas, you may find yourself combining a number of Dataframe methods in a row. When the output of each step is a DataFrame, you don't have to save each step to a variable: you can 'chain' the commands. So, if you want to transfer a DataFrame called `original`:\n",
32 | "\n",
33 | "```python\n",
34 | "df1 = original.do_something()\n",
35 | "df2 = df1.do_something_else()\n",
36 | "df3 = df2.do_more()\n",
37 | "```\n",
38 | ", you can get the same result as follows:\n",
39 | "\n",
40 | "```python\n",
41 | "df3 = original.do_something().do_something_else().do_more()\n",
42 | "```\n",
43 | "\n",
44 | "You may see the benefit and the downside of method chaining above.\n",
45 | "\n",
46 | "The benefit: you're not saving intermediate DataFrames to variables. `df1` and `df2` were only necessary to get you to `df3`, so why even save them?\n",
47 | "\n",
48 | "The downside is less readability: yuck! This is fine for short chains, but for longer ones you still want the line breaks. That way, when you return to your code in the future, you can make sense of it (and so I can read it when marking!).\n",
49 | "\n",
50 | "To format chained methods better, you can wrap everything in braces, which tells Python that the current line of code isn't done until the braces end:\n",
51 | "\n",
52 | "```python\n",
53 | "(df3 = original.do_something()\n",
54 | " .do_something_else()\n",
55 | " .do_more()\n",
56 | ")\n",
57 | "```\n",
58 | "\n",
59 | "Much prettier. This style will be useful once things get more complex. Remember that you're not forced to use chaining: saving intermediate variables is fine, and can be helpful if you find a bug somewhere in the chain. However, you'll see it occasionally in example code, so it is good to understand what is happening."
60 | ]
61 | },
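For a concrete, runnable version of the pattern above (the `do_something` methods in the lab are placeholders), here is a small sketch with a toy DataFrame; both versions produce the same result:

```python
import pandas as pd

toy = pd.DataFrame({'token': ['a', 'b', 'a', 'c'],
                    'count': [3, 1, 2, 5]})

# Step by step, with intermediate variables
grouped = toy.groupby('token').sum()
step_by_step = grouped.sort_values('count', ascending=False)

# The same thing as a chain, wrapped in parentheses so it can span lines
chained = (toy.groupby('token')
              .sum()
              .sort_values('count', ascending=False)
          )

step_by_step.equals(chained)   # True
```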
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "### Slicing\n",
67 | "\n",
68 | "Following from last week's reading on [Text Mining in Python through the HTRC Feature Reader](http://programminghistorian.org/lessons/text-mining-with-extracted-features), we'll be continuing from the 'Slicing DataFrames' section to the end.\n",
69 | "\n",
70 | "First, lets reload the volume from last lab task."
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {
77 | "collapsed": false
78 | },
79 | "outputs": [
80 | {
81 | "data": {
82 | "text/plain": [
83 | ""
84 | ]
85 | },
86 | "execution_count": 2,
87 | "metadata": {},
88 | "output_type": "execute_result"
89 | }
90 | ],
91 | "source": [
92 | "from htrc_features import FeatureReader\n",
93 | "fr = FeatureReader('../data/mdp.49015002392919.json.bz2')\n",
94 | "vol = fr.first()\n",
95 | "vol"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "**Q1**: Fill in the blanks to produce the output show in the image below:\n",
103 | "\n",
104 | "```\n",
105 | "(vol.tokenlist(pages=**BLANK1**, pos=**BLANK2**, case=False)\n",
106 | " .loc[(\"body\", slice(None), \"**BLANK3**\"),]\n",
107 | " .sort_values(\"count\", ascending=**BLANK4**)\n",
108 | " .head(**BLANK5**)\n",
109 | ")\n",
110 | "```\n",
111 | "\n",
112 | "\n",
113 | "\n",
114 | "_Multiple Choice_\n",
115 | "1. True, False\n",
116 | "2. True, False\n",
117 | "3. slice(None), \"body\", \"RB\", \"NNP\"\n",
118 | "4. True, False\n",
119 | "5. 3, 5, 7"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "**Q2**: What is the code to get the token frequencies for page 39 of the book? You'll start with `tl = vol.tokenlist()`, what's next?"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "**Q3**: How would you get the five most frequent words tagged as a proper noun or a plural proper noun? Since the question doesn't involve page-level counts, you'll want to start with `tl = vol.tokenlist(pages=False)`."
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "### Grouping"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "**Q4**: What does the following code do?\n",
148 | "\n",
149 | "```python\n",
150 | "tl = vol.tokenlist()\n",
151 | "tl.groupby(level='page').count().sort_values('count', ascending=False)\n",
152 | "```\n",
153 | "\n",
154 | "How does it differ from the following?\n",
155 | "\n",
156 | "```python\n",
157 | "tl = vol.tokenlist()\n",
158 | "tl.groupby(level='page').sum().sort_values('count', ascending=False)\n",
159 | "```"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "**Q5** (2pts): Set a new variable to `vol.tokenlist().reset_index()`.\n",
167 | "\n",
168 | "**a)** What did `reset_index` do?\n",
169 | "**b)** How would you get run the summing code from above (i.e. the second example in Q4)?"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "**Q6**: Using the DataFrame from Q5, how would you select the rows with counts for the word `Tom`? Remember from the reading that 'slicing' is something done only on indexes - you learned to select based on a column value last week."
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "**Q7**: Using the result from Q6, figure out how to plot the counts of 'Tom' by page. The plot method for DataFrames takes `x` and `y` arguments. Share the code to produce this:\n",
184 | "\n",
185 | ""
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "### Pandas Series\n",
193 | "\n",
194 | "Where a Pandas DataFrame object is like a spreadsheet, with rows and columns, a Pandas Series object is like just one column: it is a sequence of just one value at a time. You can think of it as a supercharged list.\n",
195 | "\n",
196 | "To pull out a single column of a DataFrame as a Series, use square brackets to reference the column by name. Using the DataFrame from Q7, where the index has been reset to columns, here's an example:"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {
203 | "collapsed": false
204 | },
205 | "outputs": [
206 | {
207 | "data": {
208 | "text/plain": [
209 | "31992 of\n",
210 | "35613 Least\n",
211 | "16341 them\n",
212 | "4477 bear\n",
213 | "15935 finally\n",
214 | "Name: token, dtype: object"
215 | ]
216 | },
217 | "execution_count": 186,
218 | "metadata": {},
219 | "output_type": "execute_result"
220 | }
221 | ],
222 | "source": [
223 | "token_series = tl['token']\n",
224 | "\n",
225 | "# Show five random items from the series\n",
226 | "token_series.sample(5)"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "If you want to add a series to a DataFrame as a column, you can do the same in reverse:"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {
240 | "collapsed": false
241 | },
242 | "outputs": [
243 | {
244 | "data": {
245 | "text/html": [
246 | "\n",
247 | "
\n",
248 | " \n",
249 | " \n",
250 | " | \n",
251 | " page | \n",
252 | " section | \n",
253 | " token | \n",
254 | " pos | \n",
255 | " count | \n",
256 | " new_column | \n",
257 | "
\n",
258 | " \n",
259 | " \n",
260 | " \n",
261 | " | 11498 | \n",
262 | " 89 | \n",
263 | " body | \n",
264 | " then | \n",
265 | " RB | \n",
266 | " 2 | \n",
267 | " then | \n",
268 | "
\n",
269 | " \n",
270 | " | 32915 | \n",
271 | " 221 | \n",
272 | " body | \n",
273 | " No | \n",
274 | " UH | \n",
275 | " 2 | \n",
276 | " No | \n",
277 | "
\n",
278 | " \n",
279 | " | 22816 | \n",
280 | " 158 | \n",
281 | " body | \n",
282 | " sat | \n",
283 | " VBD | \n",
284 | " 1 | \n",
285 | " sat | \n",
286 | "
\n",
287 | " \n",
288 | " | 45905 | \n",
289 | " 297 | \n",
290 | " body | \n",
291 | " toward | \n",
292 | " IN | \n",
293 | " 1 | \n",
294 | " toward | \n",
295 | "
\n",
296 | " \n",
297 | " | 11934 | \n",
298 | " 93 | \n",
299 | " body | \n",
300 | " 73 | \n",
301 | " CD | \n",
302 | " 1 | \n",
303 | " 73 | \n",
304 | "
\n",
305 | " \n",
306 | "
\n",
307 | "
"
308 | ],
309 | "text/plain": [
310 | " page section token pos count new_column\n",
311 | "11498 89 body then RB 2 then\n",
312 | "32915 221 body No UH 2 No\n",
313 | "22816 158 body sat VBD 1 sat\n",
314 | "45905 297 body toward IN 1 toward\n",
315 | "11934 93 body 73 CD 1 73"
316 | ]
317 | },
318 | "execution_count": 187,
319 | "metadata": {},
320 | "output_type": "execute_result"
321 | }
322 | ],
323 | "source": [
324 | "tl['new_column'] = token_series\n",
325 | "tl.sample(5)"
326 | ]
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "metadata": {},
331 | "source": [
332 | "Tada!"
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "A series has a couple of useful features. For example, you can apply a function against each item with `apply`. If we wanted to get the length of every string (like we manually would do with `len('string')`, it's possible in this way:"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {
346 | "collapsed": false,
347 | "scrolled": true
348 | },
349 | "outputs": [
350 | {
351 | "data": {
352 | "text/plain": [
353 | "0 1\n",
354 | "1 1\n",
355 | "2 1\n",
356 | "3 4\n",
357 | "4 6\n",
358 | "Name: token, dtype: int64"
359 | ]
360 | },
361 | "execution_count": 188,
362 | "metadata": {},
363 | "output_type": "execute_result"
364 | }
365 | ],
366 | "source": [
367 | "token_series.apply(len).head()"
368 | ]
369 | },
370 | {
371 | "cell_type": "markdown",
372 | "metadata": {},
373 | "source": [
374 | "Is it clear what happened there? `apply` took the function we gave it, `len`, and for each value in the Series applied `len(value)`.\n",
375 | "\n",
376 | "If this was a list instead of a Series, the equivalent would be `[len(string) for string in list_of_strings]`.\n",
377 | "\n",
378 | "Just to be more clear, I'll add it as a column:"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {
385 | "collapsed": false
386 | },
387 | "outputs": [
388 | {
389 | "data": {
390 | "text/html": [
391 | "\n",
392 | "
\n",
393 | " \n",
394 | " \n",
395 | " | \n",
396 | " page | \n",
397 | " section | \n",
398 | " token | \n",
399 | " pos | \n",
400 | " count | \n",
401 | " new_column | \n",
402 | " token_length | \n",
403 | "
\n",
404 | " \n",
405 | " \n",
406 | " \n",
407 | " | 42327 | \n",
408 | " 276 | \n",
409 | " body | \n",
410 | " pockets—yet | \n",
411 | " NN | \n",
412 | " 1 | \n",
413 | " pockets—yet | \n",
414 | " 11 | \n",
415 | "
\n",
416 | " \n",
417 | " | 19755 | \n",
418 | " 140 | \n",
419 | " body | \n",
420 | " at | \n",
421 | " IN | \n",
422 | " 1 | \n",
423 | " at | \n",
424 | " 2 | \n",
425 | "
\n",
426 | " \n",
427 | " | 4554 | \n",
428 | " 46 | \n",
429 | " body | \n",
430 | " other | \n",
431 | " JJ | \n",
432 | " 1 | \n",
433 | " other | \n",
434 | " 5 | \n",
435 | "
\n",
436 | " \n",
437 | " | 13279 | \n",
438 | " 100 | \n",
439 | " body | \n",
440 | " ten | \n",
441 | " CD | \n",
442 | " 1 | \n",
443 | " ten | \n",
444 | " 3 | \n",
445 | "
\n",
446 | " \n",
447 | " | 12667 | \n",
448 | " 97 | \n",
449 | " body | \n",
450 | " carefully | \n",
451 | " RB | \n",
452 | " 1 | \n",
453 | " carefully | \n",
454 | " 9 | \n",
455 | "
\n",
456 | " \n",
457 | "
\n",
458 | "
"
459 | ],
460 | "text/plain": [
461 | " page section token pos count new_column token_length\n",
462 | "42327 276 body pockets—yet NN 1 pockets—yet 11\n",
463 | "19755 140 body at IN 1 at 2\n",
464 | "4554 46 body other JJ 1 other 5\n",
465 | "13279 100 body ten CD 1 ten 3\n",
466 | "12667 97 body carefully RB 1 carefully 9"
467 | ]
468 | },
469 | "execution_count": 189,
470 | "metadata": {},
471 | "output_type": "execute_result"
472 | }
473 | ],
474 | "source": [
475 | "tl['token_length'] = token_series.apply(len)\n",
476 | "tl.sample(5)"
477 | ]
478 | },
479 | {
480 | "cell_type": "markdown",
481 | "metadata": {},
482 | "source": [
483 | "Looks right!\n",
484 | "\n",
485 | "Another useful method of a Series is `value_counts`, which simply counts how often each value occurs:"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": null,
491 | "metadata": {
492 | "collapsed": false
493 | },
494 | "outputs": [
495 | {
496 | "data": {
497 | "text/plain": [
498 | "that 502\n",
499 | "\" 485\n",
500 | "'s 364\n",
501 | ". 297\n",
502 | "the 296\n",
503 | "Name: token, dtype: int64"
504 | ]
505 | },
506 | "execution_count": 190,
507 | "metadata": {},
508 | "output_type": "execute_result"
509 | }
510 | ],
511 | "source": [
512 | "token_series.value_counts().head()"
513 | ]
514 | },
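{
"cell_type": "markdown",
"metadata": {},
"source": [
"One thing to keep in mind: each row of `tl` is a unique page/section/part-of-speech combination, so `value_counts` is counting rows here (how many of those combinations a token appears in), not total word occurrences. If you wanted totals weighted by the `count` column, a `groupby` sum (as in Q4 and Q5) is one way. A quick sketch:\n",
"\n",
"```python\n",
"# Sketch: total occurrences per token, summing the 'count' column\n",
"tl.groupby('token')['count'].sum().sort_values(ascending=False).head()\n",
"```"
]
},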
515 | {
516 | "cell_type": "markdown",
517 | "metadata": {},
518 | "source": [
519 | "Finally, for a Series that specifically has strings, there are string methods. Try `token_series.str.` to see the autofill of what is possible.\n",
520 | "\n",
521 | "Going back to our ALTA filtering for `isalpha()`, we can quickly do the same here:"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": null,
527 | "metadata": {
528 | "collapsed": false
529 | },
530 | "outputs": [
531 | {
532 | "data": {
533 | "text/plain": [
534 | "0 False\n",
535 | "1 False\n",
536 | "2 False\n",
537 | "3 False\n",
538 | "4 False\n",
539 | "5 False\n",
540 | "6 True\n",
541 | "7 True\n",
542 | "8 True\n",
543 | "9 True\n",
544 | "Name: token, dtype: bool"
545 | ]
546 | },
547 | "execution_count": 191,
548 | "metadata": {},
549 | "output_type": "execute_result"
550 | }
551 | ],
552 | "source": [
553 | "is_alpha_matches = token_series.str.isalpha()\n",
554 | "is_alpha_matches.head(10)"
555 | ]
556 | },
557 | {
558 | "cell_type": "markdown",
559 | "metadata": {},
560 | "source": [
561 | "We saw in Lab 5 that supplying a set of True or False values to a DataFrame allows us to select rows. lets try it with the above Series:"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": null,
567 | "metadata": {
568 | "collapsed": false
569 | },
570 | "outputs": [
571 | {
572 | "data": {
573 | "text/html": [
574 | "\n",
575 | "
\n",
576 | " \n",
577 | " \n",
578 | " | \n",
579 | " page | \n",
580 | " section | \n",
581 | " token | \n",
582 | " pos | \n",
583 | " count | \n",
584 | " new_column | \n",
585 | " token_length | \n",
586 | "
\n",
587 | " \n",
588 | " \n",
589 | " \n",
590 | " | 0 | \n",
591 | " 3 | \n",
592 | " body | \n",
593 | " . | \n",
594 | " . | \n",
595 | " 1 | \n",
596 | " . | \n",
597 | " 1 | \n",
598 | "
\n",
599 | " \n",
600 | " | 1 | \n",
601 | " 3 | \n",
602 | " body | \n",
603 | " 0 | \n",
604 | " CD | \n",
605 | " 1 | \n",
606 | " 0 | \n",
607 | " 1 | \n",
608 | "
\n",
609 | " \n",
610 | " | 2 | \n",
611 | " 3 | \n",
612 | " body | \n",
613 | " 1 | \n",
614 | " CD | \n",
615 | " 1 | \n",
616 | " 1 | \n",
617 | " 1 | \n",
618 | "
\n",
619 | " \n",
620 | " | 3 | \n",
621 | " 3 | \n",
622 | " body | \n",
623 | " 2003 | \n",
624 | " CD | \n",
625 | " 1 | \n",
626 | " 2003 | \n",
627 | " 4 | \n",
628 | "
\n",
629 | " \n",
630 | " | 4 | \n",
631 | " 3 | \n",
632 | " body | \n",
633 | " 38-297 | \n",
634 | " CD | \n",
635 | " 1 | \n",
636 | " 38-297 | \n",
637 | " 6 | \n",
638 | "
\n",
639 | " \n",
640 | " | 5 | \n",
641 | " 3 | \n",
642 | " body | \n",
643 | " 4 | \n",
644 | " CD | \n",
645 | " 1 | \n",
646 | " 4 | \n",
647 | " 1 | \n",
648 | "
\n",
649 | " \n",
650 | " | 6 | \n",
651 | " 3 | \n",
652 | " body | \n",
653 | " DEMCO | \n",
654 | " NNP | \n",
655 | " 1 | \n",
656 | " DEMCO | \n",
657 | " 5 | \n",
658 | "
\n",
659 | " \n",
660 | " | 7 | \n",
661 | " 3 | \n",
662 | " body | \n",
663 | " M | \n",
664 | " NNP | \n",
665 | " 1 | \n",
666 | " M | \n",
667 | " 1 | \n",
668 | "
\n",
669 | " \n",
670 | " | 8 | \n",
671 | " 7 | \n",
672 | " body | \n",
673 | " LEATHER | \n",
674 | " NNP | \n",
675 | " 1 | \n",
676 | " LEATHER | \n",
677 | " 7 | \n",
678 | "
\n",
679 | " \n",
680 | " | 9 | \n",
681 | " 7 | \n",
682 | " body | \n",
683 | " LIMP | \n",
684 | " NNP | \n",
685 | " 1 | \n",
686 | " LIMP | \n",
687 | " 4 | \n",
688 | "
\n",
689 | " \n",
690 | "
\n",
691 | "
"
692 | ],
693 | "text/plain": [
694 | " page section token pos count new_column token_length\n",
695 | "0 3 body . . 1 . 1\n",
696 | "1 3 body 0 CD 1 0 1\n",
697 | "2 3 body 1 CD 1 1 1\n",
698 | "3 3 body 2003 CD 1 2003 4\n",
699 | "4 3 body 38-297 CD 1 38-297 6\n",
700 | "5 3 body 4 CD 1 4 1\n",
701 | "6 3 body DEMCO NNP 1 DEMCO 5\n",
702 | "7 3 body M NNP 1 M 1\n",
703 | "8 7 body LEATHER NNP 1 LEATHER 7\n",
704 | "9 7 body LIMP NNP 1 LIMP 4"
705 | ]
706 | },
707 | "execution_count": 192,
708 | "metadata": {},
709 | "output_type": "execute_result"
710 | }
711 | ],
712 | "source": [
713 | "tl.head(10)"
714 | ]
715 | },
716 | {
717 | "cell_type": "code",
718 | "execution_count": null,
719 | "metadata": {
720 | "collapsed": false
721 | },
722 | "outputs": [
723 | {
724 | "data": {
725 | "text/html": [
726 | "\n",
727 | "
\n",
728 | " \n",
729 | " \n",
730 | " | \n",
731 | " page | \n",
732 | " section | \n",
733 | " token | \n",
734 | " pos | \n",
735 | " count | \n",
736 | " new_column | \n",
737 | " token_length | \n",
738 | "
\n",
739 | " \n",
740 | " \n",
741 | " \n",
742 | " | 6 | \n",
743 | " 3 | \n",
744 | " body | \n",
745 | " DEMCO | \n",
746 | " NNP | \n",
747 | " 1 | \n",
748 | " DEMCO | \n",
749 | " 5 | \n",
750 | "
\n",
751 | " \n",
752 | " | 7 | \n",
753 | " 3 | \n",
754 | " body | \n",
755 | " M | \n",
756 | " NNP | \n",
757 | " 1 | \n",
758 | " M | \n",
759 | " 1 | \n",
760 | "
\n",
761 | " \n",
762 | " | 8 | \n",
763 | " 7 | \n",
764 | " body | \n",
765 | " LEATHER | \n",
766 | " NNP | \n",
767 | " 1 | \n",
768 | " LEATHER | \n",
769 | " 7 | \n",
770 | "
\n",
771 | " \n",
772 | " | 9 | \n",
773 | " 7 | \n",
774 | " body | \n",
775 | " LIMP | \n",
776 | " NNP | \n",
777 | " 1 | \n",
778 | " LIMP | \n",
779 | " 4 | \n",
780 | "
\n",
781 | " \n",
782 | " | 10 | \n",
783 | " 7 | \n",
784 | " body | \n",
785 | " MARK | \n",
786 | " NNP | \n",
787 | " 1 | \n",
788 | " MARK | \n",
789 | " 4 | \n",
790 | "
\n",
791 | " \n",
792 | "
\n",
793 | "
"
794 | ],
795 | "text/plain": [
796 | " page section token pos count new_column token_length\n",
797 | "6 3 body DEMCO NNP 1 DEMCO 5\n",
798 | "7 3 body M NNP 1 M 1\n",
799 | "8 7 body LEATHER NNP 1 LEATHER 7\n",
800 | "9 7 body LIMP NNP 1 LIMP 4\n",
801 | "10 7 body MARK NNP 1 MARK 4"
802 | ]
803 | },
804 | "execution_count": 193,
805 | "metadata": {},
806 | "output_type": "execute_result"
807 | }
808 | ],
809 | "source": [
810 | "tl[is_alpha_matches].head()"
811 | ]
812 | },
813 | {
814 | "cell_type": "markdown",
815 | "metadata": {},
816 | "source": [
817 | "It worked! Of the top ten rows, the only ones that are selected are solely alphabetical. Remember that `is_alpha_matches` is simple `tl['token'].str.isalpha()`, which could have been used for selection.\n",
818 | "\n",
819 | "Finally, one more string method, `lower()`:"
820 | ]
821 | },
822 | {
823 | "cell_type": "code",
824 | "execution_count": null,
825 | "metadata": {
826 | "collapsed": false
827 | },
828 | "outputs": [
829 | {
830 | "data": {
831 | "text/html": [
832 | "\n",
833 | "
\n",
834 | " \n",
835 | " \n",
836 | " | \n",
837 | " page | \n",
838 | " section | \n",
839 | " token | \n",
840 | " pos | \n",
841 | " count | \n",
842 | " new_column | \n",
843 | " token_length | \n",
844 | " lowercase | \n",
845 | "
\n",
846 | " \n",
847 | " \n",
848 | " \n",
849 | " | 44538 | \n",
850 | " 289 | \n",
851 | " body | \n",
852 | " time | \n",
853 | " NN | \n",
854 | " 1 | \n",
855 | " time | \n",
856 | " 4 | \n",
857 | " time | \n",
858 | "
\n",
859 | " \n",
860 | " | 20868 | \n",
861 | " 146 | \n",
862 | " body | \n",
863 | " top | \n",
864 | " NN | \n",
865 | " 1 | \n",
866 | " top | \n",
867 | " 3 | \n",
868 | " top | \n",
869 | "
\n",
870 | " \n",
871 | " | 16134 | \n",
872 | " 118 | \n",
873 | " body | \n",
874 | " matter | \n",
875 | " NN | \n",
876 | " 1 | \n",
877 | " matter | \n",
878 | " 6 | \n",
879 | " matter | \n",
880 | "
\n",
881 | " \n",
882 | " | 42279 | \n",
883 | " 276 | \n",
884 | " body | \n",
885 | " healing | \n",
886 | " NN | \n",
887 | " 1 | \n",
888 | " healing | \n",
889 | " 7 | \n",
890 | " healing | \n",
891 | "
\n",
892 | " \n",
893 | " | 23931 | \n",
894 | " 165 | \n",
895 | " body | \n",
896 | " effort | \n",
897 | " NN | \n",
898 | " 1 | \n",
899 | " effort | \n",
900 | " 6 | \n",
901 | " effort | \n",
902 | "
\n",
903 | " \n",
904 | "
\n",
905 | "
"
906 | ],
907 | "text/plain": [
908 | " page section token pos count new_column token_length lowercase\n",
909 | "44538 289 body time NN 1 time 4 time\n",
910 | "20868 146 body top NN 1 top 3 top\n",
911 | "16134 118 body matter NN 1 matter 6 matter\n",
912 | "42279 276 body healing NN 1 healing 7 healing\n",
913 | "23931 165 body effort NN 1 effort 6 effort"
914 | ]
915 | },
916 | "execution_count": 194,
917 | "metadata": {},
918 | "output_type": "execute_result"
919 | }
920 | ],
921 | "source": [
922 | "tl['lowercase'] = token_series.str.lower()\n",
923 | "tl.sample(5)"
924 | ]
925 | },
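{
"cell_type": "markdown",
"metadata": {},
"source": [
"Lowercasing is mostly useful for folding together capitalized and uncapitalized forms of a word (e.g. 'The' and 'the'). As a quick sketch using the columns created above, you could total the counts by lowercased form:\n",
"\n",
"```python\n",
"# Sketch: total counts per lowercased token\n",
"tl.groupby('lowercase')['count'].sum().sort_values(ascending=False).head()\n",
"```"
]
},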
926 | {
927 | "cell_type": "markdown",
928 | "metadata": {},
929 | "source": [
930 | "**Q8**: How is `token_series.str.istitle()` different from `token_series.str.isupper()`?"
931 | ]
932 | },
933 | {
934 | "cell_type": "markdown",
935 | "metadata": {},
936 | "source": [
937 | "**Q9**: Which of the following options returns the tokens that have a hyphen in them?\n",
938 | "\n",
939 | " 1. `tl[tl['token'].str.has('-')]`\n",
940 | " 2. `tl[tl['token'].str.contains('-')]`\n",
941 | " 3. `tl[tl['token'].contains('-')]`\n",
942 | " 4. `tl[tl['token'] == '-']`\n",
943 | " 5. None of the above"
944 | ]
945 | }
946 | ],
947 | "metadata": {
948 | "kernelspec": {
949 | "display_name": "Python 3",
950 | "language": "python",
951 | "name": "python3"
952 | },
953 | "language_info": {
954 | "codemirror_mode": {
955 | "name": "ipython",
956 | "version": 3
957 | },
958 | "file_extension": ".py",
959 | "mimetype": "text/x-python",
960 | "name": "python",
961 | "nbconvert_exporter": "python",
962 | "pygments_lexer": "ipython3",
963 | "version": "3.5.1"
964 | }
965 | },
966 | "nbformat": 4,
967 | "nbformat_minor": 0
968 | }
969 |
--------------------------------------------------------------------------------