├── .gitignore ├── GLOSSARY.md ├── README.md ├── SUMMARY.md ├── acknowledgements.md ├── archives.md ├── archives ├── exercises.md ├── nines.md └── tei.md ├── assets ├── Screen Shot 2016-08-25 at 10.33.39 AM.jpg ├── Screen Shot 2016-08-25 at 10.33.39 AM.png ├── Screen Shot 2016-08-25 at 10.33.54 AM.jpg ├── Screen Shot 2016-08-25 at 10.33.54 AM.png ├── Screen Shot 2016-08-25 at 10.34.08 AM.jpg ├── Screen Shot 2016-08-25 at 10.34.08 AM.png ├── archives │ ├── nines-crime-search.jpg │ ├── nines-federated.jpg │ ├── nines-old-bailey-search.jpg │ ├── nines-splash.jpg │ ├── old-bailey-sans-tei.jpg │ ├── old-bailey-tei.jpg │ ├── tei-graphic.jpg │ └── tei.jpg ├── bagel.jpg ├── bagel2.jpg ├── classifiers │ ├── authorship-function.jpg │ ├── authorship-lexical.jpg │ ├── authorship-punctuation.jpg │ ├── owl.jpg │ └── sleepingstarbuck.jpg ├── close-reading │ ├── prism-raven-font-size.jpg │ ├── prism-raven-highlights.jpg │ ├── prism-raven-winning-facet.jpg │ └── prism-splash-page.jpg ├── conclusion │ ├── clone-url.jpg │ ├── fork-button.jpg │ ├── gitbook-add-book.jpg │ ├── gitbook-repo-selection.jpg │ ├── gitbooks-clone.jpg │ ├── gitbooks-editor-interface.jpg │ ├── gitbooks-github-complete-import-template.jpg │ ├── gitbooks-import-github.jpg │ ├── gitbooks-sync.jpg │ ├── github-forking.jpg │ └── sentence-difficulty.jpg ├── crime-and-scandal-syllabus.txt ├── crowdsourcing │ ├── prism-create-one.jpg │ ├── prism-create-two.jpg │ ├── prism-future-stacked.jpg │ └── prism-myprisms.jpg ├── cyborg-readers │ ├── scandal-in-bohemia-word-cloud.jpg │ ├── stopword-free-concordance.jpg │ ├── voyant-frequency-graph.jpg │ ├── voyant-links.jpg │ ├── voyant-overview.jpg │ ├── voyant-phrases.jpg │ ├── voyant-settings.jpg │ ├── voyant-splash-page.jpg │ ├── voyant-stopwords.jpg │ ├── voyant-summary.jpg │ ├── voyant-term-frequencies.jpg │ ├── voyant-word-cloud-default.jpg │ └── voyant-word-cloud-dense.jpg ├── data-cleaning │ ├── data-cat-high-five.jpg │ ├── dog-blanket.jpeg │ ├── holmes-ocr-text.jpg │ ├── holmes-ocr.txt │ ├── holmes.jpg │ ├── zotero-add-citation.jpg │ ├── zotero-document-with-bibliography.jpg │ ├── zotero-download-from-chrome.jpg │ ├── zotero-download.jpg │ ├── zotero-editing-pane.jpg │ ├── zotero-example-citation.jpg │ ├── zotero-give-page-numbers-to-citation.jpg │ ├── zotero-input-by-isbn.jpg │ ├── zotero-input-from-web.jpg │ ├── zotero-magic-wand.jpg │ ├── zotero-menu-in-word.jpg │ ├── zotero-searching-for-citation.jpg │ ├── zotero-select-citation-style.jpg │ └── zotero-standalone.jpg ├── early-modern-stopwords.txt ├── image-convert.py ├── issues │ ├── english-crime-ngram.jpg │ ├── forms-of-scandal.jpg │ ├── french-crime-ngram.jpg │ ├── google-ngram-viewer.jpg │ ├── race.jpg │ ├── scandal.jpg │ ├── science-religion.jpg │ ├── visual-clarity.jpg │ └── wildcard-scandal.jpg ├── reading-at-scale │ ├── distant-reading-dinosaur.jpg │ ├── distant-reading-graphs.jpg │ ├── sweeney-said.jpg │ ├── voyant-collocates.jpg │ ├── voyant-contexts.jpg │ └── voyant-word-cloud-default.jpg ├── sentiment-analysis │ ├── emoji-sentiment-angry.jpg │ ├── emoji-sentiment-friday.jpg │ └── jockers-portrait.jpg ├── sleepingstarbuck.jpg ├── the-string-of-pearls-36-to-39.txt ├── the-string-of-pearls-full.txt └── topic-modeling │ ├── topic-modeling-french-german.jpg │ └── topic-modeling-highlights.jpg ├── book.json ├── classifiers.md ├── classifiers ├── classifying-texts.md ├── exercises.md └── supervised-classifiers.md ├── close-reading.md ├── close-reading ├── close-reading.md ├── exercises.md └── prism-part-one.md ├── conclusion.md ├── 
conclusion ├── adapting.md ├── resources.md └── where-to-go.md ├── contexts-and-claims.md ├── cover.jpg ├── crowdsourcing.md ├── crowdsourcing ├── crowdsourcing.md ├── exercises.md └── prism-part-two.md ├── cyborg-readers.md ├── cyborg-readers ├── computer-reading.md ├── exercises.md └── voyant-part-one.md ├── data-cleaning.md ├── data-cleaning ├── exercises.md ├── problems-with-data.md └── zotero.md ├── introduction.md ├── introduction ├── for-instructors.md ├── for-students.md └── schedule.md ├── issues-in-digital-text-analysis.md ├── issues ├── exercises.md ├── google-ngram.md └── why-read-with-a-computer.md ├── reading-at-scale.md ├── reading-at-scale ├── distant-reading.md ├── exercises.md └── voyant-part-two.md ├── schedule.md ├── sentiment-analysis.md ├── sentiment-analysis ├── exercises.md ├── sentiment-analysis-in-action.md └── sentiment-analysis.md ├── styles ├── ebook.css ├── epub.css ├── mobi.css ├── pdf.css ├── print.css └── website.css ├── topic-modeling.md └── topic-modeling ├── bags-of-words.md ├── exercises.md └── topic-modeling-case-study.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Node rules: 2 | ## Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 3 | .grunt 4 | 5 | ## Dependency directory 6 | ## Commenting this out is preferred by some people, see 7 | ## https://docs.npmjs.com/misc/faq#should-i-check-my-node_modules-folder-into-git 8 | node_modules 9 | 10 | # Book build output 11 | _book 12 | 13 | # eBook build output 14 | *.epub 15 | *.mobi 16 | *.pdf -------------------------------------------------------------------------------- /GLOSSARY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/GLOSSARY.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Note: We have moved platforms since releasing this book. To see the new and improved version of the book running off Jekyll, check it out [here](http://walshbr.com/textanalysiscoursebook/). Materials here may be slightly out of date. 2 | 3 | # Preface 4 | 5 | *(Note: We welcome feedback on this book! If you find an error, want clarification on a particular issue, or find deep problems with particular explanations, drop us a line on our [GitHub issues page](https://github.com/walshbr/textanalysiscoursebook/issues). We'll be grateful and list you in our [acknowledgements](acknowledgements.md)!)* 6 | 7 | This workbook provides a brief introduction to digital text analysis through a series of three-part units. Each unit introduces students to a concept, a tool for or method of digital text analysis, and a series of exercises for practicing the new skills. In some cases, studies of particular projects are presented instead of tools in the third section of each unit. 8 | 9 | The materials here are meant to form the basis for a digital text analysis course that does not require extensive training in programming and is written with student readers in mind. Originally developed for use in a course titled "Scandal, Crime, and Spectacle in the Nineteenth Century," this workbook draws from these course materials for its datasets and prompts. 
The book is intended to be modular enough that it could be used in conjunction with other courses, either in whole or in part, as all of its materials are [openly available on GitHub](https://github.com/walshbr/textanalysiscoursebook). The tripartite structure of each chapter means that sections can be easily removed and replaced with different tools or content. In particular, we envision the course-specific exercises in the third section of each chapter as the easiest pieces to remove and replace. For more guidance on how to remix the work for your own ends, see [Adapting This Book](/conclusion/adapting.md). 10 | 11 | The book is best viewed online using either Chrome or Firefox. You can also download it to read as a PDF [here](https://www.gitbook.com/book/bmw9t/introduction-to-text-analysis/details). 12 | 13 | *Introduction to Text Analysis: A Coursebook* by Brandon Walsh and Sarah Horowitz is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 14 | 15 | Creative Commons License
16 | -------------------------------------------------------------------------------- /SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | * [Preface](README.md) 4 | * [Acknowledgements](acknowledgements.md) 5 | * [Introduction](introduction.md) 6 | * [For Instructors](introduction/for-instructors.md) 7 | * [For Students](introduction/for-students.md) 8 | * [Schedule](introduction/schedule.md) 9 | * [Issues in Digital Text Analysis](issues-in-digital-text-analysis.md) 10 | * [Why Read with a Computer?](issues/why-read-with-a-computer.md) 11 | * [Google NGram Viewer](issues/google-ngram.md) 12 | * [Exercises](issues/exercises.md) 13 | * [Close Reading](close-reading.md) 14 | * [Close Reading and Sources](close-reading/close-reading.md) 15 | * [Prism Part One](close-reading/prism-part-one.md) 16 | * [Exercises](close-reading/exercises.md) 17 | * [Crowdsourcing](crowdsourcing.md) 18 | * [Crowdsourcing](crowdsourcing/crowdsourcing.md) 19 | * [Prism Part Two](crowdsourcing/prism-part-two.md) 20 | * [Exercises](crowdsourcing/exercises.md) 21 | * [Digital Archives](archives.md) 22 | * [Text Encoding Initiative](archives/tei.md) 23 | * [NINES and Digital Archives](archives/nines.md) 24 | * [Exercises](archives/exercises.md) 25 | * [Data Cleaning](data-cleaning.md) 26 | * [Problems with Data](data-cleaning/problems-with-data.md) 27 | * [Zotero](data-cleaning/zotero.md) 28 | * [Exercises](data-cleaning/exercises.md) 29 | * [Cyborg Readers](cyborg-readers.md) 30 | * [How Computers Read Texts](cyborg-readers/computer-reading.md) 31 | * [Voyant Part One](cyborg-readers/voyant-part-one.md) 32 | * [Exercises](cyborg-readers/exercises.md) 33 | * [Reading at Scale](reading-at-scale.md) 34 | * [Distant Reading](reading-at-scale/distant-reading.md) 35 | * [Voyant Part Two](reading-at-scale/voyant-part-two.md) 36 | * [Exercises](reading-at-scale/exercises.md) 37 | * [Topic Modeling](topic-modeling.md) 38 | * [Bags of Words](topic-modeling/bags-of-words.md) 39 | * [Topic Modeling Case Study](topic-modeling/topic-modeling-case-study.md) 40 | * [Exercises](topic-modeling/exercises.md) 41 | * [Classifiers](classifiers.md) 42 | * [Supervised Classifiers](classifiers/supervised-classifiers.md) 43 | * [Classifying Texts](classifiers/classifying-texts.md) 44 | * [Exercises](classifiers/exercises.md) 45 | * [Sentiment Analysis](sentiment-analysis.md) 46 | * [Sentiment Analysis](sentiment-analysis/sentiment-analysis.md) 47 | * [Sentiment Analysis in Action](sentiment-analysis/sentiment-analysis-in-action.md) 48 | * [Exercises](sentiment-analysis/exercises.md) 49 | * [Conclusion](conclusion.md) 50 | * [Where to Go Next](conclusion/where-to-go.md) 51 | * [Further Resources](conclusion/resources.md) 52 | * [Adapting This Book](conclusion/adapting.md) 53 | 54 | -------------------------------------------------------------------------------- /acknowledgements.md: -------------------------------------------------------------------------------- 1 | # Acknowledgements 2 | 3 | We are very grateful for the encouragement and feedback from all of our colleagues as we put these materials together.
In particular, we would like to thank the following individuals for their advice on specific portions of the book: 4 | 5 | * Mackenzie Brooks 6 | * Eliza Fox 7 | * Julie Kane 8 | * Eric Rochester 9 | * Demitra Tsioulos 10 | * Students from HIST 211 11 | 12 | The Roy Rosenzweig Center for History and New Media holds the copyright for the excerpts used in the [lesson](https://bmw9t.gitbooks.io/introduction-to-text-analysis/content/close-reading/close-reading.html) of the Close Reading chapter. The full text of the passage can be found on the CHNM site [here](http://chnm.gmu.edu/revolution/d/261/). 13 | 14 | In addition, the cover image is a word cloud generated by [Voyant](https://voyant-tools.org), an excellent tool developed by Stéfan Sinclair and Geoffrey Rockwell that we discuss in our chapters on "[Cyborg Readers](/cyborg-readers.md)" and "[Reading at Scale](/reading-at-scale.md)." -------------------------------------------------------------------------------- /archives.md: -------------------------------------------------------------------------------- 1 | # Digital Archives 2 | 3 | * [Text Encoding Initiative](/archives/tei.md) 4 | 5 | * [NINES and Digital Archives](/archives/nines.md) 6 | 7 | * [Exercises](/archives/exercises.md) -------------------------------------------------------------------------------- /archives/exercises.md: -------------------------------------------------------------------------------- 1 | # Exercises 2 | 3 | * Imagine you are marking the following [passage](http://www.victorianlondon.org/prisons/breakingwindows.htm) on prison life from [The Dictionary of Victorian London](http://www.victorianlondon.org/) in TEI. What elements would you tag or mark? \(No need to actually look up the valid TEI codes for such things - you can just invent fake tags for what you would be interested in.\) A sketch of one possible encoding appears after the questions below. 4 | 5 | --- 6 | 7 | Victorian London - Prisons - breaking windows to get into prison 8 | 9 | WINDOW BREAKING 10 | 11 | Sir, - Instances are now becoming more frequent of paupers preferring a prison to a workhouse, and resorting to the method of window breaking, as described in your police report of yesterday. Now, the law in its present state is merely an incentive to a repetition of the act; and, therefore, as it affords me no redress, I intend to take it into my own hands. I employ two porters on my premises, and have provided them with stout cudgels. If any pauper should deliberately break a large square of glass they will rush out, and thrash them most unmercifully. Where is the advantage in giving them into custody? By that means you confer a favour on the offender; and the very hour he is at liberty he will return and continue to repeat the offence until again incarcerated. It is no argument to tell us to use less expensive glass, as the pauper would soon find other means of accomplishing his object. What is required is this - and I ask the assistance of your all powerful pen in its favour - that a law should be passed condemning the perpetrator to a sound whipping and immediate discharge. 12 | 13 | I am, Sir, your obedient servant, A CITY TRADESMAN. 14 | 15 | _letter_ in The Times, January 5, 1850 16 | 17 | --- 18 | 19 | * If you could create an archive of some nineteenth-century materials, what would interest you and why? 20 | 21 | * What legal or proprietary issues would you have to sort out as part of the project? 22 | 23 | * Who do you imagine would be interested in your archive? 24 | 25 | * Would your site be open access or behind a paywall? Why?
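To make the exercise concrete, here is a minimal sketch of what such an encoding might look like, written in Python with the standard library's `xml.etree.ElementTree`. The tag and attribute names below (`letter`, `salute`, `signed`, and so on) are invented for illustration, as the exercise suggests, rather than drawn from the real TEI schema.

```python
import xml.etree.ElementTree as ET

# Invented, TEI-like tags for the window-breaking letter. The point is to
# decide which features of the document deserve explicit structure, not to
# match the actual TEI element set.
letter = ET.Element('letter', date='1850-01-05', publication='The Times')
ET.SubElement(letter, 'salute').text = 'Sir, -'
body = ET.SubElement(letter, 'body')
body.text = ('Instances are now becoming more frequent of paupers '
             'preferring a prison to a workhouse...')
ET.SubElement(letter, 'signed').text = 'A CITY TRADESMAN.'

# Print the encoded fragment to inspect the markup.
ET.dump(letter)
```

Tagging the date, the salutation, and the anonymous signature explicitly is what would later let an archive search letters like this one by date, author, or publication.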
26 | 27 | 28 | -------------------------------------------------------------------------------- /archives/nines.md: -------------------------------------------------------------------------------- 1 | # NINES and Digital Archives 2 | 3 | Our discussion of TEI has given you a sense of some of the work that can go into creating digital editions of texts. Indeed, encoding texts in this way is often the first step in a long process of preparing documents, historical or otherwise, for presentation and distribution on the web. In this section, we'll talk more about the stakes of putting digital collections like these online and help you understand some of the archives that are out there for you to use. 4 | 5 | One good reason to put a collection online is to increase access to the materials. After all, a manuscript kept in a museum requires that someone go to that location in order to read the document. An online version can reach a wider audience than a physical copy. However, putting materials on the internet raises a variety of legal and financial issues. After all, these digital resources require a great deal of time, funding, and energy to produce. Imagine you are the curator of an archive: 6 | 7 | * Will you make your materials freely available to anyone with an internet connection? 8 | * Will you require payment to see them? 9 | * Why? 10 | 11 | If you have ever tried to access a resource from an online newspaper only to be told that you need to subscribe to see its content, you have encountered such **paywalled** materials. Resources like these can be juxtaposed with **open access** materials. While there are different levels and variants, open access broadly means that the materials are available with little to no restrictions: you can read them without having to pay for them. For many, the choice is an ethical and a political one. But open access materials do raise serious financial questions: 12 | 13 | * Keeping materials online requires sustained funding over time. How can open access projects be maintained if their materials are presented for free? 14 | 15 | Once materials are put online, it is possible to connect them to a wider, global network of similar digital materials. In the same way that a library gathers information about its materials to organize them in a systematic way \(more on **metadata** in our lesson on "[Problems with Data](/data-cleaning/problems-with-data.md)"\), scholars and archivists have to oversee this process for this networking to happen. For instance, technical standards shift \(TEI tags can change over time\), so archival materials require constant maintenance. If you have ever used a digital archive, you have benefited from a vast and often invisible amount of labor happening behind the scenes. The hidden work of gallery, library, archive, and museum \(or **GLAM**\) professionals ensures that our cultural heritage will remain accessible and sustainable for centuries to come. 16 | 17 | ![nines splash page](/assets/archives/nines-splash.jpg) 18 | 19 | The **Networked Infrastructure for Nineteenth-Century Electronic Scholarship ([NINES](https://www.nines.org))** is one such digital humanities organization that attempts to facilitate the archiving process by gathering archived materials pertaining to the nineteenth century. You might think of NINES as something like a one-stop shop for all your nineteenth-century archival needs.
It gathers peer-reviewed projects on literature and culture built by different research teams around the globe; some focus on an individual author, others on a genre \(periodicals or "penny dreadfuls"\) or a particular issue \(disability or court cases\). If you go to the site and scroll through "Federated Websites," you'll see the range of projects you can access from NINES, from one on the eighteenth-century book trade in France to another featuring the letters of Emily Dickinson. For the purposes of this class, you'll notice that some of the projects will be extremely useful to you, such as the [Old Bailey Online](https://www.oldbaileyonline.org/), which contains trial records from London's central criminal court. Others, such as a project on the [journals of Lewis and Clark](http://lewisandclarkjournals.unl.edu/), won't be relevant for this class, but might be for others you are taking. 20 | 21 | You might also notice that NINES has a relatively expansive view of what the nineteenth century is, since this site includes projects that deal with the late eighteenth and early twentieth centuries. Historians often talk of the "long nineteenth century" as the period from the beginning of the French Revolution in 1789 to the outbreak of World War I in 1914. (In other words, historians of the nineteenth century like to claim big chunks of other people's time periods.) 22 | 23 | ![nines federation](/assets/archives/nines-federated.jpg) 24 | 25 | Archives submit themselves for affiliation with NINES so that their materials can be searchable alongside other NINES sites, but they must first pass a rigorous process of **peer review**. Academic journals rely on peer review to ensure that scholarship meets particular standards of rigor and relevance; it is a bit like quality control for scholarly writing. The peer review process typically involves submitting an article or book to a series of reviewers who write letters in support or rejection of the project and offer suggestions for improvement. The process is double-blind; the reviewers don't know who the authors are and vice versa. Should the piece pass, it moves on to publication and receives the explicit seal of approval from the publication. 26 | 27 | ![old bailey federated search results](/assets/archives/nines-old-bailey-search.jpg) 28 | 29 | **Why go through peer review?** 30 | 31 | Peer review sounds like a lot of work (and it is), but going through this process has benefits for you as an author. For one, it's a way to get suggestions for improvement. Scholars also see peer-reviewed projects as more prestigious than non-peer-reviewed works and, for the purposes of promotion, they "count" for more. Peer review allows faculty members to assure their colleagues that their work is worthy of funding. 32 | 33 | Digital projects, in particular, take an extraordinary amount of work and resources, so it makes sense that their contributors want credit for their work. But it can be difficult to evaluate something like an archive, since scholars are primarily trained to produce and evaluate secondary sources as opposed to primary-source repositories. Digital projects also require reviewers who understand not only the content but also the technical aspects of a project. 34 | 35 | One of the early missions of NINES was to facilitate peer review for digital projects.
By assembling digital humanities scholars who could evaluate digital archives and attest to their successes or flaws, project coordinators could better make their work available and understandable to colleagues who weren't working with digital material. So, say you worked on The Old Bailey Online and are up for a promotion at your institution; submitting this project to NINES for peer review is a way to make sure that your colleagues recognize the hard work you put into this project. Once reviewed, NINES makes the archival materials available for searching alongside other peer-reviewed projects. (You can see an example search of The Old Bailey Online [here](http://www.nines.org/search?q=old%20bailey). Because the Old Bailey's archival materials are part of NINES, a search for 'old bailey' in NINES reveals objects not only in NINES, but also in a wide range of other archives.) 36 | 37 | **What does peer review mean for you as a user of an archive?** 38 | 39 | If you've made it this far in life, you've probably realized that you can't trust everything you find on the internet. In this case, knowing that something is peer reviewed allows you to put more trust in what you find on NINES than what you find elsewhere; you know that other scholars in the field have signed off on this material and think it is a worthy project. 40 | 41 | **Why else should I use NINES?** 42 | 43 | Beyond the fact that you can have a lot of confidence in the projects you find here, NINES is going to make it easier for you to find things. For one, you might not have known about all these different projects. NINES has also made sure that these projects "play nice" with each other (a.k.a. interoperability), which means you can find references to a particular topic or word across these projects with a simple search. A toy sketch of how such a federated search works appears at the end of this section. 44 | 45 | ![nines crime search](/assets/archives/nines-crime-search.jpg) 46 | 47 | Doing a search for "crime" gets you all the references to this term in all of the different projects linked to NINES, saving you from having to search each individual archive. 48 | 49 | One warning: only some of the results you get in the left pane link to material from the online projects affiliated with NINES. In other cases, NINES is searching library catalogs where the material is not available digitally. In this instance, if you wanted to read the first work, Alexandre Dumas's _Celebrated Crimes_, you would have to drive to Charlottesville and go to UVA's Special Collections Library. 50 | 51 | * What archives do you use on a regular basis? 52 | 53 | * What kinds of work do you imagine went into them?
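For readers curious about what "playing nice" looks like under the hood, here is a toy sketch of a federated search in Python. The archive names and records below are fabricated placeholders, and real NINES aggregation works over shared metadata records rather than plain dictionaries, so treat this only as an illustration of the fan-out-and-merge idea.

```python
# A toy model of federated search: one query fans out to several archives
# and the hits come back merged, each labeled with its source. The archives
# and records here are invented placeholders, not real NINES data.
archives = {
    'Old Bailey Online': ['trial of John Smith for theft',
                          'crime statistics for London, 1834'],
    'Lewis and Clark Journals': ['journal entry, May 1804'],
    'Dickinson Electronic Archives': ['letter mentioning a local crime'],
}


def federated_search(query):
    """Return (archive, record) pairs whose records mention the query."""
    hits = []
    for name, records in archives.items():
        for record in records:
            if query.lower() in record.lower():
                hits.append((name, record))
    return hits


for archive, record in federated_search('crime'):
    print(archive + ': ' + record)
```

Because every archive answers the same kind of query, one search surfaces results from all of them at once, which is precisely the convenience the NINES interface provides.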
54 | 55 | -------------------------------------------------------------------------------- /assets/Screen Shot 2016-08-25 at 10.33.39 AM.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/Screen Shot 2016-08-25 at 10.33.39 AM.jpg -------------------------------------------------------------------------------- /assets/Screen Shot 2016-08-25 at 10.33.39 AM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/Screen Shot 2016-08-25 at 10.33.39 AM.png -------------------------------------------------------------------------------- /assets/Screen Shot 2016-08-25 at 10.33.54 AM.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/Screen Shot 2016-08-25 at 10.33.54 AM.jpg -------------------------------------------------------------------------------- /assets/Screen Shot 2016-08-25 at 10.33.54 AM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/Screen Shot 2016-08-25 at 10.33.54 AM.png -------------------------------------------------------------------------------- /assets/Screen Shot 2016-08-25 at 10.34.08 AM.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/Screen Shot 2016-08-25 at 10.34.08 AM.jpg -------------------------------------------------------------------------------- /assets/Screen Shot 2016-08-25 at 10.34.08 AM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/Screen Shot 2016-08-25 at 10.34.08 AM.png -------------------------------------------------------------------------------- /assets/archives/nines-crime-search.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/archives/nines-crime-search.jpg -------------------------------------------------------------------------------- /assets/archives/nines-federated.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/archives/nines-federated.jpg -------------------------------------------------------------------------------- /assets/archives/nines-old-bailey-search.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/archives/nines-old-bailey-search.jpg -------------------------------------------------------------------------------- /assets/archives/nines-splash.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/archives/nines-splash.jpg -------------------------------------------------------------------------------- /assets/archives/old-bailey-sans-tei.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/archives/old-bailey-sans-tei.jpg -------------------------------------------------------------------------------- /assets/archives/old-bailey-tei.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/archives/old-bailey-tei.jpg -------------------------------------------------------------------------------- /assets/archives/tei-graphic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/archives/tei-graphic.jpg -------------------------------------------------------------------------------- /assets/archives/tei.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/archives/tei.jpg -------------------------------------------------------------------------------- /assets/bagel.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/bagel.jpg -------------------------------------------------------------------------------- /assets/bagel2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/bagel2.jpg -------------------------------------------------------------------------------- /assets/classifiers/authorship-function.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/classifiers/authorship-function.jpg -------------------------------------------------------------------------------- /assets/classifiers/authorship-lexical.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/classifiers/authorship-lexical.jpg -------------------------------------------------------------------------------- /assets/classifiers/authorship-punctuation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/classifiers/authorship-punctuation.jpg -------------------------------------------------------------------------------- /assets/classifiers/owl.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/classifiers/owl.jpg -------------------------------------------------------------------------------- /assets/classifiers/sleepingstarbuck.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/classifiers/sleepingstarbuck.jpg -------------------------------------------------------------------------------- /assets/close-reading/prism-raven-font-size.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/close-reading/prism-raven-font-size.jpg -------------------------------------------------------------------------------- /assets/close-reading/prism-raven-highlights.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/close-reading/prism-raven-highlights.jpg -------------------------------------------------------------------------------- /assets/close-reading/prism-raven-winning-facet.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/close-reading/prism-raven-winning-facet.jpg -------------------------------------------------------------------------------- /assets/close-reading/prism-splash-page.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/close-reading/prism-splash-page.jpg -------------------------------------------------------------------------------- /assets/conclusion/clone-url.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/conclusion/clone-url.jpg -------------------------------------------------------------------------------- /assets/conclusion/fork-button.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/conclusion/fork-button.jpg -------------------------------------------------------------------------------- /assets/conclusion/gitbook-add-book.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/conclusion/gitbook-add-book.jpg -------------------------------------------------------------------------------- /assets/conclusion/gitbook-repo-selection.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/conclusion/gitbook-repo-selection.jpg -------------------------------------------------------------------------------- /assets/conclusion/gitbooks-clone.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/conclusion/gitbooks-clone.jpg -------------------------------------------------------------------------------- /assets/conclusion/gitbooks-editor-interface.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/conclusion/gitbooks-editor-interface.jpg -------------------------------------------------------------------------------- /assets/conclusion/gitbooks-github-complete-import-template.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/conclusion/gitbooks-github-complete-import-template.jpg -------------------------------------------------------------------------------- /assets/conclusion/gitbooks-import-github.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/conclusion/gitbooks-import-github.jpg -------------------------------------------------------------------------------- /assets/conclusion/gitbooks-sync.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/conclusion/gitbooks-sync.jpg -------------------------------------------------------------------------------- /assets/conclusion/github-forking.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/conclusion/github-forking.jpg -------------------------------------------------------------------------------- /assets/conclusion/sentence-difficulty.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/conclusion/sentence-difficulty.jpg -------------------------------------------------------------------------------- /assets/crowdsourcing/prism-create-one.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/crowdsourcing/prism-create-one.jpg -------------------------------------------------------------------------------- /assets/crowdsourcing/prism-create-two.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/crowdsourcing/prism-create-two.jpg -------------------------------------------------------------------------------- /assets/crowdsourcing/prism-future-stacked.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/crowdsourcing/prism-future-stacked.jpg 
-------------------------------------------------------------------------------- /assets/crowdsourcing/prism-myprisms.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/crowdsourcing/prism-myprisms.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/scandal-in-bohemia-word-cloud.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/scandal-in-bohemia-word-cloud.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/stopword-free-concordance.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/stopword-free-concordance.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/voyant-frequency-graph.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/voyant-frequency-graph.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/voyant-links.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/voyant-links.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/voyant-overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/voyant-overview.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/voyant-phrases.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/voyant-phrases.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/voyant-settings.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/voyant-settings.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/voyant-splash-page.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/voyant-splash-page.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/voyant-stopwords.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/voyant-stopwords.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/voyant-summary.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/voyant-summary.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/voyant-term-frequencies.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/voyant-term-frequencies.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/voyant-word-cloud-default.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/voyant-word-cloud-default.jpg -------------------------------------------------------------------------------- /assets/cyborg-readers/voyant-word-cloud-dense.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/cyborg-readers/voyant-word-cloud-dense.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/data-cat-high-five.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/data-cat-high-five.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/dog-blanket.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/dog-blanket.jpeg -------------------------------------------------------------------------------- /assets/data-cleaning/holmes-ocr-text.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/holmes-ocr-text.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/holmes-ocr.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | THESE EYES COULD 4 | 5 | READ THE muosr ‘ 6 | THOUGHTS OF THE ‘ 7 | GUILTY! \ 8 | 9 | Cool, canny. baffling. ‘ 10 | Shrrlock Holman was 11 | : figurc [0 be reek. 12 | oncd with by [he 13 | muun of lawleunm 14 | 15 | anymore 16 | 17 | Idol ofAmnica and ‘ 18 | prawn ado! of our 19 | time bring: a! but la 20 | man'un picmm [he 21 | most thrilling con. 
22 | 23 | aption n] all fiction 24 | 25 | ERLOCK 26 | SIIlIOLMES 27 | 28 | Ii-lyAIL-IW- My: 29 | “uni-WM 30 | 31 | Dm'lndufiwhanflml 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /assets/data-cleaning/holmes.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/holmes.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-add-citation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-add-citation.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-document-with-bibliography.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-document-with-bibliography.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-download-from-chrome.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-download-from-chrome.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-download.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-download.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-editing-pane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-editing-pane.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-example-citation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-example-citation.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-give-page-numbers-to-citation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-give-page-numbers-to-citation.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-input-by-isbn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-input-by-isbn.jpg 
-------------------------------------------------------------------------------- /assets/data-cleaning/zotero-input-from-web.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-input-from-web.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-magic-wand.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-magic-wand.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-menu-in-word.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-menu-in-word.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-searching-for-citation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-searching-for-citation.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-select-citation-style.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-select-citation-style.jpg -------------------------------------------------------------------------------- /assets/data-cleaning/zotero-standalone.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/data-cleaning/zotero-standalone.jpg -------------------------------------------------------------------------------- /assets/early-modern-stopwords.txt: -------------------------------------------------------------------------------- 1 | ! 2 | $ 3 | % 4 | & 5 | - 6 | . 
7 | 0 8 | 1 9 | 10 10 | 100 11 | 11 12 | 12 13 | 13 14 | 14 15 | 15 16 | 16 17 | 17 18 | 18 19 | 19 20 | 1990 21 | 1991 22 | 1992 23 | 1993 24 | 1994 25 | 1995 26 | 1996 27 | 1997 28 | 1998 29 | 1999 30 | 2 31 | 20 32 | 2000 33 | 2001 34 | 2002 35 | 2003 36 | 2004 37 | 2005 38 | 2006 39 | 2007 40 | 2008 41 | 2009 42 | 2010 43 | 2011 44 | 2012 45 | 2013 46 | 2014 47 | 2015 48 | 2016 49 | 2017 50 | 2018 51 | 2019 52 | 2020 53 | 21 54 | 22 55 | 23 56 | 24 57 | 25 58 | 26 59 | 27 60 | 28 61 | 29 62 | 3 63 | 30 64 | 31 65 | 32 66 | 33 67 | 34 68 | 35 69 | 36 70 | 37 71 | 38 72 | 39 73 | 4 74 | 40 75 | 41 76 | 42 77 | 43 78 | 44 79 | 45 80 | 46 81 | 47 82 | 48 83 | 49 84 | 5 85 | 50 86 | 51 87 | 52 88 | 53 89 | 54 90 | 55 91 | 56 92 | 57 93 | 58 94 | 59 95 | 6 96 | 60 97 | 61 98 | 62 99 | 63 100 | 64 101 | 65 102 | 66 103 | 67 104 | 68 105 | 69 106 | 7 107 | 70 108 | 71 109 | 72 110 | 73 111 | 74 112 | 75 113 | 76 114 | 77 115 | 78 116 | 8 117 | 80 118 | 81 119 | 82 120 | 83 121 | 84 122 | 85 123 | 86 124 | 87 125 | 88 126 | 89 127 | 9 128 | 90 129 | 91 130 | 92 131 | 93 132 | 94 133 | 95 134 | 96 135 | 97 136 | 98 137 | 99 138 | : 139 | ; 140 | < 141 | > 142 | @ 143 | \( 144 | \) 145 | \* 146 | \+ 147 | \? 148 | \[ 149 | \] 150 | \^ 151 | \{ 152 | \} 153 | a 154 | about 155 | above 156 | across 157 | after 158 | afterwards 159 | again 160 | against 161 | all 162 | almost 163 | alone 164 | along 165 | already 166 | also 167 | although 168 | always 169 | am 170 | among 171 | amongst 172 | amoungst 173 | amount 174 | an 175 | and 176 | another 177 | any 178 | anyhow 179 | anyone 180 | anything 181 | anyway 182 | anywhere 183 | are 184 | around 185 | as 186 | at 187 | b 188 | back 189 | be 190 | because 191 | bee 192 | been 193 | before 194 | beforehand 195 | being 196 | beside 197 | besides 198 | between 199 | both 200 | bottom 201 | but 202 | by 203 | c 204 | call 205 | can 206 | cannot 207 | cant 208 | co 209 | con 210 | could 211 | couldnt 212 | d 213 | de 214 | did 215 | didn't 216 | do 217 | doe 218 | does 219 | doesn't 220 | don't 221 | done 222 | doth 223 | down 224 | due 225 | during 226 | e 227 | each 228 | eg 229 | eight 230 | either 231 | eleven 232 | else 233 | elsewhere 234 | enough 235 | enter 236 | etc 237 | even 238 | ever 239 | every 240 | everyone 241 | everything 242 | everywhere 243 | except 244 | f 245 | few 246 | fifteen 247 | fify 248 | fill 249 | find 250 | fire 251 | first 252 | five 253 | for 254 | former 255 | formerly 256 | forty 257 | found 258 | four 259 | from 260 | front 261 | full 262 | further 263 | g 264 | get 265 | give 266 | go 267 | h 268 | had 269 | has 270 | hasnt 271 | hath 272 | have 273 | he 274 | hence 275 | her 276 | here 277 | hereafter 278 | hereby 279 | herein 280 | hereupon 281 | hers 282 | herself 283 | him 284 | himself 285 | his 286 | how 287 | however 288 | hundred 289 | i 290 | ie 291 | if 292 | in 293 | inc 294 | indeed 295 | into 296 | is 297 | it 298 | its 299 | itself 300 | j 301 | k 302 | keep 303 | l 304 | last 305 | latter 306 | latterly 307 | least 308 | less 309 | lo 310 | ltd 311 | m 312 | made 313 | many 314 | may 315 | me 316 | meanwhile 317 | might 318 | mill 319 | mine 320 | more 321 | moreover 322 | most 323 | mostly 324 | move 325 | much 326 | must 327 | my 328 | myself 329 | n 330 | name 331 | namely 332 | nay 333 | neither 334 | never 335 | nevertheless 336 | next 337 | nine 338 | no 339 | nobody 340 | none 341 | noone 342 | nor 343 | not 344 | nothing 345 | now 346 | nowhere 347 | o 348 | of 349 | off 350 | often 351 | on 352 
| once 353 | one 354 | only 355 | onto 356 | or 357 | other 358 | others 359 | otherwise 360 | our 361 | ours 362 | ourselves 363 | out 364 | over 365 | own 366 | p 367 | part 368 | per 369 | perhaps 370 | please 371 | put 372 | q 373 | r 374 | rather 375 | re 376 | s 377 | same 378 | see 379 | seem 380 | seemed 381 | seeming 382 | seems 383 | serious 384 | several 385 | she 386 | should 387 | since 388 | sir 389 | six 390 | sixty 391 | so 392 | some 393 | somehow 394 | someone 395 | something 396 | sometime 397 | sometimes 398 | somewhere 399 | still 400 | such 401 | system 402 | t 403 | take 404 | ten 405 | than 406 | that 407 | the 408 | thee 409 | their 410 | them 411 | themselves 412 | then 413 | thence 414 | there 415 | thereafter 416 | thereby 417 | therefore 418 | therein 419 | thereupon 420 | these 421 | they 422 | thing 423 | third 424 | this 425 | those 426 | thou 427 | though 428 | three 429 | through 430 | throughout 431 | thru 432 | thus 433 | thy 434 | to 435 | together 436 | too 437 | toward 438 | towards 439 | twelve 440 | twenty 441 | two 442 | u 443 | un 444 | under 445 | until 446 | up 447 | upon 448 | us 449 | v 450 | very 451 | via 452 | w 453 | was 454 | we 455 | well 456 | were 457 | what 458 | whatever 459 | when 460 | whence 461 | whenever 462 | where 463 | whereafter 464 | whereas 465 | whereby 466 | wherein 467 | whereupon 468 | wherever 469 | whether 470 | which 471 | while 472 | whither 473 | who 474 | whoever 475 | whole 476 | whom 477 | whose 478 | why 479 | will 480 | with 481 | within 482 | without 483 | would 484 | x 485 | y 486 | ye 487 | yet 488 | you 489 | your 490 | yours 491 | yourself 492 | yourselves 493 | z 494 | | 495 | || 496 | shall 497 | unto 498 | vs 499 | wee 500 | hee 501 | like 502 | bee 503 | doth 504 | nay -------------------------------------------------------------------------------- /assets/image-convert.py: --------------------------------------------------------------------------------
import os
from PIL import Image


def all_files(dirname):
    # Walk the directory tree and yield every file path beneath it.
    for (root, _, files) in os.walk(dirname):
        for fn in files:
            yield os.path.join(root, fn)


def convert_image(fn):
    # Convert one image to JPEG and remove the original.
    img = Image.open(fn)
    target = os.path.splitext(fn)[0] + '.jpg'
    print(target)
    # JPEG cannot store an alpha channel, so flatten RGBA or paletted
    # PNGs to RGB before saving; otherwise Pillow raises an OSError.
    img.convert('RGB').save(target)
    os.remove(fn)


def main():
    # Collect the PNG paths first so the tree is not being mutated
    # while os.walk is still iterating over it.
    fns = []
    for fn in all_files('.'):
        if os.path.splitext(fn)[1] == '.png':
            fns.append(fn)

    for fn in fns:
        convert_image(fn)


if __name__ == '__main__':
    main()
-------------------------------------------------------------------------------- /assets/issues/english-crime-ngram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/issues/english-crime-ngram.jpg -------------------------------------------------------------------------------- /assets/issues/forms-of-scandal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/issues/forms-of-scandal.jpg -------------------------------------------------------------------------------- /assets/issues/french-crime-ngram.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/issues/french-crime-ngram.jpg -------------------------------------------------------------------------------- /assets/issues/google-ngram-viewer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/issues/google-ngram-viewer.jpg -------------------------------------------------------------------------------- /assets/issues/race.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/issues/race.jpg -------------------------------------------------------------------------------- /assets/issues/scandal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/issues/scandal.jpg -------------------------------------------------------------------------------- /assets/issues/science-religion.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/issues/science-religion.jpg -------------------------------------------------------------------------------- /assets/issues/visual-clarity.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/issues/visual-clarity.jpg -------------------------------------------------------------------------------- /assets/issues/wildcard-scandal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/issues/wildcard-scandal.jpg -------------------------------------------------------------------------------- /assets/reading-at-scale/distant-reading-dinosaur.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/reading-at-scale/distant-reading-dinosaur.jpg -------------------------------------------------------------------------------- /assets/reading-at-scale/distant-reading-graphs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/reading-at-scale/distant-reading-graphs.jpg -------------------------------------------------------------------------------- /assets/reading-at-scale/sweeney-said.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/reading-at-scale/sweeney-said.jpg -------------------------------------------------------------------------------- /assets/reading-at-scale/voyant-collocates.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/reading-at-scale/voyant-collocates.jpg -------------------------------------------------------------------------------- /assets/reading-at-scale/voyant-contexts.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/reading-at-scale/voyant-contexts.jpg -------------------------------------------------------------------------------- /assets/reading-at-scale/voyant-word-cloud-default.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/reading-at-scale/voyant-word-cloud-default.jpg -------------------------------------------------------------------------------- /assets/sentiment-analysis/emoji-sentiment-angry.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/sentiment-analysis/emoji-sentiment-angry.jpg -------------------------------------------------------------------------------- /assets/sentiment-analysis/emoji-sentiment-friday.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/sentiment-analysis/emoji-sentiment-friday.jpg -------------------------------------------------------------------------------- /assets/sentiment-analysis/jockers-portrait.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/sentiment-analysis/jockers-portrait.jpg -------------------------------------------------------------------------------- /assets/sleepingstarbuck.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/sleepingstarbuck.jpg -------------------------------------------------------------------------------- /assets/topic-modeling/topic-modeling-french-german.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/topic-modeling/topic-modeling-french-german.jpg -------------------------------------------------------------------------------- /assets/topic-modeling/topic-modeling-highlights.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/assets/topic-modeling/topic-modeling-highlights.jpg -------------------------------------------------------------------------------- /book.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | } 4 | -------------------------------------------------------------------------------- /classifiers.md: -------------------------------------------------------------------------------- 1 | # Classifiers 2 | 3 | * 
[Supervised Classifiers](/classifiers/supervised-classifiers.md) 4 | 5 | * [Classifying Texts](/classifiers/classifying-texts.md) 6 | 7 | * [Exercises](/classifiers/exercises.md) -------------------------------------------------------------------------------- /classifiers/classifying-texts.md: -------------------------------------------------------------------------------- 1 | # Classifying Texts 2 | 3 | At this point, you might be saying, "Supervised classification is all well and good, but how does this relate to text analysis? I'm going to go back to googling for animal photos." 4 | 5 | Stop right there! We've got one for you. If you think you're tired, how do you think this dog feels? Impersonating bagels is exhausting. 6 | 7 | ![sarah's sleeping dog](/assets/classifiers/sleepingstarbuck.jpg) 8 | 9 | Now that you're back and not going anywhere, we should acknowledge that your point is a good one. We wanted to stay relatively simple in the last lesson so that you could get a handle on the basics of supervised classification, but let's think about the ways you could apply this method to texts. The [NLTK book](http://www.nltk.org/book/ch06.html) \(which you should check out if you want to go deeper into text analysis\) lists some common text classification tasks: 10 | 11 | > * Deciding whether an email is spam or not. 12 | > * Deciding what the topic of a news article is, from a fixed list of topic areas such as "sports," "technology," and "politics." 13 | > * Deciding whether a given occurrence of the word bank is used to refer to a river bank, a financial institution, the act of tilting to the side, or the act of depositing something in a financial institution. 14 | 15 | Let's break each of these tasks down. Remember, a supervised classifier relies on labeled data for a training set. The sample data that you'll be using depends directly on the type of problem you are interested in. You could work backwards and figure out what kind of training data you would need from the question you are interested in: 16 | 17 | * To decide whether an email is spam, you will need lots of examples of junk email. 18 | * To tag a news article as belonging to a particular category, you will need examples of articles from each of those categories. 19 | * To determine the use of the word 'bank,' you will need examples of the word used in all these possible contexts. 20 | 21 | In each case, it's not enough to just dump data into the classifier. You would also have to decide what feature sets you want to examine for the training sets for each task. Take the task of building a spam filter. To determine whether or not a text is spam, you would need to decide what features you find to be indicative of junk mail. And you have many options! Here are just a few, with a sketch of the first one after this list: 22 | 23 | * You might decide that word choice is indicative of spam. An email that reads "Buy now! Click this link to see an urgent message!" is probably junk. So you'd need to break up your representative spam texts into tokenized lists of vocabulary. From there you would work to give the classifier a sense of those words likely to result in unwanted messages. 24 | * You might notice that all your spam notifications come from similar email addresses. You could train the classifier to identify certain email addresses, pull out those which have known spam addresses, and tag them as spam.
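To make the word-choice approach concrete, here is a minimal sketch in Python using NLTK's naive Bayes classifier from the previous lesson. The tiny training set and the list of suspicious words are invented for illustration; a real spam filter would train on thousands of labeled emails:

```
import nltk

# A tiny, invented training set of (email text, label) pairs.
training_emails = [
    ("Buy now! Click this link to see an urgent message!", 'spam'),
    ("URGENT: claim your free prize before midnight", 'spam'),
    ("Meeting moved to Thursday, agenda attached", 'not spam'),
    ("Here is the draft of my essay for your comments", 'not spam'),
]

# Words we suspect are indicative of junk mail.
SPAM_WORDS = ['buy', 'click', 'urgent', 'free', 'prize']

def email_features(text):
    """Tokenize an email and build a feature set based on word choice."""
    words = [w.strip('!.,:').lower() for w in text.split()]
    features = {'contains(%s)' % w: (w in words) for w in SPAM_WORDS}
    features['has_exclamation'] = '!' in text
    return features

train_set = [(email_features(text), label) for text, label in training_emails]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(classifier.classify(email_features("Click here for a free prize!")))
# With features like these, the classifier should answer 'spam'.
```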
25 | 26 | You could certainly come up with other approaches. In any case, you would need to think about a series of questions common to all text analysis projects: 27 | 28 | * What is my research question? 29 | * How can my large question be broken down into smaller pieces? 30 | * Which of those can be measured by the computer? 31 | * What kinds of example data do I need for this problem? What kinds do I already have in my possession? 32 | 33 | Going through these questions can be difficult at first, but, with practice, you will be able to separate feasible digital humanities questions from those that are impossible to answer. You will start to gain a sense of what could be measured and analyzed as well as figure out whether or not you might want to do so at all. You will also start to get a sense of what kind of projects are interesting and worth pursuing. 34 | 35 | Now, let's try a supervised approach to a common problem in text analysis: authorship attribution. Sometimes texts come down to us with no authors at all attributed to them, and we might want to know who wrote them. Or maybe a single text might be written under a pseudonym, but you might have a good guess as to who might be the author. You could approach this problem in a variety of unsupervised ways, graphing the similarity or difference of particular authors based on a number of algorithms. But if you have a pretty good guess as to who the author of a particular text might be, you can take a supervised approach to the problem. To step through the same list of questions: 36 | 37 | * What is my research question? 38 | * We want to be able to identify the unknown author of a text. 39 | * How can my large question be broken down into smaller pieces? 40 | * We have a reasonable guess as to some possible authors, so we can use those as objects of study. We are also assuming that authorship can be associated with style. 41 | * Which of those can the computer measure? 42 | * Well, style is the sum total of vocabulary, punctuation, and rhetorical patterns, among other things. Those can all be counted! 43 | * What kind of example data do we have that we can use for this problem? 44 | * We have the unknown text. And we also have several texts by our potential authors that we can compare against it. 45 | 46 | To illustrate this experiment, we took two authors from our syllabus: Danielle Bowler and Pia Glenn. Using their author pages on [Eyewitness News](http://ewn.co.za/Contributors/Danielle-Bowler) and [xoJane](http://www.xojane.com/author/pia-glenn), we gathered articles that belonged to each. Bowler tends to write shorter pieces than Glenn, so our training set included twice as many pieces by Bowler (10) as by Glenn (5). With this body of training data for each author, we uploaded the texts to a great online [authorship attribution tool](http://aicbt.com/authorship-attribution/online-software/) by AICBT. The tool allows you to upload sample data for two authors. With this set, you can then upload a text by an unknown author, and the software will try to guess who wrote it based on a variety of text analysis protocols. In this case, the mystery text was "[Freedom, Justice, and John Legend](http://ewn.co.za/2015/02/23/OPINION-Danielle-Bowler-Freedom-justice-and-John-Legend)" by Bowler. Author 1 is Glenn, and Author 2 is Bowler. The tool attempted to identify the author of the mystery text as follows. The images also include AICBT's helpful explanation of the different metrics that the tool uses to analyze the unknown text.
47 | 48 | ![authorship function](/assets/classifiers/authorship-function.jpg) 49 | ![authorship lexical](/assets/classifiers/authorship-lexical.jpg) 50 | ![authorship punctuation](/assets/classifiers/authorship-punctuation.jpg) 51 | 52 | This text is actually by Author 2, so for the classifier to be accurate according to these measures, the arrow should point towards the right. You'll immediately see that we have some success, but also one failure! Function word analysis is a slight indicator of the correct author, lexical analysis is virtually useless, and punctuation analysis is way _wrong_. In a real classification project, we would want to use the success or failure of our classifier to revise our sense of which features are useful for our particular project. In this case, punctuation is not a good measure at all, so we would throw that out. Instead, we might focus on function words as an indicator of authorship. We can tweak our approach accordingly. But measures like these are only ever working probabilistically. We can say that the mystery text _might_ be written by our second author, but only in rare cases could we ever know for certain. 53 | 54 | Note also how these measures of authorship overlap with other things we have studied in this book. Remember stopwords, those words that are so common in a text that they are frequently removed before text analysis? In cases like this one, we actually care a lot about these simple terms. Two of the three measures for authorship here deal with just those words that we might otherwise throw away: punctuation, articles, etc. These words might not tell you much about the subject of a text, but they can tell you an awful lot about _how_ a text discusses its subject. 55 | 56 | Take a text that we've talked a lot about in this course: _The String of Pearls_. This penny dreadful was published in weekly installments and was written \(we think\) by James Malcolm Rymer and Thomas Peckett Prest. But the work was published anonymously, so we don't know which author wrote which chapter \(or even if Rymer and Prest wrote the novel\). 57 | 58 | For the purposes of this demonstration, let's assume that we know that Rymer wrote Chapter One and Prest wrote Chapter Two. So who wrote Chapter Thirty-Three? If we go back to our author attribution tool and copy Chapter One into the box for Author 1 and Chapter Two into the box for Author 2, here's what we get for Chapter Thirty-Three: 59 | 60 | ![](/assets/Screen Shot 2016-08-25 at 10.33.39 AM.jpg) 61 | 62 | ![](/assets/Screen Shot 2016-08-25 at 10.33.54 AM.jpg) 63 | 64 | ![](/assets/Screen Shot 2016-08-25 at 10.34.08 AM.jpg) 65 | 66 | Here, it looks like the tool is trending towards Rymer as the author of the chapter, but we're mainly dealing with uncertainty. But that uncertainty itself is pretty interesting! Maybe what this is showing us is that the authors had a pretty similar style. Or maybe both authors had a hand in each chapter, and our training set is not particularly useful. If we had larger bodies of text by each author, we might have better luck. We might want to drill down further and investigate the uses of punctuation in different chapters or the lexical diversity, word length, and sentence length in much more detail, as in the sketch that follows these questions. 67 | 68 | * Are other penny dreadfuls similar to _The String of Pearls_ in these respects? 69 | 70 | * If so, what differentiates the style of these works from other types of serial novels?
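Measures like lexical diversity, word length, and punctuation rates are easy enough to compute yourself. Here is a minimal Python sketch of a stylometric profile; the file names are placeholders for whichever chapters you want to compare, and the handful of function words is an arbitrary sample:

```
import string

def stylometric_profile(text):
    """Compute a few simple stylometric measures for a passage."""
    words = [w.strip(string.punctuation).lower() for w in text.split()]
    words = [w for w in words if w]

    # A handful of common function words, often useful for attribution.
    function_words = ['the', 'of', 'and', 'a', 'to', 'in', 'that']

    return {
        # Proportion of distinct words in the passage.
        'lexical_diversity': len(set(words)) / len(words),
        # Average word length, in characters.
        'avg_word_length': sum(len(w) for w in words) / len(words),
        # Punctuation marks per word.
        'punctuation_rate': sum(text.count(p) for p in ',;:!?') / len(words),
        # Function words per word.
        'function_word_rate': sum(words.count(f) for f in function_words) / len(words),
    }

# Placeholder file names; substitute your known and mystery chapters.
known = open('chapter-one.txt').read()
mystery = open('chapter-thirty-three.txt').read()
print(stylometric_profile(known))
print(stylometric_profile(mystery))
```

Profiles that look alike do not prove shared authorship, but comparing them across chapters is exactly the kind of drilling down described above.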
71 | 72 | Similar processes have been used for a variety of authorship attribution cases. The most famous one in recent times is probably that of Robert Galbraith, who came out with _The Cuckoo's Calling_ in 2013. Using a similar process of measuring linguistic similarity, Patrick Juola was able to test the hypothesis that J.K. Rowling had released the detective novel under a pseudonym. You can read more about the process [here](http://www.scientificamerican.com/article/how-a-computer-program-helped-show-jk-rowling-write-a-cuckoos-calling/). 73 | 74 | If we can measure it, we can test it. And you would be surprised at just how many humanities-based research questions we can measure. Taking complicated human concepts like authorship and breaking them down into quantifiable pieces is part of the fun. It is also what makes the process intellectually interesting. If it were easy, it would be boring. 75 | 76 | We have just barely scratched the surface of the field of **stylometry**, or the study of linguistic style using a variety of statistical metrics. You can carry out this research process using a variety of programming languages, so you might take a look at our concluding chapter on [Where to Go Next](/conclusion/where-to-go.md) if you are interested in learning how to implement these sorts of experiments yourself. 77 | 78 | -------------------------------------------------------------------------------- /classifiers/exercises.md: -------------------------------------------------------------------------------- 1 | # Exercises 2 | 3 | Both W. T. Stead and Edward Tyas Cook were working for the _Pall Mall Gazette_ at the time of the Jack the Ripper murders. [Use AICBT's online authorship attribution tool](http://aicbt.com/authorship-attribution/online-software/) to figure out which one was covering the case. [Here](http://www.casebook.org/press_reports/pall_mall_gazette/) is a link to the newspaper articles for you to test; be sure to find at least two that aren't just reproducing someone else's writing \(like a letter to the editor\). 4 | 5 | [Here](http://www.attackingthedevil.co.uk/steadworks/) is a link to W. T. Stead's works. You can find one of Cook's works [here](https://archive.org/stream/lifeofflorenceni01cookuoft/lifeofflorenceni01cookuoft_djvu.txt). 6 | 7 | * What articles from the _Pall Mall Gazette_ did you choose? Why did you choose them? 8 | * What excerpts from Cook and Stead's writing did you choose? Why did you choose those excerpts? \(Remember, the more data you give the tool to train on, the more likely you are to get accurate results.\) 9 | * What did you find using the attribution tool? Are you coming up with clear results for either article? 10 | * How confident are you that you can attribute either article to Cook or Stead? Why or why not? 11 | 12 | -------------------------------------------------------------------------------- /classifiers/supervised-classifiers.md: -------------------------------------------------------------------------------- 1 | # Supervised Classifiers 2 | 3 | In the lesson on our [Topic Modeling Case Study](/topic-modeling/topic-modeling-case-study.md), we talked about unsupervised classifiers. When topic modeling explored texts to find the underlying discourses at work within them, our texts were not labeled in any way. We did not say, "topic modeler, go out and search for a topic called 'medicine.' A medicine topic will consist primarily of the words 'anatomy,' 'science,' 'hospital,' etc. Let me know what else you find!"
Instead, the topic modeling software came up with groups of words that it thought were related, with relatively little input from us. This has the advantage of showing us patterns that we might not even know are there. Topic modeling is useful for exploring a corpus and discovering new things about it. 4 | 5 | You could think of unsupervised classifiers as similar to a [Roomba](https://www.youtube.com/watch?v=A0Z79ycisDU). You hit a button, and the tiny little robot dutifully goes out and starts cleaning your floor. It knows when it reaches walls and corners that it should try to scoot around them. And its cleaning brushes are spinning furiously the whole time. You haven't told the machine how to clean, or how to navigate your floor. You just push the button and trust that it has inherent assumptions and protocols that it will follow. That covers the unsupervised part, but an unsupervised _classifier_ is obviously more sophisticated and different in kind. Instead of cleaning your floor, topic modeling uses statistics to sort the words in your texts in such a way that you can get a sense of underlying patterns in word usage. 6 | 7 | Let's try another example, adapted from Lisa Rhody's [farmers' market game](https://github.com/lmrhody/topicmodelgame) that teaches topic modeling. Imagine you have a bag with apples, oranges, and bananas in it. Imagine you also don't have any idea what an apple, an orange, or a banana is. Now we tell you to sort the things in the bag. You can probably still sort the fruit even without knowing anything about them, and you would do so by creating three piles. You take the first item and place it into a pile on its own. Pull out a second item. Does it look similar to the first? No? It gets a new pile. Third item? Looks like the first one, so it goes next to that one. Each time you pull a new item, you compare it to all the others and revise your piles accordingly. At the end, you'll have three different piles, organized by fruit. But you didn't know anything about those fruits ahead of time. Topic modeling employs a few other variables in the process, so check out Rhody's lesson to learn more. For now, we will move on. 8 | 9 | Now imagine, instead, that we give you a slightly different exercise. We give you a bag filled with dragon fruit, star fruit, and durian. Imagine that you don't know anything about these fruits. We say, "Find me all the durian." You could sort the fruit into piles all day long, but, if you don't know anything about durian, you won't be able to pick out the fruit you need. So we give you a little **training** by first bringing in ten examples of durian for you to study. We say, "Look at them. Study them. Pay attention to these characteristics: durian have spikes, they are big, and they are yellow-ish." We might also give you several examples of non-durian fruit so that you can get a sense of what durian doesn't look like. This set of fruit, where we tell you the correct labels, is called our **training set**. Now, you have something to work with! You pull a fruit. No spikes. So you start a pile called not durian. The next one has spikes. Durian! You keep going until you have two piles, one that contains fruit you think are durian and one that contains fruit you think are not. 10 | 11 | This kind of classification is called **supervised classification**. You had to be taught what the characteristics of a durian were before you could really do anything.
We would call this collection of traits a **feature set**, and it might look something like this: 12 | 13 | ``` 14 | feature_set = { 15 | 'has_spikes': True, 16 | 'size': 'big', 17 | 'color': 'yellow-ish' 18 | } 19 | ``` 20 | 21 | Don't worry too much about the brackets, equals sign, etc. These are just a common way of organizing the information so that the computer can read it. Here, we're just saying that this feature set defines what a durian looks like: the fruit has to have spikes, be large, and be yellow-ish. This allows us to make a reasonable guess as to whether or not any one piece of fruit we pull out of the bag is a durian. Notice how you can only work in binaries: the fruit is either a durian or not. Your not-durian pile had star fruit and dragon fruit in it, since you weren't really able to distinguish between the two in this thought experiment. If we pulled out a star fruit, we could only answer something like the following: 22 | 23 | ``` 24 | fruit.is_durian? 25 | >>> False 26 | ``` 27 | 28 | Or this if we were looking at a durian: 29 | 30 | ``` 31 | fruit.is_durian? 32 | >>> True 33 | ``` 34 | 35 | The test is actually pretty simple in its results, even if the feature set that leads to them is more nuanced. True and False are values of the **boolean data type** in programming, and these boolean values are used to test or represent whether something is just that - true or false. 36 | 37 | We have been developing a series of tests for fruit types, but they might not be perfectly correct: after all, there are other fruits that are large, spiky, and yellow-ish. A kiwano melon could have gotten thrown into the mix, and you might have incorrectly identified it as a durian. Or you might have gotten an unripe durian, which you incorrectly tossed in the wrong pile because it was green. So we could better characterize our two piles as "probably not durian" and "probably durian." 38 | 39 | Likewise, maybe you want to figure out a classification system to sort bagels. So you ask: is it round? Yes. Then it's a bagel. Does it have black dots? Then it's a poppy-seed bagel. Does it have white dots? Then it's a sesame-seed bagel. Neither one? Mainly light brown in color? Then it's a plain bagel. 40 | 41 | ![bagel dog](/assets/bagel2.jpg) 42 | 43 | But wait: this dog fits all the criteria for a plain bagel, and it is definitely not a bagel. Our classifier can say, at best, "probably bagel" or "probably not bagel." And sometimes it's wrong. Sometimes life gives you a dog, and all you can see is a bagel. \(Go [here](http://www.boredpanda.com/dog-food-comparison-bagel-muffin-lookalike-teenybiscuit-karen-zack/) for more on this classification problem.\) 44 | 45 | The use of the word "probably" should be a clue here - we have drifted into probability and statistics. What we have developed above are very basic **naive Bayes classifiers**. Thomas Bayes was an eighteenth-century statistician, and this classifier relies on his underlying [theory of statistics](https://en.wikipedia.org/wiki/Bayesian_statistics). There are other types of classifiers, but this kind assumes that each feature \(size, color, spikiness in the fruit example; shape and dotted-ness in the bagel example\) in our feature set will have some say in determining how to classify something that is unknown. 46 | 47 | In a real-world situation, we probably would have given you negative examples as well, examples of fruit that are not durian so that you had a more nuanced sense of what you were studying. In the case of a naive Bayes classifier and our fruit example, the classifier takes the rate at which durian actually occurred in our training set as the **prior probability**. The classifier then combines this prior with the actual features that we provided to give a weighted probability as to whether or not what it is looking at is a durian.
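If you want to see this in running code, NLTK \(the Python toolkit linked in this chapter's Further Resources\) ships with a naive Bayes classifier that works on exactly this kind of feature set. Here is a minimal sketch; the fruit data is invented for illustration:

```
import nltk

# An invented training set: feature sets paired with their correct labels.
train_set = [
    ({'has_spikes': True, 'size': 'big', 'color': 'yellow-ish'}, 'durian'),
    ({'has_spikes': True, 'size': 'big', 'color': 'green'}, 'durian'),
    ({'has_spikes': False, 'size': 'small', 'color': 'yellow-ish'}, 'not durian'),  # star fruit
    ({'has_spikes': False, 'size': 'small', 'color': 'pink'}, 'not durian'),        # dragon fruit
]

# The classifier computes the prior probability of each label from
# how often that label occurs in the training set.
classifier = nltk.NaiveBayesClassifier.train(train_set)

# A new, unlabeled fruit pulled from the bag.
mystery_fruit = {'has_spikes': True, 'size': 'big', 'color': 'yellow-ish'}
print(classifier.classify(mystery_fruit))                      # 'durian'
print(classifier.prob_classify(mystery_fruit).prob('durian'))  # the weighted probability
```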
48 | 49 | In this case, our labels are durian or not-durian, true or false, though you could have more than just two labels. The classifier then picks the label with the highest likelihood. We have trained ourselves to classify fruit, and we could replicate that same process on durian at a later date. If a master fruit vendor comes along, she could probably tell us how accurate we were. We could then compare our accuracy to that of another person trained to classify fruit, and we could figure out who is the better classifier. We could even figure out the percentage of the time that each of our classification systems is likely to be correct! 50 | 51 | This might all seem a bit removed from the kinds of work that we have been doing elsewhere in the book, but we wanted to give you a firm foundation in what classification is before we modeled an example related to text analysis. 52 | 53 | ## Further Resources 54 | 55 | * The NLTK book has [a good section](http://www.nltk.org/book/ch06.html#naive-bayes-classifiers) on naive Bayes classifiers. The book is a Python tutorial, though, so it will quickly get technical. 56 | 57 | * [A Visual Introduction to Machine Learning](http://www.r2d3.us/visual-intro-to-machine-learning-part-1/) provides a very handy introduction to other types of classifiers. 58 | 59 | 60 | -------------------------------------------------------------------------------- /close-reading.md: -------------------------------------------------------------------------------- 1 | # Close Reading 2 | 3 | * [Close Reading and Sources](/close-reading/close-reading.md) 4 | 5 | * [Prism Part One](/close-reading/prism-part-one.md) 6 | 7 | * [Exercises](/close-reading/exercises.md) -------------------------------------------------------------------------------- /close-reading/close-reading.md: -------------------------------------------------------------------------------- 1 | # Close Reading and Sources 2 | 3 | Text analysis is something that we all engage in, whether we realize it or not. The term is broad and capacious and encapsulates a variety of different activities. Even something as simple as slowing down when you see a stop sign is a kind of text analysis: doing so means you have parsed the meaning of the words on the sign and reacted accordingly. 4 | 5 | Indeed, any of the following related activities are forms of text analysis: 6 | 7 | * Paraphrasing a text 8 | * Searching for hidden meanings in a text 9 | * Adapting a text and reflecting on it 10 | * Examining the details in a text 11 | 12 | This last point is worth pausing over: **close reading**, in particular, is often proclaimed as one of the primary analytical tools of scholars and students in the humanities. To read closely means to give careful attention to the various components that make up a text, ones which cause us to think or feel a certain way about it. Close reading relies on a core principle about the text under study: 13 | 14 | * Everything about the text matters, whether the author intended for it to matter or not. 15 | 16 | Consider the following thought experiment.
One day you come home to find the following note from your roommate on the counter: 17 | 18 | > took care of these dishes? Thanks. 19 | 20 | Next to the note: dirty dishes. Was your roommate in a hurry and actually asking you to wash dishes? Or were they sarcastically trying to give you grief for not having done your part? Lots of questions. To imagine responses to them, you might employ a range of assumptions and interpretations depending on the scenario: 21 | 22 | **Context**: you have been growing more and more irritated with your roommate for some time now. Their actions just really get under your skin: dirty dishes, laundry everywhere, the works. They clearly meant the note as an insult. 23 | 24 | **Author**: your roommate is actually a great person and would never leave a passive aggressive note. In fact, they probably meant it as a joke. 25 | 26 | **Text**: Take a look at that question mark. And then the curt second sentence. Your roommate put those things there on purpose to be rude. 27 | 28 | The list could go on and on. We employ a similar range of skills when we read anything, be it fiction, poetry, or historical documents. Close reading might be best described as an activity in which a reader simply lets no detail of the text go unquestioned. The best way to approach a good close reading is by asking \(and attempting to answer\) questions about every piece of a text. 29 | 30 | Take a sentence from the 1775 _Anecdotes on the Countess du Barry_, a _libelle_ \(which you can find [here](http://chnm.gmu.edu/revolution/d/261/)\) similar to the ones discussed in Sarah Maza's “The Diamond Necklace Affair Revisited: The Case of the Missing Queen.” Mme du Barry was a prostitute who was Louis XV's mistress at the end of his reign \(1715-1774\). Here is how the Count du Barry tells one of Louis XV's courtiers that he has a woman in mind for the king: 31 | 32 | > "I’ve got your business for you. You know I don’t lack taste. Trust me: you come to dinner at my house and tell me that I’m a cad if I don’t give you the most beautiful woman, the most fresh, the most seductive; a true morsel for a king." 33 | 34 | In beginning a close reading here, I might ask: 35 | 36 | * What adjectives and nouns describe Mme du Barry here? 37 | 38 | * More specifically, what does it mean that she is compared to a "business" or a "morsel"? 39 | 40 | * If she is a piece of food, what does that mean about the relationship she might have with Louis XV? 41 | 42 | * Why is she not named here? 43 | 44 | * If you read the rest of the text, you'll see that most of the language in this excerpt is flowery -- but not the Count du Barry's words. What does that suggest about who he is and what his character is like? 45 | 46 | 47 | You can answer these questions any number of ways, and this ambiguity is part of the point. Close reading as a method is a way of training yourself to look for details, the evidence that you will use to interpret a passage, but how you use them depends on you. This evidence becomes the material you use to produce an analysis of your own \(sometimes also called a close reading\). Using the questions about _Anecdotes on the Countess du Barry_, I might make the argument that these sentences establish her as an object, a commercial good or a commodity for the king's consumption. I might also think that the Count du Barry's words render him as vulgar and coarse, a figure unworthy of contact with the court of Versailles.
48 | 49 | ## Primary and Secondary Texts for Historical Analysis 50 | 51 | In addition to reading texts closely, you also want to think about the kind of text you are working with and its relationship to its historical context. For starters, you need to know if the work you are reading is a **primary** text or a **secondary** text. [The Healey Library](http://umb.libguides.com/c.php?g=351019&p=2367357) has a good set of definitions: 52 | 53 | > **Primary Sources** are immediate, first-hand accounts of a topic, from people who had a direct connection with it. 54 | > **Secondary Sources** are one step removed from primary sources, though they often quote or otherwise use primary sources. They can cover the same topic, but add a layer of interpretation and analysis. 55 | 56 | Sarah Maza's article is a secondary text, whereas the _Anecdotes_, discussed above, is a primary text. 57 | 58 | Reading primary texts is absolutely invaluable, particularly in the study of history. There is no better way to understand events in the past than by examining the sources – whether journals, newspaper articles, letters, court case records, novels, artworks, music or autobiographies – that people from that period left behind. However, you need to approach primary sources with care, treating them as something other than a 100% accurate representation of the truth. For instance, in reading the _Anecdotes_, you might ask: did the author actually witness the events he or she was describing? Probably not. If that is the case, what can this document help us understand? And what can't we use it to do? 59 | 60 | Thus, you want to read primary sources with a series of questions in mind. The following is adapted from a guide provided by [Carleton College](http://apps.carleton.edu/curricular/history/study/): 61 | 62 | 1. What implicit and explicit messages does this text contain? What did the author choose NOT to talk about? 63 | 64 | 2. What do you know about the author? How might his or her beliefs or background have affected the writing of and views in this document? 65 | 66 | 3. Who constituted the intended audience? Was this source meant for one person's eyes or for the public? How does that affect the nature of the source? 67 | 68 | 4. Is it prescriptive \(telling you what people thought should happen\) or descriptive \(telling you what people thought did happen\)? 69 | 70 | 5. Does it tell you about the beliefs\/actions of the elite, or of “ordinary” people? From whose perspective? 71 | 72 | 6. What historical questions can you answer using this source? What questions can this source NOT help you answer? What are the limitations of this type of source? What historical perspectives are left out of this source? 73 | 74 | 7. What assumptions do you see being made? What surprises you about this text? 75 | 76 | 77 | For instance, take the following passage from the first paragraph of the _Anecdotes_: 78 | 79 | > Advancing age and the ability of a great prince to satisfy all his passions had dulled his attraction towards women. But this need, though diminished, continued ... The doctors assured the King that it was dangerous to give up so abruptly a pleasure necessary for his existence. 80 | 81 | At one level, this work is giving an account of how Louis XV and the Countess du Barry began their liaison. But these sentences also have an implicit message: the king's sexual desire for women was natural and indeed necessary to his well-being.
This view that the king needed to have a mistress for the sake of his health may be surprising to you, and it certainly reveals a set of assumptions about extra-marital activity at the time. So while we can't take this primary source as an accurate representation of the relationship between du Barry and the king, it does serve as a fascinating window into the culture of late eighteenth-century France. 82 | 83 | ## Digital Reading 84 | 85 | Interrogating sources in this fashion is just one mode of understanding a text. It relies on the idea that sustained attention to a document will allow you to understand new things about it. This same approach can be applied to virtually any object of study, be they books, films, music, or ideas themselves. Our primary interest in this book, then, is how this process can change with the introduction of technology. You might start by asking a series of questions about how close reading might interact with digital tools. 86 | 87 | * Can we still read closely if the computer is doing the reading for us? 88 | 89 | * How might the computer change the kinds of close readings available to us? 90 | 91 | * Can we close read new things using digital tools that we couldn't before? 92 | 93 | 94 | -------------------------------------------------------------------------------- /close-reading/exercises.md: -------------------------------------------------------------------------------- 1 | # Exercises 2 | 3 | Practice your close reading skills on the following short passage: 4 | 5 | --- 6 | Excerpt from "The Respectful Petition of an Humble Subject to her Majesty Queen Caroline," The Morning Post, September 09, 1820, issue 15439. Full text available from the [British Library Newspapers 1800-1900](http://find.galegroup.com/bncn/advancedSearch.do;jsessionid=0589C5BB221A07943B7D47A671F715D1) archive. 7 | 8 | That the prosperity of a nation does, very materially, depend on the preservation of the moral virtues has ever been indisputable, and it would be totally unbecoming your MAJESTY’s sex to question that morality principally depends on females. Now, with all due deference for your MAJESTY’s enlarged sentiments and highly cultivated understanding, is it fit, or proper, that whilst accusations of the grossest nature, alike reflecting on your MAJESTY’s virtue and delicacy, and solemnly discussing in the highest tribunal of the empire — whilst wives blush to hear their husbands read the tale — whilst mothers hide the details of profligate prostitution from their families — whilst virgin innocence blushes at the mention of England’s QUEEN - whilst the eye of modesty is averted, and chastity throws her mantle over the display of imputed, boundless passion, is it befitting a woman - can your MAJESTY think at all, and reconciling reflection with regal dignity and female importance, pronounce it blameless, that bearing the weight of these heavy charges, your MAJESTY should parade the streets of this metropolis, triumphantly proving your possessing a front which no respect for yourself, or consideration for the guiltless, can induce you to conceal? ... Oh! Madam, there are females in our island who earn their daily bread by daily toil, that would not exchange conditions with you, whether viewing you in a Neapolitan theatre, rioting in the mirthful buffoonery of your villa at Como, or drawn by half a dozen richly caparisoned studs...
Though late, Madam, still deign to take counsel, and be persuaded, that the vulgar shouts of a shameless mob are not the hymns of a sensible, reflecting populace ; nor deem the ditties of itinerant ballad-mongers from the purlieus of St. Giles’s the carols of those who esteem retirement from public gaze, and absence from the page of notoriety, a woman’s most amiable sphere. 9 | 10 | 11 | --- 12 | 13 | * Write a paragraph describing your reading process in as great detail as you can manage. What kinds of things do you notice about the passage? Are you looking for particular things? What goes through your head? 14 | 15 | * Then read the passage two more times and repeat the same exercise, writing down what goes through your head in detail. 16 | 17 | 18 | Then go to [our class Prism](http://prism.scholarslab.org/prisms/2750132e-7b70-11e6-88c8-005056b3784e/highlight?locale=en). Highlight the Prism according to the rules of the game and then respond to the following questions: 19 | 20 | * How did you read differently when using Prism? Can you imagine other variations of the tool? How might things change with five interpretive categories, for example? 21 | 22 | * Say something interesting about the results that Prism gives you. What new insights does it give you into the text? 23 | 24 | * Think more critically about the categories we have chosen for you to highlight. What assumptions do you think we have about them? How does pairing these two categories change how you think about the categories themselves? 25 | 26 | 27 | -------------------------------------------------------------------------------- /close-reading/prism-part-one.md: -------------------------------------------------------------------------------- 1 | # Prism Part One 2 | 3 | [Prism](http://prism.scholarslab.org) is a digital tool that enables readers to think about how they interpret texts in new ways. The project grew out of a series of conversations and exercises carried out by Johanna Drucker, Bethany Nowviskie, and Jerome McGann at the University of Virginia. Every member of the group would receive a copy of a single text as well as a transparency and a few highlighters. One person would give the highlighters to the group and say something to the effect of, "OK. Read this passage, and mark passages that seem to suggest 'democracy' with the green highlighter and 'anarchy' with the blue." With the transparency laid over the passage, the readers would all mark their own copy as they read. The marking process would end at a certain point, and the transparencies would be collected. 4 | 5 | The transparency game crystallizes a very basic element of textual analysis: 6 | 7 | * When we close read, certain textual components - phrases, words, characters - make us interpret a document in a particular way. 8 | 9 | The game asks you to make graphic representations of these decisions, to identify the words that make you think or feel a certain way. By lifting the transparency off the text, you are left with a series of colors that correspond to your reading of the document. You have traced the broad outlines of your interpretation onto the page. 10 | 11 | ![prism splash page](/assets/close-reading/prism-splash-page.jpg) 12 | 13 | Prism is a digital version of the same game. Given a choice between a few predetermined categories, Prism asks you to highlight a given text. In this Prism example, readers are asked to mark an excerpt from Edgar Allan Poe's "The Raven."
When you select one of the buttons next to the categories on the right, your cursor will change into a colored highlighter. Clicking and dragging across the text will highlight it in the same way that you might if you were reading a print version. 14 | 15 | ![prism highlights of the raven](/assets/close-reading/prism-raven-highlights.jpg) 16 | 17 | After you click "Save Highlights", the tool combines your markings with those of everyone else who has ever read the same Prism text, helping you visualize how people are marking things. By default, Prism will bring up the **winning facet visualization**, which colors the text according to the category that was most frequently marked for each individual word. Clicking on an individual word will color the pie chart and tell you exactly what percentage the word got from each category. 18 | 19 | ![prism winning facet](/assets/close-reading/prism-raven-winning-facet.jpg) 20 | Seeing a graphic representation of the reading process might help you to notice things that you might not otherwise. For example, here you might notice that people tended to mark passages containing first person pronouns as "sense." Is it because "sense" implies thinking? Phrases like "I remember," "my soul grew," and "I stood there wondering" do suggest an emphasis on introspection, at the very least. Did you mark the same phrases, or did you select other passages? 21 | 22 | Prism comes with two visualizations baked into it. To change visualizations, click the "Font Size Visualization" button on the right sidebar. The **font size visualization** lets you see which parts of the text were more frequently thought of as belonging to a particular category: Prism resizes the text to reflect concentrations of reading. So in this example, where readers were marking for "sound," they tended to mark rhyming words more frequently. 23 | ![prism font size visualization](/assets/close-reading/prism-raven-font-size.jpg) 24 | Makes sense, and you might have done the same. By selecting the other category, you could check out what readers tended to mark for "sense." 25 | 26 | By design, Prism forces you to think more deeply about the categories that you are given for highlighting. The creator of this Prism wants you to mark for "sound" and "sense" - categories that relate to Alexander Pope's famous formulation of poetry from [_An Essay on Criticism_](https://www.poetryfoundation.org/resources/learning/essays/detail/69379). In it, Pope suggests that the sound of a poem should complement the underlying meaning of the poem. So the creator of this game wants you to try to pinpoint where these categories overlap and where they depart. You might not have known this context, though you might have intuited elements of it. Guided reading in this way might change how you would otherwise read the passage, and the absence of clear guidelines complicates your experience of the text. 27 | 28 | * How would reading be different if you did not know the exact meanings behind the categories?
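Before moving on, it is worth noting that the winning facet computation is simple enough to sketch yourself. Here is a toy version in Python - an illustration of the idea, not Prism's actual code - using a few invented markings for words from "The Raven": for each word, count the markings in each category and keep the most frequent.

```
from collections import Counter

# Invented markings: for each word, the categories that readers chose.
markings = {
    'dreary': ['sense', 'sense', 'sound'],
    'weak': ['sense', 'sense'],
    'tapping': ['sound', 'sound', 'sense'],
}

def winning_facets(markings):
    """For each word, find the most-marked category and its share of the votes."""
    results = {}
    for word, categories in markings.items():
        category, votes = Counter(categories).most_common(1)[0]
        results[word] = (category, votes / len(categories))
    return results

for word, (category, share) in winning_facets(markings).items():
    print('{}: {} ({:.0%})'.format(word, category, share))
```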
29 | 30 | 31 | -------------------------------------------------------------------------------- /conclusion.md: -------------------------------------------------------------------------------- 1 | # Conclusion 2 | 3 | * [Where to Go Next](/conclusion/where-to-go.md) 4 | 5 | * [Further Resources](/conclusion/resources.md) 6 | 7 | * [Adapting This Book](/conclusion/adapting.md) -------------------------------------------------------------------------------- /conclusion/adapting.md: -------------------------------------------------------------------------------- 1 | # Adapting This Book for Another Course 2 | 3 | ***The GitBook platform that we use for publishing is changing rapidly. While you can fork our GitHub repository and edit your own versions of the files, the GitBook platform as of this writing is too unstable for us to develop reliable documentation about how to publish your own version of the text. We will update this page when the issue has been resolved. Until then, the instructions in the Publishing section below should be considered out of date and unstable. If you are able to import your own copy of the text on GitHub by mimicking the instructions below, please make an issue on our [GitHub page](https://github.com/bmw9t/introduction-to-text-analysis/issues) to let us know. *** 4 | 5 | We encourage others to use this book for their own courses and to change it to meet the needs of their own contexts. The publishing platform here helps to facilitate this process. We especially imagine people reworking the exercises in each chapter to reflect their own disciplinary content. With a little effort you can rework the book for your own purposes and publish it to GitBooks for your students to use. 6 | 7 | **Note:** 8 | 9 | * **Copying the book will only get you a particular version of the book at a particular point in time. By default, any changes we make to the book after you copy it will not be reflected in your version of the book. [Syncing your version](https://help.github.com/articles/syncing-a-fork/) of the book with ours will likely conflict with any changes you have made, so we would only try that with great care.** 10 | 11 | ## Getting Your Own Copy 12 | 13 | The contents of this book are hosted in [a repository on GitHub](https://github.com/bmw9t/introduction-to-text-analysis) and rendered to the internet via [GitBooks](http://gitbook.com). When we make changes to the file structure hosted on GitHub, the changes populate out to our GitBooks account, which renders the various files into the web version of the book. To make your own remixable copy of the book, you will need to make a copy of our GitHub repository and sync your copy with a GitBook of your own. Things you'll need to begin: 14 | 15 | - GitBooks Account 16 | - GitHub Account 17 | - GitBooks Editor (optional depending on your command line and markdown fluency) 18 | 19 | First you will need to make a copy of our GitHub repository for your own account. When logged in and looking at our repository page, you should see these three buttons in the top-left corner of the window: 20 | 21 | ![fork button on github](/assets/conclusion/fork-button.jpg) 22 | 23 | **Forking** is GitHub's term for creating a copy of a repository for yourself - imagine a road forking and diverging into two paths. If you click Fork, GitHub should start the copying process. When finished, you will be redirected to your fresh copy of the repository.
24 | 25 | ![copy of github repository after forking](/assets/conclusion/github-forking.jpg) 26 | 27 | Note the "forked from bmw9t/introduction-to-text-analysis" statement at the top of the window, which lets you know where the book originated from. Above that you will see your own book's location. You now have your own version of the book's various files, and any changes you make to your own version will not affect our original book. GitHub will also keep track of your book's history for you. 28 | 29 | ## Publishing 30 | 31 | **Note: GitBook is still under heavy development, and these steps might have changed since last writing.** 32 | 33 | You have a copy of all the files that make up the book, but you will need to sync them with GitBooks if you want to publish them online in the same way that we have done here. To do so, after logging into GitBooks you will click on the green "Import" button. ![gitbook add book button](/assets/conclusion/gitbook-add-book.jpg) 34 | 35 | Selecting the "GITHUB" option, you will need to link your GitHub account and verify your account by email. 36 | 37 | ![import github repository to gitbook](/assets/conclusion/gitbooks-import-github.jpg) 38 | 39 | After linking your GitHub account, if you have more than one repository under your name you will have to select the one that you want to import to GitBooks. In this case, we will import the *Introduction to Text Analysis* repository. 40 | 41 | ![select your repo in GitBooks](/assets/conclusion/gitbook-repo-selection.jpg) 42 | 43 | Give your repository a name and a description, and you're all set. A complete form should look something like this: 44 | 45 | ![Complete form for importing a github repository into GitBooks](/assets/conclusion/gitbooks-github-complete-import-template.jpg) 46 | 47 | You now have a working copy of the book hosted on GitHub and rendered in GitBooks (GitBooks should automatically redirect you to your copy). You can do anything you want with these files, and they won't affect our own base copy of the resources. 48 | 49 | ## Editing 50 | 51 | ### Markdown 52 | 53 | From here you just need to know a few more things to edit your new and ready-to-remix textbook. The book is written as a series of files in **markdown**, a form of markup that can easily be converted into HTML. GitBooks provides a [great tutorial on markdown](https://gitbookio.gitbooks.io/markdown/content/) that will help get you started. 54 | 55 | ### Editing with GitBooks Editor 56 | 57 | If markdown feels too complicated, GitBooks also provides a handy [desktop editor](https://www.gitbook.com/editor/osx) that can make the process just about as intuitive as writing in Microsoft Word. You can type in markdown, but the editor will also convert certain commands to markdown for you: 58 | 59 | \*\*bolded text\*\* will render as **bolded text**. 60 | 61 | But you can also highlight text and press command + b as you would in Microsoft Word to produce the same effect. 62 | 63 | ![gitbooks editor interface](/assets/conclusion/gitbooks-editor-interface.jpg) 64 | 65 | The interface provides a preview of what your text will look like to the right of the window, which can be very helpful if you are new to markdown. If you decide to work in the GitBooks Editor, you will need to log in the first time you do so. Then select the "GitBooks.com" option for importing. 66 | 67 | ![gitbooks cloning locally](/assets/conclusion/gitbooks-clone.jpg) 68 | 69 | The computer will **clone**, or copy, the book to your computer.
From there, you can follow the instructions in the [editor's documentation](https://help.gitbook.com/). The only significant difference from MS Word is that, after saving your work, you will need to click the sync button to upload your content to GitHub. 70 | 71 | ![gitbooks sync](/assets/conclusion/gitbooks-sync.jpg) 72 | 73 | After doing so, any changes you have made from the GitBooks editor will also change the GitHub repository's files, which will then automatically get rendered in the GitBooks version of the site. You are all set! 74 | 75 | ### Editing with the Terminal 76 | 77 | If you are planning to use the terminal, the process is fairly similar. Once you have forked and have your own copy of the book on GitHub, you will just clone it to your computer using the clone URL found at the top of your repository's page on GitHub. Here is the one for the original book: 78 | 79 | ![github clone url](/assets/conclusion/clone-url.jpg) 80 | 81 | Find your own clone URL, copy it to your clipboard, and use it like so (without curly braces): 82 | 83 | ```$ git clone {your_clone_url here}``` 84 | 85 | This will copy the repository to your machine. From there, you can edit using a plain text editor as normal and make changes to the repository using [git](https://git-scm.com/). 86 | 87 | At this point you should have everything you need to edit your copy of the book as you see fit for your own needs. If we haven't covered something here or you run into problems, drop us a line in our [discussions forum](https://www.gitbook.com/book/bmw9t/introduction-to-text-analysis/discussions). 88 | -------------------------------------------------------------------------------- /conclusion/resources.md: -------------------------------------------------------------------------------- 1 | # Further Resources 2 | 3 | Each individual lesson contains suggested further readings on the particular topic discussed in that section. Here we wanted to gather two types of resources. First, we wanted to gather a few more useful tidbits that didn't fit well anywhere else but that will be helpful to anyone exploring text analysis. Second, we wanted to point you towards other fantastic tutorials and textbooks for text analysis that go further in depth than we do here. Interested browsers should also check out the lessons on particular topics of interest to make sure you see any and all resources. 4 | 5 | ## Secondary Readings on the Digital Humanities 6 | 7 | * [Digital Humanities Zotero Group](https://www.zotero.org/groups/digital_humanities/items) 8 | * Leary, Patrick. "[Googling the Victorians](http://www.victorianresearch.org/googling.pdf)." 9 | * Moretti, Franco. *[Graphs, Maps, Trees](https://www.amazon.com/Graphs-Maps-Trees-Abstract-Literary/dp/1844671852).* 10 | * Kirsch, Adam. "[Technology is Taking Over English Departments: The False Promise of the Digital Humanities](https://newrepublic.com/article/117428/limits-digital-humanities-adam-kirsch)." 11 | * LA Review of Books, *[The Digital in the Humanities](https://lareviewofbooks.org/feature/the-digital-in-the-humanities)* series 12 | * Rockwell, Geoffrey and Stéfan Sinclair. *[Hermeneutica: Computer-Assisted Interpretation in the Humanities](https://mitpress.mit.edu/books/hermeneutica)*. 13 | 14 | ## Tutorials and Textbooks 15 | * Arnold, Taylor and Lauren Tilton. [Humanities Data in R](http://www.humanitiesdata.org/). 16 | * Bird, Steven, Ewan Klein, and Edward Loper.
[Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit](http://www.nltk.org/book/). 17 | * Crymble, Adam, Fred Gibbs, Allison Hegel, Caleb McDaniel, Ian Milligan, Evan Taparata, Amanda Visconti, and Jeri Wieringa, eds. [The Programming Historian](http://programminghistorian.org/). 18 | * Graham, Shawn, Ian Milligan, and Scott Weingart. *[Exploring Big Historical Data: The Historian's Macroscope](http://www.themacroscope.org/2.0/)*. 19 | * Jockers, Matt. [Text Analysis with R for Students of Literature](http://www.matthewjockers.net/text-analysis-with-r-for-students-of-literature/). 20 | 21 | ## Tools 22 | * [Google NGram Viewer](https://books.google.com/ngrams) 23 | * [Prism](https://prism.scholarslab.org) 24 | * [Voyant](https://voyant-tools.org) 25 | * [Zotero](https://zotero.org) 26 | 27 | ## Applications of Text Analysis 28 | * [Quantifying Kissinger](http://www.quantifyingkissinger.com/) 29 | * [Viral Texts](http://viraltexts.org/) 30 | * [Syuzhet Part One](http://www.matthewjockers.net/2015/02/02/syuzhet/), [Syuzhet Part Two](http://www.matthewjockers.net/2015/02/25/the-rest-of-the-story/) 31 | * [Mining the Dispatch](http://dsl.richmond.edu/dispatch/pages/home) 32 | * [How a Computer Program Helped Show J.K. Rowling write A Cuckoo's Calling](http://www.scientificamerican.com/article/how-a-computer-program-helped-show-jk-rowling-write-a-cuckoos-calling/) 33 | * [Text Analysis of Trump's Tweets Confirms He Writes only the (Angrier) Android Half](http://varianceexplained.org/r/trump-tweets/) 34 | * [Walmart Gets Its Worst Yelp Reviews in Black and Latino Neighborhoods](http://www.citylab.com/work/2016/08/walmart-get-its-worst-yelp-reviews-in-black-and-latino-neighborhoods/497864/) 35 | 36 | ## Other 37 | * [Networked Infrastructure for Nineteenth-Century Electronic Scholarship (NINES)](https://www.nines.org) 38 | * [Stanford Literary Lab](http://litlab.stanford.edu/) -------------------------------------------------------------------------------- /conclusion/where-to-go.md: -------------------------------------------------------------------------------- 1 | # Where to go Next 2 | 3 | You've come a long way! Thanks for sticking with us! 4 | 5 | You've learned a lot. Thought a lot. Read a lot. What comes next? We've talked about a bunch of different approaches to text analysis, so hopefully you have some ideas about things that interest you. One way forward might be to experiment with new tools. Or you could delve more deeply into a particular approach that piqued your interest. Perhaps while moving along you found yourself wishing that a tool could do something that it just wasn't set up to do. 6 | 7 | * "If only it would do X!" 8 | 9 | Maybe you can make something for yourself to do X! 10 | 11 | While writing this book, we used [GitBook's text editor](https://www.gitbook.com/editor/osx) so that we could preview the final product of our writing before it was published online. But the text editor has all sorts of added features. For example, it offers a style editor that suggests that certain kinds of sentences might contain difficult syntax or formulations. Potentially useful, right? Maybe, but we turned it off right away. We found it really annoying to type while our text was screaming at us like this: 12 | 13 | ![sentence difficulty in GitBook editor](/assets/conclusion/sentence-difficulty.jpg) 14 | 15 | The most irritating thing was that we could not tell what metrics the editor was using to diagnose our writing. What makes a sentence difficult?
The number of words in each sentence? The number of clauses? Subjects in particular positions? We have all sorts of opinions about why writing might be unclear, but, as best we could tell, the editor was mostly basing its suggestions on the number of words in each sentence. We turned the feature off and went on with our lives, but not before noting a truism of working in the digital humanities: using a tool built by someone else forces you to abide by their assumptions and conventions. We could not change how the style checker worked beyond the minimal configuration options given to us in the platform. 16 | 17 | You might have had similar feelings while reading this book. You have used a series of powerful tools in the course of working through this book, but each one has its limitations. While using Prism, for example, you might have wished that you could see an individual user's interpretations to compare them with the group's reading. Or when using Voyant, you might have wondered if you could analyze patterns in the use of particular parts of speech throughout a text. Or maybe you were really interested in sentiment analysis or topic modeling. We didn't really offer tools for either of these approaches, because they quickly get a little more technical than we wanted. You need to be comfortable with some basic programming approaches to use tools of that nature. 18 | 19 | A logical next step for you might be to learn a programming language that can help facilitate textual analysis. Python and R are two widely used languages for these applications with a [wealth of resources](/resources.html) to help get you started. Exploring a new programming language takes time and dedication, but it can help guide you towards new types of questions that you might not otherwise be able to ask. Learning to program can help you determine what types of questions and projects are doable, and which ones might need to be let go. Most importantly, it can help you realize when using a tool someone else has built is better and easier than reinventing the wheel. While we would have loved nothing more than to turn you all into self-sufficient Python gurus, we believed that the purposes of this introduction could be better served by showing you what was possible first through tools and case studies. If you want to go further, you always can. 20 | 21 | This workbook by no means exhausts the topic of digital text analysis, but we hope that you have learned enough to get a sense of the possibilities that such methods can bring. Check out the [Further Resources](resources.md) page for other approaches, inspirations, and provocations. If, while reading the book, you found errors or sections that need clarification, please drop a note in our [discussion forums](https://www.gitbook.com/book/bmw9t/introduction-to-text-analysis/discussions) or on our [GitHub issues page](https://github.com/bmw9t/introduction-to-text-analysis/issues). 22 | 23 | Thanks for reading!
24 | 25 | Brandon and Sarah 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /contexts-and-claims.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/contexts-and-claims.md -------------------------------------------------------------------------------- /cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/cover.jpg -------------------------------------------------------------------------------- /crowdsourcing.md: -------------------------------------------------------------------------------- 1 | # Crowdsourcing 2 | 3 | * [Crowdsourcing](/crowdsourcing/crowdsourcing.md) 4 | 5 | * [Prism Part Two](/crowdsourcing/prism-part-two.md) 6 | 7 | * [Exercises](/crowdsourcing/exercises.md) -------------------------------------------------------------------------------- /crowdsourcing/crowdsourcing.md: -------------------------------------------------------------------------------- 1 | # Crowdsourcing 2 | 3 | Think of a common scenario: you are buying something online and, before the system will allow you to check out, you have to enter some text, transcribe some audio, or otherwise prove that you are not a robot. Doing so only takes a few seconds of your time, but such transactions happen millions of times every day on the internet. The combined energy wasted on such simple interactions is astounding. 4 | 5 | Now imagine that you could take all those hours of human labor and put them to something useful. [reCAPTCHA](https://www.google.com/recaptcha/intro/index.html) aims to do just that, and you've probably unwittingly used it. Those human-validation tasks we described in the last paragraph? The chances are pretty high that, when you carried one out, you were actually correcting an image transcription, providing a test set for artificial intelligence, or helping to make Google Maps more precise. The next time you fill out a text box to prove that you are a human, you might take a closer look at what you are doing and ask, "what is my work being used for?" 6 | 7 | **Crowdsourcing**, broadly defined, can be thought of as the application of many different people to a single problem by having them each work on a piece of the project. The most common type of crowdsourcing is used to correct text transcriptions. When you scan an image with text in it, sophisticated computer programs run over the image to make their own best guess as to what that text might be based on vast quantities of information. This process is known as **optical character recognition \(OCR\)**, and these guesses are often incorrect: given many variations in font, ink type, contrast, and more, the task is actually very complicated and difficult. These mistakes are often called **dirty OCR**, and an example might look something like this: 8 | 9 | "Qi-jtb" 10 | 11 | That might not mean a lot out of context, but alongside an image you could probably piece together the word it was meant to represent from the original print artifact.
[Ryan Cordell](http://ryancordell.org/research/qijtb-the-raven/) fixes on this particular poor scanning of the first word in the famous phrase "Quoth the Raven" from Edgar Allan Poe's "The Raven" as an example of the problems that scanning can present for studying historical documents. Such errors complicate a lot of digital text analysis: a search through the document for "Quoth" would not return this instance unless someone \(or something\) cleaned it. 12 | 13 | You will learn more about this and other problems with digital data in our chapter on "[Data Cleaning](/data-cleaning/problems-with-data.md)". For now, suffice it to say that correcting such errors is long, tedious work. Doing so doesn't require much intellectual energy for a human, but it does take a lot of time. On the other hand, these correcting tasks would be very difficult for a computer to do accurately, but computers can manage the large scale of the work with little trouble. Projects like these use a technique called **microtasking**, meaning they direct the human energy of many, many people to finite tasks. Microtasking projects find solutions to big problems by making them into small tasks that individuals can solve. OCR correction is a common scenario for such projects: [Transcribe Bentham](http://blogs.ucl.ac.uk/transcribe-bentham/) asks users to prepare corrected versions of Jeremy Bentham's unpublished manuscripts, and [18thConnect](http://www.18thconnect.org/) has developed [Typewright](http://www.18thconnect.org/typewright/documents) to correct a vast number of early modern materials. Each of these projects relies on individual people correcting texts one piece at a time. 14 | 15 | Whereas microtasking projects ask users to work on a problem already laid out for them, **macrotasking** projects are led by the interests and aims of the group itself. _Wikipedia_ is probably the most famous example of crowdsourcing, and it falls under this category. Its many users apply their energy to common problems: the production of knowledge regarding particular topics. But _Wikipedia_ is different from other forms of crowdsourcing in that it has no clear goal in sight. We can never write down all of human knowledge: instead, _Wikipedia_'s devoted users will continually work to develop a better understanding until the website goes offline. The user community creates its own goals, priorities, and tasks, all of which can lead to systemic problems: the articles online do not necessarily reflect the inherent interest of the material but, instead, the interests of the community of editors. \(In the case of Wikipedia, this means that it has a significant [gender problem](https://www.insidehighered.com/blogs/library-babel-fish/woes-wikipedia).\) Whereas microtasking projects are about really small, repeatable problems, macrotasking problems are fundamentally different in kind. 16 | 17 | It is worth pausing over all of these examples to consider the labor going into them. We are talking about an incredible amount of energy and work that is essentially volunteer. If we go onto _Typewright_ and help transcribe an eighteenth-century text, that is time that we could have spent doing something else, including work for which we could have been compensated in more explicit ways. 18 | 19 | * Is it enough that the users are contributing to the public good for these projects? 20 | 21 | * At what point does volunteer labor become exploitation?
22 | 23 | 24 | In many cases, these digital projects cost vast sums of money, and, so the critique goes, these funds could have provided for actual paid employees instead of volunteers. Some of these crowdsourcing participants may not even have realized they were working. In the case of _Recaptcha_, you probably volunteered your time to a crowdsourcing project without even realizing it. 25 | 26 | * What are ethical practices for conducting volunteer projects on such a scale? 27 | 28 | * What would it take for you to feel adequately compensated for your time? 29 | 30 | 31 | These are open questions with no clear answers, but they are worth keeping in mind. We think this awareness and self-reflection must be the foundation of ethical ways of engaging with projects like these. After all, _Typewright_, _Recaptcha_, and _Transcribe Bentham_ produce great results, but they do so by applying human energy to fairly menial tasks. _Wikipedia_ raises other important questions about crowdsourcing: 32 | 33 | * Can the crowd do more? 34 | * What happens when we give control of the project over to the crowd? 35 | 36 | ## Further Resources 37 | 38 | * Brandon Walsh, et al. have a [piece on Prism](http://llc.oxfordjournals.org/content/29/3/379.full) and crowdsourcing with a useful bibliography for pointing to other crowdsourcing projects. 39 | 40 | * Mia Ridge's book _[Crowdsourcing our Cultural Heritage](http://www.worldcat.org/title/crowdsourcing-our-cultural-heritage/oclc/883391279)_ is fantastic on crowdsourcing in a cultural context. 41 | 42 | 43 | -------------------------------------------------------------------------------- /crowdsourcing/exercises.md: -------------------------------------------------------------------------------- 1 | # Exercises 2 | 3 | Design a Prism game and answer the following questions. For your text, use an excerpt of at least 100 words from one of the following two writings by Jeremy Bentham on the panopticon: 4 | 5 | * [Letter VI](http://oll.libertyfund.org/titles/bentham-the-works-of-jeremy-bentham-vol-4#lf0872-04_label_299) 6 | * [Preface](http://oll.libertyfund.org/titles/bentham-the-works-of-jeremy-bentham-vol-4#lf0872-04_head_014) 7 | 8 | 9 | 1. What about this text are you interested in exploring? What is the interpretive intervention that you hope to make? 10 | 2. Who will you address it to? Why this particular group? 11 | 3. After designing your Prism, get at least five people to read and mark the text for you. 12 | 4. What do you notice about the results? Anything interesting in the visualizations? The way people were interpreting the markings? The groups? 13 | 5. What limitations do you see with Prism? Can you imagine other ways of examining individual interpretations in the context of the group? 14 | 15 | Implement your game and send us the link. -------------------------------------------------------------------------------- /crowdsourcing/prism-part-two.md: -------------------------------------------------------------------------------- 1 | # Prism Part Two 2 | 3 | Think back to [Prism](http://prism.scholarslab.org) and the transparency game. So far we have only really focused on the single transparencies and the interpretations one individual supplied. But the crucial last element of the game involves collecting the transparencies and stacking them. Hold the stack up to the light, and you get a whole rainbow: you can see what everyone thinks about the text. Prism's visualizations offer one way of adapting this activity to a digital environment.
4 | 5 | ![prism transparencies stacked](/assets/crowdsourcing/prism-future-stacked.jpg) 6 | 7 | In this photo from the "Future Directions" page for Prism, you can see the prototype for another possible visualization that would shuffle through the various sets of highlights. Even without this animated interpretation, Prism allows you to get a sense of how a whole group interprets a text. The program collects your markings along with those of everyone who has ever read that text in Prism. We can begin to get some sense of trends in the ways that the group reads. 8 | 9 | Prism was designed as a middle road between the two types of crowdsourcing projects that we discussed in the last section. By asking users to mark for a restricted number of categories, it can quantify those readings and visualize them in interesting ways. But it also asks readers to actually read the text - interpreting a document along certain guidelines still asks readers to exercise the full range of their powers as thinking people. For this reason, the designers of Prism see it as **[crowdsourcing interpretation](http://llc.oxfordjournals.org/content/early/2014/07/08/llc.fqu030.full?keytype=ref&ijkey=4zaX5fIvQwiLhIJ)**. 10 | 11 | Prism offers a few options to facilitate group reading. Most importantly, it assumes very little about how its users will use the tool. Anyone can upload their own text as a Prism and, within certain guidelines, adapt the tool to their own purposes. When logged in, you can create a Prism by clicking the big create button to pull up the uploading interface: 12 | 13 | ![prism creation interface](/assets/crowdsourcing/prism-create-one.jpg) 14 | 15 | You upload a text by pasting it into the window provided. Prism does not play well with super long texts, so you may have to play around in order to find a length that works for the tool as well as for you. The three facets on the right correspond to the three categories according to which you want users to mark the text. The rest of the fields should be self-explanatory. Note, however, that you will only be able to give a short description to readers: your document and marking categories will largely have to stand on their own. 16 | 17 | listed vs unlisted prism interface 18 | 19 | Below these main parameters for your text, you will be asked to make some other choices that may be less intuitive. 20 | 21 | By default, Prism assumes that you want the text and all its markings to be made available to the public. Selecting **unlisted** will make your Prism private so that only the people to whom you send the URL can view it. Once you create the Prism, you will want to be extra certain that you copy that URL down somewhere so that you can send it out to your group. 22 | 23 | Prism will also ask you what license you want to attribute to your materials. Many of the choices offered here are [creative commons](https://creativecommons.org/) licenses, but you can also choose public domain or fair use depending on your needs. If you are unsure, you can always select no license, but it would be worth doing a little research about the materials you are uploading to find out their legal status. 24 | 25 | Once you upload a text, the easiest way to find it is to go to your personal page by clicking the "MYPRISMS" link from the top menu.
On this profile page, you can easily access both the texts that you have uploaded and the ones that you have highlighted but that belong to others \(quite handy if you lose the URL for an unlisted text\). 26 | 27 | ![myprisms page](/assets/crowdsourcing/prism-myprisms.jpg) 28 | 29 | With these tools, you can upload a range of texts for any kind of experiment. It is tempting to say that you are limited only by your imagination, but you will run up against scenarios in which the parameters for the tool cause you headaches. That's OK! Take these opportunities to reflect: 30 | 31 | * What will the tool not let you do? 32 | 33 | * Can you imagine good reasons for these limitations? 34 | 35 | * How would you design a different tool? 36 | 37 | 38 | As you work through Prism, think critically about the concept of crowdsourced interpretation. 39 | 40 | * Do you feel that this sort of work is fundamentally more empowering than the kind we saw with Typewright, Recaptcha, and Transcribe Bentham? 41 | 42 | * Are there other, better ways of facilitating group collaboration, digital or otherwise? 43 | 44 | 45 | -------------------------------------------------------------------------------- /cyborg-readers.md: -------------------------------------------------------------------------------- 1 | # Cyborg Readers 2 | 3 | * [How Computers Read Texts](/cyborg-readers/computer-reading.md) 4 | 5 | * [Voyant Part One](/cyborg-readers/voyant-part-one.md) 6 | 7 | * [Exercises](/cyborg-readers/exercises.md) -------------------------------------------------------------------------------- /cyborg-readers/computer-reading.md: -------------------------------------------------------------------------------- 1 | # How Computers Read Texts 2 | 3 | If you have been dutifully following along until now, it should be clear that computers and humans don't think the same way. With respect to text analysis, we can say that computers and humans have complementary skills. Computers are good at doing things that would take us a long time to do or that would be incredibly tedious. Computers can easily count and compare and will do so for pretty much as long as you tell them to do so. In contrast, humans are very good at understanding nuance and context. Thus, you wouldn't want a computer to do any close reading or unpack the claims of a primary or secondary text; this is something you are far better at. By the same token, it's probably easier to have a computer list all the numbers between one and 45678987 than to do it yourself. 4 | 5 | If such a disparity in skills exists between you and computers, you may be wondering why we're teaching a class on digital text analysis. Why bring technology into the equation when it is a poor approximation for a lot of the things that we do when we read? The answer is that there are a lot of instances where you can combine the nuance of human thinking with the quantitative power of computers to look at texts in new and creative ways. In particular, you can make computers do a lot of the repetitive work that you might find tedious. 6 | 7 | To do so, though, you need to know a bit about how computers process texts. In many ways, they have a hard time understanding data. They can interact with and use information, but they make very few assumptions and even fewer interpretations about what they're working with. Any interpretive abilities that they do have were specifically programmed into their software. So what follows, then, is a lesson in not taking anything for granted.
8 | 9 | In the context of text analysis, all of this means that computers do not read with the same ease that we do. Consider the following sentence: 10 | 11 | "We saw 8½." 12 | 13 | Taken alone, the sentence doesn't tell us much. Its meaning depends a lot on the question to which we might be responding, and we can think of two possible questions with very different contexts: 14 | 15 | > "How many movies did you see?" 16 | 17 | > "What movie did you see?" 18 | 19 | In the first case, we might be responding with the number of movies that we had seen. It was a slow weekend, and we spent it at the local movie theatre hopping from film to film. It was a great time! In the second situation, we might be responding with the title of a specific film, [*8½* by Italian director Federico Fellini](https://en.wikipedia.org/wiki/8%C2%BD). So one answer is a number, and one answer is a name. Since humans are good at grasping context, we would easily be able to distinguish between the two. In most situations, we would just adjust our understanding internally before moving on with the conversation. 20 | 21 | Computers cannot make inferences like these, and this fact has serious implications: numbers and words have significantly different uses. Here are two further extensions of the conversation: 22 | 23 | > If you add four to how many movies you saw, what is the result? 24 | 25 | If we were talking about a number of movies, our response would clearly be, "Oh, that's 12.5. Why are you giving me a math quiz?" If we were talking about the Fellini film, we might respond, "What? Oh, we were talking about a title, not a number. We can't add things to a title." Again, humans have the ability to respond to context, infer, and adapt. Computers aren't nearly as flexible: they need to know ahead of time, in most cases, what kind of information they are dealing with. That way they can act as you anticipate. 26 | 27 | Programmers have developed conventions for telling computers to distinguish between these different kinds of information, or **data types**. The distinction we outline above contains the two most important ones for our purposes here: 28 | 29 | * **Strings**: characters, the stuff of words 30 | 31 | * **Integers**: whole numbers 32 | 33 | The misunderstanding about films depends on a confusion around data types like these. If you go on to learn how to program, you might find slightly different names depending on the programming language, and you will be introduced to other data types as well. But the distinction between strings and integers is important for text analysis. You can perform arithmetic operations on integers, while strings respond less well to such things. You can capitalize words, but not numbers. And computers generally want you to deal with similar objects: you can combine strings (words can become sentences) or add numbers, but trying to combine a string and an integer will break things. 34 | 35 | But notice that our beginning scenario hinged on the ambiguity between strings and integers. How does a computer know whether we are talking about strings or about integers in cases where they could refer to either? How does it know that we want 8 to function as a word and not as a number in this context? 36 | 37 | Programmers over the years have built a variety of functions and tools into different languages to get around some of these difficulties, but they still remain. When processing text by a computer, we have to account for such problems. We generally do this by following very strict guidelines for inputting information. This **syntax** works in much the same way as grammar does for humans - helping the computer to keep track of what we mean and what we want it to do. 38 | 39 | In this case, we can tell the computer that something is a string or not by the presence or absence of quotation marks: 40 | 41 | * 8 vs "8" 42 | 43 | The computer looks at those quotation marks and can recognize the difference in data types: 44 | 45 | * A number without quotation marks? That's an integer. 46 | * Ah, quotation marks. That means I'm looking at a string.
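To see the difference in action, here is a minimal sketch in Python; the variable names are ours, invented for illustration:

```
title = "8"   # quotation marks: a string, the stuff of words
count = 8     # no quotation marks: an integer, a whole number

print(count + 4)        # arithmetic works on an integer: prints 12
# print(title + 4)      # mixing a string and an integer breaks things:
                        # Python would stop here with a TypeError

print(int(title) + 4)   # converting "8" to 8 first makes it work: prints 12
```

Utilities like `int()` and `str()` move values between the two data types - exactly the sort of wheel that, as we describe below, someone has already invented for you.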
47 | 48 | Programming and text analysis more generally are built on such subtle distinctions. A computer needs to have its hand held in order to recognize difference and similarity. To a computer, the following are entirely unrelated: 49 | 50 | * 8 ≠ "8" ≠ "Eight" ≠ "Eighth" 51 | 52 | The computer would not recognize the relationships among those four clearly related words. It goes even further: computers think of lowercase and capital letters as different characters entirely. 53 | 54 | "H" ≠ "h" 55 | 56 | These differences can be extremely frustrating when you are beginning to practice text analysis, but don't worry: you don't have to reinvent the wheel. You benefit from years of programmers developing ways to account for these things. In any programming context, you probably have access to a range of utilities to capitalize things, lowercase them, convert integers to strings, convert date timestamps into words, etc. What this means is that sometime, years ago, someone first invented that wheel for you. A diligent programmer came along and told the computer that "h" and "H" have a special relationship and taught it how to navigate that relationship. You can benefit from their work. 57 | 58 | But there are advantages to these rigid restrictions. By following them, we can get very detailed information about texts that we might otherwise gloss over. The first part of any text analysis project involves converting complex language into organized data that the computer can understand. This step involves smoothing out problematic bits and filling in any gaps, all with an eye to the issues outlined above and in the chapter on "[Data Cleaning](/data-cleaning.md)." 59 | 60 | "This is a sentence" ≠ "This" "is" "a" "sentence" 61 | 62 | A computer would not recognize the two sides of that statement as being equivalent. The left side, after all, contains spaces, and the right side contains a series of smaller strings, since each word is in quotation marks. Annoying? Maybe. But also useful! After all, we are rarely interested in whole sentences. We commonly refer to individual words as **tokens**, and the process of breaking sentences into words is called **tokenization**. This allows us to structure our text into a collection of pieces that we can manipulate more easily. Tokenization almost always breaks apart punctuation from words into their own tokens. So "I heard him!" would become "I", "heard", "him", "!". Punctuation tokens may or may not be thrown away, depending on whether they are tokens that you care about. 63 | 64 | We can break things down even further once we've divided a text into individual words. While we often care about how many times each particular token or word occurs, we might also care about the different kinds of words. We might want to keep track, on the one hand, of all the different words in a text regardless of how often they occur. But we might also want a different kind of vocabulary list. Rather than counting all the words, we might just want to grab a single example of each token **type**. If we have the following document: 65 | 66 | > Test test test sentence sentence 67 | 68 | We have five tokens and two types ('test' and 'sentence').
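A minimal sketch in Python shows how little code this distinction takes; splitting on spaces is the simplest possible tokenizer, and, as noted above, real ones treat punctuation more carefully:

```
document = "Test test test sentence sentence"

# Lowercase first, so that "Test" and "test" count as the same word.
tokens = document.lower().split()

print(tokens)        # ['test', 'test', 'test', 'sentence', 'sentence']
print(len(tokens))   # 5 tokens

types = set(tokens)  # a set keeps a single example of each token
print(len(types))    # 2 types: 'test' and 'sentence'
```

Notice that we lowercased the text before counting its types, for reasons the next paragraphs take up.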
A list of types might be good for getting a sense of the kinds of language used in a text, while a raw list of tokens could be useful for figuring out what kinds of words occur in which proportions. Depending on our research questions and interests, statistics like these can help us figure out what the document discusses as well as how it is being discussed. 69 | 70 | If sentences are broken up into words, we might also care about breaking documents into sentences first. We have a name for that too: **segmentation**. 71 | 72 | > "But wait," you say, "computers care about capitalization. So if we tokenize a text and try to compare 'word' and 'Word' they will think they are entirely different things!" 73 | 74 | Good catch! You're right, those differences in capitalization often aren't meaningful. It is a fairly common practice to lowercase all the words after you tokenize them. This process is often called **normalizing** the data, since we are smoothing out inconsistencies that might get in the way of our research questions. This whole collection of processes of segmentation, tokenization, and normalization has a name of its own as well: **preprocessing**, all those things you do to data before you work with it. Depending on your interests, you might include other steps, such as tagging tokens for parts of speech or filtering out particular types of words. Note that preprocessing can change your list of types. A computer would not recognize "Said" and "said" as being of the same type, but, if you normalize capitalization so that every token is lowercased, a computer would see them as the same word. So you often need to decide what pieces you care about at the beginning of the process. 75 | 76 | Textual data is messy, and it requires a lot of work to convert it into usable material. Very often, the process involves even more steps than those that we outline here. But once you get a handle on the fixed set of methods for doing so, a whole world of possibility opens up. After all, the internet is *filled* with unstructured textual data, and we could learn a lot about our world by analyzing it. This field of study is referred to as **natural language processing** ("natural language" refers to human languages like English, French, Arabic or Chinese, as opposed to computer languages, which were invented). A wide range of careers are built upon these foundations, in fields across the sciences, medicine, government, and more. The world needs people who can make sense of our textual world. You could be one of them. 77 | -------------------------------------------------------------------------------- /cyborg-readers/exercises.md: -------------------------------------------------------------------------------- 1 | # Exercises 2 | 3 | * How many tokens are in the following sentence? How many types? 4 | 5 | > 'Speak!' cried Todd, 'speak! and speak the truth, or your last hour is come! 6 | 7 | * Write out a normalized, tokenized version of the sentence. 8 | 9 | Upload the text for _The String of Pearls_ available [here](https://raw.githubusercontent.com/bmw9t/introduction-to-text-analysis/master/assets/the-string-of-pearls-full.txt) into [_Voyant_](http://voyant-tools.org). Analyze the results.
If things seem particularly slow, you can try working with a smaller chunk of the text. 10 | 11 | * Use Voyant to examine gender in the text. What kind of words do you need to look at? Which parts of Voyant? Make some observations about your findings \(3-5 sentences\). Feel free to include a screenshot of the visualizations to help describe your observations. 12 | * How would you measure moments of heightened suspense in the text? Take a spin at doing it if you think you have a solid idea. Or simply theorize in 3-5 sentences. 13 | 14 | Now upload the text for the various articles on [_Lloyd's Weekly Newspaper about The Hampstead Murders_](http://vrchristensen.com/category/newspaper-articles/lloyds-weekly-newspaper/) to Voyant and analyze them. This is the coverage of a late nineteenth-century murder case with a female victim and perpetrator. 15 | 16 | * What is one other thing that you notice about the word cloud for this text? How might you back up these claims and interpretations if you were to read this series of articles? 3-5 sentences. 17 | 18 | 19 | -------------------------------------------------------------------------------- /cyborg-readers/voyant-part-one.md: -------------------------------------------------------------------------------- 1 | # Voyant Part One 2 | 3 | We will be using a tool called [Voyant](http://voyant-tools.org/) to introduce some basic topics in text analysis using cyborg readers. 4 | 5 | Upon arriving at Voyant you will encounter a space where you can upload texts. For the following graphs, we have uploaded the full text of _The String of Pearls_, the 1846-1847 penny dreadful that featured Sweeney Todd, the demon barber of Fleet Street. Feel free to [download that dataset](/assets/the-string-of-pearls-full.txt) and use it to produce the same results and follow along, or upload your own texts using the window provided. 6 | 7 | ![Voyant splash page and text uploader](/assets/cyborg-readers/voyant-splash-page.jpg) 8 | After Voyant processes your text you'll get a series of window panes with lots of information. Voyant packs several features into one tight digital package: each pane offers you different ways of interacting with the text. 9 | 10 | ![default view of string of pearls in voyant](/assets/cyborg-readers/voyant-overview.jpg) 11 | 12 | Voyant gives you lots of options, so do not be overwhelmed. Voyant provides [great documentation](http://docs.voyant-tools.org/start/) for working through its interface, and we will not rehearse it all again here. Instead, we will just focus on a few features. The top left pane may be the most familiar to you: 13 | 14 | ![voyant default wordcloud of string of pearls](/assets/cyborg-readers/voyant-word-cloud-default.jpg) 15 | 16 | Word clouds like these have been made popular in recent years by [Wordle](http://www.wordle.net/). They do nothing more than count the different words in a text: the more frequently a particular word appears, the larger its presence in the word cloud. In fact, Voyant allows you to see the underlying frequencies that it is using to generate the cloud if you click the "Corpus Terms" button above the word cloud. 17 | 18 | ![underlying corpus term frequency](/assets/cyborg-readers/voyant-term-frequencies.jpg) 19 | 20 | Concordances like these are some of the oldest forms of text analysis that we have, and computers are especially good at producing them.
In fact, a project of this kind is frequently cited as one of the origin stories of digital humanities: [Father Roberto Busa's massive concordance of the works of St. Thomas Aquinas](http://www.historyofinformation.com/expanded.php?id=2321), begun on punch cards in the 1940s and 1950s. It was one of the first works of its kind and was instrumental in expanding the kinds of things that we could use computers to do. 21 | 22 | Busa's work took years. We can now carry out similar searches in seconds, and we can learn a lot by simply counting words. The most frequent words, by far, are 'said' and 'Todd', which makes a certain amount of sense. Many characters might speak and, when they do, they are probably talking about or to the central character, if they aren't Todd himself. 23 | 24 | ![voyant settings](/assets/cyborg-readers/voyant-settings.jpg) 25 | 26 | Notice the words that you do not see on this list: words like 'a' or 'the.' Words like these, what we call **stopwords**, are _so_ common that they are frequently excluded from analyses entirely, the reasoning being that they become something like linguistic noise, overshadowing words that might be more meaningful to the document. To see the words that Voyant excludes by default, hover next to the question mark at the top of the pane and click the second option from the right. 27 | 28 | Use the dropdown list to switch from 'auto-detect' to 'none.' Now the concordance will show you the actual word frequencies in the text. Notice that the frequency of 'said', the number one result in the original graph, does not even come close to the usage of articles, prepositions, and pronouns. 29 | 30 | ![concordance with no stopwords](/assets/cyborg-readers/stopword-free-concordance.jpg) 31 | 32 | Words like these occur with such frequency that we often need to remove them entirely in order to get meaningful results. But the list of words that we might want to remove changes depending on the context. For example, language does not remain stable over time. Different decades and centuries have different linguistic patterns for which you might need to account. Shakespearean scholars might want to use an [early modern stopword list](/assets/early-modern-stopwords.txt) provided by Stephen Wittek. You can use this same area of _Voyant_ to edit the stoplist for this session of Voyant. Doing so will give you greater control over the tool and allow you to fine-tune it to your particular research questions. 33 | 34 | There are some instances in which we might care a lot about just these noisy words. They can tell us _how_ an author writes: those very words that might seem not to convey much about the content are the building blocks of an author's style. Tamper with someone's use of prepositions or pronouns and you will quickly change the nature of their voice. 35 | 36 | Let's return to the word cloud. Using the slider below the word cloud, you can reduce or expand the number of terms visible in the visualization. Slide it all the way to the right to include the maximum number of words. 37 | 38 | ![voyant word cloud dense](/assets/cyborg-readers/voyant-word-cloud-dense.jpg) 39 | 40 | Just like the stopword list can be used to adjust the filters to give you meaningful results, this slider adjusts the visualization that you get. It should become clear as you play with both options that different filters and different visualizations can give you radically different readings. The results are far from objective: your own reading, the tool itself, and how you use it all shape the data as it comes to be known.
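You can watch the same thing happen outside of Voyant. Here is a minimal sketch in Python of the counting and filtering at work; the sentence and the deliberately tiny stopword list are ours, made up for illustration:

```
from collections import Counter

sentence = "He said that it was the end of the story, and that was that."
stopwords = {"he", "that", "it", "was", "the", "of", "and"}

tokens = sentence.lower().replace(",", "").replace(".", "").split()

# Without a stoplist, function words dominate the counts.
print(Counter(tokens).most_common(3))
# [('that', 3), ('was', 2), ('the', 2)]

# Filtering changes the results entirely.
filtered = [token for token in tokens if token not in stopwords]
print(Counter(filtered).most_common(3))
# [('said', 1), ('end', 1), ('story', 1)]
```

Change the stopword list and the 'top words' change with it, which is exactly why the same text can yield such different word clouds.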
41 | 42 | This is a good reminder that you should not think of digital tools as gateways to fixed and clear truths, either about historical periods or individual texts. Voyant may seem somehow objective in that it produces mathematical calculations and data visualizations, but now you've seen that you can easily alter the results. These techniques of "distant reading" are the same as the models of "close reading" we talked about earlier in that both only lead to asking more questions and positing more interpretations. 43 | 44 | That is a good thing. 45 | 46 | ## Interpreting Word Clouds 47 | 48 | Given that what's important about word clouds is not producing visualizations so much as interpreting the results, you might ask: what does this help us learn about _The String of Pearls_? 49 | 50 | For one, looking at these word clouds suggests that much of the vocabulary of this novel either refers to or exists in the context of speech. One of the most prominent words is "said," but you also see "say" and "speak" and words like "yes," "I'll," and "oh" which probably -- although not necessarily -- come from written dialog. 51 | 52 | Additionally, most of the words are short, one or two syllables long. Penny dreadfuls like _The String of Pearls_ were aimed at working-class audiences: could the prevalence of these relatively 'simple' words reflect the audience of the text? In order to substantiate such a claim, we would probably want to look at other publications of the period to see whether or not this vocabulary was typical. 53 | 54 | If we load Arthur Conan Doyle's "[A Scandal in Bohemia](http://www.gutenberg.org/files/1661/1661-h/1661-h.htm#1)" into Voyant, you can see that we get quite different results. \(Again, feel free to follow along.\) 55 | 56 | ![scandal in bohemia word cloud](/assets/cyborg-readers/scandal-in-bohemia-word-cloud.jpg) 57 | 58 | A quick glance shows that the most common words tend to be longer than those in _The String of Pearls_. Indeed, the three-syllable "photograph" is one of the most frequently used terms in this short story, one written for a middle-class as opposed to working-class audience. So maybe the simple vocabulary of the penny dreadful _is_ related to the nature of its readership. 59 | 60 | But let's not stop there! You may also notice that the word cloud for "A Scandal in Bohemia" has a lot of words related to high status: "king," "majesty," "gentleman," and "lady," for instance. In contrast, with the possible exception of the words "colonel" and "sir" in _The String of Pearls_, there are hardly any words in this novel that refer to rank. This gives you some indication that these two works are set in different social milieus in London. 61 | 62 | Alternatively, the types of words in these two works are not at all the same. The word cloud for _The String of Pearls_ contains a lot of verbs \("shall," "said," "come," "know," "suppose," "thought"\), whereas that for "A Scandal in Bohemia" is made up of a lot of nouns, particularly those referring to places \("room," "house," "street," "lodge," "window," and "address"\). This is an interesting thing to note, but you still want to think about what this means about the two different texts. Perhaps _The String of Pearls_ is more concerned with action and the excitement of people doing things than "A Scandal in Bohemia," where the emphasis is on moving through and exploring different spaces.
Or maybe all of these observations are artifacts of the visualizations, things that the word clouds suggest but that might not actually hold up under scrutiny of the data. More thinking is needed! 63 | 64 | Some of these conclusions were probably pretty obvious as you read these two works \(or portions of them\). You probably picked up the fact that _The String of Pearls_ is set in working-class London, whereas "A Scandal in Bohemia" takes place in a more elevated milieu. You might even have noticed a difference in vocabulary, even if using Voyant made these distinctions more apparent and gave you further data to back up any claims you were making about them. But you probably didn't notice the emphasis on action vs. the importance of place in these two works. So this is a good example of how reading with one eye to the computer can lead you to new interpretations. 65 | 66 | ## Further Resources 67 | 68 | * Geoffrey Rockwell and Stéfan Sinclair, the creators of Voyant, have a great book on using it for text analysis: *[Hermeneutica](https://mitpress.mit.edu/books/hermeneutica)*. 69 | 70 | * Shawn Graham, Ian Milligan, and Scott Weingart have an excellent introduction to working with humanities data in *[Exploring Big Historical Data: The Historian's Macroscope](http://www.themacroscope.org/2.0/)*. -------------------------------------------------------------------------------- /data-cleaning.md: -------------------------------------------------------------------------------- 1 | # Data Cleaning 2 | 3 | * [Problems with Data](/data-cleaning/problems-with-data.md) 4 | 5 | * [Zotero](/data-cleaning/zotero.md) 6 | 7 | * [Exercises](/data-cleaning/exercises.md) -------------------------------------------------------------------------------- /data-cleaning/exercises.md: -------------------------------------------------------------------------------- 1 | # Exercises 2 | 3 | 1. Take all the readings you've done so far for this class and pull the sources into Zotero. 4 | 5 | 2. In a new document, practice using Zotero by adding citations for each of the sources to sample sentences. 6 | 7 | 3. Add a Zotero bibliography at the end of your document. 8 | 9 | 4. [Register](https://www.zotero.org/user/register/) with Zotero to create an account. This will allow you to participate in groups, which you will do for your final projects. 10 | 11 | 12 | -------------------------------------------------------------------------------- /data-cleaning/problems-with-data.md: -------------------------------------------------------------------------------- 1 | # Problems with Data 2 | 3 | So you have a text. You want to do something with it. It might be tempting to dive in and start using one of the tools in this book, but you should take a moment to examine the materials you are working with. Not all text is created equal, and your results can have real problems if you don't take care to examine the quality of the materials before you work with them. 4 | 5 | The basic principle to remember is **garbage in, garbage out (or GIGO)**: you won't get good results unless you have good data to begin with. 6 | 7 | ## OCR 8 | 9 | ![sherlock holmes article clipping](/assets/data-cleaning/holmes.jpg) 10 | 11 | Take this image, drawn from a 1922 printing of *[The Duluth Herald](https://archive.org/details/duluthherald10311922unse)*, of a newspaper ad for the American film version of Sherlock Holmes. 12 | 13 | By default, the computer has no idea that there is text inside of this image.
For a computer, an image is just an image, and you can only do image-y things to it. The computer could rotate it, crop it, zoom in, or paint over parts of it, but your machine cannot read the text there - unless you tell it how to do so. In fact, the computer doesn't even really know that there *is* text there. As far as it's concerned, an abstract painting and an image like this contain the same amount of textual information. The computer requires a little extra help to pull out the text information from the image. 14 | 15 | The process of using software to extract the text from an image of a text is called **optical character recognition** or OCR. We occasionally use OCR as a noun, as in "the OCR for that document is pretty poor" or as a verb, as in "we need to OCR this text before we can process it." There are many tools that can generate OCR for a text, and some of them are proprietary, meaning you have to pay for the ability to use them. All of these tools are only so good at the process: what is easy for you requires a lot of computing power to carry out effectively.

![ocr'd sherlock holmes text](/assets/data-cleaning/holmes-ocr-text.jpg)

Running this image through tesseract, a common free tool for OCR'ing text, we get the computer's best garbled attempt at translating image into text, shown above.
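If you would like to try this yourself, here is a minimal sketch of how tesseract is commonly called from Python through the pytesseract wrapper. It assumes that tesseract itself and the pillow and pytesseract packages are already installed, and the filename is a made-up stand-in:

```
from PIL import Image   # pillow, for opening image files
import pytesseract      # a Python wrapper around the tesseract program

# Open the scanned page and ask tesseract for its best guess at the text.
image = Image.open("holmes-advertisement.jpg")
print(pytesseract.image_to_string(image))   # the raw, likely messy, output
```

Tesseract does all of the heavy lifting here; the wrapper simply hands it the image and collects whatever text it returns.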
The material in that output is still recognizable as being part of the same text, though there are obvious problems with the reproduction. At first blush, you might think, "This should be easy! I learned to read a long time ago. I can even read things written in cursive! Why does the computer have such a hard time with this?" This is one of those instances where what is no trouble for you is much harder for a computer. Humans are great at pattern recognition, which is essentially what OCR is. Computers, not so much. 25 | 26 | OCR'ing text is actually a pretty complicated problem for computers. [WhatFontis.com](http://www.whatfontis.com) lists over 342,000 fonts, and this count only appears to include Western fonts. A single word will look slightly different in each font and at each size. And that doesn't even begin to account for hand-written text or text that has been partially damaged: even a slight imperfection in a letter can complicate the scanning process. The process takes a lot of work: even the most expensive OCR software is prone to errors. If you see clean text transcriptions of an image online, odds are high that a human cleaned up the OCR to make it readable. 27 | 28 | ## Data Cleaning 29 | 30 | Let us say it again: computers cannot infer. Imagine this scenario: 31 | 32 | We're going to count to ten! 33 | 34 | 1,2,3,4,5,6,7,8,10 35 | 36 | You probably meant to have a 9 in there, and a human reading it would most likely know that there was a mistake. But the computer will have no idea that you accidentally left out a number. You would have to specifically tell it to account for such errors. This simple fact about computational logic becomes a big problem in the humanities, because humanities data is _messy_. To see what we mean, go check out the Wikipedia section on Sir Arthur Conan Doyle's [name](https://en.wikipedia.org/wiki/Arthur_Conan_Doyle#Name). We will wait. Here is a picture of a cat in the meantime. Imagine it's a cat high fiving you when you clean up some data. 37 | 38 | ![high fiving cat](/assets/data-cleaning/data-cat-high-five.jpg) 39 | 40 | Did you read it? Promise? 41 | 42 | Doyle has a complicated naming history, to say the least. Now imagine you are putting together a database of authors. You get to Doyle. How will you save his name? We can think of a number of possibilities: 43 | 44 | ``` 45 | Arthur Doyle 46 | A.C. Doyle 47 | Doyle, A.C. 48 | Doyle, Sir Arthur 49 | Doyle, Sir Arthur Conan 50 | Sir Arthur Conan Doyle 51 | ``` 52 | 53 | You can probably imagine others. All of these are technically correct, and they might serve your purposes just fine. But you need to be consistent. Remember how computers cannot infer anything? Imagine this as part of your database of authors: 54 | 55 | ``` 56 | Author Name 57 | --- 58 | Austen, Jane 59 | James Joyce 60 | Arthur Conan Doyle 61 | ``` 62 | 63 | You are working with a number of formats: 64 | 65 | ``` 66 | Author Name 67 | --- 68 | Austen, Jane: last_name, first_name 69 | Arthur Doyle: first_name last_name 70 | ``` 71 | 72 | A computer program would need a way to understand what you are giving it, something like: 73 | 74 | 1. Look at this 'Author Name' database. 75 | 2. Each Author has a line of its own. 76 | 3. Get the Author's name. 77 | 78 | This data would cause all sorts of problems with the third step.
To begin, how does the computer get the names? There are two options here: 79 | 80 | * Look at the line for a comma. Before the comma, you will find the last name. After it, you will find the first name. 81 | * Look at the line for a space. Before the space, you will find the first name. After it, you will find the last name. 82 | 83 | The former is the more common way of representing data like this. Using commas to denote the different pieces of data is so popular that the format has its own name: **comma-separated value** or **csv**. It has an advantage over the second format that breaks apart data based on spaces: 84 | 85 | ``` 86 | Author Names 87 | --- 88 | Austen Jane 89 | Arthur Doyle 90 | Arthur Conan Doyle 91 | ``` 92 | 93 | If we used spaces to denote breaks between first name and last name, Arthur Conan Doyle would cause our program to fail. It would likely interpret 'Arthur' as the first name and 'Conan' as the last name. 'Doyle' would be an unknown. Reformatting this as a csv allows us to handle Conan Doyle's full name: 94 | 95 | ``` 96 | Author Names 97 | --- 98 | Austen, Jane 99 | Arthur, Doyle 100 | Arthur Conan, Doyle 101 | ``` 102 | 103 | The next problem should be obvious: Jane Austen is in a last\_name, first\_name format, while the others are in the reverse. So our final version of this dataset would look like this: 104 | 105 | ``` 106 | Author Names 107 | --- 108 | Austen, Jane 109 | Doyle, Arthur 110 | Doyle, Arthur Conan 111 | ``` 112 | 113 | We might go further to associate Arthur Doyle and Arthur Conan Doyle as being representations of the same person, a process known as **authority control**. A common way of referring to data that contains inconsistencies and/or errors is as **dirty data**. To keep the metaphor, then, the process of revising data to remove such problems and prepare it for use is called **data cleaning**. 114 | 115 | ## Metadata 116 | 117 | If you have ever searched for a book using a library search interface, you have interacted with metadata categories. **Metadata**, in its most basic sense, is data about data. A text, after all, is more than just the words on the page. We have a whole range of other information that we use to describe the document. The author, its date of publication, its publisher, its copyright status, etc.: we might care deeply about these pieces of information, and we might want to use them for particular analyses. These categories allow us to do things like search for books with particular titles from particular time periods. In our previous example, we were actually working with metadata without realizing it. 118 | 119 | ``` 120 | Author Names 121 | --- 122 | last_name, first_name 123 | Austen, Jane 124 | Doyle, Arthur 125 | Doyle, Arthur Conan 126 | ``` 127 | 128 | We have two metadata categories here: last\_name and first\_name, separated by a comma. We might even think of author\_name as being its own metadata category for someone else's list of books! Databases are really these sorts of things at their heart: data and metadata, organized in systematic ways to make them easily usable. 129 | 130 | Imagine you have started to put together your own table of author names and you notice that your neighbor is putting together one of her own. You want to be able to compare notes and, even more, you want to combine lists. It should be obvious that you will have real problems if you organize things as "first\_name last\_name" and she organizes things as "last\_name, first\_name". You would need to do a lot of extra work to merge your two lists.
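To make the problem concrete, here is a minimal sketch in Python of the extra work involved; the lists and the normalize function are ours, invented for illustration:

```
# Your list uses "last_name, first_name"; hers uses "first_name last_name".
yours = ["Austen, Jane", "Doyle, Arthur"]
hers = ["James Joyce", "Arthur Conan Doyle"]

def normalize(name):
    if "," in name:
        return name  # already in "last, first" order
    # Otherwise, guess that everything after the final space is the last
    # name - a guess that will fail for plenty of real names.
    first, last = name.rsplit(" ", 1)
    return last + ", " + first

print(sorted(normalize(name) for name in yours + hers))
# ['Austen, Jane', 'Doyle, Arthur', 'Doyle, Arthur Conan', 'Joyce, James']
```

Even after all that, 'Doyle, Arthur' and 'Doyle, Arthur Conan' still sit in the merged list as two separate people - the authority control problem described above.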
It would have been easier if you were working with an accepted standard for how author names should be listed. 131 | 132 | Such metadata standards exist, and a lot of work goes into maintaining them \(check out [Dublin Core](http://dublincore.org/specifications) if you are interested in learning more\). These standards ensure that anyone producing a new dataset creates work that could easily translate and communicate with other systems. They ensure that your local library's data could eventually be drawn into the [Digital Public Library of America](https://dp.la) and made available on a large scale. The process might seem easy with this basic author name example, but imagine trying to coordinate such metadata standards for all people working on all types of cultural objects, all over the world. The work never ends. 133 | 134 | You can fall down a deep pit looking at all the different metadata standards and their uses. For now, we just want you to be familiar with the concepts. 135 | 136 | ## Further Resources 137 | 138 | * Chris Woolford has a more detailed explanation of how OCR works at [explainthatstuff.com](http://www.explainthatstuff.com/how-ocr-works.html). 139 | 140 | -------------------------------------------------------------------------------- /introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | * [For Instructors](/introduction/for-instructors.md) 4 | * [For Students](/introduction/for-students.md) -------------------------------------------------------------------------------- /introduction/for-instructors.md: -------------------------------------------------------------------------------- 1 | # For Instructors 2 | 3 | This coursebook provides a brief introduction to digital text analysis through a series of three-part units. Each unit introduces a concept, a tool for digital text analysis \(or case studies based on the associated concept\), and then provides a series of exercises for practicing the new skills. Our intended audience is students who have no background in programming, text analysis, or digital humanities. 4 | 5 | We designed this book with three goals: 6 | 7 | First, we wanted to provide materials for a text analysis course that does not require extensive training in programming. Courses in text analysis with R or Python are valuable and have their place, but many concepts in text analysis can be covered through a tools-based approach. In part, this decision was made due to time restrictions. These particular materials developed as companion pieces to the equivalent of a one-credit digital humanities lab for a three-credit history course at Washington and Lee University. Thus, the amount of time available for instruction in digital humanities and programming was minimal. Choosing tools instead of languages, we hoped, would allow for the exploration of more disciplinary material than we might otherwise have time for. Accordingly, we introduce concepts and methods gradually over the course of the term. While some of these tools are more difficult to use than others, the book requires minimal prior experience with programming to work through these materials. In the course of the book, however, we introduce basic programming concepts necessary to working with unstructured data in a natural language processing context.
If anything, we hope this book will provide a taste of what can be gained from further study that _does_ use programming.
8 | 
9 | Second, we wanted to provide a set of materials that could be reusable in other contexts. In this, we were inspired by Shawn Graham's course workbook on [Crafting Digital History](http://workbook.craftingdigitalhistory.ca/). Our own workbook was originally developed for a course in nineteenth-century European cultural history, and it draws from these course materials for its datasets and discussions. As much as possible, we tried to separate text analysis discussions from the disciplinary content specific to our course. Some overlap was necessary to enmesh the two portions of the course together. But the tripartite sequence in each unit - concept, case study, practice - is intended to modularize the book enough that it could be used in other courses and contexts. This book and its contents are licensed under a
10 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License, meaning that you are free to share and remix any part of the work under those conditions. The book's materials are available on [GitHub](https://github.com/bmw9t/introduction-to-text-analysis), where they can be copied and repurposed. Sections, especially our course-specific exercises, can be easily skipped and replaced with different tools or content. For the ambitious, you could even remix your own version that includes portions of the book and host your own [GitBook](http://gitbook.com) course site. For more guidance on how to do so, see [Adapting This Book for Another Course](/conclusion/adapting.md), but make special note of our prefatory warning about the instability of the GitBooks platform at this time.
11 | 
12 | Third, the book is an experiment in open, versioned, collaborative writing. In this, we were particularly inspired by the work of Robin DeRosa on [The Open Anthology of Earlier American Literature](https://openamlit.pressbooks.com/). Our text was composed using a variety of technologies and practices relevant to digital humanities: markdown, HTML/CSS, version control, GitHub, and more. The authors had varying degrees of familiarity with these topics, and this book served as an object lesson in how to generate new research and teaching materials while also developing new skillsets. The [GitBook Editor](https://www.gitbook.com/editor/osx), in particular, was crucial for enabling us to polish technical skills in a way that did not detract from the forward momentum of writing. The two authors are in different fields \(Brandon Walsh is in English and Sarah Horowitz is in History\); accordingly, you will see vocabulary and examples that come from our different disciplinary backgrounds. But as we stress to students, although we may at times use different terms for essentially the same thing \(close reading vs. primary text analysis\) and have different knowledge bases, we are united by the same interest in using text analysis to explore meaning and context.
13 | 
14 | The workbook is not meant to exhaust the topic of digital text analysis: quite the contrary. If you have more than one credit of class time at your disposal, you will have much more room to navigate and explore. If your students come in with a base level of programming knowledge, you will likely be able to skip portions of the book. The book provides only one, surface-level introduction to text analysis.
There are many other approaches to the topic, and we reference some of our favorites in the "[Further Resources](/conclusion/resources.md)" section of the concluding chapter. But we hope that some will build upon these materials and find the examples laid out here to be useful in developing their own courses and workshops.
15 | 
16 | 
-------------------------------------------------------------------------------- /introduction/for-students.md: --------------------------------------------------------------------------------
1 | # For Students
2 | 
3 | This is a workbook about text analysis. Many of you are probably used to analyzing texts in one form or another, whether that be by carefully considering the parts of a literary text or thinking about the words in a historical document. But even though we'll be doing both those things, we are using the phrase "text analysis" in a slightly different fashion: to talk about how we can use computers to help analyze texts in new ways.
4 | 
5 | Text analysis is often understood as one of the methodologies of the **digital humanities**, alongside other activities like creating digital exhibits and online mapping. We'll talk a lot \(and you'll read a lot\) about digital humanities in class. Essentially, this refers to how we are using computers and new technology in the humanities. Laura Mandell offers one helpful definition of what the digital humanities are in an [interview with the *LA Review of Books*](https://lareviewofbooks.org/article/digital-humanities-interview-laura-mandell/):
6 | 
7 | > But the best definition of the digital humanities, I think, is bringing digital methods to bear on humanities research and then interrogating the digital humanities by humanities research.
8 | 
9 | In the [same interview series](https://lareviewofbooks.org/article/digital-humanities-interview-ted-underwood/), Ted Underwood describes digital humanities as "a vague interest in technology." We will keep a kindred definition in mind as we move forward: the digital humanities involves using technology to think critically _and_ thinking critically about technology. We will use new tools to think about old texts, and we will explore the applications, perils and pitfalls of these new methods.
10 | 
11 | You may have heard of the term **big data** to describe how scholars, businesses and governments are using vast amounts of data to understand the complexities of human behavior. What you will do in this class is learn how you can use texts in these same ways. But at the same time, we want to introduce you to some of the ways that seemingly objective "facts" and "findings" can be misleading and the product of prejudice, error and/or flawed design. Humanities students are often very good at understanding the biases and assumptions of a text. You might not necessarily be as versed in doing the same with statistical models or charts and graphs, but we hope this class will give you some experience in doing so. You will get some exposure to working with textual data and also learn about what you can contribute to these conversations.
12 | 
13 | You may be wondering: what is this thing called **the humanities**? At one level, this is just a group of disciplines or fields of study, one that often includes literature, philosophy, history, art history, and religion, and is distinct from the social sciences \(politics, economics, psychology\) and the natural sciences \(biology, chemistry, physics\).
If you search the internet, you can probably find thousands of different definitions of what unites the students and scholars in these different fields. One that we particularly like is from Daniel Boyarin and Michael Nylan \(in religion and history, respectively\). [They propose](http://www.npr.org/sections/13.7/2015/10/26/452003593/the-humanities-what-s-the-big-idea) that the humanities examine:
14 | 
15 | > the different ways that human beings have chosen or been able to live their lives as human beings.
16 | 
17 | They also suggest that what unites the humanities is a common methodology:
18 | 
19 | > The primary method for the study of humans through the investigation of their cultural products is _interpretation_....I would say that the greatest difference, as far as I understand scientific method, is that for us hypotheses emerge from the data as we study and interpret, and are constantly being modified and corrected, while the natural sciences seem to begin with hypotheses that they test.
20 | 
21 | This class is taught by an English professor and a History professor. You'll probably notice that we have slightly different approaches and knowledge bases and occasionally use somewhat different terminology to describe the same things. But fundamentally, we work in many of the same ways: reading and analyzing texts and thinking about meaning and cultural context. You do too, whether you realize it or not, and this class aims to help you do so in different ways.
22 | 
23 | ## For Students in History 211
24 | 
25 | We suggest that you read this coursebook online as opposed to downloading it as a PDF, Mobi or ePub, since some of the embedded material will only show up online. Additionally, we may make changes to the book during the course of the term, so you want to make sure you are reading the most up-to-date version of this book.
26 | 
27 | ## Further Resources
28 | 
29 | * The LA Review of Books series "[The Digital in the Humanities](https://lareviewofbooks.org/feature/the-digital-in-the-humanities)" contains interviews with many luminaries in the field and can be a good introduction for further reading as to just what this baggy field is. We especially like the interviews of [Bethany Nowviskie](https://lareviewofbooks.org/article/digital-humanities-interview-bethany-nowviskie) and of [Jessica Marie Johnson](https://lareviewofbooks.org/article/digital-humanities-interview-jessica-marie-johnson).
30 | 
-------------------------------------------------------------------------------- /introduction/schedule.md: --------------------------------------------------------------------------------
1 | # Course Schedule
2 | 
3 | ## Note for Students in History 211
4 | 
5 | The following is the course schedule from the beginning of the term. It's very likely that we will change it during the course of the term, but we will not update the schedule here. Please consult Sakai for the official schedule.
6 | 
7 | ## Description
8 | 
9 | This course examines the intersection between scandal, crime and spectacle in 19th-century France and Britain. We will discuss the nature of scandals, the connection between scandals and political change, and how scandals and ideas about crime were used to articulate new ideas about class, gender and sexuality. In addition, this class will cover the rise of new theories of criminality in the 19th century and the popular fascination with crime and violence. Crime and scandal also became interwoven into the fabric of the city as sources of urban spectacle.
Lastly, we will have an opportunity to discuss how issues of crime, scandal and spectacle resonate in the 21st century. Some of the particular events and trends this class will cover include the Diamond Necklace Affair, the trial of Oscar Wilde, the Jack the Ripper murders, and the birth of detective fiction.
10 | 
11 | Through this course, students will be introduced to text analysis and data mining for the humanities. This course assumes no prior knowledge of these skills, but asks: how can newly developed technologies that allow computers to “read” large quantities of text shed light on the past? Students will work in groups throughout the course of the term to complete a digital history project that analyzes an element of the 19th century fascination with crime and scandal.
12 | 
13 | ## Schedule
14 | 
15 | ### Week 1
16 | 
17 | * Introductions
18 | 
19 | * Understanding Scandal
20 | * Ari Adut, _On Scandal_, Introduction and Chapter 1
21 | * Patrick Leary, "[Googling the Victorians](http://www.victorianresearch.org/googling.pdf)"
22 | * [Introduction](/introduction.md) and [Issues in Digital Text Analysis](/issues-in-digital-text-analysis.md) in this book
23 | 
24 | ### Week 2
25 | 
26 | * Scandal and Monarchy, Part I
27 | * Sarah Maza, “The Diamond Necklace Affair Revisited: The Case of the Missing Queen”
28 | * [Historical Essays on the Life of Marie–Antoinette of Austria](http://chnm.gmu.edu/revolution/d/262/)
29 | 
30 | 
31 | * Scandal and Monarchy, Part II
32 | * Tamara Hunt, “Morality and Monarchy in the Queen Caroline Affair”
33 | * Find two articles dating from the Queen Caroline Affair in the 19th Century British Newspapers Collection
34 | * [Close Reading](/close-reading.md) in this book
35 | 
36 | * **First Paper Due: Analysis of a Scandal**
37 | 
38 | ### Week 3
39 | 
40 | * Scandal and Sexuality, Continued
41 | * Ari Adut, _On Scandal_, Chapter 2
42 | * Edward Carson’s [Opening Speech for the Defense of Lord Queensberry](http://law2.umkc.edu/faculty/projects/ftrials/wilde/defenopening.htm)
43 | 
44 | 
45 | * The Spectacle of Punishment
46 | * Michel Foucault, _Discipline and Punish_, selections
47 | * [Crowdsourcing](/crowdsourcing.md) in this book
48 | 
49 | ### Week 4
50 | 
51 | * Crime and the City
52 | * Louis Chevalier, _Working Classes, Dangerous Classes_, selections
53 | * Henry Mayhew, _The London Underworld_, selections
54 | 
55 | 
56 | * Female Criminality
57 | * Lisa Downing, “Murder in the Feminine: Marie Lafarge and the Sexualization of the Nineteenth-Century Criminal Woman”
58 | * Cesare Lombroso, _Criminal Woman, the Prostitute and the Normal Woman_, selections
59 | * [Digital Archives](/archives.md) in this book
60 | 
61 | 
62 | * **Second Paper Due: Analysis of a Nineteenth-Century Archive**
63 | 
64 | ### Week 5
65 | 
66 | * Detection in the 19th Century
67 | * Simon Cole, _Suspect Identities_, Chapters 1 and 2
68 | 
69 | 
70 | * The Rise of Detective Fiction
71 | * Michael Saler, “’Clap if You Believe in Sherlock Holmes’: Mass Culture and the Re-Enchantment of Modernity, c.
1890-1940"
72 | * Arthur Conan Doyle, "[A Scandal in Bohemia](http://www.gutenberg.org/files/1661/1661-h/1661-h.htm#)"
73 | * [Data Cleaning](/data-cleaning.md) in this book
74 | 
75 | ### Week 6
76 | 
77 | * Violence and Entertainment, Part I
78 | * Rosalind Crone, _Violent Victorians_, Chapters 1 and 3
79 | * [_The String of Pearls_, Chapters 36-39](http://www.victorianlondon.org/mysteries/sweeney_todd-00.htm)
80 | * Franco Moretti, “Graphs” from _Graphs, Maps, Trees_
81 | 
82 | 
83 | * Violence and Entertainment, Part II
84 | * Rosalind Crone, _Violent Victorians_, Chapter 6
85 | * Find an article on a 19th century murder from the _Times_ from the [_Dictionary of Victorian London_](http://www.victorianlondon.org/index-2012.htm)
86 | * [Cyborg Readers](cyborg-readers.md) in this book
87 | 
88 | 
89 | * **Final Group Project Proposals Due**
90 | 
91 | ### Week 7
92 | 
93 | * The Spectacle of the City, Part I
94 | * Vanessa Schwartz, _Spectacular Realities_, Chapter 1
95 | 
96 | 
97 | * The Spectacle of the City, Part II
98 | * Vanessa Schwartz, _Spectacular Realities_, Chapters 2 and 3
99 | * [Reading at Scale](/reading-at-scale.md) in this book
100 | 
101 | ### Week 8
102 | 
103 | * Sex and the City
104 | * Judith Walkowitz, “Male Vice and Feminist Virtue: Feminism and the Politics of Prostitution in Nineteenth-Century Britain”
105 | * W.T. Stead, “The Maiden Tribute of Modern Babylon"
106 | 
107 | 
108 | * Sex and Death in the City
109 | * Judith Walkowitz, “Jack the Ripper and the Myth of Male Violence”
110 | * Find two articles on Jack the Ripper from [Casebook: Jack the Ripper](http://www.casebook.org/press_reports/)
111 | * [Topic Modeling](/topic-modeling.md) in this book
112 | 
113 | 
114 | * **Annotated Bibliography Due**
115 | 
116 | ### Week 9
117 | 
118 | * The Spectacle of Race, Part I
119 | * Clifton Crais and Pamela Scully, _Sara Baartman and the Hottentot Venus_, Introduction, Chapters 3 and 4
120 | * Tressie McMillan Cottom, "[When Your (Brown) Body is a (White) Wonderland](http://tressiemc.com/2013/08/27/when-your-brown-body-is-a-white-wonderland/)"
121 | 
122 | 
123 | * The Spectacle of Race, Part II
124 | * Clifton Crais and Pamela Scully, _Sara Baartman and the Hottentot Venus_, Chapter 6
125 | * Cleuci de Oliveira, "[Saartjie Baartman: The Original Booty Queen](http://jezebel.com/saartje-baartman-the-original-booty-queen-1658569879)"
126 | * Pia Glenn, "[You Can’t Ignore the Degradation of Saartjie Baartman to Connect Her to Kim Kardashian. You Just Can’t](http://www.xojane.com/issues/saartjie-baartman-kim-kardashian)"
127 | * Danielle Bowler, "[Saartjie Baartman is not ‘The Original Booty Queen’](http://ewn.co.za/2014/11/17/OPINION-Danielle-Bowler-Saartjie-Baartman-is-not-the-original-booty-queen)"
128 | * [Classifiers](/classifiers.md) in this book
129 | 
130 | ### Week 10
131 | 
132 | * Politics, National Identity and Scandal
133 | * Michael Burns, _France and the Dreyfus Affair_, selections
134 | * [Sentiment Analysis](/sentiment-analysis.md) in this book
135 | 
136 | 
137 | * Scandals and Contemporary Media
138 | * Anita Sarkeesian Interview: "[The word “troll” feels too childish.
This is abuse](http://www.theguardian.com/technology/2015/aug/29/anita-sarkeesian-gamergate-interview-jessica-valenti)"
139 | * [NSA Files Decoded](http://www.theguardian.com/world/interactive/2013/nov/01/snowden-nsa-files-surveillance-revelations-decoded)
140 | * Adam Kirsch, "[Technology is Taking Over English Departments: The False Promise of the Digital Humanities](https://newrepublic.com/article/117428/limits-digital-humanities-adam-kirsch)"
141 | 
142 | 
143 | * **Draft of Final Project Due**
144 | 
145 | ### Week 11
146 | 
147 | * No class, meetings with professors about final projects
148 | 
149 | * Crime, Scandal and Politics in the Present Day
150 | * Julia Angwin, Jeff Larson, Surya Mattu and Lauren Kirchner, "[Machine Bias: There’s software used across the country to predict future criminals. And it’s biased against blacks](https://www.propublica.org/article/machine-bias-risk-assessments-in-criminal-sentencing)"
151 | * Matt Bai, "[How Gary Hart’s Downfall Forever Changed American Politics](http://www.nytimes.com/2014/09/21/magazine/how-gary-harts-downfall-forever-changed-american-politics.html)"
152 | * [Conclusion](/conclusion.md) in this book
153 | 
154 | 
155 | ### Week 12
156 | 
157 | * Class Presentations
158 | 
159 | * Wrap-Up and Class Presentations
160 | 
161 | ### Exam Week
162 | 
163 | * **Final Project and Process Paper Due**
164 | 
-------------------------------------------------------------------------------- /issues-in-digital-text-analysis.md: --------------------------------------------------------------------------------
1 | # Issues in Digital Text Analysis
2 | 
3 | * [Why Read with a Computer](/issues/why-read-with-a-computer.md)
4 | 
5 | * [Google NGram Viewer](/issues/google-ngram.md)
6 | 
7 | * [Exercises](/issues/exercises.md) 
-------------------------------------------------------------------------------- /issues/exercises.md: --------------------------------------------------------------------------------
1 | # Exercises
2 | 
3 | 1. What is your own background with computers? Interpret this question as broadly as you'd like.
4 | 2. Take a few minutes to reflect on how you read and what happens when you are reading. Then describe your process in 3-5 sentences.
5 | 3. What kinds of technologies do you use for reading? Do you feel that your use of different technologies and different ways of reading \(reading a physical book versus reading something on your computer versus reading it on an e-reader\) changes your experience of reading?
6 | 4. How do you imagine that reading was different in the nineteenth century than it is today?
7 | 5. Use the Google NGram Viewer to track two different terms that interest you. Interpret the results. What seems interesting? What kind of historical events might account for any shifts that you see? What terms rise or fall and when? Feel free to search on Wikipedia or Google to back up your interpretations, but don't overdo it. Keep your responses to 3 sentences per term. Provide a screenshot for the NGram search you carried out.
8 | 
9 | 
-------------------------------------------------------------------------------- /issues/why-read-with-a-computer.md: --------------------------------------------------------------------------------
1 | # Why Read with a Computer?
2 | 
3 | How are you reading this book right now? Even though you could have printed these pages out, the odds are fairly good that you are reading it on a screen.
Digital reading is a practice that most of us take part in every day \(or even every waking hour of every day\), and many of us probably don't think very much about the computers that facilitate the process.
4 | 
5 | Some people, of course, do: with the vast increase in digital reading over the past decade, a number of think pieces have come out describing the negative consequences of reading on a computer. [Some](http://www.businessinsider.com/why-its-bad-to-use-your-phone-before-bed-2015-7) suggest that reading on a screen before going to bed can make it difficult to relax and fall asleep. [Others](http://mashable.com/2013/01/16/e-books-vs-print/#ODRLdijcJPqA) sigh wistfully, remembering the tangible, physical nature of books that gets lost when text is translated to pixels on a screen. [Some](http://psychminds.com/is-the-internet-destroying-our-attentions-span/) might argue that your attention span is fundamentally changed by the kind of reading you do. Or that your ownership over what you read is in danger when your books are electronic: [Apple](https://9to5mac.com/2016/05/13/apple-officially-acknowledges-reports-of-personal-music-files-being-deleted-itunes-update-coming-next-week-to-hopefully-fix-the-bug/) or [Amazon](http://www.nytimes.com/2009/07/18/technology/companies/18amazon.html) might delete a book you have bought from your device without your permission, but it seems far less likely that Penguin will break into your home to retroactively burn a physical copy of a book they have decided you shouldn't own.
6 | 
7 | On the other hand, digital reading is often just so much more convenient. Online newspapers mean a lot less recycling. Ebooks are often cheaper than physical copies and are delivered to us in a matter of seconds -- no waiting for your books to arrive in the mail and no need for a trip to the library or bookstore!
8 | 
9 | Regardless of how you fall in these debates, it is important to recognize that a change in format necessarily changes the reading experience. We can debate the positive or negative effects of electronic reading endlessly, but we should recognize an underlying truth: you interact with a text in different ways in print than on a screen. The same is true with any medium: in a film, you are processing images and sound; in a book you are dealing with text layout and language; with recorded music you are dealing almost exclusively with sound. The technologies that carry these texts, films, and sounds to us affect our understanding of them. The scholar Marshall McLuhan [put it succinctly](https://en.wikipedia.org/wiki/The_medium_is_the_message):
10 | 
11 | > The medium is the message.
12 | 
13 | The technologies that transmit a message -- its text in this case -- fundamentally alter the meaning and experience of the work. And we can think about a message in richer ways by studying the materials that convey it.
14 | 
15 | This is a book about reading and technology, but not quite in the same way as described above. Rather than reading _with_ technology, we are going to discuss how we might read _through_ technology. It's not just that we now have access to books, newspapers and magazines online, it's also that we have access to **so much more**: all of the books on [Project Gutenberg](https://www.gutenberg.org/), newspaper articles from two hundred years ago, or all the blog posts that couldn't be written before the invention of the internet.
16 | 
17 | This new access to material can be overwhelming, and one of the questions of this course is how computers can help us deal with information overload. And furthermore, how can we harness technology to ask new questions of texts? For instance, let's say you wanted to know the number of times Arthur Conan Doyle used the term "Watson" in _The Adventures of Sherlock Holmes_ or wanted to know what the most common word was in this short-story collection. This would be a very tedious task if you just had the hard copy of the book, but it is one you can do in seconds with computer-based text analysis programs. Likewise, these same tools can help us find patterns in texts that we might not be aware of, or allow us to collaborate with others in the reading of texts.
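To make that concrete, here is a small Python sketch of the kind of counting such programs do. It is only a sketch: the filename is a placeholder for wherever you have saved a plain-text copy of the collection, and it ignores subtleties of tokenization that later chapters take up.

```
from collections import Counter
import re

# Read the full text of the collection from a plain-text file.
# (The filename is made up for this example.)
with open("sherlock.txt", encoding="utf-8") as f:
    text = f.read().lower()

# Break the text into lowercase words, dropping punctuation.
words = re.findall(r"[a-z']+", text)

# Counting every word takes a fraction of a second.
counts = Counter(words)
print(counts["watson"])        # how many times "watson" appears
print(counts.most_common(10))  # the ten most common words
```

The most common words will mostly be little function words like "the" and "and," which is part of why later chapters spend so much time on **stopwords**.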
18 | 
19 | More specifically, we will talk a lot about the process by which we interpret texts, by which we translate ink on a page into meaning in our minds, and about how computers can tamper with and augment that process. We will touch on a number of topics and issues:
20 | 
21 | * How can computers help us understand traditional reading processes in new ways?
22 | * How can we find new ways of reading through technology?
23 | * How can machines facilitate new types of collaborative reading?
24 | * How can we use computers to understand complicated categories like emotions and themes?
25 | 
26 | The implicit claim in these bullet points is that computers affect the reading process positively, but we will also give careful consideration to the wide-ranging and compelling arguments that this is not always the case.
27 | 
28 | * How does computer-assisted interpretation undermine the very point of reading?
29 | * Do these techniques show us anything new, or are they all fancy ways to describe what we already know?
30 | * How does reading with technology exacerbate racial, social, and economic inequalities?
31 | 
32 | You will have to decide for yourself the answers to these questions over the course of the book.
33 | 
34 | Confused? Good. That means you're learning.
35 | 
36 | 
-------------------------------------------------------------------------------- /reading-at-scale.md: --------------------------------------------------------------------------------
1 | # Reading at Scale
2 | 
3 | * [Distant Reading](/reading-at-scale/distant-reading.md)
4 | 
5 | * [Voyant Part Two](/reading-at-scale/voyant-part-two.md)
6 | 
7 | * [Exercises](/reading-at-scale/exercises.md) 
-------------------------------------------------------------------------------- /reading-at-scale/distant-reading.md: --------------------------------------------------------------------------------
1 | # Distant Reading
2 | 
3 | When Brandon was entering graduate school, an older student once summed up one of life's problems as a sort of equation:
4 | 
5 | > There is an infinite amount of material that one could read.
6 | 
7 | > There is a finite amount of time that you can spend reading.
8 | 
9 | The lesson was that there are limits to the amount of material that even the most voracious reader can take in. One's eyes can only move so quickly; one's mind can only process so much. This might sound depressing, as if you're playing a losing game. But it can also be freeing: if you cannot read everything, why feel the need to try to do so? Instead, read what you can with care.
10 | 
11 | Computers flip the problem: for them, the issue is not so much the quantity of reading as the quality.
As we have discussed before, computers cannot read with any particular nuance or understanding of what they are ingesting. Instead, technology might be best suited for helping us read at scale. Critics like [Franco Moretti](http://www.versobooks.com/books/1421-distant-reading) refer to this kind of analysis, when we use technology to get a bird's eye view of a corpus, as **distant reading**. If close reading, which we talked about earlier, gives careful attention to every word in a text, distant reading assumes that we can get new insight from thinking more broadly, by using computers to take in more texts than would otherwise be possible. Thus, we might have a computer give us schematic representations of thousands or even hundreds of thousands of texts. In the last chapter, we worked with stopwords and frequency analyses. We were mostly interested in the number of times that particular words appeared over the course of our corpus. Computers are especially good at reading for things just like this. On our own, we would never be able to read all 19th century British novels. But computers can help us to at least get *some* sense of this great body of work. Reading at such a great scale can also offer us a chance to chip away at what Margaret Cohen has called the ["great unread"](http://press.princeton.edu/titles/6645.html), all that writing that has gone unnoticed because it never became part of the literary canon.
12 | 
13 | It might appear as though distant reading is less critical: after all, you could theoretically construct a beautiful program to analyze thousands of books for you without you having to ever crack open a single one of them. And some people do. As Matt Jockers, Ryan Cordell, and others have [argued](http://ryancordell.org/research/scale-as-deformance/), however, even reading at this macro level requires attention to micro detail. Those same skills you were practicing with close reading earlier in the book? They are still deeply relevant. The work only begins once you have some results and a graph. You then have to figure out what elements are meaningful and what they might indicate. And the exploration very often takes you back to particular parts of the corpus that you want to read in more detail.
14 | 
15 | ## Patterns
16 | 
17 | One way to begin thinking about developing approaches to using tools and methods like these is to take a step back. When looking at the results of distant reading, you are, more than anything else, looking for patterns and outliers. You could ask yourself a number of questions when looking at the results of tools like Voyant:
18 | 
19 | * Does anything clearly not belong or not make sense?
20 | * What surprises you?
21 | 
22 | If you know your text is about the American South, and you find that the fourth most common token is 'France,' that probably says something interesting. You might need to revise your expectations and your research questions, and that is perfectly fine. This is actually part of the research process; if you don't revise your analysis, that means you aren't responding to what you are encountering in your sources. The most interesting thing about a project is rarely the first thing we think it will be.
23 | 
24 | * How do the numbers that the tool spits out at you connect with underlying concepts in the text?
25 | 
26 | Our reading experience, our interpretations of a text, the way it makes us feel: these are the result of many things, but language plays a role in constructing all of them.
Words form the basis for everything we get out of reading, so we can work backwards from word to concept. Think about what underlying concepts might be taking shape as a result of particular words. For example, if four of the top five words in a text are male names or male pronouns, that might say something about gender representation in the text. Personal pronouns might say something about what it means to be a self in your text. Four times more exclamation points than periods? That might say something about the rhetorical impression the author wants to convey.
27 | 
28 | * What trends do you see in the data?
29 | * Is anything clearly decreasing or increasing over time?
30 | * Are things largely the same over time?
31 | 
32 | If you have a corpus where the dates for each text are known, you can begin to draw inferences based on language use over time. The Google NGram tool is built on such assumptions, though you should take care to think about how changes in language itself might affect your results \(the classic example of this is the [long s](https://en.wikipedia.org/wiki/Long_s), which computers frequently read as an 'f' in older texts\). The trends you see can offer good opportunities to reflect on your own understanding of what happens historically over the same time period. Alternatively, since we experience individual texts over time, we can examine how the use of a concept or word changes from the beginning of a text to the end. All of this might offer a way into thinking about the text as a whole.
33 | 
34 | * Does something just look plain wrong?
35 | 
36 | It is easy to think that the results the computer gives you are correct, and to take them at their word. After all, how could numbers lie? The truth is, however, that any data is the result of the biases of the people who produced it. Seemingly good statistics can make anything seem like objective truth when there might not be anything more than a pretty picture:
37 | 
38 | ![bad statistics make a dinosaur](/assets/reading-at-scale/distant-reading-dinosaur.jpg)
39 | 
40 | And a flashy visualization can just as easily show nothing.
41 | 
42 | ![bad visualization](/assets/reading-at-scale/distant-reading-graphs.jpg)
43 | 
44 | Your own results might be the result of some setting that you have configured just slightly incorrectly. Or maybe you uploaded the wrong text. Or maybe you are misunderstanding how the tool works in the first place. If something has you scratching your head, take a step back and see if there is something wrong with your setup.
45 | 
46 | > But wait, you say, I don't know enough about X to be able to do this kind of work!
47 | 
48 | You're fine! You don't need to know anything about statistics or computer science in order to be able to say something meaningful about texts through distant reading. Knowledge of both of these fields can go a long way and give you more meaningful and interesting things to say, but these tools, methods, and ideas should not be beyond anyone. Take a tool out for a spin and see what happens. You can always learn more about these fields to help give your analysis a stronger foundation, but it will all be for nothing if you don't even try because of such anxieties. Play first. Then enrich your work with further study.
49 | 
50 | You cannot read everything. Instead, focus on what humans are good at: reading with care and offering interpretations. The computer can work with big numbers much more quickly than you can. Your job is to help it do so in a meaningful way.
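If you want a taste of how simple the computer's half of this partnership can be, here is a short Python sketch of the kinds of surface features mentioned above: pronoun counts and the balance of exclamation points to periods. The filename is a placeholder, and a real project would want a more careful tokenizer.

```
import re

# Load any plain-text novel; the filename is just a placeholder.
with open("novel.txt", encoding="utf-8") as f:
    text = f.read().lower()

words = re.findall(r"[a-z']+", text)

# Compare gendered pronouns as a rough cue about representation.
male = sum(words.count(w) for w in ("he", "him", "his"))
female = sum(words.count(w) for w in ("she", "her", "hers"))
print("male pronouns:", male, "female pronouns:", female)

# Compare sentence-ending punctuation as a rough rhetorical cue.
print("exclamation points per period:", text.count("!") / text.count("."))
```

Numbers like these mean nothing on their own; the interpretive work of connecting them back to concepts in the text is still yours.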
51 | 
52 | ## Further Reading
53 | 
54 | * Ryan Cordell provides a helpful examination of the interconnectedness of close and distant reading in "[Scale as Deformance](http://ryancordell.org/research/scale-as-deformance/)"
55 | 
56 | In addition, the following resources offer great introductions to distant reading:
57 | * Franco Moretti, *[Graphs, Maps, Trees: Abstract Models for Literary History](https://www.amazon.com/Graphs-Maps-Trees-Abstract-Literary/dp/1844671852)*.
58 | * Margaret Cohen, *[The Sentimental Education of the Novel](http://press.princeton.edu/titles/6645.html)*.
59 | * Matt Jockers, *[Macroanalysis](http://www.press.uillinois.edu/books/catalog/88wba3wn9780252037528.html)*.
60 | 
-------------------------------------------------------------------------------- /reading-at-scale/exercises.md: --------------------------------------------------------------------------------
1 | # Exercises
2 | 
3 | Here are three projects that take distant reading approaches of various kinds:
4 | 
5 | 1. [Quantifying Kissinger](http://www.quantifyingkissinger.com/)
6 | 
7 | 2. [Viral Texts](http://viraltexts.org/)
8 | 
9 | 3. [Syuzhet Part One](http://www.matthewjockers.net/2015/02/02/syuzhet/), [Syuzhet Part Two](http://www.matthewjockers.net/2015/02/25/the-rest-of-the-story/)
10 | 
11 | 
12 | Select one of the projects and familiarize yourself with it. Answer the following questions:
13 | 
14 | * What is their object of study? What is their corpus?
15 | * What research questions are they interested in answering?
16 | * What methodologies do they use?
17 | * Select one visualization from the project. Screenshot it, and explain what is going on in the image.
18 | * What do you think about the project? What are some questions that you have about it? What interests you about it?
19 | 
20 | These projects may incorporate methodologies, tools, or programming languages that we have not covered in this book. Don't worry! You are not expected to understand everything.
21 | 
22 | 
-------------------------------------------------------------------------------- /reading-at-scale/voyant-part-two.md: --------------------------------------------------------------------------------
1 | # Voyant Part Two
2 | 
3 | Let's look at [Voyant](https://voyant-tools.org) in a bit more detail. Feel free to [download the Sweeney Todd dataset](/assets/the-string-of-pearls-full.txt) and use it to produce the same results and follow along, or upload your own texts using the window provided. Look back at the word cloud that Voyant gave us for _The String of Pearls_:
4 | 
5 | ![voyant default wordcloud of string of pearls](/assets/reading-at-scale/voyant-word-cloud-default.jpg)
6 | 
7 | Using the standard stopword filter in Voyant, the most common word by far is 'said.' Taken alone, that might not mean an awful lot to you. But it implies a range of conversations: people speaking to each other, people speaking about different things. One of the limitations of frequency-based measurements like these is that they only show you a very high-level view of the text. Once you find an interesting observation, such as 'said' being one of the most frequent words in the text, you might want to drill down more deeply to see particular uses of the word. Voyant can help you do just that by providing a number of context-driven tools.
8 | 
9 | In the bottom-right pane, Voyant provides a series of options for examining the contexts around a particular word. The first one is a **keyword in context \(KWIC\)** interface, Voyant's representation of one of the most common concordance tools. You can change the word being examined by selecting a new word from the 'Reader' pane. By adjusting the context slider, you can modify exactly how much context \(i.e., how many words\) you see around the instances of the word you are examining. Tools like these can be helpful for interpreting the more quantitative results that the tool provides you. 670 instances of 'said' might not mean an awful lot, and the contexts pane can help you to understand how this word is being used. In this case, it can be useful for seeing different conversations: frequently, 'said' followed by a name indicates dialogue from a particular character.
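A concordance like this is not magic under the hood. Here is a rough Python sketch of a KWIC display, assuming a `words` list of the text's tokens like the ones in earlier sketches; Voyant's own implementation is, of course, more sophisticated.

```
def kwic(words, keyword, context=5):
    """Print each occurrence of keyword with `context` words on either side."""
    for i, word in enumerate(words):
        if word == keyword:
            left = " ".join(words[max(0, i - context):i])
            right = " ".join(words[i + 1:i + 1 + context])
            print(left, "[" + word + "]", right)

# With The String of Pearls loaded and split into words, you might call:
# kwic(words, "said", context=5)
```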
10 | 
11 | ![voyant contexts](/assets/reading-at-scale/voyant-contexts.jpg)
12 | 
13 | In this list of the first ten uses of 'said', two of them are closely joined with a name: 'Sweeney Todd.' If we look back at the word cloud for the text, we can see that these two words also occur with high frequency in the text itself. Given this information, we might become interested in a series of related questions:
14 | 
15 | * How often is Sweeney Todd talking?
16 | * What is he talking about?
17 | * Who is he talking to?
18 | 
19 | As we move away from particular words towards clusters of phrases in contexts, we also need a new vocabulary to represent those relationships. You may have heard of **n-grams** from [the Google Ngram Viewer](https://books.google.com/ngrams), which allows users to search large corpora for specified words or phrases. An n-gram is a sequence of consecutive words of a particular length: the 'n' is a stand-in for the specified length of the phrase. So take the following sentence:
20 | 
21 | "This is a sentence to illustrate ngrams."
22 | 
23 | "To illustrate" is an n-gram of length two, while "is a sentence" is an n-gram of length three. We use a convenient shorthand for referring to n-grams of these lengths: **bigrams** and **trigrams**.
24 | 
25 | **Collocations** are words that tend to occur together in meaningful patterns: so 'good night' is a collocation because it is part of a recognized combination of words whose meaning changes when put together. 'A night,' on the other hand, is not a collocation because the words do not form a new unit of meaning in the same way. We can think of collocations as bigrams that occur with such frequency that the combination itself is meaningful in some way.
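Extracting n-grams takes only a few lines of code. Here is a sketch; counting which bigrams recur unusually often is a crude first pass at finding collocations.

```
from collections import Counter

def ngrams(words, n):
    """Return every run of n consecutive words as a tuple."""
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

words = "this is a sentence to illustrate ngrams".split()
print(ngrams(words, 2)[:3])  # [('this', 'is'), ('is', 'a'), ('a', 'sentence')]

# Over a whole novel, counting bigrams surfaces repeated pairings:
# Counter(ngrams(novel_words, 2)).most_common(10)
```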
26 | 
27 | ![voyant collocates](/assets/reading-at-scale/voyant-collocates.jpg)
28 | 
29 | In a similar way, Voyant allows you to see the phrases and words that occur next to each other in the text on a regular basis. To access the 'Collocates' tool, you might need to hover over the question mark in a pane, and select the grid option to add a tool pane. From this menu, select 'grid tools' and 'collocates.' The 'context' slider allows you to find sentences where two words occur near each other. So setting a context of three for 'sweeney' and 'todd' will give you all the three-word phrases in which those two words occur: they do not need to be contiguous. In this case "Sweeney Todd said" would match, as would "Sweeney said Todd." Each row tells you how often those words appear within a certain distance from each other.
30 | 
31 | Looking at this data on collocations can lead to interpretive questions you might want to pursue. For one, a lot of the collocations are men's names \(Sweeney Todd, Mr. Oakley, Mr. Lupin, Mr. Fogg, etc.\), whereas no female characters appear in the list of most frequent collocations. What might that tell you about the nature of the text and the way the author was shaping it to appeal to an intended male audience? Or, consider the very high number of collocations for "Sweeney Todd" -- more than twice the next highest collocation. At one level, this isn't terribly surprising. Sweeney Todd is the main character, after all. At another level, you might find it odd that his name was repeated so frequently in the text: how often did the readers need to be reminded of what his full name was? But maybe this gives you some insight into the nature of the work as a serial novel. If readers were consuming the work in weekly installments over the course of many months \(while they were probably reading other serial novels\), they may indeed have needed reminders about the main character's name \(as well as other essential plot points\). These are all potential avenues of interpretation; to make any of these arguments, you'd need to gather additional evidence -- but at least Voyant gives you places to start.
32 | 
33 | Click on the row that lists 'said sweeney 52'. Many of the windows in Voyant are interactive, and selecting something will modify the visualizations and options available to you elsewhere in the tool. Selecting a row here will modify a line graph that shares a space with the collocates table. You'll need to select 'Trends' at the top of the pane in order to see the line graph.
34 | 
35 | When you do, you will see a graph of the selected collocation over time. 'Sweeney' and 'said' occur within a space of three words in highly variable amounts over the course of the text. By looking at the graph, we can get a rough idea of when Sweeney Todd speaks over the course of the narrative.
36 | 
37 | ![graph of sweeney said](/assets/reading-at-scale/sweeney-said.jpg)
38 | 
39 | To graph things, Voyant breaks up your document into a number of segments \(you can change how many it uses\). Within each piece of the text it calculates how often the selected phrase or word appears. In this case, we might say Sweeney Todd talks significantly more in the first 70% of the text than he does in the last portion. Since you read the last few chapters of the novel, you might have a sense of why this is. The end of the text deals primarily with revelations about Todd's actions rather than with the actions themselves. Of course, you wouldn't know this if you hadn't read portions of the text, a good example of how "distant reading" and regular, old-fashioned reading can and should enrich each other.
40 | 
41 | The 'Trends' pane can be quite handy, and it will allow you to see individual words or phrases as they rise and fall over the course of a corpus. Think of it as the next step in critically analyzing a concordance. After all, texts occur in sequence, and we can learn a lot from examining the locations in which significant words tend to cluster. You can think of this graph feature as roughly charting the time of the narrative and as helping you think about the order in which words occur.
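The idea behind those trend lines is easy to sketch: cut the text into equal segments and count the term in each one. A rough Python version follows \(again assuming a `words` list; Voyant adds refinements like relative frequencies and smoothing\). The sample output is invented for illustration.

```
def trend(words, term, segments=10):
    """Count occurrences of term in each of `segments` equal slices of the text."""
    size = len(words) // segments
    return [words[i * size:(i + 1) * size].count(term) for i in range(segments)]

# A hypothetical result for trend(words, "said") might look like
# [70, 85, 64, 91, 77, 80, 69, 40, 22, 18]: a crude picture of
# dialogue thinning out in the final stretch of the novel.
```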
42 | 
43 | Thinking about language as it unfolds over time in this way can offer new opportunities for analysis. It can also raise issues. In a single text like _The String of Pearls_, composed over a relatively brief period of time, we don't need to worry too much about changes in language. But very often we might be studying loads of texts published over the course of many years, decades, or centuries. We cannot assume that language means the same thing over corpora like these. Words change over time. Your data and interests will determine how important this caveat is for you. Think carefully about whether any significant political, social, or technological events during that time could inflect how language works in the texts you care about. History can enrich your work or deeply complicate it.
44 | 
45 | 
-------------------------------------------------------------------------------- /schedule.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walshbr/introduction-to-text-analysis/e66f3faf66e6ba8488c7446df187c04cfd3d2df2/schedule.md 
-------------------------------------------------------------------------------- /sentiment-analysis.md: --------------------------------------------------------------------------------
1 | # Sentiment Analysis
2 | 
3 | * [Sentiment Analysis](/sentiment-analysis/sentiment-analysis.md)
4 | 
5 | * [Prism for Sentiment Analysis](/sentiment-analysis/sentiment-analysis-in-action.md)
6 | 
7 | * [Exercises](/sentiment-analysis/exercises.md) 
-------------------------------------------------------------------------------- /sentiment-analysis/exercises.md: --------------------------------------------------------------------------------
1 | # Exercises
2 | 
3 | * Take the following text, and mark it on [our class Prism](http://prism.scholarslab.org/prisms/84f15902-686a-11e6-905c-005056b3784e/highlight?locale=en) (The Prism has a longer excerpt, not just what you see below):
4 | 
5 | > Anita Sarkeesian doesn’t give me the address of her San Francisco apartment over email. Instead, she texts it to me a few hours before we’re set to meet. After thousands of rape and death threats, a bomb scare and an email promising a mass shooting at one of her speaking events, a woman can’t be too careful. For some male gaming aficionados, the most frightening enemy isn’t an animated foe but this 31-year-old feminist with a penchant for hoop earrings, sitting across from me. They’ve called Sarkeesian a con artist, and raised thousands of dollars to film an exposé-style documentary about her \(which exposes nothing\). Some even created a game in which users can punch an image of her face until it is bloodied.
6 | 
7 | 
8 | * Now take the same text and process it on the live demo for [Stanford's sentiment analysis software](http://nlp.stanford.edu:8080/sentiment/rntnDemo.html). \(Note how each sentence is its own tree. The demo color codes individual words as very negative, negative, neutral, positive or very positive\).
9 | 
10 | * What are the differences between how the emotions are marked in the Prism versus how they are marked in Stanford's tool?
11 | 
12 | * What does all this say about what humans are better at? Computers?
13 | 
14 | 
15 | 
-------------------------------------------------------------------------------- /sentiment-analysis/sentiment-analysis-in-action.md: --------------------------------------------------------------------------------
1 | # Sentiment Analysis in Action
2 | 
3 | To illustrate how sentiment analysis works, let's walk through a couple of different projects. We will do a fair amount of handwaving at technical details, but hopefully you will get a sense of the kind of work that goes into sentiment analysis projects.
4 | 
5 | ## Jockers and Syuzhet
6 | 
7 | Matt Jockers has been working on using [sentiment analysis to discover plot trajectories in fiction](http://www.matthewjockers.net/2015/02/02/syuzhet/) in just the same terms as the [video](https://www.youtube.com/embed/oP3c1h8v2ZQ) in the previous lesson (indeed, Jockers's writing is what pointed us to the Vonnegut clip in the first place!). By taking thousands of texts and classifying their sentences for sentiment, he has developed a software procedure for tracing plot trajectories and [suggested](http://www.matthewjockers.net/2015/02/25/the-rest-of-the-story/) that there are only six or seven different plot shapes based on this type of analysis. Jockers's bold claim has since come under serious critique by Joanna Swafford, who argues that the shapes are the results of configurations in Jockers's software rather than of any inherent quality in the text (also a recurrent theme throughout this book!).
8 | 
9 | Let's take a closer look at how Jockers is able to make such a claim. He uses a sophisticated software package that he constructed in the [R programming language](/conclusion/where-to-go.md). We won't get into the details of the code itself, but we can cover the general approach. For a more technical explanation, you can look at Jockers's "[Introduction to the Syuzhet Package](https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html)."
10 | 
11 | Jockers's project combines supervised classifiers and unsupervised classifiers. Remember: supervised classifiers rely on training data that tells the software how to interpret and classify data. Unsupervised classifiers are not based on any prior training data. Instead, they rely on underlying assumptions and algorithms to categorize texts \(in the case of topic modeling, this means that the unsupervised classifiers make assumptions about the relation between texts and statistics\). We will focus on the supervised portion of his work below.
12 | 
13 | So, first, Jockers needed training data. In order for his software to read sentiment in sentences, it needed example sentences that had already been marked for emotions. Providing the software with example sentences enables it to categorize related sentences in the future. So imagine that we train our computer with these sentences:
14 | 
15 | 1. "I am happy!", positive
16 | 2. "I am sad!", negative
17 | 
18 | Later, we fire up our classifier and ask it to mark a given text for sentiment. Imagine that the computer encounters sentence 1 again later in the text. The computer could look in its bank of knowledge and remember that it should be marked as positive. But this classifier would not work very well: it only knows the sentiment for two specific sentences. When it encounters new sentences that we haven't pre-marked, it would not know what to do. In practice, we want to train a classifier on as much data as possible to maximize its ability to handle new information. And we probably won't train it on full sentences. After all, computers distinguish between sentences and individual words in quite profound ways (we talked about this in "[How Computers Read](/cyborg-readers/computer-reading.md)"). Depending on how thorough we want to be, we might give the computer vocabulary and phrases marked for sentiment instead. Since working with numbers gives us more options for graphing things, we might use "1" and "-1" to represent sentences with positive and negative values. And rather than a binary positive/negative, we might mark for a continuum: numbers between -5 and 5, say. After all, 'good' is less positive than 'exuberant.' So each word or phrase gets converted into a series of positive and negative numbers.
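A toy version of this lexicon approach makes the idea concrete. The scores below are invented for illustration \(real lexicons contain thousands of human-tagged entries\), and Syuzhet itself is written in R and differs in its details; this is just a Python sketch of the general logic.

```
# A tiny, made-up sentiment lexicon: word -> score from -5 to 5.
lexicon = {"happy": 3, "exuberant": 5, "good": 2, "sad": -3, "miserable": -4}

def sentence_score(sentence):
    """Sum the scores of every lexicon word in the sentence."""
    words = sentence.lower().strip(".!?").split()
    return sum(lexicon.get(word, 0) for word in words)

text = ["I am happy!", "The weather was good.", "Then everything turned miserable and sad."]
print([sentence_score(s) for s in text])  # [3, 2, -7]
```

Score every sentence in a novel this way and you have a long series of numbers to plot, which is exactly the kind of output described next.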
20 | 
21 | You can find information on the training sets used by Jockers [here](https://github.com/mjockers/syuzhet#references). He uses a training lexicon of his own but gives the option to categorize sentiment using other training sets. Basically the software reads a text, looks at its memory of the training corpus to determine how positive or negative a sentence or word is, then converts the text into a series of values like this:
22 | ```
23 | 2.50 0.60 0.00 -0.25 0.00 0.00
24 | ```
25 | 
26 | Now the text is converted into a series of values that represent the sentiment of the text. As the numbers become negative or positive, we get a sense of how the classifier reads the emotions of the text. From there, it is just a matter of plotting numbers to get a better representation of the sentiment trends over time. In the end, we can get something like this graph for James Joyce's _A Portrait of the Artist as a Young Man_, taken from Jockers's explanation of the software:
27 | 
28 | ![plot trajectory in portrait](/assets/sentiment-analysis/jockers-portrait.jpg)
29 | 
30 | Just like we did with [Voyant](/reading-at-scale/voyant-part-two), Jockers breaks the novel into a number of segments and aggregates the sentiment for each section to get a sense of how the emotion changes over time. At its core, measuring sentiment computationally in this way relies on solid training data. The computer needs to learn how to map emotion-laden words and phrases onto some sort of numerical system. The robustness of your training set can strengthen or complicate your results. Getting a good training set can be difficult, however, since assembling one takes a great deal of time and labor. Notably, it is a lot of work to manually label single words with positive or negative valence. With a series of values like these for each text, Jockers then has a basis for comparison across his whole corpus. He can start to look for patterns in plot trajectories, which eventually leads to his claim that there are only a set number of plot arcs for novels.
31 | 
32 | ## EmojiSentiment
33 | 
34 | Another interesting use of sentiment analysis is [EmojiSentiment](http://www.emojisentiment.com/#About). The project approaches the problem from a different angle: rather than trying to analyze textual content for sentiment, the site postulates that emojis embedded in tweets might be a good predictor of their sentiment. There are only around 2000 emoji, and only a small subset have emotional valences to them; tagging these emojis is a lot easier than tagging all the words that convey emotion. The authors of this project use emojis to determine the overall sentiment for a particular hashtag, as opposed to any one tweet. The idea is that if you gather up all the emoji associated with a particular hashtag, you will get a pretty good sense of the emotional valence for that stream of conversation. For example, EmojiSentiment reads \#friday as being relatively positive:
35 | 
36 | ![friday sentiment](/assets/sentiment-analysis/emoji-sentiment-friday.jpg)
37 | 
38 | At first blush, this makes sense. Everyone gets excited for the weekend. But look more closely. The three most dominant emoji are fire, police sirens, and flowers, which might seem a bit unusual.
Note that we're only getting 32 emoji in the last 1000 tweets - these might just be artifacts of whatever is going on right now, and the project can't handle more volume than that \(it's a student project. Yay students!\). So our sample size is rather small and could easily be skewed by a handful of active users. We would want far more emoji to really get a good measurement of sentiment. If \#friday got a relatively happy average sentiment, \#angry is much more negative:
39 | 
40 | ![angry sentiment](/assets/sentiment-analysis/emoji-sentiment-angry.jpg)
41 | 
42 | Note how even \#angry just barely dips below 50 to be predominantly negative. We actually had to search around for a while to find a hashtag that read as predominantly negative. Could this be a function of language - are people just happier than they are sad? Certainly not. This might have to do with how we represent our emotions on social media; maybe we don't use emojis to represent negative emotions that often. Or maybe sentiment analysis by way of computation is imprecise at best. To improve on this project, we would want to scale it up to read vastly more data. We might then use the emoji a tad differently. Instead of using them to measure sentiment, you could use them to train the sentiment classifier further so that it refines itself over time! All of this would require far more funding than the EmojiSentiment team has, however, and the tool is a great provocation as it stands.
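The aggregation step is easy to imagine in code. Here is a minimal sketch with an invented three-emoji lexicon and made-up tweets \(the real site works from live Twitter data and a much larger emoji list\):

```
# Invented scores: emoji -> value between -1 (negative) and 1 (positive).
emoji_scores = {"😀": 0.9, "🚨": -0.7, "🔥": -0.3}

tweets = ["so ready for the weekend 😀😀", "traffic again 🚨", "this mixtape 🔥🔥🔥"]

# Pool every scored emoji in the hashtag's stream and average the values.
found = [emoji_scores[ch] for tweet in tweets for ch in tweet if ch in emoji_scores]
print(len(found), "emoji, average sentiment:", round(sum(found) / len(found), 2))
```

Notice that even the lexicon encodes judgment calls: deciding whether the fire emoji is angry or enthusiastic is exactly the kind of problem that makes sentiment analysis hard.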
* What might you imagine using sentiment analysis for?

* What kinds of texts lend themselves especially well to reading for emotion?

-------------------------------------------------------------------------------- /sentiment-analysis/sentiment-analysis.md: --------------------------------------------------------------------------------

# Sentiment Analysis

We began this book by talking about interpretation on a micro level: close reading asks you to pay attention to every small detail in a text to produce analysis. We have since zoomed out to think about what we could gain from macro reading and how computers enable us to understand texts in new ways. In our final moments, we will loop back around to the beginning.

We have repeatedly stressed the interplay of computation and interpretation: when the computer presents some results to you, your work has only begun. The computer can supply data, but you must interpret that data yourself. The computer does not really read. You do. What you've learned about is how to read _with_ computers.

But you have probably also noticed in the last few chapters that the kinds of readings we are using our computers for have become more sophisticated. When we use software to discover the topics a text is discussing or to identify anonymous authors, we are not quite having them read in the same way as a person would. But we are getting closer. These techniques aim to provide a richer sense of a text, and they do so in quite sophisticated ways. We will close with a somewhat simpler problem, but one that is profoundly difficult for computers: is a particular text happy or sad? For that matter, is a sentence? A word?

This type of analysis that tries to capture the emotional resonance of a text is called **sentiment analysis**. You've probably engaged with this kind of work without realizing it. If you've ever been to [Rotten Tomatoes](https://www.rottentomatoes.com/) to see what score a movie has gotten, you are looking at an aggregated number of reviews that have been marked as positive or negative. Businesses have a stake in such things as well. If you tweet about your recent flight, the airline would probably want to know whether you hated it or loved it. The former might result in you being directed to customer service, while the latter could result in a benign response like "thanks for flying with us!"

Sentiment analysis can also offer interesting opportunities for textual analysis. Check out Kurt Vonnegut's lecture clip on the shapes of stories, discussed in the Further Resources at the end of this chapter.

The idea makes enough sense as Vonnegut presents it: at certain times in a story, things are varying degrees of good or bad. As with any form of text analysis, this kind of information could be very useful for understanding a text.

* What kind of emotions does the author employ in the text? When?

* How do emotions map onto other aesthetic categories, like narrative structure?

It would be fascinating to have a computer that could easily mark the sentiments in texts for you. If you have been following dutifully along, however, you should know that computers can't do much of anything without being explicitly told how. They can do very little in the way of understanding data without a human to guide them. Trying to extract complicated information like the sentimental arc of a text, how we are meant to feel about a sentence, or how an author intended us to feel are all complicated tasks that computers have a difficult time with. In fact, they can be hard for two different people to agree on. Try to guess whether these two sentences would be classified as good or bad:

* "I am very happy."

* "She is so sad."

Those were easy ones: good and bad. Hot and cold. How about this one:

* "It was the best of times, it was the worst of times…"

This sentence is from Charles Dickens's _A Tale of Two Cities_ and is probably a bit hard to parse in such a binary way. If it is both good and bad, it probably comes out as neutral, right? But Dickens was talking about the era of the French Revolution here; his whole point was that this was an extraordinary time, hardly a "neutral" situation. In fact, he is interested in juxtaposing different things - best/worst, London/Paris, etc. - not in resolving them. We would probably need some system for determining what to do in such situations.
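To see the problem concretely, here is a deliberately naive Python scorer, assuming a toy two-word lexicon: "best" and "worst" cancel each other out, so the scorer can only shrug and call Dickens neutral.

```python
# A deliberately naive word-counting scorer. "best" (+1) and "worst" (-1)
# cancel, so the famous opening scores 0 -- "neutral" -- which misses
# Dickens's point entirely. The two-word lexicon is a toy assumption.
LEXICON = {"best": 1, "worst": -1}

def naive_score(sentence):
    words = sentence.lower().replace(",", "").split()
    return sum(LEXICON.get(word, 0) for word in words)

opening = "It was the best of times, it was the worst of times"
print(naive_score(opening))  # 0
```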
Try this sentence, by Jane Austen, which complicates matters even further:

* "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife."

An avid reader of Austen would know that her texts come loaded with satire. It is unlikely she actually means her words to be taken at face value. Virtually no truths actually are *universally* acknowledged to be true, so the sentence winks at the reader and should not be taken in full seriousness. In fact, much of her work is meant as a scathing criticism of the culture and people around her. These opinions are largely indirect, couched in irony and satire that asks the reader to read against what the text says on the surface.

All of these things are difficult to convey to readers, let alone computers. Sentiment analysis through technology is tricky, but that doesn't mean that researchers don't try. The process is difficult and riddled with error, but also intellectually interesting in a number of ways.

* How do we represent complicated abstract ideas like emotion in a way that computers can understand?

* What can sentiment analysis like this tell us about the objects that we study?

As with any form of text analysis, the potential uses range as widely as your imagination. One compelling recent [use of sentiment analysis](http://varianceexplained.org/r/trump-tweets/) by David Robinson sought to gauge the degree of control that Donald Trump's campaign had over his Twitter account. Knowing that Trump tended to use a Samsung Galaxy to tweet, Robinson wanted to determine whether tweets from different technologies might have different characteristics. If so, one could reasonably separate out his personal persona on Twitter from the one curated by his campaign staff. Robinson found that the angrier, more hyperbolic tweets came from a Samsung Galaxy (and were more likely to be by Trump himself). The tweets from iPhones were more likely to be "fairly benign declarations." With this knowledge, we could reasonably trace the thumbprint of the Trump campaign handlers as distinct from Trump himself.
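Robinson worked in R with real tweet archives, but the core move is easy to sketch: tag each tweet with the client it was sent from, score its sentiment, and compare the group averages. The device labels and sentiment scores below are made up for illustration, not Robinson's data.

```python
# Sketch of Robinson-style source comparison: group tweets by the client
# they were posted from and compare average sentiment. The (device, score)
# pairs are hypothetical; the real analysis used actual tweet metadata.
from collections import defaultdict

tweets = [("Samsung Galaxy", -2.5), ("Samsung Galaxy", -1.0),
          ("iPhone", 0.5), ("iPhone", 1.0),
          ("Samsung Galaxy", -3.0), ("iPhone", 0.0)]

by_device = defaultdict(list)
for device, score in tweets:
    by_device[device].append(score)

for device, scores in sorted(by_device.items()):
    print(device, sum(scores) / len(scores))
# Samsung Galaxy about -2.17 vs. iPhone 0.5: the kind of split that
# suggests two different authors behind one account
```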
Computers might not be able to feel, but perhaps we can train them to know what emotions look like. The very idea of measuring sentiment computationally is provocative. If we were working in big business, we would care a lot about the results of such projects. But, as humanists, we can also gain a lot just from trying to model such complicated topics. The process is as enlightening as the product.

## Further Resources

* "[The Universal Shapes of Stories, According to Kurt Vonnegut](http://io9.gizmodo.com/the-universal-shapes-of-stories-according-to-kurt-vonn-1526559996)" has a brief explanation of Vonnegut's relationship to the theory about plot trajectories.

* Maya Eilam has represented Vonnegut's theory about shapes in [a variety of infographics](http://www.mayaeilam.com/2012/01/01/the-shapes-of-stories-a-kurt-vonnegut-infographic/).

* Jockers has a series of posts on his sentiment analysis project that begins [here](http://www.matthewjockers.net/2015/02/02/syuzhet/). These posts were where we first read about the connection to the Vonnegut clip.

-------------------------------------------------------------------------------- /styles/ebook.css: --------------------------------------------------------------------------------

/* CSS for ebook */

-------------------------------------------------------------------------------- /styles/epub.css: --------------------------------------------------------------------------------

/* CSS for epub */

-------------------------------------------------------------------------------- /styles/mobi.css: --------------------------------------------------------------------------------

/* CSS for mobi */

-------------------------------------------------------------------------------- /styles/pdf.css: --------------------------------------------------------------------------------

/* CSS for pdf */

-------------------------------------------------------------------------------- /styles/print.css: --------------------------------------------------------------------------------

/* CSS for print */

-------------------------------------------------------------------------------- /styles/website.css: --------------------------------------------------------------------------------

/* CSS for website */
img{
    max-height:500px;
}

.img-right{
    float: right;
    padding-left: 13.6px;
}

.medium{
    max-width: 280px !important;
}

#ocr-image{
    max-width: 165.275px;
    padding-left:0px;
    margin-left: 13.6px;
}

#prism-licensing{
    max-width: 280px;
}

.clear{
    clear:both;
}

-------------------------------------------------------------------------------- /topic-modeling.md: --------------------------------------------------------------------------------

# Topic Modeling

* [Bags of Words](/topic-modeling/bags-of-words.md)

* [Topic Modeling Case Study](/topic-modeling/topic-modeling-case-study.md)

* [Exercises](/topic-modeling/exercises.md)

-------------------------------------------------------------------------------- /topic-modeling/exercises.md: --------------------------------------------------------------------------------

# Exercises

Read Robert Nelson's "Of Monsters, Men -- and Topic Modeling" from the _New York Times_. It's available [here](http://opinionator.blogs.nytimes.com/2011/05/29/of-monsters-men-and-topic-modeling/?_r=0) or, if you can't get past the paywall, you can search for the title with your favorite search engine; the resulting link should carry you to the article. You can find more information about the larger project and individual topics [here](https://dsl.richmond.edu/dispatch/Topics).

Answer the following questions about this project.

1. What kind of documents is Nelson working with?
2. What kind of assumptions or research questions does he have for these texts?
3. What general conclusions does he draw from topic modeling?
4. What conclusions could not be reached through close reading of these texts? In other words, what is the unique contribution of topic modeling to our understanding of the Civil War?
5. Look at the graphs/data for the other topics in the Richmond Dispatch. You can find the full list [here](https://dsl.richmond.edu/dispatch/Topics). What do _you_ notice?

--------------------------------------------------------------------------------