├── .gitignore ├── docs ├── _config.yml ├── start │ ├── inspect.png │ ├── html-options.png │ └── index.md ├── convert │ ├── indents.png │ ├── replace.png │ ├── find-in-project.png │ ├── find-in-project-dialog.png │ └── index.md ├── images │ ├── jupyter300.png │ ├── jupyter48.png │ ├── voyant300.png │ ├── voyant48.png │ ├── observable48.png │ └── observable300.png ├── visualize │ ├── iframe.png │ ├── plot.jpeg │ └── index.md ├── count │ ├── terms-columns.png │ └── index.md ├── setup │ ├── observable-login.png │ ├── jupyter-architecture.png │ ├── jupyter-architecture.graffle │ └── index.md ├── index.md ├── scrape │ └── index.md └── collocate │ └── index.md ├── ipynb ├── .gitignore ├── utilities │ ├── .DS_Store │ ├── .ipynb_checkpoints │ │ ├── Untitled-checkpoint.ipynb │ │ ├── My First Notebook-checkpoint.ipynb │ │ ├── SimpleSentimentAnalysis-checkpoint.ipynb │ │ └── Concordances-checkpoint.ipynb │ ├── Untitled.ipynb │ ├── SimpleSentimentAnalysis.ipynb │ └── Concordances.ipynb ├── experiments │ ├── .DS_Store │ ├── SmithImagery.png │ ├── SmithImageryFreqsByChapter.png │ └── SmithImageryWordList.txt ├── images │ ├── stop-server.png │ ├── access-texts.png │ ├── folder-rename.png │ ├── logo_anaconda.png │ ├── markdown-cell.png │ ├── new-notebook.png │ ├── notebook-launch.png │ ├── notebook-ui-tour.png │ ├── rename-notebook.png │ ├── anaconda-download.png │ ├── anaconda-launcher.png │ ├── cosine-similarity.png │ ├── hello-world-error.png │ ├── new-notebook-header.png │ ├── nltk-data-download.png │ ├── hello-world-markdown.png │ ├── anaconda-launcher-menu-2.png │ ├── anaconda-launcher-menu.png │ ├── hello-world-dynamic-time.png │ ├── hello-world-first-code.png │ ├── terminal-ipython-install.png │ ├── ipython-notebook-root-tree.png │ ├── characteristic-curve-mendenhall.png │ └── network-graph-students-schools.png ├── HelloWorld.ipynb ├── Useful Resources.ipynb ├── ArtOfLiteraryTextAnalysis.ipynb ├── Nltk.ipynb ├── GettingSetup.ipynb ├── Converting.ipynb ├── Glossary.ipynb └── GettingNltk.ipynb ├── README.md ├── spiral └── CharacteristicCurve.json └── assets └── css └── style.scss /.gitignore: -------------------------------------------------------------------------------- 1 | /.project 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /ipynb/.gitignore: -------------------------------------------------------------------------------- 1 | /.ipynb_checkpoints/ 2 | /.DS_Store 3 | -------------------------------------------------------------------------------- /docs/start/inspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/start/inspect.png -------------------------------------------------------------------------------- /docs/convert/indents.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/convert/indents.png -------------------------------------------------------------------------------- /docs/convert/replace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/convert/replace.png -------------------------------------------------------------------------------- 
/docs/images/jupyter300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/jupyter300.png -------------------------------------------------------------------------------- /docs/images/jupyter48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/jupyter48.png -------------------------------------------------------------------------------- /docs/images/voyant300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/voyant300.png -------------------------------------------------------------------------------- /docs/images/voyant48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/voyant48.png -------------------------------------------------------------------------------- /docs/visualize/iframe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/visualize/iframe.png -------------------------------------------------------------------------------- /docs/visualize/plot.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/visualize/plot.jpeg -------------------------------------------------------------------------------- /ipynb/utilities/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/utilities/.DS_Store -------------------------------------------------------------------------------- /docs/count/terms-columns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/count/terms-columns.png -------------------------------------------------------------------------------- /docs/images/observable48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/observable48.png -------------------------------------------------------------------------------- /docs/start/html-options.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/start/html-options.png -------------------------------------------------------------------------------- /ipynb/experiments/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/experiments/.DS_Store -------------------------------------------------------------------------------- /ipynb/images/stop-server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/stop-server.png -------------------------------------------------------------------------------- /docs/images/observable300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/observable300.png -------------------------------------------------------------------------------- 
/docs/setup/observable-login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/setup/observable-login.png -------------------------------------------------------------------------------- /ipynb/images/access-texts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/access-texts.png -------------------------------------------------------------------------------- /ipynb/images/folder-rename.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/folder-rename.png -------------------------------------------------------------------------------- /ipynb/images/logo_anaconda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/logo_anaconda.png -------------------------------------------------------------------------------- /ipynb/images/markdown-cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/markdown-cell.png -------------------------------------------------------------------------------- /ipynb/images/new-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/new-notebook.png -------------------------------------------------------------------------------- /docs/convert/find-in-project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/convert/find-in-project.png -------------------------------------------------------------------------------- /ipynb/images/notebook-launch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/notebook-launch.png -------------------------------------------------------------------------------- /ipynb/images/notebook-ui-tour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/notebook-ui-tour.png -------------------------------------------------------------------------------- /ipynb/images/rename-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/rename-notebook.png -------------------------------------------------------------------------------- /docs/setup/jupyter-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/setup/jupyter-architecture.png -------------------------------------------------------------------------------- /ipynb/experiments/SmithImagery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/experiments/SmithImagery.png -------------------------------------------------------------------------------- /ipynb/images/anaconda-download.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/anaconda-download.png -------------------------------------------------------------------------------- /ipynb/images/anaconda-launcher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/anaconda-launcher.png -------------------------------------------------------------------------------- /ipynb/images/cosine-similarity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/cosine-similarity.png -------------------------------------------------------------------------------- /ipynb/images/hello-world-error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/hello-world-error.png -------------------------------------------------------------------------------- /ipynb/images/new-notebook-header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/new-notebook-header.png -------------------------------------------------------------------------------- /ipynb/images/nltk-data-download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/nltk-data-download.png -------------------------------------------------------------------------------- /ipynb/images/hello-world-markdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/hello-world-markdown.png -------------------------------------------------------------------------------- /docs/convert/find-in-project-dialog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/convert/find-in-project-dialog.png -------------------------------------------------------------------------------- /docs/setup/jupyter-architecture.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/setup/jupyter-architecture.graffle -------------------------------------------------------------------------------- /ipynb/images/anaconda-launcher-menu-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/anaconda-launcher-menu-2.png -------------------------------------------------------------------------------- /ipynb/images/anaconda-launcher-menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/anaconda-launcher-menu.png -------------------------------------------------------------------------------- /ipynb/images/hello-world-dynamic-time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/hello-world-dynamic-time.png -------------------------------------------------------------------------------- /ipynb/images/hello-world-first-code.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/hello-world-first-code.png -------------------------------------------------------------------------------- /ipynb/images/terminal-ipython-install.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/terminal-ipython-install.png -------------------------------------------------------------------------------- /ipynb/images/ipython-notebook-root-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/ipython-notebook-root-tree.png -------------------------------------------------------------------------------- /ipynb/experiments/SmithImageryFreqsByChapter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/experiments/SmithImageryFreqsByChapter.png -------------------------------------------------------------------------------- /ipynb/images/characteristic-curve-mendenhall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/characteristic-curve-mendenhall.png -------------------------------------------------------------------------------- /ipynb/images/network-graph-students-schools.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/network-graph-students-schools.png -------------------------------------------------------------------------------- /ipynb/utilities/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /ipynb/utilities/.ipynb_checkpoints/My First Notebook-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The Art of Literary Text Analysis 2 | ==== 3 | 4 | Please see the [Juypter (python) version](https://github.com/sgsinclair/alta/blob/master/ipynb/ArtOfLiteraryTextAnalysis.ipynb). 5 | -------------------------------------------------------------------------------- /ipynb/utilities/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Gathering web pages\n", 8 | "\n", 9 | "This utility script is for gathering the text of a collection of web sites. It assumes you have a CSV with a list of URLs and it adds the results of the gathering back into the CSV." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Opening the CSV\n", 17 | "\n", 18 | "This opens a CSV and extracts the URLs putting them into a list. 
Alternatively you can use a " 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Getting the HTML\n", 26 | "\n", 27 | "This function gets the HTML given a URL." 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Cleaning the HTML\n", 35 | "\n", 36 | "This function cleans the HTML " 37 | ] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "Python 3", 43 | "language": "python", 44 | "name": "python3" 45 | }, 46 | "language_info": { 47 | "codemirror_mode": { 48 | "name": "ipython", 49 | "version": 3 50 | }, 51 | "file_extension": ".py", 52 | "mimetype": "text/x-python", 53 | "name": "python", 54 | "nbconvert_exporter": "python", 55 | "pygments_lexer": "ipython3", 56 | "version": "3.5.1" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 0 61 | } 62 | -------------------------------------------------------------------------------- /docs/visualize/index.md: -------------------------------------------------------------------------------- 1 | # Visualizing with the Art of Literary Text Mining 2 | 3 | ## Visualizing with Voyant 4 | 5 | ![Voyant](../images/voyant48.png) Voyant is in large part about visualization so we won't spend too much time with it here except to refer to a couple of tools that are perhaps less on the beaten path: 6 | 7 | 1. Bubbles 8 | 1. TextArc 9 | 10 | But there are many others, have a look! 11 | 12 | ## Embedding Voyant 13 | 14 | One of the more powerful aspects of Voyant is that you can embed a live, functional tool in another page, much as you would embed a video clip from YouTube or Vimeo. See the [documentation](https://voyant-tools.org/docs/#!/guide/embedding). For instance, the tool to the right has been embedded with this code:
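(The exact embed markup will vary with your corpus and tool of choice; the following is an illustrative example, assuming the Cirrus tool and the same `austen` corpus used in the Jupyter example below, rather than the precise snippet used on this page.)

```html
<!-- illustrative only: embeds the Cirrus word cloud for a corpus named "austen" -->
<iframe src="https://voyant-tools.org/tool/Cirrus/?corpus=austen"
        style="width: 400px; height: 400px;"></iframe>
```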
15 | 16 | 18 | 19 | It's worth noting that the <iframe> tag is usually filtered out of a markdown document in GitHub, but it *is* possible to embed Voyant into a Jupyter Notebook. Just using the `iframe` tag won't work directly, but you can use the `IFRAME` class from the [IPython.display] module](https://ipython.readthedocs.io/en/stable/api/generated/IPython.display.html?highlight=iframe#classes). 20 | 21 | from IPython.display import IFrame 22 | IFrame('https://voyant-tools.org/tool/Cirrus/?corpus=austen', width=300, height=300) 23 | 24 | IFRAME 25 | 26 | ## Visualizing with Jupyter 27 | 28 | ![Jupyter](../images/jupyter48.png) One of the benefits of working with libraries like NLTK (which we've already introduced in a previous notebook) is that there are built-in libraries for simple plotting. For example, it's very easy to go from a text to a graph of word frequencies, something like this: 29 | 30 | ```python 31 | import nltk 32 | %matplotlib inline # magical incantation needed for first graph 33 | 34 | emma = nltk.corpus.gutenberg.words('austen-emma.txt') # load words 35 | stopwords = nltk.corpus.stopwords.words("English") # load stopwords 36 | # filter words that are alphabetic and not in stopword list 37 | words = [word.lower() for word in emma if word[0].isalpha() and not word.lower() in stopwords] 38 | freqs = nltk.FreqDist(words) # build frequency list 39 | freqs.plot(25) # plot the top 25 words 40 | ``` 41 | 42 | Plot of frequencies 43 | 44 | To continue with graphing, please consult [Getting Graphical](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingGraphical.ipynb) in the Art of Literary Programming with Python. 45 | -------------------------------------------------------------------------------- /ipynb/HelloWorld.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hello World!" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This is _Hello World!_, my first iPython Notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 3, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "Hello World!\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "print(\"Hello World!\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "Now let's try printing dynamic content like the current time." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "Hello World! It's Monday January 12, 2015\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "import time\n", 56 | "print(\"Hello World! 
It's\", time.strftime(\"%A %B %e, %Y\"))" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Things we've learned in this Notebook\n", 64 | "--\n", 65 | "* creating a new notebook\n", 66 | "* basic user interface of a notebook\n", 67 | "* printing a static string like _Hello World!_\n", 68 | "* debugging syntax errors\n", 69 | "* printing a dymamic string with the current time\n", 70 | "* a bit more about Markdown (see http://daringfireball.net/projects/markdown/syntax)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "---\n", 78 | "This is a template from the [GettingStarted](GettingStarted.ipynb) notebook.\n", 79 | "\n", 80 | "From [The Art of Literary Text Analysis](https://github.com/sgsinclair/alta) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com), [CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/)\n", 81 | "\n", 82 | "Created January 12, 2015" 83 | ] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.6.3" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 1 107 | } 108 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # The Art of Literary Text Mining 2 | 3 | This is a meta-guide that is intended to help you work through our guides for _The Art of Literary Text Mining_. 4 | 5 | ## Guides 6 | 7 | ![Voyant](images/voyant48.png) [The Art of Literary Text Mining for Voyant](./voyant/): Voyant is a *web-based* collection of text analysis and visualizations tools, it can be relatively easy to start using but is limited to the pre-packaged functionality that is already implemented. 8 | 9 | ![Jupyter](images/jupyter48.png) [The Art of Literary Text Mining for Python Jupyter Notebooks](../ipynb/ArtOfLiteraryTextAnalysis.ipynb): Python is a programming language with a huge number of useful libraries but it can take a while to become proficient in any programming language. 10 | 11 | ![ObservableHQ](images/observable48.png) [The Art of Literary Text Mining for ObservableHQ and VoyantJS](https://beta.observablehq.com/@sgsinclair/alta): This uses Javascript as a core programming language (takes some effort to learn) but has the benefit of being highly shareable as web-based resources. This approach exposes some of the analytic and visualization functionality of Voyant while allowing for more customized processing. 12 | 13 | Usually you would probably want to work through just one of these guides but there are cases when working with one or more guides together is preferable; this meta-guide is for this mixed approach. 14 | 15 | Why work through the materials of more than one guide? One reason is to fully appreciate the strengths and weaknesses of more than one approach. We firmly believe that no one tool or even one framework is ideal for all problems and that it can be useful to be familiar with more than solution. Indeed, our three guides have their own pros and cons that can be significant for a given task or a given project. 
The following is a very simplistic view of some of the characteristics of each approach: 16 | 17 | ## Comparison 18 | 19 | | | ![Voyant](images/voyant48.png)
Voyant | ![Jupyter](images/jupyter48.png)
Jupyter | ![ObservableHQ](images/observable48.png)
ObservableHQ+VoyantJS | 20 | |-|-|-|-| 21 | | **setup and configuration** | no setup for hosted version, easy desktop version | usually requires some setup | no setup | 22 | | **text analysis specificity** | text analysis specific | infinitely generalizable | mixed specificity of VoyantJS for text analysis and Javascript more generally | 23 | | **shareable** | Voyant URLs of tools and corpora | compatible with GitHub | web-based | 24 | | **scalable** | optimized for up to hundreds of documents | very scalable | somewhat limited to browser resources | 25 | 26 | ## Topics 27 | 28 | * [setup the environments](./setup/) 29 | * [getting started](./start/) 30 | * [scraping a corpus](./scrape/) 31 | * [converting a corpus](./convert/) 32 | * [frequencies](./count/) 33 | * [collocates](./collocate/) 34 | * [visualize](./visualize/) 35 | * semantics 36 | * parts-of-speech 37 | * sentiment 38 | * similarity 39 | -------------------------------------------------------------------------------- /ipynb/Useful Resources.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Useful Resources\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Programming Basics \n", 15 | "* [Codecademy](https://www.codecademy.com/learn/learn-python) - Online learning platform which offers free interactive lessons covering the very basics of programming languages.\n", 16 | "* [Google's Python Class](https://developers.google.com/edu/python/) - A combination of written materials, instructional videos and coding exersises to practice Python programming. \n", 17 | "* [Pyschools](http://www.pyschools.com/) - Practical python tutorials for beginners and beyond. 
Note - you must have a Google account to sign up.\n", 18 | "* [Udacity](https://www.udacity.com/course/programming-foundations-with-python--ud036) - Introductory Python programming class with mini-projects in each lesson.\n", 19 | "* [Tutorialspoint](https://www.tutorialspoint.com/python/python_basic_syntax.htm) - Basics of Python syntax.\n", 20 | "---" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### Software & Libraries\n", 28 | "* [Anaconda](https://www.anaconda.com/download/#macos) - Suite of data science applications \n", 29 | "* [Gensim](https://radimrehurek.com/gensim/) - Topic Modelling toolkit for Python\n", 30 | "* [NLTK](http://www.nltk.org/) - Natural Language Toolkit\n", 31 | "\n", 32 | "---\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Python Resources\n", 40 | "* [The Python Wiki](https://wiki.python.org/moin/FrontPage) - A comprehensive encyclopedia of Python-related information including a beginner's guide, common problems and links to many useful resources.\n", 41 | "* [Stack Overflow](https://stackoverflow.com/) - An excellent community-driven question-answer problem solving resource for even the trickiest of Python conundrums.\n", 42 | "\n", 43 | "---" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Further Explorations\n", 51 | "* [Voyant](https://voyant-tools.org/) - Open source web application for text analysis featuring a plethora of data and visualization tools.\n", 52 | "* [Big Data by Neal Caren](http://nealcaren.web.unc.edu/big-data/) - Tutorials which cover the fundamentals of quantitative text analysis for social scientists.\n", 53 | "---" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "### Open Source Materials\n", 61 | "\n", 62 | "* [Project Gutenberg](http://gutenberg.ca/index.html) - Digital editions of classic literature in the public domain\n", 63 | "\n", 64 | "---" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com)." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.6.3" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 2 105 | } 106 | -------------------------------------------------------------------------------- /docs/setup/index.md: -------------------------------------------------------------------------------- 1 | # Setting up the Art of Literary Text Mining 2 | 3 | This is part of the [Art of Literary Text Mining](../) collection. This page is intended to briefly describe how to get set up and configured with our three environments: Voyant Tools, Jupyter Notebooks, and ObservableHQ. 4 | 5 | The first step with any tool or framework is to ensure that whatever setup and configuration it needs have been performed. Because of the nature of the technologies, the work involved is different for each of our guides. 6 | 7 | ## Voyant 8 | 9 | ![Voyant](../images/voyant48.png) Voyant Tools is a hosted website [voyant-tools.org](https://voyant-tools.org) that requires no setup, no login, and no configuration. However, that simplicity comes with a price: the hosted version is widely used by people all over the world and that exerts pressure on the server, which sometimes causes downtime and other issues. For this reason (and others, such as data privacy), it's highly recommended that you [download and install the Desktop version of Voyant Tools](https://github.com/sgsinclair/VoyantServer/wiki/VoyantServer-Desktop) – in most cases it's as simple as downloading a zip file, uncompressing it, and clicking on the application launcher. 10 | 11 | As mentioned, the hosted version is sometimes over-extended. If the server doesn't seem to respond, wait a few seconds, up to a minute, and try again (the server usually restores itself within a few seconds). 12 | 13 | If you're trying to get the Desktop version functioning and it won't, there are three common issues to check: 14 | 15 | 1. [On Windows](https://github.com/sgsinclair/VoyantServer/wiki/VoyantServer-Desktop#windows), be sure that you extract the downloaded VoyantServer.zip file into a real directory rather than just double-clicking on the ZIP file to uncompress it. 16 | 17 | 1. [On Mac](https://github.com/sgsinclair/VoyantServer/wiki/VoyantServer-Desktop#mac), the first time you launch VoyantServer, you should right-click or ctrl-click on the VoyantServer.jar file as this will allow you to circumvent the operating system's security block for unsigned applications. 18 | 19 | 1. Check the memory [settings](https://github.com/sgsinclair/VoyantServer/wiki/VoyantServer-Desktop#settings): if you have an older machine with a limited amount of RAM, try opening the file called `server_settings.txt` in the same directory as VoyantServer.jar and changing the value "1024" to "512" (or even "256") before saving the text file and trying to relaunch VoyantServer. 20 | 21 | ## Jupyter Notebooks 22 | 23 | ![Jupyter](../images/jupyter48.png) Jupyter tends to be the most intensive solution to set up and configure, especially if you set it up on your local machine. There are a lot of instructions out there for getting set up, depending on your platform and system preferences, but the [Getting Setup notebook](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingSetup.ipynb) is a good place to start. 24 | 25 | One very important thing: we want to use Python 3.x or higher (not Python 2.x) – that should be obvious throughout, but it's worth double-checking as you select the download file from Anaconda. 26 | 27 | The recommended approach is to install Anaconda on your system. Think of Anaconda as its own environment that's installed on your system and that is isolated from other important system files. Anaconda is a sandbox that contains the Jupyter application, and the Jupyter application allows you to create Jupyter notebooks. 28 | 29 | ![Anaconda Architecture](jupyter-architecture.png) 30 | 31 | Unlike Voyant and ObservableHQ, which are always-available web applications, Jupyter Notebooks has to be launched and running in order to be used. This is an important distinction from our other environments: a "live" notebook (that can be edited) must have a process running somewhere, most likely on your computer. That process stores current contents in memory and handles the execution of code. So getting started each time will involve the following steps: 32 | 33 | 1. launch Anaconda Navigator (from your applications or desktop) 34 | 1. launch Jupyter Notebooks (from Anaconda Navigator, which launches a browser window) 35 | 1. create or open a Jupyter Notebook (in the browser) 36 | 37 | As we proceed we will want to use some Python helper libraries that are not installed by default in Anaconda. We will return to this, but it's worth emphasizing now that installation happens within our Anaconda environment (and doesn't interfere with other system files). Similarly, it's possible to have multiple Anaconda installations that are independent, but for now we'll assume that we have one installation and that any modifications happen to that one installation. 38 | 39 | ## ObservableHQ 40 | 41 | ![ObservableHQ](../images/observable48.png) ObservableHQ is also a hosted website [observablehq.com](https://observablehq.com). It's possible to visit ObservableHQ and make anonymous changes to a notebook (like [this one](https://beta.observablehq.com/@observablehq/fork-share-merge)), but in order to save changes you need to log in through one of the authentication services (currently GitHub, Twitter and Google – because we can use GitHub to store data, we strongly recommend that option). 42 | 43 | ![ObservableHQ Login](observable-login.png) 44 | 45 | ## Next Steps 46 | 47 | Now that we have a minimal setup for all three environments, we can proceed to [getting started](../start/). -------------------------------------------------------------------------------- /ipynb/ArtOfLiteraryTextAnalysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The Art of Literary Text Analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The Art of Literary Text Analysis (ALTA) has three objectives. 
\n", 15 | "\n", 16 | "- First, to introduce concepts and methodologies for literary text analysis programming. It doesn't assume you know how to program or how to use digital tools for analyzing texts. \n", 17 | "\n", 18 | "- Second, to show a range of analytical techniques for the study of texts. While it cannot explain and demonstrate everything, it provides a starting point for humanists with links to other materials.\n", 19 | "\n", 20 | "- Third, to provide utility notebooks you can use for operating on different texts. These are less well documented and combine ideas from the introductory notebooks.\n", 21 | "\n", 22 | "This instance of The Art of Literary Text Analysis is created in Jupyter Notebooks based on the Python scripting language. Other programming choices are available, and many conceptual aspects of the guide are relevant regardless of the language and implementation. \n", 23 | "\n", 24 | "**Jupyter Notebooks** was chosen for three main reasons: \n", 25 | "\n", 26 | "1. Python (the programming language used in Jupyter Notebooks) features extensive support for text analysis and natural language processing; \n", 27 | "\n", 28 | "2. Python is a great programming language to learn for those learning to program for the first time – it's not easy, but it represents a good balance between power, speed, readability and learnability;\n", 29 | "\n", 30 | "3. Jupyter Notebooks offers a _literate programming_ model of writing where blocks of prose text (like this one) can be interspersed with bits of code and output allowing us to use it to write this guide and you to write up your experiments. _The Art of Literary Text Analysis_ focuses on the thinking through of analytical processes, and the documentation-rich format offered by Jupyter Notebooks is well-suited to the nature of this guide and to helping you think through what you want to do." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Table of Contents" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "This guide is a work in progress. It was developed over the Winter of 2015 in conjunction with a course on literary text mining at McGill. It has been [forked](Glossary.ipynb#Fork \"A cloned copy of a project which is set-up on a independent branch seperate to the original.\") and extended for a course in the Winter of 2016 on big data and analysis at the University of Alberta. Here is the current outline:\n", 45 | "\n", 46 | "* First Encounters (basics for working with Jupyter Notebooks and digital texts)\n", 47 | "\t* [Getting Setup](GettingSetup.ipynb) (installing and setting up Jupyter Notebooks)\n", 48 | "\t* [Getting Started](GettingStarted.ipynb) (introducing core Jupyter Notebooks concepts)\n", 49 | "\t* [Getting Texts](GettingTexts.ipynb) (an example of acquiring digital texts)\n", 50 | "\t* [Getting NLTK](GettingNltk.ipynb) (foundations for text processing using the Natural Language Toolkit)\n", 51 | "\t* [Getting Graphical](GettingGraphical.ipynb) (foundations for visualizing data)\n", 52 | "* Close Encounters\n", 53 | "\t* [Searching for Meaning](SearchingMeaning.ipynb) (searching variant word forms and word meanings)\n", 54 | "\t* [Parts of Speech](PartsOfSpeech.ipynb) (analysing parts of speech (nouns, adjectives, verbs, etc.) 
of documents\n", 55 | "\t* [Repeating Phrases](RepeatingPhrases.ipynb) (analyzing repeating sequences of words)\n", 56 | "* Distant Encounters \n", 57 | "\t* [Sentiment Analysis](SentimentAnalysis.ipynb) (measuring opinion or mood of texts)\n", 58 | " * [Topic Modelling](TopicModelling.ipynb) (finding recurring groups of terms)\n", 59 | " * [Document Similarity](DocumentSimilarity.ipynb) (measuring and visualizing distances between documents)\n", 60 | "* Utility Examples\n", 61 | " * [Simple Sentiment Analysis](utilities/SimpleSentimentAnalysis.ipynb) (measuring sentiment with a simple dictionary in the notebook)\n", 62 | " * [Complex Sentiment Analysis](utilities/ComplexSentimentAnalysis.ipynb) (using research dictionaries to measure sentiment)\n", 63 | " * [Collocates](utilities/Collocates.ipynb) (identifying collocates for a target word)\n", 64 | " * [Concordances](utilities/Concordances.ipynb) (generating a concordance for a target word)\n", 65 | " * [Exploring a text with NLTK](utilities/Exploring a text with NLTK.ipynb) (shows simple ways you can explore a text with NLTK.)\n", 66 | "* Resources\n", 67 | " * [Useful Links](Useful Resources.ipynb) (A myriad of helpful python and text analysis resources)\n", 68 | " * [Glossary](Glossary.ipynb) (Definitions and explanations for concepts and jargon)\n", 69 | " " 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "source": [ 78 | "---\n", 79 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).\n", 80 | "
Created January 7, 2015 and last modified January 12, 2018 (Jupyter 5.0.0)" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.6.3" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 1 105 | } 106 | -------------------------------------------------------------------------------- /docs/scrape/index.md: -------------------------------------------------------------------------------- 1 | # Web Scraping with the Art of Literary Text Analysis 2 | 3 | This is part of the [Art of Literary Text Mining](../) collection. This page is intended to briefly describe how to get started with web scraping, particularly with Jupyter Notebooks and the `wget` command. 4 | 5 | A very common task when working with text analysis is acquiring a corpus of texts, frequently sourced from the web. Web scraping (or harvesting) is the act of fetching pages from the web and extracting the relevant content. There are two major kinds of web scraping: 6 | 7 | 1. fetching the contents from a list of specific URLs 8 | 1. fetching as much of a web site as possible, often by following links from one page to another (sometimes also called web crawling) 9 | 10 | For the first type it's possible to have code that produces a list of URLs to fetch; this is essentially what we did in the [Getting Started](../start/) guide page, especially the Jupyter version using Beautiful Soup. This is a good example of how tools can be mixed and matched in various ways: you could have a Jupyter notebook produce a list of URLs and then provide that list of URLs to Voyant. 11 | 12 | ## Voyant 13 | 14 | ![Voyant](../images/voyant48.png) Voyant's web scraping abilities are limited in that it assumes that you'll provide a list of URLs and there's no mechanism for parsing the contents of those URLs in order to fetch additional URLs. Still, it can be enormously convenient to paste a list of several URLs and have Voyant construct a corpus from them. Please note that any processing options are applied to all documents as appropriate (for instance, it's not possible to have different HTML CSS Selectors for different URLs, though it is possible to add documents individually by [modifying a corpus](https://voyant-tools.org/docs/#!/guide/modifyingcorpus)). 15 | 16 | Even if URL fetching in Voyant is convenient, there are times when doing the web scraping outside of Voyant is preferable. One such situation is where you have many URLs, say more than about a dozen. Voyant has to fetch each URL one at a time and that can be time-consuming, which can cause a server timeout. Moreover, if an error is encountered you'd need to start fetching over from the beginning next time. In fact, we recommend only fetching up to about three URLs at a time. 17 | 18 | Another situation is where you need to do some intermediate processing on the documents before analyzing them. In that case, you would scrape (download) them (possibly using the techniques described below, or the sketch that follows), edit the documents, and then upload them to Voyant.
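A minimal sketch of that intermediate download step in Python, assuming a hypothetical `urls.txt` file with one URL per line (the file name and the output naming scheme are illustrative, not anything Voyant requires):

```python
import urllib.request

# read the URLs from a hypothetical urls.txt, one per line
with open("urls.txt") as f:
    urls = [line.strip() for line in f if line.strip()]

# save each page to its own local file so it can be cleaned up before uploading to Voyant
for index, url in enumerate(urls):
    html = urllib.request.urlopen(url).read().decode("utf-8")
    with open("page-{}.html".format(index), "w", encoding="utf-8") as out:
        out.write(html)
```

Once the downloaded files have been cleaned up, they can be uploaded to Voyant as a set of documents.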
## Jupyter Notebook 21 | 22 | ![Jupyter](../images/jupyter48.png) Our Jupyter notebook will walk through the following steps: 23 | 24 | * fetching the contents at http://www.digitalhumanities.org/dhq/index/title.html 25 | * parsing that document to get a list of all the articles in the journal 26 | * fetching the contents of each of the articles 27 | 28 | To continue, please see [Web Scraping](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/Scraping.ipynb) with the Art of Literary Text Analysis. 29 | 30 | ## Wget Command 31 | 32 | Web scraping is such a common task that there are dedicated tools for doing it. Web scraping is not only important for people doing text analysis, but also, for instance, for anyone building a web search engine or otherwise wanting to create an archive of a site. One of the most widely used tools is a command-line utility called [`wget`](https://en.m.wikipedia.org/wiki/Wget). Here's a partial list of some of `wget`'s functionality: 33 | 34 | * fetch a single page (HTML source only): `wget http://www.digitalhumanities.org/dhq/` 35 | * fetch a single page and its assets: `wget -p -k http://www.digitalhumanities.org/dhq/` 36 | * fetch all URLs listed in the specified file: `wget -i urls.txt` 37 | * fetch a URL and recursively fetch all URLs in the contents: `wget -r http://www.digitalhumanities.org/dhq/` 38 | 39 | A disadvantage of `wget` is that it's not pre-installed on OS X or Windows, but we can remedy that by following the easy instructions found at the _Programming Historian_'s [Automated Downloading with Wget](https://programminghistorian.org/en/lessons/automated-downloading-with-wget#step-one-installation). 40 | 41 | For OS X the instructions on the page above are a bit out of date; here are the commands that seem to work best currently (from the [Homebrew](https://brew.sh) page): 42 | 43 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" 44 | 45 | brew install wget 46 | 47 | Once installed, we'll also follow the instructions in the [next section](https://programminghistorian.org/en/lessons/automated-downloading-with-wget#step-two-learning-about-the-structure-of-wget--downloading-a-specific-set-of-files) on creating a data directory from which we'll run our command. 48 | 49 | mkdir dhq 50 | cd dhq 51 | 52 | The first command is to "make directory" (`mkdir`) and the second command is to "change directory" (`cd`). 53 | 54 | One of `wget`'s strengths is in fetching multiple URLs and especially in finding links in one page and following those links to download contents in other pages, and so on recursively. Since `wget` is often used to fetch many URLs it's best to configure it so that it doesn't strain the target server too heavily (by trying to fetch hundreds of URLs as quickly as possible, for instance). A couple of common arguments can be added to be a good net citizen (and avoid being blacklisted by servers, which would prevent you from fetching more content). 55 | 56 | * `-w`: number of seconds to wait between requests: `wget -w 1 http://www.digitalhumanities.org/dhq/` 57 | * `--limit-rate`: the bandwidth to use in kilobytes/second: `wget --limit-rate=200k http://www.digitalhumanities.org/dhq/` 58 | 59 | (Note about arguments: typically one hyphen is used for abbreviations like "w" and two hyphens for full names like "limit-rate".) 60 | 61 | A final argument that's useful for our purposes is to tell `wget` to only fetch URLs matching a certain pattern, namely "/dhq/vol/…". 
We do that with the argument `--accept-regex`, which takes the regular expression as its value (we also throttle our requests as described above): 62 | 63 | wget -r --accept-regex "/dhq/vol/" -w 1 --limit-rate=200k http://www.digitalhumanities.org/dhq/ 64 | 65 | This says "go get the contents of http://www.digitalhumanities.org/dhq/, recursively fetching URLs that match our simple regular expression (actual articles), while waiting a second between each request and limiting the bandwidth to 200KB/second." 66 | 67 | And presto, we've scraped an entire journal! -------------------------------------------------------------------------------- /ipynb/Nltk.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Natural Language Toolkit" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Let's load _The Gold Bug_" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "THE GOLD-BUG\n", 27 | "\n", 28 | " What ho! what ho! this fellow is dancing mad!\n", 29 | "\n", 30 | " He hath been b\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "with open(\"data/goldBug.txt\", \"r\") as f:\n", 36 | " goldBugString = f.read()\n", 37 | "print(goldBugString[:100])" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Let's tokenize!" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "['the', 'gold-bug', 'what', 'ho', '!', 'what', 'ho', '!', 'this', 'fellow']" 56 | ] 57 | }, 58 | "execution_count": 10, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "import nltk\n", 65 | "goldBugTokens = nltk.word_tokenize(goldBugString.lower())\n", 66 | "goldBugTokens[:10]" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "['the', 'what', 'ho', 'what', 'ho', 'this', 'fellow']\n", 79 | "['the', 'what', 'ho', 'what', 'ho', 'this', 'fellow']\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "filterTokens = []\n", 85 | "for word in goldBugTokens[:10]:\n", 86 | " if word.isalpha():\n", 87 | " filterTokens.append(word)\n", 88 | "print(filterTokens)\n", 89 | "\n", 90 | "print([word for word in goldBugTokens[:10] if word.isalpha()])" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "goldBugWords = [word for word in goldBugTokens if any([char for char in word if char.isalpha()])]" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "[('the', 877),\n", 111 | " ('of', 465),\n", 112 | " ('and', 359),\n", 113 | " ('i', 336),\n", 114 | " ('to', 329),\n", 115 | " ('a', 327),\n", 116 | " ('in', 238),\n", 117 | " ('it', 213),\n", 118 | " ('you', 162),\n", 119 | " ('was', 137)]" 120 | ] 121 | }, 122 | "execution_count": 38, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "wordFrequencies = nltk.FreqDist(goldBugWords)\n", 129 | "wordFrequencies.most_common(10)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135
| "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "stopwords = nltk.corpus.stopwords.words(\"English\")\n", 147 | "print(stopwords)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "[('upon', 81),\n", 159 | " ('de', 73),\n", 160 | " (\"'s\", 56),\n", 161 | " ('jupiter', 53),\n", 162 | " ('legrand', 47),\n", 163 | " ('one', 38),\n", 164 | " ('said', 35),\n", 165 | " ('well', 35),\n", 166 | " ('massa', 34),\n", 167 | " ('could', 33),\n", 168 | " ('bug', 32),\n", 169 | " ('skull', 29),\n", 170 | " ('parchment', 27),\n", 171 | " ('made', 25),\n", 172 | " ('tree', 25),\n", 173 | " ('first', 24),\n", 174 | " ('time', 24),\n", 175 | " ('two', 23),\n", 176 | " ('much', 23),\n", 177 | " ('us', 23)]" 178 | ] 179 | }, 180 | "execution_count": 43, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "goldBugFilteredWords = [word for word in goldBugWords if not word in stopwords]\n", 187 | "nltk.FreqDist(goldBugFilteredWords).most_common(20)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Python 3", 201 | "language": "python", 202 | "name": "python3" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.6.3" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 1 219 | } 220 | -------------------------------------------------------------------------------- /docs/start/index.md: -------------------------------------------------------------------------------- 1 | # Getting Started with the Art of Literary Text Analysis 2 | 3 | This is part of the [Art of Literary Text Mining](../) collection. This page is intended to briefly describe how to get started, particularly with Voyant Tools. 4 | 5 | So you want to do some text analysis, but where to start? 
Let's imagine that we have a favourite news source and we want to try to determine what's being discussed (without necessarily just reading the front page articles). You can do this with most websites and media outlets, but for the purposes of this example, let's say that we want to look at the Canadian Broadcasting Corporation (Canada's public Anglophone broadcaster) at [CBC.ca](https://cbc.ca). 6 | 7 | ## Voyant 8 | 9 | ![Voyant](../images/voyant48.png) In Voyant analyzing the contents of a URL is dead simple, all that needs to be done is to visit the main page [voyant-tools.org](https://voyant-tools.org) and paste in the URL of interest. We can also use the query parameters (part of the URL) to specify an input argument: 10 | 11 | [https://voyant-tools.org/?input=https://cbc.ca](https://voyant-tools.org/?corpus=9094634e2f37d5e29cf93431c4c7bb5a&input=https://www.cbc.ca) 12 | 13 | The full interface can show some interesting aspects, but even just the summary points out some interesting aspects. For instance, even though the CBC page is essentially a compilation of blocks linking to other pages we can see that our corpus contains only one document: 14 | 15 | 16 | 17 | We said we wouldn't read the page directly, but it is worth having a look at what exactly we caught when we cast the net over the URL. To do that, we could have a look at the [Reader](https://voyant-tools.org/docs/#!/guide/reader) tool in Voyant. 18 | 19 | 20 | 21 | What we see is that there's a main title on the page "CBC.ca - watch, listen, and discover with Canada's Public Broadcaster…" but there's also navigational items like "Skip to Main Content", "CBCMenu", and "Search". While there's nothing wrong with that necessarily, it may be misleading to think that the news is talking about search (and rescue, for instance), when we have a keyword that is really from the navigational elements of the page (sometimes called paratextual elements). Can we do better? 22 | 23 | DOM-model.svg We can, and the way we do that is to dive into an exploration of what's called the [Document Object Model](https://en.wikipedia.org/wiki/Document_Object_Model), that is, the hierarchical elements that are part of the tree of this web document. 24 | 25 | ### The DOM and CSS Selectors 26 | 27 | HTML is a markup language that starts with a root node or tag (usually <html>), then splits into a <head> and a <body>, each of which may have its own children nodes (or tags or text). Within the DOM there are also ways of identifying unique elements and group similar elements into a class of objects that share some characteristics. This is precisely the syntax that's used to add styling to pages using Cascading Stylesheets (CSS). 28 | 29 | | Examples | Type | Explanation | 30 | |-|-|-| 31 | | body, p | tag name selector | select every tag that is either <body> or <p> | 32 | | #mainsection | ID selector | select the unique element with matching ID, as in <div id="mainsection"> | 33 | | .insight | class selector | select all elements with matching class, as in <div class="insight"> | 34 | 35 | The syntax of CSS Selectors is actually [much more powerful](https://en.wikipedia.org/wiki/Cascading_Style_Sheets#Selector), but for now this will suffice. 36 | 37 | So, back to our CBC news page, how do we clean up the input a bit? 
We can explore the DOM in the browser using built-in tools, depending on your browser: 38 | 39 | * **Firefox** Menu ➤ Web Developer ➤ Toggle Tools, or Tools ➤ Web Developer ➤ Toggle Tools 40 | * **Chrome** More tools ➤ Developer tools 41 | * **Safari** Develop ➤ Show Web Inspector. If you can't see the Develop menu, go to Safari ➤ Preferences ➤ Advanced, and check the Show Develop menu in menu bar checkbox. 42 | 43 | In this case (as of writing of this document, though things may change of course), one reasonable choice would be to select either the tag "main" (if we believe there's just one) or the ID #content. 44 | 45 | DOM Inspect 46 | 47 | To experiment, use Voyant (preferably the Desktop version) and try different settings while consulting the [documentation for the HTML Corpus Creation](https://voyant-tools.org/docs/#!/guide/corpuscreator-section-html) as necessary. When starting at the landing page of Voyant, be sure to click on the options icon to open this dialog box: 48 | 49 | HTML Options 50 | 51 | ### Exercise 52 | 53 | Voyant allows you to define a corpus with multiple documents using the "Documents" field, even if the original content is in only just one file. Is there a CSS Selector that allows you to compile all of the individual story blocks as separate documents (not the full contents if you visit any one story, just the title and blurb shown on the main page)? 54 | 55 | ### Gotchas 56 | 57 | Most web pages are rendered from the HTML code that is sent from the server to the browser, but there are cases where the browser receives further instructions to fetch and generate parts of a page. Those interactively generated pages probably won't work with Voyant (and other similar systems) since it can only see the HTML that's initially sent, not the rest of the content that is fetched after the page has loaded. 58 | 59 | Voyant (and similar systems) can only work with the content that is fetched from the URL but in some cases you may be looking at priviledged content that you can see in your browser but that's invisible to the server (the contents of your Facebook page, for instance). Any URL sent to Voyant assumes that the content is open and essentially the same regardless of who is fetching the page. 60 | 61 | ## Jupyter Notebook 62 | 63 | ![Jupyter](../images/jupyter48.png) We can also see the DOM and CSS Selection at play in our Jupyter Notebook for [Getting Started](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingStarted.ipynb). The notebook walks through the steps of creating a new notebook and some basic Python syntax, but if you don't need that you can skip ahead to the [Fetch URL Example](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingStarted.ipynb#Fetch-URL-Example). 64 | 65 | -------------------------------------------------------------------------------- /ipynb/GettingSetup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting Setup: Installing Jupyter\n", 8 | "\n", 9 | "This notebook describes how to get setup with Jupyter (formerly iPython Notebooks). It's part of the [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb). 
In particular, we'll look at:\n", 10 | "\n", 11 | "* [Downloading and installing Jupyter](#Downloading-and-Installing-Jupyter-with-Anaconda)\n", 12 | "* [Launching Jupyter and creating a working directory](#Launching-Jupyter)\n", 13 | "* [Creating a notebook](#Creating-a-Notebook)\n", 14 | "* [Quitting Jupyter](#Quitting-Jupyter)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Downloading and Installing Jupyter with Anaconda\n", 22 | "\n", 23 | "[\"Anaconda](https://www.continuum.io/downloads) Setting up Jupyter is (usually) a smooth and painless process. The easiest and recommended option is to [download and install Anaconda](https://www.continuum.io/downloads), which is a freely available bundle that includes Python, Jupyter, and several other things that will be useful to us. It's *very important* for the purposes of our notebooks to select a version for [Mac OS X](https://www.continuum.io/downloads#_macosx), [Windows](https://www.continuum.io/downloads#_windows) or [Linux](https://www.continuum.io/downloads#_unix) of **Anaconda with Python 3.x** (not Python 2.x).\n", 24 | "\n", 25 | "Once the Anaconda 3.x installer program is downloaded you can click on the installer and follow the instructions (using the defaults will work just fine). If you encounter difficulties, you may want to consult the [Jupyter installation documentation](http://jupyter.readthedocs.org/en/latest/install.html)." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Launching Jupyter\n", 33 | "\n", 34 | "\"Anaconda Once you've installed Anaconda, the easiest way to launch Jupyter is to use the Anaconda Navigator (which you should be able to find in your Applications folder on Mac or your Programs menu on Windows).\n", 35 | "\n", 36 | "The Anaconda Navigator will present several applications to choose from, we'll click on the _Launch_ button of _notebook_ (Jupyter Notebook):\n", 37 | "\n", 38 | "\"Anaconda\n", 39 | "\n", 40 | "This should launch two more windows:\n", 41 | "\n", 42 | "1. A terminal window where the Jupyter server ([kernel](Glossary.ipynb#kernel \"The core computer program of the operating system which can control all system processes\")) is running (this will be used to quit Jupyter later), and\n", 43 | "1. A web browser window that shows a [directory tree](Glossary.ipynb#directorytree \"A tree like structure which represents the organization and hierachy of files within a directory\") of your computer's file system (starting at the default path of Jupyter).\n", 44 | "\n", 45 | "The default path on a Mac is the user's \"Home\" directory. We probably don't want to create Jupyter notebooks there, so we'll navigate to another directory (like \"Documents\") and create a new folder (like \"Notebooks\"). The location and names aren't important but we'll need to remember where our notebooks are for future use." 
46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Creating Folders\n", 53 | "To create a new folder or notebook you use the _New_ button in the directory browser window.\n", 54 | "\n", 55 | "\"Jupyter\n", 56 | "\n", 57 | "Creating a new folder gives a default name (like \"Untitled Folder\") but we can select the folder using the checkbox to the left and then click the rename button that appears before giving the folder a new name.\n", 58 | "\n", 59 | "\"Folder\n", 60 | "\n", 61 | "Now we have Jupyter running and we have a new folder for our notebooks, we're ready for the next step in [Getting Started](GettingStarted.ipynb). But just before that, let's look quickly at how we create a notebook and how we quit Jupyter." 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Creating a Notebook\n", 69 | "\n", 70 | "Now you can create your first notebook. Use the same _New_ menu and pull down to the **Python 3** under the Notebooks heading in the menu. This will create your first notebook. We will review this and how to use notebooks in the next notebook [Getting Started](GettingStarted.ipynb)." 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Quitting Jupyter\n", 78 | "\n", 79 | "The browser window that was opened by the Anaconda Launcher is just a regular window. If we'd been working on a notebook, we'd of course want to save our work before quitting. We don't need to do this for browser (directory) windows. We can close the browser window(s) created by Jupyter in the usual browser way. Then we have to shut down the server [kernel](Glossary.ipynb#kernel \"The core computer program of the operating system which can control all system processes\") (so that our computer doesn't waste memory resources). To do that we do the following:\n", 80 | "\n", 81 | "1. _Close and Halt_ any notebooks you have running by going to the _File_ menu of each running notebook,\n", 82 | "1. Switch to the terminal window that was opened by the launcher and hit Ctrl-c twice (keep your finger on the Control key and press the \"c\" twice), then\n", 83 | "1. Switch to the Anaconda Launcher application and quit it (just as you would any other application)." 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Next Steps\n", 91 | "\n", 92 | "Let's now proceed to [Getting Started](GettingStarted.ipynb)." 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "---\n", 100 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).\n", 101 | "
Created January 7, 2015 and last modified January 14, 2018 (Jupyter 5.0.0)" 102 | ] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.7.1" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 1 126 | } 127 | -------------------------------------------------------------------------------- /ipynb/utilities/SimpleSentimentAnalysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simple Sentiment Analysis\n", 8 | "\n", 9 | "This notebook shows how to analyze a collection of passages like Tweets for sentiment.\n", 10 | "\n", 11 | "This is based on Neal Caron's [An introduction to text analysis with Python, Part 1](http://nealcaren.web.unc.edu/an-introduction-to-text-analysis-with-python-part-1/).\n", 12 | "\n", 13 | "This Notebook shows how to analyze one tweet." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### Setting up our data\n", 21 | "\n", 22 | "Here we will define the data to test our positive and negative dictionaries." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 6, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "theTweet = \"No food is good food. Ha. I'm on a diet and the food is awful and lame.\"\n", 34 | "positive_words=['awesome','good','nice','super','fun','delightful']\n", 35 | "negative_words=['awful','lame','horrible','bad']" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 7, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "list" 47 | ] 48 | }, 49 | "execution_count": 7, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "type(positive_words)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Tokenizing the text\n", 63 | "\n", 64 | "Now we will tokenize the text." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 8, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "['no', 'food', 'is', 'good', 'food', 'ha', 'i', 'm', 'on', 'a']\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "import re\n", 82 | "theTokens = re.findall(r'\\b\\w[\\w-]*\\b', theTweet.lower())\n", 83 | "print(theTokens[:10])" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### Calculating postive words\n", 91 | "\n", 92 | "Now we will count the number of positive words." 
93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 14, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "1\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "numPosWords = 0\n", 110 | "for banana in theTokens:\n", 111 | " if banana in positive_words:\n", 112 | " numPosWords += 1\n", 113 | "print(numPosWords) " 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### Calculating negative words\n", 121 | "\n", 122 | "Now we will count the number of negative words." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "2\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "numNegWords = 0\n", 140 | "for word in theTokens:\n", 141 | " if word in negative_words:\n", 142 | " numNegWords += 1\n", 143 | "print(numNegWords) " 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 18, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "True" 155 | ] 156 | }, 157 | "execution_count": 18, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "v1 = \"0\"\n", 164 | "v2 = 0\n", 165 | "v3 = str(v2)\n", 166 | "v1 == v3" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### Calculating percentages\n", 174 | "\n", 175 | "Now we calculate the percentages of postive and negative." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 11, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Positive: 6% Negative: 11%\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "numWords = len(theTokens)\n", 193 | "percntPos = numPosWords / numWords\n", 194 | "percntNeg = numNegWords / numWords\n", 195 | "print(\"Positive: \" + \"{:.0%}\".format(percntPos) + \" Negative: \" + \"{:.0%}\".format(percntNeg))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Deciding if it is postive or negative\n", 203 | "\n", 204 | "We are going assume that a simple majority will define if the Tweet is positive or negative." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 12, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Negative 1:2\n", 217 | "\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "if numPosWords > numNegWords:\n", 223 | " print(\"Positive \" + str(numPosWords) + \":\" + str(numNegWords))\n", 224 | "elif numNegWords > numPosWords:\n", 225 | " print(\"Negative \" + str(numPosWords) + \":\" + str(numNegWords))\n", 226 | "elif numNegWords == numPosWords:\n", 227 | " print(\"Neither \" + str(numPosWords) + \":\" + str(numNegWords))\n", 228 | " \n", 229 | "print()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "## Next Steps" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "Let's try another utility example, this time looking at more [Complex Sentiment Analysis](ComplexSentimentAnalysis.ipynb)." 
249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "---\n", 256 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](../ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).
Created August 8, 2014 (Jupyter 4.2.1)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Python 3", 272 | "language": "python", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.6.3" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 1 290 | } 291 | -------------------------------------------------------------------------------- /ipynb/utilities/.ipynb_checkpoints/SimpleSentimentAnalysis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simple Sentiment Analysis\n", 8 | "\n", 9 | "This notebook shows how to analyze a collection of passages like Tweets for sentiment.\n", 10 | "\n", 11 | "This is based on Neal Caron's [An introduction to text analysis with Python, Part 1](http://nealcaren.web.unc.edu/an-introduction-to-text-analysis-with-python-part-1/).\n", 12 | "\n", 13 | "This Notebook shows how to analyze one tweet." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### Setting up our data\n", 21 | "\n", 22 | "Here we will define the data to test our positive and negative dictionaries." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 6, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "theTweet = \"No food is good food. Ha. I'm on a diet and the food is awful and lame.\"\n", 34 | "positive_words=['awesome','good','nice','super','fun','delightful']\n", 35 | "negative_words=['awful','lame','horrible','bad']" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 7, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "list" 47 | ] 48 | }, 49 | "execution_count": 7, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "type(positive_words)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Tokenizing the text\n", 63 | "\n", 64 | "Now we will tokenize the text." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 8, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "['no', 'food', 'is', 'good', 'food', 'ha', 'i', 'm', 'on', 'a']\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "import re\n", 82 | "theTokens = re.findall(r'\\b\\w[\\w-]*\\b', theTweet.lower())\n", 83 | "print(theTokens[:10])" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### Calculating postive words\n", 91 | "\n", 92 | "Now we will count the number of positive words." 
93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 14, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "1\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "numPosWords = 0\n", 110 | "for banana in theTokens:\n", 111 | " if banana in positive_words:\n", 112 | " numPosWords += 1\n", 113 | "print(numPosWords) " 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### Calculating negative words\n", 121 | "\n", 122 | "Now we will count the number of negative words." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "2\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "numNegWords = 0\n", 140 | "for word in theTokens:\n", 141 | " if word in negative_words:\n", 142 | " numNegWords += 1\n", 143 | "print(numNegWords) " 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 18, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "True" 155 | ] 156 | }, 157 | "execution_count": 18, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "v1 = \"0\"\n", 164 | "v2 = 0\n", 165 | "v3 = str(v2)\n", 166 | "v1 == v3" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### Calculating percentages\n", 174 | "\n", 175 | "Now we calculate the percentages of postive and negative." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 11, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Positive: 6% Negative: 11%\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "numWords = len(theTokens)\n", 193 | "percntPos = numPosWords / numWords\n", 194 | "percntNeg = numNegWords / numWords\n", 195 | "print(\"Positive: \" + \"{:.0%}\".format(percntPos) + \" Negative: \" + \"{:.0%}\".format(percntNeg))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Deciding if it is postive or negative\n", 203 | "\n", 204 | "We are going assume that a simple majority will define if the Tweet is positive or negative." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 12, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Negative 1:2\n", 217 | "\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "if numPosWords > numNegWords:\n", 223 | " print(\"Positive \" + str(numPosWords) + \":\" + str(numNegWords))\n", 224 | "elif numNegWords > numPosWords:\n", 225 | " print(\"Negative \" + str(numPosWords) + \":\" + str(numNegWords))\n", 226 | "elif numNegWords == numPosWords:\n", 227 | " print(\"Neither \" + str(numPosWords) + \":\" + str(numNegWords))\n", 228 | " \n", 229 | "print()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "## Next Steps" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "Let's try another utility example, this time looking at more [Complex Sentiment Analysis](ComplexSentimentAnalysis.ipynb)." 
249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "---\n", 256 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](../ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).
Created August 8, 2014 (Jupyter 4.2.1)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Python 3", 272 | "language": "python", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.6.3" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 1 290 | } 291 | -------------------------------------------------------------------------------- /ipynb/Converting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Converting with the Art of Literary Text Analysis\n", 8 | "\n", 9 | "Our objective here is to process a plain text file so that it is more suitable for analysis. In particular. we will take two _Godfather_ screenplays and remove the stage directions. Here are the steps:\n", 10 | "\n", 11 | "* fetch the two screenplays\n", 12 | "* extract the screenplay text from the files\n", 13 | "* remove the stage directions\n", 14 | "\n", 15 | "Since we're doing this for two files we will introduce the concept of reusable functions. We've used functions in Python, in this case we're defining our own functions for the first time and using them. The basic syntax is simple:\n", 16 | "\n", 17 | " def function_name(arguments):\n", 18 | " # processing\n", 19 | " # return a value (usually)\n", 20 | " \n", 21 | "We can start by defining our function to fetch a URL, building on the materials we saw with [Scraping](Scraping.ipynb)." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 64, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import urllib.request\n", 31 | "\n", 32 | "# this function simply fetches the contents of a URL\n", 33 | "def fetch(url):\n", 34 | " response = urllib.request.urlopen(url) # open for reading\n", 35 | " return response.read() # read and return" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 65, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "b'\\r\\nGodfather Script at IMSDb.\\r\\nCreated January 31, 2019 (Jupyter 5)." 220 | ] 221 | } 222 | ], 223 | "metadata": { 224 | "kernelspec": { 225 | "display_name": "Python 3", 226 | "language": "python", 227 | "name": "python3" 228 | }, 229 | "language_info": { 230 | "codemirror_mode": { 231 | "name": "ipython", 232 | "version": 3 233 | }, 234 | "file_extension": ".py", 235 | "mimetype": "text/x-python", 236 | "name": "python", 237 | "nbconvert_exporter": "python", 238 | "pygments_lexer": "ipython3", 239 | "version": "3.7.1" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 2 244 | } 245 | -------------------------------------------------------------------------------- /docs/collocate/index.md: -------------------------------------------------------------------------------- 1 | # Collocates with the Art of Literary Text Mining 2 | 3 | In the previous meta-guide we considered the nature of bits and bytes and strings and how we might fruitfully [count occurrences](../count/). 
This page shifts focus from finding and counting terms to considering the lexical context of terms. In other words, if we were to collect the terms that are in proximity to our keyword, what might we observe? 4 | 5 | ## Concordances 6 | 7 | Concordances are a very old "technology" or technique, reaching back at least to the 12th century, when theologians sought to better understand certain concepts by creating a type of extended index of the occurrences of terms. Even though concordances pre-date the computer by centuries, the digital makes it far easier to re-organize data. Imagine we have the following excerpt from Ursula LeGuin's _The Left Hand of Darkness_: 8 | 9 | Insofar as I love life, I love the hills of the Domain of Estre, but that sort of love does not have a boundary-line of hate. And beyond that, I am ignorant, I hope. 10 | 11 | This is one text, but now let's imagine that we want to generate a concordance where the keyword is "love", in other words, each occurrence of "love" with some context (three words) on each side: 12 | 13 | Insofar as I **love** life, I love 14 | love life, I **love** the hills of 15 | that sort of **love** does not have 16 | 17 | This, in certain ways, is a new text and we can consider it as such for counting – what are the top frequency terms in this new text? Our counting now has an additional layer of potential meaning: we are now focusing on terms that are related to our keyword. High frequency terms in this new text may be said to be related to the keyword, since they tend to occur together (as always, it's probably more useful if we remove the stopwords). In fact, our concordance might be more useful already if we skip stopwords: 18 | 19 | **love** life, love hills 20 | love life **love** hills Domain Estre 21 | hills Domain Estre **love** boundary-line hate ignorant 22 | 23 | This example demonstrates one of the possible dangers of simple concordancing: because of the proximity of two occurrences of "love", some of the same words are duplicated. We could resolve this with some additional coding, but in most cases, depending on the term of interest and the size of the context, it's rarely a problem. 24 | 25 | ## Collocates 26 | 27 | Collocates (we can see "co-located" in the name) are terms that appear within some pre-defined proximity. At some level any two terms in a text are collocates (this is sometimes referred to as bag-of-words, where all words are considered together regardless of position). But usually collocates are considered in a smaller window (often a single-digit number of terms in each direction). 28 | 29 | What do collocates tell us? In many cases probably not much. In novels we often see "he said" or "she said", so we'd expect "he" and "she" to be high-frequency collocates of "said" (likewise, "he" and "she" might have "said" as an important collocate). In practice "he" and "she" (and possibly "said") may be hidden from view because of a stoplist. 30 | 31 | ## Collocates in Voyant 32 | 33 | ![Voyant](../images/voyant48.png) Voyant has several tools that use collocate information. 34 | 35 | ### Links 36 | 37 | A first collocates-based tool is _Links_ (shown on the right), which is in a tab of the upper left-hand tool panel (where Cirrus is shown by default). When first opened, _Links_ selects three of the highest frequency terms (shown in blue boxes) and then fetches collocates of those terms (shown in the orange boxes). 38 | 39 | A line between two terms indicates a collocate relationship, in other words, those two terms occur together more often.
The thicker the line, the more frequent (relative to all collocate links shown) the collocation. This is a relatively complex visualization because it's showing multiple things, including: 40 | 41 | * highest frequency terms 42 | * collocates of those high frequency terms, indicated with lines 43 | * other collocate relationships indicated by lines (such as between orange boxes) 44 | 45 | This is a network graph in that it's showing the various relationships (by virtue of collocation) of both keywords and collocate words in the text. The trick is knowing the extent to which a connection between two words is more coincidental or more indicative of a potentially significant relationship. 46 | 47 | It's worth noting that when you click on a term in the _Links_ tool that term will likely appear in other tools, such as the _Reader_ and _Trends_. You can also click on the lines in _Links_ to initiate a search for places where the two terms at the ends of the line occur in proximity (a proximity search). 48 | 49 | For instance, I can delete the current terms in _Links_ (see the button near the bottom of the tool) and add the term "love" to the screen by searching and selecting it in the textbox. I can click on the term "love" multiple times to fetch additional occurrences. Then I can choose one of the collocates by clicking on the line that connects two terms of interest, such as "love" and "young". 50 | 51 | Although I can click on a word to fetch more collocates, sometimes it's useful to see many more collocates at once. That's possible in _Links_ for one keyword at a time: right-click or Ctrl-click on a term and select _Centralize_ from the menu that appears. That will place the keyword in the middle and show all the collocates that have been fetched (to some limit), ordered by frequency, in the periphery. To revert to the previous mode, right/ctrl-click and choose _Fetch Collocates_ from the menu. 52 | 53 | ### TermsBerry 54 | 55 | Another useful tool for exploring collocates in Voyant is _TermsBerry_, which can be found in the middle panel (top row) in the second tab. Although visually it's very different from _Links_, it also provides much of the same information. Whereas the default view in _Links_ shows data for the top 3 terms in the corpus (after the stoplist has been applied), _TermsBerry_ by default shows 75 of the top frequency words, so it's much denser with information. You can also click on the "Strategy" button at the bottom to determine how the initial seed words are shown: it can be "top terms" (by frequency) or "distinct terms" (higher frequency compared to other texts). 56 | 57 | 58 | Whereas _Links_ shows lines between words that collocate, _TermsBerry_ indicates collocates as you hover over different terms. If you hover over any term the background colour of the other terms will update, with darker items showing more frequent collocates (the count is visible under each term). In what ways does this tool make it easier or harder to study collocates? 59 | 60 | ### Collocates 61 | 62 | The final tool that we'll mention is a more classic presentation of data in tabular format. By default it shows several high frequency terms (the keyword in the _Term_ column) as well as several collocate forms. One benefit of the tabular view is that results can be organized by sorting columns: terms, term counts, collocates, collocate counts. The search is another powerful aspect of this tool, allowing you to work with one or more keyword terms at a time, and even to find collocates of phrases (what terms occur close to "love him", for instance?). Again, what are the pros and cons of this tool compared to the others? Are they complementary?
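Before heading into the notebooks, it may help to see what "collocates within a window" means in code. The following is a minimal sketch (not part of the original notebooks): it reuses the tokenizing pattern from the utility notebooks and counts the words appearing within five tokens of "love" in the excerpt quoted above; the keyword and window size are arbitrary choices for illustration.

```python
import re
from collections import Counter

# The excerpt from The Left Hand of Darkness quoted earlier on this page
text = ("Insofar as I love life, I love the hills of the Domain of Estre, "
        "but that sort of love does not have a boundary-line of hate. "
        "And beyond that, I am ignorant, I hope.")

# Same simple tokenization used in the utility notebooks
tokens = re.findall(r"\b\w[\w-]*\b", text.lower())

keyword, window = "love", 5  # look 5 tokens to the left and right of each "love"
collocates = Counter()
for i, token in enumerate(tokens):
    if token == keyword:
        context = tokens[max(0, i - window):i] + tokens[i + 1:i + window + 1]
        collocates.update(w for w in context if w != keyword)

print(collocates.most_common(10))
```

On a real corpus we would also apply a stoplist, as discussed above, so that words like "the" and "of" don't dominate the counts.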
63 | 64 | ## Collocates in Jupyter 65 | 66 | ![Jupyter](../images/jupyter48.png) For our exploration of collocates in Jupyter we'll follow a link into [Getting NLTK](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingNltk.ipynb) in the Art of Literary Text Mining with Jupyter. 67 | -------------------------------------------------------------------------------- /docs/convert/index.md: -------------------------------------------------------------------------------- 1 | # Format Conversion with the Art of Literary Text Analysis 2 | 3 | This is part of the [Art of Literary Text Mining](../) collection. This page is intended to briefly describe how to get started with format conversion, particularly with Jupyter Notebooks. 4 | 5 | ### Plain Text 6 | 7 | For better or for worse, the vast majority of text mining projects either start with plain text versions of the documents, or convert existing documents to plain text. There are some projects and some tools that make use of markup in the text during analysis, but they're relatively rare (we'll see some examples later in the guide). 8 | 9 | We identify three major kinds of conversion and pre-processing steps: 10 | 11 | 1. files are already in plain text but require some cleaning (to remove a license statement or regular page numbers, for instance) 12 | 1. files are in HTML or XML format in ways that are conducive to text extraction (as we've already seen) and other pre-processing (especially thanks to libraries like BeautifulSoup) 13 | 1. files are in some other format that may require special or manual handling, especially for binary formats like MS Word and PDF. 14 | 15 | For documents that are already in plain text, the easiest approach is often to make changes manually in the files or to use an application to make the same changes in multiple documents at a time (if the editor supports such functionality). 16 | 17 | Let's work through a real example: three draft screenplays for The Godfather movies, available from [IMSDb search](https://www.imsdb.com/search.php). If you type "Godfather" in the search you should get three hits: 18 | 19 | #### Search results for 'godfather' 20 | 21 | Godfather (1971-03 Draft) 22 | Written by Mario Puzo,Francis Ford Coppola 23 | Godfather Part II (1973-09 Draft) 24 | Written by Mario Puzo,Francis Ford Coppola 25 | Godfather Part III, The (1979-03 First draft) 26 | Written by Mario Puzo,Francis Ford Coppola 27 | 28 | Let's just work with two of the three scripts, the first two (the third has a slightly different format that confuses things somewhat, though it would be possible to use as well). Near the bottom of each script's page you'll find a link to _Read "Godfather" script_; from there we can compile links for each document: 29 | 30 | https://www.imsdb.com/scripts/Godfather.html 31 | https://www.imsdb.com/scripts/Godfather-Part-II.html 32 | 33 | Rather than save the HTML file to our hard drive, we will select the actual script text (starting with **_THE GODFATHER_** and going until **THE END**) and copy that entire section to the clipboard. 34 | 35 | Now we need a plain text editor.
Several very good ones exist, including [TextMate](https://macromates.com) for Mac and [Sublime](https://www.sublimetext.com) for multiple platforms, but we will use [Atom](https://atom.io), a relatively new kid on the block that has excellent GitHub integration. Start by downloading Atom, unzipping the download and placing the executable where you want it (I put it in my Applications folder). 36 | 37 | When Atom first opens you should open your LLCU-212 GitHub folder (any folder can be a project). Then from the _File_ menu select _Add Project Folder…_ and add a folder called "Godfather" (without the quotes). Finally you can select _New File_ from the _File_ menu, paste the screenplay into the document, and then save the file in the "Godfather" folder with the name "Godfather.txt". Next get the [second screenplay](https://www.imsdb.com/scripts/Godfather-Part-II.html), select the actual text, copy it to the clipboard, return to Atom, choose _New File_ from the _File_ menu, paste the contents, and save it in the "Godfather" folder as "Godfather2.txt". 38 | 39 | Without much fanfare we just demonstrated a simple but powerful mechanism for converting between formats. The original web page was in HTML and when we copied the screenplay into the clipboard it was still styled text (you can see this if you paste the same text into a styled editor like MS Word). However, when you paste HTML or styled text into a plain text editor you also convert your document to plain text. Needless to say, things like images will be lost, but in our case all we really need is the plain text, so this operation is suitable. 40 | 41 | If you keep the Godfather screenplay open you can see some layout particularities. Namely, stage directions are all preceded by a single tab (then other characters) whereas speeches are all preceded by a double tab (then other characters). 42 | 43 | Find in Project 44 | 45 | If we wanted to remove all the stage directions, one way to do so would be to select and remove all lines that have only a single tab. That's where regular expressions come in. 46 | 47 | [Regular Expressions](https://en.wikipedia.org/wiki/Regular_expression) are a powerful mechanism for matching not only visible characters, but also invisible characters (tabs, newlines, etc.), character classes (lowercase characters, digits), and a whole bunch of other things. We won't go deep into regular expressions here, but suffice it to introduce a few very common aspects of the syntax (a short example of these patterns in action follows the lists below): 48 | 49 | * **.**: any character 50 | * **\w**: any word character (a letter, digit or underscore) 51 | * **\d**: any digit (number) 52 | * **\t**: a tab character 53 | * **\n**: a newline character 54 | * **\s**: a whitespace character 55 | * **[aeiou]**: any of the characters enumerated 56 | * **[a-z]**: any character in the range 57 | * **[^aeiou]**: none of the characters mentioned 58 | * **^**: zero-length match at the start of a line 59 | * **$**: zero-length match at the end of a line 60 | * **\b**: zero-length match of a word boundary 61 | * **(one|two)**: any word between the pipes 62 | 63 | In addition, there are ways of repeating these forms: 64 | 65 | * **.\***: zero or more times 66 | * **.?**: zero or one time 67 | * **.+**: one or more times 68 | * **.{5}**: five times 69 | * **.{2,5}**: two to five times 70 |
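To make these patterns more concrete, here is a minimal Python sketch (not part of the original guide) that removes single-tab stage directions from a made-up screenplay fragment; the sample lines are invented for illustration, but the indentation scheme matches the one described above.

```python
import re

# A tiny screenplay-like sample: stage directions are indented with one tab,
# speeches (speaker names and dialogue) with two tabs.
sample = (
    "\tThe room is dark and quiet.\n"
    "\t\tMICHAEL\n"
    "\t\tI have to speak with my father.\n"
    "\tHe crosses to the door.\n"
)

# With re.MULTILINE, ^ anchors at the start of every line; \t matches a single
# tab and [^\t] requires that the next character is not another tab, so
# double-tab speech lines are left untouched.
no_directions = re.sub(r"^\t[^\t].*\n?", "", sample, flags=re.MULTILINE)
print(no_directions)
```

The Atom pattern described next makes the leading tab optional (`\t?`), so it also drops un-indented lines, and it performs the matching interactively across every file in the project folder.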
71 | Now that we have our two data files in place, we can demonstrate Atom's powerful search and replace capabilities. We will jump straight to replacing things in multiple files, but of course Atom also has a more conventional search and replace mechanism for the currently open file. Replacing across documents is powerful because it can be performed on one document or on hundreds at once; a type of automation (without programming). 72 | 73 | From the _File_ menu, select _Find in Project_ (on Mac the shortcut is Command-Shift-F). 74 | 75 | ![Find in Project](find-in-project.png) 76 | 77 | That will cause a dialog to appear near the bottom of the page. 78 | 79 | ![Find in Project](find-in-project-dialog.png) 80 | 81 | In the first box we have `^\t?[^\t].*`: 82 | 83 | * **^**: match at the beginning of the line 84 | * **\t?**: match zero or one tab characters 85 | * **[^\t]**: match anything except a tab character 86 | * **.\***: match until the end of the line 87 | 88 | We also add "Godfather" in the bottom box to ensure that the search and replace only happens in our new data directory. And presto! We have gotten rid of stage directions (assuming that's what we wanted). 89 | 90 | ## Jupyter Notebook 91 | 92 | ![Jupyter](../images/jupyter48.png) Using a friendly application like Atom is usually preferable and quicker than writing code ourselves, but there are times when having code is preferable, especially when the circumstances are more complex. Another reason to use code is that the code can be repeatedly re-run, whereas the steps taken in the application probably have to be repeated manually each time. 93 | 94 | We demonstrate a similar situation with the [Converting Jupyter notebook](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/Converting.ipynb). 95 | 96 | ## Voyant 97 | 98 | ![Voyant](../images/voyant48.png) The moment to do format conversion in Voyant is at the outset when one first creates a corpus. As we've seen previously, we can use powerful CSS Selectors and XML XPath expressions to determine which parts of a document should be used. There's even support for some simple filtering of plain text files. The real power of conversion is in Voyant's ability to read dozens of file formats, including PDF, MS Word, OpenOffice, Apple Pages, RTF, etc. 99 | 100 | Moreover, since it's possible to export a corpus in a variety of formats, one could think of Voyant as a conversion utility for a wide range of formats: upload files in weird and wonderful formats and then download the corpus as Voyant XML (minimal structural tagging) or plain text. The download button is located in the toolbar of the [Documents](https://voyant-tools.org/docs/#!/guide/documents) tool. 101 | -------------------------------------------------------------------------------- /ipynb/utilities/Concordances.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Generating Concordances\n", 8 | "\n", 9 | "This notebook shows how you can generate a concordance using lists." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "First we see what text files we have. 
" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "Hume Enquiry.txt negative.txt positive.txt\r\n", 29 | "Hume Treatise.txt obama_tweets.txt\r\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "ls *.txt" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "We are going to use the \"Hume Enquiry.txt\" from the Gutenberg Project. You can use whatever text you want. We print the first 50 characters to check." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "This string has 1344061 characters.\n", 54 | "The Project Gutenberg EBook of A Treatise of Human\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "theText2Use = \"Hume Treatise.txt\"\n", 60 | "with open(theText2Use, \"r\") as fileToRead:\n", 61 | " fileRead = fileToRead.read()\n", 62 | " \n", 63 | "print(\"This string has\", len(fileRead), \"characters.\")\n", 64 | "print(fileRead[:50])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Tokenization\n", 72 | "\n", 73 | "Now we tokenize the text producing a list called \"listOfTokens\" and check the first words. This eliminates punctuation and lowercases the words." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "['the', 'project', 'gutenberg', 'ebook', 'of', 'a', 'treatise', 'of', 'human', 'nature']\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "import re\n", 91 | "listOfTokens = re.findall(r'\\b\\w[\\w-]*\\b', fileRead.lower())\n", 92 | "print(listOfTokens[:10])" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Input\n", 100 | "\n", 101 | "Now we get the word you want a concordance for and the context wanted." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "What word do you want collocates for? truth\n", 114 | "How much context do you want? 10\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "word2find = input(\"What word do you want collocates for? \").lower() # Ask for the word to search for\n", 120 | "context = input(\"How much context do you want? 
\")# This asks for the context of words on either side to grab" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "str" 132 | ] 133 | }, 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "type(context)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "int" 152 | ] 153 | }, 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "contextInt = int(context)\n", 161 | "type(contextInt)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 9, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "228958" 173 | ] 174 | }, 175 | "execution_count": 9, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "len(listOfTokens)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Main function\n", 189 | "\n", 190 | "Here is the main function that does the work populating a new list with the lines of concordance. We check the first 5 concordance lines." 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 10, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "['220330: a reason why the faculty of recalling past ideas with truth and clearness should not have as much merit in it',\n", 202 | " '223214: confessing my errors and should esteem such a return to truth and reason to be more honourable than the most unerring',\n", 203 | " '223680: from the other this therefore being regarded as an undoubted truth that belief is nothing but a peculiar feeling different from',\n", 204 | " '224382: mind and he will evidently find this to be the truth secondly whatever may be the case with regard to this',\n", 205 | " '225925: by their different feeling i should have been nearer the truth end of project gutenberg s a treatise of human nature']" 206 | ] 207 | }, 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "def makeConc(word2conc,list2FindIn,context2Use,concList):\n", 215 | "\n", 216 | " end = len(list2FindIn)\n", 217 | " for location in range(end):\n", 218 | " if list2FindIn[location] == word2conc:\n", 219 | " # Here we check whether we are at the very beginning or end\n", 220 | " if (location - context2Use) < 0:\n", 221 | " beginCon = 0\n", 222 | " else:\n", 223 | " beginCon = location - context2Use\n", 224 | " \n", 225 | " if (location + context2Use) > end:\n", 226 | " endCon = end\n", 227 | " else:\n", 228 | " endCon = location + context2Use + 1\n", 229 | " \n", 230 | " theContext = (list2FindIn[beginCon:endCon])\n", 231 | " concordanceLine = ' '.join(theContext)\n", 232 | " # print(str(location) + \": \" + concordanceLine)\n", 233 | " concList.append(str(location) + \": \" + concordanceLine)\n", 234 | "\n", 235 | "theConc = []\n", 236 | "makeConc(word2find,listOfTokens,int(context),theConc)\n", 237 | "theConc[-5:]" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## Output\n", 245 | "\n", 246 | "Finally, we output to a text file." 
247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 11, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "Done\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "nameOfResults = word2find.capitalize() + \".Concordance.txt\"\n", 264 | "\n", 265 | "with open(nameOfResults, \"w\") as fileToWrite:\n", 266 | " for line in theConc:\n", 267 | " fileToWrite.write(line + \"\\n\")\n", 268 | " \n", 269 | "print(\"Done\")" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Here we check that the file was created." 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 12, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "Truth.Concordance.txt\r\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "ls *.Concordance.txt" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## Next Steps" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "Onwards to our final utility example [Exploring a text with NLTK](Exploring a text with NLTK.ipynb)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "---\n", 315 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](../ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).
Created September 30th, 2016 (Jupyter 4.2.1)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": true 323 | }, 324 | "outputs": [], 325 | "source": [] 326 | } 327 | ], 328 | "metadata": { 329 | "kernelspec": { 330 | "display_name": "Python 3", 331 | "language": "python", 332 | "name": "python3" 333 | }, 334 | "language_info": { 335 | "codemirror_mode": { 336 | "name": "ipython", 337 | "version": 3 338 | }, 339 | "file_extension": ".py", 340 | "mimetype": "text/x-python", 341 | "name": "python", 342 | "nbconvert_exporter": "python", 343 | "pygments_lexer": "ipython3", 344 | "version": "3.6.3" 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 1 349 | } 350 | -------------------------------------------------------------------------------- /ipynb/utilities/.ipynb_checkpoints/Concordances-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Generating Concordances\n", 8 | "\n", 9 | "This notebook shows how you can generate a concordance using lists." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "First we see what text files we have. " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "Hume Enquiry.txt negative.txt positive.txt\r\n", 29 | "Hume Treatise.txt obama_tweets.txt\r\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "ls *.txt" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "We are going to use the \"Hume Enquiry.txt\" from the Gutenberg Project. You can use whatever text you want. We print the first 50 characters to check." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "This string has 1344061 characters.\n", 54 | "The Project Gutenberg EBook of A Treatise of Human\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "theText2Use = \"Hume Treatise.txt\"\n", 60 | "with open(theText2Use, \"r\") as fileToRead:\n", 61 | " fileRead = fileToRead.read()\n", 62 | " \n", 63 | "print(\"This string has\", len(fileRead), \"characters.\")\n", 64 | "print(fileRead[:50])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Tokenization\n", 72 | "\n", 73 | "Now we tokenize the text producing a list called \"listOfTokens\" and check the first words. This eliminates punctuation and lowercases the words." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "['the', 'project', 'gutenberg', 'ebook', 'of', 'a', 'treatise', 'of', 'human', 'nature']\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "import re\n", 91 | "listOfTokens = re.findall(r'\\b\\w[\\w-]*\\b', fileRead.lower())\n", 92 | "print(listOfTokens[:10])" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Input\n", 100 | "\n", 101 | "Now we get the word you want a concordance for and the context wanted." 
102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "What word do you want collocates for? truth\n", 114 | "How much context do you want? 10\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "word2find = input(\"What word do you want collocates for? \").lower() # Ask for the word to search for\n", 120 | "context = input(\"How much context do you want? \")# This asks for the context of words on either side to grab" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "str" 132 | ] 133 | }, 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "type(context)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "int" 152 | ] 153 | }, 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "contextInt = int(context)\n", 161 | "type(contextInt)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 9, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "228958" 173 | ] 174 | }, 175 | "execution_count": 9, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "len(listOfTokens)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Main function\n", 189 | "\n", 190 | "Here is the main function that does the work populating a new list with the lines of concordance. We check the first 5 concordance lines." 
191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 10, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "['220330: a reason why the faculty of recalling past ideas with truth and clearness should not have as much merit in it',\n", 202 | " '223214: confessing my errors and should esteem such a return to truth and reason to be more honourable than the most unerring',\n", 203 | " '223680: from the other this therefore being regarded as an undoubted truth that belief is nothing but a peculiar feeling different from',\n", 204 | " '224382: mind and he will evidently find this to be the truth secondly whatever may be the case with regard to this',\n", 205 | " '225925: by their different feeling i should have been nearer the truth end of project gutenberg s a treatise of human nature']" 206 | ] 207 | }, 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "def makeConc(word2conc,list2FindIn,context2Use,concList):\n", 215 | "\n", 216 | " end = len(list2FindIn)\n", 217 | " for location in range(end):\n", 218 | " if list2FindIn[location] == word2conc:\n", 219 | " # Here we check whether we are at the very beginning or end\n", 220 | " if (location - context2Use) < 0:\n", 221 | " beginCon = 0\n", 222 | " else:\n", 223 | " beginCon = location - context2Use\n", 224 | " \n", 225 | " if (location + context2Use) > end:\n", 226 | " endCon = end\n", 227 | " else:\n", 228 | " endCon = location + context2Use + 1\n", 229 | " \n", 230 | " theContext = (list2FindIn[beginCon:endCon])\n", 231 | " concordanceLine = ' '.join(theContext)\n", 232 | " # print(str(location) + \": \" + concordanceLine)\n", 233 | " concList.append(str(location) + \": \" + concordanceLine)\n", 234 | "\n", 235 | "theConc = []\n", 236 | "makeConc(word2find,listOfTokens,int(context),theConc)\n", 237 | "theConc[-5:]" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## Output\n", 245 | "\n", 246 | "Finally, we output to a text file." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 11, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "Done\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "nameOfResults = word2find.capitalize() + \".Concordance.txt\"\n", 264 | "\n", 265 | "with open(nameOfResults, \"w\") as fileToWrite:\n", 266 | " for line in theConc:\n", 267 | " fileToWrite.write(line + \"\\n\")\n", 268 | " \n", 269 | "print(\"Done\")" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Here we check that the file was created." 
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 12, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "Truth.Concordance.txt\r\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "ls *.Concordance.txt" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## Next Steps" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "Onwards to our final utility example [Exploring a text with NLTK](Exploring a text with NLTK.ipynb)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "---\n", 315 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](../ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).
Created September 30th, 2016 (Jupyter 4.2.1)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": true 323 | }, 324 | "outputs": [], 325 | "source": [] 326 | } 327 | ], 328 | "metadata": { 329 | "kernelspec": { 330 | "display_name": "Python 3", 331 | "language": "python", 332 | "name": "python3" 333 | }, 334 | "language_info": { 335 | "codemirror_mode": { 336 | "name": "ipython", 337 | "version": 3 338 | }, 339 | "file_extension": ".py", 340 | "mimetype": "text/x-python", 341 | "name": "python", 342 | "nbconvert_exporter": "python", 343 | "pygments_lexer": "ipython3", 344 | "version": "3.6.3" 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 1 349 | } 350 | -------------------------------------------------------------------------------- /spiral/CharacteristicCurve.json: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "created": 1492037077579, 4 | "modified": 1492037077579, 5 | "version": 2 6 | }, 7 | "blocks": [ 8 | { 9 | "type": "text", 10 | "input": [ 11 | "

Mendenhall's Characteristic Curve (1887): Early Stylometrics

\n\n

In 188", 12 | "7 the polymath T. C. Mendenhall published an article in Science titled,", 13 | " \"The Characteristic Curves of Composition\" which is both one of the earliest ex", 14 | "amples of quantitative stylistics but also one of the first studies to present t", 15 | "ext visualizations based on the (manual) count of words. Mendenhall thought that", 16 | " different authors would have distinctive curves of word length frequencies whic", 17 | "h could help with authorship attribution.

\n\n

Here you can see an example of", 18 | " the characteristic curve of Oliver Twist. Mendenhall took the first 10", 19 | "00 words, counted the length in characters of these 1000 words and then graphed ", 20 | "the number of words of each length. Thus one can see that there is just under 50", 21 | " words of one letter length in the first one thousand words.

\n\n

\"M",

\n\n

Mendenhall thought this method of analy", 25 | "sis would help with the \"identification or discrimination of authorship\" or auth", 26 | "orship attribution as we call it today. Let's see if we can recapitulate his tec", 27 | "hnique here.
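As an aside (not part of the original Spyral notebook), Mendenhall's manual procedure can be sketched in a few lines of Python: tally how many of the first 1,000 words have each character length. The sketch assumes a plain-text copy of the novel saved locally under the hypothetical filename OliverTwist.txt and uses a deliberately simple regular-expression tokenizer.

```python
# A rough sketch of Mendenhall's procedure (illustration only, not part of
# the original notebook). "OliverTwist.txt" is a hypothetical local filename
# for a plain-text copy of the novel.
import re
from collections import Counter

with open("OliverTwist.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Very simple tokenization: lowercase the text and keep alphabetic runs only
tokens = re.findall(r"[a-z]+", text.lower())

# Mendenhall only counted the first 1,000 words
first_thousand = tokens[:1000]

# Tally how many words there are of each character length
length_counts = Counter(len(word) for word in first_thousand)

for length in sorted(length_counts):
    print(length, length_counts[length])
```

Plotted, those tallies give the same kind of curve that Mendenhall drew by hand.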

\n\n

Acquiring the Text

\n\n

We'll begin by fetching the ed", 28 | "ition of Oliver Twist that's ", 30 | "available from the Gutenberg Project", 32 | ". The code block below uses the loadCorpus function. The first time it was run wit", 35 | "hout the corpus option, and then the corpus ID was added for future runs.
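For readers who want to see this step outside of Spyral, here is a rough Python sketch of the fetch-and-trim operation performed by the code block below; it uses the same gist URL and the same start and end markers as the inputRemoveUntil and inputRemoveFromAfter options, relies only on the standard library, and skips error handling.

```python
# Rough Python equivalent (illustration only) of the fetch-and-trim step:
# download the Gutenberg text and keep just the body of the novel.
from urllib.request import urlopen

url = ("https://gist.githubusercontent.com/sgsinclair/"
       "f895f2b37cdee761ac08e4ed8cc83d58/raw/CharlesDickens-OliverTwist.txt?1")

with urlopen(url) as response:
    text = response.read().decode("utf-8")

# Same markers as the Spyral options: drop everything before the first
# chapter heading and everything after the closing words of the novel.
start = text.find("CHAPTER I")
end = text.find("weak and erring.") + len("weak and erring.")
body = text[start:end]

print(len(body), "characters kept")
```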

\n" 36 | ] 37 | }, 38 | { 39 | "type": "code", 40 | "input": [ 41 | "new Corpus({", 42 | " input: 'https://gist.githubusercontent.com/sgsinclair/f895f2b37cdee761ac08e4ed8cc83d58/raw/CharlesDickens-OliverTwist.txt?1',", 43 | " inputRemoveUntil: \"CHAPTER I\",", 44 | " inputRemoveFromAfter: \"weak and erring.\"", 45 | "}).assign(\"corpus\").show();" 46 | ], 47 | "output": [ 48 | "
This corpus has 1 document with 159,006 total words and 10,438 un", 52 | "ique word forms. Created about 3 hours ago.
" 54 | ] 55 | }, 56 | { 57 | "type": "text", 58 | "input": [ 59 | "

The corpus has nearly 160,000 words, but recall that Mendenhall only con", 60 | "sidered the first 1,000 words. We can do the same by calling the loadTokens meth", 61 | "od on our corpus and specifying arguments that limit the call to 1,000 word toke", 62 | "ns while skipping non-word tokens.

\n" 63 | ] 64 | }, 65 | { 66 | "type": "code", 67 | "input": "corpus.loadTokens({limit: 1000, noOthers: true}).assign(\"wordsStore\").show();", 68 | "output": [ 69 | "
This store contains 1000 items with these fields: id, docId, ", 70 | "docIndex, token, rawFreq, tokenType, position, startOffset, endOffset.
" 71 | ] 72 | }, 73 | { 74 | "type": "text", 75 | "input": [ 76 | "

We have 1,000 terms but each one has far more fields than we need, we're only", 77 | " interested in the word length of the term. So we'll create a table where we inc", 78 | "rement the value in first column (zero-based) where the row represent the term l", 79 | "ength – this uses the updateCell function from the t", 83 | "able. Finally we use the embed function to view the table as a VoyantChart.

\n" 88 | ] 89 | }, 90 | { 91 | "type": "code", 92 | "input": [ 93 | "var table = new VoyantTable()", 94 | "wordsStore.each(function(word) {", 95 | " table.updateCell(word.getTerm().length, 0, 1);", 96 | "});", 97 | "table.embed(\"VoyantChart\", {series: {showMarkers: false}, axes: [{grid: true, title: \"Word Length\"}, {grid: true, title: \"Word Count\"}], width: 500})" 98 | ], 99 | "output": [ 100 | "
" 102 | ] 103 | }, 104 | { 105 | "type": "text", 106 | "input": [ 107 | "

If we compare to Mendenhall's graph above, that seems pretty close! It's worth", 108 | " noting that Mendenhall doesn't specify what exactly was counted, such as chapte", 109 | "r titles (which might account for some slight variation).

\n\n

But Mendenhall ", 110 | "was counting terms by hand – can we do better? Let's generate a similar chart bu", 111 | "t now consider all terms, not just the first 1,000.
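In plain Python (an aside, not part of the original notebook), the full-corpus tally that the next block performs with loadCorpusTerms and updateCell amounts to weighting each distinct term's length by its raw frequency. A minimal sketch, using a tiny inline sample so that it runs on its own:

```python
# Minimal sketch of the full-corpus tally (illustration only): weight each
# distinct term's length by how often the term occurs. In practice `text`
# would be the full novel rather than this tiny sample.
import re
from collections import Counter

text = "the boy asked for more and the master was amazed at the boy"
tokens = re.findall(r"[a-z]+", text.lower())

term_freqs = Counter(tokens)              # raw frequency of each distinct term

length_totals = Counter()
for term, freq in term_freqs.items():
    # mirrors updateCell(term length, 0, raw frequency) in the block below
    length_totals[len(term)] += freq

for length in sorted(length_totals):
    print(length, length_totals[length])
```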

\n" 112 | ] 113 | }, 114 | { 115 | "type": "code", 116 | "input": [ 117 | "var oliverTwistLengths;", 118 | "corpus.loadCorpusTerms().then(function(corpusTerms) {", 119 | " oliverTwistLengths = new VoyantTable();", 120 | " corpusTerms.each(function(corpusTerm) {", 121 | " oliverTwistLengths.updateCell(corpusTerm.getTerm().length, 0, corpusTerm.getRawFreq());", 122 | " });", 123 | " oliverTwistLengths.embed('voyantchart', {width: 500});", 124 | "});" 125 | ], 126 | "output": [ 127 | "
" 129 | ] 130 | }, 131 | { 132 | "type": "text", 133 | "input": [ 134 | "

Overall we have an impression that the line gets smoother, which isn't surpri", 135 | "sing given that we have more data points. The big question is whether the smooth", 136 | "ing actually makes the line less characteristic, which would somewhat contradict", 137 | " Mendenhall's original hypothesis that every author has a characteristic curve. Let", 138 | "'s compare this with Austen's Emma which has about the same number of ter", 139 | "ms. Emma is the sixth document in the corpus, so we can ", 140 | "access it at index 5 (index is zero-based). 

\n" 141 | ] 142 | }, 143 | { 144 | "type": "code", 145 | "input": [ 146 | "var emma;", 147 | "new Corpus(\"austen\").then(function(corpus) {", 148 | " emma = corpus.getDocument(5);", 149 | " emma.show()", 150 | "})" 151 | ], 152 | "output": "
1815 Emma
" 153 | }, 154 | { 155 | "type": "text", 156 | "input": [ 157 | "

Now we'll calculate document term lengths for Emma almost ident", 158 | "ically to how we calculated corpus term lengths for Oliver Twist. Final", 159 | "ly, we'll chart this too.

\n" 160 | ] 161 | }, 162 | { 163 | "type": "code", 164 | "input": [ 165 | "emma.loadDocumentTerms().then(function(documentTerms) {", 166 | " emmaLengths = new VoyantTable();", 167 | " documentTerms.each(function(documentTerm) {", 168 | " emmaLengths.updateCell(documentTerm.getTerm().length, 0, documentTerm.getRawFreq()); ", 169 | " });", 170 | " ", 171 | " // embed both word length tables", 172 | " embed([oliverTwistLengths,'voyantchart',{", 173 | " width: 500,", 174 | " title: \"Word Lengths in Oliver Twist\"", 175 | " }],[emmaLengths,'voyantchart',{", 176 | " width: 500,", 177 | " title: \"Word Lengths in Emma\"", 178 | " }]);", 179 | "});", 180 | "" 181 | ], 182 | "output": [ 183 | "
" 187 | ] 188 | }, 189 | { 190 | "type": "text", 191 | "input": [ 192 | "

These do seem different; among other things, the peak has different angle", 193 | "s and the middle is more jagged in Emma. We can't help but wonder if Mendenhall was ", 194 | "seeing larger differences with 1,000-word segments though, which would lead him ", 195 | "to over-estimate how distinctive an author's characteristic curve would be.
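One way to make that kind of comparison a little fairer (a suggestion, not something the original notebook does) is to convert the raw counts to proportions before comparing the two curves, since two texts rarely contain exactly the same number of words. A minimal sketch, with hypothetical counts standing in for the real tallies:

```python
# Hedged follow-up sketch: compare characteristic curves as proportions
# rather than raw counts. The numbers below are hypothetical placeholders
# for the word-length tallies built earlier.
from collections import Counter

oliver_lengths = Counter({1: 47, 2: 170, 3: 235, 4: 200, 5: 120, 6: 80})
emma_lengths = Counter({1: 40, 2: 160, 3: 250, 4: 190, 5: 130, 6: 75})

def proportions(length_counts):
    """Convert raw word-length counts into proportions of the total."""
    total = sum(length_counts.values())
    return {length: count / total for length, count in length_counts.items()}

oliver_p = proportions(oliver_lengths)
emma_p = proportions(emma_lengths)

for length in sorted(set(oliver_p) | set(emma_p)):
    print(length, round(oliver_p.get(length, 0), 3), round(emma_p.get(length, 0), 3))
```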

\n" 196 | ] 197 | } 198 | ] 199 | } -------------------------------------------------------------------------------- /assets/css/style.scss: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | 4 | @import "{{ site.theme }}"; 5 | 6 | @charset "UTF-8"; 7 | 8 | /* Import ET Book styles 9 | adapted from https://github.com/edwardtufte/et-book/blob/gh-pages/et-book.css */ 10 | 11 | @font-face { font-family: "et-book"; 12 | src: url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot"); 13 | src: url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot?#iefix") format("embedded-opentype"), url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.woff") format("woff"), url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.ttf") format("truetype"), url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.svg#etbookromanosf") format("svg"); 14 | font-weight: normal; 15 | font-style: normal; } 16 | 17 | @font-face { font-family: "et-book"; 18 | src: url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot"); 19 | src: url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot?#iefix") format("embedded-opentype"), url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.woff") format("woff"), url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.ttf") format("truetype"), url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.svg#etbookromanosf") format("svg"); 20 | font-weight: normal; 21 | font-style: italic; } 22 | 23 | @font-face { font-family: "et-book"; 24 | src: url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot"); 25 | src: url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot?#iefix") format("embedded-opentype"), url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.woff") format("woff"), url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.ttf") format("truetype"), url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.svg#etbookromanosf") format("svg"); 26 | font-weight: bold; 27 | font-style: normal; } 28 | 29 | @font-face { font-family: "et-book-roman-old-style"; 30 | src: url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot"); 31 | src: url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot?#iefix") format("embedded-opentype"), url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.woff") format("woff"), url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.ttf") format("truetype"), url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.svg#etbookromanosf") format("svg"); 32 | font-weight: normal; 33 | font-style: normal; } 34 | 35 | /* Tufte CSS styles */ 36 | html { font-size: 15px; } 37 | 38 | body { width: 87.5%; 39 | margin-left: auto; 40 | margin-right: auto; 41 | padding-left: 12.5%; 42 | font-family: et-book, Palatino, "Palatino Linotype", "Palatino LT STD", "Book Antiqua", Georgia, serif; 43 | background-color: #fffff8; 44 | color: #111; 45 | max-width: 1400px; 46 | counter-reset: sidenote-counter; } 47 | 48 | h1 { font-weight: 400; 49 | margin-top: 4rem; 50 | margin-bottom: 1.5rem; 51 | font-size: 3.2rem; 52 | line-height: 1; } 53 | 54 | h2 { font-style: italic; 55 | font-weight: 400; 56 | 
margin-top: 2.1rem; 57 | margin-bottom: 1.4rem; 58 | font-size: 2.2rem; 59 | line-height: 1; } 60 | 61 | h3 { font-style: italic; 62 | font-weight: 400; 63 | font-size: 1.7rem; 64 | margin-top: 2rem; 65 | margin-bottom: 1.4rem; 66 | line-height: 1; } 67 | 68 | hr { display: block; 69 | height: 1px; 70 | width: 55%; 71 | border: 0; 72 | border-top: 1px solid #ccc; 73 | margin: 1em 0; 74 | padding: 0; } 75 | 76 | p.subtitle { font-style: italic; 77 | margin-top: 1rem; 78 | margin-bottom: 1rem; 79 | font-size: 1.8rem; 80 | display: block; 81 | line-height: 1; } 82 | 83 | .numeral { font-family: et-book-roman-old-style; } 84 | 85 | .danger { color: red; } 86 | 87 | article { position: relative; 88 | padding: 5rem 0rem; } 89 | 90 | section { padding-top: 1rem; 91 | padding-bottom: 1rem; } 92 | 93 | p, ol, ul { font-size: 1.4rem; 94 | line-height: 2rem; } 95 | 96 | p { margin-top: 1.4rem; 97 | margin-bottom: 1.4rem; 98 | padding-right: 0; 99 | vertical-align: baseline; } 100 | 101 | /* Chapter Epigraphs */ 102 | div.epigraph { margin: 5em 0; } 103 | 104 | div.epigraph > blockquote { margin-top: 3em; 105 | margin-bottom: 3em; } 106 | 107 | div.epigraph > blockquote, div.epigraph > blockquote > p { font-style: italic; } 108 | 109 | div.epigraph > blockquote > footer { font-style: normal; } 110 | 111 | div.epigraph > blockquote > footer > cite { font-style: italic; } 112 | /* end chapter epigraphs styles */ 113 | 114 | blockquote { font-size: 1.4rem; } 115 | 116 | blockquote p { width: 55%; 117 | margin-right: 40px; } 118 | 119 | blockquote footer { width: 55%; 120 | font-size: 1.1rem; 121 | text-align: right; } 122 | 123 | section > p, section > footer, section > table { width: 55%; } 124 | 125 | /* 50 + 5 == 55, to be the same width as paragraph */ 126 | section > ol, section > ul { width: 50%; 127 | -webkit-padding-start: 5%; } 128 | 129 | li:not(:first-child) { margin-top: 0.25rem; } 130 | 131 | figure { padding: 0; 132 | border: 0; 133 | font-size: 100%; 134 | font: inherit; 135 | vertical-align: baseline; 136 | max-width: 55%; 137 | -webkit-margin-start: 0; 138 | -webkit-margin-end: 0; 139 | margin: 0 0 3em 0; } 140 | 141 | figcaption { float: right; 142 | clear: right; 143 | margin-top: 0; 144 | margin-bottom: 0; 145 | font-size: 1.1rem; 146 | line-height: 1.6; 147 | vertical-align: baseline; 148 | position: relative; 149 | max-width: 40%; } 150 | 151 | figure.fullwidth figcaption { margin-right: 24%; } 152 | 153 | /* Links: replicate underline that clears descenders */ 154 | a:link, a:visited { color: inherit; } 155 | 156 | a:link { text-decoration: none; 157 | background: -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#333, #333); 158 | background: linear-gradient(#fffff8, #fffff8), linear-gradient(#fffff8, #fffff8), linear-gradient(#333, #333); 159 | -webkit-background-size: 0.05em 1px, 0.05em 1px, 1px 1px; 160 | -moz-background-size: 0.05em 1px, 0.05em 1px, 1px 1px; 161 | background-size: 0.05em 1px, 0.05em 1px, 1px 1px; 162 | background-repeat: no-repeat, no-repeat, repeat-x; 163 | text-shadow: 0.03em 0 #fffff8, -0.03em 0 #fffff8, 0 0.03em #fffff8, 0 -0.03em #fffff8, 0.06em 0 #fffff8, -0.06em 0 #fffff8, 0.09em 0 #fffff8, -0.09em 0 #fffff8, 0.12em 0 #fffff8, -0.12em 0 #fffff8, 0.15em 0 #fffff8, -0.15em 0 #fffff8; 164 | background-position: 0% 93%, 100% 93%, 0% 93%; } 165 | 166 | @media screen and (-webkit-min-device-pixel-ratio: 0) { a:link { background-position-y: 87%, 87%, 87%; } } 167 | 168 | a:link::selection { 
text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe; 169 | background: #b4d5fe; } 170 | 171 | a:link::-moz-selection { text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe; 172 | background: #b4d5fe; } 173 | 174 | /* Sidenotes, margin notes, figures, captions */ 175 | img { max-width: 100%; } 176 | 177 | .sidenote, .marginnote { float: right; 178 | clear: right; 179 | margin-right: -60%; 180 | width: 50%; 181 | margin-top: 0; 182 | margin-bottom: 0; 183 | font-size: 1.1rem; 184 | line-height: 1.3; 185 | vertical-align: baseline; 186 | position: relative; } 187 | 188 | .sidenote-number { counter-increment: sidenote-counter; } 189 | 190 | .sidenote-number:after, .sidenote:before { font-family: et-book-roman-old-style; 191 | position: relative; 192 | vertical-align: baseline; } 193 | 194 | .sidenote-number:after { content: counter(sidenote-counter); 195 | font-size: 1rem; 196 | top: -0.5rem; 197 | left: 0.1rem; } 198 | 199 | .sidenote:before { content: counter(sidenote-counter) " "; 200 | font-size: 1rem; 201 | top: -0.5rem; } 202 | 203 | blockquote .sidenote, blockquote .marginnote { margin-right: -82%; 204 | min-width: 59%; 205 | text-align: left; } 206 | 207 | div.fullwidth, table.fullwidth { width: 100%; } 208 | 209 | div.table-wrapper { overflow-x: auto; 210 | font-family: "Trebuchet MS", "Gill Sans", "Gill Sans MT", sans-serif; } 211 | 212 | .sans { font-family: "Gill Sans", "Gill Sans MT", Calibri, sans-serif; 213 | letter-spacing: .03em; } 214 | 215 | code { font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace; 216 | font-size: 1.0rem; 217 | line-height: 1.42; } 218 | 219 | .sans > code { font-size: 1.2rem; } 220 | 221 | h1 > code, h2 > code, h3 > code { font-size: 0.80em; } 222 | 223 | .marginnote > code, .sidenote > code { font-size: 1rem; } 224 | 225 | pre.code { font-size: 0.9rem; 226 | width: 52.5%; 227 | margin-left: 2.5%; 228 | overflow-x: auto; } 229 | 230 | pre.code.fullwidth { width: 90%; } 231 | 232 | .fullwidth { max-width: 90%; 233 | clear:both; } 234 | 235 | span.newthought { font-variant: small-caps; 236 | font-size: 1.2em; } 237 | 238 | input.margin-toggle { display: none; } 239 | 240 | label.sidenote-number { display: inline; } 241 | 242 | label.margin-toggle:not(.sidenote-number) { display: none; } 243 | 244 | .iframe-wrapper { position: relative; 245 | padding-bottom: 56.25%; /* 16:9 */ 246 | padding-top: 25px; 247 | height: 0; } 248 | 249 | .iframe-wrapper iframe { position: absolute; 250 | top: 0; 251 | left: 0; 252 | width: 100%; 253 | height: 100%; } 254 | 255 | @media (max-width: 760px) { body { width: 84%; 256 | padding-left: 8%; 257 | padding-right: 8%; } 258 | hr, section > p, section > footer, section > table { width: 100%; } 259 | pre.code { width: 97%; } 260 | section > ol { width: 90%; } 261 | section > ul { width: 90%; } 262 | figure { max-width: 90%; } 263 | figcaption, figure.fullwidth figcaption { margin-right: 0%; 264 | max-width: none; } 265 | blockquote { margin-left: 1.5em; 266 | margin-right: 0em; } 267 | blockquote p, blockquote footer { width: 100%; } 268 | label.margin-toggle:not(.sidenote-number) { display: inline; } 269 | .sidenote, .marginnote { display: none; } 270 | 
.margin-toggle:checked + .sidenote, 271 | .margin-toggle:checked + .marginnote { display: block; 272 | float: left; 273 | left: 1rem; 274 | clear: both; 275 | width: 95%; 276 | margin: 1rem 2.5%; 277 | vertical-align: baseline; 278 | position: relative; } 279 | label { cursor: pointer; } 280 | div.table-wrapper, table { width: 85%; } 281 | img { width: 100%; } } 282 | -------------------------------------------------------------------------------- /docs/count/index.md: -------------------------------------------------------------------------------- 1 | # Counting with the Art of Literary Text Analysis 2 | 3 | If you've been following along this [guide series](../) we've now looked at various basic concepts involved in building a corpus, including web scraping and pre-processing texts for things like cleanup and format conversion. We have our texts, now what? 4 | 5 | One of the simplest but most significant tasks that we can do with a textual corpus is to count various occurrences. We can do this for its own purpose – for instance if we want to find a sequence of characters or if we want to know how many times a given phrase appears – but counting is also an analytic primitive that is part of many other more sophisticated tasks, such as distribution analysis, finding similar documents, and countless other operations. 6 | 7 | ## Counting with Voyant 8 | 9 | ![Voyant](../images/voyant48.png) We are going to visit many of the core concepts of counting with Voyant Tools, in large part because the functionality is easily accessible, which will allow us to focus on the concepts. 10 | 11 | To fully understand counting of text it's useful to revisit how computers encode and process data, and text in particular. As is commonly known (though perhaps not fully understood), computers store information in a binary format, which essentially means that everything is based on a system of choices between two values, namely zero and one (that in turn can be used by a computer transister to send either a low or high current of electricity, also a binary state. 12 | 13 | If I have one column with which to store data, I have two possible values: zero or one (black or white, heads or tails, etc.). If I have two columns, I now have 4 different possibilities (00, 01, 10, 11), I can multiply two for each column I have to determine the number of possibilities. 14 | 15 | | bits | possibilities | equation | exponent | example | 16 | |-|-|-|-|-| 17 | | 1 | 2 | 2x1 | 21 | 0 | 18 | | 2 | 4 | 2x2 | 22 | 01 | 19 | | 3 | 8 | 2x2x2 | 23 | 010 | 20 | | 4 | 16 | 2x2x2x2 | 24 | 0101 | 21 | | 5 | 32 | 2x2x2x2x2 | 25 | 01010 22 | | 6 | 64 | 2x2x2x2x2x2 | 26 | 010101 23 | | 7 | 128 | 2x2x2x2x2x2 | 27 | 0101010 24 | | 8 | 256 | 2x2x2x2x2x2x2 | 28 | 01010101 25 | 26 | As we can see the number of "bits" (left column) corresponds with the number of digits (or columns), as shown in the right column ("example"). We haven't explained how to decipher binary into comprehensible information, but we have explained the basics of how binary and bits work. 27 | 28 | If I'm trying to represent heads or tails I only need one bit (with two possibilities). If I need to represent the 26 letters of the alphabet (in lowercase) I need at least 5 bits (with 32 possibilities. If I want upper and lowercase characters as well as punctuation and so on, I need even more bits. It has become standard to work with units of 8 bits, also called one byte (with 256 possibilities). 
When we hear 8-bit that's what is being said: that there are 256 different possibilities (such as a gif image that can have up to 256 different colours). 29 | 30 | 1 byte (8 bits) is plenty to represent texts using our English alphabet and even accented characters like "é" or "ñ", but woefully insufficient for other languages like Mandarin with its roughly 50,000 ideogram characters (here's a mini exercise: how many bits are needed to represent that many possibilities?). For the past couple of decades the dominant standard for encoding text has been Unicode. Plain texts with our alphabet typically use UTF-8, where the 8 indicates 8 bits, but it's also possible to have up to UTF-32 (32 bits or 4 bytes, or over 4 billion possibilities). It can be useful to know that UTF-16, for instance, is composed of characters that span two bytes – in other words, the byte is still the core unit of encoding. Occasionally one might see a file or web page that has strange characters in it; sometimes that is because two-byte characters are being interpreted incorrectly as one-byte characters (or vice versa), and it should be possible to fix that by re-opening the file with the correct character encoding. 31 | 32 | So text is encoded in bits and bytes. When we ask the computer to find text, or a string sequence, we're asking it to find a matching set of bytes. Counting is similar: it's a matter of seeing how many times the byte sequence occurs. But it can also lead to surprising results. Imagine we are searching for the text "dog": without further instructions we might also inadvertently match the word "dogs" (which may be desirable) but also the word "dogmatic" (which probably isn't, unless we're reading the French comic book _Asterix_ in translation, where the dog is named "Idéfix" in French and "Dogmatix" in English, surely one of the most inspired translations in history). 33 | 34 | Some systems, like Voyant, go through a process of tokenization, which means trying to identify (and then count) words. But even the concept of word is slippery and contextual. For instance, is "don't" one word or two ("don" and "t" – or should it be modified to "do" and "not")? Is "computer-assisted" one or two words? What about hyphenated proper names? In some cases we can delay choosing how to treat such words, in other cases (like Voyant) the decision must be made when creating the corpus (see the [tokenization](https://voyant-tools.org/docs/#!/guide/corpuscreator-section-tokenization) options in Voyant). 35 | 36 | So, after these brief digressions into character encoding and tokenization we can now dive into working in Voyant. If you haven't already followed the [Getting Started](https://voyant-tools.org/docs/#!/guide/start) in Voyant guide, you're *strongly* encouraged to do so. That guide is quick; if you want a deeper introduction to Voyant, it would also be well worth following the [Voyant tutorial](https://voyant-tools.org/docs/#!/guide/tutorial). 37 | 38 | Counting is a key part of the default view of Voyant; in some ways every tool of the main interface uses counting of words. 39 | 40 | 41 | 42 | The Cirrus (word-cloud) tool is about term frequency (position the cursor over terms to see their frequency). Clicking on a term in Cirrus also shows frequency information in the upper middle Reader tool. In the upper-right is the Trends tool, which is a combination of counting and distribution.
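Before finishing the tour of the default view, here is a quick Python aside (an addition to this guide, using only the standard library) that makes the points above concrete: how many bits are needed for roughly 50,000 distinct characters, how UTF-8 spends more than one byte on an accented character, and how plain string matching finds "dog" inside "dogs" and "dogmatic" alike.

```python
# A quick aside (not part of the original guide), standard library only.
import math

# Mini exercise: bits needed to distinguish ~50,000 characters
print(math.ceil(math.log2(50000)))           # 16, i.e. two bytes per character

# UTF-8 uses one byte for a plain letter but two for this accented one
print(len("e".encode("utf-8")))              # 1
print(len("é".encode("utf-8")))              # 2

# Reading two-byte characters as if they were one-byte characters produces
# the "strange characters" mentioned above
print("é".encode("utf-8").decode("latin-1")) # Ã©

# Plain string matching is just byte/character matching
for word in ["dog", "dogs", "dogmatic"]:
    print(word, "dog" in word)
```

With that aside out of the way, back to the default view.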
In the bottom right-hand is the Summary tool, which contains various counts (number of documents, number of words in the corpus, number of unique words in the corpus, number of words per document, frequency of distinctive words per document, etc.). Finally, the default view also shows the Keyword in Context tool, which finds occurrences of words. All five of the tools in the default view rely on term counts, as do most of the other 20 or so tools that are available in Voyant (you can switch tools by clicking on the window icon that appears in the grey header bars of any of the tools). 43 | 44 | One of the most useful tools for counting terms isn't shown by default, but it is easily accessible by clicking on the "Terms" tab in the upper left-hand tool where Cirrus is by default. 45 | 46 | 47 | 48 | The default view of _Terms_ shows a list of high frequency words with their count and a mini-graph (called a sparkline in this case) that shows the distribution of the word across the corpus, in this case 8 novels from Jane Austen. 49 | 50 | It's possible to view additional information about a term by clicking the plus icon in the left-most column; this expands a panel with additional information about the following: 51 | 52 | * **Distribution**: another view of the sparkline 53 | * **Collocates**: other terms that occur in higher frequency near this term 54 | * **Correlations**: terms whose frequencies increase or decrease at a similar rate as this term 55 | * **Phrases**: multi-word phrases that repeat and that start with this term (if applicable). 56 | 57 | It's possible to scroll down to lower frequency words; new words will be loaded as necessary. This is sometimes called infinite scrolling, though that's a bit misleading since there's a finite number of words in the corpus and eventually we would reach the bottom. 58 | 59 | If for some reason we're more interested in sorting alphabetically rather than by frequency, it's possible to click on the "Term" header in the table. 60 | 61 | It's important to recognize early that what we are seeing is a list of high frequency words, but not necessarily all the words, since there's automatically a stoplist that's applied; a stoplist is like a blacklist of words to be ignored. It's typically populated by many function words like the determiners "the" and "a", and other words that don't carry much meaning (such as prepositions, pronouns, and others). 62 | 63 | It's possible to edit the stoplist by clicking on the options icon in the grey title bar (the bar with "Terms" near the top); icons will appear on the right while hovering, and we want the one that looks like a slider. We can click on that and proceed to select another list or edit the existing list (it's a very good idea to look at what's in that list; there may be some surprises). You can remove words or add words in the editor; see the [Stoplist](https://voyant-tools.org/docs/#!/guide/stopwords) documentation for more information. 64 | 65 | If you want to keep your edited stopword list, remember to export a URL (using the export icons in the header bar) to ensure that the new list is included (otherwise the default list will be shown next time the URL is visited). 66 | 67 | Voyant is designed to be user-friendly, which sometimes means showing the most useful information (to avoid overwhelming the user) while making other information available through additional steps. That's the case with the _Terms_ tool, and several other table or [grid-based tools](https://voyant-tools.org/docs/#!/guide/grids).
To access additional functionality, click the down arrow that should appear in a column header when you're hovering (especially the "Terms" or "Count" headers). 68 | 69 | Terms Columns 70 | 71 | As can be seen, several options exist, including columns to show: 72 | 73 | * **Term**: this is the term in the corpus 74 | * **Count**: this is the frequency of the term in the corpus 75 | * **Trends**: this is a sparkline graph that shows the distribution of relative frequencies across documents in the corpus (if the corpus contains more than one document); you can hover over the sparkline to see finer-grained results 76 | * **Relative**: this is the relative frequency of the term in the corpus, per one million words (sorting by count and relative should produce the same results; the relative frequencies might be useful when comparing to another corpus) 77 | * **Comparison**: this is the relative frequency of the term in the corpus compared to the relative frequency of the same term in a comparison corpus; to specify the comparison corpus, click the Options icon and specify the comparison corpus to use 78 | * **Peakedness**: this is a statistical measure of how much the relative frequencies of a term in a corpus are bunched up into peaks (regions with higher values where the rest are lower) 79 | * **Skew**: this is a statistical measure of the symmetry of the relative frequencies of a term across the corpus 80 | 81 | Although Peakedness and Skew start to seem like advanced statistical measures, they can reveal some interesting characteristics about the general trends for term frequency in a corpus (they aren't as useful for a corpus with a single document, but there's a specialized [Document Terms](https://voyant-tools.org/docs/#!/guide/documentterms) tool that presents other useful information). 82 | 83 | The _Terms_ tool provides various counts of an existing list, but one of the most powerful features of Voyant is search, which can be done using the box in the bottom part of the tool. The following provides a guide to the supported syntax (see also [Search](https://voyant-tools.org/docs/#!/guide/search)): 84 | 85 | * [`love`](https://voyant-tools.org/?corpus=austen&query=love&view=CorpusTerms): match **exact term** love 86 | * [`love*`](https://voyant-tools.org/?corpus=austen&query=love*&view=CorpusTerms): match terms that start with the **prefix** love and then a **wildcard** as **one term** 87 | * [`^love*`](https://voyant-tools.org/?corpus=austen&query=^love*&view=CorpusTerms): match terms that start with love as **separate terms** (love, lovely, etc.) 88 | * [`*ove`](https://voyant-tools.org/?corpus=austen&query=*ove&view=CorpusTerms): match terms that end with the **suffix** _ove_ as **one term** 89 | * [`^*ove`](https://voyant-tools.org/?corpus=austen&query=^*ove&view=CorpusTerms): match terms that end with the **suffix** _ove_ as **separate terms** (love, above, etc.)
90 | * [`love,hate`](https://voyant-tools.org/?corpus=austen&query=love,hate&view=CorpusTerms): match each term **separated by commas** as **separate terms** 91 | * [`love\|hate`](https://voyant-tools.org/?corpus=austen&query=love\|hate&view=CorpusTerms): match terms **separated by pipes** as a **single term** 92 | * [`"love him"`](https://voyant-tools.org/?corpus=austen&query="love him"&view=CorpusTerms): _love him_ as an exact **phrase** (word order matters) 93 | * [`"love him"~0`](https://voyant-tools.org/?corpus=austen&query="love+him"~0&view=CorpusTerms): _love him_ or _him love_ **phrase** (word order doesn't matter but 0 words in between) 94 | * [`"love her"~5`](https://voyant-tools.org/?corpus=austen&query="love+her"~5&view=CorpusTerms): match _love_ **near** _her_ (within 5 words) 95 | * [`^love*,love\|hate,"love her"~5`](https://voyant-tools.org/?corpus=austen&query=^love*,hate\|love,"love+her"~5&view=CorpusTerms): **combine** syntaxes 96 | 97 | Can you find what you're looking for? 98 | 99 | ![Jupyter](../images/jupyter48.png) For the counting unit in Jupyter we'll head over to [Getting Texts](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingTexts.ipynb) page of the Art of Literary Text Mining with Jupyter. 100 | -------------------------------------------------------------------------------- /ipynb/Glossary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Glossary" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "

Argument

\n", 15 | "A value which is passed to a function or method when 'called'. Arguments are assigned to named local variables in the function body. Arguments can be further classified as either keyword or positional. In the simplest terms the difference between these types is that keyword arguments are named (proceeded by an identifier) and positional arguments are unnamed (in list form). [Further information.](https://docs.python.org/3/glossary.html)\n", 16 | "\n", 17 | "---" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "#### Array \n", 25 | "A data structure consisting of an ordered collection of items of a single type i.e. an indexed list.\n", 26 | "\n", 27 | "---" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "#### Bag of words \n", 35 | "A [model](https://en.wikipedia.org/wiki/Bag-of-words_model) where text is represented as a multiset (bag) of its words. This simplification disregards features such as word order and grammar and instead focuses on term frequency.\n", 36 | "\n", 37 | "---" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "#### Cartesian Graph \n", 45 | "Also known as a [Cartesian Coordinate System](https://en.wikipedia.org/wiki/Cartesian_coordinate_system) which plots numbers on a plane using an x and y axis.\n", 46 | "\n", 47 | "---" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "#### Cell \n", 55 | "An input strucutre in a Notebook which runs either [Markdown](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) or Python code.\n", 56 | "\n", 57 | "---" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "#### Classifier \n", 65 | "A machine-learning algorithm that determines the class of an input element based on a set of features. \n", 66 | "\n", 67 | "---" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "#### Concatenation \n", 75 | "The process of combining strings i.e *\"This string is\" + \"Concatenating\"*\n", 76 | "\n", 77 | "---" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "#### Concordance \n", 85 | "A list of all words within a text and their frequency of occurrence.\n", 86 | "\n", 87 | "---" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "#### Conditional Block \n", 95 | "Where the program has to make a decision based on a series of options using [conditional statements](http://www.openbookproject.net/books/bpp4awd/ch04.html) such as *if, else* and *elif*\n", 96 | "\n", 97 | "---" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "#### Debug \n", 105 | "The process of identifying and removing errors from a program.\n", 106 | "\n", 107 | "---" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "#### Delimiter \n", 115 | "A character (most typically a comma) used to specify boundaries between words or regions in plain text.\n", 116 | "\n", 117 | "---" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "#### Directory Tree \n", 125 | "A tree like structure which represents the organization and hierachy of files within a directory. 
Terms such as *parent* and *child* are used to describe relationships between files and folders within this system.\n", 126 | "\n", 127 | "---" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "#### Dispersion Plot \n", 135 | "Also known as a [Scatter plot](https://en.wikipedia.org/wiki/Scatter_plot). A graph which uses cartesian coordinates to display values for multiple variables of a set of data. Particularly useful for displaying positional information for words within a text.\n", 136 | "\n", 137 | "---" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "#### Fork \n", 145 | "A cloned copy of a project which is set-up on a independent branch seperate to the original. Often used as a development tool in opensource software - where anyone can create a fork of the program and work on it as a distinct piece of software. [Github](https://github.com/) is an example of a tool which facilitates this sharing and development process.\n", 146 | "\n", 147 | "---" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "#### Function \n", 155 | "Put simply, functions provide functionality to a program. They are blocks of organized code which begin with the keyword *def* proceeded by the name of the function you wish to define in parentheses. The code block begins with a colon and must be indented. [Further Information.](https://www.tutorialspoint.com/python/python_functions.htm)\n", 156 | "- Function Chaining - Also known as method chaining. It is a set of rules which govern the process of calling multiple methods (functions) in a single statement.\n", 157 | "- Recursive Function - A function which calls itself one or many times in an loop until it fufils the condition of its [recursion.](https://www.python-course.eu/recursive_functions.php)\n", 158 | "- Calling a Function - Telling the program to execute a function.\n", 159 | "---" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "#### Indentation \n", 167 | "Empty spaces used as a formatting tool to designate blocks of code in programming. In Python, indentation is used to indicate a block of code, typically four spaces are used - each line of code in the block must be indented by the same amount of spaces otherwise an error may occur.\n", 168 | "\n", 169 | "---" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "#### Iteration \n", 177 | "The repetition of a procedure in the form of a loop to obtain successively closer approximations to the solution of a problem.\n", 178 | "\n", 179 | "---" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "#### Kernel \n", 187 | "The core computer program of the operating system which can control all system processes. The iPython kernel runs the code in the background for Jupyter notebooks.\n", 188 | "\n", 189 | "---" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "#### Lemmatization \n", 197 | "A lemma is the canonical form of a word. Lematization is the process of grouping together inflected forms of a word to be analysed as a single item i.e. 
determining the orginal lemma for the words.\n", 198 | "\n", 199 | "---" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "#### List Comprehension\n", 207 | "A method for defining and constructing lists. Particularly useful for creating a new list from an exsisting list using expressions with a *for / in* statement within a set of brackets. [Further Information.](https://www.python-course.eu/list_comprehension.php)\n", 208 | "\n", 209 | "---" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "#### Nest\n", 217 | "Placing objects or elements in a hierarchical arrangement within a set (an ordered collection of immutable objects).\n", 218 | "\n", 219 | "---" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "#### N-gram \n", 227 | "A unit (letter, words etc) of variable size (n = number of units) from a given sequence of text in a corpus used in language modelling. [Further information](https://en.wikipedia.org/wiki/N-gram)\n", 228 | "\n", 229 | "---" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "#### Normalization \n", 237 | "A process of transforming text into a single canonical form, thereby faciliating data consistentency for further processing. Examples include removing non-alphanumeric characters or changing to lower case.\n", 238 | "\n", 239 | "---" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "#### Object \n", 247 | "Data which has attributes or values AND a defined behaviour.\n", 248 | "- Response Object - An object which returns a response made through a HTTP request when collecting data from a website or URL.\n", 249 | "\n", 250 | "---" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "#### Operator \n", 258 | "Symbols which perform arithmetic or logical computation. Some basic types of operators used in Python are arithmetic (addition +, modulus % etc), comparison (greater than >, not equal to !=, etc) or logical (*and, or, not*). [Further Information](https://www.programiz.com/python-programming/operators)\n", 259 | "\n", 260 | "---" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "#### Parse \n", 268 | "Parsing or Syntactic Analysis is a process whereby sentences or strings of words are analysed by a computer into their constituents, often this is represented in a [parse tree](https://en.wikipedia.org/wiki/Parse_tree) which illustrates this syntactic structure.\n", 269 | "\n", 270 | "---" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "#### Plain Texts \n", 278 | "Text which includes only data related to the readable material. That is, without data related to grapahical presentation, formatting or other objects such as images. Encoded using Unicode standards, typically in a text editor such as Textedit on Mac or Wordpad on PC. Plain texts are particularly useful for archival storage as they are not confined to proprietary software and can be opened and edited on many systems, thereby ensuring a more universal accessibility and preservation. 
\n", 279 | "\n", 280 | "---" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "#### Regular Expressions \n", 288 | "The sequence of characters which define a search pattern. These patterns are useful for performing string operations such as *find* or *find and replace*\n", 289 | "\n", 290 | "---" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "#### Regularize \n", 298 | "The replacement of irregular forms in syntax with regular forms.\n", 299 | "\n", 300 | "---" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "#### Repository \n", 308 | "A central location where where data is stored and managed. More specifically, in revision control systems a repository stores metadata for sets of files or directory structure.\n", 309 | "\n", 310 | "---" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "#### Sequence \n", 318 | "An ordered set of Lists, Tuples or Strings.\n", 319 | "\n", 320 | "---" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "#### Sparse Matrix \n", 328 | "Also known as a [sparse array.](https://en.wikipedia.org/wiki/Sparse_matrix) It is a matrix (an array of data arranged in a rectangular structure of columns and rows) in which most of the elements are zero. If most of the elements were populated by values other than zero than the matrix could be considered dense.\n", 329 | "\n", 330 | "---" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "#### Stemming \n", 338 | "The process of reducing a word to it's base form or word stem e.g. added/adding would reduce to add.\n", 339 | "\n", 340 | "---" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "#### Stop Words \n", 348 | "A list of words which are programmed to be ignored or filtered in analysis and search queries. Lists of stop-words often contain high frequency function words such as *the, of, and* etc\n", 349 | "\n", 350 | "---" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "#### String \n", 358 | "A string is a container for data of letters, numbers or symbols.\n", 359 | "- Zero padded strings - To pad a string (usually an integer) with leading zeros to make up a specified length.\n", 360 | "\n", 361 | "---" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "#### Synset \n", 369 | "A set of synonyms. \n", 370 | "\n", 371 | "---" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "#### Training Set \n", 379 | "A data set used to train a model in machine learning. Specific examples are chosen to fit the parameters of the model for training and the subsequent results are compared with a testing dataset.\n", 380 | "\n", 381 | "---" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "#### Tuple \n", 389 | "A sequence of immutable (fixed) objects. Tuples are created by seperating values using commas within a set of parentheses e.g. 
(1, 2, 3, 4, 5 );\n", 390 | "\n", 391 | "---" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "#### Variable \n", 399 | "A variable stores a piece of data and gives it a specific name. Common data types which are stored in variables in Python include numbers and Boolean values. \n", 400 | "\n", 401 | "---" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "#### Unicode \n", 409 | "An industry standard in computing for encoding (representing) text. Letters, numbers and symbols are assigned unique numeric values which facilitate universal application across different programs and platforms. A fun example of the utility of unicode is the emoji keyboard used on smartphones when sending messages. The universal nature of unicode allows the emoji's to be accurately represented on most modern phones regardless of their differing operating systems (such as android, ios, blackberry). [Further information](http://unicode.org/standard/WhatIsUnicode.html)\n", 410 | "\n", 411 | "---" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": { 417 | "collapsed": true 418 | }, 419 | "source": [ 420 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com)
\n", 421 | "Edited and revised by [Melissa Mony](http://melissamony.com)" 422 | ] 423 | } 424 | ], 425 | "metadata": { 426 | "kernelspec": { 427 | "display_name": "Python 3", 428 | "language": "python", 429 | "name": "python3" 430 | }, 431 | "language_info": { 432 | "codemirror_mode": { 433 | "name": "ipython", 434 | "version": 3 435 | }, 436 | "file_extension": ".py", 437 | "mimetype": "text/x-python", 438 | "name": "python", 439 | "nbconvert_exporter": "python", 440 | "pygments_lexer": "ipython3", 441 | "version": "3.6.3" 442 | } 443 | }, 444 | "nbformat": 4, 445 | "nbformat_minor": 2 446 | } 447 | -------------------------------------------------------------------------------- /ipynb/experiments/SmithImageryWordList.txt: -------------------------------------------------------------------------------- 1 | ABBEY 2 | ABBOTS 3 | ABLAZE 4 | ABORO 5 | ABYSS 6 | ACCENT 7 | ACHE 8 | ACOLYTE 9 | ADAM 10 | AFLAME 11 | AFRAID 12 | AFTERGLOW 13 | AGLOW 14 | AGONY 15 | AIR 16 | AISLE 17 | AJAR 18 | ALMONDS 19 | ALOUD 20 | ALTAR 21 | ALTARSTEPS 22 | AMEN 23 | ANGEL 24 | ANGER 25 | ANGRILY 26 | ANGUISH 27 | ANIMAL 28 | ANNOYED 29 | ANTELOPES 30 | ANTLIKE 31 | APE 32 | APOLOGISE 33 | APOSTLE 34 | APPLAUSE 35 | APPLE 36 | APPLETREES 37 | APRON 38 | ARCHES 39 | AROMATIC 40 | ARROW 41 | ARSE 42 | ART 43 | ASHES 44 | ASHPLANT 45 | ATBORO 46 | AUBURN 47 | AUDIBLE 48 | AUTUMN 49 | AZURE 50 | BAAING 51 | BABBLE 52 | BABY 53 | BACON 54 | BAG 55 | BAH 56 | BAIZE 57 | BALCONIES 58 | BALD 59 | BALDHEAD 60 | BALDWIN 61 | BALDYHEAD 62 | BALES 63 | BALLAD 64 | BALMY 65 | BAND 66 | BANG 67 | BANTER 68 | BAPTISED 69 | BAPTISM 70 | BARE 71 | BAREFOOT 72 | BAREHEADED 73 | BARK 74 | BARMAIDS 75 | BARRACK 76 | BARRELS 77 | BARREN 78 | BASE 79 | BASIN 80 | BASKET 81 | BATH 82 | BATLIKE 83 | BAWL 84 | BAY 85 | BEACH 86 | BEADS 87 | BEAK 88 | BEAR 89 | BEARD 90 | BEAST 91 | BEAT 92 | BECAUSEBORO 93 | BED 94 | BEER 95 | BELL 96 | BENCH 97 | BENUMBED 98 | BIBLE 99 | BICYCLE 100 | BIKE 101 | BIRD 102 | BIRDCALL 103 | BISCUIT 104 | BISHOP 105 | BITCH 106 | BITTER 107 | BLACK 108 | BLACKLOOKING 109 | BLACKROCK 110 | BLANDLY 111 | BLANK 112 | BLANKET 113 | BLASPHEMER 114 | BLASPHEMIES 115 | BLASPHEMOUS 116 | BLAST 117 | BLAZE 118 | BLEAK 119 | BLEATING 120 | BLESS 121 | BLEW 122 | BLIND 123 | BLINDCORD 124 | BLINKING 125 | BLOOD 126 | BLOODRED 127 | BLOOM 128 | BLOSSOMS 129 | BLOW 130 | BLUE 131 | BLUSH 132 | BOAT 133 | BODIES 134 | BODILY 135 | BOGHOLE 136 | BOGS 137 | BOGWATER 138 | BOILING 139 | BONE 140 | BONNET 141 | BOOING 142 | BOOK 143 | BOOTS 144 | BOUS 145 | BOWL 146 | BOX 147 | BRACKISH 148 | BRANCH 149 | BRASS 150 | BRAY 151 | BREAD 152 | BREAKWATER 153 | BREAST 154 | BREATH 155 | BREECHES 156 | BREEZE 157 | BRICK 158 | BRIDGE 159 | BRIGHT 160 | BRILLIANT 161 | BRIM 162 | BRIMSTONE 163 | BRINE 164 | BRONZE 165 | BROTHER 166 | BROW 167 | BROWN 168 | BRUSH 169 | BUDDING 170 | BUFF 171 | BULL 172 | BUM 173 | BUMP 174 | BURN 175 | BURST 176 | BUSH 177 | BUTT 178 | CABBAGE 179 | CABIN 180 | CACKLING 181 | CADENCE 182 | CAKE 183 | CALF 184 | CALICO 185 | CALM 186 | CALVARY 187 | CANAL 188 | CANCER 189 | CANDLE 190 | CANDLEBUTTS 191 | CANDLESTICK 192 | CANE 193 | CANKER 194 | CANON 195 | CANOPY 196 | CANVAS 197 | CAP 198 | CAPUCHIN 199 | CAR 200 | CARD 201 | CARESS 202 | CARMELITE 203 | CARNIVAL 204 | CARPET 205 | CARRIAGE 206 | CARROTS 207 | CASK 208 | CASTLE 209 | CAT 210 | CATACOMBS 211 | CATAFALQUE 212 | CATCALLS 213 | CATECHISM 214 | CATTLE 215 | CAVE 216 | CAVERN 217 | CELERY 218 | CEMETERY 219 | CENSER 220 | CEREMENTS 
221 | CESSPOOL 222 | CHAIN 223 | CHAIR 224 | CHALICE 225 | CHAMBERPOT 226 | CHAMPAGNE 227 | CHANCES 228 | CHANDELIER 229 | CHANNEL 230 | CHAP 231 | CHAPEL 232 | CHARCOAL 233 | CHEEK 234 | CHEER 235 | CHESTNUT 236 | CHEWED 237 | CHILD 238 | CHILL 239 | CHIME 240 | CHIN 241 | CHOCOLATE 242 | CHOIR 243 | CHOKED 244 | CHORD 245 | CHRIST 246 | CHRISTENDOM 247 | CHRISTMAS 248 | CHURCH 249 | CIGAR 250 | CIGARETTE 251 | CINDERPATH 252 | CINDERS 253 | CINNAMOMUM 254 | CIRCLE 255 | CITIES 256 | CLAPPED 257 | CLASSROOM 258 | CLATTER 259 | CLAY 260 | CLEAN 261 | CLEAR 262 | CLERGY 263 | CLERICAL 264 | CLICK 265 | CLIFFS 266 | CLOAK 267 | CLOCK 268 | CLOISTER 269 | CLOTH 270 | CLOUD 271 | CLOUDLETS 272 | COAL 273 | COAT 274 | COBWEB 275 | COCK 276 | COCOA 277 | COCOON 278 | COD 279 | COFFIN 280 | COIL 281 | COIN 282 | COLD 283 | COLLAR 284 | COLLYWOBBLES 285 | COLORLESS 286 | COLOUR 287 | COMMUNED 288 | COMMUNICANT 289 | CONFESS 290 | CONFLAGRATION 291 | CONSECRATED 292 | CONVENT 293 | COOL 294 | COPPER 295 | COPYBOOK 296 | CORD 297 | CORDUROY 298 | CORK 299 | CORPSE 300 | CORPSEWHITE 301 | CORRIDOR 302 | CORRIGAN 303 | COTTAGE 304 | COUGH 305 | COW 306 | COWDUNG 307 | COWHAIRS 308 | COWHOUSE 309 | COWL 310 | COWYARD 311 | CRACK 312 | CRADLE 313 | CRASH 314 | CREAKED 315 | CREAM 316 | CRICKET 317 | CRICKETBATS 318 | CRICKETCAP 319 | CRIED 320 | CROCODILE 321 | CROSS 322 | CROWD 323 | CROWN 324 | CRUCIFIED 325 | CRUCIFIX 326 | CRY 327 | CUP 328 | CURED 329 | CURL 330 | CURTAIN 331 | CYCLE 332 | DAIRY 333 | DAMN 334 | DAMP 335 | DANCE 336 | DANK 337 | DAPPLED 338 | DARK 339 | DARKPLUMAGED 340 | DART 341 | DAWN 342 | DAY 343 | DAYLIGHT 344 | DEAD 345 | DEAF 346 | DEATH 347 | DEATHBED 348 | DEATHCHILL 349 | DEATHMASK 350 | DEATHWOUND 351 | DEDALUS 352 | DESK 353 | DEUS 354 | DEVIL 355 | DEW 356 | DEWLAPS 357 | DIAMONDS 358 | DICE 359 | DIE 360 | DIEU 361 | DIM 362 | DIMPLES 363 | DIN 364 | DINGDOHG 365 | DINNERTABLE 366 | DIRTY 367 | DISH 368 | DITCH 369 | DIZZILY 370 | DOCKS 371 | DOG 372 | DOLL 373 | DOLLYMOUNT 374 | DOME 375 | DOMINICAN 376 | DOOR 377 | DOORWAY 378 | DORMITORY 379 | DOVE 380 | DRAIN 381 | DRAWL 382 | DREGS 383 | DRENCHED 384 | DRESS 385 | DRINK 386 | DRY 387 | DUBLIN 388 | DUNG 389 | DUNGHILL 390 | DUSK 391 | DUST 392 | DYING 393 | EAGLE 394 | EAR 395 | EARSPLITTING 396 | EARTH 397 | EASYCHAIR 398 | EBRING 399 | EBONITE 400 | ECHO 401 | ECSTASY 402 | EDDIED 403 | EGGS 404 | EGGSHELLS 405 | EJACULATION 406 | ELBOW 407 | ELEPHANT 408 | ELLIPSOID 409 | ELLIPTICAL 410 | EMBERS 411 | EMERALD 412 | ENAMEL 413 | ENFLAMING 414 | EUCHARIST 415 | EVENING 416 | EXCREMENT 417 | EXCREMENTITIOUS 418 | EYE 419 | EYEBROWS 420 | EYEGLASS 421 | EYELID 422 | FACE 423 | FADE 424 | FAINT 425 | FAIR 426 | FARTED 427 | FAT 428 | FATE 429 | FATENCIRCLED 430 | FATHER 431 | FEAR 432 | FEAST 433 | FEATHERINGS 434 | FEED 435 | FENCE 436 | FESTERING 437 | FEVER 438 | FIELD 439 | FIERY 440 | FIG 441 | FIGTREE 442 | FIGURE 443 | FILE 444 | FILM 445 | FILTHILY 446 | FINGER 447 | FINGERNAILS 448 | FINGERTIPS 449 | FIRE 450 | FIRECONSUMED 451 | FIRELIGHT 452 | FIREPLACE 453 | FIRM 454 | FISH 455 | FLABBY 456 | FLAG 457 | FLAME 458 | FLASH 459 | FLAT 460 | FLECKED 461 | FLEECE 462 | FLESH 463 | FLEW 464 | FLEXIBLE 465 | FLICKERED 466 | FLIES 467 | FLIGHT 468 | FLITTING 469 | FLOG 470 | FLORID 471 | FLOWER 472 | FLOWERBEDS 473 | FLOWERGIRL 474 | FLUSH 475 | FLUTTER 476 | FLY 477 | FOAM 478 | FOETUS 479 | FOG 480 | FOOD 481 | FOOT 482 | FOOTBALL 483 | FOOTPATH 484 | FOOTSTEPS 485 | FOREFINGER 486 | FOREHEAD 487 | FOREST 
488 | FORGE 489 | FORK 490 | FOUL 491 | FOULSMELLING 492 | FOUNTAIN 493 | FOWL 494 | FOX 495 | FRAGMENT 496 | FRAGRANCE 497 | FRAIL 498 | FRANCISCAN 499 | FRANKINCENSE 500 | FRECKLED 501 | FRIAR 502 | FRIGHT 503 | FRO 504 | FROG 505 | FROWN 506 | FRUIT 507 | FUME 508 | FUNGUS 509 | FUNNEL 510 | FURNACE 511 | GAMECOCKS 512 | GARDEN 513 | GAS 514 | GASFLAMES 515 | GASJETS 516 | GATE 517 | GAYCLAD 518 | GEESE 519 | GEMS 520 | GENUFLECTING 521 | GINGERNUTS 522 | GIRAFFE 523 | GIRDLE 524 | GIRL 525 | GLARE 526 | GLASS 527 | GLASSJAPS 528 | GLEAMED 529 | GLIMMER 530 | GLINT 531 | GLISTENING 532 | GLITTERED 533 | GLOOM 534 | GLOOMILY 535 | GLOSSY 536 | GLOW 537 | GNAWED 538 | GOD 539 | GODFORSAKEN 540 | GODHEAD 541 | GOLD 542 | GONEBORO 543 | GOODBYE 544 | GOODNIGHT 545 | GOSPEL 546 | GOWN 547 | GRAIN 548 | GRAPES 549 | GRASS 550 | GRASSPLOT 551 | GRATE 552 | GRAVE 553 | GRAVECLOTHES 554 | GRAVEL 555 | GRAVEYARD 556 | GREASE 557 | GREEN 558 | GREENWHITE 559 | GREY 560 | GREYBLUE 561 | GREYFRINGED 562 | GREYGREEN 563 | GREYHOUNDS 564 | GURGLING 565 | GUST 566 | HA 567 | HAIR 568 | HALE 569 | HALLWAY 570 | HAM 571 | HAND 572 | HANDKERCHIEF 573 | HARBOUR 574 | HARD 575 | HARES 576 | HARMONIOUS 577 | HARMONISED 578 | HARMONY 579 | HARSH 580 | HASH 581 | HAT 582 | HAWK 583 | HAZE 584 | HAZEWRAPPED 585 | HEAD 586 | HEART 587 | HEAT 588 | HEAVEN 589 | HEAVILY 590 | HEDGE 591 | HEEL 592 | HELL 593 | HELLFIRE 594 | HERBS 595 | HERON 596 | HILL 597 | HIPS 598 | HISS 599 | HOLE 600 | HOLLOWSOUNDING 601 | HOME 602 | HONEY 603 | HOODED 604 | HOOFS 605 | HORIZON 606 | HORSE 607 | HOSPITAL 608 | HOT 609 | HOTEL 610 | HOUNDED 611 | HOUSE 612 | HOUSEBORO 613 | HOWL 614 | HUE 615 | HUM 616 | HUNGRILY 617 | HURRAY 618 | HURPOO 619 | HURT 620 | HUSH 621 | HYMN 622 | ICON 623 | ILLUMINATED 624 | IMAGE 625 | INAUDIBLE 626 | INCENSE 627 | INFIRMARY 628 | INJURED 629 | INJURIES 630 | INK 631 | INSECT 632 | IRON 633 | ISLAND 634 | ITCH 635 | IVORY 636 | IVY 637 | JAR 638 | JARGON 639 | JAW 640 | JEER 641 | JELLYLIKE 642 | JERKED 643 | JERUSALEM 644 | JESU 645 | JESUIT 646 | JEWEL 647 | JEWELEYED 648 | JINGLE 649 | JUG 650 | JUICE 651 | KETTLE 652 | KIDNEY 653 | KISS 654 | KNEE 655 | KNEEL 656 | KNELT 657 | KNIFE 658 | KNOCKED 659 | LACE 660 | LAKE 661 | LALA 662 | LAMB 663 | LAMP 664 | LANDBORO 665 | LANE 666 | LANTERN 667 | LAP 668 | LARK 669 | LASHES 670 | LAUGH 671 | LAUGHTER 672 | LAUREL 673 | LAVATORY 674 | LAVENDER 675 | LAWN 676 | LEAF 677 | LEATHER 678 | LEG 679 | LEMON 680 | LEND 681 | LETTER 682 | LICE 683 | LICKING 684 | LIGHT 685 | LIGHTNINGS 686 | LILY 687 | LIMES 688 | LIMP 689 | LINEN 690 | LIP 691 | LIQUID 692 | LIT 693 | LITANY 694 | LITURGY 695 | LOAFTER 696 | LOINS 697 | LOOKBORO 698 | LORD 699 | LORDBORO 700 | LOUD 701 | LOUSEMARKS 702 | LUCIFER 703 | LUKEWARM 704 | LULL 705 | LUMINARY 706 | LUMINOUS 707 | LUMPISH 708 | LUNGS 709 | LUST 710 | LUTFLIKE 711 | LYRICAL 712 | MAHOGANY 713 | MANYCOLOURED 714 | MAPLE 715 | MARBLES 716 | MARE 717 | MAROON 718 | MARSHLIGHT 719 | MASK 720 | MASS 721 | MASSBOOK 722 | MELODY 723 | MERRILY 724 | MERRIMENT 725 | MERRYMAKING 726 | METAL 727 | MICE 728 | MILK 729 | MIRE 730 | MIRROR 731 | MOAN 732 | MOCK 733 | MOIST 734 | MOLE 735 | MONEY 736 | MONK 737 | MONKEY 738 | MOOCOW 739 | MOON 740 | MOONLIT 741 | MOORINGS 742 | MORGUE 743 | MOTTLED 744 | MOULDERING 745 | MOUNT 746 | MOUSTACHE 747 | MOUTH 748 | MUD 749 | MUDDIED 750 | MUMBLED 751 | MURDER 752 | MURMUR 753 | MUSIC 754 | MUTE 755 | MUTTERED 756 | MYRRH 757 | NAIL 758 | NAKED 759 | NAME 760 | NASAL 761 | NASTY 
762 | NAUSEOUS 763 | HAVE 764 | NECK 765 | NEEDLE 766 | NEST 767 | NETS 768 | NIGHT 769 | NIGHTCLOUDS 770 | NIGHTSHADE 771 | NIGHTSHIRT 772 | NOISE 773 | NOISILY 774 | NOSE 775 | NOTEBOOKS 776 | NOXIOUS 777 | NUN 778 | NURSE 779 | NURSEMAIDS 780 | NURSERY 781 | OAR 782 | OCEAN 783 | ODOROUS 784 | ODOUR 785 | OIL 786 | OILSHEET 787 | OLIVE 788 | ONIONS 789 | OOZED 790 | ORB 791 | ORCHARDS 792 | ORCHESTRA 793 | ORDER 794 | OUTBORO 795 | OUTHOUSE 796 | OVERCOAT 797 | OX 798 | OZONE 799 | PAGE 800 | PAIN 801 | PALATE 802 | PALM 803 | PANDIED 804 | PANDYBAT 805 | PANTING 806 | PAPA 807 | PAPER 808 | PARACLETE 809 | PASTORS 810 | PATCHWORK 811 | PATH 812 | PATTED 813 | PEAL 814 | PEEL 815 | PENCIL 816 | PEPPER 817 | PERFUME 818 | PHRASE 819 | PIANO 820 | PICTURE 821 | PIG 822 | PIGEON 823 | PINK 824 | PISS 825 | PLANT 826 | PLUCKED 827 | PLUMP 828 | PLUMPUDDING 829 | POCK 830 | POCKET 831 | POLISHED 832 | POLLUTES 833 | PONY 834 | POOL 835 | POPE 836 | PORCELAIN 837 | PORCH 838 | PORTRAIT 839 | POT 840 | POTATOES 841 | PRAY 842 | PRAYERBOOK 843 | PREACH 844 | PERFECT 845 | PRESS 846 | PRIEST 847 | PRIESTCRAFT 848 | PRIESTRIDDEN 849 | PRISON 850 | PRISONHOUSE 851 | PROFESSOR 852 | PROSE 853 | PROTEST 854 | PSALMS 855 | PUCK 856 | PUDDING 857 | PUDDLES 858 | PULL 859 | PULPIT 860 | PULSATION 861 | PUNCH 862 | PUNGENT 863 | PUNISH 864 | PUPPY 865 | PURGATORIAL 866 | PURPLE 867 | PURRED 868 | PUTBORO 869 | PUTREFACTION 870 | PUTRID 871 | QUADRANGLE 872 | QUAE 873 | QUAGMIRE 874 | QUEER 875 | QUIET 876 | QUIVERED 877 | RABBITS 878 | RABBITSKIN 879 | RACKET 880 | RAGE 881 | RAIL 882 | RAILWAY 883 | RAIN 884 | RAINDROPS 885 | RAINFRAGRANT 886 | RAINLADEN 887 | RAINSODDEN 888 | RAKE 889 | RAN 890 | RANG 891 | RAT 892 | RATTLE 893 | RECTOR 894 | RED 895 | REDBROWN 896 | REDEYED 897 | REDHOT 898 | REDRIMMED 899 | REEKING 900 | REFECTORY 901 | REFLECT 902 | RELIGION 903 | RELIGIOUS 904 | REPENT 905 | REPTILE 906 | REVEREND 907 | RHYME 908 | RHYTHM 909 | RIBS 910 | RICE 911 | RIDDLE 912 | RING 913 | RIOT 914 | RITE 915 | RITUAL 916 | RIVER 917 | RIVULET 918 | ROAD 919 | ROADWAY 920 | ROAR 921 | ROBE 922 | ROCK 923 | ROOF 924 | ROOM 925 | ROPE 926 | ROSE 927 | ROSEBUSHES 928 | ROSEFLIGHT 929 | ROSESOFT 930 | ROSEWAY 931 | ROSIE 932 | ROT 933 | ROTUNDA 934 | ROUGED 935 | ROUGH 936 | ROUGHHAWN 937 | ROUND 938 | ROUNDHEAD 939 | RUMBLING 940 | RUMP 941 | RUN 942 | RUSSET 943 | RUSTLING 944 | SABBATH 945 | SACK 946 | SACKCLOTH 947 | SACRAMENT 948 | SACRIFICE 949 | SACRILEGE 950 | SACRILEGIOUS 951 | SACRISTAN 952 | SACRISTY 953 | SAILOR 954 | SAINT 955 | SALT 956 | SALVATION 957 | SANCTUARY 958 | SAND 959 | SASH 960 | SATAN 961 | SAUCE 962 | SAUSAGES 963 | SAVIOUR 964 | SAVOUR 965 | SCALDED 966 | SCARLET 967 | SCHOOL 968 | SCREAM 969 | SCREECH 970 | SCUM 971 | SEA 972 | SEABIRD 973 | SEABORNE 974 | SEADUSK 975 | SEAHARVEST 976 | SEAPORT 977 | SEASHORE 978 | SEATANGLE 979 | SEAWALL 980 | SEAWATER 981 | SEAWEED 982 | SEAWRACK 983 | SECULAR 984 | SEDUCE 985 | SEED 986 | SELFBOUNDED 987 | SELFCOMMUNION 988 | SELFCONTAINED 989 | SELFEMBITTERED 990 | SELFMISTRUST 991 | SELFRESPECT 992 | SELFRESTRAINT 993 | SELFSURRENDER 994 | SENTENCE 995 | SEPULCHRE 996 | SERAPH 997 | SERAPHIM 998 | SERPENT 999 | SEWER 1000 | SHADE 1001 | SHADOW 1002 | SHAME 1003 | SHARP 1004 | SHAWL 1005 | SHED 1006 | SHELL 1007 | SHIMMER 1008 | SHINE 1009 | SHIP 1010 | SHIPWRECKS 1011 | SHIRT 1012 | SHITE 1013 | SHIVER 1014 | SHOCK 1015 | SHOE 1016 | SHONE 1017 | SHOOK 1018 | SHOUTED 1019 | SHOVED 1020 | SHOWER 1021 | SHRIEKING 1022 | SHRILL 1023 | 
SHRINE 1024 | SHRIVELLED 1025 | SHRUBS 1026 | SHRUNK 1027 | SICK 1028 | SIDEALTAR 1029 | SIENA 1030 | SIGH 1031 | SILENCE 1032 | SILK 1033 | SILVER 1034 | SILVERCOATED 1035 | SILVERPOINTED 1036 | SILVERVEINED 1037 | SILVERWRAPPED 1038 | SIN 1039 | SINCORRUPTED 1040 | SINFULIMPENITENCE 1041 | SING 1042 | SINLOVING 1043 | SISTER 1044 | SKIES 1045 | SKIN 1046 | SKIRTS 1047 | SKULL 1048 | SKY 1049 | SKYHIGH 1050 | SKYLIGHT 1051 | SLATE 1052 | SLATEBLUE 1053 | SLEEK 1054 | SLEEP 1055 | SLEEVE 1056 | SLENDER 1057 | SLIME 1058 | SLOBBERING 1059 | SLOTH 1060 | SLOW 1061 | SLOWDRIFTING 1062 | SLOWFLOWING 1063 | SLUGGISH 1064 | SMACKED 1065 | SMART 1066 | SMELL 1067 | SMILE 1068 | SMITHY 1069 | SMOKE 1070 | SMOOTH 1071 | SMUGGING 1072 | SNAKE 1073 | SNAP 1074 | SNEEZE 1075 | SNORT 1076 | SOARING 1077 | SOB 1078 | SOFT 1079 | SOFTHUED 1080 | SOFTLYLIGHTED 1081 | SOFTWORDED 1082 | SOLDIER 1083 | SOLITARY 1084 | SONG 1085 | SOOTHED 1086 | SORDID 1087 | SOUL 1088 | SOULFREE 1089 | SOUND 1090 | SOUR 1091 | SOURSMELLING 1092 | SOUTANE 1093 | SPACE 1094 | SPADE 1095 | SPECKLED 1096 | SPECTACLES 1097 | SPEECH 1098 | SPIED 1099 | SPIRIT 1100 | SPITE 1101 | SPITTLE 1102 | SPRAY 1103 | SQUALID 1104 | SQUALL 1105 | SQUARE 1106 | SQUEAK 1107 | STAGNATION 1108 | STAIN 1109 | STAIRCASE 1110 | STAIRS 1111 | STALE 1112 | STAMMERING 1113 | STANK 1114 | STAR 1115 | STASIS 1116 | STENCH 1117 | STEPHANEFOROS 1118 | STEPHANOS 1119 | STEHANOUMENOS 1120 | STEW 1121 | STICK 1122 | STIFF 1123 | STING 1124 | STINK 1125 | STINKPOT 1126 | STOCKED 1127 | STOMACH 1128 | STOWE 1129 | STORM 1130 | STRAW 1131 | STRAWCOLOURED 1132 | STREAKS 1133 | STREET 1134 | STUNG 1135 | SOCK 1136 | SUFFOCATED 1137 | SULPHUROUS 1138 | SULPHURYELLOW 1139 | SUN 1140 | SUNG 1141 | SUNLIGHT 1142 | SUNRISE 1143 | SUNWARMED 1144 | SUP 1145 | SURPLICES 1146 | SWALLOW 1147 | SWAMP 1148 | SWANS 1149 | SWEAR 1150 | SWEAT 1151 | SWEET 1152 | SWISH 1153 | SWOLLEN 1154 | SWORD 1155 | SYLLABLE 1156 | TABERNACLE 1157 | TABLE 1158 | TABLECLOTH 1159 | TAIL 1160 | TALLOW 1161 | TAN 1162 | TAPPED 1163 | TAR 1164 | TASTE 1165 | TAWNY 1166 | TEA 1167 | TEACUPS 1168 | TEAR 1169 | TEETH 1170 | TELEGRAPHPOLES 1171 | TENOR 1172 | TEPID 1173 | TESTAMENT 1174 | TESTIMONIAL 1175 | THEBORO 1176 | THIGH 1177 | THIN 1178 | THISTLES 1179 | THORNS 1180 | THRILL 1181 | THROAT 1182 | THROB 1183 | THRUST 1184 | THUD 1185 | THUMB 1186 | THUMBBLACKENED 1187 | THUNDER 1188 | TICKING 1189 | TICKLING 1190 | TIDE 1191 | TINGES 1192 | TINGLE 1193 | TITTER 1194 | TOAST 1195 | TOBACCO 1196 | TOBORO 1197 | TONE 1198 | TONGUE 1199 | TORTURE 1200 | TOWELS 1201 | TOWER 1202 | TRAIN 1203 | TRALALA 1204 | TRALALADDY 1205 | TRAM 1206 | TREAT 1207 | TREBLE 1208 | TREMOR 1209 | TREMULOUS 1210 | TRINITY 1211 | TROUSERS 1212 | TRUMPET 1213 | TRUMPETBLAST 1214 | TUCKED 1215 | TUCKOO 1216 | TUNNEL 1217 | TURF 1218 | TURFCOLOURED 1219 | TURKEY 1220 | TURNIPS 1221 | TURRET 1222 | TWIG 1223 | TWILIGHT 1224 | TWINECOLOURED 1225 | TWINKLED 1226 | TWIRLING 1227 | TWIST 1228 | TWITCHING 1229 | TWITTERED 1230 | UMBRELLA 1231 | UNDEPILED 1232 | UNDRESS 1233 | UNDRIED 1234 | UNLIT 1235 | URIRAL 1236 | USAGE 1237 | USBORO 1238 | VAPOUR 1239 | VEIL 1240 | VEINS 1241 | VELVET 1242 | VERMIN 1243 | VESTRY 1244 | VINEGAR 1245 | VIOLETS 1246 | VIPER 1247 | VIRGIN 1248 | VISION 1249 | VOICE 1250 | VOMITED 1251 | VOWEL 1252 | WADED 1253 | WAFER 1254 | WAIL 1255 | WAIST 1256 | WALK 1257 | WAN 1258 | WAR 1259 | WARM 1260 | WASH 1261 | WATER 1262 | WATERJUG 1263 | WATERLOGGED 1264 | WATERPROOFS 1265 | WATERY 1266 | WAVE 1267 
| WAVELET 1268 | WAX 1269 | WEB 1270 | WEEDGROWN 1271 | WEEDS 1272 | WEEP 1273 | WEPT 1274 | WET 1275 | WHEEL 1276 | WHEEZING 1277 | WHIMPERING 1278 | WHINE 1279 | WHIP 1280 | WHIRL 1281 | WHIRRING 1282 | WHISPER 1283 | WHISTLE 1284 | WHITE 1285 | WHITEBOY 1286 | WHITEGREY 1287 | WHITEROBED 1288 | WHITEWASHED 1289 | WHORES 1290 | WIDE 1291 | WIDENING 1292 | WIDESPREAD 1293 | WIDEWINGED 1294 | WILLBORO 1295 | WIND 1296 | WINDOW 1297 | WINDSWEPT 1298 | WING 1299 | WINKED 1300 | WINTER 1301 | WIRE 1302 | WOMAN 1303 | WOMB 1304 | WOOD 1305 | WOOLY 1306 | WORD 1307 | WORM 1308 | WORSHIP 1309 | WRINKLED 1310 | WRISTS 1311 | YELLOW 1312 | YELLS -------------------------------------------------------------------------------- /ipynb/GettingNltk.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting NLTK for Text Processing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook introduces the [Natural Language Toolkit](http://www.nltk.org/) (NLTK) which facilitates a broad range of tasks for text processing and representing results. It's part of the [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) and assumes that you've already worked through previous notebooks ([Getting Setup](GettingSetup.ipynb), [Getting Started](GettingStarted.ipynb) and [Getting Texts](GettingTexts.ipynb)). In this notebook we'll look in particular at:\n", 15 | "\n", 16 | "* [Installing the NLTK library (for text processing)](#Installing-the-NLTK-Library)\n", 17 | "* [Simple tokenization of words](#Tokenization)\n", 18 | "* [Producing a simple table of frequencies of words](#Word-Frequencies)\n", 19 | "* [Applying a list of stopwords (words to ignore)](#Stop_Words)\n", 20 | "* [Producing a simple concordance for a keyword](#Building-a-Simple-Concordance)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Installing the NLTK Library" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "The Anaconda bundle that we're using already includes [NLTK](http://www.nltk.org/), but the bundle doesn't include the NLTK data collections that are available. Fortunately, it's easy to download the data, and we can even do it within a notebook. Following the same steps as before, create a new notebook named \"GettingNltk\" and run this first code cell:" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n" 47 | ] 48 | }, 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "True" 53 | ] 54 | }, 55 | "execution_count": 1, 56 | "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "import nltk\n", 62 | "\n", 63 | "nltk.download() # download NLTK data (we should only need to run this cell once)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "This should cause a new window to appear (eventually) with a dialog box to download data collections. For the sake of simplicity, if possible select the \"all\" row and press \"Download\". 
Once the download is complete, you can close that window.\n", 71 | "\n", 72 | "![NLTK Data Download](images/nltk-data-download.png)\n", 73 | "\n", 74 | "Now you're set! You can close and delete the temporary notebook used for installation." 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Text Processing" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Now that we have NLTK installed, let's use it for text processing.\n", 89 | "\n", 90 | "We'll start by retrieving _The Gold Bug_ plain text that we had saved locally in the [Getting Texts](GettingTexts.ipynb) notebook. If you need to recapitulate the essentials of the previous notebook, try running this to retrieve the text:\n", 91 | "\n", 92 | "```python\n", 93 | "import urllib.request\n", 94 | "# retrieve Poe plain text value\n", 95 | "poeUrl = \"http://www.gutenberg.org/files/2147/2147-0.txt\"\n", 96 | "poeString = urllib.request.urlopen(poeUrl).read().decode()```\n", 97 | "\n", 98 | "And then this, in a separate cell so that we don't read repeatedly from Gutenberg:\n", 99 | "\n", 100 | "```python\n", 101 | "import os\n", 102 | "# isolate The Gold Bug\n", 103 | "start = poeString.find(\"THE GOLD-BUG\")\n", 104 | "end = poeString.find(\"FOUR BEASTS IN ONE\")\n", 105 | "goldBugString = poeString[start:end]\n", 106 | "# save the file locally\n", 107 | "directory = \"data\"\n", 108 | "if not os.path.exists(directory):\n", 109 | " os.makedirs(directory)\n", 110 | "with open(\"data/goldBug.txt\", \"w\") as f:\n", 111 | " f.write(goldBugString)```\n", 112 | "\n", 113 | "Now we should be ready to retrieve the text:" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 1, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "with open(\"data/goldBug.txt\", \"r\") as f:\n", 125 | " goldBugString = f.read()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Now we will work toward showing the top frequency words in our plain text. This involves three major steps:\n", 133 | "\n", 134 | "1. processing our plain text to find the words (also known as tokenization)\n", 135 | "1. counting the frequencies of each word\n", 136 | "1. displaying the frequencies information" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Tokenization" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "Tokenization is the basic process of parsing a string to divide it into smaller units of the same kind. You can tokenize text into paragraphs, sentences, words or other structures, but here we're focused on recognizing words in our text. For that, let's import the ```nltk``` library and use its convenient ```word_tokenize()``` function. NLTK actually has several ways of tokenizing texts, and for that matter we could write our own code to do it. We'll have a peek at the first ten tokens." 
151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 2, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "['THE', 'GOLD-BUG', 'What', 'ho', '!', 'what', 'ho', '!', 'this', 'fellow']" 162 | ] 163 | }, 164 | "execution_count": 2, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "import nltk\n", 171 | "\n", 172 | "goldBugTokens = nltk.word_tokenize(goldBugString)\n", 173 | "goldBugTokens[:10]" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "We can see from the above that ```word_tokenize``` does a useful job of identifying words (including hyphenated words like \"GOLD-BUG\"), but also includes tokens like the exclamation mark. In some cases punctuation like this might be useful, but in our case we want to focus on word frequencies, so we should filter out punctuation tokens. (To be fair, nltk.word_tokenize() is expecting to work with sentences that have already been parsed, so we're slightly misusing it here, but that's ok.)\n", 181 | "\n", 182 | "To accomplish the filtering we will use a construct called [list comprehension](https://docs.python.org/3.4/tutorial/datastructures.html#list-comprehensions) with a conditional test built in. Let's take it one step at a time, first using a loop structure like we've already seen in [Getting Texts](GettingTexts.ipynb), and then doing the same thing with a list comprehension." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 3, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "['THE', 'GOLD-BUG', 'What', 'ho', '!', 'what', 'ho', '!', 'this', 'fellow'] (for loop technique)\n", 195 | "['THE', 'GOLD-BUG', 'What', 'ho', '!', 'what', 'ho', '!', 'this', 'fellow'] (list comprehension technique)\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "# technique 1 where we create a new list\n", 201 | "loopList = []\n", 202 | "for word in goldBugTokens[:10]:\n", 203 | " loopList.append(word)\n", 204 | "print(loopList, \"(for loop technique)\")\n", 205 | " \n", 206 | " \n", 207 | "# technique 2 with list comprehension\n", 208 | "print([word for word in goldBugTokens[:10]], \"(list comprehension technique)\")" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "Identical! So the general form of a list comprehension (which is very compact) is: \n", 216 | "\n", 217 | "> [_expression(item)_ for _item_ in _list_]\n", 218 | "\n", 219 | "We can now go a step further and add a condition to the list comprehension: we'll only include the word in the final list if the first character in the word is alphabetic as defined by the [isalpha()](https://docs.python.org/3.4/library/stdtypes.html?highlight=isalpha#str.isalpha) function (`word[0]` – remember the [string sequence technique](GettingTexts.ipynb#Working-with-Parts-of-String)).
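As a quick generic illustration (a sketch using made-up numbers rather than our tokens), the same comprehension-plus-condition pattern works on any list:

```python
# keep only the even numbers from a list, squaring each one as we go
numbers = [1, 2, 3, 4, 5, 6]
evens_squared = [n * n for n in numbers if n % 2 == 0]
print(evens_squared)  # [4, 16, 36]
```

Here the expression is `n * n`, the item is `n`, and the condition is `n % 2 == 0`, mapping directly onto the general form shown above.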
220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 10, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "['THE', 'GOLD-BUG', 'What', 'ho', 'what', 'ho', 'this', 'fellow']\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "print([word for word in goldBugTokens[:10] if word[0].isalpha()])" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "## Word Frequencies" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "Now that we've had a first pass at word tokenization (keeping only word tokens), let's look at counting word frequencies. Essentially we want to go through the tokens and tally the number of times each one appears. Not surprisingly, the NLTK has a very convenient method for doing just this, which we can see in this small sample (the first 10 word tokens):" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 13, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/plain": [ 261 | "Counter({'GOLD-BUG': 1,\n", 262 | " 'THE': 1,\n", 263 | " 'What': 1,\n", 264 | " 'fellow': 1,\n", 265 | " 'ho': 2,\n", 266 | " 'this': 1,\n", 267 | " 'what': 1})" 268 | ] 269 | }, 270 | "execution_count": 13, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "goldBugRealWordTokensSample = [word for word in goldBugTokens[:10] if word[0].isalpha()]\n", 277 | "goldBugRealWordFrequenciesSample = nltk.FreqDist(goldBugRealWordTokensSample)\n", 278 | "goldBugRealWordFrequenciesSample" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "This ```FreqDist``` object is a kind of dictionary, where each word is paired with its frequency (separated by a colon), and each pair is separated by a comma. This kind of dictionary also has a very convenient way of displaying results as a table:" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 14, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | " ho what GOLD-BUG fellow What THE this \n", 298 | " 2 1 1 1 1 1 1 \n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "goldBugRealWordFrequenciesSample.tabulate()" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "The results are displayed in descending order of frequency (two occurrences of \"ho\"). One of the things we can notice is that \"What\" and \"what\" are calculated separately, which in some cases may be good, but for our purposes probably isn't. This might lead us to rethink our steps until now and consider the possibility of converting our string to lowercase during tokenization." 
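The next cell lowercases the whole string before tokenizing. As an alternative sketch (not the notebook's approach, and the results can differ slightly in edge cases depending on the tokenizer), we could instead tokenize first and then lowercase each token:

```python
# alternative sketch: tokenize first, then lowercase each token
# assumes goldBugTokens was created earlier with nltk.word_tokenize(goldBugString);
# goldBugTokensLowercaseAlt is an illustrative name, not used elsewhere in this notebook
goldBugTokensLowercaseAlt = [word.lower() for word in goldBugTokens]
print(goldBugTokensLowercaseAlt[:10])
```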
311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 15, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "what ho gold-bug the fellow this \n", 323 | " 2 2 1 1 1 1 \n" 324 | ] 325 | } 326 | ], 327 | "source": [ 328 | "goldBugTokensLowercase = nltk.word_tokenize(goldBugString.lower()) # use lower() to convert entire string to lowercase\n", 329 | "goldBugRealWordTokensLowercaseSample = [word for word in goldBugTokensLowercase[:10] if word[0].isalpha()]\n", 330 | "goldBugRealWordFrequenciesSample = nltk.FreqDist(goldBugRealWordTokensLowercaseSample)\n", 331 | "goldBugRealWordFrequenciesSample.tabulate(20)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "Good, now \"what\" and \"What\" are counted together as the same word form (with a count of two). (There are disadvantages to this as well, such as more difficulty in identifying proper names and the start of sentences, but text mining is often a set of compromises.)\n", 339 | "\n", 340 | "Let's redo our entire workflow with the full set of tokens (not just a sample)." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 16, 346 | "metadata": { 347 | "collapsed": true 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "goldBugTokensLowercase = nltk.word_tokenize(goldBugString.lower())\n", 352 | "goldBugRealWordTokensLowercase = [word for word in goldBugTokensLowercase if word[0].isalpha()]\n", 353 | "goldBugRealWordFrequencies = nltk.FreqDist(goldBugRealWordTokensLowercase)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "One simple way of measuring the vocabulary richness of an author is to calculate the ratio of the number of unique words to the total number of words. If an author repeats words more often, it may be because he or she is drawing on a smaller vocabulary (either deliberately or not), which is a measure of style. There are several factors to consider, such as the length of the text, but in the simplest terms we can calculate the lexical diversity of an author by dividing the number of word forms (types) by the total number of tokens. We already have the necessary ingredients:\n", 361 | "\n", 362 | "* types: number of different words (number of word: count pairs in ```goldBugRealWordFrequencies```)\n", 363 | "* tokens: total number of word tokens (length of ```goldBugRealWordTokensLowercase```)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 17, 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "number of types: 2681\n", 376 | "number of tokens: 13508\n", 377 | "type/token ratio: 0.1984749777909387\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "print(\"number of types: \", len(goldBugRealWordFrequencies))\n", 383 | "print(\"number of tokens: \", len(goldBugRealWordTokensLowercase))\n", 384 | "print(\"type/token ratio: \", len(goldBugRealWordFrequencies)/len(goldBugRealWordTokensLowercase))" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "We haven't yet looked at our output for the top frequency lowercase words."
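The next cell prints them as a table with ```tabulate()```. As a small aside (a sketch, not part of the original workflow): because ```FreqDist``` behaves like Python's ```collections.Counter```, the same information can also be pulled programmatically as a list of (word, count) pairs:

```python
# top 5 (word, count) pairs as a plain Python list
# assumes goldBugRealWordFrequencies was built in the cell above
print(goldBugRealWordFrequencies.most_common(5))
```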
392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 18, 397 | "metadata": {}, 398 | "outputs": [ 399 | { 400 | "name": "stdout", 401 | "output_type": "stream", 402 | "text": [ 403 | " the of and i to a in it you was that with for as had at he but this we \n", 404 | " 877 465 359 336 329 327 238 213 162 137 130 114 113 113 110 108 103 99 99 98 \n" 405 | ] 406 | } 407 | ], 408 | "source": [ 409 | "goldBugRealWordFrequencies.tabulate(20) # show a sample of the top frequency terms" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "We tokenized, filtered and counted in three lines of code, and then a fourth to show the top frequency terms, but the results aren't necessarily very exciting. There's not much in these top frequency words that could be construed as especially characteristic of _The Gold Bug_, in large part because the most frequent words are similar for most texts of a given language: they're so-called function words that have more of a syntactic (grammatical) function rather than a semantic (meaning-bearing) value." 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "\n", 424 | "Fortunately, our NLTK library contains a list of stop-words for English (and other languages). We can load the list and look at its contents." 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 20, 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'did', 'do', 'does', 'doing', 'don', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself', 'just', 'me', 'more', 'most', 'my', 'myself', 'no', 'nor', 'not', 'now', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 's', 'same', 'she', 'should', 'so', 'some', 'such', 't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'you', 'your', 'yours', 'yourself', 'yourselves']\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "import nltk\n", 442 | "stopwords = nltk.corpus.stopwords.words(\"english\")\n", 443 | "print(sorted(stopwords)) # sort them alphabetically before printing" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "We can test whether one word is an item in another list with the following syntax, here using our small sample." 
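Before applying this to the whole sample in the next cell, here is the bare membership test on single words (a minimal sketch, assuming the ```stopwords``` list loaded above):

```python
# the `in` operator tests whether an item occurs in a list
print("the" in stopwords)       # True: "the" is in NLTK's English stopword list
print("gold-bug" in stopwords)  # False: content words are not in the list
```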
451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 22, 456 | "metadata": {}, 457 | "outputs": [ 458 | { 459 | "name": "stdout", 460 | "output_type": "stream", 461 | "text": [ 462 | "sample words: ['the', 'gold-bug', 'what', 'ho', 'what', 'ho', 'this', 'fellow']\n", 463 | "sample words not in stopwords list: ['gold-bug', 'ho', 'ho', 'fellow']\n" 464 | ] 465 | } 466 | ], 467 | "source": [ 468 | "print(\"sample words: \", goldBugRealWordTokensLowercaseSample)\n", 469 | "print(\"sample words not in stopwords list: \", [word for word in goldBugRealWordTokensLowercaseSample if not word in stopwords])" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "So we can now tweak our word filter with an additional condition, adding the ```and``` operator between the test for the alphabetic first character and the test for presence in the stopword list. We add a backslash (\\) character to tell Python that the statement continues on the next line. Alternatively, we could have done this in two steps (perhaps less efficient but arguably easier to read):\n", 477 | "\n", 478 | "```python\n", 479 | "# first filter tokens with alphabetic characters\n", 480 | "gbWords = [word for word in goldBugTokensLowercase if word[0].isalpha()]\n", 481 | "# then filter stopwords\n", 482 | "gbContentWords = [word for word in gbWords if word not in stopwords]```" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 25, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "name": "stdout", 492 | "output_type": "stream", 493 | "text": [ 494 | "upon de jupiter legrand one said well massa could bug skull parchment tree made first time two much us beetle \n", 495 | " 81 73 53 47 38 35 35 34 33 32 29 27 25 25 24 24 23 23 23 22 \n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "goldBugRealContentWordTokensLowercase = [word for word in goldBugTokensLowercase \\\n", 501 | " if word[0].isalpha() and word not in stopwords]\n", 502 | "goldBugRealContentWordFrequencies = nltk.FreqDist(goldBugRealContentWordTokensLowercase)\n", 503 | "goldBugRealContentWordFrequencies.tabulate(20) # show a sample of the top frequency terms" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": {}, 509 | "source": [ 510 | "Now we have words that seem a bit more meaningful (even if the table format is a bit off). The first word (\"upon\") could be considered a function word (a preposition) that should be in the stop-word list, though it's less common in modern English. The second word (\"de\") would be in a French stop-word list, but seems striking here in English. (Depending on the tokenizer, artifacts of possessive forms like \"'s\" can also show up as separate tokens, although none appears in this list.) The next words (\"jupiter\" and \"legrand\") merit closer inspection; they may be proper names that have been transformed to lowercase. We can continue on like this with various observations and hypotheses, but really we probably want to have a closer look at individual occurrences to see what's happening. For that, we'll build a concordance." 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "## Building a Simple Concordance" 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "metadata": {}, 523 | "source": [ 524 | "A concordance allows us to see each occurrence of a term in its context. It has a rich history in textual scholarship, dating back to well before the advent of computers. 
It's a tool for studying word usage in context.\n", 525 | "\n", 526 | "The easiest way to build a concordance is to create an NLTK Text object from a list of word tokens (in this case we'll use the unfiltered list so that we can better read the text). So, for instance, we can ask for a concordance of \"de\" to try to better understand why it occurs so often in this English text." 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 31, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "name": "stdout", 536 | "output_type": "stream", 537 | "text": [ 538 | "Displaying 10 of 73 matches:\n", 539 | "ou , '' here interrupted Jupiter ; `` de bug is a goole bug , solid , ebery bi\n", 540 | "is your master ? '' `` Why , to speak de troof , massa , him not so berry well\n", 541 | " aint find nowhar -- dat 's just whar de shoe pinch -- my mind is got to be be\n", 542 | "taint worf while for to git mad about de matter -- Massa Will say noffin at al\n", 543 | " -- Massa Will say noffin at all aint de matter wid him -- but den what make h\n", 544 | "a gose ? And den he keep a syphon all de time -- '' '' Keeps a what , Jupiter \n", 545 | " , Jupiter ? '' `` Keeps a syphon wid de figgurs on de slate -- de queerest fi\n", 546 | "' `` Keeps a syphon wid de figgurs on de slate -- de queerest figgurs I ebber \n", 547 | " syphon wid de figgurs on de slate -- de queerest figgurs I ebber did see . Is\n", 548 | "vers . Todder day he gib me slip fore de sun up and was gone de whole ob de bl\n" 549 | ] 550 | } 551 | ], 552 | "source": [ 553 | "goldBugText = nltk.Text(goldBugTokens)\n", 554 | "goldBugText.concordance(\"de\", lines=10)" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "In the concordance view above all the occurrences of \"de\" are aligned to make scanning each occurrence easier." 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "## Next Steps" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "Here are some tasks to try:\n", 576 | "\n", 577 | "* Show a table of the top 20 words\n", 578 | " * Choose 3 words to add to the stop-words list using list concatenation\n", 579 | " * Regenerate the list of the top 20 words using your new stop-words list\n", 580 | "* Instead of testing for presence in the stopword list, how would you test for words that contain 10 characters or more?\n", 581 | "* Determine whether or not the word provided to the concordance function is case sensitive\n", 582 | "\n", 583 | "In the next notebook we're going to get [Graphical](GettingGraphical.ipynb)." 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": {}, 589 | "source": [ 590 | "---\n", 591 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).
Created February 7, 2015 and last modified January 14, 2018 (Jupyter 4)" 592 | ] 593 | } 594 | ], 595 | "metadata": { 596 | "kernelspec": { 597 | "display_name": "Python 3", 598 | "language": "python", 599 | "name": "python3" 600 | }, 601 | "language_info": { 602 | "codemirror_mode": { 603 | "name": "ipython", 604 | "version": 3 605 | }, 606 | "file_extension": ".py", 607 | "mimetype": "text/x-python", 608 | "name": "python", 609 | "nbconvert_exporter": "python", 610 | "pygments_lexer": "ipython3", 611 | "version": "3.7.1" 612 | } 613 | }, 614 | "nbformat": 4, 615 | "nbformat_minor": 1 616 | } 617 | --------------------------------------------------------------------------------