├── .gitignore ├── docs ├── _config.yml ├── start │ ├── inspect.png │ ├── html-options.png │ └── index.md ├── convert │ ├── indents.png │ ├── replace.png │ ├── find-in-project.png │ ├── find-in-project-dialog.png │ └── index.md ├── images │ ├── jupyter300.png │ ├── jupyter48.png │ ├── voyant300.png │ ├── voyant48.png │ ├── observable48.png │ └── observable300.png ├── visualize │ ├── iframe.png │ ├── plot.jpeg │ └── index.md ├── count │ ├── terms-columns.png │ └── index.md ├── setup │ ├── observable-login.png │ ├── jupyter-architecture.png │ ├── jupyter-architecture.graffle │ └── index.md ├── index.md ├── scrape │ └── index.md └── collocate │ └── index.md ├── ipynb ├── .gitignore ├── utilities │ ├── .DS_Store │ ├── .ipynb_checkpoints │ │ ├── Untitled-checkpoint.ipynb │ │ ├── My First Notebook-checkpoint.ipynb │ │ ├── SimpleSentimentAnalysis-checkpoint.ipynb │ │ └── Concordances-checkpoint.ipynb │ ├── Untitled.ipynb │ ├── SimpleSentimentAnalysis.ipynb │ └── Concordances.ipynb ├── experiments │ ├── .DS_Store │ ├── SmithImagery.png │ ├── SmithImageryFreqsByChapter.png │ └── SmithImageryWordList.txt ├── images │ ├── stop-server.png │ ├── access-texts.png │ ├── folder-rename.png │ ├── logo_anaconda.png │ ├── markdown-cell.png │ ├── new-notebook.png │ ├── notebook-launch.png │ ├── notebook-ui-tour.png │ ├── rename-notebook.png │ ├── anaconda-download.png │ ├── anaconda-launcher.png │ ├── cosine-similarity.png │ ├── hello-world-error.png │ ├── new-notebook-header.png │ ├── nltk-data-download.png │ ├── hello-world-markdown.png │ ├── anaconda-launcher-menu-2.png │ ├── anaconda-launcher-menu.png │ ├── hello-world-dynamic-time.png │ ├── hello-world-first-code.png │ ├── terminal-ipython-install.png │ ├── ipython-notebook-root-tree.png │ ├── characteristic-curve-mendenhall.png │ └── network-graph-students-schools.png ├── HelloWorld.ipynb ├── Useful Resources.ipynb ├── ArtOfLiteraryTextAnalysis.ipynb ├── Nltk.ipynb ├── GettingSetup.ipynb ├── Converting.ipynb ├── Glossary.ipynb └── GettingNltk.ipynb ├── README.md ├── spiral └── CharacteristicCurve.json └── assets └── css └── style.scss /.gitignore: -------------------------------------------------------------------------------- 1 | /.project 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /ipynb/.gitignore: -------------------------------------------------------------------------------- 1 | /.ipynb_checkpoints/ 2 | /.DS_Store 3 | -------------------------------------------------------------------------------- /docs/start/inspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/start/inspect.png -------------------------------------------------------------------------------- /docs/convert/indents.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/convert/indents.png -------------------------------------------------------------------------------- /docs/convert/replace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/convert/replace.png -------------------------------------------------------------------------------- 
/docs/images/jupyter300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/jupyter300.png -------------------------------------------------------------------------------- /docs/images/jupyter48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/jupyter48.png -------------------------------------------------------------------------------- /docs/images/voyant300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/voyant300.png -------------------------------------------------------------------------------- /docs/images/voyant48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/voyant48.png -------------------------------------------------------------------------------- /docs/visualize/iframe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/visualize/iframe.png -------------------------------------------------------------------------------- /docs/visualize/plot.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/visualize/plot.jpeg -------------------------------------------------------------------------------- /ipynb/utilities/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/utilities/.DS_Store -------------------------------------------------------------------------------- /docs/count/terms-columns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/count/terms-columns.png -------------------------------------------------------------------------------- /docs/images/observable48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/observable48.png -------------------------------------------------------------------------------- /docs/start/html-options.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/start/html-options.png -------------------------------------------------------------------------------- /ipynb/experiments/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/experiments/.DS_Store -------------------------------------------------------------------------------- /ipynb/images/stop-server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/stop-server.png -------------------------------------------------------------------------------- /docs/images/observable300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/images/observable300.png -------------------------------------------------------------------------------- 
/docs/setup/observable-login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/setup/observable-login.png -------------------------------------------------------------------------------- /ipynb/images/access-texts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/access-texts.png -------------------------------------------------------------------------------- /ipynb/images/folder-rename.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/folder-rename.png -------------------------------------------------------------------------------- /ipynb/images/logo_anaconda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/logo_anaconda.png -------------------------------------------------------------------------------- /ipynb/images/markdown-cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/markdown-cell.png -------------------------------------------------------------------------------- /ipynb/images/new-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/new-notebook.png -------------------------------------------------------------------------------- /docs/convert/find-in-project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/convert/find-in-project.png -------------------------------------------------------------------------------- /ipynb/images/notebook-launch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/notebook-launch.png -------------------------------------------------------------------------------- /ipynb/images/notebook-ui-tour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/notebook-ui-tour.png -------------------------------------------------------------------------------- /ipynb/images/rename-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/rename-notebook.png -------------------------------------------------------------------------------- /docs/setup/jupyter-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/setup/jupyter-architecture.png -------------------------------------------------------------------------------- /ipynb/experiments/SmithImagery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/experiments/SmithImagery.png -------------------------------------------------------------------------------- /ipynb/images/anaconda-download.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/anaconda-download.png -------------------------------------------------------------------------------- /ipynb/images/anaconda-launcher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/anaconda-launcher.png -------------------------------------------------------------------------------- /ipynb/images/cosine-similarity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/cosine-similarity.png -------------------------------------------------------------------------------- /ipynb/images/hello-world-error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/hello-world-error.png -------------------------------------------------------------------------------- /ipynb/images/new-notebook-header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/new-notebook-header.png -------------------------------------------------------------------------------- /ipynb/images/nltk-data-download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/nltk-data-download.png -------------------------------------------------------------------------------- /ipynb/images/hello-world-markdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/hello-world-markdown.png -------------------------------------------------------------------------------- /docs/convert/find-in-project-dialog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/convert/find-in-project-dialog.png -------------------------------------------------------------------------------- /docs/setup/jupyter-architecture.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/docs/setup/jupyter-architecture.graffle -------------------------------------------------------------------------------- /ipynb/images/anaconda-launcher-menu-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/anaconda-launcher-menu-2.png -------------------------------------------------------------------------------- /ipynb/images/anaconda-launcher-menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/anaconda-launcher-menu.png -------------------------------------------------------------------------------- /ipynb/images/hello-world-dynamic-time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/hello-world-dynamic-time.png -------------------------------------------------------------------------------- /ipynb/images/hello-world-first-code.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/hello-world-first-code.png -------------------------------------------------------------------------------- /ipynb/images/terminal-ipython-install.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/terminal-ipython-install.png -------------------------------------------------------------------------------- /ipynb/images/ipython-notebook-root-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/ipython-notebook-root-tree.png -------------------------------------------------------------------------------- /ipynb/experiments/SmithImageryFreqsByChapter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/experiments/SmithImageryFreqsByChapter.png -------------------------------------------------------------------------------- /ipynb/images/characteristic-curve-mendenhall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/characteristic-curve-mendenhall.png -------------------------------------------------------------------------------- /ipynb/images/network-graph-students-schools.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgsinclair/alta/HEAD/ipynb/images/network-graph-students-schools.png -------------------------------------------------------------------------------- /ipynb/utilities/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /ipynb/utilities/.ipynb_checkpoints/My First Notebook-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The Art of Literary Text Analysis 2 | ==== 3 | 4 | Please see the [Juypter (python) version](https://github.com/sgsinclair/alta/blob/master/ipynb/ArtOfLiteraryTextAnalysis.ipynb). 5 | -------------------------------------------------------------------------------- /ipynb/utilities/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Gathering web pages\n", 8 | "\n", 9 | "This utility script is for gathering the text of a collection of web sites. It assumes you have a CSV with a list of URLs and it adds the results of the gathering back into the CSV." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Opening the CSV\n", 17 | "\n", 18 | "This opens a CSV and extracts the URLs putting them into a list. 
Alternatively you can use a " 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Getting the HTML\n", 26 | "\n", 27 | "This function gets the HTML given a URL." 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Cleaning the HTML\n", 35 | "\n", 36 | "This function cleans the HTML " 37 | ] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "Python 3", 43 | "language": "python", 44 | "name": "python3" 45 | }, 46 | "language_info": { 47 | "codemirror_mode": { 48 | "name": "ipython", 49 | "version": 3 50 | }, 51 | "file_extension": ".py", 52 | "mimetype": "text/x-python", 53 | "name": "python", 54 | "nbconvert_exporter": "python", 55 | "pygments_lexer": "ipython3", 56 | "version": "3.5.1" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 0 61 | } 62 | -------------------------------------------------------------------------------- /docs/visualize/index.md: -------------------------------------------------------------------------------- 1 | # Visualizing with the Art of Literary Text Mining 2 | 3 | ## Visualizing with Voyant 4 | 5 | ![Voyant](../images/voyant48.png) Voyant is in large part about visualization so we won't spend too much time with it here except to refer to a couple of tools that are perhaps less on the beaten path: 6 | 7 | 1. Bubbles 8 | 1. TextArc 9 | 10 | But there are many others, have a look! 11 | 12 | ## Embedding Voyant 13 | 14 | One of the more powerful aspects of Voyant is that you can embed a live, functional tool in another page, much as you would embed a video clip from YouTube or Vimeo. See the [documentation](https://voyant-tools.org/docs/#!/guide/embedding). For instance, the tool to the right has been embedded with this code:
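(The exact embed markup will vary with your corpus and tool of choice; the following is an illustrative example, assuming the Cirrus tool and the same `austen` corpus used in the Jupyter example below, rather than the precise snippet used on this page.)

```html
<!-- illustrative only: embeds the Cirrus word cloud for a corpus named "austen" -->
<iframe src="https://voyant-tools.org/tool/Cirrus/?corpus=austen"
        style="width: 400px; height: 400px;"></iframe>
```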
15 | 16 | 18 | 19 | It's worth noting that the <iframe> tag is usually filtered out of a markdown document in GitHub, but it *is* possible to embed Voyant into a Jupyter Notebook. Just using the `iframe` tag won't work directly, but you can use the `IFRAME` class from the [IPython.display] module](https://ipython.readthedocs.io/en/stable/api/generated/IPython.display.html?highlight=iframe#classes). 20 | 21 | from IPython.display import IFrame 22 | IFrame('https://voyant-tools.org/tool/Cirrus/?corpus=austen', width=300, height=300) 23 | 24 | IFRAME 25 | 26 | ## Visualizing with Jupyter 27 | 28 | ![Jupyter](../images/jupyter48.png) One of the benefits of working with libraries like NLTK (which we've already introduced in a previous notebook) is that there are built-in libraries for simple plotting. For example, it's very easy to go from a text to a graph of word frequencies, something like this: 29 | 30 | ```python 31 | import nltk 32 | %matplotlib inline # magical incantation needed for first graph 33 | 34 | emma = nltk.corpus.gutenberg.words('austen-emma.txt') # load words 35 | stopwords = nltk.corpus.stopwords.words("English") # load stopwords 36 | # filter words that are alphabetic and not in stopword list 37 | words = [word.lower() for word in emma if word[0].isalpha() and not word.lower() in stopwords] 38 | freqs = nltk.FreqDist(words) # build frequency list 39 | freqs.plot(25) # plot the top 25 words 40 | ``` 41 | 42 | Plot of frequencies 43 | 44 | To continue with graphing, please consult [Getting Graphical](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingGraphical.ipynb) in the Art of Literary Programming with Python. 45 | -------------------------------------------------------------------------------- /ipynb/HelloWorld.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hello World!" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This is _Hello World!_, my first iPython Notebook" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 3, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "Hello World!\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "print(\"Hello World!\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "Now let's try printing dynamic content like the current time." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "Hello World! It's Monday January 12, 2015\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "import time\n", 56 | "print(\"Hello World! 
It's\", time.strftime(\"%A %B %e, %Y\"))" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Things we've learned in this Notebook\n", 64 | "--\n", 65 | "* creating a new notebook\n", 66 | "* basic user interface of a notebook\n", 67 | "* printing a static string like _Hello World!_\n", 68 | "* debugging syntax errors\n", 69 | "* printing a dymamic string with the current time\n", 70 | "* a bit more about Markdown (see http://daringfireball.net/projects/markdown/syntax)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "---\n", 78 | "This is a template from the [GettingStarted](GettingStarted.ipynb) notebook.\n", 79 | "\n", 80 | "From [The Art of Literary Text Analysis](https://github.com/sgsinclair/alta) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com), [CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/)\n", 81 | "\n", 82 | "Created January 12, 2015" 83 | ] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.6.3" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 1 107 | } 108 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # The Art of Literary Text Mining 2 | 3 | This is a meta-guide that is intended to help you work through our guides for _The Art of Literary Text Mining_. 4 | 5 | ## Guides 6 | 7 | ![Voyant](images/voyant48.png) [The Art of Literary Text Mining for Voyant](./voyant/): Voyant is a *web-based* collection of text analysis and visualizations tools, it can be relatively easy to start using but is limited to the pre-packaged functionality that is already implemented. 8 | 9 | ![Jupyter](images/jupyter48.png) [The Art of Literary Text Mining for Python Jupyter Notebooks](../ipynb/ArtOfLiteraryTextAnalysis.ipynb): Python is a programming language with a huge number of useful libraries but it can take a while to become proficient in any programming language. 10 | 11 | ![ObservableHQ](images/observable48.png) [The Art of Literary Text Mining for ObservableHQ and VoyantJS](https://beta.observablehq.com/@sgsinclair/alta): This uses Javascript as a core programming language (takes some effort to learn) but has the benefit of being highly shareable as web-based resources. This approach exposes some of the analytic and visualization functionality of Voyant while allowing for more customized processing. 12 | 13 | Usually you would probably want to work through just one of these guides but there are cases when working with one or more guides together is preferable; this meta-guide is for this mixed approach. 14 | 15 | Why work through the materials of more than one guide? One reason is to fully appreciate the strengths and weaknesses of more than one approach. We firmly believe that no one tool or even one framework is ideal for all problems and that it can be useful to be familiar with more than solution. Indeed, our three guides have their own pros and cons that can be significant for a given task or a given project. 
The following is a very simplistic view of some of the characteristics of each approach: 16 | 17 | ## Comparison 18 | 19 | | | ![Voyant](images/voyant48.png)
Voyant | ![Jupyter](images/jupyter48.png)
Jupyter | ![ObservableHQ](images/observable48.png)
ObservableHQ+VoyantJS | 20 | |-|-|-|-| 21 | | **setup and configuration** | no setup for hosted version, easy desktop version | usually requires some setup | no setup | 22 | | **text analysis specificity** | text analysis specific | infinitely generalizable | mixed specificity of VoyantJS for text analysis and Javascript more generally | 23 | | **shareable** | Voyant URLs of tools and corpora | compatible with GitHub | web-based | 24 | | **scalable** | optimized for up to hundreds of documents | very scalable | somewhat limited to browser resources | 25 | 26 | ## Topics 27 | 28 | * [setup the environments](./setup/) 29 | * [getting started](./start/) 30 | * [scraping a corpus](./scrape/) 31 | * [converting a corpus](./convert/) 32 | * [frequencies](./count/) 33 | * [collocates](./collocate/) 34 | * [visualize](./visualize/) 35 | * semantics 36 | * parts-of-speech 37 | * sentiment 38 | * similarity 39 | -------------------------------------------------------------------------------- /ipynb/Useful Resources.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Useful Resources\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Programming Basics \n", 15 | "* [Codecademy](https://www.codecademy.com/learn/learn-python) - Online learning platform which offers free interactive lessons covering the very basics of programming languages.\n", 16 | "* [Google's Python Class](https://developers.google.com/edu/python/) - A combination of written materials, instructional videos and coding exersises to practice Python programming. \n", 17 | "* [Pyschools](http://www.pyschools.com/) - Practical python tutorials for beginners and beyond. 
Note - you must have a Google account to sign up.\n", 18 | "* [Udacity](https://www.udacity.com/course/programming-foundations-with-python--ud036) - Introductory Python programming class with mini-projects in each lesson.\n", 19 | "* [Tutorialspoint](https://www.tutorialspoint.com/python/python_basic_syntax.htm) - Basics of Python syntax.\n", 20 | "---" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### Software & Libraries\n", 28 | "* [Anaconda](https://www.anaconda.com/download/#macos) - Suite of data science applications \n", 29 | "* [Gensim](https://radimrehurek.com/gensim/) - Topic Modelling toolkit for Python\n", 30 | "* [NLTK](http://www.nltk.org/) - Natural Language Toolkit\n", 31 | "\n", 32 | "---\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Python Resources\n", 40 | "* [The Python Wiki](https://wiki.python.org/moin/FrontPage) - A comprehensive encyclopedia of Python-related information including a beginner's guide, common problems and links to many useful resources.\n", 41 | "* [Stack Overflow](https://stackoverflow.com/) - An excellent community-driven question-answer problem solving resource for even the trickiest of Python conundrums.\n", 42 | "\n", 43 | "---" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Further Explorations\n", 51 | "* [Voyant](https://voyant-tools.org/) - Open source web application for text analysis featuring a plethora of data and visualization tools.\n", 52 | "* [Big Data by Neal Caren](http://nealcaren.web.unc.edu/big-data/) - Tutorials which cover the fundamentals of quantitative text analysis for social scientists.\n", 53 | "---" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "### Open Source Materials\n", 61 | "\n", 62 | "* [Project Gutenberg](http://gutenberg.ca/index.html) - Digital editions of classic literature in the public domain\n", 63 | "\n", 64 | "---" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com)." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.6.3" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 2 105 | } 106 | -------------------------------------------------------------------------------- /docs/setup/index.md: -------------------------------------------------------------------------------- 1 | # Setting up the Art of Literary Text Mining 2 | 3 | This is part of the [Art of Literary Text Mining](../) collection. This page is intended to briefly describe how to get set up and configured with our three environments: Voyant Tools, Jupyter Notebooks, and ObservableHQ. 4 | 5 | The first step with any tool or framework is to ensure that whatever setup and configuration it needs have been performed. Because of the nature of the technologies, the work involved is different for each of our guides. 6 | 7 | ## Voyant 8 | 9 | ![Voyant](../images/voyant48.png) Voyant Tools is a hosted website [voyant-tools.org](https://voyant-tools.org) that requires no setup, no login, and no configuration. However, that simplicity comes with a price: the hosted version is widely used by people all over the world and that exerts pressure on the server, which sometimes causes downtime and other issues. For this reason (and others, such as data privacy), it's highly recommended that you [download and install the Desktop version of Voyant Tools](https://github.com/sgsinclair/VoyantServer/wiki/VoyantServer-Desktop) – in most cases it's as simple as downloading a zip file, uncompressing it, and clicking on the application launcher. 10 | 11 | As mentioned, the hosted version is sometimes over-extended. If the server doesn't seem to respond, wait a few seconds, up to a minute, and try again (the server usually restores itself within a few seconds). 12 | 13 | If you're trying to get the Desktop version functioning and it won't, there are three common issues to check: 14 | 15 | 1. [On Windows](https://github.com/sgsinclair/VoyantServer/wiki/VoyantServer-Desktop#windows), be sure that you extract the downloaded VoyantServer.zip file into a real directory rather than just double-clicking on the ZIP file to uncompress it. 16 | 17 | 1. [On Mac](https://github.com/sgsinclair/VoyantServer/wiki/VoyantServer-Desktop#mac), the first time you launch VoyantServer, you should right-click or ctrl-click on the VoyantServer.jar file as this will allow you to circumvent the operating system's security block for unsigned applications. 18 | 19 | 1. Check the memory [settings](https://github.com/sgsinclair/VoyantServer/wiki/VoyantServer-Desktop#settings): if you have an older machine with a limited amount of RAM, try opening the file called `server_settings.txt` in the same directory as VoyantServer.jar and changing the value "1024" to "512" (or even "256") before saving the text file and trying to relaunch VoyantServer. 20 | 21 | ## Jupyter Notebooks 22 | 23 | ![Jupyter](../images/jupyter48.png) Jupyter tends to be the most intensive solution to set up and configure, especially if you set it up on your local machine. There are a lot of instructions out there for getting set up, depending on your platform and system preferences, but the [Getting Setup notebook](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingSetup.ipynb) is a good place to start. 24 | 25 | One very important thing: we want to use Python 3.x or higher (not Python 2.x) – that should be obvious throughout, but it's worth double-checking as you select the download file from Anaconda. 26 | 27 | The recommended approach is to install Anaconda on your system. Think of Anaconda as its own environment that's installed on your system and that is isolated from other important system files. Anaconda is a sandbox that contains the Jupyter application, and the Jupyter application allows you to create Jupyter notebooks. 28 | 29 | ![Anaconda Architecture](jupyter-architecture.png) 30 | 31 | Unlike Voyant and ObservableHQ, which are always-available web applications, Jupyter Notebooks has to be launched and running in order to be used. This is an important distinction from our other environments: a "live" notebook (that can be edited) must have a process running somewhere, most likely on your computer. That process stores current contents in memory and handles the execution of code. So getting started each time will involve the following steps: 32 | 33 | 1. launch Anaconda Navigator (from your applications or desktop) 34 | 1. launch Jupyter Notebooks (from Anaconda Navigator, which launches a browser window) 35 | 1. create or open a Jupyter Notebook (in the browser) 36 | 37 | As we proceed we will want to use some Python helper libraries that are not installed by default in Anaconda. We will return to this, but it's worth emphasizing now that installation happens within our Anaconda environment (and doesn't interfere with other system files). Similarly, it's possible to have multiple Anaconda installations that are independent, but for now we'll assume that we have one installation and that any modifications happen to that one installation. 38 | 39 | ## ObservableHQ 40 | 41 | ![ObservableHQ](../images/observable48.png) ObservableHQ is also a hosted website [observablehq.com](https://observablehq.com). It's possible to visit ObservableHQ and make anonymous changes to a notebook (like [this one](https://beta.observablehq.com/@observablehq/fork-share-merge)), but in order to save changes you need to log in through one of the authentication services (currently GitHub, Twitter and Google – because we can use GitHub to store data, we strongly recommend that option). 42 | 43 | ![ObservableHQ Login](observable-login.png) 44 | 45 | ## Next Steps 46 | 47 | Now that we have a minimal setup for all three environments, we can proceed to [getting started](../start/). -------------------------------------------------------------------------------- /ipynb/ArtOfLiteraryTextAnalysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The Art of Literary Text Analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The Art of Literary Text Analysis (ALTA) has three objectives. 
\n", 15 | "\n", 16 | "- First, to introduce concepts and methodologies for literary text analysis programming. It doesn't assume you know how to program or how to use digital tools for analyzing texts. \n", 17 | "\n", 18 | "- Second, to show a range of analytical techniques for the study of texts. While it cannot explain and demonstrate everything, it provides a starting point for humanists with links to other materials.\n", 19 | "\n", 20 | "- Third, to provide utility notebooks you can use for operating on different texts. These are less well documented and combine ideas from the introductory notebooks.\n", 21 | "\n", 22 | "This instance of The Art of Literary Text Analysis is created in Jupyter Notebooks based on the Python scripting language. Other programming choices are available, and many conceptual aspects of the guide are relevant regardless of the language and implementation. \n", 23 | "\n", 24 | "**Jupyter Notebooks** was chosen for three main reasons: \n", 25 | "\n", 26 | "1. Python (the programming language used in Jupyter Notebooks) features extensive support for text analysis and natural language processing; \n", 27 | "\n", 28 | "2. Python is a great programming language to learn for those learning to program for the first time – it's not easy, but it represents a good balance between power, speed, readability and learnability;\n", 29 | "\n", 30 | "3. Jupyter Notebooks offers a _literate programming_ model of writing where blocks of prose text (like this one) can be interspersed with bits of code and output allowing us to use it to write this guide and you to write up your experiments. _The Art of Literary Text Analysis_ focuses on the thinking through of analytical processes, and the documentation-rich format offered by Jupyter Notebooks is well-suited to the nature of this guide and to helping you think through what you want to do." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Table of Contents" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "This guide is a work in progress. It was developed over the Winter of 2015 in conjunction with a course on literary text mining at McGill. It has been [forked](Glossary.ipynb#Fork \"A cloned copy of a project which is set-up on a independent branch seperate to the original.\") and extended for a course in the Winter of 2016 on big data and analysis at the University of Alberta. Here is the current outline:\n", 45 | "\n", 46 | "* First Encounters (basics for working with Jupyter Notebooks and digital texts)\n", 47 | "\t* [Getting Setup](GettingSetup.ipynb) (installing and setting up Jupyter Notebooks)\n", 48 | "\t* [Getting Started](GettingStarted.ipynb) (introducing core Jupyter Notebooks concepts)\n", 49 | "\t* [Getting Texts](GettingTexts.ipynb) (an example of acquiring digital texts)\n", 50 | "\t* [Getting NLTK](GettingNltk.ipynb) (foundations for text processing using the Natural Language Toolkit)\n", 51 | "\t* [Getting Graphical](GettingGraphical.ipynb) (foundations for visualizing data)\n", 52 | "* Close Encounters\n", 53 | "\t* [Searching for Meaning](SearchingMeaning.ipynb) (searching variant word forms and word meanings)\n", 54 | "\t* [Parts of Speech](PartsOfSpeech.ipynb) (analysing parts of speech (nouns, adjectives, verbs, etc.) 
of documents\n", 55 | "\t* [Repeating Phrases](RepeatingPhrases.ipynb) (analyzing repeating sequences of words)\n", 56 | "* Distant Encounters \n", 57 | "\t* [Sentiment Analysis](SentimentAnalysis.ipynb) (measuring opinion or mood of texts)\n", 58 | " * [Topic Modelling](TopicModelling.ipynb) (finding recurring groups of terms)\n", 59 | " * [Document Similarity](DocumentSimilarity.ipynb) (measuring and visualizing distances between documents)\n", 60 | "* Utility Examples\n", 61 | " * [Simple Sentiment Analysis](utilities/SimpleSentimentAnalysis.ipynb) (measuring sentiment with a simple dictionary in the notebook)\n", 62 | " * [Complex Sentiment Analysis](utilities/ComplexSentimentAnalysis.ipynb) (using research dictionaries to measure sentiment)\n", 63 | " * [Collocates](utilities/Collocates.ipynb) (identifying collocates for a target word)\n", 64 | " * [Concordances](utilities/Concordances.ipynb) (generating a concordance for a target word)\n", 65 | " * [Exploring a text with NLTK](utilities/Exploring a text with NLTK.ipynb) (shows simple ways you can explore a text with NLTK.)\n", 66 | "* Resources\n", 67 | " * [Useful Links](Useful Resources.ipynb) (A myriad of helpful python and text analysis resources)\n", 68 | " * [Glossary](Glossary.ipynb) (Definitions and explanations for concepts and jargon)\n", 69 | " " 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "source": [ 78 | "---\n", 79 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).\n", 80 | "
Created January 7, 2015 and last modified January 12, 2018 (Jupyter 5.0.0)" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.6.3" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 1 105 | } 106 | -------------------------------------------------------------------------------- /docs/scrape/index.md: -------------------------------------------------------------------------------- 1 | # Web Scraping with the Art of Literary Text Analysis 2 | 3 | This is part of the [Art of Literary Text Mining](../) collection. This page is intended to briefly describe how to get started with web scraping, particularly with Jupyter Notebooks and the `wget` command. 4 | 5 | A very common task when working with text analysis is acquiring a corpus of texts, frequently sourced from the web. Web scraping (or harvesting) is the act of fetching pages from the web and extracting the relevant content. There are two major kinds of web scraping: 6 | 7 | 1. fetching the contents from a list of specific URLs 8 | 1. fetching as much of a web site as possible, often by following links from one page to another (sometimes also called web crawling) 9 | 10 | For the first type it's possible to have code that produces a list of URLs to fetch; this is essentially what we did in the [Getting Started](../start/) guide page, especially the Jupyter version using Beautiful Soup. This is a good example of how tools can be mixed and matched in various ways: you could have a Jupyter notebook produce a list of URLs and then provide that list of URLs to Voyant. 11 | 12 | ## Voyant 13 | 14 | ![Voyant](../images/voyant48.png) Voyant's web scraping abilities are limited in that it assumes that you'll provide a list of URLs and there's no mechanism for parsing the contents of those URLs in order to fetch additional URLs. Still, it can be enormously convenient to paste a list of several URLs and have Voyant construct a corpus from them. Please note that any processing options are applied to all documents as appropriate (for instance, it's not possible to have different HTML CSS Selectors for different URLs, though it is possible to add documents individually by [modifying a corpus](https://voyant-tools.org/docs/#!/guide/modifyingcorpus)). 15 | 16 | Even if URL fetching in Voyant is convenient, there are times when doing the web scraping outside of Voyant is preferable. One such situation is where you have many URLs, say more than about a dozen. Voyant has to fetch each URL one at a time and that can be time-consuming, which can cause a server timeout. Moreover, if an error is encountered you'd need to start fetching over from the beginning next time. In fact, we recommend only fetching up to about three URLs at a time. 17 | 18 | Another situation is where you need to do some intermediate processing on the documents before analyzing them. In that case, you would scrape (download) them (possibly using the techniques described below, or the sketch that follows), edit the documents, and then upload them to Voyant.
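A minimal sketch of that intermediate download step in Python, assuming a hypothetical `urls.txt` file with one URL per line (the file name and the output naming scheme are illustrative, not anything Voyant requires):

```python
import urllib.request

# read the URLs from a hypothetical urls.txt, one per line
with open("urls.txt") as f:
    urls = [line.strip() for line in f if line.strip()]

# save each page to its own local file so it can be cleaned up before uploading to Voyant
for index, url in enumerate(urls):
    html = urllib.request.urlopen(url).read().decode("utf-8")
    with open("page-{}.html".format(index), "w", encoding="utf-8") as out:
        out.write(html)
```

Once the downloaded files have been cleaned up, they can be uploaded to Voyant as a set of documents.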
## Jupyter Notebook 21 | 22 | ![Jupyter](../images/jupyter48.png) Our Jupyter notebook will walk through the following steps: 23 | 24 | * fetching the contents at http://www.digitalhumanities.org/dhq/index/title.html 25 | * parsing that document to get a list of all the articles in the journal 26 | * fetching the contents of each of the articles 27 | 28 | To continue, please see [Web Scraping](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/Scraping.ipynb) with the Art of Literary Text Analysis. 29 | 30 | ## Wget Command 31 | 32 | Web scraping is such a common task that there are dedicated tools for doing it. Web scraping is not only important for people doing text analysis, but also, for instance, for anyone building a web search engine or otherwise wanting to create an archive of a site. One of the most widely used tools is a command-line utility called [`wget`](https://en.m.wikipedia.org/wiki/Wget). Here's a partial list of some of `wget`'s functionality: 33 | 34 | * fetch a single page (HTML source only): `wget http://www.digitalhumanities.org/dhq/` 35 | * fetch a single page and its assets: `wget -p -k http://www.digitalhumanities.org/dhq/` 36 | * fetch all URLs listed in the specified file: `wget -i urls.txt` 37 | * fetch a URL and recursively fetch all URLs in the contents: `wget -r http://www.digitalhumanities.org/dhq/` 38 | 39 | A disadvantage of `wget` is that it's not pre-installed on OS X or Windows, but we can remedy that by following the easy instructions found at the _Programming Historian_'s [Automated Downloading with Wget](https://programminghistorian.org/en/lessons/automated-downloading-with-wget#step-one-installation). 40 | 41 | For OS X the instructions on the page above are a bit out of date; here are the commands that seem to work best currently (from the [Homebrew](https://brew.sh) page): 42 | 43 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" 44 | 45 | brew install wget 46 | 47 | Once installed, we'll also follow the instructions in the [next section](https://programminghistorian.org/en/lessons/automated-downloading-with-wget#step-two-learning-about-the-structure-of-wget--downloading-a-specific-set-of-files) on creating a data directory from which we'll run our command. 48 | 49 | mkdir dhq 50 | cd dhq 51 | 52 | The first command is to "make directory" (`mkdir`) and the second command is to "change directory" (`cd`). 53 | 54 | One of `wget`'s strengths is in fetching multiple URLs and especially in finding links in one page and following those links to download contents in other pages, and so on recursively. Since `wget` is often used to fetch many URLs it's best to configure it so that it doesn't strain the target server too heavily (by trying to fetch hundreds of URLs as quickly as possible, for instance). A couple of common arguments can be added to be a good net citizen (and avoid being blacklisted by servers, which would prevent you from fetching more content). 55 | 56 | * `-w`: number of seconds to wait between requests: `wget -w 1 http://www.digitalhumanities.org/dhq/` 57 | * `--limit-rate`: the bandwidth to use in kilobytes/second: `wget --limit-rate=200k http://www.digitalhumanities.org/dhq/` 58 | 59 | (Note about arguments: typically one hyphen is used for abbreviations like "w" and two hyphens for full names like "limit-rate".) 60 | 61 | A final argument that's useful for our purposes is to tell `wget` to only fetch URLs matching a certain pattern, namely "/dhq/vol/…". 
We do that with the argument `--accept-regex`, which takes the regular expression as its value (we also throttle our requests as described above): 62 | 63 | wget -r --accept-regex "/dhq/vol/" -w 1 --limit-rate=200k http://www.digitalhumanities.org/dhq/ 64 | 65 | This says "go get the contents of http://www.digitalhumanities.org/dhq/, recursively fetching URLs that match our simple regular expression (actual articles), while waiting a second between each request and limiting the bandwidth to 200KB/second." 66 | 67 | And presto, we've scraped an entire journal! -------------------------------------------------------------------------------- /ipynb/Nltk.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Natural Language Toolkit" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Let's load _The Gold Bug_" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "THE GOLD-BUG\n", 27 | "\n", 28 | " What ho! what ho! this fellow is dancing mad!\n", 29 | "\n", 30 | " He hath been b\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "with open(\"data/goldBug.txt\", \"r\") as f:\n", 36 | " goldBugString = f.read()\n", 37 | "print(goldBugString[:100])" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Let's tokenize!" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "['the', 'gold-bug', 'what', 'ho', '!', 'what', 'ho', '!', 'this', 'fellow']" 56 | ] 57 | }, 58 | "execution_count": 10, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "import nltk\n", 65 | "goldBugTokens = nltk.word_tokenize(goldBugString.lower())\n", 66 | "goldBugTokens[:10]" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "['the', 'what', 'ho', 'what', 'ho', 'this', 'fellow']\n", 79 | "['the', 'what', 'ho', 'what', 'ho', 'this', 'fellow']\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "filterTokens = []\n", 85 | "for word in goldBugTokens[:10]:\n", 86 | " if word.isalpha():\n", 87 | " filterTokens.append(word)\n", 88 | "print(filterTokens)\n", 89 | "\n", 90 | "print([word for word in goldBugTokens[:10] if word.isalpha()])" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "goldBugWords = [word for word in goldBugTokens if any([char for char in word if char.isalpha()])]" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "[('the', 877),\n", 111 | " ('of', 465),\n", 112 | " ('and', 359),\n", 113 | " ('i', 336),\n", 114 | " ('to', 329),\n", 115 | " ('a', 327),\n", 116 | " ('in', 238),\n", 117 | " ('it', 213),\n", 118 | " ('you', 162),\n", 119 | " ('was', 137)]" 120 | ] 121 | }, 122 | "execution_count": 38, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "wordFrequencies = nltk.FreqDist(goldBugWords)\n", 129 | "wordFrequencies.most_common(10)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135
| "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "stopwords = nltk.corpus.stopwords.words(\"English\")\n", 147 | "print(stopwords)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "[('upon', 81),\n", 159 | " ('de', 73),\n", 160 | " (\"'s\", 56),\n", 161 | " ('jupiter', 53),\n", 162 | " ('legrand', 47),\n", 163 | " ('one', 38),\n", 164 | " ('said', 35),\n", 165 | " ('well', 35),\n", 166 | " ('massa', 34),\n", 167 | " ('could', 33),\n", 168 | " ('bug', 32),\n", 169 | " ('skull', 29),\n", 170 | " ('parchment', 27),\n", 171 | " ('made', 25),\n", 172 | " ('tree', 25),\n", 173 | " ('first', 24),\n", 174 | " ('time', 24),\n", 175 | " ('two', 23),\n", 176 | " ('much', 23),\n", 177 | " ('us', 23)]" 178 | ] 179 | }, 180 | "execution_count": 43, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "goldBugFilteredWords = [word for word in goldBugWords if not word in stopwords]\n", 187 | "nltk.FreqDist(goldBugFilteredWords).most_common(20)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Python 3", 201 | "language": "python", 202 | "name": "python3" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.6.3" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 1 219 | } 220 | -------------------------------------------------------------------------------- /docs/start/index.md: -------------------------------------------------------------------------------- 1 | # Getting Started with the Art of Literary Text Analysis 2 | 3 | This is part of the [Art of Literary Text Mining](../) collection. This page is intended to briefly describe how to get started, particularly with Voyant Tools. 4 | 5 | So you want to do some text analysis, but where to start? 
Let's imagine that we have a favourite news source and we want to try to determine what's being discussed (without necessarily just reading the front page articles). You can do this with most websites and media outlets, but for the purposes of this example, let's say that we want to look at the Canadian Broadcasting Corporation (Canada's public Anglophone broadcaster) at [CBC.ca](https://cbc.ca). 6 | 7 | ## Voyant 8 | 9 | ![Voyant](../images/voyant48.png) In Voyant analyzing the contents of a URL is dead simple, all that needs to be done is to visit the main page [voyant-tools.org](https://voyant-tools.org) and paste in the URL of interest. We can also use the query parameters (part of the URL) to specify an input argument: 10 | 11 | [https://voyant-tools.org/?input=https://cbc.ca](https://voyant-tools.org/?corpus=9094634e2f37d5e29cf93431c4c7bb5a&input=https://www.cbc.ca) 12 | 13 | The full interface can show some interesting aspects, but even just the summary points out some interesting aspects. For instance, even though the CBC page is essentially a compilation of blocks linking to other pages we can see that our corpus contains only one document: 14 | 15 | 16 | 17 | We said we wouldn't read the page directly, but it is worth having a look at what exactly we caught when we cast the net over the URL. To do that, we could have a look at the [Reader](https://voyant-tools.org/docs/#!/guide/reader) tool in Voyant. 18 | 19 | 20 | 21 | What we see is that there's a main title on the page "CBC.ca - watch, listen, and discover with Canada's Public Broadcaster…" but there's also navigational items like "Skip to Main Content", "CBCMenu", and "Search". While there's nothing wrong with that necessarily, it may be misleading to think that the news is talking about search (and rescue, for instance), when we have a keyword that is really from the navigational elements of the page (sometimes called paratextual elements). Can we do better? 22 | 23 | DOM-model.svg We can, and the way we do that is to dive into an exploration of what's called the [Document Object Model](https://en.wikipedia.org/wiki/Document_Object_Model), that is, the hierarchical elements that are part of the tree of this web document. 24 | 25 | ### The DOM and CSS Selectors 26 | 27 | HTML is a markup language that starts with a root node or tag (usually <html>), then splits into a <head> and a <body>, each of which may have its own children nodes (or tags or text). Within the DOM there are also ways of identifying unique elements and group similar elements into a class of objects that share some characteristics. This is precisely the syntax that's used to add styling to pages using Cascading Stylesheets (CSS). 28 | 29 | | Examples | Type | Explanation | 30 | |-|-|-| 31 | | body, p | tag name selector | select every tag that is either <body> or <p> | 32 | | #mainsection | ID selector | select the unique element with matching ID, as in <div id="mainsection"> | 33 | | .insight | class selector | select all elements with matching class, as in <div class="insight"> | 34 | 35 | The syntax of CSS Selectors is actually [much more powerful](https://en.wikipedia.org/wiki/Cascading_Style_Sheets#Selector), but for now this will suffice. 36 | 37 | So, back to our CBC news page, how do we clean up the input a bit? 
We can explore the DOM in the browser using built-in tools, depending on your browser: 38 | 39 | * **Firefox** Menu ➤ Web Developer ➤ Toggle Tools, or Tools ➤ Web Developer ➤ Toggle Tools 40 | * **Chrome** More tools ➤ Developer tools 41 | * **Safari** Develop ➤ Show Web Inspector. If you can't see the Develop menu, go to Safari ➤ Preferences ➤ Advanced, and check the Show Develop menu in menu bar checkbox. 42 | 43 | In this case (as of writing of this document, though things may change of course), one reasonable choice would be to select either the tag "main" (if we believe there's just one) or the ID #content. 44 | 45 | DOM Inspect 46 | 47 | To experiment, use Voyant (preferably the Desktop version) and try different settings while consulting the [documentation for the HTML Corpus Creation](https://voyant-tools.org/docs/#!/guide/corpuscreator-section-html) as necessary. When starting at the landing page of Voyant, be sure to click on the options icon to open this dialog box: 48 | 49 | HTML Options 50 | 51 | ### Exercise 52 | 53 | Voyant allows you to define a corpus with multiple documents using the "Documents" field, even if the original content is in only just one file. Is there a CSS Selector that allows you to compile all of the individual story blocks as separate documents (not the full contents if you visit any one story, just the title and blurb shown on the main page)? 54 | 55 | ### Gotchas 56 | 57 | Most web pages are rendered from the HTML code that is sent from the server to the browser, but there are cases where the browser receives further instructions to fetch and generate parts of a page. Those interactively generated pages probably won't work with Voyant (and other similar systems) since it can only see the HTML that's initially sent, not the rest of the content that is fetched after the page has loaded. 58 | 59 | Voyant (and similar systems) can only work with the content that is fetched from the URL but in some cases you may be looking at priviledged content that you can see in your browser but that's invisible to the server (the contents of your Facebook page, for instance). Any URL sent to Voyant assumes that the content is open and essentially the same regardless of who is fetching the page. 60 | 61 | ## Jupyter Notebook 62 | 63 | ![Jupyter](../images/jupyter48.png) We can also see the DOM and CSS Selection at play in our Jupyter Notebook for [Getting Started](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingStarted.ipynb). The notebook walks through the steps of creating a new notebook and some basic Python syntax, but if you don't need that you can skip ahead to the [Fetch URL Example](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingStarted.ipynb#Fetch-URL-Example). 64 | 65 | -------------------------------------------------------------------------------- /ipynb/GettingSetup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting Setup: Installing Jupyter\n", 8 | "\n", 9 | "This notebook describes how to get setup with Jupyter (formerly iPython Notebooks). It's part of the [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb). 
In particular, we'll look at:\n", 10 | "\n", 11 | "* [Downloading and installing Jupyter](#Downloading-and-Installing-Jupyter-with-Anaconda)\n", 12 | "* [Launching Jupyter and creating a working directory](#Launching-Jupyter)\n", 13 | "* [Creating a notebook](#Creating-a-Notebook)\n", 14 | "* [Quitting Jupyter](#Quitting-Jupyter)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Downloading and Installing Jupyter with Anaconda\n", 22 | "\n", 23 | "[\"Anaconda](https://www.continuum.io/downloads) Setting up Jupyter is (usually) a smooth and painless process. The easiest and recommended option is to [download and install Anaconda](https://www.continuum.io/downloads), which is a freely available bundle that includes Python, Jupyter, and several other things that will be useful to us. It's *very important* for the purposes of our notebooks to select a version for [Mac OS X](https://www.continuum.io/downloads#_macosx), [Windows](https://www.continuum.io/downloads#_windows) or [Linux](https://www.continuum.io/downloads#_unix) of **Anaconda with Python 3.x** (not Python 2.x).\n", 24 | "\n", 25 | "Once the Anaconda 3.x installer program is downloaded you can click on the installer and follow the instructions (using the defaults will work just fine). If you encounter difficulties, you may want to consult the [Jupyter installation documentation](http://jupyter.readthedocs.org/en/latest/install.html)." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Launching Jupyter\n", 33 | "\n", 34 | "\"Anaconda Once you've installed Anaconda, the easiest way to launch Jupyter is to use the Anaconda Navigator (which you should be able to find in your Applications folder on Mac or your Programs menu on Windows).\n", 35 | "\n", 36 | "The Anaconda Navigator will present several applications to choose from, we'll click on the _Launch_ button of _notebook_ (Jupyter Notebook):\n", 37 | "\n", 38 | "\"Anaconda\n", 39 | "\n", 40 | "This should launch two more windows:\n", 41 | "\n", 42 | "1. A terminal window where the Jupyter server ([kernel](Glossary.ipynb#kernel \"The core computer program of the operating system which can control all system processes\")) is running (this will be used to quit Jupyter later), and\n", 43 | "1. A web browser window that shows a [directory tree](Glossary.ipynb#directorytree \"A tree like structure which represents the organization and hierachy of files within a directory\") of your computer's file system (starting at the default path of Jupyter).\n", 44 | "\n", 45 | "The default path on a Mac is the user's \"Home\" directory. We probably don't want to create Jupyter notebooks there, so we'll navigate to another directory (like \"Documents\") and create a new folder (like \"Notebooks\"). The location and names aren't important but we'll need to remember where our notebooks are for future use." 
46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Creating Folders\n", 53 | "To create a new folder or notebook you use the _New_ button in the directory browser window.\n", 54 | "\n", 55 | "\"Jupyter\n", 56 | "\n", 57 | "Creating a new folder gives a default name (like \"Untitled Folder\") but we can select the folder using the checkbox to the left and then click the rename button that appears before giving the folder a new name.\n", 58 | "\n", 59 | "\"Folder\n", 60 | "\n", 61 | "Now we have Jupyter running and we have a new folder for our notebooks, we're ready for the next step in [Getting Started](GettingStarted.ipynb). But just before that, let's look quickly at how we create a notebook and how we quit Jupyter." 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Creating a Notebook\n", 69 | "\n", 70 | "Now you can create your first notebook. Use the same _New_ menu and pull down to the **Python 3** under the Notebooks heading in the menu. This will create your first notebook. We will review this and how to use notebooks in the next notebook [Getting Started](GettingStarted.ipynb)." 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Quitting Jupyter\n", 78 | "\n", 79 | "The browser window that was opened by the Anaconda Launcher is just a regular window. If we'd been working on a notebook, we'd of course want to save our work before quitting. We don't need to do this for browser (directory) windows. We can close the browser window(s) created by Jupyter in the usual browser way. Then we have to shut down the server [kernel](Glossary.ipynb#kernel \"The core computer program of the operating system which can control all system processes\") (so that our computer doesn't waste memory resources). To do that we do the following:\n", 80 | "\n", 81 | "1. _Close and Halt_ any notebooks you have running by going to the _File_ menu of each running notebook,\n", 82 | "1. Switch to the terminal window that was opened by the launcher and hit Ctrl-c twice (keep your finger on the Control key and press the \"c\" twice), then\n", 83 | "1. Switch to the Anaconda Launcher application and quit it (just as you would any other application)." 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Next Steps\n", 91 | "\n", 92 | "Let's now proceed to [Getting Started](GettingStarted.ipynb)." 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "---\n", 100 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).\n", 101 | "
Created January 7, 2015 and last modified January 14, 2018 (Jupyter 5.0.0)" 102 | ] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.7.1" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 1 126 | } 127 | -------------------------------------------------------------------------------- /ipynb/utilities/SimpleSentimentAnalysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simple Sentiment Analysis\n", 8 | "\n", 9 | "This notebook shows how to analyze a collection of passages like Tweets for sentiment.\n", 10 | "\n", 11 | "This is based on Neal Caron's [An introduction to text analysis with Python, Part 1](http://nealcaren.web.unc.edu/an-introduction-to-text-analysis-with-python-part-1/).\n", 12 | "\n", 13 | "This Notebook shows how to analyze one tweet." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### Setting up our data\n", 21 | "\n", 22 | "Here we will define the data to test our positive and negative dictionaries." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 6, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "theTweet = \"No food is good food. Ha. I'm on a diet and the food is awful and lame.\"\n", 34 | "positive_words=['awesome','good','nice','super','fun','delightful']\n", 35 | "negative_words=['awful','lame','horrible','bad']" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 7, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "list" 47 | ] 48 | }, 49 | "execution_count": 7, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "type(positive_words)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Tokenizing the text\n", 63 | "\n", 64 | "Now we will tokenize the text." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 8, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "['no', 'food', 'is', 'good', 'food', 'ha', 'i', 'm', 'on', 'a']\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "import re\n", 82 | "theTokens = re.findall(r'\\b\\w[\\w-]*\\b', theTweet.lower())\n", 83 | "print(theTokens[:10])" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### Calculating postive words\n", 91 | "\n", 92 | "Now we will count the number of positive words." 
93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 14, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "1\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "numPosWords = 0\n", 110 | "for banana in theTokens:\n", 111 | " if banana in positive_words:\n", 112 | " numPosWords += 1\n", 113 | "print(numPosWords) " 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### Calculating negative words\n", 121 | "\n", 122 | "Now we will count the number of negative words." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "2\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "numNegWords = 0\n", 140 | "for word in theTokens:\n", 141 | " if word in negative_words:\n", 142 | " numNegWords += 1\n", 143 | "print(numNegWords) " 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 18, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "True" 155 | ] 156 | }, 157 | "execution_count": 18, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "v1 = \"0\"\n", 164 | "v2 = 0\n", 165 | "v3 = str(v2)\n", 166 | "v1 == v3" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### Calculating percentages\n", 174 | "\n", 175 | "Now we calculate the percentages of postive and negative." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 11, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Positive: 6% Negative: 11%\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "numWords = len(theTokens)\n", 193 | "percntPos = numPosWords / numWords\n", 194 | "percntNeg = numNegWords / numWords\n", 195 | "print(\"Positive: \" + \"{:.0%}\".format(percntPos) + \" Negative: \" + \"{:.0%}\".format(percntNeg))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Deciding if it is postive or negative\n", 203 | "\n", 204 | "We are going assume that a simple majority will define if the Tweet is positive or negative." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 12, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Negative 1:2\n", 217 | "\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "if numPosWords > numNegWords:\n", 223 | " print(\"Positive \" + str(numPosWords) + \":\" + str(numNegWords))\n", 224 | "elif numNegWords > numPosWords:\n", 225 | " print(\"Negative \" + str(numPosWords) + \":\" + str(numNegWords))\n", 226 | "elif numNegWords == numPosWords:\n", 227 | " print(\"Neither \" + str(numPosWords) + \":\" + str(numNegWords))\n", 228 | " \n", 229 | "print()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "## Next Steps" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "Let's try another utility example, this time looking at more [Complex Sentiment Analysis](ComplexSentimentAnalysis.ipynb)." 
249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "---\n", 256 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](../ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).
Created August 8, 2014 (Jupyter 4.2.1)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Python 3", 272 | "language": "python", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.6.3" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 1 290 | } 291 | -------------------------------------------------------------------------------- /ipynb/utilities/.ipynb_checkpoints/SimpleSentimentAnalysis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simple Sentiment Analysis\n", 8 | "\n", 9 | "This notebook shows how to analyze a collection of passages like Tweets for sentiment.\n", 10 | "\n", 11 | "This is based on Neal Caron's [An introduction to text analysis with Python, Part 1](http://nealcaren.web.unc.edu/an-introduction-to-text-analysis-with-python-part-1/).\n", 12 | "\n", 13 | "This Notebook shows how to analyze one tweet." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### Setting up our data\n", 21 | "\n", 22 | "Here we will define the data to test our positive and negative dictionaries." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 6, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "theTweet = \"No food is good food. Ha. I'm on a diet and the food is awful and lame.\"\n", 34 | "positive_words=['awesome','good','nice','super','fun','delightful']\n", 35 | "negative_words=['awful','lame','horrible','bad']" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 7, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "list" 47 | ] 48 | }, 49 | "execution_count": 7, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "type(positive_words)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Tokenizing the text\n", 63 | "\n", 64 | "Now we will tokenize the text." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 8, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "['no', 'food', 'is', 'good', 'food', 'ha', 'i', 'm', 'on', 'a']\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "import re\n", 82 | "theTokens = re.findall(r'\\b\\w[\\w-]*\\b', theTweet.lower())\n", 83 | "print(theTokens[:10])" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### Calculating postive words\n", 91 | "\n", 92 | "Now we will count the number of positive words." 
93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 14, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "1\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "numPosWords = 0\n", 110 | "for banana in theTokens:\n", 111 | " if banana in positive_words:\n", 112 | " numPosWords += 1\n", 113 | "print(numPosWords) " 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### Calculating negative words\n", 121 | "\n", 122 | "Now we will count the number of negative words." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "2\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "numNegWords = 0\n", 140 | "for word in theTokens:\n", 141 | " if word in negative_words:\n", 142 | " numNegWords += 1\n", 143 | "print(numNegWords) " 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 18, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "True" 155 | ] 156 | }, 157 | "execution_count": 18, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "v1 = \"0\"\n", 164 | "v2 = 0\n", 165 | "v3 = str(v2)\n", 166 | "v1 == v3" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### Calculating percentages\n", 174 | "\n", 175 | "Now we calculate the percentages of postive and negative." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 11, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Positive: 6% Negative: 11%\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "numWords = len(theTokens)\n", 193 | "percntPos = numPosWords / numWords\n", 194 | "percntNeg = numNegWords / numWords\n", 195 | "print(\"Positive: \" + \"{:.0%}\".format(percntPos) + \" Negative: \" + \"{:.0%}\".format(percntNeg))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Deciding if it is postive or negative\n", 203 | "\n", 204 | "We are going assume that a simple majority will define if the Tweet is positive or negative." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 12, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Negative 1:2\n", 217 | "\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "if numPosWords > numNegWords:\n", 223 | " print(\"Positive \" + str(numPosWords) + \":\" + str(numNegWords))\n", 224 | "elif numNegWords > numPosWords:\n", 225 | " print(\"Negative \" + str(numPosWords) + \":\" + str(numNegWords))\n", 226 | "elif numNegWords == numPosWords:\n", 227 | " print(\"Neither \" + str(numPosWords) + \":\" + str(numNegWords))\n", 228 | " \n", 229 | "print()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "## Next Steps" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "Let's try another utility example, this time looking at more [Complex Sentiment Analysis](ComplexSentimentAnalysis.ipynb)." 
249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "---\n", 256 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](../ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).
Created August 8, 2014 (Jupyter 4.2.1)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Python 3", 272 | "language": "python", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.6.3" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 1 290 | } 291 | -------------------------------------------------------------------------------- /ipynb/Converting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Converting with the Art of Literary Text Analysis\n", 8 | "\n", 9 | "Our objective here is to process a plain text file so that it is more suitable for analysis. In particular. we will take two _Godfather_ screenplays and remove the stage directions. Here are the steps:\n", 10 | "\n", 11 | "* fetch the two screenplays\n", 12 | "* extract the screenplay text from the files\n", 13 | "* remove the stage directions\n", 14 | "\n", 15 | "Since we're doing this for two files we will introduce the concept of reusable functions. We've used functions in Python, in this case we're defining our own functions for the first time and using them. The basic syntax is simple:\n", 16 | "\n", 17 | " def function_name(arguments):\n", 18 | " # processing\n", 19 | " # return a value (usually)\n", 20 | " \n", 21 | "We can start by defining our function to fetch a URL, building on the materials we saw with [Scraping](Scraping.ipynb)." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 64, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import urllib.request\n", 31 | "\n", 32 | "# this function simply fetches the contents of a URL\n", 33 | "def fetch(url):\n", 34 | " response = urllib.request.urlopen(url) # open for reading\n", 35 | " return response.read() # read and return" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 65, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "b'\\r\\nGodfather Script at IMSDb.\\r\\nCreated January 31, 2019 (Jupyter 5)." 220 | ] 221 | } 222 | ], 223 | "metadata": { 224 | "kernelspec": { 225 | "display_name": "Python 3", 226 | "language": "python", 227 | "name": "python3" 228 | }, 229 | "language_info": { 230 | "codemirror_mode": { 231 | "name": "ipython", 232 | "version": 3 233 | }, 234 | "file_extension": ".py", 235 | "mimetype": "text/x-python", 236 | "name": "python", 237 | "nbconvert_exporter": "python", 238 | "pygments_lexer": "ipython3", 239 | "version": "3.7.1" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 2 244 | } 245 | -------------------------------------------------------------------------------- /docs/collocate/index.md: -------------------------------------------------------------------------------- 1 | # Collocates with the Art of Literary Text Mining 2 | 3 | In the previous meta-guide we considered the nature of bits and bytes and strings and how we might fruitfully [count occurrences](../count/). 
This page shifts focus from finding and counting terms to considering the lexical context of terms. In other words, if we were to collect the terms that are in proximity to our keyword, what might we observe? 4 | 5 | ## Concordances 6 | 7 | Concordances are a very old "technology" or technique, reaching back at least to the 12th century, when theologians sought to better understand certain concepts by creating a type of extended index of the occurrences of terms. Even though concordances pre-date the computer by centuries, the digital makes it far easier to re-organize data. Imagine we have the following excerpt from Ursula LeGuin's _The Left Hand of Darkness_: 8 | 9 | Insofar as I love life, I love the hills of the Domain of Estre, but that sort of love does not have a boundary-line of hate. And beyond that, I am ignorant, I hope. 10 | 11 | This is one text, but now let's imagine that we want to generate a concordance where the keyword is "love", in other words, each occurrence of "love" with some context (three words) on each side: 12 | 13 | Insofar as I **love** life, I love 14 | love life, I **love** the hills of 15 | that sort of **love** does not have 16 | 17 | This, in certain ways, is a new text and we can consider it as such for counting – what are the top frequency terms in this new text? Our counting now has an additional layer of potential meaning: we are now focusing on terms that are related to our keyword. High frequency terms in this new text may be said to be related to the keyword, since they tend to occur together (as always, it's probably more useful if we remove the stopwords). In fact, our concordance might be more useful already if we skip stopwords: 18 | 19 | **love** life, love hills 20 | love life **love** hills Domain Estre 21 | hills Domain Estre **love** boundary-line hate ignorant 22 | 23 | This example demonstrates one of the possible dangers of simple concordancing: because of the proximity of two occurrences of "love", some of the same words are duplicated. We could resolve this with some additional coding, but in most cases, depending on the term of interest and the size of the context, it's rarely a problem. 24 | 25 | ## Collocates 26 | 27 | Collocates (we can see "co-located" in the name) are terms that appear within some pre-defined proximity. At some level any two terms in a text are collocates (this is sometimes referred to as bag-of-words, where all words are considered together regardless of position). But usually collocates are considered in a smaller window (often a single-digit number of terms in each direction). 28 | 29 | What do collocates tell us? In many cases probably not much. In novels we often see "he said" or "she said", so we'd expect "he" and "she" to be high-frequency collocates of "said" (likewise, "he" and "she" might have "said" as an important collocate). In practice "he" and "she" (and possibly "said") may be hidden from view because of a stoplist. 30 | 31 | ## Collocates in Voyant 32 | 33 | ![Voyant](../images/voyant48.png) Voyant has several tools that use collocate information. 34 | 35 | ### Links 36 | 37 | A first collocates-based tool is _Links_ (shown on the right), which is in a tab of the upper left-hand tool panel (where Cirrus is shown by default). When first opened, _Links_ selects three of the highest frequency terms (shown in blue boxes) and then fetches collocates of those terms (shown in the orange boxes). 38 | 39 | A line between two terms indicates a collocate relationship, in other words, those two terms occur together more often.
The thicker the line, the more frequent (relative to all collocate links shown) the collocation. This is a relatively complex visualization because it's showing multiple things, including: 40 | 41 | * highest frequency terms 42 | * collocates of those high frequency terms, indicated with lines 43 | * other collocate relationships indicated by lines (such as between orange boxes) 44 | 45 | This is a network graph in that it's showing the various relationships (by virtue of collocation) of both keywords and collocate words in the text. The trick is knowing the extent to which a connection between two words is more coincidental or more indicative of a potentially significant relationship. 46 | 47 | It's worth noting that when you click on a term in the _Links_ tool that term will likely appear in other tools, such as the _Reader_ and _Trends_. You can also click on the lines in _Links_ to initiate a search for places where the two terms at the ends of the line occur in proximity (a proximity search). 48 | 49 | For instance, I can delete the current terms in _Links_ (see the button near the bottom of the tool) and add the term "love" to the screen by searching and selecting it in the textbox. I can click on the term "love" multiple times to fetch additional occurrences. Then I can choose one of the collocates by clicking on the line that connects two terms of interest, such as "love" and "young". 50 | 51 | Although I can click on a word to fetch more collocates, sometimes it's useful to see many more collocates at once. That's possible in _Links_ for one keyword at a time: right-click or Ctrl-click on a term and select _Centralize_ from the menu that appears. That will place the keyword in the middle and show all the collocates that have been fetched (to some limit), ordered by frequency, in the periphery. To revert to the previous mode, right/ctrl-click and choose _Fetch Collocates_ from the menu. 52 | 53 | ### TermsBerry 54 | 55 | Another useful tool for exploring collocates in Voyant is _TermsBerry_, which can be found in the middle panel (top row) in the second tab. Although visually it's very different from _Links_, it also provides much of the same information. Whereas the default view in _Links_ shows data for the top 3 terms in the corpus (after the stoplist has been applied), _TermsBerry_ by default shows 75 of the top frequency words, so it's much denser with information. You can also click on the "Strategy" button at the bottom to determine how the initial seed words are shown: it can be "top terms" (by frequency) or "distinct terms" (higher frequency compared to other texts). 56 | 57 | 58 | Whereas _Links_ shows lines between words that collocate, _TermsBerry_ indicates collocates as you hover over different terms. If you hover over any term the background colour of the other terms will update, with darker items showing more frequent collocates (the count is visible under each term). In what ways does this tool make it easier or harder to study collocates? 59 | 60 | ### Collocates 61 | 62 | The final tool that we'll mention is a more classic presentation of data in tabular format. By default it shows several high frequency terms (the keyword in the _Term_ column) as well as several collocate forms. One benefit of the tabular view is that results can be organized by sorting columns: terms, term counts, collocates, collocate counts. The search is another powerful aspect of this tool, allowing you to work with one or more keyword terms at a time, and even to find collocates of phrases (what terms occur close to "love him", for instance?). Again, what are the pros and cons of this tool compared to the others? Are they complementary?
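Before heading into the notebooks, it may help to see what "collocates within a window" means in code. The following is a minimal sketch (not part of the original notebooks): it reuses the tokenizing pattern from the utility notebooks and counts the words appearing within five tokens of "love" in the excerpt quoted above; the keyword and window size are arbitrary choices for illustration.

```python
import re
from collections import Counter

# The excerpt from The Left Hand of Darkness quoted earlier on this page
text = ("Insofar as I love life, I love the hills of the Domain of Estre, "
        "but that sort of love does not have a boundary-line of hate. "
        "And beyond that, I am ignorant, I hope.")

# Same simple tokenization used in the utility notebooks
tokens = re.findall(r"\b\w[\w-]*\b", text.lower())

keyword, window = "love", 5  # look 5 tokens to the left and right of each "love"
collocates = Counter()
for i, token in enumerate(tokens):
    if token == keyword:
        context = tokens[max(0, i - window):i] + tokens[i + 1:i + window + 1]
        collocates.update(w for w in context if w != keyword)

print(collocates.most_common(10))
```

On a real corpus we would also apply a stoplist, as discussed above, so that words like "the" and "of" don't dominate the counts.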
63 | 64 | ## Collocates in Jupyter 65 | 66 | ![Jupyter](../images/jupyter48.png) For our exploration of collocates in Jupyter we'll follow a link into [Getting NLTK](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingNltk.ipynb) in the Art of Literary Text Mining with Jupyter. 67 | -------------------------------------------------------------------------------- /docs/convert/index.md: -------------------------------------------------------------------------------- 1 | # Format Conversion with the Art of Literary Text Analysis 2 | 3 | This is part of the [Art of Literary Text Mining](../) collection. This page is intended to briefly describe how to get started with format conversion, particularly with Jupyter Notebooks. 4 | 5 | ### Plain Text 6 | 7 | For better or for worse, the vast majority of text mining projects either start with plain text versions of the documents, or convert existing documents to plain text. There are some projects and some tools that make use of markup in the text during analysis, but they're relatively rare (we'll see some examples later in the guide). 8 | 9 | We identify three major kinds of conversion and pre-processing steps: 10 | 11 | 1. files are already in plain text but require some cleaning (to remove a license statement or regular page numbers, for instance) 12 | 1. files are in HTML or XML format in ways that are conducive to text extraction (as we've already seen) and other pre-processing (especially thanks to libraries like BeautifulSoup) 13 | 1. files are in some other format that may require special or manual handling, especially for binary formats like MS Word and PDF. 14 | 15 | For documents that are already in plain text, the easiest approach is often to make changes manually in the files or to use an application to make the same changes in multiple documents at a time (if the editor supports such functionality). 16 | 17 | Let's work through a real example: three draft screenplays for The Godfather movies, available from [IMSDb search](https://www.imsdb.com/search.php). If you type "Godfather" in the search you should get three hits: 18 | 19 | #### Search results for 'godfather' 20 | 21 | Godfather (1971-03 Draft) 22 | Written by Mario Puzo,Francis Ford Coppola 23 | Godfather Part II (1973-09 Draft) 24 | Written by Mario Puzo,Francis Ford Coppola 25 | Godfather Part III, The (1979-03 First draft) 26 | Written by Mario Puzo,Francis Ford Coppola 27 | 28 | Let's just work with two of the three scripts, the first two (the third has a slightly different format that confuses things somewhat, though it would be possible to use as well). Near the bottom of each script's page you'll find a link to _Read "Godfather" script_; from there we can compile links for each document: 29 | 30 | https://www.imsdb.com/scripts/Godfather.html 31 | https://www.imsdb.com/scripts/Godfather-Part-II.html 32 | 33 | Rather than save the HTML file to our hard drive, we will select the actual script text (starting with **_THE GODFATHER_** and going until **THE END**) and copy that entire section to the clipboard. 34 | 35 | Now we need a plain text editor.
Several very good ones exist, including [TextMate](https://macromates.com) for Mac and [Sublime](https://www.sublimetext.com) for multiple platforms, but we will use [Atom](https://atom.io), a relatively new kid on the block that has excellent GitHub integration. Start by downloading Atom, unzipping the download and placing the executable where you want it (I put it in my Applications folder). 36 | 37 | When Atom first opens you should open your LLCU-212 GitHub folder (any folder can be a project). Then from the _File_ menu select _Add Project Folder…_ and add a folder called "Godfather" (without the quotes). Finally you can select _New File_ from the _File_ menu, paste the screenplay into the document, and then save the file in the "Godfather" folder with the name "Godfather.txt". Next get the [second screenplay](https://www.imsdb.com/scripts/Godfather-Part-II.html), select the actual text, copy it to the clipboard, return to Atom, choose _New File_ from the _File_ menu, paste the contents, and save it in the "Godfather" folder as "Godfather2.txt". 38 | 39 | Without much fanfare we just demonstrated a simple but powerful mechanism for converting between formats. The original web page was in HTML and when we copied the screenplay into the clipboard it was still styled text (you can see this if you paste the same text into a styled editor like MS Word). However, when you paste HTML or styled text into a plain text editor you also convert your document to plain text. Needless to say, things like images will be lost, but in our case all we really need is the plain text, so this operation is suitable. 40 | 41 | If you keep the Godfather screenplay open you can see some layout particularities. Namely, stage directions are all preceded by a single tab (then other characters) whereas speeches are all preceded by a double tab (then other characters). 42 | 43 | Find in Project 44 | 45 | If we wanted to remove all the stage directions, one way to do so would be to select and remove all lines that have only a single tab. That's where regular expressions come in. 46 | 47 | [Regular Expressions](https://en.wikipedia.org/wiki/Regular_expression) are a powerful mechanism for matching not only visible characters, but also invisible characters (tabs, newlines, etc.), character classes (lowercase characters, digits), and a whole bunch of other things. We won't go deep into regular expressions here, but suffice it to introduce a few very common aspects of the syntax (a short example of these patterns in action follows the lists below): 48 | 49 | * **.**: any character 50 | * **\w**: any word character (a letter, digit or underscore) 51 | * **\d**: any digit (number) 52 | * **\t**: a tab character 53 | * **\n**: a newline character 54 | * **\s**: a whitespace character 55 | * **[aeiou]**: any of the characters enumerated 56 | * **[a-z]**: any character in the range 57 | * **[^aeiou]**: none of the characters mentioned 58 | * **^**: zero-length match at the start of a line 59 | * **$**: zero-length match at the end of a line 60 | * **\b**: zero-length match of a word boundary 61 | * **(one|two)**: any word between the pipes 62 | 63 | In addition, there are ways of repeating these forms: 64 | 65 | * **.\***: zero or more times 66 | * **.?**: zero or one time 67 | * **.+**: one or more times 68 | * **.{5}**: five times 69 | * **.{2,5}**: two to five times 70 |
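To make these patterns more concrete, here is a minimal Python sketch (not part of the original guide) that removes single-tab stage directions from a made-up screenplay fragment; the sample lines are invented for illustration, but the indentation scheme matches the one described above.

```python
import re

# A tiny screenplay-like sample: stage directions are indented with one tab,
# speeches (speaker names and dialogue) with two tabs.
sample = (
    "\tThe room is dark and quiet.\n"
    "\t\tMICHAEL\n"
    "\t\tI have to speak with my father.\n"
    "\tHe crosses to the door.\n"
)

# With re.MULTILINE, ^ anchors at the start of every line; \t matches a single
# tab and [^\t] requires that the next character is not another tab, so
# double-tab speech lines are left untouched.
no_directions = re.sub(r"^\t[^\t].*\n?", "", sample, flags=re.MULTILINE)
print(no_directions)
```

The Atom pattern described next makes the leading tab optional (`\t?`), so it also drops un-indented lines, and it performs the matching interactively across every file in the project folder.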
71 | Now that we have our two data files in place, we can demonstrate Atom's powerful search and replace capabilities. We will jump straight to replacing things in multiple files, but of course Atom also has a more conventional search and replace mechanism for the currently open file. Replacing across documents is powerful because it can be performed on one document or on hundreds at once; a type of automation (without programming). 72 | 73 | From the _File_ menu, select _Find in Project_ (on Mac the shortcut is Command-Shift-F). 74 | 75 | ![Find in Project](find-in-project.png) 76 | 77 | That will cause a dialog to appear near the bottom of the page. 78 | 79 | ![Find in Project](find-in-project-dialog.png) 80 | 81 | In the first box we have `^\t?[^\t].*`: 82 | 83 | * **^**: match at the beginning of the line 84 | * **\t?**: match zero or one tab characters 85 | * **[^\t]**: match anything except a tab character 86 | * **.\***: match until the end of the line 87 | 88 | We also add "Godfather" in the bottom box to ensure that the search and replace only happens in our new data directory. And presto! We have gotten rid of stage directions (assuming that's what we wanted). 89 | 90 | ## Jupyter Notebook 91 | 92 | ![Jupyter](../images/jupyter48.png) Using a friendly application like Atom is usually preferable and quicker than writing code ourselves, but there are times when having code is preferable, especially when the circumstances are more complex. Another reason to use code is that the code can be repeatedly re-run, whereas the steps taken in the application probably have to be repeated manually each time. 93 | 94 | We demonstrate a similar situation with the [Converting Jupyter notebook](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/Converting.ipynb). 95 | 96 | ## Voyant 97 | 98 | ![Voyant](../images/voyant48.png) The moment to do format conversion in Voyant is at the outset when one first creates a corpus. As we've seen previously, we can use powerful CSS Selectors and XML XPath expressions to determine which parts of a document should be used. There's even support for some simple filtering of plain text files. The real power of conversion is in Voyant's ability to read dozens of file formats, including PDF, MS Word, OpenOffice, Apple Pages, RTF, etc. 99 | 100 | Moreover, since it's possible to export a corpus in a variety of formats, one could think of Voyant as a conversion utility for a wide range of formats: upload files in weird and wonderful formats and then download the corpus as Voyant XML (minimal structural tagging) or plain text. The download button is located in the toolbar of the [Documents](https://voyant-tools.org/docs/#!/guide/documents) tool. 101 | -------------------------------------------------------------------------------- /ipynb/utilities/Concordances.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Generating Concordances\n", 8 | "\n", 9 | "This notebook shows how you can generate a concordance using lists." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "First we see what text files we have. 
" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "Hume Enquiry.txt negative.txt positive.txt\r\n", 29 | "Hume Treatise.txt obama_tweets.txt\r\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "ls *.txt" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "We are going to use the \"Hume Enquiry.txt\" from the Gutenberg Project. You can use whatever text you want. We print the first 50 characters to check." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "This string has 1344061 characters.\n", 54 | "The Project Gutenberg EBook of A Treatise of Human\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "theText2Use = \"Hume Treatise.txt\"\n", 60 | "with open(theText2Use, \"r\") as fileToRead:\n", 61 | " fileRead = fileToRead.read()\n", 62 | " \n", 63 | "print(\"This string has\", len(fileRead), \"characters.\")\n", 64 | "print(fileRead[:50])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Tokenization\n", 72 | "\n", 73 | "Now we tokenize the text producing a list called \"listOfTokens\" and check the first words. This eliminates punctuation and lowercases the words." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "['the', 'project', 'gutenberg', 'ebook', 'of', 'a', 'treatise', 'of', 'human', 'nature']\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "import re\n", 91 | "listOfTokens = re.findall(r'\\b\\w[\\w-]*\\b', fileRead.lower())\n", 92 | "print(listOfTokens[:10])" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Input\n", 100 | "\n", 101 | "Now we get the word you want a concordance for and the context wanted." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "What word do you want collocates for? truth\n", 114 | "How much context do you want? 10\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "word2find = input(\"What word do you want collocates for? \").lower() # Ask for the word to search for\n", 120 | "context = input(\"How much context do you want? 
\")# This asks for the context of words on either side to grab" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "str" 132 | ] 133 | }, 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "type(context)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "int" 152 | ] 153 | }, 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "contextInt = int(context)\n", 161 | "type(contextInt)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 9, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "228958" 173 | ] 174 | }, 175 | "execution_count": 9, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "len(listOfTokens)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Main function\n", 189 | "\n", 190 | "Here is the main function that does the work populating a new list with the lines of concordance. We check the first 5 concordance lines." 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 10, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "['220330: a reason why the faculty of recalling past ideas with truth and clearness should not have as much merit in it',\n", 202 | " '223214: confessing my errors and should esteem such a return to truth and reason to be more honourable than the most unerring',\n", 203 | " '223680: from the other this therefore being regarded as an undoubted truth that belief is nothing but a peculiar feeling different from',\n", 204 | " '224382: mind and he will evidently find this to be the truth secondly whatever may be the case with regard to this',\n", 205 | " '225925: by their different feeling i should have been nearer the truth end of project gutenberg s a treatise of human nature']" 206 | ] 207 | }, 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "def makeConc(word2conc,list2FindIn,context2Use,concList):\n", 215 | "\n", 216 | " end = len(list2FindIn)\n", 217 | " for location in range(end):\n", 218 | " if list2FindIn[location] == word2conc:\n", 219 | " # Here we check whether we are at the very beginning or end\n", 220 | " if (location - context2Use) < 0:\n", 221 | " beginCon = 0\n", 222 | " else:\n", 223 | " beginCon = location - context2Use\n", 224 | " \n", 225 | " if (location + context2Use) > end:\n", 226 | " endCon = end\n", 227 | " else:\n", 228 | " endCon = location + context2Use + 1\n", 229 | " \n", 230 | " theContext = (list2FindIn[beginCon:endCon])\n", 231 | " concordanceLine = ' '.join(theContext)\n", 232 | " # print(str(location) + \": \" + concordanceLine)\n", 233 | " concList.append(str(location) + \": \" + concordanceLine)\n", 234 | "\n", 235 | "theConc = []\n", 236 | "makeConc(word2find,listOfTokens,int(context),theConc)\n", 237 | "theConc[-5:]" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## Output\n", 245 | "\n", 246 | "Finally, we output to a text file." 
247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 11, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "Done\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "nameOfResults = word2find.capitalize() + \".Concordance.txt\"\n", 264 | "\n", 265 | "with open(nameOfResults, \"w\") as fileToWrite:\n", 266 | " for line in theConc:\n", 267 | " fileToWrite.write(line + \"\\n\")\n", 268 | " \n", 269 | "print(\"Done\")" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Here we check that the file was created." 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 12, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "Truth.Concordance.txt\r\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "ls *.Concordance.txt" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## Next Steps" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "Onwards to our final utility example [Exploring a text with NLTK](Exploring a text with NLTK.ipynb)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "---\n", 315 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](../ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).
Created September 30th, 2016 (Jupyter 4.2.1)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": true 323 | }, 324 | "outputs": [], 325 | "source": [] 326 | } 327 | ], 328 | "metadata": { 329 | "kernelspec": { 330 | "display_name": "Python 3", 331 | "language": "python", 332 | "name": "python3" 333 | }, 334 | "language_info": { 335 | "codemirror_mode": { 336 | "name": "ipython", 337 | "version": 3 338 | }, 339 | "file_extension": ".py", 340 | "mimetype": "text/x-python", 341 | "name": "python", 342 | "nbconvert_exporter": "python", 343 | "pygments_lexer": "ipython3", 344 | "version": "3.6.3" 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 1 349 | } 350 | -------------------------------------------------------------------------------- /ipynb/utilities/.ipynb_checkpoints/Concordances-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Generating Concordances\n", 8 | "\n", 9 | "This notebook shows how you can generate a concordance using lists." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "First we see what text files we have. " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "Hume Enquiry.txt negative.txt positive.txt\r\n", 29 | "Hume Treatise.txt obama_tweets.txt\r\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "ls *.txt" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "We are going to use the \"Hume Enquiry.txt\" from the Gutenberg Project. You can use whatever text you want. We print the first 50 characters to check." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "This string has 1344061 characters.\n", 54 | "The Project Gutenberg EBook of A Treatise of Human\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "theText2Use = \"Hume Treatise.txt\"\n", 60 | "with open(theText2Use, \"r\") as fileToRead:\n", 61 | " fileRead = fileToRead.read()\n", 62 | " \n", 63 | "print(\"This string has\", len(fileRead), \"characters.\")\n", 64 | "print(fileRead[:50])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Tokenization\n", 72 | "\n", 73 | "Now we tokenize the text producing a list called \"listOfTokens\" and check the first words. This eliminates punctuation and lowercases the words." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "['the', 'project', 'gutenberg', 'ebook', 'of', 'a', 'treatise', 'of', 'human', 'nature']\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "import re\n", 91 | "listOfTokens = re.findall(r'\\b\\w[\\w-]*\\b', fileRead.lower())\n", 92 | "print(listOfTokens[:10])" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Input\n", 100 | "\n", 101 | "Now we get the word you want a concordance for and the context wanted." 
102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "What word do you want collocates for? truth\n", 114 | "How much context do you want? 10\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "word2find = input(\"What word do you want collocates for? \").lower() # Ask for the word to search for\n", 120 | "context = input(\"How much context do you want? \")# This asks for the context of words on either side to grab" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "str" 132 | ] 133 | }, 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "type(context)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "int" 152 | ] 153 | }, 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "contextInt = int(context)\n", 161 | "type(contextInt)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 9, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "228958" 173 | ] 174 | }, 175 | "execution_count": 9, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "len(listOfTokens)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Main function\n", 189 | "\n", 190 | "Here is the main function that does the work populating a new list with the lines of concordance. We check the first 5 concordance lines." 
191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 10, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "['220330: a reason why the faculty of recalling past ideas with truth and clearness should not have as much merit in it',\n", 202 | " '223214: confessing my errors and should esteem such a return to truth and reason to be more honourable than the most unerring',\n", 203 | " '223680: from the other this therefore being regarded as an undoubted truth that belief is nothing but a peculiar feeling different from',\n", 204 | " '224382: mind and he will evidently find this to be the truth secondly whatever may be the case with regard to this',\n", 205 | " '225925: by their different feeling i should have been nearer the truth end of project gutenberg s a treatise of human nature']" 206 | ] 207 | }, 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "def makeConc(word2conc,list2FindIn,context2Use,concList):\n", 215 | "\n", 216 | " end = len(list2FindIn)\n", 217 | " for location in range(end):\n", 218 | " if list2FindIn[location] == word2conc:\n", 219 | " # Here we check whether we are at the very beginning or end\n", 220 | " if (location - context2Use) < 0:\n", 221 | " beginCon = 0\n", 222 | " else:\n", 223 | " beginCon = location - context2Use\n", 224 | " \n", 225 | " if (location + context2Use) > end:\n", 226 | " endCon = end\n", 227 | " else:\n", 228 | " endCon = location + context2Use + 1\n", 229 | " \n", 230 | " theContext = (list2FindIn[beginCon:endCon])\n", 231 | " concordanceLine = ' '.join(theContext)\n", 232 | " # print(str(location) + \": \" + concordanceLine)\n", 233 | " concList.append(str(location) + \": \" + concordanceLine)\n", 234 | "\n", 235 | "theConc = []\n", 236 | "makeConc(word2find,listOfTokens,int(context),theConc)\n", 237 | "theConc[-5:]" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## Output\n", 245 | "\n", 246 | "Finally, we output to a text file." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 11, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "Done\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "nameOfResults = word2find.capitalize() + \".Concordance.txt\"\n", 264 | "\n", 265 | "with open(nameOfResults, \"w\") as fileToWrite:\n", 266 | " for line in theConc:\n", 267 | " fileToWrite.write(line + \"\\n\")\n", 268 | " \n", 269 | "print(\"Done\")" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Here we check that the file was created." 
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 12, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "Truth.Concordance.txt\r\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "ls *.Concordance.txt" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## Next Steps" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "Onwards to our final utility example [Exploring a text with NLTK](Exploring a text with NLTK.ipynb)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "---\n", 315 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](../ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).
Created September 30th, 2016 (Jupyter 4.2.1)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": true 323 | }, 324 | "outputs": [], 325 | "source": [] 326 | } 327 | ], 328 | "metadata": { 329 | "kernelspec": { 330 | "display_name": "Python 3", 331 | "language": "python", 332 | "name": "python3" 333 | }, 334 | "language_info": { 335 | "codemirror_mode": { 336 | "name": "ipython", 337 | "version": 3 338 | }, 339 | "file_extension": ".py", 340 | "mimetype": "text/x-python", 341 | "name": "python", 342 | "nbconvert_exporter": "python", 343 | "pygments_lexer": "ipython3", 344 | "version": "3.6.3" 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 1 349 | } 350 | -------------------------------------------------------------------------------- /spiral/CharacteristicCurve.json: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "created": 1492037077579, 4 | "modified": 1492037077579, 5 | "version": 2 6 | }, 7 | "blocks": [ 8 | { 9 | "type": "text", 10 | "input": [ 11 | "

Mendenhall's Characteristic Curve (1887): Early Stylometrics

\n\n

In 188", 12 | "7 the polymath T. C. Mendenhall published an article in Science titled,", 13 | " \"The Characteristic Curves of Composition\" which is both one of the earliest ex", 14 | "amples of quantitative stylistics but also one of the first studies to present t", 15 | "ext visualizations based on the (manual) count of words. Mendenhall thought that", 16 | " different authors would have distinctive curves of word length frequencies whic", 17 | "h could help with authorship attribution.

\n\n

Here you can see an example of", 18 | " the characteristic curve of Oliver Twist. Mendenhall took the first 10", 19 | "00 words, counted the length in characters of these 1000 words and then graphed ", 20 | "the number of words of each length. Thus one can see that there is just under 50", 21 | " words of one letter length in the first one thousand words.

\n\n

\"M",

\n\n

Mendenhall thought this method of analy", 25 | "sis would help with the \"identification or discrimination of authorship\" or auth", 26 | "orship attribution as we call it today. Let's see if we can recapitulate his tec", 27 | "hnique here.
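As an aside (not part of the original Spyral notebook), Mendenhall's manual procedure can be sketched in a few lines of Python: tally how many of the first 1,000 words have each character length. The sketch assumes a plain-text copy of the novel saved locally under the hypothetical filename OliverTwist.txt and uses a deliberately simple regular-expression tokenizer.

```python
# A rough sketch of Mendenhall's procedure (illustration only, not part of
# the original notebook). "OliverTwist.txt" is a hypothetical local filename
# for a plain-text copy of the novel.
import re
from collections import Counter

with open("OliverTwist.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Very simple tokenization: lowercase the text and keep alphabetic runs only
tokens = re.findall(r"[a-z]+", text.lower())

# Mendenhall only counted the first 1,000 words
first_thousand = tokens[:1000]

# Tally how many words there are of each character length
length_counts = Counter(len(word) for word in first_thousand)

for length in sorted(length_counts):
    print(length, length_counts[length])
```

Plotted, those tallies give the same kind of curve that Mendenhall drew by hand.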

\n\n

Acquiring the Text

\n\n

We'll begin by fetching the ed", 28 | "ition of Oliver Twist that's ", 30 | "available from the Gutenberg Project", 32 | ". The code block below uses the loadCorpus function. The first time it was run wit", 35 | "hout the corpus option, and then the corpus ID was added for future runs.
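For readers who want to see this step outside of Spyral, here is a rough Python sketch of the fetch-and-trim operation performed by the code block below; it uses the same gist URL and the same start and end markers as the inputRemoveUntil and inputRemoveFromAfter options, relies only on the standard library, and skips error handling.

```python
# Rough Python equivalent (illustration only) of the fetch-and-trim step:
# download the Gutenberg text and keep just the body of the novel.
from urllib.request import urlopen

url = ("https://gist.githubusercontent.com/sgsinclair/"
       "f895f2b37cdee761ac08e4ed8cc83d58/raw/CharlesDickens-OliverTwist.txt?1")

with urlopen(url) as response:
    text = response.read().decode("utf-8")

# Same markers as the Spyral options: drop everything before the first
# chapter heading and everything after the closing words of the novel.
start = text.find("CHAPTER I")
end = text.find("weak and erring.") + len("weak and erring.")
body = text[start:end]

print(len(body), "characters kept")
```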

\n" 36 | ] 37 | }, 38 | { 39 | "type": "code", 40 | "input": [ 41 | "new Corpus({", 42 | " input: 'https://gist.githubusercontent.com/sgsinclair/f895f2b37cdee761ac08e4ed8cc83d58/raw/CharlesDickens-OliverTwist.txt?1',", 43 | " inputRemoveUntil: \"CHAPTER I\",", 44 | " inputRemoveFromAfter: \"weak and erring.\"", 45 | "}).assign(\"corpus\").show();" 46 | ], 47 | "output": [ 48 | "
This corpus has 1 document with 159,006 total words and 10,438 un", 52 | "ique word forms. Created about 3 hours ago.
" 54 | ] 55 | }, 56 | { 57 | "type": "text", 58 | "input": [ 59 | "

The corpus has nearly 160,000 words, but recall that Mendenhall only con", 60 | "sidered the first 1,000 words. We can do the same by calling the loadTokens meth", 61 | "od on our corpus and specifying arguments that limit the call to 1,000 word toke", 62 | "ns while skipping non-word tokens.

\n" 63 | ] 64 | }, 65 | { 66 | "type": "code", 67 | "input": "corpus.loadTokens({limit: 1000, noOthers: true}).assign(\"wordsStore\").show();", 68 | "output": [ 69 | "
This store contains 1000 items with these fields: id, docId, ", 70 | "docIndex, token, rawFreq, tokenType, position, startOffset, endOffset.
" 71 | ] 72 | }, 73 | { 74 | "type": "text", 75 | "input": [ 76 | "

We have 1,000 terms but each one has far more fields than we need, we're only", 77 | " interested in the word length of the term. So we'll create a table where we inc", 78 | "rement the value in first column (zero-based) where the row represent the term l", 79 | "ength – this uses the updateCell function from the t", 83 | "able. Finally we use the embed function to view the table as a VoyantChart.

\n" 88 | ] 89 | }, 90 | { 91 | "type": "code", 92 | "input": [ 93 | "var table = new VoyantTable()", 94 | "wordsStore.each(function(word) {", 95 | " table.updateCell(word.getTerm().length, 0, 1);", 96 | "});", 97 | "table.embed(\"VoyantChart\", {series: {showMarkers: false}, axes: [{grid: true, title: \"Word Length\"}, {grid: true, title: \"Word Count\"}], width: 500})" 98 | ], 99 | "output": [ 100 | "
" 102 | ] 103 | }, 104 | { 105 | "type": "text", 106 | "input": [ 107 | "

If we compare to Mendenhall's graph above, that seems pretty close! It's worth", 108 | " noting that Mendenhall doesn't specify what exactly was counted, such as chapte", 109 | "r titles (which might account for some slight variation).

\n\n

But Mendenhall ", 110 | "was counting terms by hand – can we do better? Let's generate a similar chart bu", 111 | "t now consider all terms, not just the first 1,000.
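In plain Python (an aside, not part of the original notebook), the full-corpus tally that the next block performs with loadCorpusTerms and updateCell amounts to weighting each distinct term's length by its raw frequency. A minimal sketch, using a tiny inline sample so that it runs on its own:

```python
# Minimal sketch of the full-corpus tally (illustration only): weight each
# distinct term's length by how often the term occurs. In practice `text`
# would be the full novel rather than this tiny sample.
import re
from collections import Counter

text = "the boy asked for more and the master was amazed at the boy"
tokens = re.findall(r"[a-z]+", text.lower())

term_freqs = Counter(tokens)              # raw frequency of each distinct term

length_totals = Counter()
for term, freq in term_freqs.items():
    # mirrors updateCell(term length, 0, raw frequency) in the block below
    length_totals[len(term)] += freq

for length in sorted(length_totals):
    print(length, length_totals[length])
```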

\n" 112 | ] 113 | }, 114 | { 115 | "type": "code", 116 | "input": [ 117 | "var oliverTwistLengths;", 118 | "corpus.loadCorpusTerms().then(function(corpusTerms) {", 119 | " oliverTwistLengths = new VoyantTable();", 120 | " corpusTerms.each(function(corpusTerm) {", 121 | " oliverTwistLengths.updateCell(corpusTerm.getTerm().length, 0, corpusTerm.getRawFreq());", 122 | " });", 123 | " oliverTwistLengths.embed('voyantchart', {width: 500});", 124 | "});" 125 | ], 126 | "output": [ 127 | "
" 129 | ] 130 | }, 131 | { 132 | "type": "text", 133 | "input": [ 134 | "

Overall we have an impression that the line gets smoother, which isn't surpri", 135 | "sing given that we have more data points. The big question is whether the smooth", 136 | "ing actually makes the line less characteristic, which would somewhat contradict", 137 | " Mendenhall's original hypothesis that every author has a characteristic curve. Let", 138 | "'s compare this with Austen's Emma which has about the same number of ter", 139 | "ms. Emma is the sixth document in the corpus, so we can ", 140 | "access it at index 5 (index is zero-based). 

\n" 141 | ] 142 | }, 143 | { 144 | "type": "code", 145 | "input": [ 146 | "var emma;", 147 | "new Corpus(\"austen\").then(function(corpus) {", 148 | " emma = corpus.getDocument(5);", 149 | " emma.show()", 150 | "})" 151 | ], 152 | "output": "
1815 Emma
" 153 | }, 154 | { 155 | "type": "text", 156 | "input": [ 157 | "

Now we'll calculate document term lengths for Emma almost ident", 158 | "ically to how we calculated corpus term lengths for Oliver Twist. Final", 159 | "ly, we'll chart this too.

\n" 160 | ] 161 | }, 162 | { 163 | "type": "code", 164 | "input": [ 165 | "emma.loadDocumentTerms().then(function(documentTerms) {", 166 | " emmaLengths = new VoyantTable();", 167 | " documentTerms.each(function(documentTerm) {", 168 | " emmaLengths.updateCell(documentTerm.getTerm().length, 0, documentTerm.getRawFreq()); ", 169 | " });", 170 | " ", 171 | " // embed both word length tables", 172 | " embed([oliverTwistLengths,'voyantchart',{", 173 | " width: 500,", 174 | " title: \"Word Lengths in Oliver Twist\"", 175 | " }],[emmaLengths,'voyantchart',{", 176 | " width: 500,", 177 | " title: \"Word Lengths in Emma\"", 178 | " }]);", 179 | "});", 180 | "" 181 | ], 182 | "output": [ 183 | "
" 187 | ] 188 | }, 189 | { 190 | "type": "text", 191 | "input": [ 192 | "

These do seem different; among other things, the peak has different angle", 193 | "s and the middle is more jagged in Emma. We can't help but wonder if Mendenhall was ", 194 | "seeing larger differences with 1,000-word segments though, which would lead him ", 195 | "to over-estimate how distinctive an author's characteristic curve would be.
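One way to make that kind of comparison a little fairer (a suggestion, not something the original notebook does) is to convert the raw counts to proportions before comparing the two curves, since two texts rarely contain exactly the same number of words. A minimal sketch, with hypothetical counts standing in for the real tallies:

```python
# Hedged follow-up sketch: compare characteristic curves as proportions
# rather than raw counts. The numbers below are hypothetical placeholders
# for the word-length tallies built earlier.
from collections import Counter

oliver_lengths = Counter({1: 47, 2: 170, 3: 235, 4: 200, 5: 120, 6: 80})
emma_lengths = Counter({1: 40, 2: 160, 3: 250, 4: 190, 5: 130, 6: 75})

def proportions(length_counts):
    """Convert raw word-length counts into proportions of the total."""
    total = sum(length_counts.values())
    return {length: count / total for length, count in length_counts.items()}

oliver_p = proportions(oliver_lengths)
emma_p = proportions(emma_lengths)

for length in sorted(set(oliver_p) | set(emma_p)):
    print(length, round(oliver_p.get(length, 0), 3), round(emma_p.get(length, 0), 3))
```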

\n" 196 | ] 197 | } 198 | ] 199 | } -------------------------------------------------------------------------------- /assets/css/style.scss: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | 4 | @import "{{ site.theme }}"; 5 | 6 | @charset "UTF-8"; 7 | 8 | /* Import ET Book styles 9 | adapted from https://github.com/edwardtufte/et-book/blob/gh-pages/et-book.css */ 10 | 11 | @font-face { font-family: "et-book"; 12 | src: url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot"); 13 | src: url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot?#iefix") format("embedded-opentype"), url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.woff") format("woff"), url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.ttf") format("truetype"), url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.svg#etbookromanosf") format("svg"); 14 | font-weight: normal; 15 | font-style: normal; } 16 | 17 | @font-face { font-family: "et-book"; 18 | src: url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot"); 19 | src: url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot?#iefix") format("embedded-opentype"), url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.woff") format("woff"), url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.ttf") format("truetype"), url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.svg#etbookromanosf") format("svg"); 20 | font-weight: normal; 21 | font-style: italic; } 22 | 23 | @font-face { font-family: "et-book"; 24 | src: url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot"); 25 | src: url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot?#iefix") format("embedded-opentype"), url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.woff") format("woff"), url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.ttf") format("truetype"), url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.svg#etbookromanosf") format("svg"); 26 | font-weight: bold; 27 | font-style: normal; } 28 | 29 | @font-face { font-family: "et-book-roman-old-style"; 30 | src: url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot"); 31 | src: url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot?#iefix") format("embedded-opentype"), url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.woff") format("woff"), url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.ttf") format("truetype"), url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.svg#etbookromanosf") format("svg"); 32 | font-weight: normal; 33 | font-style: normal; } 34 | 35 | /* Tufte CSS styles */ 36 | html { font-size: 15px; } 37 | 38 | body { width: 87.5%; 39 | margin-left: auto; 40 | margin-right: auto; 41 | padding-left: 12.5%; 42 | font-family: et-book, Palatino, "Palatino Linotype", "Palatino LT STD", "Book Antiqua", Georgia, serif; 43 | background-color: #fffff8; 44 | color: #111; 45 | max-width: 1400px; 46 | counter-reset: sidenote-counter; } 47 | 48 | h1 { font-weight: 400; 49 | margin-top: 4rem; 50 | margin-bottom: 1.5rem; 51 | font-size: 3.2rem; 52 | line-height: 1; } 53 | 54 | h2 { font-style: italic; 55 | font-weight: 400; 56 | 
margin-top: 2.1rem; 57 | margin-bottom: 1.4rem; 58 | font-size: 2.2rem; 59 | line-height: 1; } 60 | 61 | h3 { font-style: italic; 62 | font-weight: 400; 63 | font-size: 1.7rem; 64 | margin-top: 2rem; 65 | margin-bottom: 1.4rem; 66 | line-height: 1; } 67 | 68 | hr { display: block; 69 | height: 1px; 70 | width: 55%; 71 | border: 0; 72 | border-top: 1px solid #ccc; 73 | margin: 1em 0; 74 | padding: 0; } 75 | 76 | p.subtitle { font-style: italic; 77 | margin-top: 1rem; 78 | margin-bottom: 1rem; 79 | font-size: 1.8rem; 80 | display: block; 81 | line-height: 1; } 82 | 83 | .numeral { font-family: et-book-roman-old-style; } 84 | 85 | .danger { color: red; } 86 | 87 | article { position: relative; 88 | padding: 5rem 0rem; } 89 | 90 | section { padding-top: 1rem; 91 | padding-bottom: 1rem; } 92 | 93 | p, ol, ul { font-size: 1.4rem; 94 | line-height: 2rem; } 95 | 96 | p { margin-top: 1.4rem; 97 | margin-bottom: 1.4rem; 98 | padding-right: 0; 99 | vertical-align: baseline; } 100 | 101 | /* Chapter Epigraphs */ 102 | div.epigraph { margin: 5em 0; } 103 | 104 | div.epigraph > blockquote { margin-top: 3em; 105 | margin-bottom: 3em; } 106 | 107 | div.epigraph > blockquote, div.epigraph > blockquote > p { font-style: italic; } 108 | 109 | div.epigraph > blockquote > footer { font-style: normal; } 110 | 111 | div.epigraph > blockquote > footer > cite { font-style: italic; } 112 | /* end chapter epigraphs styles */ 113 | 114 | blockquote { font-size: 1.4rem; } 115 | 116 | blockquote p { width: 55%; 117 | margin-right: 40px; } 118 | 119 | blockquote footer { width: 55%; 120 | font-size: 1.1rem; 121 | text-align: right; } 122 | 123 | section > p, section > footer, section > table { width: 55%; } 124 | 125 | /* 50 + 5 == 55, to be the same width as paragraph */ 126 | section > ol, section > ul { width: 50%; 127 | -webkit-padding-start: 5%; } 128 | 129 | li:not(:first-child) { margin-top: 0.25rem; } 130 | 131 | figure { padding: 0; 132 | border: 0; 133 | font-size: 100%; 134 | font: inherit; 135 | vertical-align: baseline; 136 | max-width: 55%; 137 | -webkit-margin-start: 0; 138 | -webkit-margin-end: 0; 139 | margin: 0 0 3em 0; } 140 | 141 | figcaption { float: right; 142 | clear: right; 143 | margin-top: 0; 144 | margin-bottom: 0; 145 | font-size: 1.1rem; 146 | line-height: 1.6; 147 | vertical-align: baseline; 148 | position: relative; 149 | max-width: 40%; } 150 | 151 | figure.fullwidth figcaption { margin-right: 24%; } 152 | 153 | /* Links: replicate underline that clears descenders */ 154 | a:link, a:visited { color: inherit; } 155 | 156 | a:link { text-decoration: none; 157 | background: -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#333, #333); 158 | background: linear-gradient(#fffff8, #fffff8), linear-gradient(#fffff8, #fffff8), linear-gradient(#333, #333); 159 | -webkit-background-size: 0.05em 1px, 0.05em 1px, 1px 1px; 160 | -moz-background-size: 0.05em 1px, 0.05em 1px, 1px 1px; 161 | background-size: 0.05em 1px, 0.05em 1px, 1px 1px; 162 | background-repeat: no-repeat, no-repeat, repeat-x; 163 | text-shadow: 0.03em 0 #fffff8, -0.03em 0 #fffff8, 0 0.03em #fffff8, 0 -0.03em #fffff8, 0.06em 0 #fffff8, -0.06em 0 #fffff8, 0.09em 0 #fffff8, -0.09em 0 #fffff8, 0.12em 0 #fffff8, -0.12em 0 #fffff8, 0.15em 0 #fffff8, -0.15em 0 #fffff8; 164 | background-position: 0% 93%, 100% 93%, 0% 93%; } 165 | 166 | @media screen and (-webkit-min-device-pixel-ratio: 0) { a:link { background-position-y: 87%, 87%, 87%; } } 167 | 168 | a:link::selection { 
text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe; 169 | background: #b4d5fe; } 170 | 171 | a:link::-moz-selection { text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe; 172 | background: #b4d5fe; } 173 | 174 | /* Sidenotes, margin notes, figures, captions */ 175 | img { max-width: 100%; } 176 | 177 | .sidenote, .marginnote { float: right; 178 | clear: right; 179 | margin-right: -60%; 180 | width: 50%; 181 | margin-top: 0; 182 | margin-bottom: 0; 183 | font-size: 1.1rem; 184 | line-height: 1.3; 185 | vertical-align: baseline; 186 | position: relative; } 187 | 188 | .sidenote-number { counter-increment: sidenote-counter; } 189 | 190 | .sidenote-number:after, .sidenote:before { font-family: et-book-roman-old-style; 191 | position: relative; 192 | vertical-align: baseline; } 193 | 194 | .sidenote-number:after { content: counter(sidenote-counter); 195 | font-size: 1rem; 196 | top: -0.5rem; 197 | left: 0.1rem; } 198 | 199 | .sidenote:before { content: counter(sidenote-counter) " "; 200 | font-size: 1rem; 201 | top: -0.5rem; } 202 | 203 | blockquote .sidenote, blockquote .marginnote { margin-right: -82%; 204 | min-width: 59%; 205 | text-align: left; } 206 | 207 | div.fullwidth, table.fullwidth { width: 100%; } 208 | 209 | div.table-wrapper { overflow-x: auto; 210 | font-family: "Trebuchet MS", "Gill Sans", "Gill Sans MT", sans-serif; } 211 | 212 | .sans { font-family: "Gill Sans", "Gill Sans MT", Calibri, sans-serif; 213 | letter-spacing: .03em; } 214 | 215 | code { font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace; 216 | font-size: 1.0rem; 217 | line-height: 1.42; } 218 | 219 | .sans > code { font-size: 1.2rem; } 220 | 221 | h1 > code, h2 > code, h3 > code { font-size: 0.80em; } 222 | 223 | .marginnote > code, .sidenote > code { font-size: 1rem; } 224 | 225 | pre.code { font-size: 0.9rem; 226 | width: 52.5%; 227 | margin-left: 2.5%; 228 | overflow-x: auto; } 229 | 230 | pre.code.fullwidth { width: 90%; } 231 | 232 | .fullwidth { max-width: 90%; 233 | clear:both; } 234 | 235 | span.newthought { font-variant: small-caps; 236 | font-size: 1.2em; } 237 | 238 | input.margin-toggle { display: none; } 239 | 240 | label.sidenote-number { display: inline; } 241 | 242 | label.margin-toggle:not(.sidenote-number) { display: none; } 243 | 244 | .iframe-wrapper { position: relative; 245 | padding-bottom: 56.25%; /* 16:9 */ 246 | padding-top: 25px; 247 | height: 0; } 248 | 249 | .iframe-wrapper iframe { position: absolute; 250 | top: 0; 251 | left: 0; 252 | width: 100%; 253 | height: 100%; } 254 | 255 | @media (max-width: 760px) { body { width: 84%; 256 | padding-left: 8%; 257 | padding-right: 8%; } 258 | hr, section > p, section > footer, section > table { width: 100%; } 259 | pre.code { width: 97%; } 260 | section > ol { width: 90%; } 261 | section > ul { width: 90%; } 262 | figure { max-width: 90%; } 263 | figcaption, figure.fullwidth figcaption { margin-right: 0%; 264 | max-width: none; } 265 | blockquote { margin-left: 1.5em; 266 | margin-right: 0em; } 267 | blockquote p, blockquote footer { width: 100%; } 268 | label.margin-toggle:not(.sidenote-number) { display: inline; } 269 | .sidenote, .marginnote { display: none; } 270 | 
.margin-toggle:checked + .sidenote, 271 | .margin-toggle:checked + .marginnote { display: block; 272 | float: left; 273 | left: 1rem; 274 | clear: both; 275 | width: 95%; 276 | margin: 1rem 2.5%; 277 | vertical-align: baseline; 278 | position: relative; } 279 | label { cursor: pointer; } 280 | div.table-wrapper, table { width: 85%; } 281 | img { width: 100%; } } 282 | -------------------------------------------------------------------------------- /docs/count/index.md: -------------------------------------------------------------------------------- 1 | # Counting with the Art of Literary Text Analysis 2 | 3 | If you've been following along this [guide series](../) we've now looked at various basic concepts involved in building a corpus, including web scraping and pre-processing texts for things like cleanup and format conversion. We have our texts, now what? 4 | 5 | One of the simplest but most significant tasks that we can do with a textual corpus is to count various occurrences. We can do this for its own purpose – for instance if we want to find a sequence of characters or if we want to know how many times a given phrase appears – but counting is also an analytic primitive that is part of many other more sophisticated tasks, such as distribution analysis, finding similar documents, and countless other operations. 6 | 7 | ## Counting with Voyant 8 | 9 | ![Voyant](../images/voyant48.png) We are going to visit many of the core concepts of counting with Voyant Tools, in large part because the functionality is easily accessible, which will allow us to focus on the concepts. 10 | 11 | To fully understand counting of text it's useful to revisit how computers encode and process data, and text in particular. As is commonly known (though perhaps not fully understood), computers store information in a binary format, which essentially means that everything is based on a system of choices between two values, namely zero and one (that in turn can be used by a computer transister to send either a low or high current of electricity, also a binary state. 12 | 13 | If I have one column with which to store data, I have two possible values: zero or one (black or white, heads or tails, etc.). If I have two columns, I now have 4 different possibilities (00, 01, 10, 11), I can multiply two for each column I have to determine the number of possibilities. 14 | 15 | | bits | possibilities | equation | exponent | example | 16 | |-|-|-|-|-| 17 | | 1 | 2 | 2x1 | 21 | 0 | 18 | | 2 | 4 | 2x2 | 22 | 01 | 19 | | 3 | 8 | 2x2x2 | 23 | 010 | 20 | | 4 | 16 | 2x2x2x2 | 24 | 0101 | 21 | | 5 | 32 | 2x2x2x2x2 | 25 | 01010 22 | | 6 | 64 | 2x2x2x2x2x2 | 26 | 010101 23 | | 7 | 128 | 2x2x2x2x2x2 | 27 | 0101010 24 | | 8 | 256 | 2x2x2x2x2x2x2 | 28 | 01010101 25 | 26 | As we can see the number of "bits" (left column) corresponds with the number of digits (or columns), as shown in the right column ("example"). We haven't explained how to decipher binary into comprehensible information, but we have explained the basics of how binary and bits work. 27 | 28 | If I'm trying to represent heads or tails I only need one bit (with two possibilities). If I need to represent the 26 letters of the alphabet (in lowercase) I need at least 5 bits (with 32 possibilities. If I want upper and lowercase characters as well as punctuation and so on, I need even more bits. It has become standard to work with units of 8 bits, also called one byte (with 256 possibilities). 
When we hear 8-bit that's what is being said: that there are 256 different possibilities (such as a gif image that can have up to 256 different colours). 29 | 30 | 1 byte (8 bits) is plenty to represent texts using our English alphabet and even accented characters like "é" or "ñ", but woefully insufficient for other languages like Mandarin with its roughly 50,000 ideogram characters (here's a mini exercise: how many bits are needed to represent that many possibilities?). For the past couple of decades the dominant standard for encoding text has been Unicode. Plain texts with our alphabet typically use UTF-8, where the 8 indicates 8 bits, but it's also possible to have up to UTF-32 (32 bits or 4 bytes, or over 4 billion possibilities). It can be useful to know that UTF-16, for instance, is composed of characters that span two bytes – in other words, the byte is still the core unit of encoding. Occasionally one might see a file or web page that has strange characters in it; sometimes that is because two-byte characters are being interpreted incorrectly as one-byte characters (or vice versa), and it should be possible to fix that by re-opening the file with the correct character encoding. 31 | 32 | So text is encoded in bits and bytes. When we ask the computer to find text, or a string sequence, we're asking it to find a matching set of bytes. Counting is similar: it's a matter of seeing how many times the byte sequence occurs. But it can also lead to surprising results. Imagine we are searching for the text "dog": without further instructions we might also inadvertently match the word "dogs" (which may be desirable) but also the word "dogmatic" (which probably isn't, unless we're reading the French comic book _Asterix_ in translation, where the dog is named "Idéfix" in French and "Dogmatix" in English, surely one of the most inspired translations in history). 33 | 34 | Some systems, like Voyant, go through a process of tokenization, which means trying to identify (and then count) words. But even the concept of word is slippery and contextual. For instance, is "don't" one word or two ("don" and "t" – or should it be modified to "do" and "not")? Is "computer-assisted" one or two words? What about hyphenated proper names? In some cases we can delay choosing how to treat such words, in other cases (like Voyant) the decision must be made when creating the corpus (see the [tokenization](https://voyant-tools.org/docs/#!/guide/corpuscreator-section-tokenization) options in Voyant). 35 | 36 | So, after these brief digressions into character encoding and tokenization we can now dive into working in Voyant. If you haven't already followed the [Getting Started](https://voyant-tools.org/docs/#!/guide/start) in Voyant guide, you're *strongly* encouraged to do so. That guide is quick; if you want a deeper introduction to Voyant, it would also be well worth following the [Voyant tutorial](https://voyant-tools.org/docs/#!/guide/tutorial). 37 | 38 | Counting is a key part of the default view of Voyant; in some ways every tool of the main interface uses counting of words. 39 | 40 | 41 | 42 | The Cirrus (word-cloud) tool is about term frequency (position the cursor over terms to see their frequency). Clicking on a term in Cirrus also shows frequency information in the upper middle Reader tool. In the upper-right is the Trends tool, which is a combination of counting and distribution.
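Before finishing the tour of the default view, here is a quick Python aside (an addition to this guide, using only the standard library) that makes the points above concrete: how many bits are needed for roughly 50,000 distinct characters, how UTF-8 spends more than one byte on an accented character, and how plain string matching finds "dog" inside "dogs" and "dogmatic" alike.

```python
# A quick aside (not part of the original guide), standard library only.
import math

# Mini exercise: bits needed to distinguish ~50,000 characters
print(math.ceil(math.log2(50000)))           # 16, i.e. two bytes per character

# UTF-8 uses one byte for a plain letter but two for this accented one
print(len("e".encode("utf-8")))              # 1
print(len("é".encode("utf-8")))              # 2

# Reading two-byte characters as if they were one-byte characters produces
# the "strange characters" mentioned above
print("é".encode("utf-8").decode("latin-1")) # Ã©

# Plain string matching is just byte/character matching
for word in ["dog", "dogs", "dogmatic"]:
    print(word, "dog" in word)
```

With that aside out of the way, back to the default view.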
In the bottom right-hand is the Summary tool, which contains various counts (number of documents, number of words in the corpus, number of unique words in the corpus, number of words per document, frequency of distinctive words per document, etc.). Finally, the default view also shows the Keyword in Context tool, which finds occurrences of words. All five of the tools in the default view rely on term counts, as do most of the other 20 or so tools that are available in Voyant (you can switch tools by clicking on the window icon that appears in the grey header bars of any of the tools). 43 | 44 | One of the most useful tools for counting terms isn't shown by default, but it is easily accessible by clicking on the "Terms" tab in the upper left-hand tool where Cirrus is by default. 45 | 46 | 47 | 48 | The default view of _Terms_ shows a list of high frequency words with their count and a mini-graph (called a sparkline in this case) that shows the distribution of the word across the corpus, in this case 8 novels from Jane Austen. 49 | 50 | It's possible to view additional information about a term by clicking the plus icon in the left-most column; this expands a panel with additional information about the following: 51 | 52 | * **Distribution**: another view of the sparkline 53 | * **Collocates**: other terms that occur in higher frequency near this term 54 | * **Correlations**: terms whose frequencies increase or decrease at a similar rate as this term 55 | * **Phrases**: multi-word phrases that repeat and that start with this term (if applicable). 56 | 57 | It's possible to scroll down to lower frequency words; new words will be loaded as necessary. This is sometimes called infinite scrolling, though that's a bit misleading since there's a finite number of words in the corpus and eventually we would reach the bottom. 58 | 59 | If for some reason we're more interested in sorting alphabetically rather than by frequency, it's possible to click on the "Term" header in the table. 60 | 61 | It's important to recognize early that what we are seeing is a list of high frequency words, but not necessarily all the words, since there's automatically a stoplist that's applied; a stoplist is like a blacklist of words to be ignored. It's typically populated by many function words like the determiners "the" and "a", and other words that don't carry much meaning (such as prepositions, pronouns, and others). 62 | 63 | It's possible to edit the stoplist by clicking on the options icon in the grey title bar (the bar with "Terms" near the top); icons will appear on the right while hovering, and we want the one that looks like a slider. We can click on that and proceed to select another list or edit the existing list (it's a very good idea to look at what's in that list; there may be some surprises). You can remove words or add words in the editor; see the [Stoplist](https://voyant-tools.org/docs/#!/guide/stopwords) documentation for more information. 64 | 65 | If you want to keep your edited stopword list, remember to export a URL (using the export icons in the header bar) to ensure that the new list is included (otherwise the default list will be shown next time the URL is visited). 66 | 67 | Voyant is designed to be user-friendly, which sometimes means showing the most useful information (to avoid overwhelming the user) while making other information available through additional steps. That's the case with the _Terms_ tool, and several other table or [grid-based tools](https://voyant-tools.org/docs/#!/guide/grids).
To access additional functionality, click the down arrow that should appear in a column header when you're hovering (especially the "Terms" or "Count" headers). 68 | 69 | Terms Columns 70 | 71 | As can be seen, several options exist, including columns to show: 72 | 73 | * **Term**: this is the term in the corpus 74 | * **Count**: this is the frequency of the term in the corpus 75 | * **Trends**: this is a sparkline graph that shows the distribution of relative frequencies across documents in the corpus (if the corpus contains more than one document); you can hover over the sparkline to see finer-grained results 76 | * **Relative**: this is the relative frequency of the term in the corpus, per one million words (sorting by count and relative should produce the same results; the relative frequencies might be useful when comparing to another corpus) 77 | * **Comparison**: this is the relative frequency of the term in the corpus compared to the relative frequency of the same term in a comparison corpus; to specify the comparison corpus, click the Options icon and specify the comparison corpus to use 78 | * **Peakedness**: this is a statistical measure of how much the relative frequencies of a term in a corpus are bunched up into peaks (regions with higher values where the rest are lower) 79 | * **Skew**: this is a statistical measure of the symmetry of the relative frequencies of a term across the corpus 80 | 81 | Although Peakedness and Skew start to seem like advanced statistical measures, they can reveal some interesting characteristics about the general trends for term frequency in a corpus (they aren't as useful for a corpus with a single document, but there's a specialized [Document Terms](https://voyant-tools.org/docs/#!/guide/documentterms) tool that presents other useful information). 82 | 83 | The _Terms_ tool provides various counts of an existing list, but one of the most powerful features of Voyant is search, which can be done using the box in the bottom part of the tool. The following provides a guide to the supported syntax (see also [Search](https://voyant-tools.org/docs/#!/guide/search)): 84 | 85 | * [`love`](https://voyant-tools.org/?corpus=austen&query=love&view=CorpusTerms): match **exact term** love 86 | * [`love*`](https://voyant-tools.org/?corpus=austen&query=love*&view=CorpusTerms): match terms that start with the **prefix** love and then a **wildcard** as **one term** 87 | * [`^love*`](https://voyant-tools.org/?corpus=austen&query=^love*&view=CorpusTerms): match terms that start with love as **separate terms** (love, lovely, etc.) 88 | * [`*ove`](https://voyant-tools.org/?corpus=austen&query=*ove&view=CorpusTerms): match terms that end with the **suffix** _ove_ as **one term** 89 | * [`^*ove`](https://voyant-tools.org/?corpus=austen&query=^*ove&view=CorpusTerms): match terms that end with the **suffix** _ove_ as **separate terms** (love, above, etc.)
90 | * [`love,hate`](https://voyant-tools.org/?corpus=austen&query=love,hate&view=CorpusTerms): match each term **separated by commas** as **separate terms** 91 | * [`love\|hate`](https://voyant-tools.org/?corpus=austen&query=love\|hate&view=CorpusTerms): match terms **separated by pipes** as a **single term** 92 | * [`"love him"`](https://voyant-tools.org/?corpus=austen&query="love him"&view=CorpusTerms): _love him_ as an exact **phrase** (word order matters) 93 | * [`"love him"~0`](https://voyant-tools.org/?corpus=austen&query="love+him"~0&view=CorpusTerms): _love him_ or _him love_ **phrase** (word order doesn't matter but 0 words in between) 94 | * [`"love her"~5`](https://voyant-tools.org/?corpus=austen&query="love+her"~5&view=CorpusTerms): match _love_ **near** _her_ (within 5 words) 95 | * [`^love*,love\|hate,"love her"~5`](https://voyant-tools.org/?corpus=austen&query=^love*,hate\|love,"love+her"~5&view=CorpusTerms): **combine** syntaxes 96 | 97 | Can you find what you're looking for? 98 | 99 | ![Jupyter](../images/jupyter48.png) For the counting unit in Jupyter we'll head over to [Getting Texts](https://nbviewer.jupyter.org/github/sgsinclair/alta/blob/master/ipynb/GettingTexts.ipynb) page of the Art of Literary Text Mining with Jupyter. 100 | -------------------------------------------------------------------------------- /ipynb/Glossary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Glossary" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "

Argument

\n", 15 | "A value which is passed to a function or method when 'called'. Arguments are assigned to named local variables in the function body. Arguments can be further classified as either keyword or positional. In the simplest terms the difference between these types is that keyword arguments are named (proceeded by an identifier) and positional arguments are unnamed (in list form). [Further information.](https://docs.python.org/3/glossary.html)\n", 16 | "\n", 17 | "---" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "#### Array \n", 25 | "A data structure consisting of an ordered collection of items of a single type i.e. an indexed list.\n", 26 | "\n", 27 | "---" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "#### Bag of words \n", 35 | "A [model](https://en.wikipedia.org/wiki/Bag-of-words_model) where text is represented as a multiset (bag) of its words. This simplification disregards features such as word order and grammar and instead focuses on term frequency.\n", 36 | "\n", 37 | "---" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "#### Cartesian Graph \n", 45 | "Also known as a [Cartesian Coordinate System](https://en.wikipedia.org/wiki/Cartesian_coordinate_system) which plots numbers on a plane using an x and y axis.\n", 46 | "\n", 47 | "---" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "#### Cell \n", 55 | "An input strucutre in a Notebook which runs either [Markdown](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) or Python code.\n", 56 | "\n", 57 | "---" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "#### Classifier \n", 65 | "A machine-learning algorithm that determines the class of an input element based on a set of features. \n", 66 | "\n", 67 | "---" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "#### Concatenation \n", 75 | "The process of combining strings i.e *\"This string is\" + \"Concatenating\"*\n", 76 | "\n", 77 | "---" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "#### Concordance \n", 85 | "A list of all words within a text and their frequency of occurrence.\n", 86 | "\n", 87 | "---" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "#### Conditional Block \n", 95 | "Where the program has to make a decision based on a series of options using [conditional statements](http://www.openbookproject.net/books/bpp4awd/ch04.html) such as *if, else* and *elif*\n", 96 | "\n", 97 | "---" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "#### Debug \n", 105 | "The process of identifying and removing errors from a program.\n", 106 | "\n", 107 | "---" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "#### Delimiter \n", 115 | "A character (most typically a comma) used to specify boundaries between words or regions in plain text.\n", 116 | "\n", 117 | "---" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "#### Directory Tree \n", 125 | "A tree like structure which represents the organization and hierachy of files within a directory. 
Terms such as *parent* and *child* are used to describe relationships between files and folders within this system.\n", 126 | "\n", 127 | "---" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "#### Dispersion Plot \n", 135 | "Also known as a [Scatter plot](https://en.wikipedia.org/wiki/Scatter_plot). A graph which uses cartesian coordinates to display values for multiple variables of a set of data. Particularly useful for displaying positional information for words within a text.\n", 136 | "\n", 137 | "---" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "#### Fork \n", 145 | "A cloned copy of a project which is set-up on a independent branch seperate to the original. Often used as a development tool in opensource software - where anyone can create a fork of the program and work on it as a distinct piece of software. [Github](https://github.com/) is an example of a tool which facilitates this sharing and development process.\n", 146 | "\n", 147 | "---" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "#### Function \n", 155 | "Put simply, functions provide functionality to a program. They are blocks of organized code which begin with the keyword *def* proceeded by the name of the function you wish to define in parentheses. The code block begins with a colon and must be indented. [Further Information.](https://www.tutorialspoint.com/python/python_functions.htm)\n", 156 | "- Function Chaining - Also known as method chaining. It is a set of rules which govern the process of calling multiple methods (functions) in a single statement.\n", 157 | "- Recursive Function - A function which calls itself one or many times in an loop until it fufils the condition of its [recursion.](https://www.python-course.eu/recursive_functions.php)\n", 158 | "- Calling a Function - Telling the program to execute a function.\n", 159 | "---" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "#### Indentation \n", 167 | "Empty spaces used as a formatting tool to designate blocks of code in programming. In Python, indentation is used to indicate a block of code, typically four spaces are used - each line of code in the block must be indented by the same amount of spaces otherwise an error may occur.\n", 168 | "\n", 169 | "---" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "#### Iteration \n", 177 | "The repetition of a procedure in the form of a loop to obtain successively closer approximations to the solution of a problem.\n", 178 | "\n", 179 | "---" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "#### Kernel \n", 187 | "The core computer program of the operating system which can control all system processes. The iPython kernel runs the code in the background for Jupyter notebooks.\n", 188 | "\n", 189 | "---" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "#### Lemmatization \n", 197 | "A lemma is the canonical form of a word. Lematization is the process of grouping together inflected forms of a word to be analysed as a single item i.e. 
determining the orginal lemma for the words.\n", 198 | "\n", 199 | "---" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "#### List Comprehension\n", 207 | "A method for defining and constructing lists. Particularly useful for creating a new list from an exsisting list using expressions with a *for / in* statement within a set of brackets. [Further Information.](https://www.python-course.eu/list_comprehension.php)\n", 208 | "\n", 209 | "---" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "#### Nest\n", 217 | "Placing objects or elements in a hierarchical arrangement within a set (an ordered collection of immutable objects).\n", 218 | "\n", 219 | "---" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "#### N-gram \n", 227 | "A unit (letter, words etc) of variable size (n = number of units) from a given sequence of text in a corpus used in language modelling. [Further information](https://en.wikipedia.org/wiki/N-gram)\n", 228 | "\n", 229 | "---" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "#### Normalization \n", 237 | "A process of transforming text into a single canonical form, thereby faciliating data consistentency for further processing. Examples include removing non-alphanumeric characters or changing to lower case.\n", 238 | "\n", 239 | "---" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "#### Object \n", 247 | "Data which has attributes or values AND a defined behaviour.\n", 248 | "- Response Object - An object which returns a response made through a HTTP request when collecting data from a website or URL.\n", 249 | "\n", 250 | "---" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "#### Operator \n", 258 | "Symbols which perform arithmetic or logical computation. Some basic types of operators used in Python are arithmetic (addition +, modulus % etc), comparison (greater than >, not equal to !=, etc) or logical (*and, or, not*). [Further Information](https://www.programiz.com/python-programming/operators)\n", 259 | "\n", 260 | "---" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "#### Parse \n", 268 | "Parsing or Syntactic Analysis is a process whereby sentences or strings of words are analysed by a computer into their constituents, often this is represented in a [parse tree](https://en.wikipedia.org/wiki/Parse_tree) which illustrates this syntactic structure.\n", 269 | "\n", 270 | "---" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "#### Plain Texts \n", 278 | "Text which includes only data related to the readable material. That is, without data related to grapahical presentation, formatting or other objects such as images. Encoded using Unicode standards, typically in a text editor such as Textedit on Mac or Wordpad on PC. Plain texts are particularly useful for archival storage as they are not confined to proprietary software and can be opened and edited on many systems, thereby ensuring a more universal accessibility and preservation. 
\n", 279 | "\n", 280 | "---" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "#### Regular Expressions \n", 288 | "The sequence of characters which define a search pattern. These patterns are useful for performing string operations such as *find* or *find and replace*\n", 289 | "\n", 290 | "---" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "#### Regularize \n", 298 | "The replacement of irregular forms in syntax with regular forms.\n", 299 | "\n", 300 | "---" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "#### Repository \n", 308 | "A central location where where data is stored and managed. More specifically, in revision control systems a repository stores metadata for sets of files or directory structure.\n", 309 | "\n", 310 | "---" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "#### Sequence \n", 318 | "An ordered set of Lists, Tuples or Strings.\n", 319 | "\n", 320 | "---" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "#### Sparse Matrix \n", 328 | "Also known as a [sparse array.](https://en.wikipedia.org/wiki/Sparse_matrix) It is a matrix (an array of data arranged in a rectangular structure of columns and rows) in which most of the elements are zero. If most of the elements were populated by values other than zero than the matrix could be considered dense.\n", 329 | "\n", 330 | "---" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "#### Stemming \n", 338 | "The process of reducing a word to it's base form or word stem e.g. added/adding would reduce to add.\n", 339 | "\n", 340 | "---" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "#### Stop Words \n", 348 | "A list of words which are programmed to be ignored or filtered in analysis and search queries. Lists of stop-words often contain high frequency function words such as *the, of, and* etc\n", 349 | "\n", 350 | "---" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "#### String \n", 358 | "A string is a container for data of letters, numbers or symbols.\n", 359 | "- Zero padded strings - To pad a string (usually an integer) with leading zeros to make up a specified length.\n", 360 | "\n", 361 | "---" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "#### Synset \n", 369 | "A set of synonyms. \n", 370 | "\n", 371 | "---" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "#### Training Set \n", 379 | "A data set used to train a model in machine learning. Specific examples are chosen to fit the parameters of the model for training and the subsequent results are compared with a testing dataset.\n", 380 | "\n", 381 | "---" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "#### Tuple \n", 389 | "A sequence of immutable (fixed) objects. Tuples are created by seperating values using commas within a set of parentheses e.g. 
(1, 2, 3, 4, 5 );\n", 390 | "\n", 391 | "---" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "#### Variable \n", 399 | "A variable stores a piece of data and gives it a specific name. Common data types which are stored in variables in Python include numbers and Boolean values. \n", 400 | "\n", 401 | "---" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "#### Unicode \n", 409 | "An industry standard in computing for encoding (representing) text. Letters, numbers and symbols are assigned unique numeric values which facilitate universal application across different programs and platforms. A fun example of the utility of unicode is the emoji keyboard used on smartphones when sending messages. The universal nature of unicode allows the emoji's to be accurately represented on most modern phones regardless of their differing operating systems (such as android, ios, blackberry). [Further information](http://unicode.org/standard/WhatIsUnicode.html)\n", 410 | "\n", 411 | "---" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": { 417 | "collapsed": true 418 | }, 419 | "source": [ 420 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com)
\n", 421 | "Edited and revised by [Melissa Mony](http://melissamony.com)" 422 | ] 423 | } 424 | ], 425 | "metadata": { 426 | "kernelspec": { 427 | "display_name": "Python 3", 428 | "language": "python", 429 | "name": "python3" 430 | }, 431 | "language_info": { 432 | "codemirror_mode": { 433 | "name": "ipython", 434 | "version": 3 435 | }, 436 | "file_extension": ".py", 437 | "mimetype": "text/x-python", 438 | "name": "python", 439 | "nbconvert_exporter": "python", 440 | "pygments_lexer": "ipython3", 441 | "version": "3.6.3" 442 | } 443 | }, 444 | "nbformat": 4, 445 | "nbformat_minor": 2 446 | } 447 | -------------------------------------------------------------------------------- /ipynb/experiments/SmithImageryWordList.txt: -------------------------------------------------------------------------------- 1 | ABBEY 2 | ABBOTS 3 | ABLAZE 4 | ABORO 5 | ABYSS 6 | ACCENT 7 | ACHE 8 | ACOLYTE 9 | ADAM 10 | AFLAME 11 | AFRAID 12 | AFTERGLOW 13 | AGLOW 14 | AGONY 15 | AIR 16 | AISLE 17 | AJAR 18 | ALMONDS 19 | ALOUD 20 | ALTAR 21 | ALTARSTEPS 22 | AMEN 23 | ANGEL 24 | ANGER 25 | ANGRILY 26 | ANGUISH 27 | ANIMAL 28 | ANNOYED 29 | ANTELOPES 30 | ANTLIKE 31 | APE 32 | APOLOGISE 33 | APOSTLE 34 | APPLAUSE 35 | APPLE 36 | APPLETREES 37 | APRON 38 | ARCHES 39 | AROMATIC 40 | ARROW 41 | ARSE 42 | ART 43 | ASHES 44 | ASHPLANT 45 | ATBORO 46 | AUBURN 47 | AUDIBLE 48 | AUTUMN 49 | AZURE 50 | BAAING 51 | BABBLE 52 | BABY 53 | BACON 54 | BAG 55 | BAH 56 | BAIZE 57 | BALCONIES 58 | BALD 59 | BALDHEAD 60 | BALDWIN 61 | BALDYHEAD 62 | BALES 63 | BALLAD 64 | BALMY 65 | BAND 66 | BANG 67 | BANTER 68 | BAPTISED 69 | BAPTISM 70 | BARE 71 | BAREFOOT 72 | BAREHEADED 73 | BARK 74 | BARMAIDS 75 | BARRACK 76 | BARRELS 77 | BARREN 78 | BASE 79 | BASIN 80 | BASKET 81 | BATH 82 | BATLIKE 83 | BAWL 84 | BAY 85 | BEACH 86 | BEADS 87 | BEAK 88 | BEAR 89 | BEARD 90 | BEAST 91 | BEAT 92 | BECAUSEBORO 93 | BED 94 | BEER 95 | BELL 96 | BENCH 97 | BENUMBED 98 | BIBLE 99 | BICYCLE 100 | BIKE 101 | BIRD 102 | BIRDCALL 103 | BISCUIT 104 | BISHOP 105 | BITCH 106 | BITTER 107 | BLACK 108 | BLACKLOOKING 109 | BLACKROCK 110 | BLANDLY 111 | BLANK 112 | BLANKET 113 | BLASPHEMER 114 | BLASPHEMIES 115 | BLASPHEMOUS 116 | BLAST 117 | BLAZE 118 | BLEAK 119 | BLEATING 120 | BLESS 121 | BLEW 122 | BLIND 123 | BLINDCORD 124 | BLINKING 125 | BLOOD 126 | BLOODRED 127 | BLOOM 128 | BLOSSOMS 129 | BLOW 130 | BLUE 131 | BLUSH 132 | BOAT 133 | BODIES 134 | BODILY 135 | BOGHOLE 136 | BOGS 137 | BOGWATER 138 | BOILING 139 | BONE 140 | BONNET 141 | BOOING 142 | BOOK 143 | BOOTS 144 | BOUS 145 | BOWL 146 | BOX 147 | BRACKISH 148 | BRANCH 149 | BRASS 150 | BRAY 151 | BREAD 152 | BREAKWATER 153 | BREAST 154 | BREATH 155 | BREECHES 156 | BREEZE 157 | BRICK 158 | BRIDGE 159 | BRIGHT 160 | BRILLIANT 161 | BRIM 162 | BRIMSTONE 163 | BRINE 164 | BRONZE 165 | BROTHER 166 | BROW 167 | BROWN 168 | BRUSH 169 | BUDDING 170 | BUFF 171 | BULL 172 | BUM 173 | BUMP 174 | BURN 175 | BURST 176 | BUSH 177 | BUTT 178 | CABBAGE 179 | CABIN 180 | CACKLING 181 | CADENCE 182 | CAKE 183 | CALF 184 | CALICO 185 | CALM 186 | CALVARY 187 | CANAL 188 | CANCER 189 | CANDLE 190 | CANDLEBUTTS 191 | CANDLESTICK 192 | CANE 193 | CANKER 194 | CANON 195 | CANOPY 196 | CANVAS 197 | CAP 198 | CAPUCHIN 199 | CAR 200 | CARD 201 | CARESS 202 | CARMELITE 203 | CARNIVAL 204 | CARPET 205 | CARRIAGE 206 | CARROTS 207 | CASK 208 | CASTLE 209 | CAT 210 | CATACOMBS 211 | CATAFALQUE 212 | CATCALLS 213 | CATECHISM 214 | CATTLE 215 | CAVE 216 | CAVERN 217 | CELERY 218 | CEMETERY 219 | CENSER 220 | CEREMENTS 
221 | CESSPOOL 222 | CHAIN 223 | CHAIR 224 | CHALICE 225 | CHAMBERPOT 226 | CHAMPAGNE 227 | CHANCES 228 | CHANDELIER 229 | CHANNEL 230 | CHAP 231 | CHAPEL 232 | CHARCOAL 233 | CHEEK 234 | CHEER 235 | CHESTNUT 236 | CHEWED 237 | CHILD 238 | CHILL 239 | CHIME 240 | CHIN 241 | CHOCOLATE 242 | CHOIR 243 | CHOKED 244 | CHORD 245 | CHRIST 246 | CHRISTENDOM 247 | CHRISTMAS 248 | CHURCH 249 | CIGAR 250 | CIGARETTE 251 | CINDERPATH 252 | CINDERS 253 | CINNAMOMUM 254 | CIRCLE 255 | CITIES 256 | CLAPPED 257 | CLASSROOM 258 | CLATTER 259 | CLAY 260 | CLEAN 261 | CLEAR 262 | CLERGY 263 | CLERICAL 264 | CLICK 265 | CLIFFS 266 | CLOAK 267 | CLOCK 268 | CLOISTER 269 | CLOTH 270 | CLOUD 271 | CLOUDLETS 272 | COAL 273 | COAT 274 | COBWEB 275 | COCK 276 | COCOA 277 | COCOON 278 | COD 279 | COFFIN 280 | COIL 281 | COIN 282 | COLD 283 | COLLAR 284 | COLLYWOBBLES 285 | COLORLESS 286 | COLOUR 287 | COMMUNED 288 | COMMUNICANT 289 | CONFESS 290 | CONFLAGRATION 291 | CONSECRATED 292 | CONVENT 293 | COOL 294 | COPPER 295 | COPYBOOK 296 | CORD 297 | CORDUROY 298 | CORK 299 | CORPSE 300 | CORPSEWHITE 301 | CORRIDOR 302 | CORRIGAN 303 | COTTAGE 304 | COUGH 305 | COW 306 | COWDUNG 307 | COWHAIRS 308 | COWHOUSE 309 | COWL 310 | COWYARD 311 | CRACK 312 | CRADLE 313 | CRASH 314 | CREAKED 315 | CREAM 316 | CRICKET 317 | CRICKETBATS 318 | CRICKETCAP 319 | CRIED 320 | CROCODILE 321 | CROSS 322 | CROWD 323 | CROWN 324 | CRUCIFIED 325 | CRUCIFIX 326 | CRY 327 | CUP 328 | CURED 329 | CURL 330 | CURTAIN 331 | CYCLE 332 | DAIRY 333 | DAMN 334 | DAMP 335 | DANCE 336 | DANK 337 | DAPPLED 338 | DARK 339 | DARKPLUMAGED 340 | DART 341 | DAWN 342 | DAY 343 | DAYLIGHT 344 | DEAD 345 | DEAF 346 | DEATH 347 | DEATHBED 348 | DEATHCHILL 349 | DEATHMASK 350 | DEATHWOUND 351 | DEDALUS 352 | DESK 353 | DEUS 354 | DEVIL 355 | DEW 356 | DEWLAPS 357 | DIAMONDS 358 | DICE 359 | DIE 360 | DIEU 361 | DIM 362 | DIMPLES 363 | DIN 364 | DINGDOHG 365 | DINNERTABLE 366 | DIRTY 367 | DISH 368 | DITCH 369 | DIZZILY 370 | DOCKS 371 | DOG 372 | DOLL 373 | DOLLYMOUNT 374 | DOME 375 | DOMINICAN 376 | DOOR 377 | DOORWAY 378 | DORMITORY 379 | DOVE 380 | DRAIN 381 | DRAWL 382 | DREGS 383 | DRENCHED 384 | DRESS 385 | DRINK 386 | DRY 387 | DUBLIN 388 | DUNG 389 | DUNGHILL 390 | DUSK 391 | DUST 392 | DYING 393 | EAGLE 394 | EAR 395 | EARSPLITTING 396 | EARTH 397 | EASYCHAIR 398 | EBRING 399 | EBONITE 400 | ECHO 401 | ECSTASY 402 | EDDIED 403 | EGGS 404 | EGGSHELLS 405 | EJACULATION 406 | ELBOW 407 | ELEPHANT 408 | ELLIPSOID 409 | ELLIPTICAL 410 | EMBERS 411 | EMERALD 412 | ENAMEL 413 | ENFLAMING 414 | EUCHARIST 415 | EVENING 416 | EXCREMENT 417 | EXCREMENTITIOUS 418 | EYE 419 | EYEBROWS 420 | EYEGLASS 421 | EYELID 422 | FACE 423 | FADE 424 | FAINT 425 | FAIR 426 | FARTED 427 | FAT 428 | FATE 429 | FATENCIRCLED 430 | FATHER 431 | FEAR 432 | FEAST 433 | FEATHERINGS 434 | FEED 435 | FENCE 436 | FESTERING 437 | FEVER 438 | FIELD 439 | FIERY 440 | FIG 441 | FIGTREE 442 | FIGURE 443 | FILE 444 | FILM 445 | FILTHILY 446 | FINGER 447 | FINGERNAILS 448 | FINGERTIPS 449 | FIRE 450 | FIRECONSUMED 451 | FIRELIGHT 452 | FIREPLACE 453 | FIRM 454 | FISH 455 | FLABBY 456 | FLAG 457 | FLAME 458 | FLASH 459 | FLAT 460 | FLECKED 461 | FLEECE 462 | FLESH 463 | FLEW 464 | FLEXIBLE 465 | FLICKERED 466 | FLIES 467 | FLIGHT 468 | FLITTING 469 | FLOG 470 | FLORID 471 | FLOWER 472 | FLOWERBEDS 473 | FLOWERGIRL 474 | FLUSH 475 | FLUTTER 476 | FLY 477 | FOAM 478 | FOETUS 479 | FOG 480 | FOOD 481 | FOOT 482 | FOOTBALL 483 | FOOTPATH 484 | FOOTSTEPS 485 | FOREFINGER 486 | FOREHEAD 487 | FOREST 
488 | FORGE 489 | FORK 490 | FOUL 491 | FOULSMELLING 492 | FOUNTAIN 493 | FOWL 494 | FOX 495 | FRAGMENT 496 | FRAGRANCE 497 | FRAIL 498 | FRANCISCAN 499 | FRANKINCENSE 500 | FRECKLED 501 | FRIAR 502 | FRIGHT 503 | FRO 504 | FROG 505 | FROWN 506 | FRUIT 507 | FUME 508 | FUNGUS 509 | FUNNEL 510 | FURNACE 511 | GAMECOCKS 512 | GARDEN 513 | GAS 514 | GASFLAMES 515 | GASJETS 516 | GATE 517 | GAYCLAD 518 | GEESE 519 | GEMS 520 | GENUFLECTING 521 | GINGERNUTS 522 | GIRAFFE 523 | GIRDLE 524 | GIRL 525 | GLARE 526 | GLASS 527 | GLASSJAPS 528 | GLEAMED 529 | GLIMMER 530 | GLINT 531 | GLISTENING 532 | GLITTERED 533 | GLOOM 534 | GLOOMILY 535 | GLOSSY 536 | GLOW 537 | GNAWED 538 | GOD 539 | GODFORSAKEN 540 | GODHEAD 541 | GOLD 542 | GONEBORO 543 | GOODBYE 544 | GOODNIGHT 545 | GOSPEL 546 | GOWN 547 | GRAIN 548 | GRAPES 549 | GRASS 550 | GRASSPLOT 551 | GRATE 552 | GRAVE 553 | GRAVECLOTHES 554 | GRAVEL 555 | GRAVEYARD 556 | GREASE 557 | GREEN 558 | GREENWHITE 559 | GREY 560 | GREYBLUE 561 | GREYFRINGED 562 | GREYGREEN 563 | GREYHOUNDS 564 | GURGLING 565 | GUST 566 | HA 567 | HAIR 568 | HALE 569 | HALLWAY 570 | HAM 571 | HAND 572 | HANDKERCHIEF 573 | HARBOUR 574 | HARD 575 | HARES 576 | HARMONIOUS 577 | HARMONISED 578 | HARMONY 579 | HARSH 580 | HASH 581 | HAT 582 | HAWK 583 | HAZE 584 | HAZEWRAPPED 585 | HEAD 586 | HEART 587 | HEAT 588 | HEAVEN 589 | HEAVILY 590 | HEDGE 591 | HEEL 592 | HELL 593 | HELLFIRE 594 | HERBS 595 | HERON 596 | HILL 597 | HIPS 598 | HISS 599 | HOLE 600 | HOLLOWSOUNDING 601 | HOME 602 | HONEY 603 | HOODED 604 | HOOFS 605 | HORIZON 606 | HORSE 607 | HOSPITAL 608 | HOT 609 | HOTEL 610 | HOUNDED 611 | HOUSE 612 | HOUSEBORO 613 | HOWL 614 | HUE 615 | HUM 616 | HUNGRILY 617 | HURRAY 618 | HURPOO 619 | HURT 620 | HUSH 621 | HYMN 622 | ICON 623 | ILLUMINATED 624 | IMAGE 625 | INAUDIBLE 626 | INCENSE 627 | INFIRMARY 628 | INJURED 629 | INJURIES 630 | INK 631 | INSECT 632 | IRON 633 | ISLAND 634 | ITCH 635 | IVORY 636 | IVY 637 | JAR 638 | JARGON 639 | JAW 640 | JEER 641 | JELLYLIKE 642 | JERKED 643 | JERUSALEM 644 | JESU 645 | JESUIT 646 | JEWEL 647 | JEWELEYED 648 | JINGLE 649 | JUG 650 | JUICE 651 | KETTLE 652 | KIDNEY 653 | KISS 654 | KNEE 655 | KNEEL 656 | KNELT 657 | KNIFE 658 | KNOCKED 659 | LACE 660 | LAKE 661 | LALA 662 | LAMB 663 | LAMP 664 | LANDBORO 665 | LANE 666 | LANTERN 667 | LAP 668 | LARK 669 | LASHES 670 | LAUGH 671 | LAUGHTER 672 | LAUREL 673 | LAVATORY 674 | LAVENDER 675 | LAWN 676 | LEAF 677 | LEATHER 678 | LEG 679 | LEMON 680 | LEND 681 | LETTER 682 | LICE 683 | LICKING 684 | LIGHT 685 | LIGHTNINGS 686 | LILY 687 | LIMES 688 | LIMP 689 | LINEN 690 | LIP 691 | LIQUID 692 | LIT 693 | LITANY 694 | LITURGY 695 | LOAFTER 696 | LOINS 697 | LOOKBORO 698 | LORD 699 | LORDBORO 700 | LOUD 701 | LOUSEMARKS 702 | LUCIFER 703 | LUKEWARM 704 | LULL 705 | LUMINARY 706 | LUMINOUS 707 | LUMPISH 708 | LUNGS 709 | LUST 710 | LUTFLIKE 711 | LYRICAL 712 | MAHOGANY 713 | MANYCOLOURED 714 | MAPLE 715 | MARBLES 716 | MARE 717 | MAROON 718 | MARSHLIGHT 719 | MASK 720 | MASS 721 | MASSBOOK 722 | MELODY 723 | MERRILY 724 | MERRIMENT 725 | MERRYMAKING 726 | METAL 727 | MICE 728 | MILK 729 | MIRE 730 | MIRROR 731 | MOAN 732 | MOCK 733 | MOIST 734 | MOLE 735 | MONEY 736 | MONK 737 | MONKEY 738 | MOOCOW 739 | MOON 740 | MOONLIT 741 | MOORINGS 742 | MORGUE 743 | MOTTLED 744 | MOULDERING 745 | MOUNT 746 | MOUSTACHE 747 | MOUTH 748 | MUD 749 | MUDDIED 750 | MUMBLED 751 | MURDER 752 | MURMUR 753 | MUSIC 754 | MUTE 755 | MUTTERED 756 | MYRRH 757 | NAIL 758 | NAKED 759 | NAME 760 | NASAL 761 | NASTY 
762 | NAUSEOUS 763 | HAVE 764 | NECK 765 | NEEDLE 766 | NEST 767 | NETS 768 | NIGHT 769 | NIGHTCLOUDS 770 | NIGHTSHADE 771 | NIGHTSHIRT 772 | NOISE 773 | NOISILY 774 | NOSE 775 | NOTEBOOKS 776 | NOXIOUS 777 | NUN 778 | NURSE 779 | NURSEMAIDS 780 | NURSERY 781 | OAR 782 | OCEAN 783 | ODOROUS 784 | ODOUR 785 | OIL 786 | OILSHEET 787 | OLIVE 788 | ONIONS 789 | OOZED 790 | ORB 791 | ORCHARDS 792 | ORCHESTRA 793 | ORDER 794 | OUTBORO 795 | OUTHOUSE 796 | OVERCOAT 797 | OX 798 | OZONE 799 | PAGE 800 | PAIN 801 | PALATE 802 | PALM 803 | PANDIED 804 | PANDYBAT 805 | PANTING 806 | PAPA 807 | PAPER 808 | PARACLETE 809 | PASTORS 810 | PATCHWORK 811 | PATH 812 | PATTED 813 | PEAL 814 | PEEL 815 | PENCIL 816 | PEPPER 817 | PERFUME 818 | PHRASE 819 | PIANO 820 | PICTURE 821 | PIG 822 | PIGEON 823 | PINK 824 | PISS 825 | PLANT 826 | PLUCKED 827 | PLUMP 828 | PLUMPUDDING 829 | POCK 830 | POCKET 831 | POLISHED 832 | POLLUTES 833 | PONY 834 | POOL 835 | POPE 836 | PORCELAIN 837 | PORCH 838 | PORTRAIT 839 | POT 840 | POTATOES 841 | PRAY 842 | PRAYERBOOK 843 | PREACH 844 | PERFECT 845 | PRESS 846 | PRIEST 847 | PRIESTCRAFT 848 | PRIESTRIDDEN 849 | PRISON 850 | PRISONHOUSE 851 | PROFESSOR 852 | PROSE 853 | PROTEST 854 | PSALMS 855 | PUCK 856 | PUDDING 857 | PUDDLES 858 | PULL 859 | PULPIT 860 | PULSATION 861 | PUNCH 862 | PUNGENT 863 | PUNISH 864 | PUPPY 865 | PURGATORIAL 866 | PURPLE 867 | PURRED 868 | PUTBORO 869 | PUTREFACTION 870 | PUTRID 871 | QUADRANGLE 872 | QUAE 873 | QUAGMIRE 874 | QUEER 875 | QUIET 876 | QUIVERED 877 | RABBITS 878 | RABBITSKIN 879 | RACKET 880 | RAGE 881 | RAIL 882 | RAILWAY 883 | RAIN 884 | RAINDROPS 885 | RAINFRAGRANT 886 | RAINLADEN 887 | RAINSODDEN 888 | RAKE 889 | RAN 890 | RANG 891 | RAT 892 | RATTLE 893 | RECTOR 894 | RED 895 | REDBROWN 896 | REDEYED 897 | REDHOT 898 | REDRIMMED 899 | REEKING 900 | REFECTORY 901 | REFLECT 902 | RELIGION 903 | RELIGIOUS 904 | REPENT 905 | REPTILE 906 | REVEREND 907 | RHYME 908 | RHYTHM 909 | RIBS 910 | RICE 911 | RIDDLE 912 | RING 913 | RIOT 914 | RITE 915 | RITUAL 916 | RIVER 917 | RIVULET 918 | ROAD 919 | ROADWAY 920 | ROAR 921 | ROBE 922 | ROCK 923 | ROOF 924 | ROOM 925 | ROPE 926 | ROSE 927 | ROSEBUSHES 928 | ROSEFLIGHT 929 | ROSESOFT 930 | ROSEWAY 931 | ROSIE 932 | ROT 933 | ROTUNDA 934 | ROUGED 935 | ROUGH 936 | ROUGHHAWN 937 | ROUND 938 | ROUNDHEAD 939 | RUMBLING 940 | RUMP 941 | RUN 942 | RUSSET 943 | RUSTLING 944 | SABBATH 945 | SACK 946 | SACKCLOTH 947 | SACRAMENT 948 | SACRIFICE 949 | SACRILEGE 950 | SACRILEGIOUS 951 | SACRISTAN 952 | SACRISTY 953 | SAILOR 954 | SAINT 955 | SALT 956 | SALVATION 957 | SANCTUARY 958 | SAND 959 | SASH 960 | SATAN 961 | SAUCE 962 | SAUSAGES 963 | SAVIOUR 964 | SAVOUR 965 | SCALDED 966 | SCARLET 967 | SCHOOL 968 | SCREAM 969 | SCREECH 970 | SCUM 971 | SEA 972 | SEABIRD 973 | SEABORNE 974 | SEADUSK 975 | SEAHARVEST 976 | SEAPORT 977 | SEASHORE 978 | SEATANGLE 979 | SEAWALL 980 | SEAWATER 981 | SEAWEED 982 | SEAWRACK 983 | SECULAR 984 | SEDUCE 985 | SEED 986 | SELFBOUNDED 987 | SELFCOMMUNION 988 | SELFCONTAINED 989 | SELFEMBITTERED 990 | SELFMISTRUST 991 | SELFRESPECT 992 | SELFRESTRAINT 993 | SELFSURRENDER 994 | SENTENCE 995 | SEPULCHRE 996 | SERAPH 997 | SERAPHIM 998 | SERPENT 999 | SEWER 1000 | SHADE 1001 | SHADOW 1002 | SHAME 1003 | SHARP 1004 | SHAWL 1005 | SHED 1006 | SHELL 1007 | SHIMMER 1008 | SHINE 1009 | SHIP 1010 | SHIPWRECKS 1011 | SHIRT 1012 | SHITE 1013 | SHIVER 1014 | SHOCK 1015 | SHOE 1016 | SHONE 1017 | SHOOK 1018 | SHOUTED 1019 | SHOVED 1020 | SHOWER 1021 | SHRIEKING 1022 | SHRILL 1023 | 
SHRINE 1024 | SHRIVELLED 1025 | SHRUBS 1026 | SHRUNK 1027 | SICK 1028 | SIDEALTAR 1029 | SIENA 1030 | SIGH 1031 | SILENCE 1032 | SILK 1033 | SILVER 1034 | SILVERCOATED 1035 | SILVERPOINTED 1036 | SILVERVEINED 1037 | SILVERWRAPPED 1038 | SIN 1039 | SINCORRUPTED 1040 | SINFULIMPENITENCE 1041 | SING 1042 | SINLOVING 1043 | SISTER 1044 | SKIES 1045 | SKIN 1046 | SKIRTS 1047 | SKULL 1048 | SKY 1049 | SKYHIGH 1050 | SKYLIGHT 1051 | SLATE 1052 | SLATEBLUE 1053 | SLEEK 1054 | SLEEP 1055 | SLEEVE 1056 | SLENDER 1057 | SLIME 1058 | SLOBBERING 1059 | SLOTH 1060 | SLOW 1061 | SLOWDRIFTING 1062 | SLOWFLOWING 1063 | SLUGGISH 1064 | SMACKED 1065 | SMART 1066 | SMELL 1067 | SMILE 1068 | SMITHY 1069 | SMOKE 1070 | SMOOTH 1071 | SMUGGING 1072 | SNAKE 1073 | SNAP 1074 | SNEEZE 1075 | SNORT 1076 | SOARING 1077 | SOB 1078 | SOFT 1079 | SOFTHUED 1080 | SOFTLYLIGHTED 1081 | SOFTWORDED 1082 | SOLDIER 1083 | SOLITARY 1084 | SONG 1085 | SOOTHED 1086 | SORDID 1087 | SOUL 1088 | SOULFREE 1089 | SOUND 1090 | SOUR 1091 | SOURSMELLING 1092 | SOUTANE 1093 | SPACE 1094 | SPADE 1095 | SPECKLED 1096 | SPECTACLES 1097 | SPEECH 1098 | SPIED 1099 | SPIRIT 1100 | SPITE 1101 | SPITTLE 1102 | SPRAY 1103 | SQUALID 1104 | SQUALL 1105 | SQUARE 1106 | SQUEAK 1107 | STAGNATION 1108 | STAIN 1109 | STAIRCASE 1110 | STAIRS 1111 | STALE 1112 | STAMMERING 1113 | STANK 1114 | STAR 1115 | STASIS 1116 | STENCH 1117 | STEPHANEFOROS 1118 | STEPHANOS 1119 | STEHANOUMENOS 1120 | STEW 1121 | STICK 1122 | STIFF 1123 | STING 1124 | STINK 1125 | STINKPOT 1126 | STOCKED 1127 | STOMACH 1128 | STOWE 1129 | STORM 1130 | STRAW 1131 | STRAWCOLOURED 1132 | STREAKS 1133 | STREET 1134 | STUNG 1135 | SOCK 1136 | SUFFOCATED 1137 | SULPHUROUS 1138 | SULPHURYELLOW 1139 | SUN 1140 | SUNG 1141 | SUNLIGHT 1142 | SUNRISE 1143 | SUNWARMED 1144 | SUP 1145 | SURPLICES 1146 | SWALLOW 1147 | SWAMP 1148 | SWANS 1149 | SWEAR 1150 | SWEAT 1151 | SWEET 1152 | SWISH 1153 | SWOLLEN 1154 | SWORD 1155 | SYLLABLE 1156 | TABERNACLE 1157 | TABLE 1158 | TABLECLOTH 1159 | TAIL 1160 | TALLOW 1161 | TAN 1162 | TAPPED 1163 | TAR 1164 | TASTE 1165 | TAWNY 1166 | TEA 1167 | TEACUPS 1168 | TEAR 1169 | TEETH 1170 | TELEGRAPHPOLES 1171 | TENOR 1172 | TEPID 1173 | TESTAMENT 1174 | TESTIMONIAL 1175 | THEBORO 1176 | THIGH 1177 | THIN 1178 | THISTLES 1179 | THORNS 1180 | THRILL 1181 | THROAT 1182 | THROB 1183 | THRUST 1184 | THUD 1185 | THUMB 1186 | THUMBBLACKENED 1187 | THUNDER 1188 | TICKING 1189 | TICKLING 1190 | TIDE 1191 | TINGES 1192 | TINGLE 1193 | TITTER 1194 | TOAST 1195 | TOBACCO 1196 | TOBORO 1197 | TONE 1198 | TONGUE 1199 | TORTURE 1200 | TOWELS 1201 | TOWER 1202 | TRAIN 1203 | TRALALA 1204 | TRALALADDY 1205 | TRAM 1206 | TREAT 1207 | TREBLE 1208 | TREMOR 1209 | TREMULOUS 1210 | TRINITY 1211 | TROUSERS 1212 | TRUMPET 1213 | TRUMPETBLAST 1214 | TUCKED 1215 | TUCKOO 1216 | TUNNEL 1217 | TURF 1218 | TURFCOLOURED 1219 | TURKEY 1220 | TURNIPS 1221 | TURRET 1222 | TWIG 1223 | TWILIGHT 1224 | TWINECOLOURED 1225 | TWINKLED 1226 | TWIRLING 1227 | TWIST 1228 | TWITCHING 1229 | TWITTERED 1230 | UMBRELLA 1231 | UNDEPILED 1232 | UNDRESS 1233 | UNDRIED 1234 | UNLIT 1235 | URIRAL 1236 | USAGE 1237 | USBORO 1238 | VAPOUR 1239 | VEIL 1240 | VEINS 1241 | VELVET 1242 | VERMIN 1243 | VESTRY 1244 | VINEGAR 1245 | VIOLETS 1246 | VIPER 1247 | VIRGIN 1248 | VISION 1249 | VOICE 1250 | VOMITED 1251 | VOWEL 1252 | WADED 1253 | WAFER 1254 | WAIL 1255 | WAIST 1256 | WALK 1257 | WAN 1258 | WAR 1259 | WARM 1260 | WASH 1261 | WATER 1262 | WATERJUG 1263 | WATERLOGGED 1264 | WATERPROOFS 1265 | WATERY 1266 | WAVE 1267 
| WAVELET 1268 | WAX 1269 | WEB 1270 | WEEDGROWN 1271 | WEEDS 1272 | WEEP 1273 | WEPT 1274 | WET 1275 | WHEEL 1276 | WHEEZING 1277 | WHIMPERING 1278 | WHINE 1279 | WHIP 1280 | WHIRL 1281 | WHIRRING 1282 | WHISPER 1283 | WHISTLE 1284 | WHITE 1285 | WHITEBOY 1286 | WHITEGREY 1287 | WHITEROBED 1288 | WHITEWASHED 1289 | WHORES 1290 | WIDE 1291 | WIDENING 1292 | WIDESPREAD 1293 | WIDEWINGED 1294 | WILLBORO 1295 | WIND 1296 | WINDOW 1297 | WINDSWEPT 1298 | WING 1299 | WINKED 1300 | WINTER 1301 | WIRE 1302 | WOMAN 1303 | WOMB 1304 | WOOD 1305 | WOOLY 1306 | WORD 1307 | WORM 1308 | WORSHIP 1309 | WRINKLED 1310 | WRISTS 1311 | YELLOW 1312 | YELLS -------------------------------------------------------------------------------- /ipynb/GettingNltk.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting NLTK for Text Processing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook introduces the [Natural Language Toolkit](http://www.nltk.org/) (NLTK) which facilitates a broad range of tasks for text processing and representing results. It's part of the [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) and assumes that you've already worked through previous notebooks ([Getting Setup](GettingSetup.ipynb), [Getting Started](GettingStarted.ipynb) and [Getting Texts](GettingTexts.ipynb)). In this notebook we'll look in particular at:\n", 15 | "\n", 16 | "* [Installing the NLTK library (for text processing)](#Installing-the-NLTK-Library)\n", 17 | "* [Simple tokenization of words](#Tokenization)\n", 18 | "* [Producing a simple table of frequencies of words](#Word-Frequencies)\n", 19 | "* [Applying a list of stopwords (words to ignore)](#Stop_Words)\n", 20 | "* [Producing a simple concordance for a keyword](#Building-a-Simple-Concordance)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Installing the NLTK Library" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "The Anaconda bundle that we're using already includes [NLTK](http://www.nltk.org/), but the bundle doesn't include the NLTK data collections that are available. Fortunately, it's easy to download the data, and we can even do it within a notebook. Following the same steps as before, create a new notebook named \"GettingNltk\" and run this first code cell:" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n" 47 | ] 48 | }, 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "True" 53 | ] 54 | }, 55 | "execution_count": 1, 56 | "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "import nltk\n", 62 | "\n", 63 | "nltk.download() # download NLTK data (we should only need to run this cell once)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "This should cause a new window to appear (eventually) with a dialog box to download data collections. For the sake of simplicity, if possible select the \"all\" row and press \"Download\". 
Once the download is complete, you can close that window.\n", 71 | "\n", 72 | "![NLTK Data Download](images/nltk-data-download.png)\n", 73 | "\n", 74 | "Now you're set! You can close and delete the temporary notebook used for installation." 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Text Processing" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Now that we have NLTK installed, let's use it for text processing.\n", 89 | "\n", 90 | "We'll start by retrieving _The Gold Bug_ plain text that we had saved locally in the [Getting Texts](GettingTexts.ipynb) notebook. If you need to recapitulate the essentials of the previous notebook, try running this to retrieve the text:\n", 91 | "\n", 92 | "```python\n", 93 | "import urllib.request\n", 94 | "# retrieve Poe plain text value\n", 95 | "poeUrl = \"http://www.gutenberg.org/files/2147/2147-0.txt\"\n", 96 | "poeString = urllib.request.urlopen(poeUrl).read().decode()```\n", 97 | "\n", 98 | "And then this, in a separate cell so that we don't read repeatedly from Gutenberg:\n", 99 | "\n", 100 | "```python\n", 101 | "import os\n", 102 | "# isolate The Gold Bug\n", 103 | "start = poeString.find(\"THE GOLD-BUG\")\n", 104 | "end = poeString.find(\"FOUR BEASTS IN ONE\")\n", 105 | "goldBugString = poeString[start:end]\n", 106 | "# save the file locally\n", 107 | "directory = \"data\"\n", 108 | "if not os.path.exists(directory):\n", 109 | " os.makedirs(directory)\n", 110 | "with open(\"data/goldBug.txt\", \"w\") as f:\n", 111 | " f.write(goldBugString)```\n", 112 | "\n", 113 | "Now we should be ready to retrieve the text:" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 1, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "with open(\"data/goldBug.txt\", \"r\") as f:\n", 125 | " goldBugString = f.read()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Now we will work toward showing the top frequency words in our plain text. This involves three major steps:\n", 133 | "\n", 134 | "1. processing our plain text to find the words (also known as tokenization)\n", 135 | "1. counting the frequencies of each word\n", 136 | "1. displaying the frequencies information" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Tokenization" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "Tokenization is the basic process of parsing a string to divide it into smaller units of the same kind. You can tokenize text into paragraphs, sentences, words or other structures, but here we're focused on recognizing words in our text. For that, let's import the ```nltk``` library and use its convenient ```word_tokenize()``` function. NLTK actually has several ways of tokenizing texts, and for that matter we could write our own code to do it. We'll have a peek at the first ten tokens." 
151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 2, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "['THE', 'GOLD-BUG', 'What', 'ho', '!', 'what', 'ho', '!', 'this', 'fellow']" 162 | ] 163 | }, 164 | "execution_count": 2, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "import nltk\n", 171 | "\n", 172 | "goldBugTokens = nltk.word_tokenize(goldBugString)\n", 173 | "goldBugTokens[:10]" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "We can see from the above that ```word_tokenize``` does a useful job of identifying words (including hyphenated words like \"GOLD-BUG\"), but also includes tokens like the exclamation mark. In some cases punctuation like this might be useful, but in our case we want to focus on word frequencies, so we should filter out punctuation tokens. (To be fair, nltk.word_tokenize() is expecting to work with sentences that have already been parsed, so we're slightly misusing it here, but that's ok.)\n", 181 | "\n", 182 | "To accomplish the filtering we will use a construct called [list comprehension](https://docs.python.org/3.4/tutorial/datastructures.html#list-comprehensions) with a conditional test built in. Let's take it one step at a time, first using a loop structure like we've already seen in [Getting Texts](GettingTexts.ipynb), and then doing the same thing with a list comprehension." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 3, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "['THE', 'GOLD-BUG', 'What', 'ho', '!', 'what', 'ho', '!', 'this', 'fellow'] (for loop technique)\n", 195 | "['THE', 'GOLD-BUG', 'What', 'ho', '!', 'what', 'ho', '!', 'this', 'fellow'] (list comprehension technique)\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "# technique 1 where we create a new list\n", 201 | "loopList = []\n", 202 | "for word in goldBugTokens[:10]:\n", 203 | " loopList.append(word)\n", 204 | "print(loopList, \"(for loop technique)\")\n", 205 | " \n", 206 | " \n", 207 | "# technique 2 with list comprehension\n", 208 | "print([word for word in goldBugTokens[:10]], \"(list comprehension technique)\")" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "Identical! So the general form of a list comprehension (which is very compact) is: \n", 216 | "\n", 217 | "> [_expression(item)_ for _item_ in _list_]\n", 218 | "\n", 219 | "We can now go a step further and add a condition to the list comprehension: we'll only include the word in the final list if the first character in the word is alphabetic as defined by the [isalpha()](https://docs.python.org/3.4/library/stdtypes.html?highlight=isalpha#str.isalpha) function (`word[0]` – remember the [string sequence technique](GettingTexts.ipynb#Working-with-Parts-of-String)).
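As a quick generic illustration (a sketch using made-up numbers rather than our tokens), the same comprehension-plus-condition pattern works on any list:

```python
# keep only the even numbers from a list, squaring each one as we go
numbers = [1, 2, 3, 4, 5, 6]
evens_squared = [n * n for n in numbers if n % 2 == 0]
print(evens_squared)  # [4, 16, 36]
```

Here the expression is `n * n`, the item is `n`, and the condition is `n % 2 == 0`, mapping directly onto the general form shown above.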
220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 10, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "['THE', 'GOLD-BUG', 'What', 'ho', 'what', 'ho', 'this', 'fellow']\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "print([word for word in goldBugTokens[:10] if word[0].isalpha()])" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "## Word Frequencies" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "Now that we've had a first pass at word tokenization (keeping only word tokens), let's look at counting word frequencies. Essentially we want to go through the tokens and tally the number of times each one appears. Not surprisingly, the NLTK has a very convenient method for doing just this, which we can see in this small sample (the first 10 word tokens):" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 13, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/plain": [ 261 | "Counter({'GOLD-BUG': 1,\n", 262 | " 'THE': 1,\n", 263 | " 'What': 1,\n", 264 | " 'fellow': 1,\n", 265 | " 'ho': 2,\n", 266 | " 'this': 1,\n", 267 | " 'what': 1})" 268 | ] 269 | }, 270 | "execution_count": 13, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "goldBugRealWordTokensSample = [word for word in goldBugTokens[:10] if word[0].isalpha()]\n", 277 | "goldBugRealWordFrequenciesSample = nltk.FreqDist(goldBugRealWordTokensSample)\n", 278 | "goldBugRealWordFrequenciesSample" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "This ```FreqDist``` object is a kind of dictionary, where each word is paired with its frequency (separated by a colon), and each pair is separated by a comma. This kind of dictionary also has a very convenient way of displaying results as a table:" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 14, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | " ho what GOLD-BUG fellow What THE this \n", 298 | " 2 1 1 1 1 1 1 \n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "goldBugRealWordFrequenciesSample.tabulate()" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "The results are displayed in descending order of frequency (two occurrences of \"ho\"). One of the things we can notice is that \"What\" and \"what\" are calculated separately, which in some cases may be good, but for our purposes probably isn't. This might lead us to rethink our steps until now and consider the possibility of converting our string to lowercase during tokenization." 
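The next cell lowercases the whole string before tokenizing. As an alternative sketch (not the notebook's approach, and the results can differ slightly in edge cases depending on the tokenizer), we could instead tokenize first and then lowercase each token:

```python
# alternative sketch: tokenize first, then lowercase each token
# assumes goldBugTokens was created earlier with nltk.word_tokenize(goldBugString);
# goldBugTokensLowercaseAlt is an illustrative name, not used elsewhere in this notebook
goldBugTokensLowercaseAlt = [word.lower() for word in goldBugTokens]
print(goldBugTokensLowercaseAlt[:10])
```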
311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 15, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "what ho gold-bug the fellow this \n", 323 | " 2 2 1 1 1 1 \n" 324 | ] 325 | } 326 | ], 327 | "source": [ 328 | "goldBugTokensLowercase = nltk.word_tokenize(goldBugString.lower()) # use lower() to convert entire string to lowercase\n", 329 | "goldBugRealWordTokensLowercaseSample = [word for word in goldBugTokensLowercase[:10] if word[0].isalpha()]\n", 330 | "goldBugRealWordFrequenciesSample = nltk.FreqDist(goldBugRealWordTokensLowercaseSample)\n", 331 | "goldBugRealWordFrequenciesSample.tabulate(20)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "Good, now \"what\" and \"What\" are counted together as the same word form (with a count of two). (There are disadvantages to this as well, such as more difficulty in identifying proper names and the start of sentences, but text mining is often a set of compromises.)\n", 339 | "\n", 340 | "Let's redo our entire workflow with the full set of tokens (not just a sample)." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 16, 346 | "metadata": { 347 | "collapsed": true 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "goldBugTokensLowercase = nltk.word_tokenize(goldBugString.lower())\n", 352 | "goldBugRealWordTokensLowercase = [word for word in goldBugTokensLowercase if word[0].isalpha()]\n", 353 | "goldBugRealWordFrequencies = nltk.FreqDist(goldBugRealWordTokensLowercase)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "One simple way of measuring the vocabulary richness of an author is to calculate the ratio of the number of unique words to the total number of words. If an author repeats words more often, it may be because he or she is drawing on a smaller vocabulary (either deliberately or not), which is a measure of style. There are several factors to consider, such as the length of the text, but in the simplest terms we can calculate the lexical diversity of an author by dividing the number of word forms (types) by the total number of tokens. We already have the necessary ingredients:\n", 361 | "\n", 362 | "* types: number of different words (number of word: count pairs in ```goldBugRealWordFrequencies```)\n", 363 | "* tokens: total number of word tokens (length of ```goldBugRealWordTokensLowercase```)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 17, 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "number of types: 2681\n", 376 | "number of tokens: 13508\n", 377 | "type/token ratio: 0.1984749777909387\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "print(\"number of types: \", len(goldBugRealWordFrequencies))\n", 383 | "print(\"number of tokens: \", len(goldBugRealWordTokensLowercase))\n", 384 | "print(\"type/token ratio: \", len(goldBugRealWordFrequencies)/len(goldBugRealWordTokensLowercase))" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "We haven't yet looked at our output for the top frequency lowercase words."
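The next cell prints them as a table with ```tabulate()```. As a small aside (a sketch, not part of the original workflow): because ```FreqDist``` behaves like Python's ```collections.Counter```, the same information can also be pulled programmatically as a list of (word, count) pairs:

```python
# top 5 (word, count) pairs as a plain Python list
# assumes goldBugRealWordFrequencies was built in the cell above
print(goldBugRealWordFrequencies.most_common(5))
```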
392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 18, 397 | "metadata": {}, 398 | "outputs": [ 399 | { 400 | "name": "stdout", 401 | "output_type": "stream", 402 | "text": [ 403 | " the of and i to a in it you was that with for as had at he but this we \n", 404 | " 877 465 359 336 329 327 238 213 162 137 130 114 113 113 110 108 103 99 99 98 \n" 405 | ] 406 | } 407 | ], 408 | "source": [ 409 | "goldBugRealWordFrequencies.tabulate(20) # show a sample of the top frequency terms" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "We tokenized, filtered and counted in three lines of code, and then a fourth to show the top frequency terms, but the results aren't necessarily very exciting. There's not much in these top frequency words that could be construed as especially characteristic of _The Gold Bug_, in large part because the most frequent words are similar for most texts of a given language: they're so-called function words that have more of a syntactic (grammatical) function rather than a semantic (meaning-bearing) value." 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "\n", 424 | "Fortunately, our NLTK library contains a list of stop-words for English (and other languages). We can load the list and look at its contents." 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 20, 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'did', 'do', 'does', 'doing', 'don', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself', 'just', 'me', 'more', 'most', 'my', 'myself', 'no', 'nor', 'not', 'now', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 's', 'same', 'she', 'should', 'so', 'some', 'such', 't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'you', 'your', 'yours', 'yourself', 'yourselves']\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "import nltk\n", 442 | "stopwords = nltk.corpus.stopwords.words(\"english\")\n", 443 | "print(sorted(stopwords)) # sort them alphabetically before printing" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "We can test whether one word is an item in another list with the following syntax, here using our small sample." 
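Before applying this to the whole sample in the next cell, here is the bare membership test on single words (a minimal sketch, assuming the ```stopwords``` list loaded above):

```python
# the `in` operator tests whether an item occurs in a list
print("the" in stopwords)       # True: "the" is in NLTK's English stopword list
print("gold-bug" in stopwords)  # False: content words are not in the list
```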
451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 22, 456 | "metadata": {}, 457 | "outputs": [ 458 | { 459 | "name": "stdout", 460 | "output_type": "stream", 461 | "text": [ 462 | "sample words: ['the', 'gold-bug', 'what', 'ho', 'what', 'ho', 'this', 'fellow']\n", 463 | "sample words not in stopwords list: ['gold-bug', 'ho', 'ho', 'fellow']\n" 464 | ] 465 | } 466 | ], 467 | "source": [ 468 | "print(\"sample words: \", goldBugRealWordTokensLowercaseSample)\n", 469 | "print(\"sample words not in stopwords list: \", [word for word in goldBugRealWordTokensLowercaseSample if not word in stopwords])" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "So we can now tweak our word filter with an additional condition, adding the ```and``` operator between the test for the alphabetic first character and the test for presence in the stopword list. We add a backslash (\\) character to tell Python that the statement continues on the next line. Alternatively, we could have done this in two steps (perhaps less efficient but arguably easier to read):\n", 477 | "\n", 478 | "```python\n", 479 | "# first filter tokens with alphabetic characters\n", 480 | "gbWords = [word for word in goldBugTokensLowercase if word[0].isalpha()]\n", 481 | "# then filter stopwords\n", 482 | "gbContentWords = [word for word in gbWords if word not in stopwords]```" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 25, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "name": "stdout", 492 | "output_type": "stream", 493 | "text": [ 494 | "upon de jupiter legrand one said well massa could bug skull parchment tree made first time two much us beetle \n", 495 | " 81 73 53 47 38 35 35 34 33 32 29 27 25 25 24 24 23 23 23 22 \n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "goldBugRealContentWordTokensLowercase = [word for word in goldBugTokensLowercase \\\n", 501 | " if word[0].isalpha() and word not in stopwords]\n", 502 | "goldBugRealContentWordFrequencies = nltk.FreqDist(goldBugRealContentWordTokensLowercase)\n", 503 | "goldBugRealContentWordFrequencies.tabulate(20) # show a sample of the top frequency terms" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": {}, 509 | "source": [ 510 | "Now we have words that seem a bit more meaningful (even if the table format is a bit off). The first word (\"upon\") could be considered a function word (a preposition) that should be in the stop-word list, though it's less common in modern English. The second word (\"de\") would be in a French stop-word list, but seems striking here in English. (Depending on the tokenizer, artifacts of possessive forms like \"'s\" can also show up as separate tokens, although none appears in this list.) The next words (\"jupiter\" and \"legrand\") merit closer inspection; they may be proper names that have been transformed to lowercase. We can continue on like this with various observations and hypotheses, but really we probably want to have a closer look at individual occurrences to see what's happening. For that, we'll build a concordance." 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "## Building a Simple Concordance" 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "metadata": {}, 523 | "source": [ 524 | "A concordance allows us to see each occurrence of a term in its context. It has a rich history in textual scholarship, dating back to well before the advent of computers. 
It's a tool for studying word usage in context.\n", 525 | "\n", 526 | "The easiest way to build a concordance is to create an NLTK Text object from a list of word tokens (in this case we'll use the unfiltered list so that we can better read the text). So, for instance, we can ask for a concordance of \"de\" to try to better understand why it occurs so often in this English text." 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 31, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "name": "stdout", 536 | "output_type": "stream", 537 | "text": [ 538 | "Displaying 10 of 73 matches:\n", 539 | "ou , '' here interrupted Jupiter ; `` de bug is a goole bug , solid , ebery bi\n", 540 | "is your master ? '' `` Why , to speak de troof , massa , him not so berry well\n", 541 | " aint find nowhar -- dat 's just whar de shoe pinch -- my mind is got to be be\n", 542 | "taint worf while for to git mad about de matter -- Massa Will say noffin at al\n", 543 | " -- Massa Will say noffin at all aint de matter wid him -- but den what make h\n", 544 | "a gose ? And den he keep a syphon all de time -- '' '' Keeps a what , Jupiter \n", 545 | " , Jupiter ? '' `` Keeps a syphon wid de figgurs on de slate -- de queerest fi\n", 546 | "' `` Keeps a syphon wid de figgurs on de slate -- de queerest figgurs I ebber \n", 547 | " syphon wid de figgurs on de slate -- de queerest figgurs I ebber did see . Is\n", 548 | "vers . Todder day he gib me slip fore de sun up and was gone de whole ob de bl\n" 549 | ] 550 | } 551 | ], 552 | "source": [ 553 | "goldBugText = nltk.Text(goldBugTokens)\n", 554 | "goldBugText.concordance(\"de\", lines=10)" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "In the concordance view above all the occurrences of \"de\" are aligned to make scanning each occurrence easier." 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "## Next Steps" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "Here are some tasks to try:\n", 576 | "\n", 577 | "* Show a table of the top 20 words\n", 578 | " * Choose 3 words to add to the stop-words list using list concatenation\n", 579 | " * Regenerate the list of the top 20 words using your new stop-words list\n", 580 | "* Instead of testing for presence in the stopword list, how would you test for words that contain 10 characters or more?\n", 581 | "* Determine whether or not the word provided to the concordance function is case sensitive\n", 582 | "\n", 583 | "In the next notebook we're going to get [Graphical](GettingGraphical.ipynb)." 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": {}, 589 | "source": [ 590 | "---\n", 591 | "[CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Edited and revised by [Melissa Mony](http://melissamony.com).
Created February 7, 2015 and last modified January 14, 2018 (Jupyter 4)" 592 | ] 593 | } 594 | ], 595 | "metadata": { 596 | "kernelspec": { 597 | "display_name": "Python 3", 598 | "language": "python", 599 | "name": "python3" 600 | }, 601 | "language_info": { 602 | "codemirror_mode": { 603 | "name": "ipython", 604 | "version": 3 605 | }, 606 | "file_extension": ".py", 607 | "mimetype": "text/x-python", 608 | "name": "python", 609 | "nbconvert_exporter": "python", 610 | "pygments_lexer": "ipython3", 611 | "version": "3.7.1" 612 | } 613 | }, 614 | "nbformat": 4, 615 | "nbformat_minor": 1 616 | } 617 | --------------------------------------------------------------------------------