├── .gitignore_global ├── .gitmodules ├── LICENSE ├── README.md ├── hist3368-week1-intro-to-jupyter ├── README.md ├── hist3368-week1-first-time-setup.ipynb ├── hist3368-week1-intro-to-jupyter.ipynb └── hist3368-week1-this-is-not-a-string.ipynb ├── hist3368-week10-distinctiveness ├── README.md ├── edgar-data.csv ├── hist3368-week10-distinctiveness-for-teaching-purposes.ipynb ├── hist3368-week10-distinctiveness.ipynb └── hist3368-word-vectors-with-ngrams.ipynb ├── hist3368-week12-word-context-vectors ├── README.md ├── edgar-data.csv ├── hist3368-week12-word-context-vectors-w-gensim-PRE-SAVED-MODEL.ipynb ├── hist3368-week12-word-context-vectors-w-gensim-TEACHING.ipynb ├── hist3368-week12-word-context-vectors-w-gensim.ipynb ├── hist3368-week12-word-context-vectors-w-sklearn.ipynb └── hist3368-week12-word-context-vectors.ipynb ├── hist3368-week2-critical-word-count ├── README.md ├── hist3368-week2-cleaning-text.ipynb ├── hist3368-week2-counting-with-series-and-dictionaries.ipynb ├── hist3368-week2-for-loops.ipynb ├── hist3368-week2-more-on-lists-and-dictionaries.ipynb └── hist3368-week2-wordcloud-and-bar-plot.ipynb ├── hist3368-week3-ngrams-lemmatization-gender ├── README.md ├── hist3368-cleaning-lemmatizing-visualization-congress-pandas.ipynb ├── hist3368-week3-lemmatizing.ipynb ├── hist3368-week3-searching-bigrams.ipynb └── hist3368-week3-using-ngrams-to-interpret-gender.ipynb ├── hist3368-week4-wordnet-controlled-vocab ├── README.md ├── hist3368-week4-controlled-vocab.ipynb ├── hist3368-week4-using-wordnet-to-study-congress.ipynb └── hist3368-week4-wordnet.ipynb ├── hist3368-week5-plotting-change-over-time ├── README.md ├── edgar-data.csv ├── hist3368-week-5-working-with-tabular-data.ipynb ├── hist3368-week5-access-data.ipynb ├── hist3368-week5-grouping-data.ipynb ├── hist3368-week5-interpret-word-change-over-time.ipynb └── hist3368-week5-working-with-dates-over-time.ipynb ├── hist3368-week6-measuring-change-and-using-groupby ├── README.md ├── hist3368-week6-collocated-words-over-time-congress.ipynb ├── hist3368-week6-collocated-words-over-time.ipynb ├── hist3368-week6-controlled-vocab-over-time.ipynb └── hist3368-week6-datetime-applied-to-congress.ipynb ├── hist3368-week9-named-entities ├── hist3368-tutorial-on-speed.ipynb ├── hist3368-week9-named-entity-recognition-congress-w-kwic.ipynb ├── hist3368-week9-named-entity-recognition-congress-workhorse.ipynb ├── hist3368-week9-named-entity-recognition.ipynb └── read.me ├── images ├── README.md ├── connect_jupyter_1.png ├── data_team_fields.png ├── double_click.png ├── fields.png ├── file_open.png ├── memory.png ├── open_filep.png ├── pipe.jpg ├── pipe.png ├── resources_1.png ├── select_jupyter.png ├── source-spacy-m2.png └── thesearenotthesame.png ├── optional-notebooks ├── Melanie_Walsh-Topic-Modeling-Text-Files.ipynb ├── embeddings-read.me ├── hist3368-week8-metadata │ ├── README.md │ └── hist3368-week8-metadata.ipynb ├── hist3368-week9-advanced-parts-of-speech │ ├── README.md │ └── hist3368-week9-advanced-parts-of-speech.ipynb ├── read.me ├── research-w-congress-embeddings.ipynb ├── store-congress-embeddings-from-gensim-in-parallel.ipynb └── word-embeddings └── utilities ├── README.md ├── concat_gutenberg_noveltm.ipynb ├── export_spacy_doc.py ├── hansard_spacy.ipynb ├── parse_city_council.py ├── reddit_subset.R └── subset_reddit.R /.gitignore_global: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | -------------------------------------------------------------------------------- 
/.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "text_mining_with_python"] 2 | path = text_mining_with_python 3 | url = https://github.com/SouthernMethodistUniversity/text_mining_with_python 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text Mining as Historical Method 2 | HIST 3368 (undergrad) / HIST 6322 (grad) 3 | 4 | Meets 3:00-3:50 PM CT, MWF 5 | 6 | Meetings will be partly synchronous (via Zoom), partly asynchronous (via Canvas discussion boards, etc.). 7 | 8 | ### About the Course 9 | Computer-powered methods are changing the way that we access information about society. New methods help us to detect change over time, to identify influential figures, and to name turning points. What happens when we apply these tools to a million congressional debates or tweets? This course -- which is appropriate for computationalists as well as for those with a background in the humanities (but not in code) -- will teach students how to analyze texts as data for evidence of change over time. It is an introduction to the cutting-edge methodologies of textual analysis that are transforming the humanities today. 10 | 11 | ### About the GitHub Repository 12 | The purpose of this repository is to provide resources for digital history. It aggregates original Notebooks written by Jo Guldi or her research assistant, Steph Buongiorno; Notebooks written by Southern Methodist University's "Data Team" (Rob Kalescky and Eric Godat); and Notebooks written by scholars in the digital humanities. Authorial credit for copied/forked Notebooks is given in the associated README.md file located in each Notebook's parent directory. All code copied/forked from others' repositories is subject to the author's original licensing, not the licensing of the present repository. 13 | 14 | #### Initial Clone 15 | `git clone https://github.com/stephbuon/digital-history.git --recursive` 16 | 17 | #### Subsequent Updates 18 | From inside the `digital-history` directory: 19 | ``` 20 | git reset --hard 21 | git pull 22 | git submodule update --recursive 23 | ``` 24 | #### Setting Up our M2 Environment for the First Time 25 | 26 | [Log onto M2 and Load JupyterLab](https://docs.google.com/document/d/1gDEMDv6BMZHOx_OQ2GyRdrQQq9qIXJZaLEFUFsOni-s/edit?usp=sharing) 27 | 28 | -------------------------------------------------------------------------------- /hist3368-week1-intro-to-jupyter/README.md: -------------------------------------------------------------------------------- 1 | ## Configuring your session on M2 2 | 3 | 4 | 5 | 10 | 11 | __Memory__: `6` G 12 | 13 | ## Source Information 14 | The Notebook "Intro to Jupyter Notebook and Python Crash Course" (hist3368-week1-intro-to-jupyter.ipynb) was written by Jo Guldi at Southern Methodist University. It was inspired by Lauren Klein and forked from [laurenfklein](https://github.com/laurenfklein) at: 15 | [laurenfklein/emory-qtm340/notebooks/class3-jupyter-intro.ipynb](https://github.com/laurenfklein/emory-qtm340/blob/master/notebooks/class3-jupyter-intro.ipynb). 16 | 17 | The Notebook "This is Not a String" (hist3368-week1-this-is-not-a-string.ipynb) was written by Steph Buongiorno, a PhD candidate in applied science in engineering at Southern Methodist University. 
-------------------------------------------------------------------------------- /hist3368-week1-intro-to-jupyter/hist3368-week1-first-time-setup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "For citation information, please see the \"Source Information\" section listed in the associated README file: https://github.com/stephbuon/digital-history/tree/master/hist3368-week1-intro-to-jupyter" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# System Set Up" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Before running our first Notebook, let's set up our individual environment. \n", 22 | "\n", 23 | "We will be using Anaconda, which is a distribution of the Python and R programming languages for scientific computing. Scientific computing includes data science, machine learning, text mining, and more! \n", 24 | "\n", 25 | "Anaconda is generally favored in these fields over CPython, the base distribution offered by [python.org](https://www.python.org/), because Anaconda offers pre-configured settings useful to data scientists and analysts. A CPython installation is blank, while an Anaconda installation comes with 250 pre-installed packages.\n", 26 | "\n", 27 | "To see the full list of packages for 64-bit Linux (like M2), [click here](https://docs.anaconda.com/anaconda/packages/py3.7_linux-64/). " 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### Install Additional Packages" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "While most packages we are using for class come with Anaconda, some don't. The purpose of this section is to install those additional packages using the `pip` package manager. A package manager is a tool that lets users install and manage software that does not come bundled with a distribution like Anaconda."
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Requirement already satisfied: gutenberg in /users/dbalut/.local/lib/python3.6/site-packages (0.8.1)\n", 54 | "Requirement already satisfied: requests>=2.5.1 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from gutenberg) (2.22.0)\n", 55 | "Requirement already satisfied: six>=1.10.0 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from gutenberg) (1.12.0)\n", 56 | "Requirement already satisfied: rdflib<5.0.0,>=4.2.0 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from gutenberg) (4.2.2)\n", 57 | "Requirement already satisfied: setuptools>=18.5 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from gutenberg) (41.4.0)\n", 58 | "Requirement already satisfied: bsddb3>=6.1.0 in /users/dbalut/.local/lib/python3.6/site-packages (from gutenberg) (6.2.9)\n", 59 | "Requirement already satisfied: future>=0.15.2 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from gutenberg) (0.18.2)\n", 60 | "Requirement already satisfied: rdflib-sqlalchemy>=0.3.8 in /users/dbalut/.local/lib/python3.6/site-packages (from gutenberg) (0.4.0)\n", 61 | "Requirement already satisfied: SPARQLWrapper>=1.8.2 in /users/dbalut/.local/lib/python3.6/site-packages (from gutenberg) (1.8.5)\n", 62 | "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from requests>=2.5.1->gutenberg) (3.0.4)\n", 63 | "Requirement already satisfied: certifi>=2017.4.17 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from requests>=2.5.1->gutenberg) (2020.6.20)\n", 64 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from requests>=2.5.1->gutenberg) (1.25.6)\n", 65 | "Requirement already satisfied: idna<2.9,>=2.5 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from requests>=2.5.1->gutenberg) (2.8)\n", 66 | "Requirement already satisfied: isodate in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from rdflib<5.0.0,>=4.2.0->gutenberg) (0.5.4)\n", 67 | "Requirement already satisfied: pyparsing in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from rdflib<5.0.0,>=4.2.0->gutenberg) (2.4.2)\n", 68 | "Requirement already satisfied: alembic>=0.8.8 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from rdflib-sqlalchemy>=0.3.8->gutenberg) (1.2.1)\n", 69 | "Requirement already satisfied: SQLAlchemy>=1.1.4 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from rdflib-sqlalchemy>=0.3.8->gutenberg) (1.3.10)\n", 70 | "Requirement already satisfied: python-dateutil in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from alembic>=0.8.8->rdflib-sqlalchemy>=0.3.8->gutenberg) (2.8.0)\n", 71 | "Requirement already satisfied: Mako in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from alembic>=0.8.8->rdflib-sqlalchemy>=0.3.8->gutenberg) (1.1.0)\n", 72 | "Requirement already satisfied: python-editor>=0.3 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from alembic>=0.8.8->rdflib-sqlalchemy>=0.3.8->gutenberg) (1.0.4)\n", 73 | "Requirement already satisfied: MarkupSafe>=0.9.2 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from Mako->alembic>=0.8.8->rdflib-sqlalchemy>=0.3.8->gutenberg) (1.1.1)\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "!pip install --user 
gutenberg" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 2, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Requirement already satisfied: bs4 in /users/dbalut/.local/lib/python3.6/site-packages (0.0.1)\n", 91 | "Requirement already satisfied: beautifulsoup4 in /users/dbalut/.local/lib/python3.6/site-packages (from bs4) (4.9.3)\n", 92 | "Requirement already satisfied: soupsieve>1.2; python_version >= \"3.0\" in /users/dbalut/.local/lib/python3.6/site-packages (from beautifulsoup4->bs4) (2.2)\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "!pip install --user bs4" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 3, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "Requirement already satisfied: wordcloud in /users/dbalut/.local/lib/python3.6/site-packages (1.8.1)\n", 110 | "Requirement already satisfied: matplotlib in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from wordcloud) (2.2.5)\n", 111 | "Requirement already satisfied: numpy>=1.6.1 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from wordcloud) (1.17.3)\n", 112 | "Requirement already satisfied: pillow in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from wordcloud) (6.2.1)\n", 113 | "Requirement already satisfied: python-dateutil>=2.1 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from matplotlib->wordcloud) (2.8.0)\n", 114 | "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from matplotlib->wordcloud) (2.4.2)\n", 115 | "Requirement already satisfied: pytz in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from matplotlib->wordcloud) (2019.3)\n", 116 | "Requirement already satisfied: six>=1.10 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from matplotlib->wordcloud) (1.12.0)\n", 117 | "Requirement already satisfied: kiwisolver>=1.0.1 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from matplotlib->wordcloud) (1.1.0)\n", 118 | "Requirement already satisfied: cycler>=0.10 in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from matplotlib->wordcloud) (0.10.0)\n", 119 | "Requirement already satisfied: setuptools in /hpc/applications/anaconda/3/lib/python3.6/site-packages (from kiwisolver>=1.0.1->matplotlib->wordcloud) (41.4.0)\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "!pip install --user wordcloud" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 4, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "## Edgar will be addded here shortly " 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "### Download NLTK data:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 5, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "import nltk" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "[nltk_data] Downloading package stopwords to\n", 162 | "[nltk_data] /users/dbalut/nltk_data...\n", 163 | "[nltk_data] Package stopwords is already up-to-date!\n" 164 | ] 165 | }, 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "True" 170 | ] 171 | }, 172 | "execution_count": 6, 
173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "nltk.download('stopwords')" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 7, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "[nltk_data] Downloading package punkt to /users/dbalut/nltk_data...\n", 191 | "[nltk_data] Package punkt is already up-to-date!\n" 192 | ] 193 | }, 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "True" 198 | ] 199 | }, 200 | "execution_count": 7, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "nltk.download('punkt')" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python 3", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.6.7" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 4 238 | } 239 | -------------------------------------------------------------------------------- /hist3368-week1-intro-to-jupyter/hist3368-week1-intro-to-jupyter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "For citation information, please see the \"Source Information\" section listed in the associated README file: https://github.com/stephbuon/digital-history/tree/master/hist3368-week1-intro-to-jupyter" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Intro to Jupyter Notebook and Python Crash Course\n", 15 | "\n", 16 | "*Based off tutorials by [Lauren Klein](https://github.com/laurenfklein/emory-qtm340/tree/master/notebooks), [Alison Parish](https://github.com/aparrish/dmep-python-intro) and [Jinho Choi](https://github.com/emory-courses/data-science)*\n", 17 | "\n", 18 | "Jupyter Notebook is a browser-based platform for programming in Python, which allows you to intersperse executable code with chunks of text like this. It lets you (mostly) avoid the command line, and results in a file that is easy to read and easy to share with others. \n", 19 | "\n", 20 | "We'll be using Jupyter Notebooks for in-class exercises and for many of the homework assignments. In order to manage these various notebooks, I recommend using [GitHub Desktop](https://desktop.github.com/), which I'll discuss at the end of this class. I'm separating these discussions so that you don't get confused as to which platform does which thing, but the TL;dr version is that Jupyter is for writing and running code, and GitHub is for sharing it. \n", 21 | "\n", 22 | "## Here we go! \n", 23 | "\n", 24 | "A Jupyter Notebook consists of a number of \"cells,\" stacked on the page from top to bottom. Cells can have text or code in them. You can change a cell's type using the \"Cell\" menu at the top of the page; go to `Cell > Cell Type` and select either `Code` for Python code or `Markdown` for text. 
(You can also change this for the current cell using the drop-down menu in the toolbar.)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Your first cell\n", 32 | "\n", 33 | "**Let's make a text cell!**\n", 34 | "\n", 35 | "To make a new text cell below this one, do this:\n", 36 | "* Choose `+` from the menu at the top of this page (not the JupyterLab webpage, but the page for hist3368-week1). \n", 37 | "* Use the menu at the top of this page where it says 'Code' to change the *type* of the cell you just made to `Markdown`\n", 38 | "* Type some stuff and press `Ctrl-Enter` (or `Ctrl-Return` on some keyboards) \n", 39 | "\n", 40 | "Jupyter Notebook will \"render\" the text and display it on the page in rendered format. You can hit `Enter` or click in the cell to edit its contents again. \n", 41 | "\n", 42 | "You can format the text of a cell using a set of conventions called Markdown. Markdown is a simple language for marking up text with basic text formatting information (such as bold, italics, hyperlinks, tables, etc.). \n", 43 | "\n", 44 | "[Here is a Markdown tutorial](http://markdowntutorial.com/), although I prefer to just consult a cheat sheet like [this one](https://www.markdownguide.org/cheat-sheet/)." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Exercise: Make a text cell with some formatting using Markdown\n", 59 | "\n", 60 | "Here is your first exericse: Insert a new cell just below this one, and use the [Markdown cheat sheet](https://www.markdownguide.org/cheat-sheet/) to make some text **bold**, some text *italics*, and some:\n", 61 | "\n", 62 | "- text\n", 63 | "- in\n", 64 | "- a \n", 65 | "- list\n", 66 | "\n", 67 | "---" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Two helpful shortcuts \n", 75 | "\n", 76 | "Instead of pressing `Ctl-Enter`, you can press `Alt-Enter` to render the current cell and create a new cell just below. New cells will by default be `Code` cells. \n", 77 | "\n", 78 | "Another shortcut: instead of pressing `Ctl-Enter`, you can press `Shift-Enter` to render the current cell and move to the one below. \n", 79 | "\n", 80 | "Let's try pressing `Shift-Enter` now, so that we can learn about code cells. " 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Code cells\n", 88 | "\n", 89 | "Code cells-- which are actually the default type of cell in Jupyter-- use the same set of commands (`Ctl-Enter`, `Alt-Enter`, `Shift-Enter`, etc) to run chunks of Python code rather than just render text.\n", 90 | "\n", 91 | "Let's not beat around the bush! Click on the cell just below, and then press `Shift-Enter` to run your first chunk of code." 
92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 1, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "This is a code cell.\n", 104 | "\n", 105 | "Any Python code you type in this cell will be run when you press the 'Run' button\n", 106 | "up there, or when you press any of the keyboard shortcuts you just learned.\n", 107 | "\n", 108 | "If the code evaluates to something, or if it produces output, that output will be\n", 109 | "shown beneath the cell after you run it.\n", 110 | "\n", 111 | "Let's keep on going. Press Shift-Enter to see what this cell produces and keep going\n", 112 | "with our exercise.\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "print(\"This is a code cell.\")\n", 118 | "print(\"\")\n", 119 | "print(\"Any Python code you type in this cell will be run when you press the 'Run' button\")\n", 120 | "print(\"up there, or when you press any of the keyboard shortcuts you just learned.\")\n", 121 | "print(\"\")\n", 122 | "print(\"If the code evaluates to something, or if it produces output, that output will be\")\n", 123 | "print(\"shown beneath the cell after you run it.\")\n", 124 | "print(\"\")\n", 125 | "print(\"Let's keep on going. Press Shift-Enter to see what this cell produces and keep going\")\n", 126 | "print(\"with our exercise.\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "**Sidenote: You just learned how the print function works in Python 3!**\n", 134 | "\n", 135 | "[Obligatory meme](https://twitter.com/cszhu/status/1163476712044150790)\n", 136 | "\n", 137 | "[Here](https://docs.python.org/3.3/whatsnew/3.0.html) are a few more examples of how to print things. \n", 138 | "\n", 139 | "[Here](https://realpython.com/python-print/) is more about `print()` than you likely ever wanted to know." 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Errors \n", 147 | "\n", 148 | "If your code generates an error, it will be displayed in addition to any output already produced. \n", 149 | "\n", 150 | "Let's commit a cardinal sin: dividing by zero.\n", 151 | "\n", 152 | "Run the cell below." 
153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 2, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "Here is some printing.\n", 165 | "And now here is an error:\n" 166 | ] 167 | }, 168 | { 169 | "ename": "ZeroDivisionError", 170 | "evalue": "division by zero", 171 | "output_type": "error", 172 | "traceback": [ 173 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 174 | "\u001b[0;31mZeroDivisionError\u001b[0m Traceback (most recent call last)", 175 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"And now here is an error:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;36m1\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 176 | "\u001b[0;31mZeroDivisionError\u001b[0m: division by zero" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "print(\"Here is some printing.\")\n", 182 | "print(\"And now here is an error:\")\n", 183 | "\n", 184 | "1 / 0" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "## Variables\n", 192 | "\n", 193 | "The major difference between Python and R-- at least in terms of getting started-- is that in Python, you can only use `=` to assign variables. \n", 194 | "\n", 195 | "Let's assign some variables:" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 3, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "x = 0\n", 205 | "y = .5\n", 206 | "z = True # note another difference b/t R and Python: Boolean (T/F) values are not all caps " 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 4, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "0" 218 | ] 219 | }, 220 | "execution_count": 4, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "x" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "**A neat thing about notebooks:**\n", 234 | "\n", 235 | "Any variables you define or modules you import in one code cell will be available in subsequent code cells. \n", 236 | "\n", 237 | "So for instance, if you run the cell below:" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 5, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "flea market\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "import random # a useful module that we'll come back to later \n", 255 | "stuff = [\"cheddar\", \"daguerrotype\", \"elephant\", \"flea market\"]\n", 256 | "\n", 257 | "print(random.choice(stuff)) # choice is a function that-- you guessed it-- makes a random choice" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "... 
then in subsequent cells you can do this:" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 6, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "elephant\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "print(random.choice(stuff)) # choice is a function that-- you guessed it-- makes a random choice" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "## Exercise: Write some code and then run it\n", 289 | "\n", 290 | "Using the previous two code cells as a guide...\n", 291 | "- Reassign the \"stuff\" variable with four different words\n", 292 | "- Use the \"choice\" function, as above, to randomly print one of those four words " 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "**FYI:** \"stuff\" is a list, which is a type of value in Python which represents a sequence of values. It's a very common and versatile data structure and is often used to represent tabular data (among other information).\n", 307 | "\n", 308 | "More on lists shortly. But first: " 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## Keyboard shortcuts\n", 330 | "\n", 331 | "As mentioned above, `Ctrl-Enter` runs the current cell; `Alt-Enter` runs the current cell and then creates a new cell. `Enter` will start editing whichever cell is currently selected. To quit editing a cell, hit `Esc`. If the cursor isn't currently active in any cell (i.e., after you've hit `Esc`), a number of other keyboard shortcuts are available to you:\n", 332 | "\n", 333 | "* `m` converts the selected cell to a Markdown cell\n", 334 | "* `b` inserts a new cell below the selected one\n", 335 | "* `x` \"cuts\" the selected cell; `v` pastes a previously cut cell below the selected cell\n", 336 | "* `h` brings up a help screen with many more shortcuts." 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "## Saving your work\n", 344 | "\n", 345 | "Hit `Cmd-S` at any time to save your notebook. Jupyter Notebook also automatically saves occasionally. Make sure to give your notebook a descriptive title by clicking on \"Untitled0\" at the top of the page and replacing the text accordingly. Notebooks you save will be available on your local server whenever you log in again, from wherever you saved them. \n", 346 | "\n", 347 | "You can also \"download\" your notebook in various formats via `File > Download as`. You can download your notebook as a static HTML file (for, e.g., uploading to a web site), or as a `.ipynb` file, which you can share with other people who have Jupyter Notebook-- such as me when you need to submit a homework. \n", 348 | "\n", 349 | "There are also a couple of platforms that allow you to view notebooks online, such as [nbviewer](http://nbviewer.ipython.org/). Note that the notebook needs to already exist online for nbviewer to work. 
\n", 350 | "\n", 351 | "Re: that final point, [GitHub](github.com) is a good default place to store notebooks. " 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "## GitHub Digression\n", 359 | "\n", 360 | "*forking, cloning, syncing*" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "## Permissions Digression\n", 368 | "\n", 369 | "Do you have permission to access our class folder on M2? Load a file from our folder to find out: " 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 7, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "ename": "PermissionError", 379 | "evalue": "[Errno 13] Permission denied: '/scratch/group/history/hist_3368-jguldi/permissions_check.txt'", 380 | "output_type": "error", 381 | "traceback": [ 382 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 383 | "\u001b[0;31mPermissionError\u001b[0m Traceback (most recent call last)", 384 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/scratch/group/history/hist_3368-jguldi/permissions_check.txt'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mline\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 385 | "\u001b[0;31mPermissionError\u001b[0m: [Errno 13] Permission denied: '/scratch/group/history/hist_3368-jguldi/permissions_check.txt'" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "with open('/scratch/group/history/hist_3368-jguldi/permissions_check.txt') as f:\n", 391 | " print([line for line in f])" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "If you don't have permission, please email Richard England directly to let him know. His email is: rengland at smu dot edu." 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "## Resources for Basic Python \n", 406 | "\n", 407 | "In general, I'll introduce concepts as we encounter them. But a few more basic things to keep in mind that are different than in R.\n", 408 | "\n", 409 | "Also, this course exists: [https://www.datacamp.com/courses/python-for-r-users](https://www.datacamp.com/courses/python-for-r-users)\n", 410 | "\n", 411 | "## Strings: You Can Concatinate then with \"+\"\n", 412 | "\n", 413 | "Unlike R, you can use the \"+\" operator to concatinate two strings together. 
" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 8, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/plain": [ 424 | "250" 425 | ] 426 | }, 427 | "execution_count": 8, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "5*50" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 9, 439 | "metadata": {}, 440 | "outputs": [ 441 | { 442 | "name": "stdout", 443 | "output_type": "stream", 444 | "text": [ 445 | "helloworld\n" 446 | ] 447 | } 448 | ], 449 | "source": [ 450 | "foo = \"hello\"\n", 451 | "bar = \"world\"\n", 452 | "\n", 453 | "\n", 454 | "foobar = foo + bar\n", 455 | "\n", 456 | "print(foobar)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "# If you're totally new to Python" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "If you're totally new to Python, you might need some more time drilling with the basic operations to really master them. The following notebooks are short -- and totally optional. However, I encourage you to \n", 471 | "\n", 472 | "lists : https://github.com/laurenfklein/emory-qtm340/blob/master/notebooks/lists.ipynb\n", 473 | "expressions and strings : https://github.com/laurenfklein/emory-qtm340/blob/master/notebooks/expressions-and-strings.ipynb\n", 474 | "dictionaries, sets, tuples: https://github.com/laurenfklein/emory-qtm340/blob/master/notebooks/dictionaries-sets-tuples.ipynb\n", 475 | "\n", 476 | "for loops : https://problemsolvingwithpython.com/09-Loops/09.01-For-Loops/\n", 477 | "counting things: https://github.com/laurenfklein/emory-qtm340/blob/master/notebooks/counting.ipynb" 478 | ] 479 | } 480 | ], 481 | "metadata": { 482 | "kernelspec": { 483 | "display_name": "Python 3", 484 | "language": "python", 485 | "name": "python3" 486 | }, 487 | "language_info": { 488 | "codemirror_mode": { 489 | "name": "ipython", 490 | "version": 3 491 | }, 492 | "file_extension": ".py", 493 | "mimetype": "text/x-python", 494 | "name": "python", 495 | "nbconvert_exporter": "python", 496 | "pygments_lexer": "ipython3", 497 | "version": "3.6.7" 498 | } 499 | }, 500 | "nbformat": 4, 501 | "nbformat_minor": 4 502 | } 503 | -------------------------------------------------------------------------------- /hist3368-week10-distinctiveness/README.md: -------------------------------------------------------------------------------- 1 | ## Configuring your session on M2 2 | 3 | 10 | 11 | __Memory__: `64` G 12 | 13 | ## Source Information 14 | 15 | The Notebook, "Calculating Log-Liklihood" (hist3368-week10-distictiveness.ipynb), was written by Jo Guldi. 
16 | -------------------------------------------------------------------------------- /hist3368-week10-distinctiveness/hist3368-week10-distinctiveness-for-teaching-purposes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hist 3368 - Week 10: Measuring Distinctiveness with Log Likelihood " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# For teaching purposes" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "#### Demonstrating word vectors" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 11, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "mystring = [\"Peter Piper picked a peck of pickled peppers. A peck of pickled peppers Peter Piper picked. If Peter Piper picked a peck of pickled peppers, Where's the peck of pickled peppers Peter Piper picked?\"]" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 22, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "['if', 'of', 'peck', 'peppers', 'peter', 'picked', 'pickled', 'piper', 'the', 'where']\n", 43 | "[[1 4 4 4 4 4 4 4 1 1]]\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "import numpy as np, pandas as pd\n", 49 | "from sklearn.feature_extraction.text import CountVectorizer\n", 50 | "vectorizer = CountVectorizer(stop_words = None)\n", 51 | "\n", 52 | "vectors = vectorizer.fit_transform(mystring)\n", 53 | "print(vectorizer.get_feature_names())\n", 54 | "print(vectors.toarray())" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Summary of Notebook" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "#### Import Software" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "#import some software\n", 78 | "import pandas as pd\n", 79 | "import numpy as np\n", 80 | "from sklearn.feature_extraction.text import CountVectorizer\n", 81 | "from nltk import word_tokenize \n", 82 | "from nltk.stem import WordNetLemmatizer \n", 83 | "class LemmaTokenizer(object):\n", 84 | " def __init__(self):\n", 85 | " self.wnl = WordNetLemmatizer()\n", 86 | " def __call__(self, articles):\n", 87 | " return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]\n", 88 | "import adjustText\n", 89 | "import matplotlib" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "#### Load some Data" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "cd /scratch/group/history/hist_3368-jguldi" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "congress = pd.read_csv(\"congress1967-2010.csv\")\n", 115 | "#congress = pd.read_csv(\"eighties_data.csv\")" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "#### Cleaning" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# clean up congress\n", 132 | "congress = congress[congress['year'] == 1968]\n", 133 | 
"clean_congress = congress.copy()\n", 134 | "clean_congress['speech'] = clean_congress['speech'].str.replace('[^\\w\\s]','') # remove punctuation\n", 135 | "clean_congress['speech'] = clean_congress['speech'].str.replace('\\d+', '') # for digits\n", 136 | "clean_congress['speech'] = clean_congress['speech'].str.replace(r'(\\b\\w{1}\\b)', '') # for short words" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "congress[:5]" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "clean_congress[:5]" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "#### Format Data Around Units of Interest With One String per Unit" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# format the data around our research into speakers\n", 171 | "top_speakers = clean_congress.groupby('speaker').agg({'speech': ' '.join, 'word_count': 'sum'}).sort_values('word_count', ascending = False)[:10]\n", 172 | "top_speakers = top_speakers[top_speakers.index != 'The PRESIDING OFFICER']" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "top_speakers" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "#### Make Word Vectors -- One for Each Unit of Interest" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# make a word vector and get some information from it\n", 198 | "vectorizer = CountVectorizer(\n", 199 | " max_features=10000, \n", 200 | " lowercase=True, \n", 201 | " stop_words = 'english',\n", 202 | " ngram_range=(1, 2), \n", 203 | " analyzer = \"word\",\n", 204 | " #tokenizer=LemmaTokenizer()\n", 205 | " )\n", 206 | "\n", 207 | "vectorized = vectorizer.fit_transform(top_speakers['speech'])\n", 208 | "vectorized" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "#### Make the Word Vectors Readable" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 3, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "ename": "NameError", 225 | "evalue": "name 'np' is not defined", 226 | "output_type": "error", 227 | "traceback": [ 228 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 229 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 230 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mall_words\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvectorizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_feature_names\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mspeaker_names\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtop_speakers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maxes\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_numpy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m vectors_dataframe = pd.DataFrame(vectorized.todense(), # the matrix we saw above is turned into a dataframe\n\u001b[1;32m 5\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mall_words\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 231 | "\u001b[0;31mNameError\u001b[0m: name 'np' is not defined" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "all_words = np.array(vectorizer.get_feature_names())\n", 237 | "speaker_names = list(top_speakers.axes[0].to_numpy())\n", 238 | "\n", 239 | "vectors_dataframe = pd.DataFrame(vectorized.todense(), # the matrix we saw above is turned into a dataframe\n", 240 | " columns=all_words,\n", 241 | " index = speaker_names\n", 242 | " )\n", 243 | "vectors_dataframe" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "#### Compute some baseline numbers about the data" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "speaker_words_total = vectors_dataframe.sum(axis=1)\n", 260 | "word_totals = vectors_dataframe.sum(axis=0) \n", 261 | "total_corpus_words = sum(word_totals)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "#### Measure distinctiveness with log likelihood" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "## create an empty dataframe\n", 278 | "speakers_loglikelihood = []\n", 279 | "\n", 280 | "## loop through every speaker in speaker_names\n", 281 | "for speaker_id, speaker in enumerate(speaker_names):\n", 282 | " loglikely = []\n", 283 | " # loop through every word in the wordcount vector:\n", 284 | " for word_id in vectorized[speaker_id].indices:\n", 285 | " \n", 286 | " a = vectors_dataframe.iat[speaker_id, word_id] # word in speaker\n", 287 | " b = word_totals[word_id] - a # # word in remaining speakers\n", 288 | " c = speaker_words_total[speaker_id] - a # not word in speaker\n", 289 | " d = total_corpus_words - a - b - c # not word in remaining speakers\n", 290 | " \n", 291 | " E1 = (a + c) * (a + b) / total_corpus_words \n", 292 | " E2 = (b + d) * (a + b) / total_corpus_words \n", 293 | " \n", 294 | " LL = 2 * (a * np.log(a / E1)) # the log likelihood equation\n", 295 | " if (b > 0):\n", 296 | " LL += 2 * b * np.log(b / E2)\n", 297 | " \n", 298 | " loglikely.append((LL, all_words[word_id])) # add the log likelihood score to the end of a new dataframe\n", 299 | "\n", 300 | " loglikely = sorted(loglikely, reverse=True) # the loop hits this every time it cycles through all the words in one speaker. \n", 301 | " speakers_loglikelihood.append(loglikely) # add on another speaker\n" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "print(\"The 20 most distinctive words of each speaker. 
The words are listed from high to low ranking\")\n", 311 | "print(\"-------------------------------------------\\n\")\n", 312 | "for i, speaker in enumerate(speaker_names):\n", 313 | " print(speaker + \": \", end='')\n", 314 | " distinct_words = [word[1] for word in speakers_loglikelihood[i][:20]]\n", 315 | " print(distinct_words)\n", 316 | " print(\"\\n-----------------------------\\n\")" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "## Visualizing the distinctiveness of the language of each speaker" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Note that we're going to make a big plot.\n", 331 | "\n", 332 | "***This might take a minute or two to run.***" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "cd ~/digital-history" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "%matplotlib inline\n", 351 | "from adjustText import adjust_text\n", 352 | "\n", 353 | "# change the figure's size here\n", 354 | "plt.figure(figsize=(10,10), dpi = 500)\n", 355 | "\n", 356 | "# style\n", 357 | "plt.style.use('seaborn-darkgrid') # this gives us a grid with a dark background. you can play with this to change the style.\n", 358 | " \n", 359 | "# create a color palette\n", 360 | "palette = plt.get_cmap('hsv') # this tells matplotlib what colors to use. you can play with this to change the colors.\n", 361 | "\n", 362 | "# start a counter at 0\n", 363 | "num = 0\n", 364 | "\n", 365 | "# create an empty list\n", 366 | "texts = []\n", 367 | "\n", 368 | "# this is the for loop that creates multiple plots. 
\n", 369 | "for i, speaker in enumerate(speaker_names):\n", 370 | " num += 14 # num tells the plot to choose a different color for each speaker\n", 371 | " distinct_words = speakers_loglikelihood[i][:20] # plot the top twenty words by LL-score\n", 372 | " for word in distinct_words: # for each word-per-speaker instance, plot the ll_score on the y axis\n", 373 | " ll_score = word[0] # find just the ll-score from speakers_loglikelihood\n", 374 | " word_label = word[1] # find just the keyword name from speakers_loglikelihood\n", 375 | " plt.scatter( # draw a dot for each word\n", 376 | " speaker, # with speaker on the x axis\n", 377 | " ll_score, # and ll_score on the y axis\n", 378 | " color=palette(num), # using a different color for each speaker\n", 379 | " linewidth=1, \n", 380 | " edgecolors = 'b',\n", 381 | " s = 55, # size\n", 382 | " alpha=0.3, # make the dots slightly transparent\n", 383 | " label=speaker) # label each dot with the name of the word\n", 384 | " texts.append(plt.text(speaker, ll_score, word_label)) # save these coordinates to be used in labeling\n", 385 | "\n", 386 | "# Add legend\n", 387 | "plt.xticks(rotation=90)\n", 388 | " \n", 389 | "# Add titles\n", 390 | "plt.title(\"Figure 1: Highest Log-Likelihood Scored Words per Speaker\", fontsize=30, fontweight=0, color='Red')\n", 391 | "plt.xlabel(\"Speaker\")\n", 392 | "plt.ylabel(\"Distinctiveness of Words, Measured by LL Score\")\n", 393 | "\n", 394 | "# Code to help with overlapping labels -- may take a minute to run\n", 395 | "adjust_text(texts, force_points=0.2, force_text=0.2,\n", 396 | " expand_points=(1, 1), expand_text=(1, 1),\n", 397 | " arrowprops=dict(arrowstyle=\"-\", color='black', lw=0.5))\n", 398 | "\n", 399 | "# save it\n", 400 | "fig1 = plt.gcf()\n", 401 | "plt.show()\n", 402 | "plt.draw()\n", 403 | "fig1.savefig('LL-fig1.jpg', dpi=500)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "### A 2-D comparison of two speakers" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "#### Create a dataframe from just two speakers" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | " ***The x coordinate will be how distinctive each word is for Javits; the y coordinate will be how distinctive each word is for Long. 
Change the speakers in question by changingn in speakers_loglikelihood[n].***" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "# create a new dataframe of the scores and words from both Javits and Long\n", 441 | "xcoords = pd.DataFrame(columns=list(['word_label', 'x_llscore']))\n", 442 | "ycoords = pd.DataFrame(columns=list(['word_label', 'y_llscore']))\n", 443 | "\n", 444 | "# get all the words from JAVITS\n", 445 | "distinctwords = speakers_loglikelihood[0]\n", 446 | "for word in distinctwords: # for each word-per-cspeaker instance, plot the ll_score on the y axis\n", 447 | " x_llscore = word[0] # find just the ll-score from speakers_loglikelihood\n", 448 | " word_label = word[1] # find just the keyword name from speakers_loglikelihood\n", 449 | " speaker = 'JAVITS'\n", 450 | " xcoords.loc[-1] = [word_label, x_llscore]\n", 451 | " xcoords.index = xcoords.index + 1 # shifting index\n", 452 | " xcoords = xcoords.sort_index() # sorting by index\n", 453 | "\n", 454 | "# get all the words from LONG\n", 455 | "distinctwords2 = speakers_loglikelihood[1] \n", 456 | "for word in distinctwords2: # for each word-per-speaker instance, plot the ll_score on the y axis\n", 457 | " y_llscore = word[0] # find just the ll-score from speakers_loglikelihood\n", 458 | " word_label = word[1] # find just the keyword name from speakers_loglikelihood\n", 459 | " speaker = 'LONG'\n", 460 | " ycoords.loc[-1] = [word_label, y_llscore]\n", 461 | " ycoords.index = ycoords.index + 1 # shifting index\n", 462 | " ycoords = ycoords.sort_index() # sorting by index\n", 463 | "\n", 464 | "coords = pd.merge(xcoords, ycoords, on='word_label')\n", 465 | "coords = coords.dropna(axis = 0, how ='any') # drop rows with any NA's" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "#!pip install adjustText --user" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "%matplotlib inline\n", 484 | "\n", 485 | "from adjustText import adjust_text\n", 486 | "\n", 487 | "# change the figure's size here\n", 488 | "plt.figure(figsize=(10,10), dpi = 500)\n", 489 | "\n", 490 | "# label each dot with the name of the word -- note that we have to use a \"for\" loop for this to work; plt.annotate only plots\n", 491 | "# one label per iteration!\n", 492 | "for i, txt in enumerate(coords['word_label']):\n", 493 | " # draw a dot for each word\n", 494 | " plt.scatter( \n", 495 | " coords['x_llscore'][i], #x axis\n", 496 | " coords['y_llscore'][i], # y axis\n", 497 | " linewidth=1, \n", 498 | " s = 55, # dot size\n", 499 | " alpha=0.2) # dot transparency\n", 500 | " # make a label for each word\n", 501 | " plt.annotate(\n", 502 | " txt, \n", 503 | " (coords['x_llscore'][i], # one x llscore at a time\n", 504 | " coords['y_llscore'][i]), # one y llscore at a time\n", 505 | " alpha=0.3 # i've made the fonts transparent as well. you could play with color and size if you wanted to. 
\n", 506 | " )\n", 507 | "\n", 508 | "plt.xticks(rotation=90)\n", 509 | "\n", 510 | "# logarithmic axes make big things big and small things small\n", 511 | "plt.xscale('log')\n", 512 | "plt.yscale('log') \n", 513 | "\n", 514 | "# Add titles\n", 515 | "plt.title(\"Figure 2: Highest Log-Likelihood Scored Words per Speaker\", fontsize=30, fontweight=0, color='Red')\n", 516 | "plt.xlabel(\"How Distinctive Each Word is of Mr. Javits\")\n", 517 | "plt.ylabel(\"How Distinctive Each Word is of Mr. Long\")\n", 518 | "\n", 519 | "\n", 520 | "# save it\n", 521 | "fig1 = plt.gcf()\n", 522 | "plt.show()\n", 523 | "plt.draw()\n", 524 | "fig1.savefig('LL-fig2.jpg', dpi=500)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": {}, 530 | "source": [ 531 | "# From Data to Analysis" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "cd /scratch/group/history/hist_3368-jguldi" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "congress = pd.read_csv(\"congress1967-2010.csv\")\n", 550 | "congress = congress[congress['year'] == 1968]\n", 551 | "top_speakers_speeches = congress[congress['speaker'].isin(top_speakers.index)]" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "#### Setup: Make a KWIC Dictionary for a Single Speaker" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "import string \n", 568 | "long_speeches = top_speakers_speeches[top_speakers_speeches['speaker'] == \"Mr. LONG of Louisiana\"]['speech']\n", 569 | "long_speeches = ' '.join(long_speeches).lower() # join back together and lowercase\n", 570 | "long_speeches = long_speeches.translate(str.maketrans('', '', string.punctuation)) # remove punctuation" 571 | ] 572 | }, 573 | { 574 | "cell_type": "markdown", 575 | "metadata": {}, 576 | "source": [ 577 | "Add some functions for defining ngrams " 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "def getNGrams(wordlist, n):\n", 587 | " ngrams = []\n", 588 | " for i in range(len(wordlist)-(n-1)):\n", 589 | " ngrams.append(wordlist[i:i+n])\n", 590 | " return ngrams\n", 591 | "\n", 592 | "def nGramsToKWICDict(ngrams):\n", 593 | " keyindex = len(ngrams[0]) // 2\n", 594 | "\n", 595 | " kwicdict = {}\n", 596 | "\n", 597 | " for k in ngrams:\n", 598 | " if k[keyindex] not in kwicdict:\n", 599 | " kwicdict[k[keyindex]] = [k]\n", 600 | " else:\n", 601 | " kwicdict[k[keyindex]].append(k)\n", 602 | " return kwicdict\n", 603 | "\n", 604 | "\n", 605 | "def prettyPrintKWIC(kwic):\n", 606 | " n = len(kwic)\n", 607 | " keyindex = n // 2\n", 608 | " width = 20\n", 609 | "\n", 610 | " outstring = ' '.join(kwic[:keyindex]).rjust(width*keyindex)\n", 611 | " outstring += str(kwic[keyindex]).center(len(kwic[keyindex])+6)\n", 612 | " outstring += ' '.join(kwic[(keyindex+1):])\n", 613 | "\n", 614 | " return outstring\n", 615 | "\n" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "fullwordlist = long_speeches.split()\n", 625 | "ngrams = getNGrams(fullwordlist, 20)\n", 626 | "worddict = nGramsToKWICDict(ngrams)" 627 | ] 628 | }, 629 | { 630 | "cell_type": 
"markdown", 631 | "metadata": {}, 632 | "source": [ 633 | "#### Doing Research With Log Likelihood and KWIC" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "speakers_loglikelihood[1][:5]" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "# output KWIC for target word\n", 652 | "target = 'gun'\n", 653 | "outstr = '
<pre>'\n",
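    "# worddict (built with nGramsToKWICDict above) maps each middle word of a 20-gram to the list of 20-word passages in which it appears\n",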
654 |     "if target in worddict:\n",
655 |     "    for k in worddict[target]:\n",
656 |     "        outstr += prettyPrintKWIC(k)\n",
657 |     "        outstr += '             '\n",
658 |     "else:\n",
659 |     "    outstr += 'Keyword not found in source'\n",
660 |     "\n",
661 |     "outstr += '</pre>
'\n", 662 | "outstr" 663 | ] 664 | } 665 | ], 666 | "metadata": { 667 | "kernelspec": { 668 | "display_name": "Python 3", 669 | "language": "python", 670 | "name": "python3" 671 | }, 672 | "language_info": { 673 | "codemirror_mode": { 674 | "name": "ipython", 675 | "version": 3 676 | }, 677 | "file_extension": ".py", 678 | "mimetype": "text/x-python", 679 | "name": "python", 680 | "nbconvert_exporter": "python", 681 | "pygments_lexer": "ipython3", 682 | "version": "3.7.4" 683 | } 684 | }, 685 | "nbformat": 4, 686 | "nbformat_minor": 4 687 | } 688 | -------------------------------------------------------------------------------- /hist3368-week10-distinctiveness/hist3368-word-vectors-with-ngrams.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hist 3368\n", 8 | "\n", 9 | "## Word Vectors With Ngrams" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### Play Around With Count_Vectorizer " 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "You may be thinking that this was a lot of work for a wordcount dataframe that you could have made in simpler ways. But you've just learned an incredibly powerful tool. " 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 559, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "vectorizer2 = CountVectorizer(max_features=100000, \n", 33 | " lowercase=True, \n", 34 | " stop_words = 'english',\n", 35 | " ngram_range=(1, 1), # <-- to see how this works, consider changing the arguments here from (1, 2) to (3, 4). That's a lower and an upper bound.\n", 36 | " analyzer = \"word\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 560, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "vectorized2 = vectorizer2.fit_transform(top_speakers['speech'])" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 561, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "all_words = np.array(vectorizer2.get_feature_names())\n", 55 | "\n", 56 | "vectors_dataframe2 = pd.DataFrame(vectorized2.todense(), # the matrix we saw above is turned into a dataframe\n", 57 | " columns=all_words,\n", 58 | " index = speaker_names\n", 59 | " )" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 562, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "(9, 32892)" 71 | ] 72 | }, 73 | "execution_count": 562, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "vectorized2.shape" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 563, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/html": [ 90 | "
[HTML table view of vectors_dataframe2 (9 rows x 32892 columns); the markup was lost, and the same table appears in the text/plain output below]
" 352 | ], 353 | "text/plain": [ 354 | " _concerned _gasoline _n _percent _tinue aa \\\n", 355 | "Mr. JAVITS 0 0 0 0 0 0 \n", 356 | "Mr. LONG of Louisiana 0 0 0 0 1 0 \n", 357 | "Mr. MANSFIELD 0 0 0 1 0 0 \n", 358 | "Mr. BYRD of West Virginia 0 0 1 0 0 0 \n", 359 | "Mr. WILLIAMS of Delaware 0 0 0 0 0 0 \n", 360 | "Mr. PROXMIRE 0 1 0 0 0 1 \n", 361 | "Mr. DODD 0 0 0 0 0 0 \n", 362 | "Mr. HOLLAND 1 0 0 0 0 0 \n", 363 | "Mr. TYDINGS 0 0 0 0 0 0 \n", 364 | "\n", 365 | " aaa aap aaron aauw ... zonethe zoning \\\n", 366 | "Mr. JAVITS 2 0 0 0 ... 0 0 \n", 367 | "Mr. LONG of Louisiana 0 0 0 0 ... 1 0 \n", 368 | "Mr. MANSFIELD 0 0 0 0 ... 0 0 \n", 369 | "Mr. BYRD of West Virginia 0 0 2 0 ... 0 1 \n", 370 | "Mr. WILLIAMS of Delaware 1 0 0 0 ... 0 0 \n", 371 | "Mr. PROXMIRE 0 4 0 4 ... 0 0 \n", 372 | "Mr. DODD 0 0 0 0 ... 0 0 \n", 373 | "Mr. HOLLAND 0 0 0 0 ... 0 1 \n", 374 | "Mr. TYDINGS 1 0 2 0 ... 0 1 \n", 375 | "\n", 376 | " zoomed zooming zurich zurichwhich zweibrucken \\\n", 377 | "Mr. JAVITS 0 0 1 1 0 \n", 378 | "Mr. LONG of Louisiana 0 0 0 0 0 \n", 379 | "Mr. MANSFIELD 0 0 0 0 1 \n", 380 | "Mr. BYRD of West Virginia 0 0 0 0 0 \n", 381 | "Mr. WILLIAMS of Delaware 0 0 0 0 0 \n", 382 | "Mr. PROXMIRE 1 2 0 0 0 \n", 383 | "Mr. DODD 0 0 0 0 0 \n", 384 | "Mr. HOLLAND 0 0 0 0 0 \n", 385 | "Mr. TYDINGS 0 0 0 0 0 \n", 386 | "\n", 387 | " zwich zwick zwicks \n", 388 | "Mr. JAVITS 0 0 0 \n", 389 | "Mr. LONG of Louisiana 0 5 2 \n", 390 | "Mr. MANSFIELD 0 0 0 \n", 391 | "Mr. BYRD of West Virginia 0 0 0 \n", 392 | "Mr. WILLIAMS of Delaware 1 33 0 \n", 393 | "Mr. PROXMIRE 0 2 0 \n", 394 | "Mr. DODD 0 0 0 \n", 395 | "Mr. HOLLAND 0 0 0 \n", 396 | "Mr. TYDINGS 0 2 0 \n", 397 | "\n", 398 | "[9 rows x 32892 columns]" 399 | ] 400 | }, 401 | "execution_count": 563, 402 | "metadata": {}, 403 | "output_type": "execute_result" 404 | } 405 | ], 406 | "source": [ 407 | "vectors_dataframe2" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "Pretty cool, eh? We're not going to do anything with this -- although you certainly could use it for the exercise that follows. 
\n", 415 |     "\n", 416 |     "You can actually use the features of CountVectorizers to do all sorts of things -- to clean your data, stopword it, lowercase it, and even implement a controlled vocabulary.\n", 417 |     "\n", 418 |     "For instance, if I had a controlled vocabulary called my_dict and a stopwords list called stop_words_list, I could do this:\n", 419 |     "\n", 420 |     "    CountVectorizer(lowercase=True, \n", 421 |     "                 stop_words = stop_words_list,\n", 422 |     "                 vocabulary = my_dict,\n", 423 |     "                 ngram_range=(3, 4),\n", 424 |     "                 analyzer = \"word\")\n", 425 |     "\n", 426 |     "\n", 427 |     "Read up more here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html" 428 |    ] 429 |   } 430 |  ], 431 |  "metadata": { 432 |   "kernelspec": { 433 |    "display_name": "Python 3", 434 |    "language": "python", 435 |    "name": "python3" 436 |   }, 437 |   "language_info": { 438 |    "codemirror_mode": { 439 |     "name": "ipython", 440 |     "version": 3 441 |    }, 442 |    "file_extension": ".py", 443 |    "mimetype": "text/x-python", 444 |    "name": "python", 445 |    "nbconvert_exporter": "python", 446 |    "pygments_lexer": "ipython3", 447 |    "version": "3.6.7" 448 |   } 449 |  }, 450 |  "nbformat": 4, 451 |  "nbformat_minor": 4 452 | } 453 | -------------------------------------------------------------------------------- /hist3368-week12-word-context-vectors/README.md: -------------------------------------------------------------------------------- 1 | ## Configuring your session on M2 2 | 3 |  10 | 11 | __Memory__: `64` 12 | 13 | ## Source Information 14 | The Notebook, "Word Embedding Models: word2vec" (hist3368-week12-word-context-vectors.ipynb), was written by Jo Guldi. 15 | -------------------------------------------------------------------------------- /hist3368-week2-critical-word-count/README.md: -------------------------------------------------------------------------------- 1 | ## Configuring your session on M2 2 | 3 |  10 | 11 | __Memory__: `6` G 12 | 13 | ## Source Information 14 | 15 | The Notebook, "Critical Word Count" (hist3368-week2-critical-word-count.ipynb), was written by Dr. Jo Guldi, Associate Professor of History at Southern Methodist University. It was inspired by lessons from [Programming Historian](https://programminghistorian.org/). 16 | -------------------------------------------------------------------------------- /hist3368-week2-critical-word-count/hist3368-week2-for-loops.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "cells": [ 3 |   { 4 |    "cell_type": "markdown", 5 |    "metadata": {}, 6 |    "source": [ 7 |     "For citation information, please see the \"Source Information\" section listed in the associated README file: https://github.com/stephbuon/digital-history/tree/master/hist3368-week2-critical-word-count" 8 |    ] 9 |   }, 10 |   { 11 |    "cell_type": "markdown", 12 |    "metadata": {}, 13 |    "source": [ 14 |     "# Week 2 Assignment: For Loops Tutorial" 15 |    ] 16 |   }, 17 |   { 18 |    "cell_type": "markdown", 19 |    "metadata": {}, 20 |    "source": [ 21 |     "In this week's assignment, you'll learn how to loop over lists of data. You'll also start the process of thinking critically about which words matter to you for the purposes of text mining, and how to use a thesaurus and the powers of reason to expand your expert vocabulary and divide it into categories of information. 
" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "we'll be looking at commands that tell Python to repeat:\n", 29 | "\n", 30 | " take an item in a list\n", 31 | " do something to it \n", 32 | " take the next item in the list\n", 33 | " do something to it\n", 34 | " repeat until all the items in the list have been touched." 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "This structure is called a \"loop\" because when Python reaches the end of the statements in the body, it \"loops\" back to the beginning of the body, and executes the same statements again (this time with the next item in the list).\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "The list comprehension syntax discussed earlier is very powerful: it allows you to succinctly transform one list into another list by a repeated modification. " 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Using the 'for'...'in' formula" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "The 'in' operator is part of the grammar of most for loops. 'In' usually tells Python to iterate over the variables in a list.\n", 63 | "\n", 64 | "The basic 'for loop' formula that we'll be using in this class using the formula:\n", 65 | "\n", 66 | "for [dummy variable] in [list]:\n", 67 | " [do something to the] [dummy variable]\n", 68 | " \n", 69 | "What you should notice:\n", 70 | " \n", 71 | " * notice that the line begins with 'for'\n", 72 | " * note the use of 'in'\n", 73 | " * notice that the 'for' line closes with a colon -- ':' -- which is right next to the name of the list.\n", 74 | " * note that the name of the dummy variable called by 'for' is repeated inside the loop.\n", 75 | " \n", 76 | "That may seem terribly abstract, so let's look at a hands-on example." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Let's start out with a list of words." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 42, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "['it', 'was', 'the', 'best', 'of', 'times', 'it', 'was', 'the', 'worst', 'of', 'times']\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "wordstring = ['it', 'was', 'the', 'best', 'of', 'times', 'it', 'was', 'the', 'worst', 'of', 'times']\n", 106 | "print(wordstring)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "We can use a for loop to format wordstring in new ways." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 43, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "it\n", 126 | "was\n", 127 | "the\n", 128 | "best\n", 129 | "of\n", 130 | "times\n", 131 | "it\n", 132 | "was\n", 133 | "the\n", 134 | "worst\n", 135 | "of\n", 136 | "times\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "for word in wordstring:\n", 142 | " print(word)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 44, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "it!!\n", 155 | "was!!\n", 156 | "the!!\n", 157 | "best!!\n", 158 | "of!!\n", 159 | "times!!\n", 160 | "it!!\n", 161 | "was!!\n", 162 | "the!!\n", 163 | "worst!!\n", 164 | "of!!\n", 165 | "times!!\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "for word in wordstring:\n", 171 | " print(word + '!!')" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "You might be wondering where we got the 'word' in the formula 'for [blank] in [list].' This is important: **word** in wordstring could be anything. 'word' is just a dummy variable. " 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 45, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "it\n", 191 | "was\n", 192 | "the\n", 193 | "best\n", 194 | "of\n", 195 | "times\n", 196 | "it\n", 197 | "was\n", 198 | "the\n", 199 | "worst\n", 200 | "of\n", 201 | "times\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "for rutabaga in wordstring:\n", 207 | " print(rutabaga)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "What's important is consistency. Whatever you name a dummy variable, you must continue to use that same variable name **inside** the for loop. Otherwise you'll be telling Python to do something very different." 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 46, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "times\n", 227 | "times\n", 228 | "times\n", 229 | "times\n", 230 | "times\n", 231 | "times\n", 232 | "times\n", 233 | "times\n", 234 | "times\n", 235 | "times\n", 236 | "times\n", 237 | "times\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "for tyrannosaurus in wordstring:\n", 243 | " print(rutabaga)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 47, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "it\n", 256 | "was\n", 257 | "the\n", 258 | "best\n", 259 | "of\n", 260 | "times\n", 261 | "it\n", 262 | "was\n", 263 | "the\n", 264 | "worst\n", 265 | "of\n", 266 | "times\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "for tyrannosaurus in wordstring:\n", 272 | " print(tyrannosaurus)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "*Can you see what is different between the two commands above?*" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "## What is 'for' doing?" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "\"For\" is Python's command to repeat. 
" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "Consider this: if you wanted to print out each word in 'wordstring,' you could just write out a series of commands like so:'" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 48, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stdout", 310 | "output_type": "stream", 311 | "text": [ 312 | "it!!!\n", 313 | "was!!!\n", 314 | "the!!!\n", 315 | "best!!!\n", 316 | "of!!!\n", 317 | "times!!!\n", 318 | "it!!!\n", 319 | "was!!!\n", 320 | "the!!!\n", 321 | "worst!!!\n", 322 | "of!!!\n", 323 | "times!!!\n" 324 | ] 325 | } 326 | ], 327 | "source": [ 328 | "print(wordstring[0]+'!!!')\n", 329 | "print(wordstring[1]+'!!!')\n", 330 | "print(wordstring[2]+'!!!')\n", 331 | "print(wordstring[3]+'!!!')\n", 332 | "print(wordstring[4]+'!!!')\n", 333 | "print(wordstring[5]+'!!!')\n", 334 | "print(wordstring[6]+'!!!')\n", 335 | "print(wordstring[7]+'!!!')\n", 336 | "print(wordstring[8]+'!!!')\n", 337 | "print(wordstring[9]+'!!!')\n", 338 | "print(wordstring[10]+'!!!')\n", 339 | "print(wordstring[11]+'!!!')" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "But that's a lot of typing. 'For' saves you from unnecessary, repetitive typing. It's one of the kinds of repeated tasks that computers are great at." 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 49, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "it -- is the best word! \n", 359 | "was -- is the best word! \n", 360 | "the -- is the best word! \n", 361 | "best -- is the best word! \n", 362 | "of -- is the best word! \n", 363 | "times -- is the best word! \n", 364 | "it -- is the best word! \n", 365 | "was -- is the best word! \n", 366 | "the -- is the best word! \n", 367 | "worst -- is the best word! \n", 368 | "of -- is the best word! \n", 369 | "times -- is the best word! \n" 370 | ] 371 | } 372 | ], 373 | "source": [ 374 | "for word in wordstring:\n", 375 | " print(word + ' -- is the best word! ')" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "Here's a mathematical example." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 50, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "numberlist = [1, 2, 3, 4]\n", 392 | " " 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 51, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "1000\n", 405 | "2000\n", 406 | "3000\n", 407 | "4000\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "for int in numberlist:\n", 413 | " print(int * 1000)\n" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "In the rest of this class, we won't be making up silly punctuation for lines of words. But we will want to count the number of words for every document in a given year. We will want to change the punctuation or spelling or plural form of many words so as to produce a uniform text that is ieasy to count. So we will have many occasions to use repeated commands." 
421 |    ] 422 |   }, 423 |   { 424 |    "cell_type": "markdown", 425 |    "metadata": {}, 426 |    "source": [ 427 |     "## Other formats for for loops" 428 |    ] 429 |   }, 430 |   { 431 |    "cell_type": "markdown", 432 |    "metadata": {}, 433 |    "source": [ 434 |     "There are also 'for' loops that don't use 'in.' \n", 435 |     "\n", 436 |     "Sometimes they use other commands. For instance, 'range()' is often used with for loops. \n", 437 |     "\n", 438 |     "'Range' calls up a list of integers leading up to, but not including, a number. Thus 'range(4)' produces \"0, 1, 2, 3\"." 439 |    ] 440 |   }, 441 |   { 442 |    "cell_type": "markdown", 443 |    "metadata": {}, 444 |    "source": [ 445 |     "In the example that follows, we make an empty list with the line\n", 446 |     "\n", 447 |     "    integers = []\n", 448 |     "    \n", 449 |     "We then invoke 'for' and 'range' to tell Python to repeat the next command in a 'loop'.\n", 450 |     "\n", 451 |     "    for i in range(10): -- tells Python to do the next line 10 times \n", 452 |     "    integers.append(i) --- this line tells Python to 'append' the contents of the changing variable 'i' to the list 'integers'\n", 453 |     "    \n", 454 |     "The result of this for loop is that Python takes the dummy variable 'i' and 'appends' it as a new member in the list 'integers.' \n", 455 |     "\n", 456 |     "Because of the structure of for...range, for each loop of 'for i', the variable i increases from 0 to 9." 457 |    ] 458 |   }, 459 |   { 460 |    "cell_type": "code", 461 |    "execution_count": 52, 462 |    "metadata": {}, 463 |    "outputs": [ 464 |     { 465 |      "name": "stdout", 466 |      "output_type": "stream", 467 |      "text": [ 468 |       "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n" 469 |      ] 470 |     } 471 |    ], 472 |    "source": [ 473 |     "integers = []\n", 474 |     "\n", 475 |     "for i in range(10):\n", 476 |     "    integers.append(i)\n", 477 |     "\n", 478 |     "print(integers)" 479 |    ] 480 |   }, 481 |   { 482 |    "cell_type": "markdown", 483 |    "metadata": {}, 484 |    "source": [ 485 |     "## Doing More than One Thing inside a For Loop" 486 |    ] 487 |   }, 488 |   { 489 |    "cell_type": "markdown", 490 |    "metadata": {}, 491 |    "source": [ 492 |     "\n", 493 |     " \n", 494 |     "Of course, the body of the loop can have more than one statement, and you can assign values to variables inside the loop:\n" 495 |    ] 496 |   }, 497 |   { 498 |    "cell_type": "code", 499 |    "execution_count": 54, 500 |    "metadata": {}, 501 |    "outputs": [ 502 |     { 503 |      "name": "stdout", 504 |      "output_type": "stream", 505 |      "text": [ 506 |       "IT\n", 507 |       "WAS\n", 508 |       "THE\n", 509 |       "BEST\n", 510 |       "OF\n", 511 |       "TIMES\n", 512 |       "IT\n", 513 |       "WAS\n", 514 |       "THE\n", 515 |       "WORST\n", 516 |       "OF\n", 517 |       "TIMES\n" 518 |      ] 519 |     } 520 |    ], 521 |    "source": [ 522 |     "for item in wordstring:\n", 523 |     "    yelling = item.upper()\n", 524 |     "    print(yelling)" 525 |    ] 526 |   }, 527 |   { 528 |    "cell_type": "markdown", 529 |    "metadata": {}, 530 |    "source": [ 531 |     "You can even put a loop inside a for loop. This is called a 'nested for loop'.\n", 532 |     "\n", 533 |     "The nested for loop below contains TWO for statements. \n", 534 |     "    * The first statement ('for item in wordstring') moves through each word in wordstring, as we saw above.\n", 535 |     "    * The second statement ('for letter in item') moves through each letter in each word.\n", 536 |     "\n", 537 |     "The result is to print out one letter per line." 
538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 55, 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "name": "stdout", 547 | "output_type": "stream", 548 | "text": [ 549 | "i\n", 550 | "t\n", 551 | "w\n", 552 | "a\n", 553 | "s\n", 554 | "t\n", 555 | "h\n", 556 | "e\n", 557 | "b\n", 558 | "e\n", 559 | "s\n", 560 | "t\n", 561 | "o\n", 562 | "f\n", 563 | "t\n", 564 | "i\n", 565 | "m\n", 566 | "e\n", 567 | "s\n", 568 | "i\n", 569 | "t\n", 570 | "w\n", 571 | "a\n", 572 | "s\n", 573 | "t\n", 574 | "h\n", 575 | "e\n", 576 | "w\n", 577 | "o\n", 578 | "r\n", 579 | "s\n", 580 | "t\n", 581 | "o\n", 582 | "f\n", 583 | "t\n", 584 | "i\n", 585 | "m\n", 586 | "e\n", 587 | "s\n" 588 | ] 589 | } 590 | ], 591 | "source": [ 592 | "for item in wordstring:\n", 593 | " for letter in item:\n", 594 | " print(letter)" 595 | ] 596 | }, 597 | { 598 | "cell_type": "markdown", 599 | "metadata": {}, 600 | "source": [ 601 | "Here's an example of a nested for loop that calls two lists in succession." 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 59, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "num_list = [1, 2, 3]\n", 611 | "alpha_list = ['a', 'b', 'c']" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 60, 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "name": "stdout", 621 | "output_type": "stream", 622 | "text": [ 623 | "1\n", 624 | " a\n", 625 | " b\n", 626 | " c\n", 627 | "2\n", 628 | " a\n", 629 | " b\n", 630 | " c\n", 631 | "3\n", 632 | " a\n", 633 | " b\n", 634 | " c\n" 635 | ] 636 | } 637 | ], 638 | "source": [ 639 | "for number in num_list:\n", 640 | " print(number)\n", 641 | " for letter in alpha_list:\n", 642 | " print(' ' + letter)" 643 | ] 644 | }, 645 | { 646 | "cell_type": "markdown", 647 | "metadata": {}, 648 | "source": [ 649 | "First the **first** for loop calls an item from num_list.\n", 650 | "\n", 651 | "Then the **second** for loop calls all the items from alpha_list, printing each slightly indented.\n", 652 | "\n", 653 | "The loop repeats until all the numbers in num_list are exhausted." 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [ 660 | "## Conditional Loops" 661 | ] 662 | }, 663 | { 664 | "cell_type": "markdown", 665 | "metadata": {}, 666 | "source": [ 667 | "You can also include other kinds of nested statements inside the for loop. \n", 668 | "\n", 669 | "\"Conditional\" statements, for instance \"if\", ask the computer to first consider whether a certain condition is true before proceeding. \n", 670 | "\n", 671 | "In the code below, the \"if\" statement asks if the length of item in characters is 2 -- in other words: \n", 672 | "\n", 673 | " if len(item) == 2:\n", 674 | " \n", 675 | "If that statement is true, then the computer will obey the next command, which tells Python to print words that meet the above condition in uppercase:\n", 676 | "\n", 677 | " print(item.upper())\n", 678 | " \n", 679 | "In other words, the command\n", 680 | "\n", 681 | " if len(item) == 2:\n", 682 | " print(item.upper())\n", 683 | "\n", 684 | "means that the computer will look for ONLY two-character words in wordstring, and those two-character words will be printed in uppercase. 
" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 61, 690 | "metadata": {}, 691 | "outputs": [ 692 | { 693 | "name": "stdout", 694 | "output_type": "stream", 695 | "text": [ 696 | "IT\n", 697 | "OF\n", 698 | "IT\n", 699 | "OF\n" 700 | ] 701 | } 702 | ], 703 | "source": [ 704 | "\n", 705 | "for item in wordstring:\n", 706 | " if len(item) == 2:\n", 707 | " print(item.upper())" 708 | ] 709 | }, 710 | { 711 | "cell_type": "markdown", 712 | "metadata": {}, 713 | "source": [ 714 | "Conditional statements can become very complicated. You might not see very many of these in our class, but it's useful to have seen the commands just in case.\n", 715 | "\n", 716 | "\"If\" statements are often followed by one or more \"elif\" statements that mean: \"if the conditions for the original 'if' are wrong, test the next condition\"\n", 717 | "\n", 718 | " elif len(item) == 3:\n", 719 | " print(\" \" + item)\n", 720 | " \n", 721 | "\"If\" and \"elif\" statements are often given with an alternative, which is formatted as \"else.\" An \"else\" statement tells the computer what to do \n", 722 | " else:\n", 723 | " print(item)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 66, 729 | "metadata": {}, 730 | "outputs": [ 731 | { 732 | "name": "stdout", 733 | "output_type": "stream", 734 | "text": [ 735 | "IT\n", 736 | " was\n", 737 | " the\n", 738 | "best\n", 739 | "OF\n", 740 | "times\n", 741 | "IT\n", 742 | " was\n", 743 | " the\n", 744 | "worst\n", 745 | "OF\n", 746 | "times\n" 747 | ] 748 | } 749 | ], 750 | "source": [ 751 | "\n", 752 | "for item in wordstring:\n", 753 | " if len(item) == 2:\n", 754 | " print(item.upper())\n", 755 | " elif len(item) == 3:\n", 756 | " print(\" \" + item)\n", 757 | " else:\n", 758 | " print(item)" 759 | ] 760 | }, 761 | { 762 | "cell_type": "markdown", 763 | "metadata": {}, 764 | "source": [ 765 | "# Digression: Doing things with Text" 766 | ] 767 | }, 768 | { 769 | "cell_type": "markdown", 770 | "metadata": {}, 771 | "source": [ 772 | "That's enough for loops for now. Let's quickly pick up a few more functions that are useful for working with text." 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "## Introducing the .split() function" 780 | ] 781 | }, 782 | { 783 | "cell_type": "markdown", 784 | "metadata": {}, 785 | "source": [ 786 | "One quick way to make a list out of a line of text is to use the \".split()\" function. Applied to a line of text, .split() will *split* the variable string of text into a list of words." 787 | ] 788 | }, 789 | { 790 | "cell_type": "markdown", 791 | "metadata": {}, 792 | "source": [ 793 | "Let's say that you want to print every string in a list. 
Here's a short text:" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": 1, 799 | "metadata": {}, 800 | "outputs": [], 801 | "source": [ 802 | "text = \"it was the best of times, it was the worst of times\"" 803 | ] 804 | }, 805 | { 806 | "cell_type": "markdown", 807 | "metadata": {}, 808 | "source": [ 809 | "We can make a list of all the words in the text by splitting on whitespace:" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 2, 815 | "metadata": {}, 816 | "outputs": [], 817 | "source": [ 818 | "words = text.split()" 819 | ] 820 | }, 821 | { 822 | "cell_type": "markdown", 823 | "metadata": {}, 824 | "source": [ 825 | "Of course, we can see what's in the list simply by evaluating the variable:" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": 3, 831 | "metadata": {}, 832 | "outputs": [ 833 | { 834 | "data": { 835 | "text/plain": [ 836 | "['it',\n", 837 | " 'was',\n", 838 | " 'the',\n", 839 | " 'best',\n", 840 | " 'of',\n", 841 | " 'times,',\n", 842 | " 'it',\n", 843 | " 'was',\n", 844 | " 'the',\n", 845 | " 'worst',\n", 846 | " 'of',\n", 847 | " 'times']" 848 | ] 849 | }, 850 | "execution_count": 3, 851 | "metadata": {}, 852 | "output_type": "execute_result" 853 | } 854 | ], 855 | "source": [ 856 | "words" 857 | ] 858 | }, 859 | { 860 | "cell_type": "markdown", 861 | "metadata": {}, 862 | "source": [ 863 | "## Join: Making strings from lists" 864 | ] 865 | }, 866 | { 867 | "cell_type": "markdown", 868 | "metadata": {}, 869 | "source": [ 870 | "Once we've created a list of words, it's a common task to want to take that list and \"glue\" it back together, so it's a single string again, instead of a list. So, for example:" 871 | ] 872 | }, 873 | { 874 | "cell_type": "code", 875 | "execution_count": 11, 876 | "metadata": {}, 877 | "outputs": [ 878 | { 879 | "data": { 880 | "text/plain": [ 881 | "'hydrogen, and helium, and lithium, and beryllium, and boron'" 882 | ] 883 | }, 884 | "execution_count": 11, 885 | "metadata": {}, 886 | "output_type": "execute_result" 887 | } 888 | ], 889 | "source": [ 890 | "element_list = [\"hydrogen\", \"helium\", \"lithium\", \"beryllium\", \"boron\"]\n", 891 | "glue = \", and \"\n", 892 | "glue.join(element_list)" 893 | ] 894 | }, 895 | { 896 | "cell_type": "markdown", 897 | "metadata": {}, 898 | "source": [ 899 | "The .join() method needs a \"glue\" string to the left of it---this is the string that will be placed in between the list elements. In the parentheses to the right, you need to put an expression that evaluates to a list. 
Very frequently with .join(), programmers don't bother to assign the \"glue\" string to a variable first, so you end up with code that looks like this:\n" 900 | ] 901 | }, 902 | { 903 | "cell_type": "code", 904 | "execution_count": 12, 905 | "metadata": {}, 906 | "outputs": [ 907 | { 908 | "data": { 909 | "text/plain": [ 910 | "'this is a test'" 911 | ] 912 | }, 913 | "execution_count": 12, 914 | "metadata": {}, 915 | "output_type": "execute_result" 916 | } 917 | ], 918 | "source": [ 919 | "words = [\"this\", \"is\", \"a\", \"test\"]\n", 920 | "\" \".join(words)" 921 | ] 922 | }, 923 | { 924 | "cell_type": "markdown", 925 | "metadata": {}, 926 | "source": [ 927 | "# Assignment" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": null, 933 | "metadata": {}, 934 | "outputs": [], 935 | "source": [ 936 | "* Write a list that contains your full name as a series of words in quotation marks separated by commas.\n", 937 | "\n", 938 | "* Write a for loop that prints out your full name one word at a time.\n", 939 | "\n", 940 | "* Write a nested for loop that prints out your full name one character at a time.\n", 941 | "\n", 942 | "* WRite a for loop using range() that counts to 20 by fives." 943 | ] 944 | } 945 | ], 946 | "metadata": { 947 | "kernelspec": { 948 | "display_name": "Python 3", 949 | "language": "python", 950 | "name": "python3" 951 | }, 952 | "language_info": { 953 | "codemirror_mode": { 954 | "name": "ipython", 955 | "version": 3 956 | }, 957 | "file_extension": ".py", 958 | "mimetype": "text/x-python", 959 | "name": "python", 960 | "nbconvert_exporter": "python", 961 | "pygments_lexer": "ipython3", 962 | "version": "3.6.7" 963 | } 964 | }, 965 | "nbformat": 4, 966 | "nbformat_minor": 4 967 | } 968 | -------------------------------------------------------------------------------- /hist3368-week2-critical-word-count/hist3368-week2-more-on-lists-and-dictionaries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Navigating Lists and Dictionaries\n", 8 | "\n", 9 | "*by Jo Guldi*\n", 10 | "\n", 11 | "This Notebook introduces the concept of 'navigating' different data types. It will help you figure out how to find your way around dictionaries and lists, the two data types that we'll encounter most frequently in this class. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "What you will learn:\n", 21 | " * how to use square brackets -- [ ] -- or .index() -- to select an item from a list\n", 22 | " * how to use the functions .append() and .count() to do things with lists\n", 23 | " " 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Navigating the 'List' Data Type" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "In previous sessions, we introduced *variables*. A variable is a nametag that can be placed on any object. \n", 38 | "\n", 39 | "But Python also has particular data types that involve not just *one* object but *several*. You can think of these data types as *baskets* rather than nametags. Each basket has a name. But the basket contains multiple items.\n", 40 | "\n", 41 | "The kind of basket we're going to start with is a list. Lists have certain attributes:\n", 42 | "\n", 43 | " * Lists are ordered. 
The first object in a list is always the first; the second is always the second, etc. (unless you change the order)\n", 44 | " * You can add to lists, typically using the function .append()\n", 45 | " \n", 46 | " " 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "Remember that we use *square brackets* to make a list." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 1, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "streaming = ['netflix', 'hulu', 'disney+', 'appletv+']" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "With python lists, square brackets mean 'which.' The number in the brackets tells the computer which count of an item to look for. Note that counting starts at 0." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 2, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "'disney+'" 81 | ] 82 | }, 83 | "execution_count": 2, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "streaming[2]" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "'appletv+'" 101 | ] 102 | }, 103 | "execution_count": 3, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "streaming[3]" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "'netflix'" 121 | ] 122 | }, 123 | "execution_count": 4, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "streaming[0]" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "The opposite of using brackets is .index()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 5, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "0" 148 | ] 149 | }, 150 | "execution_count": 5, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "streaming.index('netflix')" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 6, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "The index of disney+ is: 2\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "index = streaming.index('disney+')\n", 174 | "print('The index of disney+ is:', index)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "You can also use square brackets with a colon to call the first items in a list. This will be useful when we are looking at hundreds of pages of text. You won't want to flood your screen by looking at millions of characters at once; however, you might want to look at one paragraph of 100 words. 
" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 27, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "['netflix']" 193 | ] 194 | }, 195 | "execution_count": 27, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "streaming[:1]" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 26, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "['netflix', 'hulu']" 213 | ] 214 | }, 215 | "execution_count": 26, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "streaming[:2]" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Another useful navigation tool is to call the last few entries in a list by using a negative number in the square brackets." 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "A negative number in brackets calls the nth to the last item in the list " 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 28, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "'prime'" 247 | ] 248 | }, 249 | "execution_count": 28, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "streaming[-2]" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "A negative number followed by a colon calls the last n items in the list." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 34, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "['appletv+', 'prime', 'hulu']" 274 | ] 275 | }, 276 | "execution_count": 34, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "streaming[-3:]" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "### Using .append() and .count() with lists" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "Lists take a number of built-in functions, for instance 'append' and 'count.'" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 40, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "streaming.append('prime')" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 41, 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "['netflix', 'hulu', 'disney+', 'appletv+', 'prime', 'hulu', 'prime']" 317 | ] 318 | }, 319 | "execution_count": 41, 320 | "metadata": {}, 321 | "output_type": "execute_result" 322 | } 323 | ], 324 | "source": [ 325 | "streaming" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 42, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "streaming.append('hulu')" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 43, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/plain": [ 345 | "['netflix', 'hulu', 'disney+', 'appletv+', 'prime', 'hulu', 'prime', 'hulu']" 346 | ] 347 | }, 348 | "execution_count": 43, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "streaming" 355 | ] 356 | }, 357 | { 358 | "cell_type": 
"code", 359 | "execution_count": 44, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/plain": [ 365 | "3" 366 | ] 367 | }, 368 | "execution_count": 44, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "streaming.count('hulu')" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 4, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "data": { 384 | "text/plain": [ 385 | "3" 386 | ] 387 | }, 388 | "execution_count": 4, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "song = ['row', 'row', 'row', 'your', 'boat', 'gently', 'down', 'the', 'stream'] \n", 395 | "song.count('row')" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "But what if we want to count every word?" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "## The 'Dictionary' Data Type" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "Dictionaries have their own rules for how data is organized. They are also navigated with specific functions that pertain only to dictionaries. That's why it's important to know whether your data is a list or a dictionary at any given time! Otherwise, the commands you use to navigate are liable to act funny.\n", 417 | "\n", 418 | "What is a dictionary?\n", 419 | "\n", 420 | "* A 'dictionary' is a data type with 'keys' that correspond to 'values.' \n", 421 | "* A colon -- ':' -- is used to separate keys from values. \n", 422 | "* Key-value pairs are separated by commas. \n", 423 | "\n", 424 | "Why would you need a dictionary?\n", 425 | "\n", 426 | "* Technically, dictionaries don't have an order. They will automatically try to re-order their values -- alphabetically or by number. This makes them a bad structure for storing raw text, where the order of the words is important. \n", 427 | "* However, dictionaries are a fast way of storing values that need to be 'looked up' by the computer at some point. This makes them ideal for storing certain information -- for instance lists of special words, also known as a 'controlled vocabulary.'\n", 428 | "\n", 429 | "What's inside a dictionary?\n", 430 | "\n", 431 | "* Technically, each key-value pair is a tuple.\n", 432 | "* The individual tuples of the dictionary can have any kind of contents -- strings, integers, boolean statements, whatever. But there must be *exactly* two of them per entry.\n", 433 | "\n", 434 | "How do you call the information in a dictionary?\n", 435 | "\n", 436 | "* You can call just the keys, with .keys(), or just the values, with .values()\n", 437 | "* You can look up any given value if you know its key, or vice versa. " 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 98, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "black_tea = {\n", 447 | " 'supplier' : 'Twinings',\n", 448 | " 'name' : 'English Breakfast',\n", 449 | " 'boxes_in_stock' : 12,\n", 450 | " 'loose_leaf' : True\n", 451 | " }" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "### Calling the items a dictionary" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "You can do some of the things with dictionaries you can do with lists, but not others. 
\n", 466 | "\n", 467 | "For instance, you **can** use square brackets with dictionaries, just like lists." 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 99, 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "data": { 477 | "text/plain": [ 478 | "'Twinings'" 479 | ] 480 | }, 481 | "execution_count": 99, 482 | "metadata": {}, 483 | "output_type": "execute_result" 484 | } 485 | ], 486 | "source": [ 487 | "black_tea['supplier'] " 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": {}, 493 | "source": [ 494 | "The function .get() does exactly the same thing:" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 101, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "'Twinings'" 506 | ] 507 | }, 508 | "execution_count": 101, 509 | "metadata": {}, 510 | "output_type": "execute_result" 511 | } 512 | ], 513 | "source": [ 514 | "black_tea.get('supplier')" 515 | ] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "metadata": {}, 520 | "source": [ 521 | "Notice that the input is different. Instead of inputting the index number like you did with index -- streaming[0] -- you use square brackets to call dictionaries by their **key**" 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "metadata": {}, 527 | "source": [ 528 | "*Now let's talk about what you can't do with dictionaries.*" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "You **can't** navigate back to the keys based on a certain value. Why? Because Python dictionaries are meant to be used to look up values based on known keys, not the other way around. You can look up keys to your heart's content -- but you cannot go the other way around and use 'Twinings' to find the supplier." 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "You **can't** navigate the items in dictionaries with for loops by using the 'in' operator, as you do with lists:" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 102, 548 | "metadata": {}, 549 | "outputs": [ 550 | { 551 | "ename": "TypeError", 552 | "evalue": "'dict' object is not callable", 553 | "output_type": "error", 554 | "traceback": [ 555 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 556 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 557 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mpotato\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mblack_tea\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpotato\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 558 | "\u001b[0;31mTypeError\u001b[0m: 'dict' object is not callable" 559 | ] 560 | } 561 | ], 562 | "source": [ 563 | "for potato in black_tea():\n", 564 | " print(potato)" 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "Notice the 'TypeError' dictionary. 
This is why data types matter" 572 | ] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "metadata": {}, 577 | "source": [ 578 | "### .keys() and .values() with Dictionaries" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": {}, 584 | "source": [ 585 | "Instead of calling dictionaries as you do lists, you have to call either the keys or the values.\n", 586 | "\n", 587 | " .keys()\n", 588 | " .values()" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 56, 594 | "metadata": {}, 595 | "outputs": [ 596 | { 597 | "data": { 598 | "text/plain": [ 599 | "dict_keys(['supplier', 'name', 'boxes_in_stock', 'loose_leaf'])" 600 | ] 601 | }, 602 | "execution_count": 56, 603 | "metadata": {}, 604 | "output_type": "execute_result" 605 | } 606 | ], 607 | "source": [ 608 | "black_tea.keys()" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 57, 614 | "metadata": {}, 615 | "outputs": [ 616 | { 617 | "data": { 618 | "text/plain": [ 619 | "dict_values(['Twinings', 'English Breakfast', 12, True])" 620 | ] 621 | }, 622 | "execution_count": 57, 623 | "metadata": {}, 624 | "output_type": "execute_result" 625 | } 626 | ], 627 | "source": [ 628 | "black_tea.values()" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "#### You can export keys or values as lists" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [ 642 | "Using the function list(), you can also export either set -- that is, either **keys** or **values** -- to a list, which you can then call the normal way." 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 62, 648 | "metadata": {}, 649 | "outputs": [ 650 | { 651 | "data": { 652 | "text/plain": [ 653 | "['Twinings', 'English Breakfast', 12, True]" 654 | ] 655 | }, 656 | "execution_count": 62, 657 | "metadata": {}, 658 | "output_type": "execute_result" 659 | } 660 | ], 661 | "source": [ 662 | "list(black_tea.values())" 663 | ] 664 | }, 665 | { 666 | "cell_type": "markdown", 667 | "metadata": {}, 668 | "source": [ 669 | "Poof! It was the values to the dictionary, but now it's a list!" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 65, 675 | "metadata": {}, 676 | "outputs": [ 677 | { 678 | "data": { 679 | "text/plain": [ 680 | "['supplier', 'name', 'boxes_in_stock', 'loose_leaf']" 681 | ] 682 | }, 683 | "execution_count": 65, 684 | "metadata": {}, 685 | "output_type": "execute_result" 686 | } 687 | ], 688 | "source": [ 689 | "list(black_tea.keys())" 690 | ] 691 | }, 692 | { 693 | "cell_type": "markdown", 694 | "metadata": {}, 695 | "source": [ 696 | "You already know how to navigate lists." 
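    "\n",
    "One more trick worth knowing -- a minimal sketch using the black_tea dictionary defined above: the .items() function hands back each key-value pair in turn, which is convenient when you want both halves at once:\n",
    "\n",
    "    for key, value in black_tea.items():\n",
    "        print(key, ':', value)\n",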
697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": 59, 702 | "metadata": {}, 703 | "outputs": [ 704 | { 705 | "data": { 706 | "text/plain": [ 707 | "12" 708 | ] 709 | }, 710 | "execution_count": 59, 711 | "metadata": {}, 712 | "output_type": "execute_result" 713 | } 714 | ], 715 | "source": [ 716 | "list(black_tea.values())[2]" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": 67, 722 | "metadata": {}, 723 | "outputs": [ 724 | { 725 | "data": { 726 | "text/plain": [ 727 | "'boxes_in_stock'" 728 | ] 729 | }, 730 | "execution_count": 67, 731 | "metadata": {}, 732 | "output_type": "execute_result" 733 | } 734 | ], 735 | "source": [ 736 | "list(black_tea.keys())[2]" 737 | ] 738 | }, 739 | { 740 | "cell_type": "markdown", 741 | "metadata": {}, 742 | "source": [ 743 | "# Assignment" 744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "metadata": {}, 749 | "source": [ 750 | "* Create a variable called by your middle name. Write out the lyrics to a poem or song of your choice of at least five lines as a list. \n", 751 | "* Write out the code to navigate this list:\n", 752 | " * What is the **first** item in the list?\n", 753 | " * What is the **last** item in the list?\n", 754 | " \n", 755 | "* Use the \".append()\" function to add the word 'rutabaga' to the end of your list.\n", 756 | "* Use the function \".count()\" to count how many times the word \"the\" appears in your song.\n", 757 | " \n" 758 | ] 759 | }, 760 | { 761 | "cell_type": "markdown", 762 | "metadata": {}, 763 | "source": [ 764 | "Paste a screenshot of your code and the answers into the box in Canvas." 765 | ] 766 | } 767 | ], 768 | "metadata": { 769 | "kernelspec": { 770 | "display_name": "Python 3", 771 | "language": "python", 772 | "name": "python3" 773 | }, 774 | "language_info": { 775 | "codemirror_mode": { 776 | "name": "ipython", 777 | "version": 3 778 | }, 779 | "file_extension": ".py", 780 | "mimetype": "text/x-python", 781 | "name": "python", 782 | "nbconvert_exporter": "python", 783 | "pygments_lexer": "ipython3", 784 | "version": "3.6.7" 785 | } 786 | }, 787 | "nbformat": 4, 788 | "nbformat_minor": 4 789 | } 790 | -------------------------------------------------------------------------------- /hist3368-week3-ngrams-lemmatization-gender/README.md: -------------------------------------------------------------------------------- 1 | ## Configuring your session on M2 2 | 3 | 10 | 11 | __Memory__: `6` G 12 | 13 | ## Source Information 14 | 15 | The Notebooks, "Lemmatization" and "Searching Bigrams" (hist3368-week3-controlled-vocab.ipynb), were written by Dr. Jo Guldi, Associate Professor of History at Southern Methodist University. Content and ideas were borrowed from [Dr. Eric Godat](https://github.com/egodat), member of OIT at Southern Methodist University, and [Dr. Rob Kalescky](https://github.com/rkalescky), application scientist at Southern Methodist University. The Notebooks that inspired these lessons can be found at Southern Methodist University's GitHub at: [SouthernMethodistUniversity/text_mining_data_sets/DataAccess.ipynb](https://github.com/SouthernMethodistUniversity/text_mining_data_sets/blob/master/DataAccess.ipynb). 
16 | -------------------------------------------------------------------------------- /hist3368-week3-ngrams-lemmatization-gender/hist3368-cleaning-lemmatizing-visualization-congress-pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hist 3368 \n", 8 | "## Cleaning, Lemmatizing, and Visualization with Congress in Pandas" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## Load Some Data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "/scratch/group/history/hist_3368-jguldi\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "cd /scratch/group/history/hist_3368-jguldi" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "***Give this several minutes; we're reading in big data:***" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "congress = pd.read_csv(\"congress1967-2010.csv\")\n", 58 | "#congress = pd.read_csv(\"eighties_data.csv\")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Let's do a couple of basic cleaning steps. Let's look at the actual text output of the Content column to get an idea of what we're dealing with. " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Those who do not enjoy the privilege of the floor will please retire from the Chamber.\n", 78 | "cleared of all attaches. unless they have absolutely important business to attend to in the Chamber.\n", 79 | "lly needed for the next few minutes of the deliberations of the Senate will tetire from the Chamber.\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "for contenttext in congress['speech'].head(3): # for the first three entries in the 'Content' column\n", 85 | " print(contenttext[-100:]) # print the last 100 characters" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "You'll notice that there are uppercase words, punctuation marks, and stopwords that will interfere with our analysis unless we do away with them." 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "**Let's package all of these commands into a function, defined with \"def,\" and use .apply() to apply the function to each item in the column 'speech.'''**" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "We know that we can split the text of the 'Content' column into words, lowercase them, stopword them, and lemmatize them using some familiar commands. 
\n", 107 | "\n", 108 | " .lower()\n", 109 | " .split()\n", 110 | " wn.morphy()\n", 111 | " if word in stopwords" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "We could also add some steps to screen out digits and initials:\n", 119 | "\n", 120 | " if not word.isdigit()\n", 121 | " if len(word) > 1\n", 122 | " \n", 123 | "Note the use of \"len()\", which asks the \"length\" of a string in characters. If the length of a word -- len(word) -- is greater than 1, we keep it:" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 5, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# load stopwords and software\n", 133 | "from nltk.corpus import stopwords # this calls all multilingual stopword lists from NLTK\n", 134 | "from nltk.corpus import wordnet as wn\n", 135 | "stop = stopwords.words('english') # this command calls only the English stopwords, labeling them \"stop\"\n", 136 | "stop_set = set(stop) # use the Python native command \"set\" to streamline how the stopwords are stored, improve performance" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 6, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# create a function that does all the cleanup\n", 146 | "\n", 147 | "def cleaning_step(row):\n", 148 | " \n", 149 | " clean_row = row.replace('[^\\w\\s]','') # remove punctuation\n", 150 | " clean_row = clean_row.split() # split into words\n", 151 | " clean_row = [wn.morphy(word.lower()) for word in clean_row # lemmatize\n", 152 | " if word not in stop_set\n", 153 | " if not word.isdigit() # if it isn't a number)\n", 154 | " if len(word) > 1] # if it's longer than one character\n", 155 | " clean_row = filter(None, clean_row) # remove any 'None's that result from cases such as wn.morphy(\"the\")\n", 156 | " clean_row = ' '.join(clean_row) # glue the words back together into one string per row\n", 157 | " \n", 158 | " return(clean_row)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "***This may take some time. Lemmatizing is computation intensive. Allot 30 minutes.***" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "congress['speech'] = congress['speech'].apply(cleaning_step) \n", 175 | "congress[:5]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Inspect the data to see what we've done. " 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "for contenttext in congress['speech'].head(3): # for the first three entries in the 'Content' column\n", 192 | " print(contenttext[-1000:]) # print the last 100 characters" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "#### Save the data for later." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "cd ~/digital-history" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "congress.to_csv(\"lemmatized-congress1968.csv\")" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# for use if you need to re-load\n", 227 | "# congress = pd.read_csv(\"lemmatized-congress1967-2010.csv\")" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "# Overall Visualisation" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "Let's paste together all the words in the 'speech' column to get a list that we'll call 'allwords.'" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "allwords = \" \".join(congress['speech'])" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "Let's get a rough sense of what's in the 'Content' Column by creating a wordcloud." 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "The wordcloud package has its own built-in function to split a block of text. It just needs one big block of text assembled from all the rows in the 'Content' column. We'll use the join() command to paste together all the entries in df['Content'], calling the result 'allwords.' Then we'l use the WordCloud().generate() command to make a wordcloud from the variable 'allwords'." 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "#import software \n", 274 | "!pip install wordcloud --user\n", 275 | "from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator\n", 276 | "import matplotlib.pyplot as plt\n", 277 | "%matplotlib inline\n", 278 | "stop_words = set(STOPWORDS)\n", 279 | "\n", 280 | "# make a wordcloud\n", 281 | "wordcloud = WordCloud(stopwords=stop, background_color=\"white\").generate(allwords)\n", 282 | "plt.figure(figsize=(12, 12))\n", 283 | "plt.imshow(wordcloud, interpolation='bilinear')\n", 284 | "plt.axis(\"off\")\n", 285 | "plt.show()" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "Next, let's visualize the most frequent words, breaking the variable 'allwords' down into individual words using split(). 
" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "wordlist = allwords.split()\n", 302 | "wordlist[:10] # look at the first ten elements of the list only" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Next, count the individual words using the pandas commands \"Series()\" and \"value_counts()\"" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "wordcounts = pd.Series(wordlist).value_counts()[:20]\n", 319 | "wordcounts[:10]" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "Now, plot those values as a well-labeled barchart. Notice that the axes are well-labeled and that the chart has a title that describes the data." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "wordcounts.plot(kind='bar', \n", 336 | " title='Most frequent words in the CONTENT column of EDGAR for 8 key companies',\n", 337 | " figsize=(6, 6)\n", 338 | " )" 339 | ] 340 | } 341 | ], 342 | "metadata": { 343 | "kernelspec": { 344 | "display_name": "Python 3", 345 | "language": "python", 346 | "name": "python3" 347 | }, 348 | "language_info": { 349 | "codemirror_mode": { 350 | "name": "ipython", 351 | "version": 3 352 | }, 353 | "file_extension": ".py", 354 | "mimetype": "text/x-python", 355 | "name": "python", 356 | "nbconvert_exporter": "python", 357 | "pygments_lexer": "ipython3", 358 | "version": "3.6.7" 359 | } 360 | }, 361 | "nbformat": 4, 362 | "nbformat_minor": 4 363 | } 364 | -------------------------------------------------------------------------------- /hist3368-week4-wordnet-controlled-vocab/README.md: -------------------------------------------------------------------------------- 1 | ## Configuring your session on M2 2 | 3 | 10 | 11 | __Memory__: `6` G 12 | 13 | ## Source Information 14 | 15 | The Notebooks, "Controlled Vocabulary" and "Wordnet" (hist3368-week3-controlled-vocab.ipynb), were written by Dr. Jo Guldi, Associate Professor of History at Southern Methodist University. Content and ideas were borrowed from [Dr. Eric Godat](https://github.com/egodat), member of OIT at Southern Methodist University, and [Dr. Rob Kalescky](https://github.com/rkalescky), application scientist at Southern Methodist University. The Notebooks that inspired these lessons can be found at Southern Methodist University's GitHub at: [SouthernMethodistUniversity/text_mining_data_sets/DataAccess.ipynb](https://github.com/SouthernMethodistUniversity/text_mining_data_sets/blob/master/DataAccess.ipynb). 16 | -------------------------------------------------------------------------------- /hist3368-week5-plotting-change-over-time/README.md: -------------------------------------------------------------------------------- 1 | ## Configuring your session on M2 2 | 3 | 10 | 11 | __Memory__: 12 | Below are the memory requirements to read in each data set. You may want to increase your memory (maybe from `6G` to `10G`, for example) if you expect to create many variables while processing your data. 13 | - Project Gutenberg: start with `6G`. Increase memory if you make a large corpus. 14 | - EDGAR: start with `6G`. Increase memory if you add many more companies. 
15 | - Hansard: `64G` 16 | - US Congress: `6G` 17 | - Dallas City Council: `6G` 18 | - Houston City Council: `6G` 19 | - Reddit: 20 | - the Reddit data is broken into 92 (yes, 92!) files totalling ~`250G` of Reddit data!!! You can access individual files using ~`6-15G`, but please see Steph if you need help. 21 | 22 | ## Source Information 23 | 24 | The Notebook, "Instructions for Accessing Data" (hist3368-week5-plotting-change-over-time.ipynb), was written by Steph Buongiorno, project manager to Dr. Jo Guldi and PhD student in Applied Science in Engineering at Southern Methodist University. Additional code for reading in data was provided by Alexander Cerpa, computer science undergraduate at Southern Methodist Univeristy. A version one of this notebook was written by [Dr. Eric Godat](https://github.com/egodat), member of OIT at Southern Methodist University. Version one can be found on Southern Methodist University's GitHub at: [SouthernMethodistUniversity/text_mining_data_sets/DataAccess.ipynb](https://github.com/SouthernMethodistUniversity/text_mining_data_sets/blob/master/DataAccess.ipynb). 25 | -------------------------------------------------------------------------------- /hist3368-week6-measuring-change-and-using-groupby/README.md: -------------------------------------------------------------------------------- 1 | ## Configuring your session on M2 2 | 3 | 6 | 7 | 8 | 9 | 10 | 11 | __Memory__: `32` G 12 | 13 | ### Source Information 14 | 15 | The Notebook, "Measuring Change Over Time" (hist3368-week6-measuring-change-and-using-groupby.ipynb), was written by Dr. Jo Guldi, Associate Professor of History at Southern Methodist University, and Steph Buongiorno, project manager to Dr. Jo Guldi and PhD student in Applied Science in Engineering at Southern Methodist University. 16 | -------------------------------------------------------------------------------- /hist3368-week9-named-entities/hist3368-tutorial-on-speed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hist 3368\n", 8 | "## Tutorial on Speed" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "Let's say you want to use Spacy, a resource-intensive software package, to extract named entities from Congress." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "#### Setup Spacy" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd, spacy\n", 32 | "from datetime import datetime" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "nlp = spacy.load('en_core_web_sm')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Getting an error? \n", 49 | "\n", 50 | "* Please note that to use spaCy on M2 you must go to My Interactive Sessions/JupyterLab and add **source /hpc/applications/python_environments/spacy/bin/activate** to the **“Custom environment settings”** field.\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "#### Load some data" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "We're going to load the speeches of Congress." 
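One easy way to keep a speed tutorial snappy is to read only part of the file while experimenting. Here is a minimal sketch using pandas' nrows argument (the 10,000-row cutoff is an arbitrary choice for prototyping; drop it to load the full file as in the next cells):

```python
import pandas as pd

# Read only the first 10,000 rows of the big file while prototyping
congress_sample = pd.read_csv("congress1967-2010.csv", nrows=10000)
congress_sample.head()
```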
65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "/scratch/group/history/hist_3368-jguldi\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "cd /scratch/group/history/hist_3368-jguldi" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "congress = pd.read_csv(\"congress1967-2010.csv\")\n", 91 | "#congress = pd.read_csv(\"eighties_data.csv\")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 9, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "/users/jguldi/digital-history\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "cd ~/digital-history" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "#### Notice that the Code is slow" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "Let's apply our event recognizer to just a sample." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 2, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "ename": "NameError", 132 | "evalue": "name 'congress_1968' is not defined", 133 | "output_type": "error", 134 | "traceback": [ 135 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 136 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 137 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msample\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mner_finder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspeech\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'LAW'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mspeech\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcongress_1968\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'speech'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 138 | "\u001b[0;31mNameError\u001b[0m: name 'congress_1968' is not defined" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "sample = [ner_finder(speech, 'LAW') for speech in congress_1968['speech'][:20]]\n", 144 | "sample" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Notice that the code hangs for a minute. Spacy uses a lot of 'memory' -- or computing power. Coders have tricks to speed things up. Let's talk about that." 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "#### Tracking Speed with time.time()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "\n", 166 | "Many coders like to keep track of how fast different approaches are so that they can choose the speediest approach when they move from small data to big data. Let's do that. We'll import the *time* module and call\n", 167 | "\n", 168 | " time.time() \n", 169 | " \n", 170 | "to get the time in milliseconds. Then we run the same line of code, and call time.time() again afterwards, and subtract start time from finish." 
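In isolation, the timing pattern looks like this (a self-contained sketch with a throwaway computation standing in for the slow ner_finder() call, which is not defined in this notebook):

```python
import time

start = time.time()                              # seconds since the epoch, as a float

total = sum(i * i for i in range(10000000))      # stand-in for the slow operation

finish = time.time()
print(finish - start)                            # elapsed time in seconds
```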
171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "We can use datetime.time() to take the time before and after the operation to see how quick or slow each operation is.\n", 178 | "\n", 179 | "Here's the same code you just ran again, with timing instructions around it." 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "import time\n", 189 | "start = time.time()\n", 190 | "\n", 191 | "sample = [ner_finder(speech, 'LAW') for speech in congress_1968['speech'][:-20]]\n", 192 | "\n", 193 | "finish = time.time()\n", 194 | "\n", 195 | "print(sample)\n", 196 | "print()\n", 197 | "\n", 198 | "finish-start" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "Next, let's try a speedier approach. Let's use our parallelized ner_finder to search for mentions of laws in just one year. \n", 206 | "\n", 207 | "Again, we'll run the sample code on a tiny sample. Again, we'll keep track of how long it takes. " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "\n", 215 | "#### Speeding things up with .apply()\n", 216 | "\n", 217 | "To speed things up, we can try calling upon \"parallel\" processing, which causes every node within a computer system to run the same command simultaneously. \n", 218 | "\n", 219 | "We'll use a 'lambda' function, which allows us to take the function following \"lambda x\" and efficiently \"apply\" it to every row in the dataframe. Lambda functions run in parallel.\n", 220 | "\n", 221 | "Note these two elements of the grammar.\n", 222 | "\n", 223 | " .apply()\n", 224 | " lambda x: [function to be applied]\n" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "Here's a tutorial about using .apply()." 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "from IPython.display import HTML\n", 241 | "\n", 242 | "HTML('')\n", 243 | "\n", 244 | "\n" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "And here's some code using .apply() with ner_finder() to search for all the laws mentioned in the Dallas Minutes.\n", 252 | "\n", 253 | "Note that we are also using *time.time()* to take the time in milliseconds before and after running the function, so that we can compare how fast the .apply() method is to similar code using list comprehension above.\n", 254 | "\n", 255 | "**This may still take a minute.** But apply is potentially much, much faster than if you had run the same command wihtout parallel processing. \n", 256 | "\n", 257 | "*Note: You will see a pink warning label. It isn't an error, and the data is still running.*" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "start = time.time()\n", 267 | "\n", 268 | "sample2 = dallas_minutes['Text'][:5].apply(lambda x: ner_finder(x, 'LAW'))\n", 269 | "\n", 270 | "finish = time.time()\n", 271 | "\n", 272 | "print(sample2)\n", 273 | "print()\n", 274 | "\n", 275 | "finish-start" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "The winner is... the .apply() method in parallel -- faster by a hair! 
(*NOTE: Your mileage may vary*)\n", 283 | "\n", 284 | "Let's run it on a slightly larger sample of text -- the whole year 2019. \n", 285 | "\n", 286 | "***We chose the faster method on purpose, but NER is a slow process. This process clocks at 30 m on my session. Get a cup of tea.***\n", 287 | "\n", 288 | "*You can also limit the amount of text you're working with by using square brackets, e.g. dallas_minutes_year1['Text'][:100]*" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "start = time.time()\n", 298 | "\n", 299 | "\n", 300 | "dallas_minutes_year1['Laws'] = dallas_minutes_year1['Text'].apply(lambda x: ner_finder(x, 'LAW'))\n", 301 | "\n", 302 | "finish = time.time()\n", 303 | "print(finish-start)\n", 304 | "\n", 305 | "\n", 306 | "dallas_minutes_year1[:5]" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "# Here's the code for applying nlp to the entire archive of Dallas City Council minutes, not just one year. \n", 316 | "#dallas_minutes['Laws'] = dallas_minutes['Text'].apply(lambda x: ner_finder(x, 'LAW'))\n", 317 | "#dallas_minutes " 318 | ] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.7.4" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 4 342 | } 343 | -------------------------------------------------------------------------------- /hist3368-week9-named-entities/read.me: -------------------------------------------------------------------------------- 1 | 2 | read.me 3 | -------------------------------------------------------------------------------- /images/README.md: -------------------------------------------------------------------------------- 1 | This folder contains the images used by README files. 
2 | -------------------------------------------------------------------------------- /images/connect_jupyter_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/connect_jupyter_1.png -------------------------------------------------------------------------------- /images/data_team_fields.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/data_team_fields.png -------------------------------------------------------------------------------- /images/double_click.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/double_click.png -------------------------------------------------------------------------------- /images/fields.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/fields.png -------------------------------------------------------------------------------- /images/file_open.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/file_open.png -------------------------------------------------------------------------------- /images/memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/memory.png -------------------------------------------------------------------------------- /images/open_filep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/open_filep.png -------------------------------------------------------------------------------- /images/pipe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/pipe.jpg -------------------------------------------------------------------------------- /images/pipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/pipe.png -------------------------------------------------------------------------------- /images/resources_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/resources_1.png -------------------------------------------------------------------------------- /images/select_jupyter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/select_jupyter.png -------------------------------------------------------------------------------- /images/source-spacy-m2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/source-spacy-m2.png -------------------------------------------------------------------------------- /images/thesearenotthesame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephbuon/digital-history/9d34699215f875946f38690ddadb989504fac2f9/images/thesearenotthesame.png -------------------------------------------------------------------------------- /optional-notebooks/embeddings-read.me: -------------------------------------------------------------------------------- 1 | ## Configuring your session on M2 2 | 3 | 10 | 11 | __Memory__: `64` 12 | 13 | ## Source Information 14 | The Notebook, "Word Embedding Models: word2vec" (hist3368-week11-word-embeddings.ipynb), was written by Jo Guldi. 15 | -------------------------------------------------------------------------------- /optional-notebooks/hist3368-week8-metadata/README.md: -------------------------------------------------------------------------------- 1 | ## Configuring your session on M2 2 | 3 | Your session’s settings should look exactly like the following image: 4 | 5 | ![placeholdertext](https://github.com/stephbuon/digital-history/blob/master/images/source-spacy-m2.png?raw=true) 6 | 7 | Note that the first two boxes are empty. 8 | 9 | __Custom environment settings__: `source /hpc/applications/python_environments/spacy/bin/activate` 10 | 11 | __Memory__: `20` G 12 | 13 | ### List of Useful Part-of-Speech spaCy Symbols: 14 | 15 | - `'ADJ'` for adjective 16 | - `'ADV'` for adverb 17 | - `'NOUN'` for noun 18 | - `'PRON'` for pronoun 19 | - `'VERB'` for verb 20 | 21 | ### List of Useful Syntactic Dependency spaCy Symbols: 22 | 23 | - `'acomp'` for adjectival complement 24 | - `'dobj'` for direct object 25 | - `'iobj'` for indirect object 26 | - `'pobj'` for object of a preposition 27 | - `'nsubj'` for nominal subject 28 | - `'nsubjpass'` for passive nominal subject 29 | - `'root'` for root (or main) verb 30 | - `'prep'` for preposition 31 | - `'relcl'` for relative clause 32 | 33 | ### Source Information 34 | 35 | The Notebook, "Natural Language Processing with spaCy" (hist3368-week8-metadata.ipynb), was written by Steph Buongiorno, project manager to Dr. Jo Guldi and PhD student in Applied Science in Engineering at Southern Methodist University. 36 | 37 | 38 | -------------------------------------------------------------------------------- /optional-notebooks/hist3368-week9-advanced-parts-of-speech/README.md: -------------------------------------------------------------------------------- 1 | ## Configuring your session on M2 2 | 3 | Your session’s settings should look exactly like the following image: 4 | 5 | ![placeholdertext](https://github.com/stephbuon/digital-history/blob/master/images/source-spacy-m2.png?raw=true) 6 | 7 | Note that the first two boxes are empty. 
8 | 9 | __Custom environment settings__: `source /hpc/applications/python_environments/spacy/bin/activate` 10 | 11 | __Memory__: `20` G 12 | 13 | ### List of Useful Part-of-Speech spaCy Symbols: 14 | 15 | - `'ADJ'` for adjective 16 | - `'ADV'` for adverb 17 | - `'NOUN'` for noun 18 | - `'PRON'` for pronoun 19 | - `'VERB'` for verb 20 | 21 | ### List of Useful Syntactic Dependency spaCy Symbols: 22 | 23 | - `'acomp'` for adjectival complement 24 | - `'dobj'` for direct object 25 | - `'iobj'` for indirect object 26 | - `'pobj'` for object of a preposition 27 | - `'nsubj'` for nominal subject 28 | - `'nsubjpass'` for passive nominal subject 29 | - `'root'` for root (or main) verb 30 | - `'prep'` for preposition 31 | - `'relcl'` for relative clause 32 | 33 | ## Source Information 34 | 35 | The Notebook, "Extracting Gendered Grammatical Constructions Using spaCy" (hist3368-week9-advance-parts-of-speech.ipynb), was written by Steph Buongiorno, project manager to Dr. Jo Guldi and PhD student in Applied Science in Engineering at Southern Methodist University. 36 | -------------------------------------------------------------------------------- /optional-notebooks/read.me: -------------------------------------------------------------------------------- 1 | These notebooks are "advanced" -- they are not taught in the current iteration of Hist 3368, but they may be useful for students interested in pursuing more analysis in their final projects. 2 | -------------------------------------------------------------------------------- /optional-notebooks/store-congress-embeddings-from-gensim-in-parallel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Parallelization with Word Context Vectors" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "#### By Jo Guldi - 11/2021" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "Workhorse script to download the Congressional testimony, produce a 5-yr word2vec model for showing change over time, run in parallel" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Setup" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import multiprocessing\n", 45 | "from multiprocessing import Pool" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "startdate = 1870\n", 55 | "enddate = 2010\n", 56 | "n = multiprocessing.cpu_count()\n", 57 | "print(multiprocessing.cpu_count())" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "def parallelize_operation(df, func, n_cores = n):\n", 67 | " df_split = np.array_split(df, n_cores)\n", 68 | " pool = Pool(n)\n", 69 | " df = pd.concat(pool.map(func, df_split))\n", 70 | " pool.close()\n", 71 | " pool.join()\n", 72 | " return df" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Loading data" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 86, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "ename": 
"ImportError", 89 | "evalue": "cannot import name 'LEMMA_INDEX'", 90 | "output_type": "error", 91 | "traceback": [ 92 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 93 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 94 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpprint\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemmatizer\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLemmatizer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlang\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0men\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLEMMA_INDEX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mLEMMA_EXC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mLEMMA_RULES\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 95 | "\u001b[0;31mImportError\u001b[0m: cannot import name 'LEMMA_INDEX'" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "import pandas as pd\n", 101 | "import nltk, numpy, re, matplotlib# , num2words\n", 102 | "from nltk.corpus import wordnet as wn\n", 103 | "import gensim \n", 104 | "import csv\n", 105 | "import glob\n", 106 | "import numpy as np\n", 107 | "import multiprocessing\n", 108 | "from sklearn.feature_extraction.text import CountVectorizer\n", 109 | "import scipy.spatial.distance\n", 110 | "import matplotlib\n", 111 | "import matplotlib.pyplot as plt\n", 112 | "import itertools\n", 113 | "from nltk.tokenize import sent_tokenize\n", 114 | "from nltk.tokenize.treebank import TreebankWordTokenizer\n", 115 | "#!pip install wordsegment --user\n", 116 | "from wordsegment import load, segment, clean\n", 117 | "import string\n", 118 | "load()\n", 119 | "import re\n", 120 | "import gensim, pprint\n", 121 | "from spacy.lemmatizer import Lemmatizer\n", 122 | "from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "The following lines load some data from Congress. 
" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "id": "6qCg0mXrtOD1", 137 | "outputId": "4ecca950-9419-4b8d-96fc-aa5fbb1426f5" 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "all_speech_files = glob.glob('/scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_*.txt')\n", 142 | "CONGRESS_MIN_THRESHOLD = 1\n", 143 | "CONGRESS_MAX_THRESHOLD = 115\n", 144 | "\n", 145 | "speech_files = []\n", 146 | "\n", 147 | "for fn in all_speech_files:\n", 148 | " number = int(fn.rsplit('_', 1)[-1].split('.')[0])\n", 149 | " if CONGRESS_MIN_THRESHOLD <= number <= CONGRESS_MAX_THRESHOLD:\n", 150 | " speech_files.append(fn)\n", 151 | "\n", 152 | "speech_files.sort()\n", 153 | " \n", 154 | "def parse_one(fn):\n", 155 | " print(f'Reading {fn}...')\n", 156 | " return pd.read_csv(fn, sep='|', encoding=\"ISO-8859-1\", error_bad_lines=False, warn_bad_lines=False, quoting=csv.QUOTE_NONE)\n", 157 | "\n", 158 | "speeches_df = pd.concat((parse_one(fn) for fn in speech_files))\n", 159 | "speeches_df.dropna(how='any', inplace=True)\n", 160 | "\n", 161 | "all_description_files = glob.glob('/scratch/group/oit_research_data/stanford_congress/hein-bound/descr_*.txt')\n", 162 | " \n", 163 | "description_files = []\n", 164 | "\n", 165 | "for fn in all_description_files:\n", 166 | " number = int(fn.rsplit('_', 1)[-1].split('.')[0])\n", 167 | " if CONGRESS_MIN_THRESHOLD <= number <= CONGRESS_MAX_THRESHOLD:\n", 168 | " description_files.append(fn)\n", 169 | " description_files.sort()\n", 170 | " \n", 171 | "description_df = pd.concat((parse_one(fn) for fn in description_files))\n", 172 | "\n", 173 | "all_data = pd.merge(speeches_df, description_df, on = 'speech_id')\n", 174 | "all_data.fillna(0, inplace=True)\n", 175 | "all_data = all_data.drop(['chamber', 'speech_id', 'number_within_file', 'speaker', 'first_name'], 1)\n", 176 | "all_data = all_data.drop(['last_name', 'state', 'gender', 'line_start', 'line_end', 'file', 'char_count', 'word_count'], 1)\n", 177 | "all_data['date']=pd.to_datetime(all_data['date'],format='%Y%m%d')\n", 178 | "all_data['year'] = pd.to_datetime(all_data['date']).dt.year\n", 179 | "all_data['5yrperiod'] = np.floor(all_data['year'] / 5) * 5 # round each year to the nearest 5 -- by dividing by 5 and \"flooring\" to the lowest integer\n", 180 | "all_data = all_data.drop(['date', 'year'], 1)\n", 181 | "all_data['index'] = np.arange(len(all_data)) # create an 'index' column\n", 182 | "all_data.head()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "sample_l = all_data.sample(50000)\n", 192 | "sample_m = sample_l.sample(5000)\n", 193 | "sample = sample_m.sample(500)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "all_data" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "sample" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "## Create function for cleaning & structuring the data in parallel" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "In this section and the next, we will create a function, then launch that function with parallelize_operation." 
226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 91, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "def split_strings_into_sentences(data): # WORKING 4-16\n", 235 | " \n", 236 | " df = data\n", 237 | " \n", 238 | " # Getting s as pandas series which has split on full stop and new sentence a new line\n", 239 | " s = df['speech'].str.split('.').apply(pd.Series,1).stack()\n", 240 | " s.index = s.index.droplevel(-1) # to line up with df's index\n", 241 | " s.name = 'sentence' # needs a name to join\n", 242 | "\n", 243 | " del df['speech']\n", 244 | " df = df.join(s)\n", 245 | " del df['index']\n", 246 | "\n", 247 | " \n", 248 | " return df\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 92, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "def split_sentences_into_words(data): # works 11-12-21\n", 258 | " \n", 259 | " new_column = [row.split() for row in data['sentence']]\n", 260 | " data['sentence'] = new_column\n", 261 | " \n", 262 | " return(data)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 93, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "# WORKING 4-17\n", 272 | "def cleanup(df):\n", 273 | "\n", 274 | " df = sentences_df2.reset_index()\n", 275 | " df2 = df \n", 276 | " \n", 277 | " # To remove punctuation:\n", 278 | " for i, sentence in enumerate(df['sentence']):\n", 279 | " sentence2 = []\n", 280 | " for word in sentence:\n", 281 | " word2 = re.sub('\\W', '', word).lower()\n", 282 | " if len(word2)>0:\n", 283 | " sentence2.append(word2)\n", 284 | " #df2['sentence'][index] = sentence2 #<---- ERROR HERE\n", 285 | " #df2.at[index, 'sentence'] = sentence2 \n", 286 | " df2.at[df2.index[i],'sentence'] = sentence2 \n", 287 | " #df2['5yrperiod'][index] = df['5yrperiod'][index]\n", 288 | "\n", 289 | " \n", 290 | " return(df2)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 94, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "def structure_data_old(period_data):\n", 300 | " sentences_df = parallelize_operation(period_data, split_strings_into_sentences) # split speech into sentences\n", 301 | " sentences_df2 = parallelize_operation(sentences_df, split_sentences_into_words) # split sentences into words\n", 302 | " sentences_df3 = cleanup(sentences_df2) # cleanup punctuation and empty lines\n", 303 | "\n", 304 | " return(sentences_df3)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 155, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "def structure_data(sentences, lemma, stopwords, stemmed):\n", 314 | "\n", 315 | " # tokenize documents with gensim's tokenize() function\n", 316 | " token_list = [list(gensim.utils.tokenize(sent, lower=True)) for sent in sentences]\n", 317 | " \n", 318 | " # build bigram model\n", 319 | " bigram_mdl = gensim.models.phrases.Phrases(token_list, min_count=1, threshold=2)\n", 320 | "\n", 321 | " # lemmatize the tokens\n", 322 | " if lemma == True:\n", 323 | " pool = multiprocessing.Pool()\n", 324 | " token_list = pool.map(lemmatize_column, token_list) #[[wn.morphy(item) for item in list] for list in token_list] \n", 325 | " token_list = [[item for item in list if item is not None] for list in token_list] \n", 326 | " token_list[0][:15]\n", 327 | "\n", 328 | " # remove stopwords and/or do stemming\n", 329 | " from gensim.parsing.preprocessing import preprocess_string#, remove_stopwords#, #stem_text\n", 330 | " CUSTOM_FILTERS 
= []\n", 331 | " if stopwords == True:\n", 332 | " from gensim.parsing.preprocessing import remove_stopwords\n", 333 | " CUSTOM_FILTERS.append(remove_stopwords)\n", 334 | " if stemmed == True:\n", 335 | " from gensim.parsing.preprocessing import stem_text\n", 336 | " CUSTOM_FILTERS.append(stem_text)\n", 337 | " \n", 338 | " processed = [preprocess_string(\" \".join(word), CUSTOM_FILTERS) for word in token_list]\n", 339 | " #processed = [[item for item in list if item] for list in processed]\n", 340 | "\n", 341 | " # apply bigram model to list\n", 342 | " result = [bigram_mdl[item] for item in processed]\n", 343 | " \n", 344 | " return(result)\n", 345 | " " 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 147, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "def lemmatize_column(token_list): \n", 355 | " \n", 356 | " token_list = [wn.morphy(item) for item in token_list]#[[wn.morphy(item) for item in list] for list in token_list]\n", 357 | " return(token_list)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "## Making GENSIM Word Embeddings for every 5yr period" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 140, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "periodnames = all_data['5yrperiod'].unique().tolist()" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 141, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "#periodnames = [period for period in periodnames if int(period) > 1970]" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 142, 388 | "metadata": {}, 389 | "outputs": [ 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "[1870.0,\n", 394 | " 1875.0,\n", 395 | " 1880.0,\n", 396 | " 1885.0,\n", 397 | " 1890.0,\n", 398 | " 1895.0,\n", 399 | " 1900.0,\n", 400 | " 1905.0,\n", 401 | " 1910.0,\n", 402 | " 1915.0,\n", 403 | " 1920.0,\n", 404 | " 1925.0,\n", 405 | " 1930.0,\n", 406 | " 1935.0,\n", 407 | " 1940.0,\n", 408 | " 1945.0,\n", 409 | " 1950.0,\n", 410 | " 1955.0,\n", 411 | " 1960.0,\n", 412 | " 1965.0,\n", 413 | " 1970.0,\n", 414 | " 1975.0,\n", 415 | " 1980.0,\n", 416 | " 1985.0,\n", 417 | " 1990.0,\n", 418 | " 1995.0,\n", 419 | " 2000.0,\n", 420 | " 2005.0,\n", 421 | " 2010.0]" 422 | ] 423 | }, 424 | "execution_count": 142, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "periodnames" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 152, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "filename = 'lemmatized-stopworded-bigrammed-congress-model-'" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 153, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "/scratch/group/history/hist_3368-jguldi/congress-embeddings\n" 452 | ] 453 | } 454 | ], 455 | "source": [ 456 | "cd '/scratch/group/history/hist_3368-jguldi/congress-embeddings'" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "keyword_context = [] # create an empty dummy variable\n", 466 | "\n", 467 | "for period1 in periodnames:\n", 468 | " \n", 469 | " # get just the data in the period in question\n", 470 | " period_data = all_data[all_data['5yrperiod'] == period1]\n", 471 | "\n", 472 | "\n", 473 | " # split 
speech into sentences, split sentences into words, cleanup punctuation and empty lines\n", 474 | " structured_data = structure_data(period_data['speech'], True, True, False) \n", 475 | " structured_data\n", 476 | "\n", 477 | " # make a gensim model for that data\n", 478 | " period_model = gensim.models.Word2Vec( \n", 479 | " sentences = structured_data,\n", 480 | " workers= n,\n", 481 | " iter = 15,\n", 482 | " min_count = 20, \n", 483 | " size = 100) \n", 484 | " \n", 485 | " # save the model with the name of the period\n", 486 | " period_model.save(filename + str(period1)) \n", 487 | " \n", 488 | " # load model for each 5 yr period - one period per cycle of the for loop\n", 489 | " #period_model = gensim.models.Word2Vec.load('model-' + str(period1)) # to load a saved model\n", 490 | "\n", 491 | " # append each period to a larger model of all congress\n", 492 | " if period1 == periodnames[0]:\n", 493 | " congress_model = period_model # for the first time, save period_model as congress model\n", 494 | " else: \n", 495 | " congress_model.build_vocab(sentences_df3['sentence'], # after the first period, add new period data to the congress model\n", 496 | " update = True)\n", 497 | " congress_model.train(sentences_df3['sentence'], total_examples=period_model.corpus_count, epochs=period_model.epochs) \n", 498 | "\n", 499 | " # store the model with the name of the period\n", 500 | " congress_model.save(filename + str(startdate) + '-' + str(period1)) " 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "## Making GENSIM Word Embeddings for all Congress" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "cd '/scratch/group/history/hist_3368-jguldi/congress-embeddings'" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "# split speech into sentences, split sentences into words, cleanup punctuation and empty lines\n", 526 | "structured_data = structure_data(all_data, True, True, False) \n", 527 | " \n", 528 | "# make a gensim model for that data\n", 529 | "congress_model = gensim.models.Word2Vec( \n", 530 | " sentences = structured_data,\n", 531 | " workers= n,\n", 532 | " iter = 15,\n", 533 | " min_count = 20, \n", 534 | " size = 100) \n", 535 | " \n", 536 | "# save the model with the name of the period\n", 537 | "congress_model.save('lemmatized-stopworded-bigrammed-congress_model-1870-2010') " 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [] 546 | } 547 | ], 548 | "metadata": { 549 | "kernelspec": { 550 | "display_name": "Python 3", 551 | "language": "python", 552 | "name": "python3" 553 | }, 554 | "language_info": { 555 | "codemirror_mode": { 556 | "name": "ipython", 557 | "version": 3 558 | }, 559 | "file_extension": ".py", 560 | "mimetype": "text/x-python", 561 | "name": "python", 562 | "nbconvert_exporter": "python", 563 | "pygments_lexer": "ipython3", 564 | "version": "3.6.7" 565 | } 566 | }, 567 | "nbformat": 4, 568 | "nbformat_minor": 4 569 | } 570 | -------------------------------------------------------------------------------- /utilities/README.md: -------------------------------------------------------------------------------- 1 | This folder contains utility code used in the preparation of course materials. 
2 | -------------------------------------------------------------------------------- /utilities/concat_gutenberg_noveltm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Dear Student, your observation is correct: Project Gutenberg provides content whereas Novel TM provides metadata.\n", 10 | "# However, in just a few steps you can join the meta data with your content, allowing you to access this information in a single \n", 11 | "# dataframe \n", 12 | "\n", 13 | "# Here is a code example: \n", 14 | "\n", 15 | "# First download your Project Gutenberg data: " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 60, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import pandas as pd\n", 25 | "from gutenberg.acquire import load_etext\n", 26 | "from gutenberg.query import get_metadata\n", 27 | "from gutenberg.cleanup import strip_headers\n", 28 | "\n", 29 | "gutenberg_mirror = 'https://gutenberg.pglaf.org/'" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 61, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# Write the name and title to match the Novel TM data set (note that the name is ordered \"last name, first name\")\n", 39 | "book = [[1400,\"Great expectations: [and other stories]\",\"Dickens, Charles\"]]\n", 40 | "book\n", 41 | "\n", 42 | "# create a data frame from \"book\" that has columns for the ID, title, and author.\n", 43 | "# give the columns the same name as NovelTM's.\n", 44 | "guten_df = pd.DataFrame(book, columns=['ID','shorttitle','author']) \n", 45 | "guten_df" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 63, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | "
IDshorttitleauthorFullText
01400Great expectations: [and other stories]Dickens, CharlesGreat Expectations[1867 Edition]by Charles Dic...
\n", 90 | "
" 91 | ], 92 | "text/plain": [ 93 | " ID shorttitle author \\\n", 94 | "0 1400 Great expectations: [and other stories] Dickens, Charles \n", 95 | "\n", 96 | " FullText \n", 97 | "0 Great Expectations[1867 Edition]by Charles Dic... " 98 | ] 99 | }, 100 | "execution_count": 63, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "# clean the data\n", 107 | "guten_df['FullText'] = guten_df.apply(lambda row: strip_headers(load_etext(row['ID'], mirror=gutenberg_mirror)).replace(\"\\n\", \"\").replace(\"[Illustration]\", \"\") , axis=1)\n", 108 | "guten_df" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# Now select your Novel TM data: " 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 70, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "path_to_noveltm_metadata = \"/scratch/group/history/hist_3368-jguldi/tedunderwood-noveltmmeta-451ae72/metadata\"\n", 127 | "\n", 128 | "title_metadata = pd.read_csv(f'{path_to_noveltm_metadata}/titlemeta.tsv', sep='\\t', low_memory=False)\n", 129 | "\n", 130 | "# Here I am selecting just Dickens's Great Expectations\n", 131 | "great_expectations_metadata = title_metadata[92:93].copy()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 65, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/html": [ 142 | "
\n", 143 | "\n", 156 | "\n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | "
docidoldauthorauthorauthordateinferreddatelatestcompdatetypestartdateenddateimprint...allcopiesofworkcopiesin25yrsenumcronvolnumtitleparttitleearlyeditionshorttitlenonficprobjuvenileprob
92inu.30000011835398Dickens, CharlesDickens, Charles1812-1870.01870nNew York|Mershon co.|n.d....1.01.0NaNNaNGreat expectations: | [and other stories]/ | $...NaNTrueGreat expectations: [and other stories]0.5212840.081614
\n", 210 | "

1 rows × 30 columns

\n", 211 | "
" 212 | ], 213 | "text/plain": [ 214 | " docid oldauthor author authordate \\\n", 215 | "92 inu.30000011835398 Dickens, Charles Dickens, Charles 1812-1870. \n", 216 | "\n", 217 | " inferreddate latestcomp datetype startdate enddate \\\n", 218 | "92 0 1870 n \n", 219 | "\n", 220 | " imprint ... allcopiesofwork copiesin25yrs enumcron \\\n", 221 | "92 New York|Mershon co.|n.d. ... 1.0 1.0 NaN \n", 222 | "\n", 223 | " volnum title parttitle \\\n", 224 | "92 NaN Great expectations: | [and other stories]/ | $... NaN \n", 225 | "\n", 226 | " earlyedition shorttitle nonficprob \\\n", 227 | "92 True Great expectations: [and other stories] 0.521284 \n", 228 | "\n", 229 | " juvenileprob \n", 230 | "92 0.081614 \n", 231 | "\n", 232 | "[1 rows x 30 columns]" 233 | ] 234 | }, 235 | "execution_count": 65, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "great_expectations_metadata" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "# Now merge the two into a single data frame: " 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 68, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "df_merged = pd.merge(guten_df, great_expectations_metadata, on=['author', 'shorttitle'], how='left')" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 69, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/html": [ 270 | "
\n", 271 | "\n", 284 | "\n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | "
IDshorttitleauthorFullTextdocidoldauthorauthordateinferreddatelatestcompdatetype...instancesallcopiesofworkcopiesin25yrsenumcronvolnumtitleparttitleearlyeditionnonficprobjuvenileprob
01400Great expectations: [and other stories]Dickens, CharlesGreat Expectations[1867 Edition]by Charles Dic...inu.30000011835398Dickens, Charles1812-1870.01870n...11.01.0NaNNaNGreat expectations: | [and other stories]/ | $...NaNTrue0.5212840.081614
\n", 338 | "

1 rows × 32 columns

\n", 339 | "
" 340 | ], 341 | "text/plain": [ 342 | " ID shorttitle author \\\n", 343 | "0 1400 Great expectations: [and other stories] Dickens, Charles \n", 344 | "\n", 345 | " FullText docid \\\n", 346 | "0 Great Expectations[1867 Edition]by Charles Dic... inu.30000011835398 \n", 347 | "\n", 348 | " oldauthor authordate inferreddate latestcomp datetype ... \\\n", 349 | "0 Dickens, Charles 1812-1870. 0 1870 n ... \n", 350 | "\n", 351 | " instances allcopiesofwork copiesin25yrs enumcron volnum \\\n", 352 | "0 1 1.0 1.0 NaN NaN \n", 353 | "\n", 354 | " title parttitle earlyedition \\\n", 355 | "0 Great expectations: | [and other stories]/ | $... NaN True \n", 356 | "\n", 357 | " nonficprob juvenileprob \n", 358 | "0 0.521284 0.081614 \n", 359 | "\n", 360 | "[1 rows x 32 columns]" 361 | ] 362 | }, 363 | "execution_count": 69, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "df_merged" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [] 378 | } 379 | ], 380 | "metadata": { 381 | "kernelspec": { 382 | "display_name": "Python 3", 383 | "language": "python", 384 | "name": "python3" 385 | }, 386 | "language_info": { 387 | "codemirror_mode": { 388 | "name": "ipython", 389 | "version": 3 390 | }, 391 | "file_extension": ".py", 392 | "mimetype": "text/x-python", 393 | "name": "python", 394 | "nbconvert_exporter": "python", 395 | "pygments_lexer": "ipython3", 396 | "version": "3.6.7" 397 | } 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 4 401 | } 402 | -------------------------------------------------------------------------------- /utilities/export_spacy_doc.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import pandas as pd 3 | from os import cpu_count 4 | import argparse 5 | 6 | from spacy.tokens import DocBin 7 | 8 | nlp = spacy.load("en_core_web_sm") 9 | DEFAULT_CPU_COUNT = 3 10 | 11 | 12 | class Config: 13 | """ 14 | Keeps track of configuration options that we can pass around to other processes. 15 | """ 16 | CHUNK_SIZE = 2**15 17 | CPU_CORES = 3 18 | INPUT_FILE = '' 19 | OUTPUT_FILE = '' 20 | 21 | 22 | def parse_config(): 23 | """ 24 | Parses and validates command line arguments. 25 | :return: Config object filled with the corresponding configuration for the specified CL arguments. 26 | """ 27 | config = Config() 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('input_file', help='Input data file') 30 | parser.add_argument('output_file', help='output data file') 31 | parser.add_argument('--cores', default=DEFAULT_CPU_COUNT, type=int, help='Number of cores to use.') 32 | args = parser.parse_args() 33 | 34 | config.INPUT_FILE = args.input_file 35 | print('INPUT_FILE:', config.INPUT_FILE) 36 | 37 | config.OUTPUT_FILE = args.output_file 38 | print('OUTPUT_FILE:', config.OUTPUT_FILE) 39 | 40 | config.CPU_CORES = args.cores 41 | if config.CPU_CORES < 0 or config.CPU_CORES > cpu_count(): 42 | raise ValueError('Invalid core number specified.') 43 | 44 | print('CPU_CORES:', config.CPU_CORES) 45 | return config 46 | 47 | 48 | if __name__ == '__main__': 49 | conf = parse_config() 50 | df = pd.read_csv(conf.INPUT_FILE, usecols=['sentence_id', 'text']) 51 | 52 | if spacy.__version__[0] == '3': 53 | # Spacy v3 saved all attributes by default. 
54 | doc_bin = DocBin(store_user_data=True) 55 | else: 56 | doc_bin = DocBin(["LEMMA", "ENT_TYPE", "POS", "DEP"], store_user_data=True) 57 | 58 | for i, doc in enumerate(nlp.pipe(df['text'], n_process=conf.CPU_CORES)): 59 | if doc.is_parsed: 60 | doc.user_data['sentence_id'] = df.iloc[i, 0] 61 | doc_bin.add(doc) 62 | 63 | print('Writing file...') 64 | bytes_data = doc_bin.to_bytes() 65 | with open(conf.OUTPUT_FILE, 'wb+') as f: 66 | f.write(bytes_data) 67 | print('Exiting...') 68 | -------------------------------------------------------------------------------- /utilities/hansard_spacy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "id": "improved-perth", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import spacy\n", 11 | "from spacy.tokens import DocBin" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 21, 17 | "id": "smooth-tower", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "CPU times: user 5.48 s, sys: 1.37 s, total: 6.84 s\n", 25 | "Wall time: 7.27 s\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "%%time\n", 31 | "filepath ='/scratch/group/pract-txt-mine/hansard_1870_9_doc_object'\n", 32 | "\n", 33 | "# Important: we must start with a blank NLP pipeline.\n", 34 | "nlp = spacy.blank(\"en\")\n", 35 | "\n", 36 | "# We have to list these out for spacy versions < 3.\n", 37 | "attributes = [\"LEMMA\", \"ENT_TYPE\", \"POS\", \"DEP\"]\n", 38 | "\n", 39 | "with open(filepath, 'rb') as f:\n", 40 | " doc_bin = DocBin(attributes, store_user_data=True).from_bytes(f.read())" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "minimal-poultry", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "%%time\n", 51 | "# Retrieves all the docs, takes a long time\n", 52 | "docs = list(doc_bin.get_docs(nlp.vocab))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "victorian-toner", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "\n", 63 | "def get_n_docs(n: int):\n", 64 | " # Lets you retrieve only the first n documents,\n", 65 | " # if you think loading all of them would take too long\n", 66 | " # and is unnecessary.\n", 67 | " doc_generator = doc_bin.get_docs(nlp.vocab)\n", 68 | " return [doc for doc, _ in zip(doc_generator, range(n))]\n", 69 | "\n", 70 | "first_ten = get_n_docs(10)\n", 71 | "first_ten" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "reflected-groove", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "doc = first_ten[0]\n", 82 | "print(doc.user_data['sentence_id'])\n", 83 | "for token in first_ten[0]:\n", 84 | " print(token.text, token.lemma_, token.pos_)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "controversial-technology", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "Python 3", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.8.6" 113 | } 114
| }, 115 | "nbformat": 4, 116 | "nbformat_minor": 5 117 | } 118 | -------------------------------------------------------------------------------- /utilities/parse_city_council.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if sys.version_info.major < 3: 4 | exit('Python 3 required.') 5 | 6 | try: 7 | import fitz 8 | except ModuleNotFoundError as e: 9 | print('PyMuPDF is missing. Install using `pip install pymupdf`') 10 | raise e 11 | 12 | import pandas as pd 13 | 14 | import datetime 15 | import os 16 | from typing import Optional 17 | 18 | 19 | def extract_text(filepath: str) -> str: 20 | pdf = fitz.Document(filepath) 21 | 22 | texts = [] 23 | 24 | for i in range(pdf.pageCount): 25 | page = pdf[i] 26 | textPage = page.getTextPage() 27 | rawText = textPage.extractText() 28 | rawText = rawText.replace('\n', ' ').replace('\t', ' ') 29 | texts.append(rawText) 30 | 31 | return ' '.join(texts) 32 | 33 | 34 | def parse_date(name: str) -> Optional[datetime.datetime]: 35 | formats = ['%Y%m%d', '%m%d%yMin', 'cc%m%d%y'] 36 | 37 | for format in formats: 38 | try: 39 | return datetime.datetime.strptime(name, format) 40 | except ValueError: 41 | continue 42 | 43 | 44 | if __name__ == '__main__': 45 | import argparse 46 | parser = argparse.ArgumentParser( 47 | description='Extracts text from all PDFs inside the input directory and exports into a Pipe-seperated CSV file.' 48 | ) 49 | parser.add_argument('-d', required=True, metavar='input_directory', help='Target input directory containing PDF files to process') 50 | parser.add_argument('-o', required=True, metavar='output_file', help='Output filename') 51 | 52 | args = parser.parse_args() 53 | 54 | directory = args.d 55 | 56 | rows = [] 57 | 58 | for dirpath, dirnames, filenames in os.walk(directory): 59 | for fn in filenames: 60 | name, extension = fn.rsplit('.', maxsplit=1) 61 | if extension != 'pdf': 62 | continue 63 | 64 | date = parse_date(name) 65 | 66 | if date is None: 67 | print(f'Skipping {fn}: could not parse date from filename') 68 | continue 69 | 70 | filepath = os.path.join(dirpath, fn) 71 | 72 | text = extract_text(filepath) 73 | 74 | rows.append((name, date.strftime('%Y-%m-%d'), text)) 75 | 76 | print(f'Parsed {fn}') 77 | 78 | print(f'{len(rows)} files parsed. 
Exporting to {args.o}...') 79 | 80 | council_df = pd.DataFrame(rows, columns=['Filename', 'Date', 'Text']) 81 | council_df.index.name = 'index' 82 | council_df.to_csv(args.o, sep='|') 83 | -------------------------------------------------------------------------------- /utilities/reddit_subset.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | 4 | 5 | data <- read_csv("~/reddit_subset_2008.csv") 6 | 7 | #firstyear = 1869 8 | #lastyear = 1874 9 | 10 | #data <- data %>% 11 | # filter(year(speechdate) > firstyear) %>% 12 | # filter(year(speechdate) < lastyear) 13 | 14 | data <- data %>% 15 | select(body, subreddit) 16 | 17 | data$body <- data$body %>% 18 | tolower() 19 | 20 | #data <- data %>% 21 | # filter(str_detect(body, "\\bhe |\\bshe ")) 22 | 23 | data <- data %>% 24 | filter(str_detect(body, "\\bman |\\bwoman ")) 25 | 26 | dir <- setwd("~/") 27 | 28 | #write_csv(data, file.path(dir, "reddit_2008_he_she.csv")) 29 | 30 | write_csv(data, file.path(dir, "reddit_2008_man_woman.csv")) 31 | -------------------------------------------------------------------------------- /utilities/subset_reddit.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(lubridate) 3 | 4 | 5 | data <- read_csv("~/reddit_subset_2008.csv") 6 | 7 | #firstyear = 1869 8 | #lastyear = 1874 9 | 10 | #data <- data %>% 11 | # filter(year(speechdate) > firstyear) %>% 12 | # filter(year(speechdate) < lastyear) 13 | 14 | data <- data %>% 15 | select(body, subreddit) 16 | 17 | data$body <- data$body %>% 18 | tolower() 19 | 20 | #data <- data %>% 21 | # filter(str_detect(body, "\\bhe |\\bshe ")) 22 | 23 | data <- data %>% 24 | filter(str_detect(body, "\\bman |\\bwoman ")) 25 | 26 | dir <- setwd("~/") 27 | 28 | #write_csv(data, file.path(dir, "reddit_2008_he_she.csv")) 29 | 30 | write_csv(data, file.path(dir, "reddit_2008_man_woman.csv")) 31 | --------------------------------------------------------------------------------
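For readers skimming these utilities, the pattern that `export_spacy_doc.py` and `hansard_spacy.ipynb` share is the spaCy `DocBin` round trip: annotate texts once, serialize the docs to bytes, then reload them later into a blank pipeline without re-running the model. The sketch below is not part of the repository; it is a minimal illustration of that round trip, assuming spaCy 3.x is installed, `en_core_web_sm` has been downloaded, and the output path `docs.spacy` is only a placeholder.

```python
import spacy
from spacy.tokens import DocBin

# Annotate a couple of toy sentences and store them, with custom user_data,
# in a DocBin (spaCy 3 serializes all token attributes by default).
nlp = spacy.load("en_core_web_sm")
doc_bin = DocBin(store_user_data=True)
for i, doc in enumerate(nlp.pipe(["Mr. Gladstone spoke first.", "The motion was carried."])):
    doc.user_data["sentence_id"] = i  # survives serialization because store_user_data=True
    doc_bin.add(doc)

with open("docs.spacy", "wb") as f:  # hypothetical output path
    f.write(doc_bin.to_bytes())

# Reloading needs only a blank pipeline: the annotations come from the DocBin,
# not from re-running the model.
nlp_blank = spacy.blank("en")
with open("docs.spacy", "rb") as f:
    loaded = DocBin(store_user_data=True).from_bytes(f.read())

for doc in loaded.get_docs(nlp_blank.vocab):
    print(doc.user_data["sentence_id"], [(t.text, t.pos_) for t in doc])
```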