from setuptools import setup, find_packages

# Use the README as the long description shown on package indexes.
with open('README.md') as readme_file:
    readme = readme_file.read()

setup(
    name='gutenbergpoetrycorpus',
    version='0.0.1',
    author='Allison Parrish',
    author_email='allison@decontextualize.com',
    url='https://github.com/aparrish/gutenberg-poetry-corpus',
    description='A corpus of poetry from Project Gutenberg',
    long_description=readme,
    # README.md is markdown; declare it so PyPI renders it correctly.
    long_description_content_type='text/markdown',
    # BUG FIX: this was `setuptools.find_packages()`, but only `setup` was
    # imported from setuptools, so running setup.py raised
    # `NameError: name 'setuptools' is not defined`.
    packages=find_packages(),
    install_requires=[
        'gutenbergdammit==0.0.2',
        'wordfilter'
    ],
    # NOTE(review): dependency_links is deprecated/ignored by modern pip;
    # kept for backward compatibility with the documented
    # `pip install --process-dependency-links` invocation.
    dependency_links=[
        'https://github.com/aparrish/gutenberg-dammit/archive/master.zip#egg=gutenbergdammit-0.0.2'
    ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        'Programming Language :: Python :: 3',
    ],
    platforms='any',
)
def clean(s):
    """Strip Gutenberg typesetting artifacts from a line of text.

    Two kinds of artifact are removed:
    * a trailing line/page number preceded by three or more spaces,
      optionally followed by a period (e.g. ``"...and so it ends    120."``);
    * bracketed footnote markers such as ``[12]`` anywhere in the line.

    (The original docstring claimed "leading numbers" were removed; only
    the two patterns above are handled.)
    """
    # Trailing "    NNN" / "    NNN." run: cut the string just before it.
    match = re.search(r"( {3,}\d+\.?)$", s)
    if match:
        s = s[:match.start()]
    # Footnote references like "[12]" can appear anywhere; delete them all.
    s = re.sub(r"\[\d+\]", "", s)
    return s
# Sorta hamfisted criteria for determining if a line of text is a line of
# "poetry."  Each predicate receives the text of the previous line and the
# text of the line to check; a line is included only when *every* predicate
# passes.  TODO: Replace this with an actual classifier.
checks = {
    # between five and sixty-five characters (inclusive)
    'length': lambda prev, line: 5 <= len(line) <= 65,
    # not all upper-case
    'case': lambda prev, line: not line.isupper(),
    # doesn't begin with a roman numeral followed by a period.
    # Raw strings here and below: "\." / "\d" in plain string literals are
    # invalid escape sequences and warn on modern Python.
    # NOTE(review): the character class omits L and M — presumably to avoid
    # rejecting initials like "M."; confirm before widening it.
    'not_roman_numerals': lambda prev, line:
        not re.search(r"^[IVXDC]+\.", line),
    # if the last line was long and this one is short, it's probably the end
    # of a paragraph.
    # NOTE(review): since 'length' already caps kept lines at 65 chars,
    # the len(line) <= 65 clause is effectively always true, so this
    # rejects *any* line that follows a >= 65-character line — confirm
    # that is the intent.
    'not_last_para_line': lambda prev, line:
        not (len(prev) >= 65 and len(line) <= 65),
    # less than 25% of the line is punctuation characters (i.e. letters and
    # whitespace make up more than 75%; the +0.01 avoids division by zero
    # on empty lines)
    'punct': lambda prev, line:
        (len([ch for ch in line if ch.isalpha() or ch.isspace()]) /
         (len(line) + 0.01)) > 0.75,
    # doesn't begin with a bracket (angle or square)
    'no_bracket': lambda prev, line:
        not any(line.startswith(ch) for ch in '[<'),
    # isn't in title case
    'not_title_case': lambda prev, line: not line.istitle(),
    # isn't title case when considering only words of four or more letters
    'not_mostly_title_case': lambda prev, line:
        not " ".join(w for w in line.split() if len(w) >= 4).istitle(),
    # not more than 50% upper-case characters
    'not_mostly_upper': lambda prev, line:
        (len([ch for ch in line if ch.isupper()]) / (len(line) + 0.01)) < 0.5,
    # doesn't begin or end with a digit
    'not_number': lambda prev, line:
        not re.search(r"^\d", line) and not re.search(r"\d$", line),
    # passes the wordfilter blacklist
    'wordfilter_ok': lambda prev, line: not wordfilter.blacklisted(line)
}

def err(*args):
    """Print status messages to stderr; stdout is reserved for corpus output."""
    print(*args, file=sys.stderr)
if __name__ == '__main__':

    # remove some terms from wordfilter because they were filtering large
    # numbers of inoffensive lines; added one because its presence in this
    # corpus is almost always questionable. (terms in rot13 as a kind of
    # content warning; they are decoded at runtime via the rot_13 codec)
    wordfilter.remove_words([codecs.encode(item, "rot_13")
                             for item in ['ynzr', 'pevc', 'tnfu', 'fcvp']])
    wordfilter.add_words([codecs.encode("wrj", "rot_13")])

    # Single CLI option: the path to the Gutenberg-dammit zip archive.
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--srczip",
                      help="path to gutenberg-dammit-files zip",
                      default="gutenberg-dammit-files-v002.zip")
    options, _ = parser.parse_args()

    err("finding books of poetry in", options.srczip, "...")

    # Select books whose metadata says English, subject mentions "poetry",
    # and copyright status does not start with "Copyrighted".
    poetry = list(searchandretrieve(options.srczip, {
        'Language': 'English',
        'Subject': lambda x: 'poetry' in x.lower(),
        'Copyright Status': lambda x: not(x.startswith("Copyrighted"))
    }))

    err("done.")
    err("finding lines of poetry in", len(poetry), "books of poetry...")

    poem_lines = []   # accepted (line, gutenberg-id) pairs
    line_count = 0    # total lines examined, reported in the summary below
    for metadata, text in poetry:
        prev = ""     # previous line's text, consumed by the context checks
        for line in text.split("\n"):
            line = clean(line.strip())
            # run every heuristic; keep the line only if all of them pass
            check_results = {k: v(prev, line) for k, v in checks.items()}
            if all(check_results.values()):
                poem_lines.append((line, metadata['Num']))
            line_count += 1
            prev = line

    err("done.")
    err("found", len(poem_lines), "lines of poetry, of", line_count, "total.")

    # emit newline-delimited JSON on stdout: 's' is the line text,
    # 'gid' the Project Gutenberg book ID (status goes to stderr via err)
    err("printing to stdout...")
    for line in poem_lines:
        print(json.dumps({'s': line[0], 'gid': line[1]}))
The corpus is especially suited to 8 | applications in creative computational poetic text generation. 9 | 10 | [Download the corpus here.](http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz) 11 | 12 | ## How to use it 13 | 14 | The corpus is provided as a gzipped [newline-delimited JSON format](http://ndjson.org/). 15 | Here's a representative excerpt: 16 | 17 | {"s": "The Heav'ns and all the Constellations rung,", "gid": "20"} 18 | {"s": "The Planets in thir stations list'ning stood,", "gid": "20"} 19 | {"s": "While the bright Pomp ascended jubilant.", "gid": "20"} 20 | {"s": "Open, ye everlasting Gates, they sung,", "gid": "20"} 21 | {"s": "Open, ye Heav'ns, your living dores; let in", "gid": "20"} 22 | 23 | Each line of poetry is represented by a JSON object, with one object per line 24 | in the archive. The value for the `s` key is the line of poetry itself, and the 25 | value for the `gid` key is the ID of the Project Gutenberg book that the line 26 | comes from. You can use the value for `gid` to look up the title and author of 27 | the book that serves as that line's source, either "by hand" (just type the ID 28 | into Project Gutenberg's search box) or using a computer-readable version of 29 | the Project Gutenberg metadata (such as [Gutenberg, 30 | dammit](https://github.com/aparrish/gutenberg-dammit/)). 31 | 32 | The [Quick Experiments notebook](quick-experiments.ipynb) included in this 33 | repository shows how to get up and running quickly with the corpus in Python. 34 | No need to install the Python module in this repository---working with the data is 35 | surprisingly straightforward! 36 | 37 | ## How it was made 38 | 39 | The corpus was generated using the included `build.py` script, which uses 40 | [Gutenberg, dammit](https://github.com/aparrish/gutenberg-dammit/) to provide 41 | access to books from Project Gutenberg. First, books with the string `poetry` 42 | listed in their "Subject" metadata are added to a list. 
Then, the plaintext 43 | versions of those books are scanned for lines that "look like" poetry, based on 44 | a set of textual characteristics, such as their length and capitalization. 45 | (See `build.py` for a list of these characteristics.) Finally, lines are 46 | compared against a word list (from 47 | [wordfilter](https://github.com/dariusk/wordfilter)) to exclude lines that 48 | may contain egregiously offensive content. 49 | 50 | > NOTE: While a best-effort attempt has been made to exclude offensive language 51 | > from this corpus, I have not personally vetted each of the three million 52 | > lines. If you use this corpus to produce work for the public, please read 53 | > over it first or take appropriate measures to ensure that the language in the 54 | > work is appropriate for you and your audience. 57 | 58 | The corpus contains only lines of poetry from books that the Project Gutenberg 59 | metadata identifies as being written in English and as being free from 60 | copyright (i.e., public domain) in the United States. 61 | 62 | ## Examples of usage 63 | 64 | Previous versions of this corpus have served as a foundation for several 65 | projects produced by myself and others: 66 | 67 | * [Gutenberg Poetry 68 | Autocomplete](http://gutenberg-poetry.decontextualize.com/), a search 69 | engine-like interface for writing poems mined from Project Gutenberg. (A poem 70 | written using this interface was [recently published in the Indianapolis 71 | Review](https://theindianapolisreview.com/betting-the-under/)!) 
72 | * [*Articulations*](http://counterpathpress.org/articulations-allison-parrish), 73 | a book of poetry created by finding phonetically similar lines of poetry in 74 | Project Gutenberg 75 | * [Plot to Poem](http://static.decontextualize.com/plot-to-poem.html), a quick 76 | [NaPoGenMo](https://github.com/NaPoGenMo/) project that finds the lines of 77 | poetry closest in meaning to sentences from Wikipedia plot summaries 78 | * [Lynn Cherny](http://www.ghostweather.com/) used a version of this corpus to 79 | do [some quick and dirty computational stylistics on computer-generated 80 | poetry](https://medium.com/@lynn_72328/cocos-memory-palace-a-strange-fantasia-28b48264612f). 81 | 82 | If you make something cool with this corpus, let me know! 83 | 84 | ## Build your own from scratch 85 | 86 | You don't need to read any of the following if you just want to use the corpus. 87 | If you're interested in building your own version from scratch, read on. 88 | 89 | This repository includes a script to build the Gutenberg Poetry corpus from the 90 | files included in [Gutenberg, 91 | dammit](https://github.com/aparrish/gutenberg-dammit/). First, download the 92 | *Gutenberg, dammit* archive. Then install this package, like so: 93 | 94 | ```bash 95 | pip install --process-dependency-links https://github.com/aparrish/gutenberg-poetry-corpus/archive/master.zip 96 | ``` 97 | 98 | You can then run the following command to produce your own version of the 99 | corpus: 100 | 101 | ```bash 102 | python -m gutenbergpoetrycorpus.build --srczip=PATH-TO-GUTENBERG-DAMMIT-ZIP | gzip -c >gutenberg-poetry.ndjson.gz 103 | ``` 104 | 105 | Parameters for what gets included in the corpus can be adjusted in `build.py`. 106 | (E.g., it should be relatively easy to adapt this script to produce corpora of 107 | poetry in different languages!) 
108 | 109 | ## License 110 | 111 | To the best of my knowledge, the Gutenberg Poetry corpus contains only text 112 | excerpted from works that are in the public domain (at least in the United 113 | States). For avoidance of doubt, I release the particular arrangement of these 114 | excerpts in the provided format as 115 | [CC0](https://creativecommons.org/share-your-work/public-domain/cc0/). 116 | 117 | The code in this repository is provided under the following license: 118 | 119 | Copyright 2018 Allison Parrish 120 | 121 | Permission is hereby granted, free of charge, to any person obtaining a copy of 122 | this software and associated documentation files (the "Software"), to deal in 123 | the Software without restriction, including without limitation the rights to 124 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 125 | of the Software, and to permit persons to whom the Software is furnished to do 126 | so, subject to the following conditions: 127 | 128 | The above copyright notice and this permission notice shall be included in all 129 | copies or substantial portions of the Software. 130 | 131 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 132 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 133 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 134 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 135 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 136 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 137 | SOFTWARE. 
138 | 139 | -------------------------------------------------------------------------------- /quick-experiments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A Project Gutenberg Poetry Corpus: Quick Experiments\n", 8 | "\n", 9 | "By [Allison Parrish](https://www.decontextualize.com/)\n", 10 | "\n", 11 | "I made [a corpus of around three million lines of poetry from Project Gutenberg](https://github.com/aparrish/gutenberg-poetry-corpus), which anyone can download and use. This notebook shows a couple of quick examples of using the corpus in Python, just to get you started.\n", 12 | "\n", 13 | "First, [download the corpus via this link](http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz), or if you're following along in your own copy of Jupyter Notebook and you have `curl` installed, run the cell below:" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | " % Total % Received % Xferd Average Speed Time Time Time Current\n", 26 | " Dload Upload Total Spent Left Speed\n", 27 | "100 52.2M 100 52.2M 0 0 2014k 0 0:00:26 0:00:26 --:--:-- 1089k\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "!curl -O http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Three million lines of poetry in just over 52 megabytes! Not bad.\n", 40 | "\n", 41 | "The file is in gzipped [newline delimited JSON format](http://ndjson.org/): there's a JSON object on each line. You don't need to decompress the file to work with it, since Python has a handy library for working with gzipped files right in the code. 
The following cell will read in the file and create a list `all_lines` that contains all of these JSON objects." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "import gzip, json\n", 53 | "all_lines = []\n", 54 | "for line in gzip.open(\"gutenberg-poetry-v001.ndjson.gz\"):\n", 55 | " all_lines.append(json.loads(line.strip()))" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "Just to see what those lines look like, let's pick a handful at random:" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "import random" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "[{'gid': '3305', 's': 'When shall we find the spring come in,'},\n", 85 | " {'gid': '33156', 's': \"How great, in the wild whirl of Time's pursuits,\"},\n", 86 | " {'gid': '40344', 's': 'She sighs in desert lands:'},\n", 87 | " {'gid': '34870', 's': '\"We came within the fosses deep, that moat'},\n", 88 | " {'gid': '37752', 's': 'The dense black-coated throng, and all a-strain'},\n", 89 | " {'gid': '1365', 's': 'One only lives. Behold them where they lie'},\n", 90 | " {'gid': '32153', 's': 'With the rapturous adoration'},\n", 91 | " {'gid': '38877', 's': 'About them; and the horse of faery screamed'}]" 92 | ] 93 | }, 94 | "execution_count": 3, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "random.sample(all_lines, 8)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Each object has a key `s` that contains the text of the line of poetry, and a key `gid` that contains the Project Gutenberg ID of the file in question. 
You can use this ID to look up the title and author of the book of poetry that the line came from (either using the [Project Gutenberg website](https://www.gutenberg.org/) or using pre-built metadata from, e.g., [Gutenberg, dammit](https://github.com/aparrish/gutenberg-dammit/))." 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Concordances and counts\n", 115 | "\n", 116 | "The corpus could be useful for collecting, counting and comparing lines of poetry with certain characteristics. Here's our first experiment: find every line of poetry in the corpus with the word \"flower.\" I do this using a regular expression that finds the string `flower` between two word boundaries, without respect to case:" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "import re\n", 126 | "flower_lines = [line['s'] for line in all_lines if re.search(r'\\bflower\\b', line['s'], re.I)]" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "Again, just to see what we have, we'll take a random sample:" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 5, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "['Blooms for you some happy flower.',\n", 145 | " \"Low to his heart he said; 'the flower\",\n", 146 | " 'The blush is on the flower, and the bloom is on the tree,',\n", 147 | " 'Woo and win the Sahri-flower,',\n", 148 | " \"The very flower of Issland; 'twas a fair yet fearful scene.\",\n", 149 | " \"There's not a dew drop on the flower,\",\n", 150 | " \"Of fame, the world's alluring, phantom flower.\",\n", 151 | " 'Be it not mine to steal the cultured flower']" 152 | ] 153 | }, 154 | "execution_count": 5, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | 
"random.sample(flower_lines, 8)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "As a cut-up method poem, that's not bad all on its own! But let's do a little bit of Digital Humanities and make an aligned concordance of these lines, with the lines sorted alphabetically by the word following \"flower,\" using a bit of regular expression trickery:" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 6, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | " Or why sae sweet a flower as love\n", 180 | " So sweet a flower as she.\"\n", 181 | " A flower as yet unblossomed. Warmth and light\n", 182 | " Is only half in flower as yet. But why--\n", 183 | " \"To gain so fair a flower as you,\n", 184 | " Cast like a flower aside?\n", 185 | " (Yon scarlet fruit-bell is a flower asleep;)\n", 186 | " As doth a flower at Apollo's touch.\n", 187 | " 'Twas a pigmy flower at best,\n", 188 | " But he, the flower at head and soil at root,\n", 189 | " But he, the flower at head and soil at root,\n", 190 | " Blooms the perfect flower at last.\n", 191 | " Each plant and flower at length being view'd,\n", 192 | " Shaking that flower at me with soft invitation\n", 193 | " My sweet lovely flower at my ain fireside.\n", 194 | " _Who can behold the flower at noon, nor seek_\n", 195 | " Love's not a flower at sunset droops,\n", 196 | " From the Is flower at the fence;\n", 197 | " From the passion-flower at the gate,\n", 198 | " From the passion-flower at the gate.\n", 199 | " From the passion-flower at the gate.\n", 200 | " From the passion-flower at the gate.\n", 201 | " And those to flower at the prime (and yet\n", 202 | " I am the flower at your feet,\n", 203 | " The plum-tree's flower awakens\n", 204 | " And drink the very flower away.\n", 205 | " If such a flower be cast to the bleak winds,\n", 206 | " Yes, let the flower be gathered in 
its bloom!\n", 207 | " By thee shall herb and flower be kissed,\n", 208 | " By thee shall herb and flower be kissed;\n", 209 | " The withered leaf, the faded flower be mine,\n", 210 | " Never a flower be near me set,\n", 211 | " Before the flower be on the bramble spray?\n", 212 | " \"If this fair flower be plucked, oh, misery! oh,\n", 213 | " If the flower be scorched by the summer sun,\n", 214 | " And every flower be springing.\n", 215 | " And every flower be springing;\n", 216 | " Pale flower beaten by the rain!\n", 217 | " With careless scythe too near some flower bed,\n", 218 | " With careless scythe too near some flower bed,\n", 219 | " Humble little cottage, but a royal flower bed.\n", 220 | " Dry wildness of the weedy flower bed;\n", 221 | " Nor flower bedropt with diamond dew;\n", 222 | " Fruit trees and flower beds eaten bare,\n", 223 | " Among the flower beds her dear form sees,\n", 224 | " So pluck the flower before it fades--\n", 225 | " The colours of the flower before its leaves unclose;\n", 226 | " pass silently, flower before our eyes, it is\n", 227 | " Both beast and bird and flower before the Queen;\n", 228 | " And like a fragile flower before the storm,\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "longest = max([len(x) for x in flower_lines]) # find the length of the longest line\n", 234 | "center = longest - len(\"flower\") # and use it to create a \"center\" offset that will work for all lines\n", 235 | "\n", 236 | "sorted_flower_lines = sorted(\n", 237 | " [line for line in flower_lines if re.search(r\"\\bflower\\b\\s\\w\", line)], # only lines with word following\n", 238 | " key=lambda line: line[re.search(r\"\\bflower\\b\\s\", line).end():]) # sort on the substring following the match\n", 239 | "\n", 240 | "for line in sorted_flower_lines[350:400]: # change these numbers to see a different slice\n", 241 | " offset = center - re.search(r'\\bflower\\b', line, re.I).start()\n", 242 | " print((\" \"*offset)+line) # left-pad the string 
with spaces to align on \"flower\"" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "As another experiment, let's find all of the words that occur between either \"the\" or \"a\" and the word \"flower.\" English being the way it is, these words are pretty much guaranteed to be adjectives, so this is an ersatz but effective way of getting a (non-exhaustive) list of adjectives that are used to describe a flower in the corpus." 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 7, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "found_adj = []\n", 259 | "for line in flower_lines:\n", 260 | " matches = re.findall(r\"(the|a)\\s(\\b\\w+\\b)\\s(\\bflower\\b)\", line, re.I)\n", 261 | " for match in matches: \n", 262 | " found_adj.append(match[1])" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "Some adjectives at random:" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 8, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "['milky',\n", 281 | " 'sweetest',\n", 282 | " 'wild',\n", 283 | " 'fairer',\n", 284 | " 'moon',\n", 285 | " 'fairest',\n", 286 | " 'blue',\n", 287 | " 'flaming',\n", 288 | " 'splendid',\n", 289 | " 'golden',\n", 290 | " 'meanest',\n", 291 | " 'coveted']" 292 | ] 293 | }, 294 | "execution_count": 8, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "random.sample(found_adj, 12)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "Using the `Counter` object, we can easily count these up and find the twelve most common adjectives (used in the type of noun phrase we've identified) used to describe a flower:" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 9, 313 | "metadata": { 314 | "collapsed": true 315 | 
}, 316 | "outputs": [], 317 | "source": [ 318 | "from collections import Counter" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 10, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/plain": [ 329 | "[('little', 26),\n", 330 | " ('white', 23),\n", 331 | " ('sweetest', 22),\n", 332 | " ('wild', 19),\n", 333 | " ('fairest', 15),\n", 334 | " ('tender', 13),\n", 335 | " ('sweet', 11),\n", 336 | " ('purple', 11),\n", 337 | " ('meanest', 11),\n", 338 | " ('lovely', 10),\n", 339 | " ('bonnie', 10),\n", 340 | " ('faded', 9)]" 341 | ] 342 | }, 343 | "execution_count": 10, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "Counter(found_adj).most_common(12)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "The little white sweetest wild fairest tender sweet purple meanest lovely bonnie faded flower..." 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "## Rhymes\n", 364 | "\n", 365 | "Stretches of language identified as poetry characteristically exhibit some variety of rhyming, and the lines of poetry in the Gutenberg Poetry corpus are no different. Let's set ourselves a task of finding random rhyming lines in the corpus. To do this, we need to know how words are pronounced. The way that words are spelled in English doesn't really tell us anything helpful about how the word is pronounced, so we need some alternate method to get that information. The [CMU Pronouncing Dictionary](http://www.speech.cs.cmu.edu/cgi-bin/cmudict) is one such method: it's a big database of phonetic transcriptions for many thousands of English words.\n", 366 | "\n", 367 | "I made a Python library called [pronouncing](https://pypi.org/project/pronouncing/) to make it very easy to work with the CMU Pronouncing Dictionary in Python. 
You can install it like so:" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 33, 373 | "metadata": { 374 | "scrolled": true 375 | }, 376 | "outputs": [ 377 | { 378 | "name": "stdout", 379 | "output_type": "stream", 380 | "text": [ 381 | "Requirement already satisfied: pronouncing in /Users/allison/anaconda/lib/python3.6/site-packages\n", 382 | "Requirement already satisfied: cmudict>=0.4.0 in /Users/allison/anaconda/lib/python3.6/site-packages (from pronouncing)\n", 383 | "\u001b[33mYou are using pip version 9.0.3, however version 18.0 is available.\n", 384 | "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" 385 | ] 386 | } 387 | ], 388 | "source": [ 389 | "!pip install pronouncing" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "And then import it:" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 11, 402 | "metadata": { 403 | "collapsed": true 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "import pronouncing" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "We'll consider two lines to rhyme with each other if the last words in the lines rhyme. 
To test this out, we'll pick a source word, say, \"flowering,\" and find all of the words that rhyme with it:" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 12, 420 | "metadata": { 421 | "collapsed": true 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "source_word = \"flowering\"\n", 426 | "source_word_rhymes = pronouncing.rhymes(source_word)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 13, 432 | "metadata": {}, 433 | "outputs": [ 434 | { 435 | "data": { 436 | "text/plain": [ 437 | "['cowering',\n", 438 | " 'devouring',\n", 439 | " 'empowering',\n", 440 | " 'glowering',\n", 441 | " 'powering',\n", 442 | " 'scouring',\n", 443 | " 'showering',\n", 444 | " 'souring',\n", 445 | " 'towering']" 446 | ] 447 | }, 448 | "execution_count": 13, 449 | "metadata": {}, 450 | "output_type": "execute_result" 451 | } 452 | ], 453 | "source": [ 454 | "source_word_rhymes" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "And then look through the lines of poetry in the corpus for lines that end with any of these words:" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 14, 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "name": "stdout", 471 | "output_type": "stream", 472 | "text": [ 473 | "In the Winter you are cowering\n", 474 | "\"Oh, yes!\" exclaimed John, with a towering\n", 475 | "In the Winter you are cowering\n", 476 | "winged things may never pass, nay, not even the cowering\n", 477 | "Ithaca, these are wooing me against my will, and devouring\n", 478 | "\"Of Coleridge, I can not speak but with reverence. His towering\n", 479 | "upbraid him. 
\"Son of Tydeus,\" he said, \"why stand you cowering\n", 480 | "the heaviness of his heart, \"why are the Achaeans again scouring\n", 481 | "Maidens with towering\n", 482 | "Are its waters, aye showering\n", 483 | "In the Winter you are cowering\n", 484 | "In the Winter you are cowering\n", 485 | "So hunted, yet defiant, cowering\n", 486 | "The moonlit crests of foaming waves gleam towering\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "for line in all_lines:\n", 492 | " text = line['s']\n", 493 | " match = re.search(r'(\\b\\w+\\b)\\W*$', text)\n", 494 | " if match:\n", 495 | " last_word = match.group()\n", 496 | " if last_word in source_word_rhymes:\n", 497 | " print(text)" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "Looking through all three million lines of poetry to find rhyming lines one-by-one will be pretty slow. Another approach is to use the `phones_for_word()` and `rhyming_part()` functions in the `pronouncing` library to pre-build a data structure with all of the lines in the corpus grouped with their rhymes. 
The `phones_for_word()` function gives you the \"phones\" (sounds) of how a word is pronounced; the `rhyming_part()` function gives you just the portion of a string of phones that another word must share in order for them to be considered \"rhyming\":" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 15, 510 | "metadata": {}, 511 | "outputs": [ 512 | { 513 | "data": { 514 | "text/plain": [ 515 | "'F L AW1 ER0 IH0 NG'" 516 | ] 517 | }, 518 | "execution_count": 15, 519 | "metadata": {}, 520 | "output_type": "execute_result" 521 | } 522 | ], 523 | "source": [ 524 | "phones = pronouncing.phones_for_word(source_word)[0] # words may have multiple pronunciations, so this returns a list\n", 525 | "phones" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 16, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "data": { 535 | "text/plain": [ 536 | "'AW1 ER0 IH0 NG'" 537 | ] 538 | }, 539 | "execution_count": 16, 540 | "metadata": {}, 541 | "output_type": "execute_result" 542 | } 543 | ], 544 | "source": [ 545 | "pronouncing.rhyming_part(phones)" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "metadata": {}, 551 | "source": [ 552 | "The following cell builds the data structure proposed above: a dictionary that maps rhyming parts to a dictionary that maps words with that rhyming part to the lines of poetry that they're found at the end of." 
553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 17, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "from collections import defaultdict\n", 562 | "by_rhyming_part = defaultdict(lambda: defaultdict(list))\n", 563 | "for line in all_lines:\n", 564 | " text = line['s']\n", 565 | " if not(32 < len(text) < 48): # only use lines of uniform lengths\n", 566 | " continue\n", 567 | " match = re.search(r'(\\b\\w+\\b)\\W*$', text)\n", 568 | " if match:\n", 569 | " last_word = match.group()\n", 570 | " pronunciations = pronouncing.phones_for_word(last_word)\n", 571 | " if len(pronunciations) > 0:\n", 572 | " rhyming_part = pronouncing.rhyming_part(pronunciations[0])\n", 573 | " # group by rhyming phones (for rhymes) and words (to avoid duplicate words)\n", 574 | " by_rhyming_part[rhyming_part][last_word.lower()].append(text)" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "A random key/value pair from this dictionary, so you can see its structure:" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 24, 587 | "metadata": {}, 588 | "outputs": [ 589 | { 590 | "data": { 591 | "text/plain": [ 592 | "('EH1 N S AH0 Z',\n", 593 | " defaultdict(list,\n", 594 | " {'commences': ['Ancient history of Portugal commences',\n", 595 | " 'Each day some scene of woe commences'],\n", 596 | " 'expenses': ['Will pay for all the school expenses',\n", 597 | " 'Will pay for all the school expenses',\n", 598 | " 'Which brought great bothers and expenses'],\n", 599 | " 'fences': [\"We've been climbing trees an' fences\",\n", 600 | " 'And men too; and why there are fences']}))" 601 | ] 602 | }, 603 | "execution_count": 24, 604 | "metadata": {}, 605 | "output_type": "execute_result" 606 | } 607 | ], 608 | "source": [ 609 | "random_rhyming_part = random.choice(list(by_rhyming_part.keys()))\n", 610 | "random_rhyming_part, by_rhyming_part[random_rhyming_part]" 611 | ] 612 | }, 613 
| { 614 | "cell_type": "markdown", 615 | "metadata": {}, 616 | "source": [ 617 | "Many rhyming parts are found in multiple lines, but only with one unique word. While it's true that identical words \"rhyme,\" it's a little disingenuous to claim that we've made a computer program that finds rhyming lines of poetry if it's mostly just finding lines that end in the same word. So we'll just find the groups from the `by_rhyming_part` dictionary that have at least two different line-ending words:" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 25, 623 | "metadata": { 624 | "collapsed": true 625 | }, 626 | "outputs": [], 627 | "source": [ 628 | "rhyme_groups = [group for group in by_rhyming_part.values() if len(group) >= 2]" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "Now, find seven rhyming couplets by selecting a random rhyming group, sampling two keys (words) from that group, and printing a random line for each of those two words:" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 32, 641 | "metadata": {}, 642 | "outputs": [ 643 | { 644 | "name": "stdout", 645 | "output_type": "stream", 646 | "text": [ 647 | "For Brighton's size compared to Nairn\n", 648 | "The wind blaws clean about the cairn\n", 649 | "Or vermin, or, at best, of cock purloined\n", 650 | "There with the Romans in the camp were joined\n", 651 | "Nor wine nor wassail could raise a vassal\n", 652 | "You saw the day when Henry Schnetzen's castle\n", 653 | "The Legislative Bodies to assemble\n", 654 | "In vain would formal art dissemble\n", 655 | "Venus's Advice to Adonis on Hunting\n", 656 | "Growling, as was his wont, and grunting\n", 657 | "Of our successors should in part be seated\n", 658 | "Of ancient prudent words too much repeated\n", 659 | "Reared by a spring to stately height, amidst\n", 660 | "For here I read of Eden, and that in the midst\n" 661 | ] 662 | } 663 | ], 664 | "source": [ 665 |
"for i in range(7):\n", 666 | " group = random.choice(rhyme_groups)\n", 667 | " words = random.sample(list(group.keys()), 2)\n", 668 | " print(random.choice(group[words[0]]))\n", 669 | " print(random.choice(group[words[1]]))" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "metadata": {}, 675 | "source": [ 676 | "## Markov chain text generation\n", 677 | "\n", 678 | "Markov chain text generation uses statistical information about word co-occurrence to build a model that allows you to generate text that looks similar to your source text. [Markovify](https://github.com/jsvine/markovify) is a great library for Python that makes it easy to build and generate from Markov chain models. Install it like so:" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": 83, 684 | "metadata": {}, 685 | "outputs": [ 686 | { 687 | "name": "stdout", 688 | "output_type": "stream", 689 | "text": [ 690 | "Requirement already satisfied: markovify in /Users/allison/anaconda/lib/python3.6/site-packages\n", 691 | "Requirement already satisfied: unidecode in /Users/allison/anaconda/lib/python3.6/site-packages (from markovify)\n", 692 | "\u001b[33mYou are using pip version 9.0.3, however version 18.0 is available.\n", 693 | "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" 694 | ] 695 | } 696 | ], 697 | "source": [ 698 | "!pip install markovify" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "And import it:" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": 33, 711 | "metadata": { 712 | "collapsed": true 713 | }, 714 | "outputs": [], 715 | "source": [ 716 | "import markovify" 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "Our goal is to use a Markov chain to generate new lines of poetry from the Gutenberg Poetry corpus. 
Markovify requires you to pass in your source text as a string, so first off we'll create a big string with a sample of the corpus, separated by newlines:" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 34, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "big_poem = \"\\n\".join([line['s'] for line in random.sample(all_lines, 250000)])" 733 | ] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "metadata": {}, 738 | "source": [ 739 | "(You can change the number as needed; I kept it low so that the model will build fast and not consume too much RAM.)\n", 740 | "\n", 741 | "Build the model:" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": 35, 747 | "metadata": {}, 748 | "outputs": [], 749 | "source": [ 750 | "model = markovify.NewlineText(big_poem)" 751 | ] 752 | }, 753 | { 754 | "cell_type": "markdown", 755 | "metadata": {}, 756 | "source": [ 757 | "And then generate some lines:" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 36, 763 | "metadata": {}, 764 | "outputs": [ 765 | { 766 | "name": "stdout", 767 | "output_type": "stream", 768 | "text": [ 769 | "Were emerald: snow new-fallen seem'd the white wisteria\n", 770 | "And the stars have hid their white faces\n", 771 | "Bot wel he sih hire wepe,\n", 772 | "Red like a moon-shaft silver and the flow!\n", 773 | "In love Heaven gave him last his country house, as if in Nature's scorn,\n", 774 | "Of cloud grew violet; how thy fame has felt joy and uproar, can ne'er be effaced--\n", 775 | "Thus Ráma spoke: the Vánar found,\n", 776 | "I feel him warm, but how it steams in your arms and hands forespent with toil,\n", 777 | "Or that starred Ethiop queen that we die in a pleasant dream.\n", 778 | "Leave the dead anew.\n", 779 | "Through the streets he passed,\n", 780 | "Home through the mire;\n", 781 | "Since, stranger! 
thou hast every gentle wight I pray,\n", 782 | "Soon made the clouds, as morning walks the sea,\n" 783 | ] 784 | } 785 | ], 786 | "source": [ 787 | "for i in range(14):\n", 788 | " print(model.make_sentence())" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "This is okay but the lines don't make a lot of sense, and are sometimes too long. You can constrain the length using Markovify's `.make_short_sentence()` method:" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": 49, 801 | "metadata": {}, 802 | "outputs": [ 803 | { 804 | "data": { 805 | "text/plain": [ 806 | "'The record sound in the wood, or the glory moving on,'" 807 | ] 808 | }, 809 | "execution_count": 49, 810 | "metadata": {}, 811 | "output_type": "execute_result" 812 | } 813 | ], 814 | "source": [ 815 | "model.make_short_sentence(60)" 816 | ] 817 | }, 818 | { 819 | "cell_type": "markdown", 820 | "metadata": {}, 821 | "source": [ 822 | "I find that Markov-generated text is best when you keep it short and force juxtapositions—otherwise the reader's attention will wander. 
The following cell generates a series of short, haiku-esque poems of two to five Markov-generated lines, and ensures that the last line of each poem ends with a period:" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": 67, 828 | "metadata": {}, 829 | "outputs": [ 830 | { 831 | "name": "stdout", 832 | "output_type": "stream", 833 | "text": [ 834 | "\n", 835 | "There and here he died,\n", 836 | "Must be the king Theucer.\n", 837 | "\n", 838 | "~ ❀ ~\n", 839 | "\n", 840 | "They seemed the most beautiful;\n", 841 | "Better the rule maintain?\n", 842 | "Or kings be worn,\n", 843 | "From curl-crowned forehead to my good.\n", 844 | "\n", 845 | "~ ❀ ~\n", 846 | "\n", 847 | "And then the words upon our sphere,\n", 848 | "And so it runs away.\n", 849 | "Four-and-twenty years he spake\n", 850 | "They buried him at your length,\n", 851 | "I do not go from her flying.\n", 852 | "\n", 853 | "~ ❀ ~\n", 854 | "\n", 855 | "Bot of verray covenant\n", 856 | "amiable lady, by whom alone is giv'n.\n", 857 | "\n", 858 | "~ ❀ ~\n", 859 | "\n", 860 | "And there in the little earthen vessels,\n", 861 | "And owns no softer charm\n", 862 | "Is each to Heaven commends.\n", 863 | "\n", 864 | "~ ❀ ~\n", 865 | "\n", 866 | "I thought it very large.\n", 867 | "And wish'd confusion to the lute\n", 868 | "And, as he was kind,\n", 869 | "As the black stars, merrily.\n", 870 | "\n", 871 | "~ ❀ ~\n" 872 | ] 873 | } 874 | ], 875 | "source": [ 876 | "for i in range(6):\n", 877 | " print()\n", 878 | " for i in range(random.randrange(1, 5)):\n", 879 | " print(model.make_short_sentence(40))\n", 880 | " # ensure last line has a period at the end, for closure\n", 881 | " print(re.sub(r\"(\\w)[^\\w.]?$\", r\"\\1.\", model.make_short_sentence(40)))\n", 882 | " print()\n", 883 | " print(\"~ ❀ ~\")" 884 | ] 885 | }, 886 | { 887 | "cell_type": "markdown", 888 | "metadata": {}, 889 | "source": [ 890 | "## Further reading\n", 891 | "\n", 892 | "The [README in the code 
repository](README.md) has a few more examples of (earlier iterations of) this corpus at work.\n", 893 | "\n", 894 | "If you're just getting started with Python and creative language generation, check out the notes for [Reading and Writing Electronic Text](http://rwet.decontextualize.com/), a class I teach at ITP." 895 | ] 896 | } 897 | ], 898 | "metadata": { 899 | "kernelspec": { 900 | "display_name": "Python 3", 901 | "language": "python", 902 | "name": "python3" 903 | }, 904 | "language_info": { 905 | "codemirror_mode": { 906 | "name": "ipython", 907 | "version": 3 908 | }, 909 | "file_extension": ".py", 910 | "mimetype": "text/x-python", 911 | "name": "python", 912 | "nbconvert_exporter": "python", 913 | "pygments_lexer": "ipython3", 914 | "version": "3.6.5" 915 | } 916 | }, 917 | "nbformat": 4, 918 | "nbformat_minor": 2 919 | } 920 | --------------------------------------------------------------------------------