├── COPYING ├── README.md ├── storyofjohn.py └── storyofjohn2.py /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 Jeff Binder 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is the code for my [entry](https://github.com/dariusk/NaNoGenMo-2014/issues/75) in the [2014 National Novel Generating Month contest](https://github.com/dariusk/NaNoGenMo-2014). It generates a novel that tells the story of John - that is, the story of every character named John in the Wright American Fiction corpus, merged into one person. 2 | 3 | The script works by identifying sentences in the corpus that contain the lemma "John" tagged as a proper name. For the purposes of consistency, it also standardizes John's last name to "Arbuckle." 
It does this by looking for proper names that appear immediately after the token John; once a last name has been identified as belonging to (some) John, it will be replaced by "Arbuckle" for the remainder of the input text. (Yes, I know that the character's name is spelled "Jon.") 4 | 5 | To keep things readable, the script also eliminates sentences that contain an odd number of quotation marks. 6 | 7 | There are two versions of this script. The first outputs the sentences in the order in which they appear in the corpus, with a "chapter" for each text and a paragraph for each division within the texts. The second mixes the texts together, ordering the sentences based on how far through the text they appear, so that sentences from the early parts of a novel will tend to appear near the beginning of the output, etc. This version creates paragraph breaks based on the patterns of quotation marks, and starts new chapters whenever it has passed chapter breaks in all of the input texts. -------------------------------------------------------------------------------- /storyofjohn.py: -------------------------------------------------------------------------------- 1 | # First go round - creates a chapter for each text and a paragraph for each section 2 | # of the source texts. 
import codecs
import os
import xml.etree.ElementTree as ET

indir = '../corpora/monk/wright/bibadorned'
first_name = 'John'
surname = 'Arbuckle'

# Lemmas are lower-cased before comparison, so the first name is too.
first_name_tok = first_name.lower()

# Namespace prefix carried by every TEI element tag in the input files.
TEI_NS = '{http://www.tei-c.org/ns/1.0}'


def standardize_token(word, pos, lem, after_first_name, matching_surnames):
    """Return the list of output strings for one <w> token.

    word -- the token text ('' when the element has no text).
    pos, lem -- the token's 'pos' attribute and lower-cased 'lem' attribute.
    after_first_name -- True when the previous <w> token was the first name.
    matching_surnames -- set of lemmas already identified as surnames in the
        current input file; it is updated in place when a new one is found.

    A name-like token directly after the first name is recorded as a surname
    and replaced by `surname`; later 'np1'/'npg1' tokens with a recorded
    lemma are replaced as well.  For the 'npg1'/'ng1' tags a separate "'s"
    piece is emitted after the replacement (presumably these tags mark
    possessive forms -- confirm against the corpus tag set).
    """
    # Slicing instead of indexing keeps empty token text from raising.
    capitalized = word[:1].isupper()
    if after_first_name and (pos in ('np1', 'np-n1')
                             or (pos == 'n1' and capitalized)):
        matching_surnames.add(lem)
        return [surname]
    if after_first_name and (pos == 'npg1'
                             or (pos == 'ng1' and capitalized)):
        matching_surnames.add(lem)
        return [surname, "'s"]
    if pos == 'np1' and lem in matching_surnames:
        return [surname]
    if pos == 'npg1' and lem in matching_surnames:
        return [surname, "'s"]
    return [word]


def extract_div_sentences(div, matching_surnames):
    """Return the sentences of one <div> that mention the first name.

    Walks the <w> (word) and <c> (inter-word character) children of every
    <p> in `div`, rebuilding each sentence up to the token whose 'eos'
    attribute is '1'.  A sentence is kept only if it contains the first name
    tagged 'np1' or 'npg1' and has an even number of quotation-mark tokens.
    `matching_surnames` is shared across the divs of one file and is
    updated in place.
    """
    sentences = []
    current_sentence = []
    between_sentences = True
    contains_name = False
    num_quotes_in_sentence = 0
    prior_tok_is_first_name = False
    for p in div.findall(TEI_NS + 'p'):
        for el in p:
            if el.tag == TEI_NS + 'w':
                pos = el.attrib['pos']
                lem = el.attrib['lem'].lower()
                between_sentences = False
                # el.text is None for an empty element; treat it as ''.
                current_sentence.extend(standardize_token(
                    el.text or '', pos, lem,
                    prior_tok_is_first_name, matching_surnames))
                prior_tok_is_first_name = (pos in ('np1', 'npg1')
                                           and lem == first_name_tok)
                if prior_tok_is_first_name:
                    contains_name = True
                # NOTE(review): the original tested pos/lem against '"'
                # twice; the second pair may have been meant for a different
                # quote character.  Confirm against the corpus.
                if '"' in (pos, lem):
                    num_quotes_in_sentence += 1
                if el.attrib['eos'] == '1':
                    if contains_name and num_quotes_in_sentence % 2 == 0:
                        sentences.append(''.join(current_sentence))
                    current_sentence = []
                    between_sentences = True
                    contains_name = False
                    num_quotes_in_sentence = 0
            elif el.tag == TEI_NS + 'c':
                # Spacing between sentences is dropped; sentences are later
                # joined with a single space.
                if not between_sentences:
                    current_sentence.append(el.text or '')
    return sentences


def main(indir=indir, outpath='output.txt'):
    """Write one chapter per input text and one paragraph per <div>.

    indir -- directory of TEI XML files to read.
    outpath -- UTF-8 text file to (over)write.

    Files are visited in sorted name order so that chapter numbering is
    reproducible (os.listdir itself returns names in arbitrary order).
    Texts that yield no matching sentence produce no chapter.
    """
    chapter = 0
    # The with-block guarantees the output file is closed even on error.
    with codecs.open(outpath, 'w', 'utf8') as f:
        for filename in sorted(os.listdir(indir)):
            print(filename)
            tree = ET.parse(os.path.join(indir, filename))
            body = tree.getroot().find(TEI_NS + 'text/' + TEI_NS + 'body')
            paragraphs = []
            # Surnames are tracked per input text.
            matching_surnames = set()
            for div in body.findall(TEI_NS + 'div'):
                sentences = extract_div_sentences(div, matching_surnames)
                if sentences:
                    paragraphs.append(' '.join(sentences))
            if paragraphs:
                chapter += 1
                text = '\n\n\n' if chapter > 1 else ''
                text += 'Chapter ' + str(chapter) + '\n\n'
                text += '\n\n'.join(paragraphs)
                f.write(text)
                # Flush per chapter so progress is visible in the output file.
                f.flush()


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /storyofjohn2.py:
# --------------------------------------------------------------------------------
# Second go round - orders sentences based on how far along they appear in the original
# texts, so that sentences that tend to appear near the beginning of an input text
# appear near the beginning of an output text, etc.
import codecs
import os
import xml.etree.ElementTree as ET

indir = '../corpora/monk/wright/bibadorned'
first_name = 'John'
surname = 'Arbuckle'

# Lemmas are lower-cased before comparison, so the first name is too.
first_name_tok = first_name.lower()

# Namespace prefix carried by every TEI element tag in the input files.
TEI_NS = '{http://www.tei-c.org/ns/1.0}'


def standardize_token(word, pos, lem, after_first_name, matching_surnames):
    """Return the list of output strings for one <w> token.

    A name-like token directly after the first name is recorded in
    `matching_surnames` (updated in place) and replaced by `surname`; later
    'np1'/'npg1' tokens with a recorded lemma are replaced as well.  For the
    'npg1'/'ng1' tags a separate "'s" piece follows the replacement
    (presumably these tags mark possessive forms -- confirm against the
    corpus tag set).  Any other token is returned unchanged.
    """
    # Slicing instead of indexing keeps empty token text from raising.
    capitalized = word[:1].isupper()
    if after_first_name and (pos in ('np1', 'np-n1')
                             or (pos == 'n1' and capitalized)):
        matching_surnames.add(lem)
        return [surname]
    if after_first_name and (pos == 'npg1'
                             or (pos == 'ng1' and capitalized)):
        matching_surnames.add(lem)
        return [surname, "'s"]
    if pos == 'np1' and lem in matching_surnames:
        return [surname]
    if pos == 'npg1' and lem in matching_surnames:
        return [surname, "'s"]
    return [word]


def extract_doc_sentences(body):
    """Collect the matching sentences of one text with their token offsets.

    Returns a tuple (doc_sentences, ntoks, num_sentences):
    doc_sentences -- list of (sentence, offset) pairs, where offset is the
        number of <w> tokens read so far in this text; after every <div> an
        ('', offset) pair is appended as a section-break marker.
    ntoks -- total number of <w> tokens seen in the text.
    num_sentences -- number of real (non-marker) sentences collected.

    A sentence is kept only if it contains the first name tagged 'np1' or
    'npg1' and has an even number of quotation-mark tokens.
    """
    doc_sentences = []
    ntoks = 0
    num_sentences = 0
    # Surnames are tracked per input text.
    matching_surnames = set()
    for div in body.findall(TEI_NS + 'div'):
        current_sentence = []
        between_sentences = True
        contains_name = False
        num_quotes_in_sentence = 0
        prior_tok_is_first_name = False
        for p in div.findall(TEI_NS + 'p'):
            for el in p:
                if el.tag == TEI_NS + 'w':
                    ntoks += 1
                    pos = el.attrib['pos']
                    lem = el.attrib['lem'].lower()
                    between_sentences = False
                    # el.text is None for an empty element; treat it as ''.
                    current_sentence.extend(standardize_token(
                        el.text or '', pos, lem,
                        prior_tok_is_first_name, matching_surnames))
                    prior_tok_is_first_name = (pos in ('np1', 'npg1')
                                               and lem == first_name_tok)
                    if prior_tok_is_first_name:
                        contains_name = True
                    # NOTE(review): the original tested pos/lem against '"'
                    # twice; the second pair may have been meant for a
                    # different quote character.  Confirm against the corpus.
                    if '"' in (pos, lem):
                        num_quotes_in_sentence += 1
                    if el.attrib['eos'] == '1':
                        if contains_name and num_quotes_in_sentence % 2 == 0:
                            doc_sentences.append(
                                (''.join(current_sentence), ntoks))
                            num_sentences += 1
                        current_sentence = []
                        between_sentences = True
                        contains_name = False
                        num_quotes_in_sentence = 0
                elif el.tag == TEI_NS + 'c':
                    # Spacing between sentences is dropped; the assembler
                    # adds its own separator after every sentence.
                    if not between_sentences:
                        current_sentence.append(el.text or '')
        # Empty string marks the end of a section of the source text.
        doc_sentences.append(('', ntoks))
    return doc_sentences, ntoks, num_sentences


def assemble_story(sentences, ntexts):
    """Merge (sentence, relative_position) pairs into the output text.

    sentences -- pairs from all texts; '' sentences are section-break
        markers.  Pairs are ordered by relative position (stable sort, so
        ties keep their input order).
    ntexts -- number of input texts; a new chapter starts at the first
        sentence after at least this many section-break markers have been
        passed since the previous chapter heading.

    Within a chapter a paragraph break is inserted when the previous
    sentence ends with '"' or the next one starts with '"'.
    """
    text = ['Chapter 1\n\n']
    chapter_num = 1
    nsectionbreaks = 0
    last_sentence = '.'
    for sentence, _position in sorted(sentences, key=lambda x: x[1]):
        if sentence == '':
            nsectionbreaks += 1
            continue
        if nsectionbreaks >= ntexts:
            chapter_num += 1
            text.append('\n\n\nChapter ' + str(chapter_num) + '\n\n')
            nsectionbreaks = 0
        elif last_sentence.endswith('"') or sentence.startswith('"'):
            text.append('\n\n')
        text.append(sentence + ' ')
        last_sentence = sentence
    return ''.join(text)


def main(indir=indir, outpath='output2.txt'):
    """Interleave the matching sentences of all texts by relative position.

    indir -- directory of TEI XML files to read.
    outpath -- UTF-8 text file to (over)write.

    Each sentence is placed at (token offset / total tokens of its text).
    Files are read in sorted name order so that ties between equal
    positions resolve reproducibly (os.listdir returns names in arbitrary
    order).  Every file counts toward the chapter-break threshold, including
    texts that contribute no sentences, as in the original script.
    """
    sentences = []
    ntexts = 0
    for filename in sorted(os.listdir(indir)):
        print(filename)
        ntexts += 1
        tree = ET.parse(os.path.join(indir, filename))
        body = tree.getroot().find(TEI_NS + 'text/' + TEI_NS + 'body')
        doc_sentences, ntoks_total, num_sentences = extract_doc_sentences(body)
        # Texts with no tokens or no matching sentence contribute nothing,
        # not even their section-break markers.
        if ntoks_total and num_sentences:
            for sentence, offset in doc_sentences:
                sentences.append((sentence, float(offset) / ntoks_total))
    # The with-block guarantees the output file is closed even on error.
    with codecs.open(outpath, 'w', 'utf8') as f:
        f.write(assemble_story(sentences, ntexts))


if __name__ == '__main__':
    main()