├── hashcat-rules
│   ├── passphrase-rule1.rule
│   └── passphrase-rule2.rule
├── LICENSE
├── utilities
│   ├── updating-sources.md
│   ├── kym_scrape.py
│   └── cleanup.py
└── README.md

/hashcat-rules/passphrase-rule1.rule:
--------------------------------------------------------------------------------
1 | :
2 | s -
3 | s .
4 | s _
5 | @ 
6 | c
7 | u
8 | C
9 | @ c
10 | @ C
11 | @ u
12 | E
13 | E@ 
14 | s - e-
15 | s . e.
16 | s _ e_
17 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 InitString
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/hashcat-rules/passphrase-rule2.rule:
--------------------------------------------------------------------------------
1 | # Do Nothing
2 | :
3 | 
4 | # Add years to end - both with spaces and without
5 | $2$0$2$0
6 | $2$0$2$1
7 | $2$0$2$2
8 | $2$0$2$3
9 | $2$0$2$4
10 | $2$0$2$5
11 | $2$0$2$0$!
12 | $2$0$2$1$!
13 | $2$0$2$2$!
14 | $2$0$2$3$!
15 | $2$0$2$4$!
16 | $2$0$2$5$!
17 | $ $2$0$2$0
18 | $ $2$0$2$1
19 | $ $2$0$2$2
20 | $ $2$0$2$3
21 | $ $2$0$2$4
22 | $ $2$0$2$5
23 | $ $2$0$2$0$!
24 | $ $2$0$2$1$!
25 | $ $2$0$2$2$!
26 | $ $2$0$2$3$!
27 | $ $2$0$2$4$!
28 | $ $2$0$2$5$!
29 | 
30 | # Add common numbers to the end - both with spaces and without
31 | $1
32 | $1$!
33 | $1$2$3
34 | $1$2$3$!
35 | $ $1
36 | $ $1$!
37 | $ $1$2$3
38 | $ $1$2$3$!
39 | 
40 | # Add common numbers to the beginning - both with spaces and without (have to do them backwards with a prepend)
41 | ^1
42 | ^3^2^1
43 | ^ ^1
44 | ^ ^3^2^1
45 | 
46 | # Add common punctuation to end
47 | $!
48 | $?
49 | 
50 | # G3t 133t (just the common ones) across whole phrase
51 | sa@sA@
52 | se3sE3
53 | sl1sL1
54 | so0sO0
55 | ss5sS5
56 | sa@sA@se3sE3so0sO0ss5sS5
57 | sa@sA@$!
58 | se3sE3$!
59 | sl1sL1$!
60 | so0sO0$!
61 | ss5sS5$!
62 | sa@sA@se3sE3so0sO0ss5sS5$!
63 | 
64 | # Hashcat doesn't support 'nth place' positional replace in rule sets yet.
65 | # So we can't say 'replace only the first A with @'
66 | # See: https://hashcat.net/wiki/doku.php?id=rule_based_attack#using_p_nth_instance_of_a_character_with_positional_rules
67 | # So, we are going to make some guesses here... Sub in l33t characters at positions 1,2,3 and also end with !
68 | # Not a great way to do it, but oh well. Will update in future if feature becomes available.
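# The oNX function overwrites the character at position N (0-indexed) with X.
# For example, 'o1@' turns 'take the red pill' into 't@ke the red pill'.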
69 | o1@
70 | o2@
71 | o3@
72 | o13
73 | o23
74 | o33
75 | o11
76 | o21
77 | o31
78 | o10
79 | o20
80 | o30
81 | o15
82 | o25
83 | o35
84 | o1@$!
85 | o2@$!
86 | o3@$!
87 | o13$!
88 | o23$!
89 | o33$!
90 | o11$!
91 | o21$!
92 | o31$!
93 | o10$!
94 | o20$!
95 | o30$!
96 | o15$!
97 | o25$!
98 | o35$!
99 | 
--------------------------------------------------------------------------------
/utilities/updating-sources.md:
--------------------------------------------------------------------------------
1 | # Notes on updating sources
2 | 
3 | Sure, this should be a CI job. But hey, it's a start.
4 | 
5 | Some of the source files get regular updates. Below is a guide to obtaining those, preparing them for cleaning, actually cleaning, and then merging into the existing list.
6 | 
7 | ## IMDB titles
8 | 
9 | ```
10 | wget https://datasets.imdbws.com/title.basics.tsv.gz
11 | gunzip ./title.basics.tsv.gz
12 | cat title.basics.tsv | awk -F '\t' '{print $3}' > ./imdb-titles-$(date +%Y-%m-%d).txt
13 | rm title.basics.tsv
14 | ```
15 | 
16 | ## Wikipedia article titles & category names
17 | 
18 | ```
19 | wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2
20 | bunzip2 ./enwiki-latest-pages-articles-multistream-index.txt.bz2
21 | cat ./enwiki-latest-pages-articles-multistream-index.txt | cut -d: -f 3- > ./wikipedia-$(date +%Y-%m-%d).txt
22 | rm enwiki-latest-pages-articles-multistream-index.txt
23 | 
24 | ```
25 | 
26 | ## Wiktionary titles
27 | 
28 | ```
29 | wget https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles.gz
30 | gunzip enwiktionary-latest-all-titles.gz
31 | cat enwiktionary-latest-all-titles | awk -F '\t' '{print $2}' > ./wiktionary-$(date +%Y-%m-%d).txt
32 | rm enwiktionary-latest-all-titles
33 | 
34 | ```
35 | 
36 | ## Urban Dictionary
37 | 
38 | ```
39 | git clone https://github.com/initstring/urban-dictionary-word-list
40 | cd urban-dictionary-word-list
41 | touch urban-dictionary-$(date +%Y-%m-%d).txt
42 | python3 ./main.py --out urban-dictionary-$(date +%Y-%m-%d).txt
43 | ```
44 | 
45 | ## Know Your Meme
46 | 
47 | ```
48 | python3 ./utilities/kym_scrape.py
49 | mv memes.txt ./know-your-meme-$(date +%Y-%m-%d).txt
50 | ```
51 | 
52 | ## Global POI dataset
53 | 
54 | ```
55 | wget http://download.geonames.org/export/dump/allCountries.zip
56 | unzip ./allCountries.zip
57 | cat allCountries.txt | awk -F '\t' '{print $3}' > ./global-poi-$(date +%Y-%m-%d).txt
58 | rm allCountries.zip
59 | rm allCountries.txt
60 | ```
61 | 
62 | ## Billboard charts
63 | 
64 | ```
65 | git clone https://github.com/initstring/umdmusic-downloader
66 | cd umdmusic-downloader
67 | pip3 install -r ./requirements.txt
68 | python3 ./downloader.py
69 | cat ./us_billboard.psv | cut -d "|" -f 5 > ./billboard-titles-$(date +%Y-%m-%d).txt
70 | cat ./us_billboard.psv | cut -d "|" -f 6 | sed "s/ featuring /\n/g" > ./billboard-artists-$(date +%Y-%m-%d).txt
71 | rm ./us_billboard.psv
72 | ```
73 | 
74 | ## Combining
75 | 
76 | With all raw files in the same folder:
77 | 
78 | ```
79 | cat ./*.txt | sort -u > raw.txt
80 | python3 ./cleanup.py raw.txt passphrases.txt
81 | ```
82 | 
83 | If you generate a new version and want to see what changed, you can list the lines unique to either file with:
84 | 
85 | ```
86 | sort new.txt old.txt | uniq -u
87 | ```
--------------------------------------------------------------------------------
/utilities/kym_scrape.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """
4 | Know Your Meme Scraper
5 | Grabs all titles from https://knowyourmeme.com
6 | 
7 | Puts output into memes.txt
8 | 
9 | Used to feed into password cracking wordlists like
10 | https://github.com/initstring/passphrase-wordlist
11 | 
12 | Code by initstring
13 | """
14 | 
15 | import html
16 | import time
17 | import re
18 | import sys
19 | import requests
20 | 
21 | # The "all" URL allows autoscrolling
22 | KYM_URL = 'https://knowyourmeme.com/memes/all/page'
23 | 
24 | # Regex to grab all formatted titles
25 | RE_TITLE = re.compile(r'<h2[^>]*>\s*(.*?)\s*</h2>')
26 | 
27 | # Text to know when we reached end of line
28 | NO_MORE = 'There are no entries for this category'
29 | 
30 | # Need real headers to get past WAF
31 | HEADERS = {'User-Agent': 'Mozilla/5.0'}
32 | 
33 | # Out file
34 | OUTFILE = 'memes.txt'
35 | 
36 | # File for in-process scraping
37 | LOGFILE = 'memes-incomplete.txt'
38 | 
39 | # Sleep to avoid IP ban
40 | SLEEP = 3
41 | 
42 | def write_log(phrases):
43 |     """
44 |     Logs phrases as the program runs
45 | 
46 |     Used for troubleshooting or to at least have _something_ in the case of
47 |     IP ban, failure, etc
48 |     """
49 |     with open(LOGFILE, 'a') as logfile:
50 |         for phrase in phrases:
51 |             phrase = html.unescape(phrase)
52 |             logfile.write(phrase + '\n')
53 | 
54 | def write_final(phrases):
55 |     """
56 |     Writes all phrases to a log file
57 |     """
58 |     # Unescape the HTML and write the phrases out
59 |     with open(OUTFILE, 'w') as outfile:
60 |         for phrase in phrases:
61 |             phrase = html.unescape(phrase)
62 |             outfile.write(phrase + '\n')
63 | 
64 | def scrape_pages():
65 |     """
66 |     Loops through all pages of kym
67 |     """
68 |     page = 0
69 |     phrases = set([])
70 | 
71 |     while True:
72 |         # Build the URL based on auto-scroll behaviour
73 |         url = "{}/{}".format(KYM_URL, page)
74 |         response = requests.get(url, headers=HEADERS)
75 | 
76 |         # Check for IP ban
77 |         if response.status_code == 403:
78 |             print("\n[!] You have been IP banned. Oops.")
79 |             sys.exit()
80 | 
81 |         # Return if no more results
82 |         if response.status_code == 404:
83 |             print("\n[*] Reached end of line at page {}. Exiting"
84 |                   .format(page))
85 |             return phrases
86 | 
87 |         # Clear stdout for ongoing notifications
88 |         sys.stdout.flush()
89 |         sys.stdout.write(" " * 20)
90 |         sys.stdout.write("\r")
91 | 
92 |         # Grab phrases from the raw text and add to set
93 |         new_phrases = re.findall(RE_TITLE, response.text)
94 |         phrases.update(new_phrases)
95 | 
96 |         # Write the new phrases to an ongoing logfile
97 |         write_log(new_phrases)
98 | 
99 |         # Update the patiently waiting user
100 |         sys.stdout.write("[*] Page: {}, Phrases: {}, Unique Phrases: {}"
101 |                          .format(page, len(new_phrases), len(phrases)))
102 | 
103 |         # Increment the page for the next loop
104 |         page += 1
105 | 
106 |         # Sleep to avoid IP ban
107 |         time.sleep(SLEEP)
108 | 
109 | 
110 | def main():
111 |     """
112 |     Main program function
113 |     """
114 |     print("[*] Scraping all pages of KYM...")
115 |     phrases = scrape_pages()
116 | 
117 |     print("[+] Found {} phrases, writing to {}..."
118 |           .format(len(phrases), OUTFILE))
119 |     write_final(phrases)
120 | 
121 | 
122 | if __name__ == "__main__":
123 |     main()
124 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 | 
3 | People think they are getting smarter by using passphrases. Let's prove them wrong!
4 | 
5 | This project includes a massive wordlist of phrases (over 20 million) and two hashcat rule files for GPU-based cracking. The rules will create over 1,000 permutations of each phrase.
6 | 
7 | To use this project, you need:
8 | 
9 | - The wordlist `passphrases.txt`, which you can find under [releases](https://github.com/initstring/passphrase-wordlist/releases).
10 | - Both hashcat rules [here](/hashcat-rules/).
11 | 
12 | **WORDLIST LAST UPDATED**: July 2025
13 | 
14 | # Usage
15 | 
16 | Generally, you will use this with hashcat's `-a 0` mode, which takes a wordlist and allows rule files. It is important to use the rule files in the correct order, as rule #1 mostly handles capital letters and spaces, and rule #2 deals with permutations.
17 | 
18 | Here is an example for NTLMv2 hashes. If you use the `-O` option, watch out for the maximum password length it sets - it may be too short.
19 | 
20 | ```
21 | hashcat -a 0 -m 5600 hashes.txt passphrases.txt -r passphrase-rule1.rule -r passphrase-rule2.rule -O -w 3
22 | ```
23 | 
24 | # Sources Used
25 | 
26 | Some sources are pulled from a static dataset, like a Kaggle upload. Others I generate myself using various scripts and APIs. I might one day automate that via CI, but for now you can see how I update the dynamic sources [here](/utilities/updating-sources.md).
27 | 
28 | | **source file name** | **source type** | **description** |
29 | | --- | --- | --- |
30 | | wiktionary-$(date).txt | dynamic | Article titles scraped from Wiktionary's index dump [here.](https://dumps.wikimedia.org/enwiktionary) |
31 | | wikipedia-$(date).txt | dynamic | Article titles scraped from the Wikipedia `pages-articles-multistream-index` dump generated 29-Sept-2021 [here.](https://dumps.wikimedia.org/enwiki) |
32 | | urban-dictionary-$(date).txt | dynamic | Urban Dictionary dataset pulled using [this script](https://github.com/mattbierner/urban-dictionary-word-list). |
33 | | know-your-meme-$(date).txt | dynamic | Meme titles from Know Your Meme scraped using my tool [here.](/utilities/kym_scrape.py) |
34 | | imdb-titles-$(date).txt | dynamic | IMDB dataset using the "primaryTitle" column from the `title.basics.tsv.gz` file available [here](https://datasets.imdbws.com/). |
35 | | global-poi-$(date).txt | dynamic | [Global POI dataset](https://download.geonames.org/export/dump/) using the 'allCountries' file from 29-Sept-2021. |
36 | | billboard-titles-$(date).txt | dynamic | Album and track names using [Ultimate Music Database](https://www.umdmusic.com/), scraped with [a fork of mwkling's tool](https://github.com/initstring/umdmusic-downloader), modified to grab Billboard Singles (1940-2021) and Billboard Albums (1970-2021) charts. |
37 | | billboard-artists-$(date).txt | dynamic | Artist names using [Ultimate Music Database](https://www.umdmusic.com/), scraped with [a fork of mwkling's tool](https://github.com/initstring/umdmusic-downloader), modified to grab Billboard Singles (1940-2021) and Billboard Albums (1970-2021) charts. |
38 | | book.txt | static | Kaggle dataset with titles from over 300,000 books. |
39 | | rstone-top-100.txt | static (could be dynamic in future) | Song lyrics for Rolling Stone's "top 100" artists using my [lyric scraping tool](https://github.com/initstring/lyricpass). |
40 | | cornell-movie-titles-raw.txt | static | Movie titles from this [Cornell project](https://www.cs.cornell.edu/~cristian//Cornell_Movie-Dialogs_Corpus.html). |
41 | | cornell-movie-lines.txt | static | Movie lines from this [Cornell project](https://www.cs.cornell.edu/~cristian//Cornell_Movie-Dialogs_Corpus.html). |
42 | | author-quotes-raw.txt | static | [Quotables](https://www.kaggle.com/alvations/quotables) dataset on Kaggle. |
43 | | 1800-phrases-raw.txt | static | [1,800 English Phrases.](https://www.phrases.org.uk/meanings/phrases-and-sayings-list.html) |
44 | | 15k-phrases-raw.txt | static | [15,000 Useful Phrases.](https://www.gutenberg.org/ebooks/18362) |
45 | 
46 | # Hashcat Rules
47 | 
48 | The rule files are designed to both "shape" the password and to mutate it. Shaping is based on the idea that human beings follow fairly predictable patterns when choosing a password, such as capitalising the first letter of each word and following the phrase with a number or special character. Mutations are also fairly predictable, such as replacing letters with visually-similar special characters.
49 | 
50 | Given the phrase `take the red pill`, the first hashcat rule will output the following:
51 | 
52 | ```
53 | take the red pill
54 | take-the-red-pill
55 | take.the.red.pill
56 | take_the_red_pill
57 | taketheredpill
58 | Take the red pill
59 | TAKE THE RED PILL
60 | tAKE THE RED PILL
61 | Taketheredpill
62 | tAKETHEREDPILL
63 | TAKETHEREDPILL
64 | Take The Red Pill
65 | TakeTheRedPill
66 | Take-The-Red-Pill
67 | Take.The.Red.Pill
68 | Take_The_Red_Pill
69 | ```
70 | 
71 | Adding in the second hashcat rule makes things a bit more interesting, returning a huge list per candidate. Here are a couple of examples:
72 | 
73 | ```
74 | T@k3Th3R3dPill!
75 | T@ke-The-Red-Pill
76 | taketheredpill2020!
77 | T0KE THE RED PILL
78 | ```
79 | 
80 | # Additional Info
81 | 
82 | Some researchers might also be interested in the script I use to clean the raw sources into the wordlist, available [here](/utilities/cleanup.py).
83 | 
84 | The cleanup script works like this:
85 | 
86 | ```
87 | $ python3.6 cleanup.py infile.txt outfile.txt
88 | Reading from ./infile.txt: 505 MB
89 | Wrote to ./outfile.txt: 250 MB
90 | Elapsed time: 0:02:53.062531
91 | 
92 | ```
93 | 
94 | Enjoy!
95 | 
--------------------------------------------------------------------------------
/utilities/cleanup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Prepares passphrase cracking lists for use with the hashcat rules at
6 | github.com/initstring/passphrase-wordlist
7 | """
8 | 
9 | import sys
10 | import re
11 | import urllib.parse
12 | import html
13 | import os
14 | import time
15 | import argparse
16 | from datetime import timedelta
17 | 
18 | # Set a min/max passphrase character length. Change this if you want.
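# Candidates outside this range are dropped later in choose_candidates().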
19 | MIN_LENGTH = 8
20 | MAX_LENGTH = 40
21 | 
22 | # Compiled regex patterns for performance
23 | MULTIWORD_PATTERN = re.compile('[a-z0-9\'&] [a-z0-9\'&]')
24 | ALLOWED_CHARS_PATTERN = re.compile("[^a-zA-Z0-9 '&]")
25 | MULTIPLE_SPACES_PATTERN = re.compile(r'\s\s+')
26 | QUOTE_REMOVAL_PATTERN = re.compile(r" '([^']*)' ")
27 | WHITESPACE_PATTERN = re.compile(r'\s+')
28 | HYPHEN_UNDERSCORE_PATTERN = re.compile(r'[-_]')
29 | APOSTROPHE_REMOVAL_PATTERN = re.compile("'")
30 | AND_TO_AMPERSAND_PATTERN = re.compile(' and ')
31 | AMPERSAND_TO_AND_PATTERN = re.compile('&')
32 | 
33 | # Accented character patterns
34 | ACCENTED_A_PATTERN = re.compile('[àáâãäå]')
35 | ACCENTED_E_PATTERN = re.compile('[èéêë]')
36 | ACCENTED_I_PATTERN = re.compile('[ìíîï]')
37 | ACCENTED_O_PATTERN = re.compile('[òóôõö]')
38 | ACCENTED_U_PATTERN = re.compile('[ùúûü]')
39 | ACCENTED_N_PATTERN = re.compile('[ñ]')
40 | ACCENTED_C_PATTERN = re.compile('[ç]')
41 | ACCENTED_Y_PATTERN = re.compile('[ÿ]')
42 | 
43 | # Split pattern
44 | SPLIT_PATTERN = re.compile(r';|,|\.')
45 | 
46 | def parse_arguments():
47 |     """
48 |     Handles user-passed parameters
49 |     """
50 |     desc = 'Transforms text files into passphrase lists.'
51 |     parser = argparse.ArgumentParser(description=desc)
52 | 
53 |     parser.add_argument('infile', type=str, action='store',
54 |                         help='Input file.')
55 |     parser.add_argument('outfile', type=str, action='store',
56 |                         help='Output file.')
57 | 
58 |     args = parser.parse_args()
59 | 
60 |     if not os.access(args.infile, os.R_OK):
61 |         print("[!] Cannot access input file, exiting")
62 |         sys.exit()
63 | 
64 |     return args
65 | 
66 | def build_buffer(infile):
67 |     """
68 |     Reads infile and builds a list of candidates for additional processing
69 |     """
70 |     buffer = []
71 | 
72 |     infile_size = str((int(os.path.getsize(infile)/1000000))) + " MB"
73 |     print("Reading from {}: {}".format(infile, infile_size))
74 | 
75 |     with open(infile, encoding='utf-8', errors='ignore') as file_handler:
76 |         for line in file_handler:
77 |             candidates = []
78 |             # Remove HTML and URL encoding first
79 |             line = escape_encoding(line)
80 | 
81 |             # Split lines with common delimiters like '.', ',' or ';'
82 |             for split_line in SPLIT_PATTERN.split(line):
83 |                 candidates.append(split_line.strip())
84 | 
85 |             # Append each candidate from the new short list to the buffer
86 |             for string in candidates:
87 |                 buffer.append(string)
88 | 
89 |     return buffer
90 | 
91 | def handle_punctuation(line):
92 |     """
93 |     Deals with common punctuation
94 |     """
95 |     clean_lines = []
96 | 
97 |     # Gets rid of any remaining special characters in the line
98 |     line = ALLOWED_CHARS_PATTERN.sub('', line)
99 | 
100 |     # Shrinks down multiple spaces
101 |     line = MULTIPLE_SPACES_PATTERN.sub(' ', line)
102 | 
103 |     # Strip quotes around line
104 |     line = line.strip('\'"')
105 | 
106 |     # Remove quotes around internal segments
107 |     line = QUOTE_REMOVAL_PATTERN.sub(r' \1 ', line)
108 | 
109 |     # If line has an apostrophe, add a duplicate with it removed
110 |     if "'" in line:
111 |         clean_lines.append(APOSTROPHE_REMOVAL_PATTERN.sub("", line))
112 | 
113 |     # Make duplicate phrases swapping ' and ' with ' & ' and vice versa
114 |     if ' and ' in line:
115 |         clean_lines.append(AND_TO_AMPERSAND_PATTERN.sub(' & ', line))
116 |     if '&' in line:
117 |         newline = AMPERSAND_TO_AND_PATTERN.sub(' and ', line)
118 |         newline = WHITESPACE_PATTERN.sub(' ', newline).strip()
119 |         clean_lines.append(newline)
120 | 
121 |     # Add what is left to the list and return it
122 |     clean_lines.append(line)
123 |     return clean_lines
124 | 
125 | def escape_encoding(line):
126 |     """
127 |     Deals with common encoding and accented characters
128 |     """
129 |     line = urllib.parse.unquote(line)  # convert URL encoding like %27
130 |     line = html.unescape(line)  # convert HTML encoding like &apos;
131 |     line = WHITESPACE_PATTERN.sub(' ', line).strip()  # Remove extra whitespace
132 |     line = line.lower()  # convert to lowercase
133 |     line = HYPHEN_UNDERSCORE_PATTERN.sub(' ', line)  # Change - and _ to spaces
134 | 
135 |     # The following lines attempt to remove accented characters, as the
136 |     # tool is focused on English-language passwords.
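    # For example, 'café señor' becomes 'cafe senor'.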
137 |     line = ACCENTED_A_PATTERN.sub('a', line)
138 |     line = ACCENTED_E_PATTERN.sub('e', line)
139 |     line = ACCENTED_I_PATTERN.sub('i', line)
140 |     line = ACCENTED_O_PATTERN.sub('o', line)
141 |     line = ACCENTED_U_PATTERN.sub('u', line)
142 |     line = ACCENTED_N_PATTERN.sub('n', line)
143 |     line = ACCENTED_C_PATTERN.sub('c', line)
144 |     line = ACCENTED_Y_PATTERN.sub('y', line)
145 | 
146 |     return line
147 | 
148 | def choose_candidates(line):
149 |     """
150 |     Final check to determine which cleaned phrases to keep
151 |     """
152 |     # Throw out single-word candidates
153 |     if not MULTIWORD_PATTERN.search(line):
154 |         return False
155 | 
156 |     # Throw out too-short / too-long lines
157 |     if len(line) < MIN_LENGTH or len(line) > MAX_LENGTH:
158 |         return False
159 | 
160 |     return True
161 | 
162 | def write_file(buffer, outfile):
163 |     """
164 |     Writes chosen candidates to an output file
165 |     """
166 |     with open(outfile, 'w') as file_handler:
167 |         for line in sorted(buffer):
168 |             file_handler.write(line.strip() + '\n')
169 | 
170 |     outfile_size = str((int(os.path.getsize(outfile)/1000000)))
171 |     print("Wrote to {}: {} MB".format(outfile, outfile_size))
172 | 
173 | 
174 | def main():
175 |     """
176 |     Main program function
177 |     """
178 |     start = time.time()
179 |     args = parse_arguments()
180 |     buffer = build_buffer(args.infile)
181 |     final = set([])
182 |     # Processes phrases and adds to a set (deduped)
183 |     for phrase in buffer:
184 |         new_phrases = handle_punctuation(phrase)
185 |         for newphrase in new_phrases:
186 |             if choose_candidates(newphrase):
187 |                 final.add(newphrase)
188 |     # Writes final set out to file
189 |     write_file(final, args.outfile)
190 |     elapsed = time.time() - start
191 |     print("Elapsed time: " + str(timedelta(seconds=elapsed)))
192 | 
193 | 
194 | if __name__ == "__main__":
195 |     main()
196 | 
--------------------------------------------------------------------------------