├── hashcat-rules
│   ├── passphrase-rule1.rule
│   └── passphrase-rule2.rule
├── LICENSE
├── utilities
│   ├── updating-sources.md
│   ├── kym_scrape.py
│   └── cleanup.py
└── README.md
/hashcat-rules/passphrase-rule1.rule:
--------------------------------------------------------------------------------
1 | :
2 | s -
3 | s .
4 | s _
5 | @
6 | c
7 | u
8 | C
9 | @ c
10 | @ C
11 | @ u
12 | E
13 | E@
14 | s - e-
15 | s . e.
16 | s _ e_
17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 InitString
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/hashcat-rules/passphrase-rule2.rule:
--------------------------------------------------------------------------------
1 | # Do Nothing
2 | :
3 |
4 | # Add years to end - both with spaces and without
5 | $2$0$2$0
6 | $2$0$2$1
7 | $2$0$2$2
8 | $2$0$2$3
9 | $2$0$2$4
10 | $2$0$2$5
11 | $2$0$2$0$!
12 | $2$0$2$1$!
13 | $2$0$2$2$!
14 | $2$0$2$3$!
15 | $2$0$2$4$!
16 | $2$0$2$5$!
17 | $ $2$0$2$0
18 | $ $2$0$2$1
19 | $ $2$0$2$2
20 | $ $2$0$2$3
21 | $ $2$0$2$4
22 | $ $2$0$2$5
23 | $ $2$0$2$0$!
24 | $ $2$0$2$1$!
25 | $ $2$0$2$2$!
26 | $ $2$0$2$3$!
27 | $ $2$0$2$4$!
28 | $ $2$0$2$5$!
29 |
30 | # Add common numbers to the end - both with spaces and without
31 | $1
32 | $1$!
33 | $1$2$3
34 | $1$2$3$!
35 | $ $1
36 | $ $1$!
37 | $ $1$2$3
38 | $ $1$2$3$!
39 |
40 | # Add common numbers to the beginning - both with spaces and without (have to do them backwards with a prepend)
41 | ^1
42 | ^3^2^1
43 | ^ ^1
44 | ^ ^3^2^1
45 |
46 | # Add common punctuation to end
47 | $!
48 | $?
49 |
50 | # G3t 133t (just the common ones) across whole phrase
51 | sa@sA@
52 | se3sE3
53 | sl1sL1
54 | so0sO0
55 | ss5sS5
56 | sa@sA@se3sE3so0sO0ss5sS5
57 | sa@sA@$!
58 | se3sE3$!
59 | sl1sL1$!
60 | so0sO0$!
61 | ss5sS5$!
62 | sa@sA@se3sE3so0sO0ss5sS5$!
63 |
64 | # Hashcat doesn't support 'nth place' positional replace in rule sets yet,
65 | # so we can't say 'replace only the first A with @'.
66 | # See: https://hashcat.net/wiki/doku.php?id=rule_based_attack#using_p_nth_instance_of_a_character_with_positional_rules
67 | # Instead, we make some guesses: sub in l33t characters at positions 1, 2, 3 and also end with !
68 | # Not a great way to do it, but oh well. Will update in the future if the feature becomes available.
69 | o1@
70 | o2@
71 | o3@
72 | o13
73 | o23
74 | o33
75 | o11
76 | o21
77 | o31
78 | o10
79 | o20
80 | o30
81 | o15
82 | o25
83 | o35
84 | o1@$!
85 | o2@$!
86 | o3@$!
87 | o13$!
88 | o23$!
89 | o33$!
90 | o11$!
91 | o21$!
92 | o31$!
93 | o10$!
94 | o20$!
95 | o30$!
96 | o15$!
97 | o25$!
98 | o35$!
99 |
--------------------------------------------------------------------------------
/utilities/updating-sources.md:
--------------------------------------------------------------------------------
1 | # Notes on updating sources
2 |
3 | Sure, this should be a CI job. But hey, it's a start.
4 |
5 | Some of the source files get regular updates. Below is a guide to obtaining those, preparing them for cleaning, actually cleaning, and then merging into the existing list.
6 |
7 | ## IMDB titles
8 |
9 | ```
10 | wget https://datasets.imdbws.com/title.basics.tsv.gz
11 | gunzip ./title.basics.tsv.gz
12 | cat title.basics.tsv | awk -F '\t' '{print $3}' > ./imdb-titles-$(date +%Y-%m-%d).txt
13 | rm title.basics.tsv
14 | ```
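
Column 3 of `title.basics.tsv` is the `primaryTitle` field, which is what ends up in the wordlist.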
15 |
16 | ## Wikipedia article titles & category names
17 |
18 | ```
19 | wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2
20 | bunzip2 ./enwiki-latest-pages-articles-multistream-index.txt.bz2
21 | cat ./enwiki-latest-pages-articles-multistream-index.txt | cut -d: -f 3 > ./wikipedia-$(date +%Y-%m-%d).txt
22 | rm enwiki-latest-pages-articles-multistream-index.txt
23 |
24 | ```
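
Each index line has the format `byte-offset:page-id:title`, so titles that themselves contain a colon will be truncated by `cut -d: -f 3`; use `-f3-` instead if you want to keep them whole.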
25 |
26 | ## Wiktionary titles
27 |
28 | ```
29 | wget https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles.gz
30 | gunzip enwiktionary-latest-all-titles.gz
31 | cat enwiktionary-latest-all-titles | awk -F '\t' '{print $2}' > ./wiktionary-$(date +%Y-%m-%d).txt
32 | rm enwiktionary-latest-all-titles
33 |
34 | ```
35 |
36 | ## Urban Dictionary
37 |
38 | ```
39 | git clone https://github.com/initstring/urban-dictionary-word-list
40 | cd urban-dictionary-word-list
41 | touch urban-dictionary-$(date +%Y-%m-%d).txt
42 | python3 ./main.py --out urban-dictionary-$(date +%Y-%m-%d).txt
43 | ```
44 |
45 | ## Know Your Meme
46 |
47 | ```
48 | python3 /utilities/kym_scrape.py
49 | mv memes.txt ./know-your-meme-$(date +%Y-%m-%d).txt
50 | ```
51 |
52 | ## Global POI dataset
53 |
54 | ```
55 | wget http://download.geonames.org/export/dump/allCountries.zip
56 | unzip ./allCountries.zip
57 | cat allCountries.txt | awk -F '\t' '{print $3}' > ./global-poi-$(date +%Y-%m-%d).txt
58 | rm allCountries.zip
59 | rm allCountries.txt
60 | ```
61 |
62 | ## Billboard charts
63 |
64 | ```
65 | git clone https://github.com/initstring/umdmusic-downloader
66 | cd umdmusic-downloader
67 | pip3 install -r ./requirements.txt
68 | python3 ./downloader.py
69 | cat ./us_billboard.psv | cut -d "|" -f 5 > ./billboard-titles-$(date +%Y-%m-%d).txt
70 | cat ./us_billboard.psv | cut -d "|" -f 6 | sed "s/ featuring /\n/g" > ./billboard-artists-$(date +%Y-%m-%d).txt
71 | rm ./us_billboard.psv
72 | ```
73 |
74 | ## Combining
75 |
76 | With all raw files in the same folder:
77 |
78 | ```
79 | cat ./*.txt | sort -u > raw.txt
80 | python3 ./cleanup.py raw.txt passphrases.txt
81 | ```
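
Note that `cleanup.py` normalises encoding and punctuation, splits lines on common delimiters, and keeps only multi-word candidates between 8 and 40 characters (the `MIN_LENGTH` / `MAX_LENGTH` constants in the script).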
82 |
83 | If you generate a new version and want to compare the differences, you can use a command like the one below (note that `uniq -u` prints lines unique to either file, so it shows removed phrases as well as new ones):
84 |
85 | ```
86 | sort new.txt old.txt | uniq -u
87 | ```
--------------------------------------------------------------------------------
/utilities/kym_scrape.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | Know Your Meme Scraper
5 | Grabs all titles from https://knowyourmeme.com
6 |
7 | Puts output into memes.txt
8 |
9 | Used to feed into password cracking wordlists like
10 | https://github.com/initstring/passphrase-wordlist
11 |
12 | Code by initstring
13 | """
14 |
15 | import html
16 | import time
17 | import re
18 | import sys
19 | import requests
20 |
21 | # The "all" URL allows autoscrolling
22 | KYM_URL = 'https://knowyourmeme.com/memes/all/page'
23 |
24 | # Regex to grab all formatted titles
25 | RE_TITLE = re.compile(r'<h2[^>]*>\s*(.*?)\s*</h2>')
26 |
27 | # Text to know when we reached end of line
28 | NO_MORE = 'There are no entries for this category'
29 |
30 | # Need real headers to get past WAF
31 | HEADERS = {'User-Agent': 'Mozilla/5.0'}
32 |
33 | # Out file
34 | OUTFILE = 'memes.txt'
35 |
36 | # File for in-process scraping
37 | LOGFILE = 'memes-incomplete.txt'
38 |
39 | # Sleep to avoid IP ban
40 | SLEEP = 3
41 |
42 | def write_log(phrases):
43 | """
44 | Logs phrases as the program runs
45 |
46 | Used for troubleshooting or to at least have _something_ in the case of
47 | IP ban, failure, etc
48 | """
49 | with open(LOGFILE, 'a') as logfile:
50 | for phrase in phrases:
51 | phrase = html.unescape(phrase)
52 | logfile.write(phrase + '\n')
53 |
54 | def write_final(phrases):
55 | """
56 |     Writes all phrases to the final output file
57 | """
58 | # Unescape the HTML and write the phrases out
59 | with open(OUTFILE, 'w') as outfile:
60 | for phrase in phrases:
61 | phrase = html.unescape(phrase)
62 | outfile.write(phrase + '\n')
63 |
64 | def scrape_pages():
65 | """
66 | Loops through all pages of kym
67 | """
68 | page = 0
69 | phrases = set([])
70 |
71 | while True:
72 | # Build the URL based on auto-scroll behaviour
73 | url = "{}/{}".format(KYM_URL, page)
74 | response = requests.get(url, headers=HEADERS)
75 |
76 | # Check for IP ban
77 | if response.status_code == 403:
78 | print("\n[!] You have been IP banned. Oops.")
79 | sys.exit()
80 |
81 | # Return if no more results
82 | if response.status_code == 404:
83 | print("\n[*] Reached end of line at page {}. Exiting"
84 | .format(page))
85 | return phrases
86 |
87 | # Clear stdout for ongoing notifications
88 | sys.stdout.flush()
89 | sys.stdout.write(" " * 20)
90 | sys.stdout.write("\r")
91 |
92 | # Grab phrases from the raw text and add to set
93 | new_phrases = re.findall(RE_TITLE, response.text)
94 | phrases.update(new_phrases)
95 |
96 |         # Write the new phrases to an ongoing logfile
97 | write_log(new_phrases)
98 |
99 | # Update the patiently waiting user
100 | sys.stdout.write("[*] Page: {}, Phrases: {}, Unique Phrases: {}"
101 | .format(page, len(new_phrases), len(phrases)))
102 |
103 | # Increment the page for the next loop
104 | page += 1
105 |
106 | # Sleep to avoid IP ban
107 | time.sleep(SLEEP)
108 |
109 |
110 | def main():
111 | """
112 | Main program function
113 | """
114 | print("[*] Scraping all pages of KYM...")
115 | phrases = scrape_pages()
116 |
117 | print("[+] Found {} phrases, writing to {}..."
118 | .format(len(phrases), OUTFILE))
119 | write_final(phrases)
120 |
121 |
122 | if __name__ == "__main__":
123 | main()
124 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | People think they are getting smarter by using passphrases. Let's prove them wrong!
4 |
5 | This project includes a massive wordlist of phrases (over 20 million) and two hashcat rule files for GPU-based cracking. The rules will create over 1,000 permutations of each phrase.
6 |
7 | To use this project, you need:
8 |
9 | - The wordlist `passphrases.txt`, which you can find under [releases](https://github.com/initstring/passphrase-wordlist/releases).
10 | - Both hashcat rules [here](/hashcat-rules/).
11 |
12 | **WORDLIST LAST UPDATED**: July 2025
13 |
14 | # Usage
15 |
16 | Generally, you will use this wordlist with hashcat's `-a 0` mode, which takes a wordlist and allows rule files. It is important to use the rule files in the correct order: rule #1 mostly handles capital letters and spaces, while rule #2 deals with permutations.
17 |
18 | Here is an example for NTLMv2 hashes. If you use the `-O` option, watch out for the maximum password length it sets - it may be too short for passphrases.
19 |
20 | ```
21 | hashcat -a 0 -m 5600 hashes.txt passphrases.txt -r passphrase-rule1.rule -r passphrase-rule2.rule -O -w 3
22 | ```
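
To preview the candidates these rules generate without cracking anything, hashcat's `--stdout` mode accepts the same wordlist and rule arguments (e.g. `hashcat --stdout passphrases.txt -r passphrase-rule1.rule -r passphrase-rule2.rule`).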
23 |
24 | # Sources Used
25 |
26 | Some sources are pulled from a static dataset, like a Kaggle upload. Others I generate myself using various scripts and APIs. I might one day automate that via CI, but for now you can see how I update the dynamic sources [here](/utilities/updating-sources.md).
27 |
28 | | **source file name** | **source type** | **description** |
29 | | --- | --- | --- |
30 | | wiktionary-$(date).txt | dynamic | Article titles scraped from Wiktionary's index dump [here.](https://dumps.wikimedia.org/enwiktionary) |
31 | | wikipedia-$(date).txt | dynamic | Article titles scraped from the Wikipedia `pages-articles-multistream-index` dump generated 29-Sept-2021 [here.](https://dumps.wikimedia.org/enwiki) |
32 | | urban-dictionary-$(date).txt | dynamic | Urban Dictionary dataset pulled using [this script](https://github.com/mattbierner/urban-dictionary-word-list). |
33 | | know-your-meme-$(date).txt | dynamic | Meme titles from KnowYourMeme scraped using my tool [here.](/utilities/kym_scrape.py) |
34 | | imdb-titles-$(date).txt | dynamic | IMDB dataset using the "primaryTitle" column from the `title.basics.tsv.gz` file available [here](https://datasets.imdbws.com/). |
35 | | global-poi-$(date).txt | dynamic | [Global POI dataset](https://download.geonames.org/export/dump/) using the 'allCountries' file from 29-Sept-2021. |
36 | | billboard-titles-$(date).txt | dynamic | Album and track names using [Ultimate Music Database](https://www.umdmusic.com/), scraped with [a fork of mwkling's tool](https://github.com/initstring/umdmusic-downloader), modified to grab Billboard Singles (1940-2021) and Billboard Albums (1970-2021) charts. |
37 | | billboard-artists-$(date).txt | dynamic | Artist names using [Ultimate Music Database](https://www.umdmusic.com/), scraped with [a fork of mwkling's tool](https://github.com/initstring/umdmusic-downloader), modified to grab Billboard Singles (1940-2021) and Billboard Albums (1970-2021) charts. |
38 | | book.txt | static | Kaggle dataset with titles from over 300,000 books. |
39 | | rstone-top-100.txt | static (could be dynamic in future) | Song lyrics for Rolling Stone's "top 100" artists using my [lyric scraping tool](https://github.com/initstring/lyricpass). |
40 | | cornell-movie-titles-raw.txt | static | Movie titles from this [Cornell project](https://www.cs.cornell.edu/~cristian//Cornell_Movie-Dialogs_Corpus.html). |
41 | | cornell-movie-lines.txt | static | Movie lines from this [Cornell project](https://www.cs.cornell.edu/~cristian//Cornell_Movie-Dialogs_Corpus.html). |
42 | | author-quotes-raw.txt | static | [Quotables](https://www.kaggle.com/alvations/quotables) dataset on Kaggle. |
43 | | 1800-phrases-raw.txt | static | [1,800 English Phrases.](https://www.phrases.org.uk/meanings/phrases-and-sayings-list.html) |
44 | | 15k-phrases-raw.txt | static | [15,000 Useful Phrases.](https://www.gutenberg.org/ebooks/18362) |
45 |
46 | # Hashcat Rules
47 |
48 | The rule files are designed both to "shape" the password and to mutate it. Shaping is based on the idea that human beings follow fairly predictable patterns when choosing a password, such as capitalising the first letter of each word and following the phrase with a number or special character. Mutations are also fairly predictable, such as replacing letters with visually similar special characters.
49 |
50 | Given the phrase `take the red pill`, the first hashcat rule will output the following:
51 |
52 | ```
53 | take the red pill
54 | take-the-red-pill
55 | take.the.red.pill
56 | take_the_red_pill
57 | taketheredpill
58 | Take the red pill
59 | TAKE THE RED PILL
60 | tAKE THE RED PILL
61 | Taketheredpill
62 | tAKETHEREDPILL
63 | TAKETHEREDPILL
64 | Take The Red Pill
65 | TakeTheRedPill
66 | Take-The-Red-Pill
67 | Take.The.Red.Pill
68 | Take_The_Red_Pill
69 | ```
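
For intuition, the shaping can be approximated with ordinary string operations. Below is a minimal Python sketch - the `shape` helper is invented for illustration and only mimics a handful of the rule1 functions:

```
# Rough equivalents of a few rule1 functions (illustrative only)
def shape(phrase):
    yield phrase                    # ':'  do nothing
    yield phrase.replace(' ', '-')  # 's -' substitute spaces with hyphens
    yield phrase.replace(' ', '')   # '@ ' purge spaces
    yield phrase.capitalize()       # 'c'  capitalize first letter, lower the rest
    yield phrase.upper()            # 'u'  uppercase everything
    yield phrase.title()            # 'E'  capitalize each space-separated word

for candidate in shape('take the red pill'):
    print(candidate)
```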
70 |
71 | Adding in the second hashcat rule makes things a bit more interesting, returning a huge list per candidate. Here are a couple of examples:
72 |
73 | ```
74 | T@k3Th3R3dPill!
75 | T@ke-The-Red-Pill
76 | taketheredpill2020!
77 | T0KE THE RED PILL
78 | ```
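
The multiplication is the point: rule1 emits 16 shaped forms per phrase and rule2 roughly 80 mutations per form, so each source phrase expands to well over 1,000 candidates - that is where the figure in the overview comes from.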
79 |
80 | # Additional Info
81 |
82 | Some researchers might also be interested in the script I use to clean the raw sources into the wordlist, [here](/utilities/cleanup.py).
83 |
84 | The cleanup script works like this:
85 |
86 | ```
87 | $ python3.6 cleanup.py infile.txt outfile.txt
88 | Reading from ./infile.txt: 505 MB
89 | Wrote to ./outfile.txt: 250 MB
90 | Elapsed time: 0:02:53.062531
91 |
92 | ```
93 |
94 | Enjoy!
95 |
--------------------------------------------------------------------------------
/utilities/cleanup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | Prepares passphrase cracking lists for use with the hashcat rules at
6 | github.com/initstring/passphrase-wordlist
7 | """
8 |
9 | import sys
10 | import re
11 | import urllib.parse
12 | import html
13 | import os
14 | import time
15 | import argparse
16 | from datetime import timedelta
17 |
18 | # Set a min/max passphrase character length. Change this if you want.
19 | MIN_LENGTH = 8
20 | MAX_LENGTH = 40
21 |
22 | # Compiled regex patterns for performance
23 | MULTIWORD_PATTERN = re.compile('[a-z0-9\'&] [a-z0-9\'&]')
24 | ALLOWED_CHARS_PATTERN = re.compile("[^a-zA-Z0-9 '&]")
25 | MULTIPLE_SPACES_PATTERN = re.compile(r'\s\s+')
26 | QUOTE_REMOVAL_PATTERN = re.compile(r" '([^']*)' ")
27 | WHITESPACE_PATTERN = re.compile(r'\s+')
28 | HYPHEN_UNDERSCORE_PATTERN = re.compile(r'[-_]')
29 | APOSTROPHE_REMOVAL_PATTERN = re.compile("'")
30 | AND_TO_AMPERSAND_PATTERN = re.compile(' and ')
31 | AMPERSAND_TO_AND_PATTERN = re.compile('&')
32 |
33 | # Accented character patterns
34 | ACCENTED_A_PATTERN = re.compile('[àáâãäå]')
35 | ACCENTED_E_PATTERN = re.compile('[èéêë]')
36 | ACCENTED_I_PATTERN = re.compile('[ìíîï]')
37 | ACCENTED_O_PATTERN = re.compile('[òóôõö]')
38 | ACCENTED_U_PATTERN = re.compile('[ùúûü]')
39 | ACCENTED_N_PATTERN = re.compile('[ñ]')
40 | ACCENTED_C_PATTERN = re.compile('[ç]')
41 | ACCENTED_Y_PATTERN = re.compile('[ÿ]')
42 |
43 | # Split pattern
44 | SPLIT_PATTERN = re.compile(r';|,|\.')
45 |
46 | def parse_arguments():
47 | """
48 | Handles user-passed parameters
49 | """
50 | desc = 'Transforms text files in passphrase lists.'
51 | parser = argparse.ArgumentParser(description=desc)
52 |
53 | parser.add_argument('infile', type=str, action='store',
54 | help='Input file.')
55 | parser.add_argument('outfile', type=str, action='store',
56 | help='Output file.')
57 |
58 | args = parser.parse_args()
59 |
60 | if not os.access(args.infile, os.R_OK):
61 | print("[!] Cannot access input file, exiting")
62 | sys.exit()
63 |
64 | return args
65 |
66 | def build_buffer(infile):
67 | """
68 | Reads infile and builds a list of candidates for additional processing
69 | """
70 | buffer = []
71 |
72 | infile_size = str((int(os.path.getsize(infile)/1000000))) + " MB"
73 | print("Reading from {}: {}".format(infile, infile_size))
74 |
75 | with open(infile, encoding='utf-8', errors='ignore') as file_handler:
76 | for line in file_handler:
77 | candidates = []
78 | # Remove HTML and URL encoding first
79 | line = escape_encoding(line)
80 |
81 | # Split lines with common delimiters like . , or ;
82 | for split_line in SPLIT_PATTERN.split(line):
83 | candidates.append(split_line.strip())
84 |
85 |             # Append each candidate from this line to the buffer
86 | for string in candidates:
87 | buffer.append(string)
88 |
89 | return buffer
90 |
91 | def handle_punctuation(line):
92 | """
93 |     Deals with common punctuation
94 | """
95 | clean_lines = []
96 |
97 | # Gets rid of any remaining special characters in the name
98 | line = ALLOWED_CHARS_PATTERN.sub('', line)
99 |
100 | # Shrinks down multiple spaces
101 | line = MULTIPLE_SPACES_PATTERN.sub(' ', line)
102 |
103 | # Strip quotes around line
104 | line = line.strip('\'"')
105 |
106 | # Remove quotes around internal segments
107 | line = QUOTE_REMOVAL_PATTERN.sub(r' \1 ', line)
108 |
109 |     # If the line has an apostrophe, add a duplicate with it removed
110 | if "'" in line:
111 | clean_lines.append(APOSTROPHE_REMOVAL_PATTERN.sub("", line))
112 |
113 |     # Make duplicate phrases, swapping ' and ' and '&' for each other
114 | if ' and ' in line:
115 | clean_lines.append(AND_TO_AMPERSAND_PATTERN.sub(' & ', line))
116 | if '&' in line:
117 | newline = AMPERSAND_TO_AND_PATTERN.sub(' and ', line)
118 | newline = WHITESPACE_PATTERN.sub(' ', newline).strip()
119 | clean_lines.append(newline)
120 |
121 | # Add what is left to the list and return it
122 | clean_lines.append(line)
123 | return clean_lines
124 |
125 | def escape_encoding(line):
126 | """
127 | Deals with common encoding and accented characters
128 | """
129 | line = urllib.parse.unquote(line) # convert URL encoding like %27
130 | line = html.unescape(line) # convert HTML encoding like '
131 | line = WHITESPACE_PATTERN.sub(' ', line).strip() # Remove extra whitespace
132 | line = line.lower() # convert to lowercase
133 | line = HYPHEN_UNDERSCORE_PATTERN.sub(' ', line) # Change - and _ to spaces
134 |
135 | # The following lines attempt to remove accented characters, as the
136 |     # tool is focused on English-language passwords.
137 | line = ACCENTED_A_PATTERN.sub('a', line)
138 | line = ACCENTED_E_PATTERN.sub('e', line)
139 | line = ACCENTED_I_PATTERN.sub('i', line)
140 | line = ACCENTED_O_PATTERN.sub('o', line)
141 | line = ACCENTED_U_PATTERN.sub('u', line)
142 | line = ACCENTED_N_PATTERN.sub('n', line)
143 | line = ACCENTED_C_PATTERN.sub('c', line)
144 | line = ACCENTED_Y_PATTERN.sub('y', line)
145 |
146 | return line
147 |
148 | def choose_candidates(line):
149 | """
150 |     Final check to determine which cleaned phrases to keep
151 | """
152 | # Throw out single-word candidates
153 | if not MULTIWORD_PATTERN.search(line):
154 | return False
155 |
156 |     # Throw out too short / too long lines
157 | if len(line) < MIN_LENGTH or len(line) > MAX_LENGTH:
158 | return False
159 |
160 | return True
161 |
162 | def write_file(buffer, outfile):
163 | """
164 |     Writes chosen candidates to an output file
165 | """
166 | with open(outfile, 'w') as file_handler:
167 | for line in sorted(buffer):
168 | file_handler.write(line.strip() + '\n')
169 |
170 | outfile_size = str((int(os.path.getsize(outfile)/1000000)))
171 | print("Wrote to {}: {} MB".format(outfile, outfile_size))
172 |
173 |
174 | def main():
175 | """
176 | Main program function
177 | """
178 | start = time.time()
179 | args = parse_arguments()
180 | buffer = build_buffer(args.infile)
181 | final = set([])
182 | # Processes phrases and adds to a set (deduped)
183 | for phrase in buffer:
184 | new_phrases = handle_punctuation(phrase)
185 | for newphrase in new_phrases:
186 | if choose_candidates(newphrase):
187 | final.add(newphrase)
188 | # Writes final set out to file
189 | write_file(final, args.outfile)
190 | elapsed = time.time() - start
191 | print("Elapsed time: " + str(timedelta(seconds=elapsed)))
192 |
193 |
194 | if __name__ == "__main__":
195 | main()
196 |
--------------------------------------------------------------------------------