├── hashcat-rules
│   ├── passphrase-rule1.rule
│   └── passphrase-rule2.rule
├── LICENSE
├── utilities
│   ├── updating-sources.md
│   ├── kym_scrape.py
│   └── cleanup.py
└── README.md
/hashcat-rules/passphrase-rule1.rule:
--------------------------------------------------------------------------------
1 | :
2 | s -
3 | s .
4 | s _
5 | @
6 | c
7 | u
8 | C
9 | @ c
10 | @ C
11 | @ u
12 | E
13 | E@
14 | s - e-
15 | s . e.
16 | s _ e_
17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 InitString
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/hashcat-rules/passphrase-rule2.rule:
--------------------------------------------------------------------------------
1 | # Do Nothing
2 | :
3 |
4 | # Add years to end - both with spaces and without
5 | $2$0$2$0
6 | $2$0$2$1
7 | $2$0$2$2
8 | $2$0$2$3
9 | $2$0$2$4
10 | $2$0$2$5
11 | $2$0$2$0$!
12 | $2$0$2$1$!
13 | $2$0$2$2$!
14 | $2$0$2$3$!
15 | $2$0$2$4$!
16 | $2$0$2$5$!
17 | $ $2$0$2$0
18 | $ $2$0$2$1
19 | $ $2$0$2$2
20 | $ $2$0$2$3
21 | $ $2$0$2$4
22 | $ $2$0$2$5
23 | $ $2$0$2$0$!
24 | $ $2$0$2$1$!
25 | $ $2$0$2$2$!
26 | $ $2$0$2$3$!
27 | $ $2$0$2$4$!
28 | $ $2$0$2$5$!
29 |
30 | # Add common numbers to the end - both with spaces and without
31 | $1
32 | $1$!
33 | $1$2$3
34 | $1$2$3$!
35 | $ $1
36 | $ $1$!
37 | $ $1$2$3
38 | $ $1$2$3$!
39 |
40 | # Add common numbers to the beginning - both with spaces and without (have to do them backwards with a prepend)
41 | ^1
42 | ^3^2^1
43 | ^ ^1
44 | ^ ^3^2^1
45 |
46 | # Add common punctuation to end
47 | $!
48 | $?
49 |
50 | # G3t 133t (just the common ones) across whole phrase
51 | sa@sA@
52 | se3sE3
53 | sl1sL1
54 | so0sO0
55 | ss5sS5
56 | sa@sA@se3sE3so0sO0ss5sS5
57 | sa@sA@$!
58 | se3sE3$!
59 | sl1sL1$!
60 | so0sO0$!
61 | ss5sS5$!
62 | sa@sA@se3sE3so0sO0ss5sS5$!
63 |
64 | # Hashcat doesn't support 'nth place' positional replace in rule sets yet,
65 | # so we can't say 'replace only the first A with @'.
66 | # See: https://hashcat.net/wiki/doku.php?id=rule_based_attack#using_p_nth_instance_of_a_character_with_positional_rules
67 | # Instead, we make some guesses: sub in l33t characters at positions 1, 2, 3 and also end with !
68 | # Not a great way to do it, but oh well. Will update in the future if the feature becomes available.
69 | o1@
70 | o2@
71 | o3@
72 | o13
73 | o23
74 | o33
75 | o11
76 | o21
77 | o31
78 | o10
79 | o20
80 | o30
81 | o15
82 | o25
83 | o35
84 | o1@$!
85 | o2@$!
86 | o3@$!
87 | o13$!
88 | o23$!
89 | o33$!
90 | o11$!
91 | o21$!
92 | o31$!
93 | o10$!
94 | o20$!
95 | o30$!
96 | o15$!
97 | o25$!
98 | o35$!
99 |
--------------------------------------------------------------------------------
/utilities/updating-sources.md:
--------------------------------------------------------------------------------
1 | # Notes on updating sources
2 |
3 | Sure, this should be a CI job. But hey, it's a start.
4 |
5 | Some of the source files get regular updates. Below is a guide to obtaining those, preparing them for cleaning, actually cleaning, and then merging into the existing list.
6 |
7 | ## IMDB titles
8 |
9 | ```
10 | wget https://datasets.imdbws.com/title.basics.tsv.gz
11 | gunzip ./title.basics.tsv.gz
12 | cat title.basics.tsv | awk -F '\t' '{print $3}' > ./imdb-titles-$(date +%Y-%m-%d).txt
13 | rm title.basics.tsv
14 | ```
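
Column 3 of `title.basics.tsv` is the `primaryTitle` field, which is what ends up in the wordlist.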
15 |
16 | ## Wikipedia article titles & category names
17 |
18 | ```
19 | wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2
20 | bunzip2 ./enwiki-latest-pages-articles-multistream-index.txt.bz2
21 | cat ./enwiki-latest-pages-articles-multistream-index.txt | cut -d: -f 3 > ./wikipedia-$(date +%Y-%m-%d).txt
22 | rm enwiki-latest-pages-articles-multistream-index.txt
23 |
24 | ```
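
Each index line has the format `byte-offset:page-id:title`, so titles that themselves contain a colon will be truncated by `cut -d: -f 3`; use `-f3-` instead if you want to keep them whole.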
25 |
26 | ## Wiktionary titles
27 |
28 | ```
29 | wget https://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-all-titles.gz
30 | gunzip enwiktionary-latest-all-titles.gz
31 | cat enwiktionary-latest-all-titles | awk -F '\t' '{print $2}' > ./wiktionary-$(date +%Y-%m-%d).txt
32 | rm enwiktionary-latest-all-titles
33 |
34 | ```
35 |
36 | ## Urban Dictionary
37 |
38 | ```
39 | git clone https://github.com/initstring/urban-dictionary-word-list
40 | cd urban-dictionary-word-list
41 | touch urban-dictionary-$(date +%Y-%m-%d).txt
42 | python3 ./main.py --out urban-dictionary-$(date +%Y-%m-%d).txt
43 | ```
44 |
45 | ## Know Your Meme
46 |
47 | ```
48 | python3 /utilities/kym_scrape.py
49 | mv memes.txt ./know-your-meme-$(date +%Y-%m-%d).txt
50 | ```
51 |
52 | ## Global POI dataset
53 |
54 | ```
55 | wget http://download.geonames.org/export/dump/allCountries.zip
56 | unzip ./allCountries.zip
57 | cat allCountries.txt | awk -F '\t' '{print $3}' > ./global-poi-$(date +%Y-%m-%d).txt
58 | rm allCountries.zip
59 | rm allCountries.txt
60 | ```
61 |
62 | ## Billboard charts
63 |
64 | ```
65 | git clone https://github.com/initstring/umdmusic-downloader
66 | cd umdmusic-downloader
67 | pip3 install -r ./requirements.txt
68 | python3 ./downloader.py
69 | cat ./us_billboard.psv | cut -d "|" -f 5 > ./billboard-titles-$(date +%Y-%m-%d).txt
70 | cat ./us_billboard.psv | cut -d "|" -f 6 | sed "s/ featuring /\n/g" > ./billboard-artists-$(date +%Y-%m-%d).txt
71 | rm ./us_billboard.psv
72 | ```
73 |
74 | ## Combining
75 |
76 | With all raw files in the same folder:
77 |
78 | ```
79 | cat ./*.txt | sort -u > raw.txt
80 | python3 ./cleanup.py raw.txt passphrases.txt
81 | ```
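
Note that `cleanup.py` normalises encoding and punctuation, splits lines on common delimiters, and keeps only multi-word candidates between 8 and 40 characters (the `MIN_LENGTH` / `MAX_LENGTH` constants in the script).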
82 |
83 | If you generate a new version and want to compare the differences, you can use a command like the one below (note that `uniq -u` prints lines unique to either file, so it shows removed phrases as well as new ones):
84 |
85 | ```
86 | sort new.txt old.txt | uniq -u
87 | ```
--------------------------------------------------------------------------------
/utilities/kym_scrape.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | Know Your Meme Scraper
5 | Grabs all titles from https://knowyourmeme.com
6 |
7 | Puts output into memes.txt
8 |
9 | Used to feed into password cracking wordlists like
10 | https://github.com/initstring/passphrase-wordlist
11 |
12 | Code by initstring
13 | """
14 |
15 | import html
16 | import time
17 | import re
18 | import sys
19 | import requests
20 |
21 | # The "all" URL allows autoscrolling
22 | KYM_URL = 'https://knowyourmeme.com/memes/all/page'
23 |
24 | # Regex to grab all formatted titles
25 | RE_TITLE = re.compile(r'<h2[^>]*>\s*(.*?)\s*</h2>')
26 |
27 | # Text to know when we reached end of line
28 | NO_MORE = 'There are no entries for this category'
29 |
30 | # Need real headers to get past WAF
31 | HEADERS = {'User-Agent': 'Mozilla/5.0'}
32 |
33 | # Out file
34 | OUTFILE = 'memes.txt'
35 |
36 | # File for in-process scraping
37 | LOGFILE = 'memes-incomplete.txt'
38 |
39 | # Sleep to avoid IP ban
40 | SLEEP = 3
41 |
42 | def write_log(phrases):
43 | """
44 | Logs phrases as the program runs
45 |
46 | Used for troubleshooting or to at least have _something_ in the case of
47 | IP ban, failure, etc
48 | """
49 | with open(LOGFILE, 'a') as logfile:
50 | for phrase in phrases:
51 | phrase = html.unescape(phrase)
52 | logfile.write(phrase + '\n')
53 |
54 | def write_final(phrases):
55 | """
56 |     Writes all phrases to the final output file
57 | """
58 | # Unescape the HTML and write the phrases out
59 | with open(OUTFILE, 'w') as outfile:
60 | for phrase in phrases:
61 | phrase = html.unescape(phrase)
62 | outfile.write(phrase + '\n')
63 |
64 | def scrape_pages():
65 | """
66 | Loops through all pages of kym
67 | """
68 | page = 0
69 | phrases = set([])
70 |
71 | while True:
72 | # Build the URL based on auto-scroll behaviour
73 | url = "{}/{}".format(KYM_URL, page)
74 | response = requests.get(url, headers=HEADERS)
75 |
76 | # Check for IP ban
77 | if response.status_code == 403:
78 | print("\n[!] You have been IP banned. Oops.")
79 | sys.exit()
80 |
81 | # Return if no more results
82 | if response.status_code == 404:
83 | print("\n[*] Reached end of line at page {}. Exiting"
84 | .format(page))
85 | return phrases
86 |
87 | # Clear stdout for ongoing notifications
88 | sys.stdout.flush()
89 | sys.stdout.write(" " * 20)
90 | sys.stdout.write("\r")
91 |
92 | # Grab phrases from the raw text and add to set
93 | new_phrases = re.findall(RE_TITLE, response.text)
94 | phrases.update(new_phrases)
95 |
96 |         # Write the new phrases to an ongoing logfile
97 | write_log(new_phrases)
98 |
99 | # Update the patiently waiting user
100 | sys.stdout.write("[*] Page: {}, Phrases: {}, Unique Phrases: {}"
101 | .format(page, len(new_phrases), len(phrases)))
102 |
103 | # Increment the page for the next loop
104 | page += 1
105 |
106 | # Sleep to avoid IP ban
107 | time.sleep(SLEEP)
108 |
109 |
110 | def main():
111 | """
112 | Main program function
113 | """
114 | print("[*] Scraping all pages of KYM...")
115 | phrases = scrape_pages()
116 |
117 | print("[+] Found {} phrases, writing to {}..."
118 | .format(len(phrases), OUTFILE))
119 | write_final(phrases)
120 |
121 |
122 | if __name__ == "__main__":
123 | main()
124 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | People think they are getting smarter by using passphrases. Let's prove them wrong!
4 |
5 | This project includes a massive wordlist of phrases (over 20 million) and two hashcat rule files for GPU-based cracking. The rules will create over 1,000 permutations of each phrase.
6 |
7 | To use this project, you need:
8 |
9 | - The wordlist `passphrases.txt`, which you can find under [releases](https://github.com/initstring/passphrase-wordlist/releases).
10 | - Both hashcat rules [here](/hashcat-rules/).
11 |
12 | **WORDLIST LAST UPDATED**: July 2025
13 |
14 | # Usage
15 |
16 | Generally, you will use this wordlist with hashcat's `-a 0` mode, which takes a wordlist and allows rule files. It is important to use the rule files in the correct order: rule #1 mostly handles capital letters and spaces, while rule #2 deals with permutations.
17 |
18 | Here is an example for NTLMv2 hashes. If you use the `-O` option, watch out for the maximum password length it sets - it may be too short for passphrases.
19 |
20 | ```
21 | hashcat -a 0 -m 5600 hashes.txt passphrases.txt -r passphrase-rule1.rule -r passphrase-rule2.rule -O -w 3
22 | ```
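
To preview the candidates these rules generate without cracking anything, hashcat's `--stdout` mode accepts the same wordlist and rule arguments (e.g. `hashcat --stdout passphrases.txt -r passphrase-rule1.rule -r passphrase-rule2.rule`).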
23 |
24 | # Sources Used
25 |
26 | Some sources are pulled from a static dataset, like a Kaggle upload. Others I generate myself using various scripts and APIs. I might one day automate that via CI, but for now you can see how I update the dynamic sources [here](/utilities/updating-sources.md).
27 |
28 | | **source file name** | **source type** | **description** |
29 | | --- | --- | --- |
30 | | wiktionary-$(date).txt | dynamic | Article titles scraped from Wiktionary's index dump [here.](https://dumps.wikimedia.org/enwiktionary) |
31 | | wikipedia-$(date).txt | dynamic | Article titles scraped from the Wikipedia `pages-articles-multistream-index` dump generated 29-Sept-2021 [here.](https://dumps.wikimedia.org/enwiki) |
32 | | urban-dictionary-$(date).txt | dynamic | Urban Dictionary dataset pulled using [this script](https://github.com/mattbierner/urban-dictionary-word-list). |
33 | | know-your-meme-$(date).txt | dynamic | Meme titles from KnowYourMeme scraped using my tool [here.](/utilities/kym_scrape.py) |
34 | | imdb-titles-$(date).txt | dynamic | IMDB dataset using the "primaryTitle" column from the `title.basics.tsv.gz` file available [here](https://datasets.imdbws.com/). |
35 | | global-poi-$(date).txt | dynamic | [Global POI dataset](https://download.geonames.org/export/dump/) using the 'allCountries' file from 29-Sept-2021. |
36 | | billboard-titles-$(date).txt | dynamic | Album and track names using [Ultimate Music Database](https://www.umdmusic.com/), scraped with [a fork of mwkling's tool](https://github.com/initstring/umdmusic-downloader), modified to grab Billboard Singles (1940-2021) and Billboard Albums (1970-2021) charts. |
37 | | billboard-artists-$(date).txt | dynamic | Artist names using [Ultimate Music Database](https://www.umdmusic.com/), scraped with [a fork of mwkling's tool](https://github.com/initstring/umdmusic-downloader), modified to grab Billboard Singles (1940-2021) and Billboard Albums (1970-2021) charts. |
38 | | book.txt | static | Kaggle dataset with titles from over 300,000 books. |
39 | | rstone-top-100.txt | static (could be dynamic in future) | Song lyrics for Rolling Stone's "top 100" artists using my [lyric scraping tool](https://github.com/initstring/lyricpass). |
40 | | cornell-movie-titles-raw.txt | static | Movie titles from this [Cornell project](https://www.cs.cornell.edu/~cristian//Cornell_Movie-Dialogs_Corpus.html). |
41 | | cornell-movie-lines.txt | static | Movie lines from this [Cornell project](https://www.cs.cornell.edu/~cristian//Cornell_Movie-Dialogs_Corpus.html). |
42 | | author-quotes-raw.txt | static | [Quotables](https://www.kaggle.com/alvations/quotables) dataset on Kaggle. |
43 | | 1800-phrases-raw.txt | static | [1,800 English Phrases.](https://www.phrases.org.uk/meanings/phrases-and-sayings-list.html) |
44 | | 15k-phrases-raw.txt | static | [15,000 Useful Phrases.](https://www.gutenberg.org/ebooks/18362) |
45 |
46 | # Hashcat Rules
47 |
48 | The rule files are designed both to "shape" the password and to mutate it. Shaping is based on the idea that human beings follow fairly predictable patterns when choosing a password, such as capitalising the first letter of each word and following the phrase with a number or special character. Mutations are also fairly predictable, such as replacing letters with visually similar special characters.
49 |
50 | Given the phrase `take the red pill`, the first hashcat rule will output the following:
51 |
52 | ```
53 | take the red pill
54 | take-the-red-pill
55 | take.the.red.pill
56 | take_the_red_pill
57 | taketheredpill
58 | Take the red pill
59 | TAKE THE RED PILL
60 | tAKE THE RED PILL
61 | Taketheredpill
62 | tAKETHEREDPILL
63 | TAKETHEREDPILL
64 | Take The Red Pill
65 | TakeTheRedPill
66 | Take-The-Red-Pill
67 | Take.The.Red.Pill
68 | Take_The_Red_Pill
69 | ```
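
For intuition, the shaping can be approximated with ordinary string operations. Below is a minimal Python sketch - the `shape` helper is invented for illustration and only mimics a handful of the rule1 functions:

```
# Rough equivalents of a few rule1 functions (illustrative only)
def shape(phrase):
    yield phrase                    # ':'  do nothing
    yield phrase.replace(' ', '-')  # 's -' substitute spaces with hyphens
    yield phrase.replace(' ', '')   # '@ ' purge spaces
    yield phrase.capitalize()       # 'c'  capitalize first letter, lower the rest
    yield phrase.upper()            # 'u'  uppercase everything
    yield phrase.title()            # 'E'  capitalize each space-separated word

for candidate in shape('take the red pill'):
    print(candidate)
```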
70 |
71 | Adding in the second hashcat rule makes things a bit more interesting, returning a huge list per candidate. Here are a couple of examples:
72 |
73 | ```
74 | T@k3Th3R3dPill!
75 | T@ke-The-Red-Pill
76 | taketheredpill2020!
77 | T0KE THE RED PILL
78 | ```
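
The multiplication is the point: rule1 emits 16 shaped forms per phrase and rule2 roughly 80 mutations per form, so each source phrase expands to well over 1,000 candidates - that is where the figure in the overview comes from.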
79 |
80 | # Additional Info
81 |
82 | Some researchers might also be interested in the script I use to clean the raw sources into the wordlist, [here](/utilities/cleanup.py).
83 |
84 | The cleanup script works like this:
85 |
86 | ```
87 | $ python3.6 cleanup.py infile.txt outfile.txt
88 | Reading from ./infile.txt: 505 MB
89 | Wrote to ./outfile.txt: 250 MB
90 | Elapsed time: 0:02:53.062531
91 |
92 | ```
93 |
94 | Enjoy!
95 |
--------------------------------------------------------------------------------
/utilities/cleanup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | Prepares passphrase cracking lists for use with the hashcat rules at
6 | github.com/initstring/passphrase-wordlist
7 | """
8 |
9 | import sys
10 | import re
11 | import urllib.parse
12 | import html
13 | import os
14 | import time
15 | import argparse
16 | from datetime import timedelta
17 |
18 | # Set a min/max passphrase character length. Change this if you want.
19 | MIN_LENGTH = 8
20 | MAX_LENGTH = 40
21 |
22 | # Compiled regex patterns for performance
23 | MULTIWORD_PATTERN = re.compile('[a-z0-9\'&] [a-z0-9\'&]')
24 | ALLOWED_CHARS_PATTERN = re.compile("[^a-zA-Z0-9 '&]")
25 | MULTIPLE_SPACES_PATTERN = re.compile(r'\s\s+')
26 | QUOTE_REMOVAL_PATTERN = re.compile(r" '([^']*)' ")
27 | WHITESPACE_PATTERN = re.compile(r'\s+')
28 | HYPHEN_UNDERSCORE_PATTERN = re.compile(r'[-_]')
29 | APOSTROPHE_REMOVAL_PATTERN = re.compile("'")
30 | AND_TO_AMPERSAND_PATTERN = re.compile(' and ')
31 | AMPERSAND_TO_AND_PATTERN = re.compile('&')
32 |
33 | # Accented character patterns
34 | ACCENTED_A_PATTERN = re.compile('[àáâãäå]')
35 | ACCENTED_E_PATTERN = re.compile('[èéêë]')
36 | ACCENTED_I_PATTERN = re.compile('[ìíîï]')
37 | ACCENTED_O_PATTERN = re.compile('[òóôõö]')
38 | ACCENTED_U_PATTERN = re.compile('[ùúûü]')
39 | ACCENTED_N_PATTERN = re.compile('[ñ]')
40 | ACCENTED_C_PATTERN = re.compile('[ç]')
41 | ACCENTED_Y_PATTERN = re.compile('[ÿ]')
42 |
43 | # Split pattern
44 | SPLIT_PATTERN = re.compile(r';|,|\.')
45 |
46 | def parse_arguments():
47 | """
48 | Handles user-passed parameters
49 | """
50 | desc = 'Transforms text files in passphrase lists.'
51 | parser = argparse.ArgumentParser(description=desc)
52 |
53 | parser.add_argument('infile', type=str, action='store',
54 | help='Input file.')
55 | parser.add_argument('outfile', type=str, action='store',
56 | help='Output file.')
57 |
58 | args = parser.parse_args()
59 |
60 | if not os.access(args.infile, os.R_OK):
61 | print("[!] Cannot access input file, exiting")
62 | sys.exit()
63 |
64 | return args
65 |
66 | def build_buffer(infile):
67 | """
68 | Reads infile and builds a list of candidates for additional processing
69 | """
70 | buffer = []
71 |
72 | infile_size = str((int(os.path.getsize(infile)/1000000))) + " MB"
73 | print("Reading from {}: {}".format(infile, infile_size))
74 |
75 | with open(infile, encoding='utf-8', errors='ignore') as file_handler:
76 | for line in file_handler:
77 | candidates = []
78 | # Remove HTML and URL encoding first
79 | line = escape_encoding(line)
80 |
81 | # Split lines with common delimiters like . , or ;
82 | for split_line in SPLIT_PATTERN.split(line):
83 | candidates.append(split_line.strip())
84 |
85 |             # Append each candidate from this line to the buffer
86 | for string in candidates:
87 | buffer.append(string)
88 |
89 | return buffer
90 |
91 | def handle_punctuation(line):
92 | """
93 |     Deals with common punctuation
94 | """
95 | clean_lines = []
96 |
97 | # Gets rid of any remaining special characters in the name
98 | line = ALLOWED_CHARS_PATTERN.sub('', line)
99 |
100 | # Shrinks down multiple spaces
101 | line = MULTIPLE_SPACES_PATTERN.sub(' ', line)
102 |
103 | # Strip quotes around line
104 | line = line.strip('\'"')
105 |
106 | # Remove quotes around internal segments
107 | line = QUOTE_REMOVAL_PATTERN.sub(r' \1 ', line)
108 |
109 |     # If the line has an apostrophe, add a duplicate with it removed
110 | if "'" in line:
111 | clean_lines.append(APOSTROPHE_REMOVAL_PATTERN.sub("", line))
112 |
113 |     # Make duplicate phrases, swapping ' and ' and '&' for each other
114 | if ' and ' in line:
115 | clean_lines.append(AND_TO_AMPERSAND_PATTERN.sub(' & ', line))
116 | if '&' in line:
117 | newline = AMPERSAND_TO_AND_PATTERN.sub(' and ', line)
118 | newline = WHITESPACE_PATTERN.sub(' ', newline).strip()
119 | clean_lines.append(newline)
120 |
121 | # Add what is left to the list and return it
122 | clean_lines.append(line)
123 | return clean_lines
124 |
125 | def escape_encoding(line):
126 | """
127 | Deals with common encoding and accented characters
128 | """
129 | line = urllib.parse.unquote(line) # convert URL encoding like %27
130 | line = html.unescape(line) # convert HTML encoding like '
131 | line = WHITESPACE_PATTERN.sub(' ', line).strip() # Remove extra whitespace
132 | line = line.lower() # convert to lowercase
133 | line = HYPHEN_UNDERSCORE_PATTERN.sub(' ', line) # Change - and _ to spaces
134 |
135 | # The following lines attempt to remove accented characters, as the
136 |     # tool is focused on English-language passwords.
137 | line = ACCENTED_A_PATTERN.sub('a', line)
138 | line = ACCENTED_E_PATTERN.sub('e', line)
139 | line = ACCENTED_I_PATTERN.sub('i', line)
140 | line = ACCENTED_O_PATTERN.sub('o', line)
141 | line = ACCENTED_U_PATTERN.sub('u', line)
142 | line = ACCENTED_N_PATTERN.sub('n', line)
143 | line = ACCENTED_C_PATTERN.sub('c', line)
144 | line = ACCENTED_Y_PATTERN.sub('y', line)
145 |
146 | return line
147 |
148 | def choose_candidates(line):
149 | """
150 |     Final check to determine which cleaned phrases to keep
151 | """
152 | # Throw out single-word candidates
153 | if not MULTIWORD_PATTERN.search(line):
154 | return False
155 |
156 |     # Throw out too short / too long lines
157 | if len(line) < MIN_LENGTH or len(line) > MAX_LENGTH:
158 | return False
159 |
160 | return True
161 |
162 | def write_file(buffer, outfile):
163 | """
164 |     Writes chosen candidates to an output file
165 | """
166 | with open(outfile, 'w') as file_handler:
167 | for line in sorted(buffer):
168 | file_handler.write(line.strip() + '\n')
169 |
170 | outfile_size = str((int(os.path.getsize(outfile)/1000000)))
171 | print("Wrote to {}: {} MB".format(outfile, outfile_size))
172 |
173 |
174 | def main():
175 | """
176 | Main program function
177 | """
178 | start = time.time()
179 | args = parse_arguments()
180 | buffer = build_buffer(args.infile)
181 | final = set([])
182 | # Processes phrases and adds to a set (deduped)
183 | for phrase in buffer:
184 | new_phrases = handle_punctuation(phrase)
185 | for newphrase in new_phrases:
186 | if choose_candidates(newphrase):
187 | final.add(newphrase)
188 | # Writes final set out to file
189 | write_file(final, args.outfile)
190 | elapsed = time.time() - start
191 | print("Elapsed time: " + str(timedelta(seconds=elapsed)))
192 |
193 |
194 | if __name__ == "__main__":
195 | main()
196 |
--------------------------------------------------------------------------------