            ') :
                skip = False
            # NOTE: str.find('') always returns 0 (falsy), so this branch never runs
            if skip == True and word.find(''):
                skip = True
            if len(word) >= min and len(word) <= max and skip == False:
                # strip banned characters/substrings from the word
                for ban in banned:
                    word = word.replace(ban, '')
                if word == '' or word == ' ' or len(word) < min:
                    continue
                self.rv.append(word.lower())
        return list(set(self.rv))

    def getUrls(self, data):
        global bad_urls
        self.test = data
        self.rv = []
        for lineA in StringIO.StringIO(self.test):
            match = re.findall(r'.+', lineA)
            if match:
                for i in match:
                    try:
                        # grab the first /wiki/... href on the line; the trailing quote is stripped below
                        reg = re.compile('/wiki/.*?"')
                        self.urlvalue = reg.search(i).group(0)
                        self.urlvalue = str(URLVALUE) + str(self.urlvalue).strip('"')
                        if self.urlvalue.endswith('.jpg') or self.urlvalue.endswith('.svg') or self.urlvalue.endswith('.png') or self.urlvalue.endswith('.gif'):
                            pass
                        elif '/wiki/Wikipedia:' in self.urlvalue or '/wiki/Portal:' in self.urlvalue or '/wiki/Special:' in self.urlvalue or '%' in self.urlvalue or '/wiki/Template' in self.urlvalue:
                            pass
                        else:
                            self.rv.append(self.urlvalue)
                    except Exception:
                        # no /wiki/ link found on this line
                        pass
        return list(set(self.rv))

def writeWords():
    global outputfile, words, wordqueue
    # drain the word queue and append everything to the output file
    f = open(outputfile, 'a')
    while not wordqueue.empty():
        data = wordqueue.get()
        for line in data:
            try:
                line_encoded = line.encode('ISO-8859-1')
                #line_encoded = line.encode('UTF-8')  # might want to uncomment
            except UnicodeError:
                continue
            f.write(line_encoded.lower() + '\n')
            words += 1
    f.close()

##################
def handler(signum, frame):  # http://stackoverflow.com/questions/1112343/how-do-i-capture-sigint-in-python
    global words, outputfile
    if not wordqueue.empty():
        print '\nHold on cowboy, let me finish the running threads and dump the words into %s' % outputfile
        writeWords()
    print 'Done. Wrote %i words into %s' % (words, outputfile)
    quit()


signal.signal(signal.SIGINT, handler)
###################


filename = os.path.split(inspect.getfile(inspect.currentframe()))
parser = optparse.OptionParser('Usage: ' + filename[1] + ' ' +
                               '\nWikipedia Wordlist Generator by @_tmp0\nURL must be formatted as follows (most subdomains should work): '
                               'http://en.wikipedia.org/wiki/wikipage\n\nExample: python %s -u http://en.wikipedia.org/wiki/Europe -o wordlist.txt -t 5\nIf no minimum or maximum length is set the script will save words between 6 and 30 characters'
                               '\n\nctrl+c to break\n\nI suggest doing something like this to clean the wordlist from duplicates:'
                               ' sort -u wordlist.txt >> n_wordlist.txt' % filename[1])
parser.add_option('-u', dest='starturl', type='string', help='Wikipedia URL to use as start for the crawler')
parser.add_option('-t', dest='nrthreads', type='int', help='Amount of threads')
parser.add_option('-o', dest='outputfile', type='string', help='File to write output to')
parser.add_option('-m', dest='min', type='int', help='Minimum length of words')
parser.add_option('-M', dest='max', type='int', help='Maximum length of words')
(options, args) = parser.parse_args()
nrthreads = options.nrthreads
starturl = options.starturl
outputfile = options.outputfile
min = options.min
max = options.max


if starturl is None or outputfile is None or nrthreads is None:
    parser.print_help()
    quit(0)

if min is None:
    print '[!] No minimum length supplied. Setting minimum length to 6'
    min = 6
if max is None:
    print '[!] No maximum length supplied. Setting maximum length to 30'
    max = 30


words = 0
URLVALUE = starturl.split('/wiki')[0]
# prefix every bad URL fragment with the base URL of the chosen wiki
bad_urls = [str(URLVALUE) + str(bad_url) for bad_url in bad_urls]

firstlayerqueue.put(starturl)
while 1:  # generate first crawl content
    thread = Crawl(firstlayerqueue, secondlayerqueue, wordqueue)
    thread.daemon = True
    thread.start()
    if thread.isAlive():
        break

int_count = 0
while 1:

    if firstlayerqueue.empty():
        # refill the crawl queue with the URLs discovered in the previous pass
        while not secondlayerqueue.empty():
            firstlayerqueue.put(secondlayerqueue.get())
        writeWords()
        print '\nWrote %i words to %s. Queue empty, filling...' % (words, outputfile)
        words = 0

    if not firstlayerqueue.empty():
        threads = []
        for i in range(nrthreads):
            if not firstlayerqueue.empty():
                thread = Crawl(firstlayerqueue, secondlayerqueue, wordqueue)
                thread.daemon = True
                thread.start()
                threads.append(thread)
        for thread in threads:
            thread.join(5)
        int_count += 1
        if int_count == 2:
            print 'Joined %i threads. Queue size: %i' % (len(threads), firstlayerqueue.qsize())
            int_count = 0