├── LICENSE.txt
├── README.md
└── wwg.py


/LICENSE.txt:
--------------------------------------------------------------------------------
 1 |           DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
 2 |                    Version 2, December 2004
 3 | 
 4 | Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
 5 | 
 6 | Everyone is permitted to copy and distribute verbatim or modified
 7 | copies of this license document, and changing it is allowed as long
 8 | as the name is changed.
 9 | 
10 |            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
11 |   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
12 | 
13 |  0. You just DO WHAT THE FUCK YOU WANT TO.
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Wikipedia Wordlist Generator
2 | 
3 | Will crawl said site and create a wordlist out of the words it find. Some ugly code may occur!
4 | 


--------------------------------------------------------------------------------
/wwg.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2
  2 | # -*- coding: utf-8 -*-
  3 | # Coded by Sam (alex@0xdeadcode.se)
  4 | # http://0xdeadcode.se
  5 | # Twitter: @_tmp0
  6 | 
  7 | import threading, Queue, urllib2, StringIO, re, sys, os, optparse, inspect, signal
  8 | reload(sys)
  9 | sys.setdefaultencoding("latin-1")
 10 | 
 11 | 
 12 | # Add Wikipedia links you do not wish the crawler to visit here
 13 | bad_urls = ['/wiki/Wikipedia:Tutorial/Registration', '/wiki/Wikipedia:Authority_control'\
 14 |             '/wiki/Help:Contents', '/wiki/Help:Contents', \
 15 |             '/wiki/Wikipedia:External_links','/wiki/Wikipedia:Contact_us' \
 16 |             '/wiki/Wikipedia:Contact_us_-_Readers','/wiki/Wikipedia:Contact_us_-_Press', \
 17 |             '/wiki/Press/Contact/Chapters', '/wiki/Wikipedia:General_disclaimer', \
 18 |             '/wiki/Wikipedia:Stub', '/wiki/Wikipedia:Subpages', '/wiki/Wikipedia', \
 19 |             '/wiki/Wikipedia_talk:Do_not_use_subpages', '/wiki/Terms_of_Use']
 20 | 
 21 | # Add/Remove any word or character you wish to skip
 22 | banned = [':', ';', '\\', '/', '[', '{', ']', '}', '&', '&', '*', '(', ')', '.', ',', '\'', '\'', '"', '#', '=', '”', '’',\
 23 |           'Deltagarportalen', 'Wikipedia', 'Wikimedia', 'diskussionssidan', 'Utskriftsvänlig', 'engelskspråkiga', \
 24 |           'användarvillkor', 'Grundprinciperna', 'ქართული', 'ÙØ§Ø±Ø³ÛŒ', '日本語', 'oʻzbekcha', 'العربية', '⇒', '·', \
 25 |           'ifwindowmw', '한국어', 'മലയാളം', 'việt', 'тыла', 'Перем', 'kreyòl', 'ไทย', '粵語', 'پنجابی', 'Български', 'کوردی', \
 26 |           'தமிழ்', 'བོད་ཡིག', 'Հայերեն', 'Авар', 'ᨅᨔ', 'తెలుగు', 'avañeẽ', 'తెలుగు', 'Српски', 'српскохрватски', 'မြန်မာဘာသာ', \
 27 |           'ಕನ್ನಡ', 'Коми', 'Эрзянь', 'Чӑвашла', 'Беларуская', 'ଓଡ଼ିଆ', '⇐', 'ଓଡ଼ିଆ', 'documentwriteu003cdiv', 'Русский',\
 28 |           'Македонски', 'Лакку', 'ܐܪܡܝܐ', 'فارسی', 'ትግርኛ', '客家語hak-kâ-ngî', 'ייִדיש', 'اردو', 'Ελληνικά', 'მარგალური', \
 29 |           'ᨕᨘᨁᨗ', '贛語', 'Аҧсшәа', 'עברית', 'Українська', 'việt', '中文', 'ગુજરાતી', 'тарашкевіца‎', '文言', 'Ирон', 'नेपाल']
 30 | 
 31 | firstlayerqueue = Queue.Queue()
 32 | secondlayerqueue = Queue.Queue()
 33 | wordqueue = Queue.Queue()
 34 | 
 35 | 
 36 | class Crawl(threading.Thread):
 37 | 	def __init__(self, firstlayerqueue, secondlayerqueue, wordqueue):
 38 | 		threading.Thread.__init__(self)
 39 | 		self.firstlayerqueue = firstlayerqueue
 40 | 		self.secondlayerqueue = secondlayerqueue
 41 | 		self.wordqueue = wordqueue
 42 | 
 43 | 	def run(self):
 44 | 		self.url = self.firstlayerqueue.get()
 45 |         	#print 'IN THREAD: ' + self.url # uncomment to watch a lot of spam
 46 |         	try:
 47 | 			self.req = urllib2.Request(self.url, headers={'User-Agent' : "Mozilla/4.0(compatible; MSIE 7.0b; Windows NT 6.0)"}) # :)
 48 | 			self.con = urllib2.urlopen(self.req)
 49 | 			self.data = self.con.read()
 50 |         	except:
 51 | 			self.firstlayerqueue.task_done()
 52 | 			return 1
 53 |         	self.urls = self.getUrls(self.data)
 54 |         	self.data = self.getWords(self.data)
 55 | 		self.wordqueue.put(self.data)
 56 |         	for url in self.urls:
 57 | 			self.secondlayerqueue.put(url)
 58 |         	self.firstlayerqueue.task_done()
 59 | 
 60 | 	# please dont read this part :(
 61 |     	def getWords(self, test):
 62 | 		global banned, min, max
 63 |         	self.rv = []
 64 |         	self.test = test
 65 |         	self.skip = True
 66 |         	for lines in StringIO.StringIO(self.test):
 67 |         		lines = lines.strip('\n').strip('\n\r').strip('\t')
 68 |             		self.testa = re.sub('<.*?>', ' ', lines).split(' ')
 69 |             		for word in self.testa:
 70 |                 		if word.find('<div id="content" class="mw-body" role="main">') :
 71 |                     			skip = False
 72 |                 		if skip == True and word.find('<span class="mw-headline" id="References">'):
 73 |                     			skip = True
 74 |                 			# 'wtf does this even do?' can't remember... probably nothing
 75 |                     			pass
 76 |                 		if len(word) >= min and len(word) <= max and skip == False:
 77 |                     			for ban in banned:
 78 |                         			try:
 79 |                             				while 1:
 80 |                                 				word = word.replace(ban, '')
 81 |                                 				if word.find(ban) == -1:
 82 |                                     					break
 83 |                         			except:
 84 |                             				pass
 85 |                     			if word == '' or word == ' ' or len(word) < min:
 86 | 						continue
 87 |                     			else:
 88 | 						self.rv.append(word.lower())
 89 |                 		else:
 90 |                  			pass
 91 | 		return list(set(self.rv))
 92 | 
 93 | 	def getUrls(self, data):
 94 |         	global bad_urls
 95 |        		self.test = data
 96 |         	self.rv = []
 97 |         	for lineA in StringIO.StringIO(self.test):
 98 |             		match = re.findall(r'<a href=".*?">.+</a>', lineA)
 99 |             		if match:
100 |                 		match2 = re.findall(r'<a href="/w/.*?">.+</a>', lineA)
101 |                 		if match2 != True:
102 |                     			for i in match:
103 |                         			try:
104 |                         				reg = re.compile('/wiki/.*?"')
105 | 			                        	self.urlvalue = reg.search(i).group(0)
106 | 		  					self.urlvalue.replace('"', '')
107 | 							self.urlvalue = str(URLVALUE) + str(self.urlvalue).strip('"')
108 |                             				if self.urlvalue.endswith('.jpg') or self.urlvalue.endswith('.svg') or self.urlvalue.endswith('.png') or self.urlvalue.endswith('.gif') :
109 | 								pass
110 | 							elif '/wiki/Wikipedia:' in self.urlvalue or '/wiki/Portal:' in self.urlvalue or '/wiki/Special:' in self.urlvalue or '%' in self.urlvalue or '/wiki/Template' in self.urlvalue:
111 |                                 				pass
112 |                             				else:
113 |                                 				self.rv.append(self.urlvalue)
114 |                         			except Exception, e:
115 |                             				pass
116 |             		else:
117 |                 		pass
118 |         	return list(set(self.rv))
119 | 
120 | def writeWords():
121 | 	global outputfile, words, wordqueue
122 | 	while 1:
123 | 		data =  wordqueue.get()
124 | 		for line in data:
125 | 			try:
126 | 				line_encoded = line.encode('ISO-8859-1')
127 |                			#line_encoded = line.encode('UTF-8') # might want to uncomment $
128 | 			except:
129 | 				continue
130 | 			f = open(outputfile, 'a')
131 | 			f.write(line_encoded.lower() + '\n')
132 | 			f.close()
133 | 			words += 1
134 | 		if wordqueue.empty():
135 | 			break
136 | 
137 | ##################
138 | def handler(signum, frame): # http://stackoverflow.com/questions/1112343/how-do-i-capture-sigint-in-python
139 | 	global words, outputfile
140 | 	if not wordqueue.empty():
141 | 		print '\nHold on cowboy, let me finish the running threads and dump the words into %s' % outputfile
142 | 		writeWords()
143 | 		print 'Done. Wrote %i words into %s' % (words, outputfile)
144 | 			
145 | 	quit()
146 | 
147 | 
148 | signal.signal(signal.SIGINT, handler)
149 | ###################
150 | 
151 | 
152 | 
153 | filename = os.path.split(inspect.getfile(inspect.currentframe()))
154 | parser = optparse.OptionParser('Usage: ' + filename[1] + ' <args>' + '\nWikipedia Wordlist Generator by @_tmp0\nURL must be formated as following (most subdomains should work): '
155 |                                                                                 'http://en.wikipedia.org/wiki/wikipage\n\nExample: python %s -u http://en.wikipedia.org/wiki/Europe -o wordlist.txt -t 5\nIf no minumum or max length is set the script will save words between 6 and 30 characters length'
156 |                                                                                 '\n\nctrl+c to break\n\nI suggest doing something like this to clean the wordlist from duplicates:'
157 |                                                                                 ' sort -u wordlist.txt >> n_wordlist.txt' % filename[1])
158 | parser.add_option('-u', dest='starturl', type='string', help='Wikipedia URL to use as start for the crawler')
159 | parser.add_option('-t', dest='nrthreads', type='int', help='Amount of threads')
160 | parser.add_option('-o', dest='outputfile', type='string', help='File to write output to')
161 | parser.add_option('-m', dest='min', type='int', help='Minimum length of words')
162 | parser.add_option('-M', dest='max', type='int', help='Maximum length of words')
163 | (options, args) = parser.parse_args()
164 | nrthreads = options.nrthreads
165 | starturl = options.starturl
166 | outputfile = options.outputfile
167 | min = options.min
168 | max = options.max
169 | 
170 | 
171 | if starturl == None or outputfile == None or nrthreads == None:
172 |     print parser.print_help()
173 |     quit(0)
174 | 
175 | if min == None:
176 | 	print '[!] No minimum length supplied. Setting minimum length to 6'
177 | 	min = 6
178 | if max == None:
179 | 	print '[!] No maximum length supplied. Setting maximum length to 30'
180 | 	max = 30
181 | 
182 | 
183 | words = 0
184 | URLVALUE = starturl.split('/wiki')[0]
185 | bad_urls = [bad_url.replace(bad_url, str(URLVALUE) + str(bad_url)) for bad_url in bad_urls]
186 | 
187 | firstlayerqueue.put(starturl)
188 | while 1: # generate first crawl content
189 |     thread = Crawl(firstlayerqueue, secondlayerqueue, wordqueue)
190 |     thread.daemon = True
191 |     thread.start()
192 |     if thread.isAlive():
193 |         break
194 | 
195 | int_count = 0
196 | while 1:
197 | 	
198 | 	if firstlayerqueue.empty():
199 |         	while 1:
200 |             		firstlayerqueue.put(secondlayerqueue.get())
201 |             		if secondlayerqueue.empty():
202 |                 		writeWords()
203 | 				print '\nWrote %i words to %s. Queue empty, filling...' % (words, outputfile)
204 |                 		words = 0
205 | 				break
206 | 
207 | 
208 | 
209 | 	if not firstlayerqueue.empty():
210 |         	alivethread = 0
211 |         	for i in range(nrthreads):
212 |             		if not firstlayerqueue.empty():
213 |                 		alivethread += 1
214 |                 		thread = Crawl(firstlayerqueue, secondlayerqueue, wordqueue)
215 |                 		thread.daemon = True
216 |                 		thread.start()
217 | 	        for i in range(alivethread):
218 | 			thread.join(5)
219 | 		int_count += 1
220 | 		if int_count == 2:
221 |             		print 'Joined %i threads. Queue size: %i' % (alivethread, firstlayerqueue.qsize())
222 | 			int_count = 0
223 | 		continue
224 | 


--------------------------------------------------------------------------------