├── README.md
├── dbmodels.py
├── download_new.py
├── extremums.ods
├── hash.py
├── solved.ods
├── stat.ods
└── utils.py


/README.md:
--------------------------------------------------------------------------------
1 | This is my attempt to create a system that solves Google reCAPTCHA v2 automatically, based on perceptual hashes, color histograms and huge training image sets.
2 | All research details are available on habrahabr.ru: https://habrahabr.ru/post/280230/
3 | 


--------------------------------------------------------------------------------
/dbmodels.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import create_engine, Column, Integer, String, Text, Float, DateTime
 2 | from sqlalchemy.dialects import postgresql
 3 | from sqlalchemy.schema import ForeignKey
 4 | from sqlalchemy.ext.declarative import declarative_base
 5 | from sqlalchemy.orm import sessionmaker, relationship
 6 | import datetime
 7 | 
# Database connection shared by every model in this module: a local SQLite file.
# NOTE(review): Captcha.histogram and Captcha_Groups.group are declared with
# postgresql.ARRAY, which looks incompatible with a SQLite engine -- confirm
# which backend is actually intended.
engine = create_engine('sqlite:///recaptcha.db')

#Declare an instance of the Base class for mapping tables
Base = declarative_base()
12 | 
13 | #Map a table to a class by inheriting base class
class Types(Base):
    """Row of the ``types`` table: one captcha instruction text per id."""

    __tablename__ = 'types'

    id = Column(Integer, primary_key=True)
    text = Column(Text, nullable=False)

    def __init__(self, text):
        """Create a type holding *text*; ``id`` is left unset here."""
        self.text = text

    def __str__(self):
        """Return a short human-readable description of the row."""
        template = "Type(id: {}, text: {})"
        return template.format(self.id, self.text)
25 | 
class Captcha(Base):
    """Row of the ``captcha`` table: the stored features of one image.

    Besides the md5 and perceptual hash, the row keeps the histogram and the
    positions of its first two minimums/maximums (``min1``/``min2``,
    ``max1``/``max2``) so that candidates can be pre-filtered in SQL.
    """

    __tablename__ = 'captcha'

    id = Column(Integer, primary_key=True)
    md5 = Column(String(32), nullable=False)
    phash = Column(String(16), nullable=False)
    type_id = Column(Integer, nullable=False)
    histogram = Column(postgresql.ARRAY(Float), nullable=False)
    max1 = Column(Integer, nullable=False)
    max2 = Column(Integer, nullable=False)
    min1 = Column(Integer, nullable=False)
    min2 = Column(Integer, nullable=False)
    popularity = Column(Integer, nullable=False)
    failures = Column(Integer, nullable=False)
    creation_date = Column(DateTime, nullable=False)

    def __init__(self, type_id, md5, phash, histogram, min, max, popularity = 1, failures = 0):
        """Build a row from precomputed image features.

        :param type_id: id of the instruction type the image belongs to.
        :param md5: md5 hex digest of the image data.
        :param phash: perceptual hash as a hex string.
        :param histogram: sequence of histogram values.
        :param min: sequence of minimum positions; only the first two are kept.
        :param max: sequence of maximum positions; only the first two are kept.
        :param popularity: initial popularity counter.
        :param failures: initial failure counter.

        ``min`` and ``max`` keep their builtin-shadowing names because callers
        may pass them by keyword.
        """
        self.type_id = type_id
        self.md5 = md5
        self.phash = phash
        self.histogram = histogram

        # Missing extremums are stored as 1000. The previous if/else chain
        # indexed element 1 of an empty sequence and raised IndexError.
        self.min1, self.min2 = self._first_two(min)
        self.max1, self.max2 = self._first_two(max)

        self.popularity = popularity
        self.failures = failures
        self.creation_date = datetime.datetime.now()

    @staticmethod
    def _first_two(values, filler = 1000):
        """Return the first two items of *values*, padded with *filler*."""
        padded = list(values[:2]) + [filler, filler]
        return padded[0], padded[1]

    def __str__(self):
        """Return a one-line summary of the row (histogram and phash omitted)."""
        return "Captcha(id: {}, typeId: {}, md5: {} mins: {},{} maxs: {},{} pop: {} fails: {} created: {})".format(
            self.id, self.type_id, self.md5, self.min1, self.min2, self.max1, self.max2, self.popularity, self.failures,
            self.creation_date)
73 | 
class Captcha_Groups(Base):
    """Row of ``captcha_groups``: a captcha plus the ids grouped with it.

    The primary key is also a foreign key to ``captcha.id``, and ``captcha``
    gives access to that row.
    """

    __tablename__ = "captcha_groups"

    id = Column(Integer, ForeignKey('captcha.id'), primary_key=True)
    type_id = Column(Integer, nullable=False)
    group = Column(postgresql.ARRAY(Integer))
    captcha = relationship(Captcha)

    def __init__(self, captcha_id, type_id, group = None):
        """Create a group anchored at *captcha_id*.

        :param captcha_id: id of the captcha row this group is built around.
        :param type_id: instruction type shared by the group.
        :param group: ids of the member captchas; defaults to a new empty list.
        """
        self.id = captcha_id
        self.type_id = type_id
        # A literal ``[]`` default would be one list object shared by every
        # instance created without an explicit group.
        self.group = [] if group is None else group

    def __str__(self):
        """Return a one-line summary of the group."""
        return "Captcha Group(id: {}, type: {}, group: {})".format(self.id, self.type_id, self.group)
89 | 
#Create the table using the metadata attribute of the base class
# This runs as a side effect of importing the module.
Base.metadata.create_all(engine)

# Module-level session bound to the engine; other modules use it as
# ``dbmodels.session``.
Session = sessionmaker(bind=engine)
session = Session()
95 | 


--------------------------------------------------------------------------------
/download_new.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | import json
  3 | import requests
  4 | import base64
  5 | import re
  6 | import PIL
  7 | from PIL import Image
  8 | from hashlib import md5
  9 | import time, datetime
 10 | import signal
 11 | from utils import translate, get_instructions_hash
 12 | import dbmodels
 13 | from hash import CaptchaHash, ImageHash
 14 | 
 15 | 
 16 | class Stats:
 17 |     def __init__(self):
 18 |         self.STATS_IDS_CHECKED = 0
 19 |         self.STATS_NEW_TYPES = 0
 20 |         self.STATS_NEW_IMAGES = 0
 21 |         self.STATS_IMAGES_ALREADY_STORED = 0
 22 |         self.STATS_IMAGES_GROUPED = 0
 23 |         self.STATS_WRONG_SIZE_IMAGE = 0
 24 |         self.STATS_WRONG_TEXT = 0
 25 |         self.STATS_OTHER_ERRORS = 0
 26 |         self.STATS_SOLVED_CAPTCHAS = 0
 27 |         self.STATS_FAILED_CAPTCHAS = 0
 28 |         self.STATS_ERRORS_FIXED = 0
 29 |         self.STATS_TARGET_IMAGES = 0
 30 |         self.STATS_NONTARGET_IMAGES = 0
 31 |         self.start_time = datetime.datetime.now()
 32 |         self.solved_types = {}
 33 | 
 34 |     def new_id(self, count = 1): self.STATS_IDS_CHECKED += count
 35 |     def new_type(self, count = 1): self.STATS_NEW_TYPES += count
 36 |     def new_image(self, count = 1): self.STATS_NEW_IMAGES += count
 37 |     def new_image_already_stored(self, count = 1): self.STATS_IMAGES_ALREADY_STORED += count
 38 |     def new_image_grouped(self, count = 1): self.STATS_IMAGES_GROUPED += count
 39 |     def wrong_size(self, count = 1): self.STATS_WRONG_SIZE_IMAGE += count
 40 |     def wrong_text(self, count = 1): self.STATS_WRONG_TEXT += count
 41 |     def solved_captcha(self, count = 1): self.STATS_SOLVED_CAPTCHAS += count
 42 |     def failed_captcha(self, count = 1): self.STATS_FAILED_CAPTCHAS += count
 43 |     def error_fixed(self, count = 1): self.STATS_ERRORS_FIXED += count
 44 |     def other_error(self, count = 1): self.STATS_OTHER_ERRORS += count
 45 |     def target_images(self, count = 1): self.STATS_TARGET_IMAGES += count
 46 |     def nontarget_images(self, count = 1): self.STATS_NONTARGET_IMAGES += count
 47 |     def solved_type(self, type):
 48 |         if type in self.solved_types:
 49 |             self.solved_types[type] += 1
 50 |         else:
 51 |             self.solved_types[type] = 1
 52 | 
 53 |     def output(self):
 54 |         print "\nStatistics:\nStarted: {}\n===========\n".format(self.start_time)
 55 |         print "IDs Checked: {}\nNew types added: {}\nNew images saved: {}\nNew images alread stored: {}\nNew images grouped: {}\n" \
 56 |               "Images with wrong size: {}\nWrong text instructins: {}\nOther errors: {}\nTime elapsed: {}s\n\nSolved captchas: {}\n" \
 57 |               "Failed captchas: {}\nErrors fixed: {}\nTarget images: {}\nNon-Target images: {}\n\nSolved types:".format(
 58 |             self.STATS_IDS_CHECKED, self.STATS_NEW_TYPES, self.STATS_NEW_IMAGES, self.STATS_IMAGES_ALREADY_STORED, self.STATS_IMAGES_GROUPED,
 59 |             self.STATS_WRONG_SIZE_IMAGE, self.STATS_WRONG_TEXT, self.STATS_OTHER_ERRORS, str(datetime.datetime.now() - self.start_time),
 60 |             self.STATS_SOLVED_CAPTCHAS, self.STATS_FAILED_CAPTCHAS, self.STATS_ERRORS_FIXED, self.STATS_TARGET_IMAGES, self.STATS_NONTARGET_IMAGES)
 61 |         all_types = sum(self.solved_types.values())
 62 |         for type in self.solved_types:
 63 |             print "{}: {}   ({:.4}%)".format(type, self.solved_types[type], 100. * float(self.solved_types[type]) / float(all_types))
 64 | 
 65 | 
# Single Stats instance updated throughout this script.
statistics = Stats()
# Endpoint queried once at start-up for the first id to process.
OLDEST_ID_URL = '...'
# Endpoint queried in the main loop; it is formatted with the current id.
CAPTCHAS_API_URL = '...'
# An instruction containing any of these substrings is rejected.
BAD_INSTRUCTIONS = ['):']
# Maps a marker substring to the regex that removes it (and the rest of that
# line) from an instruction text.
# NOTE(review): the second regex starts with an unescaped '.', while the other
# two use '\.' -- presumably an oversight; confirm before changing.
INSTRUCTION_SUBSTITUTE_PATTERNS = {
    'Verify\r\nReport a problem': '(\.\r\n)?Verify\r\nReport a problem.*',
    'Please select all matching images': '(.\r\n)?Please select all matching images.*',
    'Multiple correct solutions required - please solve more': '(\.\r\n)?Multiple correct solutions required - please solve more.*',
}
# When False, captchas whose instruction text is not yet in the ``types``
# table are skipped instead of creating a new type.
CREATE_NEW_TYPES = False
 76 | 
 77 | 
 78 | 
 79 | def merge_instructions_hash(instructions, path = 'instructions.txt'):
 80 |     file_instr = get_instructions_hash(path)
 81 |     f = open(path, 'a')
 82 |     for id in set(instructions.keys()) - set(file_instr.keys()):
 83 |         text = instructions[id]
 84 |         text = text.replace('\n', '').replace('\r', '')
 85 |         f.write("{}:{}\n".format(id, text))
 86 |         if not os.path.exists('captchas/{}'.format(id)):
 87 |             os.makedirs('captchas/{}'.format(id))
 88 |     f.flush()
 89 |     f.close()
 90 | 
 91 | def check_text_instruction(text):
 92 |     global statistics
 93 | 
 94 |     if text:
 95 |         if type(text) == unicode:
 96 |             text = text.encode('utf8')
 97 |         if text:
 98 |             for pattern in INSTRUCTION_SUBSTITUTE_PATTERNS:
 99 |                 if text.find(pattern):
100 |                     text = re.sub(INSTRUCTION_SUBSTITUTE_PATTERNS[pattern], '', text)
101 |             bad_instr = False
102 |             for instr in BAD_INSTRUCTIONS:
103 |                 if text.find(instr) != -1:
104 |                     bad_instr = True
105 |                     break
106 |             if bad_instr:
107 |                 statistics.wrong_text()
108 |                 print "[*] Bad string found"
109 |                 return None
110 |         return translate(text).strip()
111 |     return text
112 | 
def get_instruction_id(instructions, text):
    """Return the first id in *instructions* whose text equals *text*, else -1."""
    matching_ids = (key for key, value in instructions.items() if value == text)
    return next(matching_ids, -1)
118 | 
def final():
    """Print the run statistics to the console and save a copy under ``logs/``.

    The copy is written to ``logs/<current timestamp>.txt`` by temporarily
    pointing ``sys.stdout`` at that file.
    """
    statistics.output()

    # Opening the log file fails when the directory is missing.
    if not os.path.isdir('logs'):
        os.makedirs('logs')

    log_path = 'logs/' + str(datetime.datetime.now()) + '.txt'
    console = sys.stdout
    with open(log_path, 'w') as log_file:
        sys.stdout = log_file
        try:
            statistics.output()
        finally:
            # The old code assigned the saved stream to sys.stderr, so stdout
            # stayed bound to the (then closed) log file.
            sys.stdout = console
128 | 
def exit_handler(signal, frame):
    """Signal callback: report the statistics via final(), then exit with status 0."""
    final()
    raise SystemExit(0)
132 | 
# Thresholds shared by match_captcha() and solve_captcha().
_EXTREMUM_WINDOW = 2           # stored extremum must be strictly within +/- this value
_MAX_PHASH_DISTANCE = 14       # phash bit difference must be below this
_MAX_HISTOGRAM_DISTANCE = 5    # histogram RMS difference must be below this


def _find_similar_captcha(image_hash, extra_criteria = ()):
    """Return the first stored captcha that looks like *image_hash*, or None.

    Candidates are the anchors of captcha groups whose extremum positions lie
    within ``_EXTREMUM_WINDOW`` of the image's, and whose popularity exceeds
    their failures. A candidate is accepted when both the phash distance and
    the histogram distance are below their thresholds.

    :param image_hash: object exposing ``phash``, ``histogram``, ``mins``, ``maxs``.
    :param extra_criteria: additional SQLAlchemy filter expressions (ANDed).
    """
    captcha = dbmodels.Captcha
    low_max1, low_max2 = image_hash.maxs[0] - _EXTREMUM_WINDOW, image_hash.maxs[1] - _EXTREMUM_WINDOW
    low_min1, low_min2 = image_hash.mins[0] - _EXTREMUM_WINDOW, image_hash.mins[1] - _EXTREMUM_WINDOW
    high_max1, high_max2 = image_hash.maxs[0] + _EXTREMUM_WINDOW, image_hash.maxs[1] + _EXTREMUM_WINDOW
    high_min1, high_min2 = image_hash.mins[0] + _EXTREMUM_WINDOW, image_hash.mins[1] + _EXTREMUM_WINDOW

    # Several criteria passed to one filter() call are joined with AND, the
    # same as chaining one filter() call per criterion.
    query = dbmodels.session.query(dbmodels.Captcha_Groups).join(captcha).filter(
        captcha.max1 > low_max1, captcha.max1 < high_max1,
        captcha.max2 > low_max2, captcha.max2 < high_max2,
        captcha.min1 > low_min1, captcha.min1 < high_min1,
        captcha.min2 > low_min2, captcha.min2 < high_min2,
        captcha.popularity > captcha.failures,
        *extra_criteria)

    for group in query.all():
        stored = ImageHash.create_from_db(group.captcha.phash, group.captcha.histogram)
        if ImageHash.phash_diff(stored.phash, image_hash.phash) >= _MAX_PHASH_DISTANCE:
            continue
        distance = ImageHash.aver_squad_diff(stored.histogram[:], image_hash.histogram[:])
        # aver_squad_diff() returns -1 for histograms of different length;
        # that "not comparable" marker must not be counted as a close match.
        if 0 <= distance < _MAX_HISTOGRAM_DISTANCE:
            return group.captcha
    return None


def match_captcha(images):
    """Map every image hash in *images* to its best stored captcha, or None.

    An exact md5 match is preferred (most popular first, and only rows whose
    popularity exceeds their failures); otherwise the similarity search is
    used. Captchas of any type are considered.

    :param images: iterable of image hash objects.
    :returns: dict mapping each image hash object to a Captcha row or None.
    """
    matches = {}
    for image_hash in images:
        exact = dbmodels.session.query(dbmodels.Captcha).filter_by(md5 = image_hash.md5).filter(
            dbmodels.Captcha.popularity > dbmodels.Captcha.failures).order_by(
            dbmodels.Captcha.popularity.desc()).first()
        if exact is None:
            exact = _find_similar_captcha(image_hash)
        matches[image_hash] = exact
    return matches


def solve_captcha(instr_id, images):
    """Return the positions in *images* recognised as type *instr_id*.

    :param instr_id: id of the instruction type being solved.
    :param images: list of image hash objects, in tile order.
    :returns: list of positions, in ascending order.
    """
    solution = []
    # enumerate() supplies the position directly instead of one linear
    # images.index() search per hit.
    for position, image_hash in enumerate(images):
        exact = dbmodels.session.query(dbmodels.Captcha).filter_by(
            md5 = image_hash.md5, type_id = instr_id).filter(
            dbmodels.Captcha.popularity > dbmodels.Captcha.failures).first()
        if exact is None:
            same_type = (dbmodels.Captcha.type_id == instr_id,)
            if _find_similar_captcha(image_hash, same_type) is None:
                continue
        solution.append(position)
    return solution
188 | 
189 | 
# Script entry point: fetch solved captchas from the API in batches, compare
# each tile with the database, report how well the stored data would have
# solved the captcha, then update the database with the reported answer.
if __name__ == "__main__":
    # temporary file to figure out what the hell is going on with crontab
    with open('runs.txt', 'a') as f:
        f.write(str(datetime.datetime.now()) + "\n")

    lastidfile = 'lastid.txt'
    current_id = 0
    if not os.path.exists(lastidfile):
        # NOTE(review): there is no .format() call here, so the literal text
        # "{lastidfile}" is printed.
        print "[!] No {lastidfile} file here"
    else:
        with open(lastidfile) as f:
            current_id = int(f.read().strip())

    signal.signal(signal.SIGINT, exit_handler)
    signal.signal(signal.SIGQUIT, exit_handler)

    # NOTE(review): instructions_hash is not used anywhere below.
    instructions_hash = get_instructions_hash()

    # NOTE(review): this discards the id just read from lastid.txt, which
    # makes the "No ids to check" branch below unreachable -- confirm intent.
    current_id = 0
    req = requests.get(OLDEST_ID_URL)
    if req.ok:
        id = json.loads(req.text)['id']
        if current_id == 0:
            current_id = id
        elif current_id == id:
            print "[*] No ids to check"
            exit(0)

    try:
        while current_id:
            # NOTE(review): when the request raises, ``req`` still holds the
            # previous response, which is then processed again.
            try:
                req = requests.get(CAPTCHAS_API_URL.format(current_id))
            except: pass
            if req and req.ok:
                data = json.loads(req.text)
                if type(data) is dict:
                    # Process the batch in ascending id order.
                    ids = sorted(map(lambda x: int(x), data.keys()))
                    for id in ids:
                        statistics.new_id()
                        current_id = str(id)
                        captcha_data = data[str(id)]
                        text_instructions = check_text_instruction(captcha_data['textinstructions'])
                        if text_instructions:
                            image_base64 = captcha_data['image']
                            # The payload after the first comma is decoded as base64.
                            if len(image_base64) >= 100 and image_base64.find(',') > 0:
                                image_bytes = base64.b64decode(image_base64.split(',')[1])

                                #ToDo: only one thread possible there!
                                with open('temp/temp.jpeg', 'wb') as f:
                                    f.write(image_bytes)
                                # NOTE(review): ``img`` is not reset before this
                                # try, so after a failed open it is either
                                # undefined or still the previous image.
                                try:
                                    img = Image.open('temp/temp.jpeg')
                                except:
                                    statistics.other_error()

                                # Only 300x300 and 400x400 images are handled.
                                if img and (img.size == (300, 300) or img.size == (400, 400)):
                                    #Merge instructions there in order to avoid good instructions without a picture
                                    instr_id = -1
                                    query = dbmodels.session.query(dbmodels.Types).filter_by(text = text_instructions)
                                    if query.count() == 0:
                                        if CREATE_NEW_TYPES:
                                            statistics.new_type()
                                            print "[*] [{}] New captcha type: {}".format(current_id, text_instructions)
                                            new_type = dbmodels.Types(text_instructions)
                                            dbmodels.session.add(new_type)
                                            dbmodels.session.commit()
                                            instr_id = new_type.id
                                        else:
                                            print "[!] [{}] Creation of new types is disabled".format(current_id)
                                            continue
                                    else:
                                        instr_id = query.first().id

                                    captcha_answer = captcha_data['code']
                                    if captcha_answer.find('click') >= 0:
                                        # The numbers in the answer are 1-based; convert to 0-based.
                                        indexes = re.findall('\d+', captcha_answer)
                                        indexes = map(lambda x: int(x) - 1, indexes)
                                        if indexes:
                                            # Cut the image into 100x100 tiles, row by row, and hash each one.
                                            images = []
                                            for index in range(img.size[0] * img.size[1] / 10000):
                                                try:
                                                    width = (index % (img.size[0] / 100)) * 100
                                                    height = (index / (img.size[0] / 100)) * 100
                                                    cropped = img.crop((width, height, width+100, height+100))
                                                    images.append(CaptchaHash(cropped))
                                                except:
                                                    statistics.other_error()

                                            # Diagnostic pass: match every tile against captchas of any type,
                                            # re-keyed by tile position.
                                            matches = match_captcha(images)
                                            matches = {images.index(x): matches[x] for x in matches.keys()}
                                            solution = [x for x in matches.keys() if matches[x] and matches[x].type_id == instr_id]
                                            print "Matched {} of {} ({:.4}%)".format(len(solution), len(matches), float(100 * len(solution))/ float(len(matches)))
                                            for match in matches:
                                                if matches[match]:
                                                    print "{}: type id {}".format(match, matches[match].type_id)
                                                else:
                                                    print "{}: didnt matched to any".format(match)
                                            statistics.target_images(len([x for x in matches.keys() if matches[x] and matches[x].type_id == instr_id]))
                                            statistics.nontarget_images(len([x for x in matches.keys() if matches[x] and matches[x].type_id != instr_id]))

                                            # The actual prediction, restricted to the requested type.
                                            solution = solve_captcha(instr_id, images)

                                            # prediction = (success, failed, overall)
                                            prediction = (len(set(solution).intersection(set(indexes))),
                                                          len(set(solution) ^ set(indexes)),
                                                          len(set(solution).union(set(indexes))))

                                            if prediction[0] > prediction[1]:
                                                print "[+] Solved captcha (type: {})  ({:.4}%): {} - {}".format(instr_id,
                                                    100 * float(prediction[0])/float(prediction[2]), indexes, solution)
                                                statistics.solved_captcha()
                                                statistics.solved_type(text_instructions)
                                            else:
                                                print "[+] Failed captcha (type: {})  ({:.4}%): {} - {}".format(instr_id,
                                                    100 * float(prediction[0])/float(prediction[2]), indexes, solution)
                                                statistics.failed_captcha()

                                            # Learning pass: update the database from the reported answer.
                                            for index in range(img.size[0] * img.size[1] / 10000):
                                                # NOTE(review): the bare except at the end of this try
                                                # also swallows SystemExit raised by exit_handler.
                                                try:
                                                    image_hash = images[index]
                                                    query = dbmodels.session.query(dbmodels.Captcha).filter_by(
                                                        md5 = image_hash.md5, type_id = instr_id)
                                                    if query.count():
                                                        if index in indexes:
                                                            statistics.new_image_already_stored()
                                                            print "[!] [{}] The image is already storing: {}".format(current_id, image_hash.md5)
                                                            query.first().popularity += 1
                                                            dbmodels.session.commit()
                                                        else:
                                                            statistics.error_fixed()
                                                            print "[+] [{}] Image stores with incorrect type: {}".format(current_id, image_hash.md5)
                                                            query.first().failures += 1
                                                            dbmodels.session.commit()
                                                    elif index in solution and not index in indexes:
                                                        # Counter-captcha is an image which purpose is to prevent future false matches of
                                                        # images that were marked as solution by grouping feature.
                                                        # Thus counter-captcha should have popularity = 0 and failures = 1 (>0)
                                                        print "[+] [{}] Anti-captcha image: {}".format(current_id, image_hash.md5)
                                                        captcha = dbmodels.Captcha(instr_id, image_hash.md5, str(image_hash.phash),
                                                            image_hash.histogram, image_hash.mins, image_hash.maxs, 0, 1)
                                                        dbmodels.session.add(captcha)
                                                        dbmodels.session.commit()
                                                    elif index in indexes:
                                                        statistics.new_image()

                                                        print "[+] [{}] Saved image: {}".format(current_id, image_hash.md5)
                                                        captcha = dbmodels.Captcha(instr_id, image_hash.md5, str(image_hash.phash),
                                                            image_hash.histogram, image_hash.mins, image_hash.maxs)
                                                        dbmodels.session.add(captcha)
                                                        dbmodels.session.commit()

                                                        # Attach the new image to the first similar group of the same
                                                        # type, or start a new group anchored at it. Unlike the queries
                                                        # in match_captcha/solve_captcha, this one has no
                                                        # popularity > failures filter.
                                                        matched = False
                                                        query = dbmodels.session.query(dbmodels.Captcha_Groups).join(dbmodels.Captcha).filter(
                                                            dbmodels.Captcha.type_id == captcha.type_id).filter(
                                                            dbmodels.Captcha.max1 > image_hash.maxs[0] - 2).filter(
                                                            dbmodels.Captcha.max1 < image_hash.maxs[0] + 2).filter(
                                                            dbmodels.Captcha.max2 > image_hash.maxs[1] - 2).filter(
                                                            dbmodels.Captcha.max2 < image_hash.maxs[1] + 2).filter(
                                                            dbmodels.Captcha.min1 > image_hash.mins[0] - 2).filter(
                                                            dbmodels.Captcha.min1 < image_hash.mins[0] + 2).filter(
                                                            dbmodels.Captcha.min2 > image_hash.mins[1] - 2).filter(                                                                dbmodels.Captcha.min2 < image_hash.mins[1] + 2)
                                                        for group in query.all():
                                                            ihash = ImageHash.create_from_db(group.captcha.phash, group.captcha.histogram)
                                                            if ImageHash.phash_diff(ihash.phash, image_hash.phash) < 14:
                                                                if ImageHash.aver_squad_diff(ihash.histogram[:], image_hash.histogram[:]) < 5:
                                                                    matched = True
                                                                    # A new list is assigned rather than appending in place.
                                                                    group.group = group.group + [captcha.id]
                                                                    dbmodels.session.commit()
                                                                    print "[+]   [{}] Matched to captcha id {}".format(current_id, group.id)
                                                                    statistics.new_image_grouped()

                                                                    break

                                                        if not matched:
                                                            group = dbmodels.Captcha_Groups(captcha.id, captcha.type_id)
                                                            dbmodels.session.add(group)
                                                            dbmodels.session.commit()
                                                except:
                                                    statistics.other_error()
                                        else:
                                            print "[!][{}] No indexes in code: {}".format(current_id, captcha_answer)
                                    else:
                                        print "[!][{}] Bad captcha answer: {}".format(current_id, captcha_answer)
                                else:
                                    statistics.wrong_size()
                                    print "[!][{}] Non standart image size: {}".format(current_id, img.size)
                            else:
                                statistics.other_error()
                                print "[!][{}] Too short image base64: {}".format(current_id, len(image_base64))
                        else:
                            print "[!][{}] No text instructions".format(current_id)

                    # Remember the highest id of this batch.
                    # NOTE(review): max() raises ValueError for an empty batch; that
                    # presumably is how the loop ends (via the handler below) -- confirm.
                    with open(lastidfile, 'w') as f:
                        f.write(str(max(ids)))
                else:
                    print "[!] Got not dictionary: {}".format(type(data))
                    break
    # NOTE(review): the bare except hides the actual error (and also catches
    # SystemExit from exit_handler, so final() below runs a second time).
    except:
        print "[!] Very bad exception!"

    final()
391 | 


--------------------------------------------------------------------------------
/extremums.ods:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ajaxtpm/reCaptcha/68117fbf1be1f7a1692220a837954c44e31e234a/extremums.ods


--------------------------------------------------------------------------------
/hash.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | import imagehash
 3 | from PIL import Image
 4 | from hashlib import md5
 5 | import matplotlib.pyplot as plt
 6 | import numpy as np
 7 | import math
 8 | import operator
 9 | import time, datetime
10 | import utils
11 | from scipy.signal import argrelextrema
12 | import dbmodels
13 | 
class ImageHash:
    """A perceptual hash plus histogram for one image, with comparison helpers."""

    def __init__(self, phash, histogram):
        """Store the hex *phash* string and the *histogram* sequence as given."""
        self.phash = phash
        self.histogram = histogram

    @staticmethod
    def create_from_db(phash, histogram):
        """Build an ImageHash from stored values, converting *histogram* to an array."""
        return ImageHash(phash, np.array(histogram))

    @staticmethod
    def aver_squad_diff(one, two):
        """Return the root-mean-square difference between two sequences.

        :returns: a non-negative float, or -1 when the lengths differ (kept
            for compatibility; callers must not read it as "very close").
            Two empty sequences are treated as identical (0.0).

        The mean is computed in floating point. The earlier
        ``reduce(...) / len(one)`` form relied on the Python 2 builtin
        ``reduce``, floor-divided when both inputs held integers, and raised
        on empty input.
        """
        if len(one) != len(two):
            return -1
        if len(one) == 0:
            return 0.0
        delta = np.asarray(one, dtype=float) - np.asarray(two, dtype=float)
        return math.sqrt(float(np.mean(delta ** 2)))

    @staticmethod
    def phash_diff(one, two):
        """Return the number of differing bits between two hex hash strings."""
        return bin(int(one, 16) ^ int(two, 16)).count('1')
32 | 
33 | 
class CaptchaHash(ImageHash):
    """Features computed from one image: md5, phash, histogram and its extremums.

    Attributes set by the constructor:
      md5        hex digest of the raw image bytes
      phash      perceptual hash (hash size 8) as a string
      histogram  smoothed grey-level histogram, truncated to integers
      mins/maxs  positions of the smoothed histogram's local minimums and
                 maximums, padded with 1000 up to at least two entries
    """

    def __init__(self, path):
        """Compute the features of *path*.

        :param path: a file path string, or an already opened image object.
        """
        # The previous test was ``if type and path.__class__ == str``, where
        # ``type`` is the always-true builtin.
        image = Image.open(path) if isinstance(path, str) else path

        phash = imagehash.phash(image, 8)
        raw_histogram = np.array(image.convert('L').histogram())

        # Pillow renamed tostring() to tobytes() and later removed the old
        # name; fall back to it only when tobytes() does not exist.
        to_bytes = getattr(image, 'tobytes', None) or image.tostring
        self.md5 = md5(to_bytes()).hexdigest()

        self.phash = str(phash)
        smoothed = utils.smooth(raw_histogram, 100)

        # Extremums are located on the smoothed float curve, before truncation.
        self.mins = argrelextrema(smoothed, np.less)[0]
        self.maxs = argrelextrema(smoothed, np.greater)[0]
        # astype(int) truncates like int(x), without going through map(),
        # which yields an iterator (not a list) on Python 3.
        self.histogram = smoothed.astype(int)

        if len(self.mins) < 2:
            self.mins = np.append(self.mins, [1000] * (2 - len(self.mins)))
        if len(self.maxs) < 2:
            self.maxs = np.append(self.maxs, [1000] * (2 - len(self.maxs)))

    def __str__(self):
        """Return the phash followed by the minimum and maximum positions."""
        return '{}, <{}, {}>'.format(self.phash, self.mins, self.maxs)
55 | 


--------------------------------------------------------------------------------
/solved.ods:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ajaxtpm/reCaptcha/68117fbf1be1f7a1692220a837954c44e31e234a/solved.ods


--------------------------------------------------------------------------------
/stat.ods:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ajaxtpm/reCaptcha/68117fbf1be1f7a1692220a837954c44e31e234a/stat.ods


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import re
 3 | import numpy
 4 | 
 5 | def get_instructions_hash(path = 'instructions.txt'):
 6 |     instructions = {}
 7 |     f = open(path, 'r')
 8 |     for line in f.read().split('\n'):
 9 |         if line:
10 |             (id, text) = line.split(':', 1)
11 |             id = int(id)
12 |             instructions[id] = text
13 |     f.close()
14 |     return instructions
15 | 
# Regexes tried in order by translate(); each one captures the subject of an
# instruction in group 1.
EXTRACTION_PATTERNS = [
    # Text following an "ENG:" marker, up to the first '-' or '.'.
    'ENG: *([^-.]+)',
    # Text following " / " up to the end, without a trailing '.'.
    ' / (.+?)\.?$',
    # English phrasing: "all [the] [images|pictures|squares] [with|are|of] [a|the|an] <subject>".
    'all (?:the )?(?:images |pictures |squares )?(?:with |are |of )?(?:a |the |an )?([^-.]+)',
    # Russian phrasing of the same request ("all images/squares where/with ... <subject>").
    'все (?:изображения|квадраты),? (?:на которых |где |с |со )(?:есть )?([^-.]+)'
]
22 | 
def translate(text):
    """Reduce an instruction text to its subject.

    Every pattern in EXTRACTION_PATTERNS is tried in order against the
    current result; a pattern that matches replaces the result with its first
    group. Underscores in the final result are turned into spaces.
    """
    extracted = text
    for regex in EXTRACTION_PATTERNS:
        found = re.search(regex, extracted)
        if not found or not found.group(0):
            continue
        extracted = found.group(1)
    return extracted.replace('_', ' ')
30 | 
31 | 
def smooth(x,window_len=11,window='hanning'):
    """Smooth a 1-D array by convolving it with a normalised window.

    The signal is extended at both ends with mirrored copies of itself before
    the convolution, so the result has ``x.size + window_len - 1`` elements.

    :param x: 1-D numpy array.
    :param window_len: window size; values below 3 return *x* unchanged.
    :param window: 'flat' (moving average) or the name of a numpy window
        function: 'hanning', 'hamming', 'bartlett', 'blackman'.
    :returns: the smoothed array (or *x* itself when ``window_len < 3``).
    :raises ValueError: if *x* is not 1-D, is shorter than *window_len*, or
        *window* is not one of the supported names.
    """
    # raise X("msg") is valid on Python 2 and 3; ``raise X, "msg"`` is not.
    if x.ndim != 1:
        raise ValueError("smooth only accepts 1 dimension arrays.")
    if x.size < window_len:
        raise ValueError("Input vector needs to be bigger than window size.")

    if window_len<3:
        return x

    if window not in ('flat', 'hanning', 'hamming', 'bartlett', 'blackman'):
        raise ValueError("Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'")

    # Mirror window_len-1 samples at each end to reduce edge effects.
    padded = numpy.r_[x[window_len-1:0:-1], x, x[-1:-window_len:-1]]
    if window == 'flat': #moving average
        weights = numpy.ones(window_len, 'd')
    else:
        # Look the window function up by name instead of eval()-ing a string.
        weights = getattr(numpy, window)(window_len)

    return numpy.convolve(weights/weights.sum(), padded, mode='valid')
52 | 
if __name__ == '__main__':
    # Manual check: show how translate() rewrites every text stored in
    # instructions.txt (order and duplicates are kept).
    lines = []
    # The context manager closes the file; it was previously left open.
    with open('instructions.txt') as source:
        for line in source.read().split('\n'):
            if line:
                (_, t) = line.split(':', 1)
                lines.append(t)

    for line in lines:
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print("Was <{}> become <{}>".format(line, translate(line)))
63 | 


--------------------------------------------------------------------------------