├── README.md ├── dbmodels.py ├── download_new.py ├── extremums.ods ├── hash.py ├── solved.ods ├── stat.ods └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | This is my attempt to create a system to solve Google reCaptcha v2 automatically based on perceptive hashes, colors histograms and huge training image sets. 2 | All research details on habrahabr.ru: https://habrahabr.ru/post/280230/ 3 | -------------------------------------------------------------------------------- /dbmodels.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine, Column, Integer, String, Text, Float, DateTime 2 | from sqlalchemy.dialects import postgresql 3 | from sqlalchemy.schema import ForeignKey 4 | from sqlalchemy.ext.declarative import declarative_base 5 | from sqlalchemy.orm import sessionmaker, relationship 6 | import datetime 7 | 8 | engine = create_engine('sqlite:///recaptcha.db') 9 | 10 | #Declare an instance of the Base class for mapping tables 11 | Base = declarative_base() 12 | 13 | #Map a table to a class by inheriting base class 14 | class Types(Base): 15 | __tablename__ = 'types' 16 | 17 | id = Column(Integer, primary_key=True) 18 | text = Column(Text, nullable=False) 19 | 20 | def __init__(self, text): 21 | self.text = text 22 | 23 | def __str__(self): 24 | return "Type(id: {}, text: {})".format(self.id, self.text) 25 | 26 | class Captcha(Base): 27 | __tablename__ = 'captcha' 28 | 29 | id = Column(Integer, primary_key=True) 30 | md5 = Column(String(32), nullable=False) 31 | phash = Column(String(16), nullable=False) 32 | type_id = Column(Integer, nullable=False) 33 | histogram = Column(postgresql.ARRAY(Float), nullable=False) 34 | max1 = Column(Integer, nullable=False) 35 | max2 = Column(Integer, nullable=False) 36 | min1 = Column(Integer, nullable=False) 37 | min2 = Column(Integer, nullable=False) 38 | popularity = Column(Integer, nullable=False) 39 | failures = Column(Integer, nullable=False) 40 | creation_date = Column(DateTime, nullable=False) 41 | 42 | def __init__(self, type_id, md5, phash, histogram, min, max, popularity = 1, failures = 0): 43 | self.type_id = type_id 44 | self.md5 = md5 45 | self.phash = phash 46 | self.histogram = histogram 47 | 48 | if len(min) == 0: 49 | self.min1 = 1000 50 | else: 51 | self.min1 = min[0] 52 | if len(min) == 1: 53 | self.min2 = 1000 54 | else: 55 | self.min2 = min[1] 56 | 57 | if len(max) == 0: 58 | self.max1 = 1000 59 | else: 60 | self.max1 = max[0] 61 | if len(max) == 1: 62 | self.max2 = 1000 63 | else: 64 | self.max2 = max[1] 65 | self.popularity = popularity 66 | self.failures = failures 67 | self.creation_date = datetime.datetime.now() 68 | 69 | def __str__(self): 70 | return "Captcha(id: {}, typeId: {}, md5: {} mins: {},{} maxs: {},{} pop: {} fails: {} created: {})".format( 71 | self.id, self.type_id, self.md5, self.min1, self.min2, self.max1, self.max2, self.popularity, self.failures, 72 | self.creation_date) 73 | 74 | class Captcha_Groups(Base): 75 | __tablename__ = "captcha_groups" 76 | 77 | id = Column(Integer, ForeignKey('captcha.id'), primary_key=True) 78 | type_id = Column(Integer, nullable=False) 79 | group = Column(postgresql.ARRAY(Integer)) 80 | captcha = relationship(Captcha) 81 | 82 | def __init__(self, captcha_id, type_id, group = []): 83 | self.id = captcha_id 84 | self.type_id = type_id 85 | self.group = group 86 | 87 | def __str__(self): 88 | return "Captcha Group(id: {}, type: {}, group: {})".format(self.id, self.type_id, self.group) 89 | 90 | #Create the table using the metadata attribute of the base class 91 | Base.metadata.create_all(engine) 92 | 93 | Session = sessionmaker(bind=engine) 94 | session = Session() 95 | -------------------------------------------------------------------------------- /download_new.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import json 3 | import requests 4 | import base64 5 | import re 6 | import PIL 7 | from PIL import Image 8 | from hashlib import md5 9 | import time, datetime 10 | import signal 11 | from utils import translate, get_instructions_hash 12 | import dbmodels 13 | from hash import CaptchaHash, ImageHash 14 | 15 | 16 | class Stats: 17 | def __init__(self): 18 | self.STATS_IDS_CHECKED = 0 19 | self.STATS_NEW_TYPES = 0 20 | self.STATS_NEW_IMAGES = 0 21 | self.STATS_IMAGES_ALREADY_STORED = 0 22 | self.STATS_IMAGES_GROUPED = 0 23 | self.STATS_WRONG_SIZE_IMAGE = 0 24 | self.STATS_WRONG_TEXT = 0 25 | self.STATS_OTHER_ERRORS = 0 26 | self.STATS_SOLVED_CAPTCHAS = 0 27 | self.STATS_FAILED_CAPTCHAS = 0 28 | self.STATS_ERRORS_FIXED = 0 29 | self.STATS_TARGET_IMAGES = 0 30 | self.STATS_NONTARGET_IMAGES = 0 31 | self.start_time = datetime.datetime.now() 32 | self.solved_types = {} 33 | 34 | def new_id(self, count = 1): self.STATS_IDS_CHECKED += count 35 | def new_type(self, count = 1): self.STATS_NEW_TYPES += count 36 | def new_image(self, count = 1): self.STATS_NEW_IMAGES += count 37 | def new_image_already_stored(self, count = 1): self.STATS_IMAGES_ALREADY_STORED += count 38 | def new_image_grouped(self, count = 1): self.STATS_IMAGES_GROUPED += count 39 | def wrong_size(self, count = 1): self.STATS_WRONG_SIZE_IMAGE += count 40 | def wrong_text(self, count = 1): self.STATS_WRONG_TEXT += count 41 | def solved_captcha(self, count = 1): self.STATS_SOLVED_CAPTCHAS += count 42 | def failed_captcha(self, count = 1): self.STATS_FAILED_CAPTCHAS += count 43 | def error_fixed(self, count = 1): self.STATS_ERRORS_FIXED += count 44 | def other_error(self, count = 1): self.STATS_OTHER_ERRORS += count 45 | def target_images(self, count = 1): self.STATS_TARGET_IMAGES += count 46 | def nontarget_images(self, count = 1): self.STATS_NONTARGET_IMAGES += count 47 | def solved_type(self, type): 48 | if type in self.solved_types: 49 | self.solved_types[type] += 1 50 | else: 51 | self.solved_types[type] = 1 52 | 53 | def output(self): 54 | print "\nStatistics:\nStarted: {}\n===========\n".format(self.start_time) 55 | print "IDs Checked: {}\nNew types added: {}\nNew images saved: {}\nNew images alread stored: {}\nNew images grouped: {}\n" \ 56 | "Images with wrong size: {}\nWrong text instructins: {}\nOther errors: {}\nTime elapsed: {}s\n\nSolved captchas: {}\n" \ 57 | "Failed captchas: {}\nErrors fixed: {}\nTarget images: {}\nNon-Target images: {}\n\nSolved types:".format( 58 | self.STATS_IDS_CHECKED, self.STATS_NEW_TYPES, self.STATS_NEW_IMAGES, self.STATS_IMAGES_ALREADY_STORED, self.STATS_IMAGES_GROUPED, 59 | self.STATS_WRONG_SIZE_IMAGE, self.STATS_WRONG_TEXT, self.STATS_OTHER_ERRORS, str(datetime.datetime.now() - self.start_time), 60 | self.STATS_SOLVED_CAPTCHAS, self.STATS_FAILED_CAPTCHAS, self.STATS_ERRORS_FIXED, self.STATS_TARGET_IMAGES, self.STATS_NONTARGET_IMAGES) 61 | all_types = sum(self.solved_types.values()) 62 | for type in self.solved_types: 63 | print "{}: {} ({:.4}%)".format(type, self.solved_types[type], 100. * float(self.solved_types[type]) / float(all_types)) 64 | 65 | 66 | statistics = Stats() 67 | OLDEST_ID_URL = '...' 68 | CAPTCHAS_API_URL = '...' 69 | BAD_INSTRUCTIONS = ['):'] 70 | INSTRUCTION_SUBSTITUTE_PATTERNS = { 71 | 'Verify\r\nReport a problem': '(\.\r\n)?Verify\r\nReport a problem.*', 72 | 'Please select all matching images': '(.\r\n)?Please select all matching images.*', 73 | 'Multiple correct solutions required - please solve more': '(\.\r\n)?Multiple correct solutions required - please solve more.*', 74 | } 75 | CREATE_NEW_TYPES = False 76 | 77 | 78 | 79 | def merge_instructions_hash(instructions, path = 'instructions.txt'): 80 | file_instr = get_instructions_hash(path) 81 | f = open(path, 'a') 82 | for id in set(instructions.keys()) - set(file_instr.keys()): 83 | text = instructions[id] 84 | text = text.replace('\n', '').replace('\r', '') 85 | f.write("{}:{}\n".format(id, text)) 86 | if not os.path.exists('captchas/{}'.format(id)): 87 | os.makedirs('captchas/{}'.format(id)) 88 | f.flush() 89 | f.close() 90 | 91 | def check_text_instruction(text): 92 | global statistics 93 | 94 | if text: 95 | if type(text) == unicode: 96 | text = text.encode('utf8') 97 | if text: 98 | for pattern in INSTRUCTION_SUBSTITUTE_PATTERNS: 99 | if text.find(pattern): 100 | text = re.sub(INSTRUCTION_SUBSTITUTE_PATTERNS[pattern], '', text) 101 | bad_instr = False 102 | for instr in BAD_INSTRUCTIONS: 103 | if text.find(instr) != -1: 104 | bad_instr = True 105 | break 106 | if bad_instr: 107 | statistics.wrong_text() 108 | print "[*] Bad string found" 109 | return None 110 | return translate(text).strip() 111 | return text 112 | 113 | def get_instruction_id(instructions, text): 114 | for id in instructions.keys(): 115 | if instructions[id] == text: 116 | return id 117 | return -1 118 | 119 | def final(): 120 | global statistics 121 | statistics.output() 122 | f = open('logs/'+str(datetime.datetime.now()) + '.txt', 'w') 123 | old_stdout = sys.stdout 124 | sys.stdout = f 125 | statistics.output() 126 | sys.stderr = old_stdout 127 | f.close() 128 | 129 | def exit_handler(signal, frame): 130 | final() 131 | sys.exit(0) 132 | 133 | def match_captcha(images): 134 | matches = {} 135 | for image_hash in images: 136 | matches[image_hash] = None 137 | query = dbmodels.session.query(dbmodels.Captcha).filter_by(md5 = image_hash.md5).filter( 138 | dbmodels.Captcha.popularity > dbmodels.Captcha.failures).order_by(dbmodels.Captcha.popularity.desc()) 139 | if query.count() > 0: 140 | matches[image_hash] = query.first() 141 | continue 142 | 143 | query = dbmodels.session.query(dbmodels.Captcha_Groups).join(dbmodels.Captcha).filter( 144 | dbmodels.Captcha.max1 > image_hash.maxs[0] - 2).filter( 145 | dbmodels.Captcha.max1 < image_hash.maxs[0] + 2).filter( 146 | dbmodels.Captcha.max2 > image_hash.maxs[1] - 2).filter( 147 | dbmodels.Captcha.max2 < image_hash.maxs[1] + 2).filter( 148 | dbmodels.Captcha.min1 > image_hash.mins[0] - 2).filter( 149 | dbmodels.Captcha.min1 < image_hash.mins[0] + 2).filter( 150 | dbmodels.Captcha.min2 > image_hash.mins[1] - 2).filter( 151 | dbmodels.Captcha.min2 < image_hash.mins[1] + 2).filter( 152 | dbmodels.Captcha.popularity > dbmodels.Captcha.failures) 153 | for group in query.all(): 154 | ihash = ImageHash.create_from_db(group.captcha.phash, group.captcha.histogram) 155 | if ImageHash.phash_diff(ihash.phash, image_hash.phash) < 14: 156 | if ImageHash.aver_squad_diff(ihash.histogram[:], image_hash.histogram[:]) < 5: 157 | matches[image_hash] = group.captcha 158 | break 159 | return matches 160 | 161 | def solve_captcha(instr_id, images): 162 | solution = [] 163 | for image_hash in images: 164 | query = dbmodels.session.query(dbmodels.Captcha).filter_by(md5 = image_hash.md5, type_id = instr_id).filter( 165 | dbmodels.Captcha.popularity > dbmodels.Captcha.failures) 166 | if query.count() > 0: 167 | solution.append(images.index(image_hash)) 168 | continue 169 | 170 | query = dbmodels.session.query(dbmodels.Captcha_Groups).join(dbmodels.Captcha).filter( 171 | dbmodels.Captcha.type_id == instr_id).filter( 172 | dbmodels.Captcha.max1 > image_hash.maxs[0] - 2).filter( 173 | dbmodels.Captcha.max1 < image_hash.maxs[0] + 2).filter( 174 | dbmodels.Captcha.max2 > image_hash.maxs[1] - 2).filter( 175 | dbmodels.Captcha.max2 < image_hash.maxs[1] + 2).filter( 176 | dbmodels.Captcha.min1 > image_hash.mins[0] - 2).filter( 177 | dbmodels.Captcha.min1 < image_hash.mins[0] + 2).filter( 178 | dbmodels.Captcha.min2 > image_hash.mins[1] - 2).filter( 179 | dbmodels.Captcha.min2 < image_hash.mins[1] + 2).filter( 180 | dbmodels.Captcha.popularity > dbmodels.Captcha.failures) 181 | for group in query.all(): 182 | ihash = ImageHash.create_from_db(group.captcha.phash, group.captcha.histogram) 183 | if ImageHash.phash_diff(ihash.phash, image_hash.phash) < 14: 184 | if ImageHash.aver_squad_diff(ihash.histogram[:], image_hash.histogram[:]) < 5: 185 | solution.append(images.index(image_hash)) 186 | break 187 | return solution 188 | 189 | 190 | if __name__ == "__main__": 191 | # temporary file to figure out what the hell is going on with crontab 192 | with open('runs.txt', 'a') as f: 193 | f.write(str(datetime.datetime.now()) + "\n") 194 | 195 | lastidfile = 'lastid.txt' 196 | current_id = 0 197 | if not os.path.exists(lastidfile): 198 | print "[!] No {lastidfile} file here" 199 | else: 200 | with open(lastidfile) as f: 201 | current_id = int(f.read().strip()) 202 | 203 | signal.signal(signal.SIGINT, exit_handler) 204 | signal.signal(signal.SIGQUIT, exit_handler) 205 | 206 | instructions_hash = get_instructions_hash() 207 | 208 | current_id = 0 209 | req = requests.get(OLDEST_ID_URL) 210 | if req.ok: 211 | id = json.loads(req.text)['id'] 212 | if current_id == 0: 213 | current_id = id 214 | elif current_id == id: 215 | print "[*] No ids to check" 216 | exit(0) 217 | 218 | try: 219 | while current_id: 220 | try: 221 | req = requests.get(CAPTCHAS_API_URL.format(current_id)) 222 | except: pass 223 | if req and req.ok: 224 | data = json.loads(req.text) 225 | if type(data) is dict: 226 | ids = sorted(map(lambda x: int(x), data.keys())) 227 | for id in ids: 228 | statistics.new_id() 229 | current_id = str(id) 230 | captcha_data = data[str(id)] 231 | text_instructions = check_text_instruction(captcha_data['textinstructions']) 232 | if text_instructions: 233 | image_base64 = captcha_data['image'] 234 | if len(image_base64) >= 100 and image_base64.find(',') > 0: 235 | image_bytes = base64.b64decode(image_base64.split(',')[1]) 236 | 237 | #ToDo: only one thread possible there! 238 | with open('temp/temp.jpeg', 'wb') as f: 239 | f.write(image_bytes) 240 | try: 241 | img = Image.open('temp/temp.jpeg') 242 | except: 243 | statistics.other_error() 244 | 245 | if img and (img.size == (300, 300) or img.size == (400, 400)): 246 | #Merge instructions there in order to avoid good instructions without a picture 247 | instr_id = -1 248 | query = dbmodels.session.query(dbmodels.Types).filter_by(text = text_instructions) 249 | if query.count() == 0: 250 | if CREATE_NEW_TYPES: 251 | statistics.new_type() 252 | print "[*] [{}] New captcha type: {}".format(current_id, text_instructions) 253 | new_type = dbmodels.Types(text_instructions) 254 | dbmodels.session.add(new_type) 255 | dbmodels.session.commit() 256 | instr_id = new_type.id 257 | else: 258 | print "[!] [{}] Creation of new types is disabled".format(current_id) 259 | continue 260 | else: 261 | instr_id = query.first().id 262 | 263 | captcha_answer = captcha_data['code'] 264 | if captcha_answer.find('click') >= 0: 265 | indexes = re.findall('\d+', captcha_answer) 266 | indexes = map(lambda x: int(x) - 1, indexes) 267 | if indexes: 268 | images = [] 269 | for index in range(img.size[0] * img.size[1] / 10000): 270 | try: 271 | width = (index % (img.size[0] / 100)) * 100 272 | height = (index / (img.size[0] / 100)) * 100 273 | cropped = img.crop((width, height, width+100, height+100)) 274 | images.append(CaptchaHash(cropped)) 275 | except: 276 | statistics.other_error() 277 | 278 | matches = match_captcha(images) 279 | matches = {images.index(x): matches[x] for x in matches.keys()} 280 | solution = [x for x in matches.keys() if matches[x] and matches[x].type_id == instr_id] 281 | print "Matched {} of {} ({:.4}%)".format(len(solution), len(matches), float(100 * len(solution))/ float(len(matches))) 282 | for match in matches: 283 | if matches[match]: 284 | print "{}: type id {}".format(match, matches[match].type_id) 285 | else: 286 | print "{}: didnt matched to any".format(match) 287 | statistics.target_images(len([x for x in matches.keys() if matches[x] and matches[x].type_id == instr_id])) 288 | statistics.nontarget_images(len([x for x in matches.keys() if matches[x] and matches[x].type_id != instr_id])) 289 | 290 | solution = solve_captcha(instr_id, images) 291 | 292 | # prediction = (success, failed, overall) 293 | prediction = (len(set(solution).intersection(set(indexes))), 294 | len(set(solution) ^ set(indexes)), 295 | len(set(solution).union(set(indexes)))) 296 | 297 | if prediction[0] > prediction[1]: 298 | print "[+] Solved captcha (type: {}) ({:.4}%): {} - {}".format(instr_id, 299 | 100 * float(prediction[0])/float(prediction[2]), indexes, solution) 300 | statistics.solved_captcha() 301 | statistics.solved_type(text_instructions) 302 | else: 303 | print "[+] Failed captcha (type: {}) ({:.4}%): {} - {}".format(instr_id, 304 | 100 * float(prediction[0])/float(prediction[2]), indexes, solution) 305 | statistics.failed_captcha() 306 | 307 | for index in range(img.size[0] * img.size[1] / 10000): 308 | try: 309 | image_hash = images[index] 310 | query = dbmodels.session.query(dbmodels.Captcha).filter_by( 311 | md5 = image_hash.md5, type_id = instr_id) 312 | if query.count(): 313 | if index in indexes: 314 | statistics.new_image_already_stored() 315 | print "[!] [{}] The image is already storing: {}".format(current_id, image_hash.md5) 316 | query.first().popularity += 1 317 | dbmodels.session.commit() 318 | else: 319 | statistics.error_fixed() 320 | print "[+] [{}] Image stores with incorrect type: {}".format(current_id, image_hash.md5) 321 | query.first().failures += 1 322 | dbmodels.session.commit() 323 | elif index in solution and not index in indexes: 324 | # Counter-captcha is an image which purpose is to prevent future false matches of 325 | # images that were marked as solution by grouping feature. 326 | # Thus counter-captcha should have popularity = 0 and failures = 1 (>0) 327 | print "[+] [{}] Anti-captcha image: {}".format(current_id, image_hash.md5) 328 | captcha = dbmodels.Captcha(instr_id, image_hash.md5, str(image_hash.phash), 329 | image_hash.histogram, image_hash.mins, image_hash.maxs, 0, 1) 330 | dbmodels.session.add(captcha) 331 | dbmodels.session.commit() 332 | elif index in indexes: 333 | statistics.new_image() 334 | 335 | print "[+] [{}] Saved image: {}".format(current_id, image_hash.md5) 336 | captcha = dbmodels.Captcha(instr_id, image_hash.md5, str(image_hash.phash), 337 | image_hash.histogram, image_hash.mins, image_hash.maxs) 338 | dbmodels.session.add(captcha) 339 | dbmodels.session.commit() 340 | 341 | matched = False 342 | query = dbmodels.session.query(dbmodels.Captcha_Groups).join(dbmodels.Captcha).filter( 343 | dbmodels.Captcha.type_id == captcha.type_id).filter( 344 | dbmodels.Captcha.max1 > image_hash.maxs[0] - 2).filter( 345 | dbmodels.Captcha.max1 < image_hash.maxs[0] + 2).filter( 346 | dbmodels.Captcha.max2 > image_hash.maxs[1] - 2).filter( 347 | dbmodels.Captcha.max2 < image_hash.maxs[1] + 2).filter( 348 | dbmodels.Captcha.min1 > image_hash.mins[0] - 2).filter( 349 | dbmodels.Captcha.min1 < image_hash.mins[0] + 2).filter( 350 | dbmodels.Captcha.min2 > image_hash.mins[1] - 2).filter( dbmodels.Captcha.min2 < image_hash.mins[1] + 2) 351 | for group in query.all(): 352 | ihash = ImageHash.create_from_db(group.captcha.phash, group.captcha.histogram) 353 | if ImageHash.phash_diff(ihash.phash, image_hash.phash) < 14: 354 | if ImageHash.aver_squad_diff(ihash.histogram[:], image_hash.histogram[:]) < 5: 355 | matched = True 356 | group.group = group.group + [captcha.id] 357 | dbmodels.session.commit() 358 | print "[+] [{}] Matched to captcha id {}".format(current_id, group.id) 359 | statistics.new_image_grouped() 360 | 361 | break 362 | 363 | if not matched: 364 | group = dbmodels.Captcha_Groups(captcha.id, captcha.type_id) 365 | dbmodels.session.add(group) 366 | dbmodels.session.commit() 367 | except: 368 | statistics.other_error() 369 | else: 370 | print "[!][{}] No indexes in code: {}".format(current_id, captcha_answer) 371 | else: 372 | print "[!][{}] Bad captcha answer: {}".format(current_id, captcha_answer) 373 | else: 374 | statistics.wrong_size() 375 | print "[!][{}] Non standart image size: {}".format(current_id, img.size) 376 | else: 377 | statistics.other_error() 378 | print "[!][{}] Too short image base64: {}".format(current_id, len(image_base64)) 379 | else: 380 | print "[!][{}] No text instructions".format(current_id) 381 | 382 | with open(lastidfile, 'w') as f: 383 | f.write(str(max(ids))) 384 | else: 385 | print "[!] Got not dictionary: {}".format(type(data)) 386 | break 387 | except: 388 | print "[!] Very bad exception!" 389 | 390 | final() 391 | -------------------------------------------------------------------------------- /extremums.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajaxtpm/reCaptcha/68117fbf1be1f7a1692220a837954c44e31e234a/extremums.ods -------------------------------------------------------------------------------- /hash.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import imagehash 3 | from PIL import Image 4 | from hashlib import md5 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import math 8 | import operator 9 | import time, datetime 10 | import utils 11 | from scipy.signal import argrelextrema 12 | import dbmodels 13 | 14 | class ImageHash: 15 | def __init__(self, phash, histogram): 16 | self.phash = phash 17 | self.histogram = histogram 18 | 19 | @staticmethod 20 | def create_from_db(phash, histogram): 21 | return ImageHash(phash, np.array(histogram)) 22 | 23 | @staticmethod 24 | def aver_squad_diff(one, two): 25 | if len(one) == len(two): 26 | return math.sqrt(reduce(operator.add, list(map(lambda a,b: (a-b)**2, one, two))) / len(one)) 27 | return -1 28 | 29 | @staticmethod 30 | def phash_diff(one, two): 31 | return bin(int(one, 16) ^ int(two, 16)).count('1') 32 | 33 | 34 | class CaptchaHash(ImageHash): 35 | def __init__(self, path): 36 | t = path 37 | if type and path.__class__ == str: 38 | t = Image.open(path) 39 | phash = imagehash.phash(t, 8) 40 | histogram = np.array(t.convert('L').histogram()) 41 | self.md5 = md5(t.tostring()).hexdigest() 42 | 43 | self.phash = str(phash) 44 | self.histogram = utils.smooth(histogram, 100) 45 | 46 | self.mins = argrelextrema(self.histogram, np.less)[0] 47 | self.maxs = argrelextrema(self.histogram, np.greater)[0] 48 | self.histogram = np.array(map(lambda x: int(x), self.histogram)) 49 | 50 | if len(self.mins) < 2: self.mins = np.append(self.mins, [1000] * (2-len(self.mins)) ) 51 | if len(self.maxs) < 2: self.maxs = np.append(self.maxs, [1000] * (2-len(self.maxs)) ) 52 | 53 | def __str__(self): 54 | return '{}, <{}, {}>'.format(self.phash, self.mins, self.maxs) 55 | -------------------------------------------------------------------------------- /solved.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajaxtpm/reCaptcha/68117fbf1be1f7a1692220a837954c44e31e234a/solved.ods -------------------------------------------------------------------------------- /stat.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajaxtpm/reCaptcha/68117fbf1be1f7a1692220a837954c44e31e234a/stat.ods -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import numpy 4 | 5 | def get_instructions_hash(path = 'instructions.txt'): 6 | instructions = {} 7 | f = open(path, 'r') 8 | for line in f.read().split('\n'): 9 | if line: 10 | (id, text) = line.split(':', 1) 11 | id = int(id) 12 | instructions[id] = text 13 | f.close() 14 | return instructions 15 | 16 | EXTRACTION_PATTERNS = [ 17 | 'ENG: *([^-.]+)', 18 | ' / (.+?)\.?$', 19 | 'all (?:the )?(?:images |pictures |squares )?(?:with |are |of )?(?:a |the |an )?([^-.]+)', 20 | 'все (?:изображения|квадраты),? (?:на которых |где |с |со )(?:есть )?([^-.]+)' 21 | ] 22 | 23 | def translate(text): 24 | res = text 25 | for pattern in EXTRACTION_PATTERNS: 26 | match = re.search(pattern, res) 27 | if match and match.group(0): 28 | res = match.group(1) 29 | return re.sub('_', ' ', res) 30 | 31 | 32 | def smooth(x,window_len=11,window='hanning'): 33 | if x.ndim != 1: 34 | raise ValueError, "smooth only accepts 1 dimension arrays." 35 | if x.size < window_len: 36 | raise ValueError, "Input vector needs to be bigger than window size." 37 | 38 | if window_len<3: 39 | return x 40 | 41 | if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']: 42 | raise ValueError, "Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'" 43 | s=numpy.r_[x[window_len-1:0:-1],x,x[-1:-window_len:-1]] 44 | #print(len(s)) 45 | if window == 'flat': #moving average 46 | w=numpy.ones(window_len,'d') 47 | else: 48 | w=eval('numpy.'+window+'(window_len)') 49 | 50 | y=numpy.convolve(w/w.sum(),s,mode='valid') 51 | return y 52 | 53 | if __name__ == '__main__': 54 | f = open('instructions.txt') 55 | lines = [] 56 | for line in f.read().split('\n'): 57 | if line: 58 | (id, t) = line.split(':', 1) 59 | lines.append(t) 60 | 61 | for line in lines: 62 | print "Was <{}> become <{}>".format(line, translate(line)) 63 | --------------------------------------------------------------------------------