├── .idea ├── LeetCodeSpider.iml ├── leetcode-spider.iml ├── misc.xml ├── modules.xml └── vcs.xml ├── DuplicateFilter.py ├── Extractor.py ├── README.md ├── example.py ├── screenshot ├── csv.png └── excel.png └── templates └── duplicate.html /.idea/LeetCodeSpider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/leetcode-spider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 18 | 19 | 20 | 22 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /DuplicateFilter.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, make_response, request, redirect 2 | import sqlite3 3 | 4 | app = Flask(__name__) 5 | 6 | 7 | def dict_factory(cursor, row): 8 | d = {} 9 | for idx, col in enumerate(cursor.description): 10 | d[col[0]] = row[idx] 11 | return d 12 | 13 | 14 | @app.route('/') 15 | def main(): 16 | conn = sqlite3.connect('leetcode.db') 17 | conn.row_factory = dict_factory 18 | c = conn.cursor() 19 | c.execute( 20 | ''' 21 | SELECT lang, title, url, path 22 | FROM submission a 23 | WHERE EXISTS( 24 | SELECT 1 25 | FROM submission b 26 | WHERE b.downloaded=1 AND b.removed=0 AND a.lang=b.lang AND a.title=b.title 27 | GROUP BY lang, title 28 | HAVING COUNT(lang)>1) 29 | ORDER BY lang, title 30 | ''') 31 | problems = c.fetchall() 32 | conn.close() 33 | return render_template('duplicate.html', problems=problems) 34 | 35 | 36 | @app.route('/view/') 37 | def view(path=None): 38 | with open(path, 'r', encoding='utf-8') as f: 39 | content = f.read() 40 | response = make_response(content) 41 | response.headers['content-type'] = 'text/plain' 42 | return response 43 | 44 | 45 | @app.route('/remove', methods=['POST']) 46 | def remove(): 47 | url = request.form['url'] 48 | conn = sqlite3.connect('leetcode.db') 49 | c = conn.cursor() 50 | c.execute('UPDATE submission SET removed=1 WHERE url=?', (url,)) 51 | conn.commit() 52 | conn.close() 53 | return redirect('/') 54 | 55 | 56 | if __name__ == '__main__': 57 | app.run(debug=True) 58 | -------------------------------------------------------------------------------- /Extractor.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import urllib.parse 3 | import http.cookiejar 4 | import json 5 | from lxml import etree 6 | import os 7 | import concurrent.futures 8 | import re 9 | import sqlite3 10 | import codecs 11 | import shutil 12 | import os.path 13 | 14 | 15 | def dict_factory(cursor, row): 16 | d = {} 17 | for idx, col in enumerate(cursor.description): 18 | d[col[0]] = row[idx] 19 | return d 20 | 21 | 22 | class Extractor: 23 | def __init__(self): 24 | self.base_url = 'https://leetcode.com' 25 | self.db_name = 'leetcode.db' 26 | cj = http.cookiejar.CookieJar() 27 | self.opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj)) 28 | self.opener.addheaders = [ 29 | ('Host', 'leetcode.com'), 30 | ('User-Agent', 31 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36') 32 | ] 33 | self.is_logged_in = False 34 | 35 | def login(self, user_name, password): 36 | if self.is_logged_in: 37 | return 38 | url = self.base_url + '/accounts/login/' 39 | with self.opener.open(url) as f: 40 | content = f.read().decode('utf-8') 41 | token = re.findall("name='csrfmiddlewaretoken'\svalue='(.*?)'", content)[0] 42 | post_data = { 43 | 'csrfmiddlewaretoken': token, 44 | 'login': user_name, 45 | 'password': password 46 | } 47 | post_data = urllib.parse.urlencode(post_data) 48 | self.opener.addheaders.append(('Referer', url)) 49 | with self.opener.open(url, data=post_data.encode()) as f: 50 | if f.read().decode().find('Successfully signed in') != -1: 51 | self.is_logged_in = True 52 | print('logged in') 53 | else: 54 | print('failed to login in') 55 | self.opener.addheaders.pop() 56 | 57 | def get_problem_list(self): 58 | with self.opener.open(self.base_url + '/api/problems/algorithms/') as f: 59 | content = f.read().decode('utf-8') 60 | content = json.loads(content) 61 | return content['stat_status_pairs'] 62 | 63 | def store_problem_list_to_db(self, problem_list): 64 | conn = sqlite3.connect(self.db_name) 65 | c = conn.cursor() 66 | c.execute( 67 | ''' 68 | CREATE TABLE IF NOT EXISTS problem ( 69 | id INTEGER, 70 | title TEXT, 71 | slug TEXT, 72 | difficulty INTEGER, 73 | paid_only INTEGER, 74 | status TEXT, 75 | total_acs INTEGER, 76 | total_submitted INTEGER, 77 | PRIMARY KEY(id)) 78 | ''') 79 | c.execute('DELETE FROM problem') 80 | for problem in problem_list: 81 | c.execute( 82 | ''' 83 | INSERT INTO problem 84 | (id, title, slug, difficulty, paid_only, status, total_acs, total_submitted) 85 | VALUES 86 | (?, ?, ?, ?, ?, ?, ?, ?) 87 | ''', 88 | (problem['stat']['question_id'] 89 | , problem['stat']['question__title'] 90 | , problem['stat']['question__title_slug'] 91 | , problem['difficulty']['level'] 92 | , 1 if problem['paid_only'] else 0 93 | , problem['status'] 94 | , problem['stat']['total_acs'] 95 | , problem['stat']['total_submitted']) 96 | ) 97 | conn.commit() 98 | conn.close() 99 | 100 | def update_problem_list(self): 101 | self.store_problem_list_to_db(self.get_problem_list()) 102 | 103 | def get_problem_list_from_db(self): 104 | conn = sqlite3.connect(self.db_name) 105 | conn.row_factory = dict_factory 106 | c = conn.cursor() 107 | c.execute('SELECT * FROM problem') 108 | return c.fetchall() 109 | 110 | def get_question_detail(self, title_slug): 111 | QUERY = '''query getQuestionDetail($titleSlug: String!) { 112 | isCurrentUserAuthenticated 113 | question(titleSlug: $titleSlug) { 114 | questionId 115 | questionFrontendId 116 | questionTitle 117 | translatedTitle 118 | questionTitleSlug 119 | content 120 | translatedContent 121 | difficulty 122 | stats 123 | contributors 124 | similarQuestions 125 | discussUrl 126 | mysqlSchemas 127 | randomQuestionUrl 128 | sessionId 129 | categoryTitle 130 | submitUrl 131 | interpretUrl 132 | codeDefinition 133 | sampleTestCase 134 | enableTestMode 135 | metaData 136 | enableRunCode 137 | enableSubmit 138 | judgerAvailable 139 | infoVerified 140 | envInfo 141 | urlManager 142 | article 143 | questionDetailUrl 144 | discussCategoryId 145 | discussSolutionCategoryId 146 | libraryUrl 147 | companyTags { 148 | name 149 | slug 150 | translatedName 151 | } 152 | topicTags { 153 | name 154 | slug 155 | translatedName 156 | } 157 | } 158 | interviewed { 159 | interviewedUrl 160 | companies { 161 | id 162 | name 163 | slug 164 | } 165 | timeOptions { 166 | id 167 | name 168 | } 169 | stageOptions { 170 | id 171 | name 172 | } 173 | } 174 | subscribeUrl 175 | isPremium 176 | loginUrl 177 | }''' 178 | params = { 179 | 'query': QUERY, 180 | 'operationName': 'getQuestionDetail', 181 | 'variables': json.dumps({ 182 | 'titleSlug': title_slug 183 | }) 184 | } 185 | url = self.base_url + '/graphql' + '?' + urllib.parse.urlencode(params, quote_via=urllib.parse.quote).replace( 186 | '%28', '(').replace('%29', ')').replace('%21', '!') 187 | with self.opener.open(url) as f: 188 | content = f.read().decode('utf-8') 189 | content=json.loads(content) 190 | return content['data']['question'] 191 | 192 | def get_description(self, url, file_path): 193 | with self.opener.open(url) as f: 194 | content = f.read().decode('utf-8') 195 | root = etree.HTML(content) 196 | result = root.xpath('//*[@id="descriptionContent"]//div[@class="question-description"]') 197 | html = etree.tostring(result[0], encoding='utf-8') 198 | with open(file_path, 'wb') as f: 199 | f.write(html) 200 | return file_path 201 | 202 | def extract_descriptions(self): 203 | conn = sqlite3.connect(self.db_name) 204 | c = conn.cursor() 205 | c.execute('CREATE TABLE IF NOT EXISTS description (title TEXT, path TEXT, PRIMARY KEY(title))') 206 | c.execute( 207 | ''' 208 | SELECT a.id, a.title, a.slug 209 | FROM problem a 210 | LEFT JOIN description b 211 | ON a.title=b.title 212 | WHERE a.paid_only=0 AND b.title IS NULL 213 | ''') 214 | problems = c.fetchall() 215 | dir_path = 'descriptions/' 216 | os.makedirs(dir_path, exist_ok=True) 217 | with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: 218 | futures = { 219 | executor.submit(self.get_description 220 | , self.base_url + '/problems/' + problem[2] + '/description/' 221 | , os.path.join(dir_path, str(problem[0]).zfill(3) + '. ' + problem[1] + '.html')): 222 | problem[1] for problem in problems} 223 | for future in concurrent.futures.as_completed(futures): 224 | title = futures[future] 225 | try: 226 | file_path = future.result() 227 | except Exception as e: 228 | print('%r generated an exception: %s' % (title, e)) 229 | else: 230 | if file_path: 231 | c.execute('INSERT INTO description (title, path) VALUES (?, ?)', (title, file_path)) 232 | conn.commit() 233 | conn.close() 234 | 235 | def sync_description_db_and_file(self): 236 | conn = sqlite3.connect(self.db_name) 237 | conn.row_factory = dict_factory 238 | c = conn.cursor() 239 | c.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="description"') 240 | if not c.fetchone(): 241 | return 242 | c.execute('SELECT title,path FROM description') 243 | descriptions = c.fetchall() 244 | for description in descriptions: 245 | if not os.path.exists(description['path']): 246 | c.execute('DELETE FROM description WHERE title=?', (description['title'],)) 247 | conn.commit() 248 | conn.close() 249 | 250 | def update_descriptions(self): 251 | self.sync_description_db_and_file() 252 | self.extract_descriptions() 253 | 254 | def get_submission_list(self): 255 | if not self.is_logged_in: 256 | print('should login first') 257 | return 258 | result = [] 259 | offset = 0 260 | LIMIT = 100 261 | while True: 262 | url = self.base_url + '/api/submissions/?offset=' + str(offset) + '&limit=' + str(LIMIT) 263 | with self.opener.open(url) as f: 264 | content = f.read().decode('utf-8') 265 | content = json.loads(content) 266 | result.extend(content['submissions_dump']) 267 | if not content['has_next']: 268 | return result 269 | offset += LIMIT 270 | 271 | def store_submission_list_to_db(self, submission_list): 272 | conn = sqlite3.connect(self.db_name) 273 | c = conn.cursor() 274 | c.execute( 275 | ''' 276 | CREATE TABLE IF NOT EXISTS submission ( 277 | lang TEXT, 278 | title TEXT, 279 | url TEXT, 280 | downloaded INTEGER DEFAULT 0, 281 | path TEXT, 282 | removed INTEGER DEFAULT 0, 283 | PRIMARY KEY(url)) 284 | ''') 285 | for submission in submission_list: 286 | if submission['status_display'] == 'Accepted': 287 | c.execute('INSERT OR IGNORE INTO submission (lang, title, url) VALUES (?, ?, ?)' 288 | , (submission['lang'], submission['title'], submission['url'])) 289 | conn.commit() 290 | conn.close() 291 | 292 | def update_submission_list(self): 293 | self.store_submission_list_to_db(self.get_submission_list()) 294 | 295 | def get_submission(self, url, file_path): 296 | with self.opener.open(url) as f: 297 | content = f.read().decode('utf-8') 298 | code = re.findall("submissionCode:\s'(.*?)',", content)[0] 299 | code = codecs.decode(code, 'unicode-escape') 300 | code = code.replace('\r\n', '\n') 301 | with open(file_path, 'w', encoding='utf-8') as f: 302 | f.write(code) 303 | return file_path 304 | 305 | def extract_submissions(self): 306 | conn = sqlite3.connect(self.db_name) 307 | c = conn.cursor() 308 | c.execute('SELECT url FROM submission WHERE downloaded=0 AND removed=0') 309 | urls = c.fetchall() 310 | urls = [url[0] for url in urls] 311 | dir_path = 'submissions/' 312 | os.makedirs(dir_path, exist_ok=True) 313 | with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: 314 | futures = {executor.submit(self.get_submission, self.base_url + url, 315 | os.path.join(dir_path, url.split('/')[-2])): url for url in urls} 316 | for future in concurrent.futures.as_completed(futures): 317 | url = futures[future] 318 | try: 319 | file_path = future.result() 320 | except Exception as e: 321 | print('%r generated an exception: %s' % (url, e)) 322 | else: 323 | if file_path: 324 | c.execute('UPDATE submission SET downloaded=1,path=? WHERE url=?', (file_path, url)) 325 | conn.commit() 326 | conn.close() 327 | 328 | def sync_submission_db_and_file(self): 329 | conn = sqlite3.connect(self.db_name) 330 | conn.row_factory = dict_factory 331 | c = conn.cursor() 332 | c.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="submission"') 333 | if not c.fetchone(): 334 | return 335 | c.execute('SELECT url,path FROM submission') 336 | submissions = c.fetchall() 337 | for submission in submissions: 338 | if not os.path.exists(submission['path']): 339 | c.execute('DELETE FROM submission WHERE url=?', (submission['url'],)) 340 | conn.commit() 341 | conn.close() 342 | 343 | def update_submissions(self): 344 | self.sync_submission_db_and_file() 345 | self.update_submission_list() 346 | self.extract_submissions() 347 | 348 | def output_submissions(self, dir_path='out_submissions/', latest_only=True): 349 | def lang_to_language(lang): 350 | if lang == 'python' or lang == 'python3': 351 | return 'Python' 352 | if lang == 'java': 353 | return 'Java' 354 | if lang == 'cpp': 355 | return 'C++' 356 | 357 | def lang_to_extension(lang): 358 | if lang == 'python' or lang == 'python3': 359 | return '.py' 360 | if lang == 'java': 361 | return '.java' 362 | if lang == 'cpp': 363 | return '.cpp' 364 | 365 | os.makedirs(dir_path, exist_ok=True) 366 | conn = sqlite3.connect(self.db_name) 367 | c = conn.cursor() 368 | c.execute( 369 | ''' 370 | SELECT problem.id, submission.title 371 | FROM submission 372 | LEFT JOIN problem 373 | ON submission.title=problem.title 374 | WHERE submission.downloaded=1 AND submission.removed=0 375 | GROUP BY submission.title 376 | ''') 377 | titles = c.fetchall() 378 | for id, title in titles: 379 | if not id: 380 | continue 381 | problem_dir = os.path.join(dir_path, str(id).zfill(3) + '. ' + title) 382 | os.makedirs(problem_dir, exist_ok=True) 383 | c.execute('SELECT lang FROM submission WHERE downloaded=1 AND removed=0 AND title=?', (title,)) 384 | langs = c.fetchall() 385 | langs = [lang[0] for lang in langs] 386 | for lang in langs: 387 | current_dir = os.path.join(problem_dir, lang_to_language(lang)) 388 | os.makedirs(current_dir, exist_ok=True) 389 | c.execute( 390 | 'SELECT path FROM submission WHERE downloaded=1 AND removed=0 AND title=? AND lang=? ORDER BY url', 391 | (title, lang)) 392 | orig_file_paths = c.fetchall() 393 | orig_file_paths = [orig_file_path[0] for orig_file_path in orig_file_paths] 394 | shutil.copyfile(orig_file_paths[0], os.path.join(current_dir, 'Solution' + lang_to_extension(lang))) 395 | if latest_only: 396 | continue 397 | for i in range(1, len(orig_file_paths)): 398 | shutil.copyfile(orig_file_paths[0], 399 | os.path.join(current_dir, 'Solution ' + 'I' * (i + 1) + lang_to_extension(lang))) 400 | 401 | conn.close() 402 | 403 | def save_problem_list(self, file_name, file_type='csv', language='Chinese'): 404 | def preprocess(problem_list): 405 | for problem in problem_list: 406 | problem['acceptance'] = problem['total_acs'] / problem['total_submitted'] 407 | problem['status'] = problem['status'] == 'ac' 408 | 409 | def to_locale(problem_list, language_dict): 410 | problem_list = [{language_dict[key]: value for (key, value) in problem.items()} for problem in problem_list] 411 | 412 | for problem in problem_list: 413 | problem[language_dict['difficulty']] = language_dict['level'][problem[language_dict['difficulty']]] 414 | problem[language_dict['paid_only']] = language_dict['bool'][problem[language_dict['paid_only']]] 415 | problem[language_dict['status']] = language_dict['bool'][problem[language_dict['status']]] 416 | return problem_list 417 | 418 | problem_list = self.get_problem_list_from_db() 419 | preprocess(problem_list) 420 | language_dict = self.get_language_dict(language) 421 | problem_list = to_locale(problem_list, language_dict) 422 | if file_type == 'csv': 423 | self.save_problem_list_as_csv(problem_list, file_name) 424 | elif file_type == 'excel': 425 | self.save_problem_list_as_excel(problem_list, file_name, language_dict) 426 | 427 | def save_problem_list_as_csv(self, problem_list, file_name): 428 | with open(file_name, 'w', encoding='utf-8', newline='') as f: 429 | import csv 430 | writer = csv.DictWriter(f, fieldnames=problem_list[0].keys()) 431 | writer.writeheader() 432 | writer.writerows(problem_list) 433 | 434 | def save_problem_list_as_excel(self, problem_list, file_name, language_dict): 435 | from openpyxl import Workbook 436 | from openpyxl.styles import NamedStyle 437 | from openpyxl.formatting.rule import CellIsRule, DataBarRule 438 | from openpyxl.styles import PatternFill 439 | 440 | def format_cell_style(ws, language_dict): 441 | style_int = NamedStyle('int') 442 | style_int.number_format = '0' 443 | style_str = NamedStyle('str') 444 | style_str.number_format = '@' 445 | style_pcnt = NamedStyle('pcnt') 446 | style_pcnt.number_format = '0.0%' 447 | for cell in ws[column_index[language_dict['id']]][1:]: 448 | cell.style = style_int 449 | for cell in ws[column_index[language_dict['total_submitted']]][1:]: 450 | cell.style = style_int 451 | for cell in ws[column_index[language_dict['total_acs']]][1:]: 452 | cell.style = style_int 453 | for cell in ws[column_index[language_dict['title']]][1:]: 454 | cell.style = style_str 455 | for cell in ws[column_index[language_dict['slug']]][1:]: 456 | cell.style = style_str 457 | for cell in ws[column_index[language_dict['difficulty']]][1:]: 458 | cell.style = style_str 459 | for cell in ws[column_index[language_dict['paid_only']]][1:]: 460 | cell.style = style_str 461 | for cell in ws[column_index[language_dict['status']]][1:]: 462 | cell.style = style_str 463 | for cell in ws[column_index[language_dict['acceptance']]][1:]: 464 | cell.style = style_pcnt 465 | 466 | def conditional_formatting(ws, language_dict): 467 | def get_entire_column(index): 468 | return index + '1:' + index + '1048576' 469 | 470 | red_color = 'ffc7ce' 471 | green_color = 'c2efcf' 472 | yellow_color = 'ffeba2' 473 | 474 | red_fill = PatternFill(start_color=red_color, end_color=red_color, fill_type='solid') 475 | green_fill = PatternFill(start_color=green_color, end_color=green_color, fill_type='solid') 476 | yellow_fill = PatternFill(start_color=yellow_color, end_color=yellow_color, fill_type='solid') 477 | 478 | ws.conditional_formatting.add(get_entire_column(column_index[language_dict['difficulty']]), 479 | CellIsRule(operator='equal', formula=['"' + language_dict['level'][1] + '"'], 480 | stopIfTrue=False, fill=green_fill)) 481 | ws.conditional_formatting.add(get_entire_column(column_index[language_dict['difficulty']]), 482 | CellIsRule(operator='equal', formula=['"' + language_dict['level'][2] + '"'], 483 | stopIfTrue=False, fill=yellow_fill)) 484 | ws.conditional_formatting.add(get_entire_column(column_index[language_dict['difficulty']]), 485 | CellIsRule(operator='equal', formula=['"' + language_dict['level'][3] + '"'], 486 | stopIfTrue=False, fill=red_fill)) 487 | 488 | ws.conditional_formatting.add(get_entire_column(column_index[language_dict['paid_only']]), 489 | CellIsRule(operator='equal', 490 | formula=['"' + language_dict['bool'][False] + '"'], 491 | stopIfTrue=False, fill=green_fill)) 492 | ws.conditional_formatting.add(get_entire_column(column_index[language_dict['paid_only']]), 493 | CellIsRule(operator='equal', 494 | formula=['"' + language_dict['bool'][True] + '"'], 495 | stopIfTrue=False, fill=red_fill)) 496 | 497 | ws.conditional_formatting.add(get_entire_column(column_index[language_dict['status']]), 498 | CellIsRule(operator='equal', 499 | formula=['"' + language_dict['bool'][False] + '"'], 500 | stopIfTrue=False, fill=red_fill)) 501 | ws.conditional_formatting.add(get_entire_column(column_index[language_dict['status']]), 502 | CellIsRule(operator='equal', 503 | formula=['"' + language_dict['bool'][True] + '"'], 504 | stopIfTrue=False, fill=green_fill)) 505 | 506 | ws.conditional_formatting.add(get_entire_column(column_index[language_dict['acceptance']]), 507 | DataBarRule(start_type='percentile', start_value=0, end_type='percentile', 508 | end_value=100, color="FF638EC6", showValue='None')) 509 | 510 | wb = Workbook() 511 | ws = wb.active 512 | ws.append(tuple(problem_list[0].keys())) 513 | column_index = {item.value: item.column for item in ws[1]} 514 | rows = [{column_index[key]: value for (key, value) in problem.items()} for problem in problem_list] 515 | for row in rows: 516 | ws.append(row) 517 | format_cell_style(ws, language_dict) 518 | conditional_formatting(ws, language_dict) 519 | wb.save(file_name) 520 | 521 | def get_language_dict(self, language): 522 | language_dict = None 523 | if language == 'Chinese': 524 | language_dict = { 525 | 'id': '题号', 526 | 'title': '标题', 527 | 'slug': '链接', 528 | 'difficulty': '难度', 529 | 'total_submitted': '总提交数', 530 | 'total_acs': '总通过数', 531 | 'acceptance': '通过率', 532 | 'paid_only': '付费', 533 | 'status': '已解决', 534 | 'level': { 535 | 1: '简单', 536 | 2: '中等', 537 | 3: '难' 538 | }, 539 | 'bool': { 540 | True: '是', 541 | False: '否' 542 | } 543 | } 544 | elif language == 'English': 545 | language_dict = { 546 | 'id': '#', 547 | 'title': 'Title', 548 | 'slug': 'Link', 549 | 'difficulty': 'Difficulty', 550 | 'total_submitted': 'Total Submitted', 551 | 'total_acs': 'Total Accepted', 552 | 'acceptance': 'Acceptance', 553 | 'paid_only': 'Paid Only', 554 | 'status': 'Solved', 555 | 'level': { 556 | 1: 'Easy', 557 | 2: 'Medium', 558 | 3: 'Hard' 559 | }, 560 | 'bool': { 561 | True: 'Yes', 562 | False: 'No' 563 | } 564 | } 565 | return language_dict 566 | 567 | 568 | if __name__ == '__main__': 569 | extractor = Extractor() 570 | result=extractor.get_question_detail('merge-two-sorted-lists') 571 | print(result) 572 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LeetCode-Spider 2 | 3 | Python实现的LeetCode爬虫。爬取LeetCode题目描述和提交的代码。 4 | 5 | ## 特点 6 | 7 | - 支持爬取题目列表,保存为本地CSV/Excel文件。 8 | - 支持爬取题目描述,保存为本地HTML文件。 9 | - 支持爬取用户提交的代码,保存为如_.py、_.java、\*.cpp等源码。 10 | - 高速并发下载题目描述和提交的代码。 11 | - 支持增量更新,当本地有缺损或LeetCode有新内容(题目/提交的代码)时,以增量形式更新。 12 | 13 | ## 使用 14 | 15 | 参考example.py。 16 | 17 | ### 克隆或下载本仓库 18 | 19 | 使用`git clone`或直接下载本仓库代码,并切换工作目录到本项目根目录 20 | 21 | ### 安装依赖 22 | 23 | 本项目需要用到第三方库`lxml`和`openpyxl`(如果需要保存为Excel),可通过如下`pip`命令安装: 24 | 25 | ```bash 26 | pip3 install lxml 27 | pip3 install openpyxl 28 | ``` 29 | 30 | ### 获取问题列表(必须) 31 | 32 | ```python 33 | from Extractor import Extractor 34 | 35 | extractor = Extractor() 36 | 37 | # 获取问题列表(保存在数据库leetcode.db中,若希望获取问题状态(是否ac),需首先登录) 38 | # extractor.login('foo@bar.com', '123456') 39 | extractor.update_problem_list() 40 | ``` 41 | 42 | 获取得到的数据保存在leetcode.db数据库中。 43 | 44 | ### 导出问题列表 45 | 46 | 可将问题列表导出为CSV/Excel格式文件,参考下面的截图。 47 | 48 | ```python 49 | # 导出问题列表为中文CSV文件 50 | extractor.save_problem_list('problems.csv') 51 | 52 | # 导出问题列表为英文Excel文件 53 | extractor.save_problem_list('problems.xlsx', 'excel', 'English') 54 | ``` 55 | 56 | ![CSV][csv] 57 | 58 | ![Excel][excel] 59 | 60 | ### 获取问题描述HTML文件 61 | 62 | **需先获取问题列表** 63 | 64 | ```python 65 | # 获取问题描述HTML文件(保存在descriptions文件夹下,需要先获取问题列表) 66 | extractor.update_descriptions() 67 | ``` 68 | 69 | 根据问题列表增量多线程并发下载新的问题描述,并将HTML文件保存到descriptions文件夹下。文件夹结构为: 70 | 71 | ``` 72 | descriptions 73 | 001. Two Sum.html 74 | 002. Add Two Numbers.html 75 | 003. Longest Substring Without Repeating Characters.html 76 | ... 77 | ``` 78 | 79 | ### 获取提交的代码 80 | 81 | **需先获取问题列表** 82 | 83 | ```python 84 | # 获取提交的代码(保存在submissions文件夹下,需要先获取问题列,并登录) 85 | extractor.login('foo@bar.com', '123456') 86 | extractor.update_submissions() 87 | ``` 88 | 89 | 这里需要先输入用户名和密码登录,然后才能获取到此用户提交的代码。 90 | 91 | 根据问题列表增量多线程并发下载新的提交代码,并将其保存到submissions文件夹下。文件夹结构为: 92 | 93 | ``` 94 | submissions 95 | 24152714 96 | 24153189 97 | 24165875 98 | ... 99 | ``` 100 | 101 | ### 导出提交的代码 102 | 103 | **需先获取提交的代码** 104 | 105 | ```python 106 | # 导出提交的代码(保存在out_submissions文件夹下,需先获取提交的代码) 107 | extractor.output_submissions() 108 | ``` 109 | 110 | 导出之前保存的文件为格式化文件结构(默认仅导出每种语言的最新提交版本),保存到out_submissions文件夹下。文件夹结构为: 111 | 112 | ``` 113 | out_submissions 114 | 001. Two Sum 115 | C++ 116 | Solution.cpp 117 | Java 118 | Solution.java 119 | Python 120 | Solution 121 | 002. Add Two Numbers 122 | C++ 123 | Solution.cpp 124 | Java 125 | Solution.java 126 | Python 127 | Solution 128 | ... 129 | ``` 130 | 131 | ## 第三方依赖 132 | 133 | - [lxml](http://lxml.de/) 134 | - [openpyxl](https://openpyxl.readthedocs.io/) 135 | 136 | [csv]: screenshot/csv.png 137 | 138 | [excel]: screenshot/excel.png 139 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from Extractor import Extractor 2 | 3 | extractor = Extractor() 4 | 5 | # 获取问题列表(保存在数据库leetcode.db中,若希望获取问题状态(是否ac),需首先登录) 6 | # extractor.login('foo@bar.com', '123456') 7 | extractor.update_problem_list() 8 | 9 | # 导出问题列表为中文CSV文件 10 | extractor.save_problem_list('problems.csv') 11 | 12 | # 导出问题列表为英文Excel文件 13 | extractor.save_problem_list('problems.xlsx', 'excel', 'English') 14 | 15 | # 获取问题描述HTML文件(保存在descriptions文件夹下,需要先获取问题列表) 16 | extractor.update_descriptions() 17 | 18 | # 获取提交的代码(保存在submissions文件夹下,需要先获取问题列,并登录) 19 | extractor.login('foo@bar.com', '123456') 20 | extractor.update_submissions() 21 | 22 | # 导出提交的代码(保存在out_submissions文件夹下,需先获取提交的代码) 23 | extractor.output_submissions() 24 | -------------------------------------------------------------------------------- /screenshot/csv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhantong/leetcode-spider/54883459e4c5af40d5bbfb324eaae45e8902855e/screenshot/csv.png -------------------------------------------------------------------------------- /screenshot/excel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhantong/leetcode-spider/54883459e4c5af40d5bbfb324eaae45e8902855e/screenshot/excel.png -------------------------------------------------------------------------------- /templates/duplicate.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | {% for problem in problems %} 12 | 13 | 14 | 15 | 16 | 17 | 18 | 24 | 25 | {% endfor %} 26 |
languagetitleurlpathviewdelete
{{ problem['lang'] }}{{ problem['title'] }}{{ problem['url'] }}{{ problem['path'] }}查看 19 |
20 | 21 | 22 |
23 |
--------------------------------------------------------------------------------