├── .idea
├── LeetCodeSpider.iml
├── leetcode-spider.iml
├── misc.xml
├── modules.xml
└── vcs.xml
├── DuplicateFilter.py
├── Extractor.py
├── README.md
├── example.py
├── screenshot
├── csv.png
└── excel.png
└── templates
└── duplicate.html
/.idea/LeetCodeSpider.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/leetcode-spider.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/DuplicateFilter.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, make_response, request, redirect
2 | import sqlite3
3 |
# The Flask application object; the route decorators below register on it.
app = Flask(__name__)
5 |
6 |
def dict_factory(cursor, row):
    """sqlite3 row factory: map each column name of *cursor* to its value in *row*."""
    # cursor.description holds one tuple per column; element 0 is the name.
    return {
        column[0]: row[position]
        for position, column in enumerate(cursor.description)
    }
12 |
13 |
@app.route('/')
def main():
    """Render the duplicate-submission overview page.

    Lists every submission row whose (lang, title) pair is shared by more
    than one downloaded, non-removed submission, ordered by language and
    title, and hands the rows to the ``duplicate.html`` template.
    """
    duplicate_query = '''
        SELECT lang, title, url, path
        FROM submission a
        WHERE EXISTS(
        SELECT 1
        FROM submission b
        WHERE b.downloaded=1 AND b.removed=0 AND a.lang=b.lang AND a.title=b.title
        GROUP BY lang, title
        HAVING COUNT(lang)>1)
        ORDER BY lang, title
        '''
    connection = sqlite3.connect('leetcode.db')
    # Rows come back as dicts so the template can index them by column name.
    connection.row_factory = dict_factory
    duplicates = connection.execute(duplicate_query).fetchall()
    connection.close()
    return render_template('duplicate.html', problems=duplicates)
34 |
35 |
@app.route('/view/<path:path>')
def view(path=None):
    """Serve the file at *path* as plain text.

    The route now declares ``<path:path>`` so Flask actually passes the
    requested path in; without the converter ``path`` was always ``None``
    and ``open(None)`` raised on every request.

    Because *path* comes straight from the URL, it is resolved against the
    current working directory and rejected when it escapes it.

    :param path: file path relative to the working directory.
    :return: a ``text/plain`` response with the file content, or a 404
        response when the path is missing, outside the working directory,
        or not a regular file.
    """
    import os

    if not path:
        return make_response('file not found', 404)

    base_dir = os.path.realpath(os.getcwd())
    target = os.path.realpath(os.path.join(base_dir, path))
    try:
        inside_base = os.path.commonpath([base_dir, target]) == base_dir
    except ValueError:
        # Raised e.g. when the two paths live on different Windows drives.
        inside_base = False
    if not inside_base or not os.path.isfile(target):
        return make_response('file not found', 404)

    with open(target, 'r', encoding='utf-8') as f:
        content = f.read()
    response = make_response(content)
    response.headers['content-type'] = 'text/plain'
    return response
43 |
44 |
@app.route('/remove', methods=['POST'])
def remove():
    """Flag the submission whose URL was posted as removed, then go back to '/'.

    The row is kept in the database; only its ``removed`` column is set to 1.
    """
    target_url = request.form['url']
    connection = sqlite3.connect('leetcode.db')
    connection.execute('UPDATE submission SET removed=1 WHERE url=?', (target_url,))
    connection.commit()
    connection.close()
    return redirect('/')
54 |
55 |
# Start Flask's built-in server when this file is run as a script.
# NOTE(review): debug=True enables Flask's debug mode; keep this to local use.
if __name__ == '__main__':
    app.run(debug=True)
58 |
--------------------------------------------------------------------------------
/Extractor.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import urllib.parse
3 | import http.cookiejar
4 | import json
5 | from lxml import etree
6 | import os
7 | import concurrent.futures
8 | import re
9 | import sqlite3
10 | import codecs
11 | import shutil
12 | import os.path
13 |
14 |
def dict_factory(cursor, row):
    """sqlite3 row factory that turns *row* into a ``{column_name: value}`` dict."""
    # Element 0 of every cursor.description entry is the column name.
    return {
        column[0]: row[position]
        for position, column in enumerate(cursor.description)
    }
20 |
21 |
class Extractor:
    """Crawler for LeetCode problem metadata, descriptions and accepted code.

    State is persisted in the SQLite database named by ``self.db_name``;
    downloaded files are written to ``descriptions/`` and ``submissions/``
    relative to the current working directory. All HTTP requests share one
    cookie-aware opener, so a successful :meth:`login` applies to every
    later request made through the same instance.
    """

    # LeetCode language code -> (folder name, file extension) used by
    # output_submissions(). Codes missing here fall back to (code, '').
    _LANGUAGES = {
        'python': ('Python', '.py'),
        'python3': ('Python', '.py'),
        'java': ('Java', '.java'),
        'cpp': ('C++', '.cpp'),
    }

    def __init__(self):
        """Set up the base URL, database name and a cookie-aware HTTP opener."""
        self.base_url = 'https://leetcode.com'
        self.db_name = 'leetcode.db'
        cj = http.cookiejar.CookieJar()
        self.opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        self.opener.addheaders = [
            ('Host', 'leetcode.com'),
            ('User-Agent',
             'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
        ]
        self.is_logged_in = False

    def login(self, user_name, password):
        """Log in through the account login form; no-op when already logged in.

        Prints 'logged in' on success and 'failed to login in' otherwise
        (including when the CSRF token cannot be found in the login page).
        """
        if self.is_logged_in:
            return
        url = self.base_url + '/accounts/login/'
        with self.opener.open(url) as f:
            content = f.read().decode('utf-8')
        tokens = re.findall(r"name='csrfmiddlewaretoken'\svalue='(.*?)'", content)
        if not tokens:
            # The page layout changed or the request was blocked; without a
            # token the POST below cannot succeed.
            print('failed to login in')
            return
        post_data = urllib.parse.urlencode({
            'csrfmiddlewaretoken': tokens[0],
            'login': user_name,
            'password': password
        })
        # The Referer header is only wanted for the login POST itself.
        self.opener.addheaders.append(('Referer', url))
        try:
            with self.opener.open(url, data=post_data.encode()) as f:
                if 'Successfully signed in' in f.read().decode():
                    self.is_logged_in = True
                    print('logged in')
                else:
                    print('failed to login in')
        finally:
            self.opener.addheaders.pop()

    def get_problem_list(self):
        """Fetch the algorithms problem list and return its 'stat_status_pairs'."""
        with self.opener.open(self.base_url + '/api/problems/algorithms/') as f:
            content = json.loads(f.read().decode('utf-8'))
        return content['stat_status_pairs']

    def store_problem_list_to_db(self, problem_list):
        """Replace the content of the ``problem`` table with *problem_list*.

        :param problem_list: iterable of entries shaped like the items
            returned by :meth:`get_problem_list`.
        """
        conn = sqlite3.connect(self.db_name)
        try:
            c = conn.cursor()
            c.execute(
                '''
                CREATE TABLE IF NOT EXISTS problem (
                id INTEGER,
                title TEXT,
                slug TEXT,
                difficulty INTEGER,
                paid_only INTEGER,
                status TEXT,
                total_acs INTEGER,
                total_submitted INTEGER,
                PRIMARY KEY(id))
                ''')
            # The table is a snapshot of the remote list, so start clean.
            c.execute('DELETE FROM problem')
            c.executemany(
                '''
                INSERT INTO problem
                (id, title, slug, difficulty, paid_only, status, total_acs, total_submitted)
                VALUES
                (?, ?, ?, ?, ?, ?, ?, ?)
                ''',
                (
                    (problem['stat']['question_id'],
                     problem['stat']['question__title'],
                     problem['stat']['question__title_slug'],
                     problem['difficulty']['level'],
                     1 if problem['paid_only'] else 0,
                     problem['status'],
                     problem['stat']['total_acs'],
                     problem['stat']['total_submitted'])
                    for problem in problem_list
                ))
            conn.commit()
        finally:
            conn.close()

    def update_problem_list(self):
        """Download the problem list and store it in the database."""
        self.store_problem_list_to_db(self.get_problem_list())

    def get_problem_list_from_db(self):
        """Return every row of the ``problem`` table as a list of dicts."""
        conn = sqlite3.connect(self.db_name)
        try:
            conn.row_factory = dict_factory
            c = conn.cursor()
            c.execute('SELECT * FROM problem')
            return c.fetchall()
        finally:
            conn.close()

    def get_question_detail(self, title_slug):
        """Query the GraphQL endpoint for one question and return its 'question' object.

        :param title_slug: the problem slug, e.g. 'merge-two-sorted-lists'.
        """
        QUERY = '''query getQuestionDetail($titleSlug: String!) {
            isCurrentUserAuthenticated
            question(titleSlug: $titleSlug) {
            questionId
            questionFrontendId
            questionTitle
            translatedTitle
            questionTitleSlug
            content
            translatedContent
            difficulty
            stats
            contributors
            similarQuestions
            discussUrl
            mysqlSchemas
            randomQuestionUrl
            sessionId
            categoryTitle
            submitUrl
            interpretUrl
            codeDefinition
            sampleTestCase
            enableTestMode
            metaData
            enableRunCode
            enableSubmit
            judgerAvailable
            infoVerified
            envInfo
            urlManager
            article
            questionDetailUrl
            discussCategoryId
            discussSolutionCategoryId
            libraryUrl
            companyTags {
            name
            slug
            translatedName
            }
            topicTags {
            name
            slug
            translatedName
            }
            }
            interviewed {
            interviewedUrl
            companies {
            id
            name
            slug
            }
            timeOptions {
            id
            name
            }
            stageOptions {
            id
            name
            }
            }
            subscribeUrl
            isPremium
            loginUrl
            }'''
        params = {
            'query': QUERY,
            'operationName': 'getQuestionDetail',
            'variables': json.dumps({
                'titleSlug': title_slug
            })
        }
        # Percent-encode everything, then restore '(', ')' and '!' literally.
        query_string = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
        query_string = query_string.replace('%28', '(').replace('%29', ')').replace('%21', '!')
        url = self.base_url + '/graphql' + '?' + query_string
        with self.opener.open(url) as f:
            content = json.loads(f.read().decode('utf-8'))
        return content['data']['question']

    def get_description(self, url, file_path):
        """Download the problem page at *url* and save its description HTML.

        :return: *file_path*, so callers can record where the file went.
        :raises IndexError: when the description element is not in the page.
        """
        with self.opener.open(url) as f:
            content = f.read().decode('utf-8')
        root = etree.HTML(content)
        result = root.xpath('//*[@id="descriptionContent"]//div[@class="question-description"]')
        html = etree.tostring(result[0], encoding='utf-8')
        with open(file_path, 'wb') as f:
            f.write(html)
        return file_path

    def extract_descriptions(self):
        """Download the description of every free problem not yet recorded.

        Downloads run on a thread pool; a failed download is reported on
        stdout and skipped, so it is retried on the next run.
        """
        conn = sqlite3.connect(self.db_name)
        try:
            c = conn.cursor()
            c.execute('CREATE TABLE IF NOT EXISTS description (title TEXT, path TEXT, PRIMARY KEY(title))')
            c.execute(
                '''
                SELECT a.id, a.title, a.slug
                FROM problem a
                LEFT JOIN description b
                ON a.title=b.title
                WHERE a.paid_only=0 AND b.title IS NULL
                ''')
            problems = c.fetchall()
            dir_path = 'descriptions/'
            os.makedirs(dir_path, exist_ok=True)
            with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
                # Map each future back to the problem title it belongs to.
                futures = {
                    executor.submit(
                        self.get_description,
                        self.base_url + '/problems/' + slug + '/description/',
                        os.path.join(dir_path, str(problem_id).zfill(3) + '. ' + title + '.html')):
                    title for problem_id, title, slug in problems}
                for future in concurrent.futures.as_completed(futures):
                    title = futures[future]
                    try:
                        file_path = future.result()
                    except Exception as e:
                        print('%r generated an exception: %s' % (title, e))
                    else:
                        if file_path:
                            c.execute('INSERT INTO description (title, path) VALUES (?, ?)', (title, file_path))
            conn.commit()
        finally:
            conn.close()

    def sync_description_db_and_file(self):
        """Drop ``description`` rows whose file no longer exists on disk."""
        conn = sqlite3.connect(self.db_name)
        try:
            c = conn.cursor()
            c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", ('description',))
            if not c.fetchone():
                return
            c.execute('SELECT title,path FROM description')
            for title, path in c.fetchall():
                if not path or not os.path.exists(path):
                    c.execute('DELETE FROM description WHERE title=?', (title,))
            conn.commit()
        finally:
            conn.close()

    def update_descriptions(self):
        """Reconcile the database with the files on disk, then fetch what is missing."""
        self.sync_description_db_and_file()
        self.extract_descriptions()

    def get_submission_list(self):
        """Return all submissions of the logged-in user, fetched page by page.

        :return: list of submission dicts; an empty list (after printing a
            hint) when :meth:`login` has not succeeded yet.
        """
        if not self.is_logged_in:
            print('should login first')
            return []
        result = []
        offset = 0
        LIMIT = 100
        while True:
            url = self.base_url + '/api/submissions/?offset=' + str(offset) + '&limit=' + str(LIMIT)
            with self.opener.open(url) as f:
                content = json.loads(f.read().decode('utf-8'))
            result.extend(content['submissions_dump'])
            if not content['has_next']:
                return result
            offset += LIMIT

    def store_submission_list_to_db(self, submission_list):
        """Record every accepted submission of *submission_list* not yet known.

        :param submission_list: iterable of submission dicts; ``None`` is
            treated as empty.
        """
        conn = sqlite3.connect(self.db_name)
        try:
            c = conn.cursor()
            c.execute(
                '''
                CREATE TABLE IF NOT EXISTS submission (
                lang TEXT,
                title TEXT,
                url TEXT,
                downloaded INTEGER DEFAULT 0,
                path TEXT,
                removed INTEGER DEFAULT 0,
                PRIMARY KEY(url))
                ''')
            # INSERT OR IGNORE keeps the state of rows that already exist.
            c.executemany(
                'INSERT OR IGNORE INTO submission (lang, title, url) VALUES (?, ?, ?)',
                ((submission['lang'], submission['title'], submission['url'])
                 for submission in (submission_list or [])
                 if submission['status_display'] == 'Accepted'))
            conn.commit()
        finally:
            conn.close()

    def update_submission_list(self):
        """Download the submission list and store the accepted ones."""
        self.store_submission_list_to_db(self.get_submission_list())

    def get_submission(self, url, file_path):
        """Download the submission page at *url* and write its code to *file_path*.

        :return: *file_path*.
        :raises ValueError: when no submission code is found in the page.
        """
        with self.opener.open(url) as f:
            content = f.read().decode('utf-8')
        matches = re.findall(r"submissionCode:\s'(.*?)',", content)
        if not matches:
            raise ValueError('submission code not found in ' + url)
        # The page embeds the code as an escaped JavaScript string literal.
        code = codecs.decode(matches[0], 'unicode-escape')
        code = code.replace('\r\n', '\n')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(code)
        return file_path

    def extract_submissions(self):
        """Download every recorded submission that is neither downloaded nor removed.

        Downloads run on a thread pool; a failed download is reported on
        stdout and left with ``downloaded=0`` so it is retried next time.
        """
        conn = sqlite3.connect(self.db_name)
        try:
            c = conn.cursor()
            c.execute('SELECT url FROM submission WHERE downloaded=0 AND removed=0')
            urls = [row[0] for row in c.fetchall()]
            dir_path = 'submissions/'
            os.makedirs(dir_path, exist_ok=True)
            with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
                # The second-to-last URL segment is used as the file name.
                futures = {
                    executor.submit(self.get_submission, self.base_url + url,
                                    os.path.join(dir_path, url.split('/')[-2])): url
                    for url in urls}
                for future in concurrent.futures.as_completed(futures):
                    url = futures[future]
                    try:
                        file_path = future.result()
                    except Exception as e:
                        print('%r generated an exception: %s' % (url, e))
                    else:
                        if file_path:
                            c.execute('UPDATE submission SET downloaded=1,path=? WHERE url=?', (file_path, url))
            conn.commit()
        finally:
            conn.close()

    def sync_submission_db_and_file(self):
        """Drop ``submission`` rows whose downloaded file no longer exists.

        Rows that were never downloaded have a NULL path and are left alone;
        checking them used to crash with ``os.path.exists(None)``.
        """
        conn = sqlite3.connect(self.db_name)
        try:
            c = conn.cursor()
            c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", ('submission',))
            if not c.fetchone():
                return
            c.execute('SELECT url,path FROM submission WHERE path IS NOT NULL')
            for url, path in c.fetchall():
                if not os.path.exists(path):
                    c.execute('DELETE FROM submission WHERE url=?', (url,))
            conn.commit()
        finally:
            conn.close()

    def update_submissions(self):
        """Reconcile with disk, refresh the submission list, then download new code."""
        self.sync_submission_db_and_file()
        self.update_submission_list()
        self.extract_submissions()

    def output_submissions(self, dir_path='out_submissions/', latest_only=True):
        """Copy downloaded submissions into a ``<id>. <title>/<language>/`` tree.

        For every problem and language the first stored submission (ordered
        by URL) becomes ``Solution<ext>``. When *latest_only* is false, the
        remaining ones are written as ``Solution II<ext>``,
        ``Solution III<ext>`` and so on. Submissions whose title has no
        match in the ``problem`` table are skipped.

        :param dir_path: root of the output tree; created when missing.
        :param latest_only: export only one file per problem and language.
        """
        os.makedirs(dir_path, exist_ok=True)
        conn = sqlite3.connect(self.db_name)
        try:
            c = conn.cursor()
            c.execute(
                '''
                SELECT problem.id, submission.title
                FROM submission
                LEFT JOIN problem
                ON submission.title=problem.title
                WHERE submission.downloaded=1 AND submission.removed=0
                GROUP BY submission.title
                ''')
            for problem_id, title in c.fetchall():
                if not problem_id:
                    continue
                problem_dir = os.path.join(dir_path, str(problem_id).zfill(3) + '. ' + title)
                os.makedirs(problem_dir, exist_ok=True)
                # DISTINCT: each language is handled once, not once per submission.
                c.execute('SELECT DISTINCT lang FROM submission WHERE downloaded=1 AND removed=0 AND title=?',
                          (title,))
                for lang in [row[0] for row in c.fetchall()]:
                    # Unknown language codes keep their raw name and get no extension.
                    language, extension = self._LANGUAGES.get(lang, (lang, ''))
                    current_dir = os.path.join(problem_dir, language)
                    os.makedirs(current_dir, exist_ok=True)
                    # NOTE(review): ascending URL order presumably puts the oldest
                    # submission first, which may not match "latest" — confirm.
                    c.execute(
                        'SELECT path FROM submission WHERE downloaded=1 AND removed=0 AND title=? AND lang=? ORDER BY url',
                        (title, lang))
                    orig_file_paths = [row[0] for row in c.fetchall()]
                    shutil.copyfile(orig_file_paths[0], os.path.join(current_dir, 'Solution' + extension))
                    if latest_only:
                        continue
                    for i in range(1, len(orig_file_paths)):
                        # Copy the i-th file itself (previously index 0 was copied every time).
                        shutil.copyfile(orig_file_paths[i],
                                        os.path.join(current_dir, 'Solution ' + 'I' * (i + 1) + extension))
        finally:
            conn.close()

    def save_problem_list(self, file_name, file_type='csv', language='Chinese'):
        """Export the stored problem list to a localized CSV or Excel file.

        :param file_name: path of the file to write.
        :param file_type: 'csv' or 'excel'.
        :param language: 'Chinese' or 'English'; selects headers and labels.
        :raises ValueError: for an unsupported *file_type* or *language*.
        """
        def preprocess(problem_list):
            for problem in problem_list:
                submitted = problem['total_submitted']
                # A problem nobody has submitted to has an acceptance of 0.
                problem['acceptance'] = problem['total_acs'] / submitted if submitted else 0
                problem['status'] = problem['status'] == 'ac'

        def to_locale(problem_list, language_dict):
            problem_list = [{language_dict[key]: value for (key, value) in problem.items()}
                            for problem in problem_list]
            difficulty = language_dict['difficulty']
            paid_only = language_dict['paid_only']
            status = language_dict['status']
            for problem in problem_list:
                problem[difficulty] = language_dict['level'][problem[difficulty]]
                problem[paid_only] = language_dict['bool'][problem[paid_only]]
                problem[status] = language_dict['bool'][problem[status]]
            return problem_list

        if file_type not in ('csv', 'excel'):
            raise ValueError('unsupported file_type: %r' % (file_type,))
        language_dict = self.get_language_dict(language)
        if language_dict is None:
            raise ValueError('unsupported language: %r' % (language,))

        problem_list = self.get_problem_list_from_db()
        preprocess(problem_list)
        problem_list = to_locale(problem_list, language_dict)
        if file_type == 'csv':
            self.save_problem_list_as_csv(problem_list, file_name)
        else:
            self.save_problem_list_as_excel(problem_list, file_name, language_dict)

    def save_problem_list_as_csv(self, problem_list, file_name):
        """Write *problem_list* (a list of dicts) to *file_name* as UTF-8 CSV.

        The header is taken from the keys of the first row; an empty list
        produces an empty file.
        """
        import csv

        with open(file_name, 'w', encoding='utf-8', newline='') as f:
            if not problem_list:
                return
            writer = csv.DictWriter(f, fieldnames=list(problem_list[0].keys()))
            writer.writeheader()
            writer.writerows(problem_list)

    def save_problem_list_as_excel(self, problem_list, file_name, language_dict):
        """Write *problem_list* to *file_name* as a formatted Excel workbook.

        :param problem_list: localized rows as produced by save_problem_list.
        :param language_dict: the mapping returned by get_language_dict; used
            to find the localized column headers and cell labels.
        """
        from openpyxl import Workbook
        from openpyxl.formatting.rule import CellIsRule, DataBarRule
        from openpyxl.styles import NamedStyle, PatternFill
        from openpyxl.utils import get_column_letter

        def column_letter(cell):
            # Older openpyxl returns the letter from cell.column, newer
            # versions return the 1-based index; normalize to the letter.
            column = cell.column
            return column if isinstance(column, str) else get_column_letter(column)

        def format_cell_style(ws, language_dict):
            def named_style(name, number_format):
                style = NamedStyle(name)
                style.number_format = number_format
                return style

            style_int = named_style('int', '0')
            style_str = named_style('str', '@')
            style_pcnt = named_style('pcnt', '0.0%')
            column_styles = (
                ('id', style_int),
                ('total_submitted', style_int),
                ('total_acs', style_int),
                ('title', style_str),
                ('slug', style_str),
                ('difficulty', style_str),
                ('paid_only', style_str),
                ('status', style_str),
                ('acceptance', style_pcnt),
            )
            for key, style in column_styles:
                # [1:] skips the header cell of the column.
                for cell in ws[column_index[language_dict[key]]][1:]:
                    cell.style = style

        def conditional_formatting(ws, language_dict):
            def get_entire_column(index):
                return index + '1:' + index + '1048576'

            def solid_fill(color):
                return PatternFill(start_color=color, end_color=color, fill_type='solid')

            red_fill = solid_fill('ffc7ce')
            green_fill = solid_fill('c2efcf')
            yellow_fill = solid_fill('ffeba2')

            # (column key, cell text to match, fill applied on a match)
            highlights = (
                ('difficulty', language_dict['level'][1], green_fill),
                ('difficulty', language_dict['level'][2], yellow_fill),
                ('difficulty', language_dict['level'][3], red_fill),
                ('paid_only', language_dict['bool'][False], green_fill),
                ('paid_only', language_dict['bool'][True], red_fill),
                ('status', language_dict['bool'][False], red_fill),
                ('status', language_dict['bool'][True], green_fill),
            )
            for key, text, fill in highlights:
                ws.conditional_formatting.add(
                    get_entire_column(column_index[language_dict[key]]),
                    CellIsRule(operator='equal', formula=['"' + text + '"'],
                               stopIfTrue=False, fill=fill))

            ws.conditional_formatting.add(
                get_entire_column(column_index[language_dict['acceptance']]),
                DataBarRule(start_type='percentile', start_value=0, end_type='percentile',
                            end_value=100, color="FF638EC6", showValue='None'))

        wb = Workbook()
        ws = wb.active
        if not problem_list:
            # Nothing to lay out; still produce a valid (empty) workbook.
            wb.save(file_name)
            return
        ws.append(tuple(problem_list[0].keys()))
        # Header text -> column letter, read back from the header row.
        column_index = {item.value: column_letter(item) for item in ws[1]}
        for problem in problem_list:
            ws.append({column_index[key]: value for (key, value) in problem.items()})
        format_cell_style(ws, language_dict)
        conditional_formatting(ws, language_dict)
        wb.save(file_name)

    def get_language_dict(self, language):
        """Return the header/label translations for *language*.

        :param language: 'Chinese' or 'English'.
        :return: a dict mapping column keys to localized headers, plus
            'level' (difficulty labels) and 'bool' (yes/no labels);
            ``None`` for any other language.
        """
        language_dict = None
        if language == 'Chinese':
            language_dict = {
                'id': '题号',
                'title': '标题',
                'slug': '链接',
                'difficulty': '难度',
                'total_submitted': '总提交数',
                'total_acs': '总通过数',
                'acceptance': '通过率',
                'paid_only': '付费',
                'status': '已解决',
                'level': {
                    1: '简单',
                    2: '中等',
                    3: '难'
                },
                'bool': {
                    True: '是',
                    False: '否'
                }
            }
        elif language == 'English':
            language_dict = {
                'id': '#',
                'title': 'Title',
                'slug': 'Link',
                'difficulty': 'Difficulty',
                'total_submitted': 'Total Submitted',
                'total_acs': 'Total Accepted',
                'acceptance': 'Acceptance',
                'paid_only': 'Paid Only',
                'status': 'Solved',
                'level': {
                    1: 'Easy',
                    2: 'Medium',
                    3: 'Hard'
                },
                'bool': {
                    True: 'Yes',
                    False: 'No'
                }
            }
        return language_dict
566 |
567 |
# Manual smoke test: fetch and print the details of one question when this
# file is run as a script.
if __name__ == '__main__':
    extractor = Extractor()
    result=extractor.get_question_detail('merge-two-sorted-lists')
    print(result)
572 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LeetCode-Spider
2 |
3 | Python实现的LeetCode爬虫。爬取LeetCode题目描述和提交的代码。
4 |
5 | ## 特点
6 |
7 | - 支持爬取题目列表,保存为本地CSV/Excel文件。
8 | - 支持爬取题目描述,保存为本地HTML文件。
9 | - 支持爬取用户提交的代码,保存为如 `*.py`、`*.java`、`*.cpp` 等源码文件。
10 | - 高速并发下载题目描述和提交的代码。
11 | - 支持增量更新,当本地有缺损或LeetCode有新内容(题目/提交的代码)时,以增量形式更新。
12 |
13 | ## 使用
14 |
15 | 参考example.py。
16 |
17 | ### 克隆或下载本仓库
18 |
19 | 使用`git clone`或直接下载本仓库代码,并切换工作目录到本项目根目录
20 |
21 | ### 安装依赖
22 |
23 | 本项目需要用到第三方库`lxml`和`openpyxl`(如果需要保存为Excel),可通过如下`pip`命令安装:
24 |
25 | ```bash
26 | pip3 install lxml
27 | pip3 install openpyxl
28 | ```
29 |
30 | ### 获取问题列表(必须)
31 |
32 | ```python
33 | from Extractor import Extractor
34 |
35 | extractor = Extractor()
36 |
37 | # 获取问题列表(保存在数据库leetcode.db中,若希望获取问题状态(是否ac),需首先登录)
38 | # extractor.login('foo@bar.com', '123456')
39 | extractor.update_problem_list()
40 | ```
41 |
42 | 获取得到的数据保存在leetcode.db数据库中。
43 |
44 | ### 导出问题列表
45 |
46 | 可将问题列表导出为CSV/Excel格式文件,参考下面的截图。
47 |
48 | ```python
49 | # 导出问题列表为中文CSV文件
50 | extractor.save_problem_list('problems.csv')
51 |
52 | # 导出问题列表为英文Excel文件
53 | extractor.save_problem_list('problems.xlsx', 'excel', 'English')
54 | ```
55 |
56 | ![CSV][csv]
57 |
58 | ![Excel][excel]
59 |
60 | ### 获取问题描述HTML文件
61 |
62 | **需先获取问题列表**
63 |
64 | ```python
65 | # 获取问题描述HTML文件(保存在descriptions文件夹下,需要先获取问题列表)
66 | extractor.update_descriptions()
67 | ```
68 |
69 | 根据问题列表增量多线程并发下载新的问题描述,并将HTML文件保存到descriptions文件夹下。文件夹结构为:
70 |
71 | ```
72 | descriptions
73 | 001. Two Sum.html
74 | 002. Add Two Numbers.html
75 | 003. Longest Substring Without Repeating Characters.html
76 | ...
77 | ```
78 |
79 | ### 获取提交的代码
80 |
81 | **需先获取问题列表**
82 |
83 | ```python
84 | # 获取提交的代码(保存在submissions文件夹下,需要先获取问题列表,并登录)
85 | extractor.login('foo@bar.com', '123456')
86 | extractor.update_submissions()
87 | ```
88 |
89 | 这里需要先输入用户名和密码登录,然后才能获取到此用户提交的代码。
90 |
91 | 根据问题列表增量多线程并发下载新的提交代码,并将其保存到submissions文件夹下。文件夹结构为:
92 |
93 | ```
94 | submissions
95 | 24152714
96 | 24153189
97 | 24165875
98 | ...
99 | ```
100 |
101 | ### 导出提交的代码
102 |
103 | **需先获取提交的代码**
104 |
105 | ```python
106 | # 导出提交的代码(保存在out_submissions文件夹下,需先获取提交的代码)
107 | extractor.output_submissions()
108 | ```
109 |
110 | 导出之前保存的文件为格式化文件结构(默认仅导出每种语言的最新提交版本),保存到out_submissions文件夹下。文件夹结构为:
111 |
112 | ```
113 | out_submissions
114 | 001. Two Sum
115 | C++
116 | Solution.cpp
117 | Java
118 | Solution.java
119 | Python
120 | Solution.py
121 | 002. Add Two Numbers
122 | C++
123 | Solution.cpp
124 | Java
125 | Solution.java
126 | Python
127 | Solution.py
128 | ...
129 | ```
130 |
131 | ## 第三方依赖
132 |
133 | - [lxml](http://lxml.de/)
134 | - [openpyxl](https://openpyxl.readthedocs.io/)
135 |
136 | [csv]: screenshot/csv.png
137 |
138 | [excel]: screenshot/excel.png
139 |
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
from Extractor import Extractor

extractor = Extractor()

# Fetch the problem list (stored in the leetcode.db database; log in first
# if you also want each problem's status, i.e. whether it is accepted).
# extractor.login('foo@bar.com', '123456')
extractor.update_problem_list()

# Export the problem list as a Chinese CSV file.
extractor.save_problem_list('problems.csv')

# Export the problem list as an English Excel file.
extractor.save_problem_list('problems.xlsx', 'excel', 'English')

# Fetch the problem description HTML files (saved under the descriptions
# folder; the problem list must be fetched first).
extractor.update_descriptions()

# Fetch the submitted code (saved under the submissions folder; the problem
# list must be fetched first and you must be logged in).
extractor.login('foo@bar.com', '123456')
extractor.update_submissions()

# Export the submitted code (saved under the out_submissions folder; the
# submitted code must be fetched first).
extractor.output_submissions()
24 |
--------------------------------------------------------------------------------
/screenshot/csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhantong/leetcode-spider/54883459e4c5af40d5bbfb324eaae45e8902855e/screenshot/csv.png
--------------------------------------------------------------------------------
/screenshot/excel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhantong/leetcode-spider/54883459e4c5af40d5bbfb324eaae45e8902855e/screenshot/excel.png
--------------------------------------------------------------------------------
/templates/duplicate.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | language |
5 | title |
6 | url |
7 | path |
8 | view |
9 | delete |
10 |
11 | {% for problem in problems %}
12 |
13 | {{ problem['lang'] }} |
14 | {{ problem['title'] }} |
15 | {{ problem['url'] }} |
16 | {{ problem['path'] }} |
17 | 查看 |
18 |
19 |
23 | |
24 |
25 | {% endfor %}
26 |
--------------------------------------------------------------------------------