├── .idea
    ├── LeetCodeSpider.iml
    ├── leetcode-spider.iml
    ├── misc.xml
    ├── modules.xml
    └── vcs.xml
├── DuplicateFilter.py
├── Extractor.py
├── README.md
├── example.py
├── screenshot
    ├── csv.png
    └── excel.png
└── templates
    └── duplicate.html


/.idea/LeetCodeSpider.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="jdk" jdkName="Python 3.6.2 (/usr/local/bin/python3)" jdkType="Python SDK" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="PROJECT_TEST_RUNNER" value="Unittests" />
10 |   </component>
11 | </module>


--------------------------------------------------------------------------------
/.idea/leetcode-spider.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="Flask">
 4 |     <option name="enabled" value="true" />
 5 |   </component>
 6 |   <component name="NewModuleRootManager">
 7 |     <content url="file://$MODULE_DIR$" />
 8 |     <orderEntry type="inheritedJdk" />
 9 |     <orderEntry type="sourceFolder" forTests="false" />
10 |   </component>
11 |   <component name="TemplatesService">
12 |     <option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
13 |     <option name="TEMPLATE_FOLDERS">
14 |       <list>
15 |         <option value="$MODULE_DIR$/templates" />
16 |       </list>
17 |     </option>
18 |   </component>
19 |   <component name="TestRunnerService">
20 |     <option name="PROJECT_TEST_RUNNER" value="Unittests" />
21 |   </component>
22 | </module>


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.2 (/usr/local/bin/python3)" project-jdk-type="Python SDK" />
4 | </project>


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/leetcode-spider.iml" filepath="$PROJECT_DIR$/.idea/leetcode-spider.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/DuplicateFilter.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask, render_template, make_response, request, redirect
 2 | import sqlite3
 3 | 
# Flask application object; the @app.route decorators below register handlers on it.
app = Flask(__name__)
 5 | 
 6 | 
 7 | def dict_factory(cursor, row):
 8 |     d = {}
 9 |     for idx, col in enumerate(cursor.description):
10 |         d[col[0]] = row[idx]
11 |     return d
12 | 
13 | 
@app.route('/')
def main():
    """Render ``duplicate.html`` with the submissions that have duplicates.

    A submission row is listed when more than one downloaded, non-removed
    submission exists with the same language and title.
    """
    duplicate_query = '''
            SELECT lang, title, url, path 
            FROM submission a 
            WHERE EXISTS(
                SELECT 1 
                FROM submission b 
                WHERE b.downloaded=1 AND b.removed=0 AND a.lang=b.lang AND a.title=b.title 
                GROUP BY lang, title 
                HAVING COUNT(lang)>1) 
            ORDER BY lang, title
        '''
    connection = sqlite3.connect('leetcode.db')
    # Rows come back as dicts so the template can address columns by name.
    connection.row_factory = dict_factory
    cursor = connection.cursor()
    cursor.execute(duplicate_query)
    duplicates = cursor.fetchall()
    connection.close()
    return render_template('duplicate.html', problems=duplicates)
34 | 
35 | 
@app.route('/view/<path:path>')
def view(path=None):
    """Return the content of a downloaded submission file as plain text.

    ``path`` comes straight from the URL, so it is untrusted. It is resolved
    to a real path and served only when it is a regular file inside the
    ``submissions`` directory (relative to the working directory); anything
    else, including ``..`` traversal or absolute paths, answers 404.
    """
    import os
    from flask import abort

    allowed_root = os.path.realpath('submissions')
    requested = os.path.realpath(path or '')
    try:
        inside_root = os.path.commonpath([allowed_root, requested]) == allowed_root
    except ValueError:
        # commonpath() raises when the two paths cannot be compared
        # (for example different drives on Windows).
        inside_root = False
    if not inside_root or not os.path.isfile(requested):
        abort(404)

    with open(requested, 'r', encoding='utf-8') as f:
        content = f.read()
    response = make_response(content)
    response.headers['content-type'] = 'text/plain'
    return response
43 | 
44 | 
@app.route('/remove', methods=['POST'])
def remove():
    """Flag the submission whose ``url`` was posted as removed, then redirect to ``/``."""
    submission_url = request.form['url']
    connection = sqlite3.connect('leetcode.db')
    cursor = connection.cursor()
    cursor.execute('UPDATE submission SET removed=1 WHERE url=?', (submission_url,))
    connection.commit()
    connection.close()
    return redirect('/')
54 | 
55 | 
# Start Flask's built-in server only when this file is executed as a script.
# NOTE(review): debug=True is a development setting — confirm it is never used on an exposed host.
if __name__ == '__main__':
    app.run(debug=True)
58 | 


--------------------------------------------------------------------------------
/Extractor.py:
--------------------------------------------------------------------------------
  1 | import urllib.request
  2 | import urllib.parse
  3 | import http.cookiejar
  4 | import json
  5 | from lxml import etree
  6 | import os
  7 | import concurrent.futures
  8 | import re
  9 | import sqlite3
 10 | import codecs
 11 | import shutil
 12 | import os.path
 13 | 
 14 | 
 15 | def dict_factory(cursor, row):
 16 |     d = {}
 17 |     for idx, col in enumerate(cursor.description):
 18 |         d[col[0]] = row[idx]
 19 |     return d
 20 | 
 21 | 
 22 | class Extractor:
 23 |     def __init__(self):
 24 |         self.base_url = 'https://leetcode.com'
 25 |         self.db_name = 'leetcode.db'
 26 |         cj = http.cookiejar.CookieJar()
 27 |         self.opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
 28 |         self.opener.addheaders = [
 29 |             ('Host', 'leetcode.com'),
 30 |             ('User-Agent',
 31 |              'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
 32 |         ]
 33 |         self.is_logged_in = False
 34 | 
 35 |     def login(self, user_name, password):
 36 |         if self.is_logged_in:
 37 |             return
 38 |         url = self.base_url + '/accounts/login/'
 39 |         with self.opener.open(url) as f:
 40 |             content = f.read().decode('utf-8')
 41 |         token = re.findall("name='csrfmiddlewaretoken'\svalue='(.*?)'", content)[0]
 42 |         post_data = {
 43 |             'csrfmiddlewaretoken': token,
 44 |             'login': user_name,
 45 |             'password': password
 46 |         }
 47 |         post_data = urllib.parse.urlencode(post_data)
 48 |         self.opener.addheaders.append(('Referer', url))
 49 |         with self.opener.open(url, data=post_data.encode()) as f:
 50 |             if f.read().decode().find('Successfully signed in') != -1:
 51 |                 self.is_logged_in = True
 52 |                 print('logged in')
 53 |             else:
 54 |                 print('failed to login in')
 55 |         self.opener.addheaders.pop()
 56 | 
 57 |     def get_problem_list(self):
 58 |         with self.opener.open(self.base_url + '/api/problems/algorithms/') as f:
 59 |             content = f.read().decode('utf-8')
 60 |         content = json.loads(content)
 61 |         return content['stat_status_pairs']
 62 | 
 63 |     def store_problem_list_to_db(self, problem_list):
 64 |         conn = sqlite3.connect(self.db_name)
 65 |         c = conn.cursor()
 66 |         c.execute(
 67 |             '''
 68 |                 CREATE TABLE IF NOT EXISTS problem (
 69 |                     id INTEGER,
 70 |                     title TEXT,
 71 |                     slug TEXT,
 72 |                     difficulty INTEGER,
 73 |                     paid_only INTEGER,
 74 |                     status TEXT,
 75 |                     total_acs INTEGER,
 76 |                     total_submitted INTEGER,
 77 |                     PRIMARY KEY(id))
 78 |             ''')
 79 |         c.execute('DELETE FROM problem')
 80 |         for problem in problem_list:
 81 |             c.execute(
 82 |                 '''
 83 |                     INSERT INTO problem 
 84 |                         (id, title, slug, difficulty, paid_only, status, total_acs, total_submitted) 
 85 |                     VALUES 
 86 |                         (?, ?, ?, ?, ?, ?, ?, ?)
 87 |                 ''',
 88 |                 (problem['stat']['question_id']
 89 |                  , problem['stat']['question__title']
 90 |                  , problem['stat']['question__title_slug']
 91 |                  , problem['difficulty']['level']
 92 |                  , 1 if problem['paid_only'] else 0
 93 |                  , problem['status']
 94 |                  , problem['stat']['total_acs']
 95 |                  , problem['stat']['total_submitted'])
 96 |             )
 97 |         conn.commit()
 98 |         conn.close()
 99 | 
100 |     def update_problem_list(self):
101 |         self.store_problem_list_to_db(self.get_problem_list())
102 | 
103 |     def get_problem_list_from_db(self):
104 |         conn = sqlite3.connect(self.db_name)
105 |         conn.row_factory = dict_factory
106 |         c = conn.cursor()
107 |         c.execute('SELECT * FROM problem')
108 |         return c.fetchall()
109 | 
110 |     def get_question_detail(self, title_slug):
111 |         QUERY = '''query getQuestionDetail($titleSlug: String!) {
112 |   isCurrentUserAuthenticated
113 |   question(titleSlug: $titleSlug) {
114 |     questionId
115 |     questionFrontendId
116 |     questionTitle
117 |     translatedTitle
118 |     questionTitleSlug
119 |     content
120 |     translatedContent
121 |     difficulty
122 |     stats
123 |     contributors
124 |     similarQuestions
125 |     discussUrl
126 |     mysqlSchemas
127 |     randomQuestionUrl
128 |     sessionId
129 |     categoryTitle
130 |     submitUrl
131 |     interpretUrl
132 |     codeDefinition
133 |     sampleTestCase
134 |     enableTestMode
135 |     metaData
136 |     enableRunCode
137 |     enableSubmit
138 |     judgerAvailable
139 |     infoVerified
140 |     envInfo
141 |     urlManager
142 |     article
143 |     questionDetailUrl
144 |     discussCategoryId
145 |     discussSolutionCategoryId
146 |     libraryUrl
147 |     companyTags {
148 |       name
149 |       slug
150 |       translatedName
151 |     }
152 |     topicTags {
153 |       name
154 |       slug
155 |       translatedName
156 |     }
157 |   }
158 |   interviewed {
159 |     interviewedUrl
160 |     companies {
161 |       id
162 |       name
163 |       slug
164 |     }
165 |     timeOptions {
166 |       id
167 |       name
168 |     }
169 |     stageOptions {
170 |       id
171 |       name
172 |     }
173 |   }
174 |   subscribeUrl
175 |   isPremium
176 |   loginUrl
177 | }'''
178 |         params = {
179 |             'query': QUERY,
180 |             'operationName': 'getQuestionDetail',
181 |             'variables': json.dumps({
182 |                 'titleSlug': title_slug
183 |             })
184 |         }
185 |         url = self.base_url + '/graphql' + '?' + urllib.parse.urlencode(params, quote_via=urllib.parse.quote).replace(
186 |             '%28', '(').replace('%29', ')').replace('%21', '!')
187 |         with self.opener.open(url) as f:
188 |             content = f.read().decode('utf-8')
189 |             content=json.loads(content)
190 |             return content['data']['question']
191 | 
192 |     def get_description(self, url, file_path):
193 |         with self.opener.open(url) as f:
194 |             content = f.read().decode('utf-8')
195 |             root = etree.HTML(content)
196 |             result = root.xpath('//*[@id="descriptionContent"]//div[@class="question-description"]')
197 |             html = etree.tostring(result[0], encoding='utf-8')
198 |             with open(file_path, 'wb') as f:
199 |                 f.write(html)
200 |         return file_path
201 | 
202 |     def extract_descriptions(self):
203 |         conn = sqlite3.connect(self.db_name)
204 |         c = conn.cursor()
205 |         c.execute('CREATE TABLE IF NOT EXISTS description (title TEXT, path TEXT, PRIMARY KEY(title))')
206 |         c.execute(
207 |             '''
208 |                 SELECT a.id, a.title, a.slug 
209 |                 FROM problem a 
210 |                 LEFT JOIN description b 
211 |                 ON a.title=b.title 
212 |                 WHERE a.paid_only=0 AND b.title IS NULL
213 |             ''')
214 |         problems = c.fetchall()
215 |         dir_path = 'descriptions/'
216 |         os.makedirs(dir_path, exist_ok=True)
217 |         with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
218 |             futures = {
219 |                 executor.submit(self.get_description
220 |                                 , self.base_url + '/problems/' + problem[2] + '/description/'
221 |                                 , os.path.join(dir_path, str(problem[0]).zfill(3) + '. ' + problem[1] + '.html')):
222 |                     problem[1] for problem in problems}
223 |             for future in concurrent.futures.as_completed(futures):
224 |                 title = futures[future]
225 |                 try:
226 |                     file_path = future.result()
227 |                 except Exception as e:
228 |                     print('%r generated an exception: %s' % (title, e))
229 |                 else:
230 |                     if file_path:
231 |                         c.execute('INSERT INTO description (title, path) VALUES (?, ?)', (title, file_path))
232 |         conn.commit()
233 |         conn.close()
234 | 
235 |     def sync_description_db_and_file(self):
236 |         conn = sqlite3.connect(self.db_name)
237 |         conn.row_factory = dict_factory
238 |         c = conn.cursor()
239 |         c.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="description"')
240 |         if not c.fetchone():
241 |             return
242 |         c.execute('SELECT title,path FROM description')
243 |         descriptions = c.fetchall()
244 |         for description in descriptions:
245 |             if not os.path.exists(description['path']):
246 |                 c.execute('DELETE FROM description WHERE title=?', (description['title'],))
247 |         conn.commit()
248 |         conn.close()
249 | 
250 |     def update_descriptions(self):
251 |         self.sync_description_db_and_file()
252 |         self.extract_descriptions()
253 | 
254 |     def get_submission_list(self):
255 |         if not self.is_logged_in:
256 |             print('should login first')
257 |             return
258 |         result = []
259 |         offset = 0
260 |         LIMIT = 100
261 |         while True:
262 |             url = self.base_url + '/api/submissions/?offset=' + str(offset) + '&limit=' + str(LIMIT)
263 |             with self.opener.open(url) as f:
264 |                 content = f.read().decode('utf-8')
265 |             content = json.loads(content)
266 |             result.extend(content['submissions_dump'])
267 |             if not content['has_next']:
268 |                 return result
269 |             offset += LIMIT
270 | 
271 |     def store_submission_list_to_db(self, submission_list):
272 |         conn = sqlite3.connect(self.db_name)
273 |         c = conn.cursor()
274 |         c.execute(
275 |             '''
276 |                 CREATE TABLE IF NOT EXISTS submission (
277 |                     lang TEXT,
278 |                     title TEXT,
279 |                     url TEXT,
280 |                     downloaded INTEGER DEFAULT 0,
281 |                     path TEXT,
282 |                     removed INTEGER DEFAULT 0,
283 |                     PRIMARY KEY(url))
284 |             ''')
285 |         for submission in submission_list:
286 |             if submission['status_display'] == 'Accepted':
287 |                 c.execute('INSERT OR IGNORE INTO submission (lang, title, url) VALUES (?, ?, ?)'
288 |                           , (submission['lang'], submission['title'], submission['url']))
289 |         conn.commit()
290 |         conn.close()
291 | 
292 |     def update_submission_list(self):
293 |         self.store_submission_list_to_db(self.get_submission_list())
294 | 
295 |     def get_submission(self, url, file_path):
296 |         with self.opener.open(url) as f:
297 |             content = f.read().decode('utf-8')
298 |             code = re.findall("submissionCode:\s'(.*?)',", content)[0]
299 |             code = codecs.decode(code, 'unicode-escape')
300 |             code = code.replace('\r\n', '\n')
301 |             with open(file_path, 'w', encoding='utf-8') as f:
302 |                 f.write(code)
303 |         return file_path
304 | 
305 |     def extract_submissions(self):
306 |         conn = sqlite3.connect(self.db_name)
307 |         c = conn.cursor()
308 |         c.execute('SELECT url FROM submission WHERE downloaded=0 AND removed=0')
309 |         urls = c.fetchall()
310 |         urls = [url[0] for url in urls]
311 |         dir_path = 'submissions/'
312 |         os.makedirs(dir_path, exist_ok=True)
313 |         with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
314 |             futures = {executor.submit(self.get_submission, self.base_url + url,
315 |                                        os.path.join(dir_path, url.split('/')[-2])): url for url in urls}
316 |             for future in concurrent.futures.as_completed(futures):
317 |                 url = futures[future]
318 |                 try:
319 |                     file_path = future.result()
320 |                 except Exception as e:
321 |                     print('%r generated an exception: %s' % (url, e))
322 |                 else:
323 |                     if file_path:
324 |                         c.execute('UPDATE submission SET downloaded=1,path=? WHERE url=?', (file_path, url))
325 |         conn.commit()
326 |         conn.close()
327 | 
328 |     def sync_submission_db_and_file(self):
329 |         conn = sqlite3.connect(self.db_name)
330 |         conn.row_factory = dict_factory
331 |         c = conn.cursor()
332 |         c.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="submission"')
333 |         if not c.fetchone():
334 |             return
335 |         c.execute('SELECT url,path FROM submission')
336 |         submissions = c.fetchall()
337 |         for submission in submissions:
338 |             if not os.path.exists(submission['path']):
339 |                 c.execute('DELETE FROM submission WHERE url=?', (submission['url'],))
340 |         conn.commit()
341 |         conn.close()
342 | 
343 |     def update_submissions(self):
344 |         self.sync_submission_db_and_file()
345 |         self.update_submission_list()
346 |         self.extract_submissions()
347 | 
348 |     def output_submissions(self, dir_path='out_submissions/', latest_only=True):
349 |         def lang_to_language(lang):
350 |             if lang == 'python' or lang == 'python3':
351 |                 return 'Python'
352 |             if lang == 'java':
353 |                 return 'Java'
354 |             if lang == 'cpp':
355 |                 return 'C++'
356 | 
357 |         def lang_to_extension(lang):
358 |             if lang == 'python' or lang == 'python3':
359 |                 return '.py'
360 |             if lang == 'java':
361 |                 return '.java'
362 |             if lang == 'cpp':
363 |                 return '.cpp'
364 | 
365 |         os.makedirs(dir_path, exist_ok=True)
366 |         conn = sqlite3.connect(self.db_name)
367 |         c = conn.cursor()
368 |         c.execute(
369 |             '''
370 |                 SELECT problem.id, submission.title 
371 |                 FROM submission 
372 |                 LEFT JOIN problem 
373 |                     ON submission.title=problem.title 
374 |                 WHERE submission.downloaded=1 AND submission.removed=0 
375 |                 GROUP BY submission.title
376 |             ''')
377 |         titles = c.fetchall()
378 |         for id, title in titles:
379 |             if not id:
380 |                 continue
381 |             problem_dir = os.path.join(dir_path, str(id).zfill(3) + '. ' + title)
382 |             os.makedirs(problem_dir, exist_ok=True)
383 |             c.execute('SELECT lang FROM submission WHERE downloaded=1 AND removed=0 AND title=?', (title,))
384 |             langs = c.fetchall()
385 |             langs = [lang[0] for lang in langs]
386 |             for lang in langs:
387 |                 current_dir = os.path.join(problem_dir, lang_to_language(lang))
388 |                 os.makedirs(current_dir, exist_ok=True)
389 |                 c.execute(
390 |                     'SELECT path FROM submission WHERE downloaded=1 AND removed=0 AND title=? AND lang=? ORDER BY url',
391 |                     (title, lang))
392 |                 orig_file_paths = c.fetchall()
393 |                 orig_file_paths = [orig_file_path[0] for orig_file_path in orig_file_paths]
394 |                 shutil.copyfile(orig_file_paths[0], os.path.join(current_dir, 'Solution' + lang_to_extension(lang)))
395 |                 if latest_only:
396 |                     continue
397 |                 for i in range(1, len(orig_file_paths)):
398 |                     shutil.copyfile(orig_file_paths[0],
399 |                                     os.path.join(current_dir, 'Solution ' + 'I' * (i + 1) + lang_to_extension(lang)))
400 | 
401 |         conn.close()
402 | 
403 |     def save_problem_list(self, file_name, file_type='csv', language='Chinese'):
404 |         def preprocess(problem_list):
405 |             for problem in problem_list:
406 |                 problem['acceptance'] = problem['total_acs'] / problem['total_submitted']
407 |                 problem['status'] = problem['status'] == 'ac'
408 | 
409 |         def to_locale(problem_list, language_dict):
410 |             problem_list = [{language_dict[key]: value for (key, value) in problem.items()} for problem in problem_list]
411 | 
412 |             for problem in problem_list:
413 |                 problem[language_dict['difficulty']] = language_dict['level'][problem[language_dict['difficulty']]]
414 |                 problem[language_dict['paid_only']] = language_dict['bool'][problem[language_dict['paid_only']]]
415 |                 problem[language_dict['status']] = language_dict['bool'][problem[language_dict['status']]]
416 |             return problem_list
417 | 
418 |         problem_list = self.get_problem_list_from_db()
419 |         preprocess(problem_list)
420 |         language_dict = self.get_language_dict(language)
421 |         problem_list = to_locale(problem_list, language_dict)
422 |         if file_type == 'csv':
423 |             self.save_problem_list_as_csv(problem_list, file_name)
424 |         elif file_type == 'excel':
425 |             self.save_problem_list_as_excel(problem_list, file_name, language_dict)
426 | 
427 |     def save_problem_list_as_csv(self, problem_list, file_name):
428 |         with open(file_name, 'w', encoding='utf-8', newline='') as f:
429 |             import csv
430 |             writer = csv.DictWriter(f, fieldnames=problem_list[0].keys())
431 |             writer.writeheader()
432 |             writer.writerows(problem_list)
433 | 
434 |     def save_problem_list_as_excel(self, problem_list, file_name, language_dict):
435 |         from openpyxl import Workbook
436 |         from openpyxl.styles import NamedStyle
437 |         from openpyxl.formatting.rule import CellIsRule, DataBarRule
438 |         from openpyxl.styles import PatternFill
439 | 
440 |         def format_cell_style(ws, language_dict):
441 |             style_int = NamedStyle('int')
442 |             style_int.number_format = '0'
443 |             style_str = NamedStyle('str')
444 |             style_str.number_format = '@'
445 |             style_pcnt = NamedStyle('pcnt')
446 |             style_pcnt.number_format = '0.0%'
447 |             for cell in ws[column_index[language_dict['id']]][1:]:
448 |                 cell.style = style_int
449 |             for cell in ws[column_index[language_dict['total_submitted']]][1:]:
450 |                 cell.style = style_int
451 |             for cell in ws[column_index[language_dict['total_acs']]][1:]:
452 |                 cell.style = style_int
453 |             for cell in ws[column_index[language_dict['title']]][1:]:
454 |                 cell.style = style_str
455 |             for cell in ws[column_index[language_dict['slug']]][1:]:
456 |                 cell.style = style_str
457 |             for cell in ws[column_index[language_dict['difficulty']]][1:]:
458 |                 cell.style = style_str
459 |             for cell in ws[column_index[language_dict['paid_only']]][1:]:
460 |                 cell.style = style_str
461 |             for cell in ws[column_index[language_dict['status']]][1:]:
462 |                 cell.style = style_str
463 |             for cell in ws[column_index[language_dict['acceptance']]][1:]:
464 |                 cell.style = style_pcnt
465 | 
466 |         def conditional_formatting(ws, language_dict):
467 |             def get_entire_column(index):
468 |                 return index + '1:' + index + '1048576'
469 | 
470 |             red_color = 'ffc7ce'
471 |             green_color = 'c2efcf'
472 |             yellow_color = 'ffeba2'
473 | 
474 |             red_fill = PatternFill(start_color=red_color, end_color=red_color, fill_type='solid')
475 |             green_fill = PatternFill(start_color=green_color, end_color=green_color, fill_type='solid')
476 |             yellow_fill = PatternFill(start_color=yellow_color, end_color=yellow_color, fill_type='solid')
477 | 
478 |             ws.conditional_formatting.add(get_entire_column(column_index[language_dict['difficulty']]),
479 |                                           CellIsRule(operator='equal', formula=['"' + language_dict['level'][1] + '"'],
480 |                                                      stopIfTrue=False, fill=green_fill))
481 |             ws.conditional_formatting.add(get_entire_column(column_index[language_dict['difficulty']]),
482 |                                           CellIsRule(operator='equal', formula=['"' + language_dict['level'][2] + '"'],
483 |                                                      stopIfTrue=False, fill=yellow_fill))
484 |             ws.conditional_formatting.add(get_entire_column(column_index[language_dict['difficulty']]),
485 |                                           CellIsRule(operator='equal', formula=['"' + language_dict['level'][3] + '"'],
486 |                                                      stopIfTrue=False, fill=red_fill))
487 | 
488 |             ws.conditional_formatting.add(get_entire_column(column_index[language_dict['paid_only']]),
489 |                                           CellIsRule(operator='equal',
490 |                                                      formula=['"' + language_dict['bool'][False] + '"'],
491 |                                                      stopIfTrue=False, fill=green_fill))
492 |             ws.conditional_formatting.add(get_entire_column(column_index[language_dict['paid_only']]),
493 |                                           CellIsRule(operator='equal',
494 |                                                      formula=['"' + language_dict['bool'][True] + '"'],
495 |                                                      stopIfTrue=False, fill=red_fill))
496 | 
497 |             ws.conditional_formatting.add(get_entire_column(column_index[language_dict['status']]),
498 |                                           CellIsRule(operator='equal',
499 |                                                      formula=['"' + language_dict['bool'][False] + '"'],
500 |                                                      stopIfTrue=False, fill=red_fill))
501 |             ws.conditional_formatting.add(get_entire_column(column_index[language_dict['status']]),
502 |                                           CellIsRule(operator='equal',
503 |                                                      formula=['"' + language_dict['bool'][True] + '"'],
504 |                                                      stopIfTrue=False, fill=green_fill))
505 | 
506 |             ws.conditional_formatting.add(get_entire_column(column_index[language_dict['acceptance']]),
507 |                                           DataBarRule(start_type='percentile', start_value=0, end_type='percentile',
508 |                                                       end_value=100, color="FF638EC6", showValue='None'))
509 | 
510 |         wb = Workbook()
511 |         ws = wb.active
512 |         ws.append(tuple(problem_list[0].keys()))
513 |         column_index = {item.value: item.column for item in ws[1]}
514 |         rows = [{column_index[key]: value for (key, value) in problem.items()} for problem in problem_list]
515 |         for row in rows:
516 |             ws.append(row)
517 |         format_cell_style(ws, language_dict)
518 |         conditional_formatting(ws, language_dict)
519 |         wb.save(file_name)
520 | 
521 |     def get_language_dict(self, language):
522 |         language_dict = None
523 |         if language == 'Chinese':
524 |             language_dict = {
525 |                 'id': '题号',
526 |                 'title': '标题',
527 |                 'slug': '链接',
528 |                 'difficulty': '难度',
529 |                 'total_submitted': '总提交数',
530 |                 'total_acs': '总通过数',
531 |                 'acceptance': '通过率',
532 |                 'paid_only': '付费',
533 |                 'status': '已解决',
534 |                 'level': {
535 |                     1: '简单',
536 |                     2: '中等',
537 |                     3: '难'
538 |                 },
539 |                 'bool': {
540 |                     True: '是',
541 |                     False: '否'
542 |                 }
543 |             }
544 |         elif language == 'English':
545 |             language_dict = {
546 |                 'id': '#',
547 |                 'title': 'Title',
548 |                 'slug': 'Link',
549 |                 'difficulty': 'Difficulty',
550 |                 'total_submitted': 'Total Submitted',
551 |                 'total_acs': 'Total Accepted',
552 |                 'acceptance': 'Acceptance',
553 |                 'paid_only': 'Paid Only',
554 |                 'status': 'Solved',
555 |                 'level': {
556 |                     1: 'Easy',
557 |                     2: 'Medium',
558 |                     3: 'Hard'
559 |                 },
560 |                 'bool': {
561 |                     True: 'Yes',
562 |                     False: 'No'
563 |                 }
564 |             }
565 |         return language_dict
566 | 
567 | 
if __name__ == '__main__':
    # Manual check: fetch one problem's details and print them.
    extractor = Extractor()
    result = extractor.get_question_detail('merge-two-sorted-lists')
    print(result)
572 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # LeetCode-Spider
  2 | 
  3 | Python实现的LeetCode爬虫。爬取LeetCode题目描述和提交的代码。
  4 | 
  5 | ## 特点
  6 | 
  7 | - 支持爬取题目列表，保存为本地CSV/Excel文件。
  8 | - 支持爬取题目描述，保存为本地HTML文件。
  9 | - 支持爬取用户提交的代码，保存为如 \*.py、\*.java、\*.cpp 等源码。
 10 | - 高速并发下载题目描述和提交的代码。
 11 | - 支持增量更新，当本地有缺损或LeetCode有新内容（题目/提交的代码）时，以增量形式更新。
 12 | 
 13 | ## 使用
 14 | 
 15 | 参考example.py。
 16 | 
 17 | ### 克隆或下载本仓库
 18 | 
 19 | 使用`git clone`或直接下载本仓库代码，并切换工作目录到本项目根目录
 20 | 
 21 | ### 安装依赖
 22 | 
 23 | 本项目需要用到第三方库`lxml`和`openpyxl`（如果需要保存为Excel），可通过如下`pip`命令安装：
 24 | 
 25 | ```bash
 26 | pip3 install lxml
 27 | pip3 install openpyxl
 28 | ```
 29 | 
 30 | ### 获取问题列表（必须）
 31 | 
 32 | ```python
 33 | from Extractor import Extractor
 34 | 
 35 | extractor = Extractor()
 36 | 
 37 | # 获取问题列表（保存在数据库leetcode.db中，若希望获取问题状态（是否ac），需首先登录）
 38 | # extractor.login('foo@bar.com', '123456')
 39 | extractor.update_problem_list()
 40 | ```
 41 | 
 42 | 获取得到的数据保存在leetcode.db数据库中。
 43 | 
 44 | ### 导出问题列表
 45 | 
 46 | 可将问题列表导出为CSV/Excel格式文件，参考下面的截图。
 47 | 
 48 | ```python
 49 | # 导出问题列表为中文CSV文件
 50 | extractor.save_problem_list('problems.csv')
 51 | 
 52 | # 导出问题列表为英文Excel文件
 53 | extractor.save_problem_list('problems.xlsx', 'excel', 'English')
 54 | ```
 55 | 
 56 | ![CSV][csv]
 57 | 
 58 | ![Excel][excel]
 59 | 
 60 | ### 获取问题描述HTML文件
 61 | 
 62 | **需先获取问题列表**
 63 | 
 64 | ```python
 65 | # 获取问题描述HTML文件（保存在descriptions文件夹下，需要先获取问题列表）
 66 | extractor.update_descriptions()
 67 | ```
 68 | 
 69 | 根据问题列表增量多线程并发下载新的问题描述，并将HTML文件保存到descriptions文件夹下。文件夹结构为：
 70 | 
 71 | ```
 72 | descriptions
 73 |     001. Two Sum.html
 74 |     002. Add Two Numbers.html
 75 |     003. Longest Substring Without Repeating Characters.html
 76 |     ...
 77 | ```
 78 | 
 79 | ### 获取提交的代码
 80 | 
 81 | **需先获取问题列表**
 82 | 
 83 | ```python
 84 | # 获取提交的代码（保存在submissions文件夹下，需要先获取问题列表，并登录）
 85 | extractor.login('foo@bar.com', '123456')
 86 | extractor.update_submissions()
 87 | ```
 88 | 
 89 | 这里需要先输入用户名和密码登录，然后才能获取到此用户提交的代码。
 90 | 
 91 | 根据问题列表增量多线程并发下载新的提交代码，并将其保存到submissions文件夹下。文件夹结构为：
 92 | 
 93 | ```
 94 | submissions
 95 |     24152714
 96 |     24153189
 97 |     24165875
 98 |     ...
 99 | ```
100 | 
101 | ### 导出提交的代码
102 | 
103 | **需先获取提交的代码**
104 | 
105 | ```python
106 | # 导出提交的代码（保存在out_submissions文件夹下，需先获取提交的代码）
107 | extractor.output_submissions()
108 | ```
109 | 
110 | 将之前下载的提交代码按题目和语言整理导出（默认仅导出每种语言的最新提交版本），保存到out_submissions文件夹下。文件夹结构为：
111 | 
112 | ```
113 | out_submissions
114 |     001. Two Sum
115 |         C++
116 |             Solution.cpp
117 |         Java
118 |             Solution.java
119 |         Python
120 |             Solution
121 |     002. Add Two Numbers
122 |         C++
123 |             Solution.cpp
124 |         Java
125 |             Solution.java
126 |         Python
127 |             Solution
128 |     ...
129 | ```
130 | 
131 | ## 第三方依赖
132 | 
133 | - [lxml](http://lxml.de/)
134 | - [openpyxl](https://openpyxl.readthedocs.io/)
135 | 
136 | [csv]: screenshot/csv.png
137 | 
138 | [excel]: screenshot/excel.png
139 | 


--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
 1 | from Extractor import Extractor
 2 | 
 3 | extractor = Extractor()
 4 | 
 5 | # Fetch the problem list (stored in the leetcode.db database; to also get each problem's status, i.e. whether it is accepted, log in first)
 6 | # extractor.login('foo@bar.com', '123456')
 7 | extractor.update_problem_list()
 8 | 
 9 | # Export the problem list as a Chinese CSV file
10 | extractor.save_problem_list('problems.csv')
11 | 
12 | # Export the problem list as an English Excel file
13 | extractor.save_problem_list('problems.xlsx', 'excel', 'English')
14 | 
15 | # Fetch the problem description HTML files (saved under the descriptions folder; the problem list must be fetched first)
16 | extractor.update_descriptions()
17 | 
18 | # Fetch the submitted code (saved under the submissions folder; the problem list must be fetched first, and a login is required)
19 | extractor.login('foo@bar.com', '123456')
20 | extractor.update_submissions()
21 | 
22 | # Export the submitted code (saved under the out_submissions folder; the submitted code must be fetched first)
23 | extractor.output_submissions()
24 | 


--------------------------------------------------------------------------------
/screenshot/csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhantong/leetcode-spider/54883459e4c5af40d5bbfb324eaae45e8902855e/screenshot/csv.png


--------------------------------------------------------------------------------
/screenshot/excel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhantong/leetcode-spider/54883459e4c5af40d5bbfb324eaae45e8902855e/screenshot/excel.png


--------------------------------------------------------------------------------
/templates/duplicate.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | {#
 3 |   Renders one table row per entry of `problems`. Each entry is indexed by
 4 |   'lang', 'title', 'url' and 'path'. Every row carries a link to
 5 |   view/<path> and a form that POSTs the entry's url to `remove`.
 6 | #}
 7 | <table>
 8 |     <tr>
 9 |         <th>language</th>
10 |         <th>title</th>
11 |         <th>url</th>
12 |         <th>path</th>
13 |         <th>view</th>
14 |         <th>delete</th>
15 |     </tr>
16 |     {% for problem in problems %}
17 |     <tr>
18 |         <td>{{ problem['lang'] }}</td>
19 |         <td>{{ problem['title'] }}</td>
20 |         <td>{{ problem['url'] }}</td>
21 |         <td>{{ problem['path'] }}</td>
22 |         {# urlencode leaves '/' alone but escapes characters such as '#' and '?', which would otherwise end the path part of the link early #}
23 |         <td><a href="view/{{ problem['path']|urlencode }}">查看</a></td>
24 |         <td>
25 |             {# The hidden field carries the entry's url as the only form value sent to `remove` #}
26 |             <form action="remove" method="post">
27 |                 <input type="hidden" name="url" value="{{ problem['url'] }}">
28 |                 <input type="submit" value="删除">
29 |             </form>
30 |         </td>
31 |     </tr>
32 |     {% endfor %}
33 | </table>


--------------------------------------------------------------------------------