├── .gitignore
├── README.md
├── config.ini
├── crawler.py
├── grasp.py
└── requirement.txt
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
*.swp

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
README
======

TODO:
------
- [ ] Daemonized crawler that saves the latest accepted submissions in real time.

Features:
------------
1. Save filtered problems from leetcode.com
2. Save all your latest accepted submissions

Note:
-----
* To save your submissions, make sure the fields in `config.ini` have been filled in correctly.
* Problems that have already been saved in the specified language are skipped.
* When saving problems, you cannot specify a category and a tag at the same time.
* Multiple tags/categories/difficulties/numbers are separated by **spaces**, not **commas**.
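
For example, a filled-in `config.ini` looks like this (placeholder credentials, substitute your own):
```
[USER]
username = your_leetcode_username
password = your_leetcode_password
```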

Dependencies:
------------
* python >= 3.4
* beautifulsoup4 >= 4.4.1
* requests >= 2.9.1

Commands:
------
```
usage: grasp.py [-h] {show_tags,show_cate,save,save_sub,daemon} ...

positional arguments:
  {show_tags,show_cate,save,save_sub,daemon}
                        Available commands
    show_tags           Display available tags or problems with specified tags
    show_cate           Display available categories or problems in specified
                        categories
    save                Save filtered problems in the current directory.
    save_sub            Save your latest accepted submissions.
    daemon              Daemonized crawler.

optional arguments:
  -h, --help            show this help message and exit
```

#### For each sub-command's options, run `grasp.py <command> -h/--help`


Example:
--------
* save all your latest accepted submissions
```
$ grasp.py save_sub
```
* save all your latest accepted submissions in the specified language
```
$ grasp.py save_sub -l python
```
* save the problems whose numbers are in {1,2,3,4,5,12,16} from the `algorithms` category, along with the default C and Python code:
```
$ grasp.py save -c algorithms -n 1-5 12 16 -l c python
```
* save the easy and hard problems in the `database` category, descriptions only:
```
$ grasp.py save -c database -d easy hard
```
* save the hard problems with the tag `dynamic-programming`, along with the default C# code:
```
$ grasp.py save -t dynamic-programming -d hard -l c#
```
* save all problems and all default code:
```
$ grasp.py save -t all -l all
$ grasp.py save -c all -l all
```
* display available tags:
```
$ grasp.py show_tags
```
* display available categories:
```
$ grasp.py show_cate
```
* show problems with the specified tags:
```
$ grasp.py show_tags -t trie math
```
* show the easy problems in the `algorithms` category:
```
$ grasp.py show_cate -c algorithms -d easy
```

--------------------------------------------------------------------------------
/config.ini:
--------------------------------------------------------------------------------
[USER]
username =
password =

--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3 -O
# -*- coding: utf-8 -*-
import os
import re
import json
import requests
import itertools
import configparser
from urllib.parse import urljoin
from collections import defaultdict
from bs4 import BeautifulSoup, SoupStrainer
from concurrent.futures import ThreadPoolExecutor


ALL_CATEGORIES = ['algorithms', 'database', 'shell']
ALL_LANGUAGES = ['cpp', 'java', 'python', 'c', 'csharp', 'javascript', 'ruby', 'bash', 'mysql']


class PageError(Exception):

    def __init__(self, text, url):
        self.msg = text
        self.url = url

    def __str__(self):
        return '{}: {}'.format(self.msg, self.url)


class LoginError(Exception):

    def __str__(self):
        return 'Failed to log in! Please check your username and password in `config.ini`.'


class Crawler:
    def __init__(self, debug=False):
        self.BASEURL = 'https://leetcode.com/problemset/'
        self.DEBUG = debug
        self.BASEDIR = os.path.dirname(__file__)
        self.session = requests.Session()
        self.session.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0'

    def daemon(self, writer):
        import time
        self.login()
        while True:
            # The first page always holds the most recent submissions.
            first_page = next(self.get_submissions(ALL_LANGUAGES))
            writer.save_submissions(self, first_page)
            time.sleep(3)

    def get_soup(self, url, strainer=None):
        resp = self.session.get(url, timeout=10)
        return BeautifulSoup(resp.content, 'html.parser', parse_only=strainer)

    def get_tags(self):
        soup = self.get_soup(self.BASEURL, SoupStrainer(class_='list-group-item'))
        content = soup.find_all('a', onclick=None)
        tagdict = {}
        for item in content:
            if '/tag/' not in item['href']:
                continue
            count, title = list(item.stripped_strings)
            title = title.replace(' ', '-').lower()
            tagdict[title] = (count, urljoin(self.BASEURL, item['href']))
        return tagdict

    def login(self):
        config = configparser.ConfigParser()
        config.read(os.path.join(self.BASEDIR, 'config.ini'))
        username = config['USER']['username']
        password = config['USER']['password']
        loginurl = 'https://leetcode.com/accounts/login/'
        self.session.headers['Referer'] = loginurl
        # Fetch the login page first to obtain a CSRF token for the POST.
        self.session.get(loginurl)
        token = self.session.cookies['csrftoken']
        payload = {
            'csrfmiddlewaretoken': token,
            'login': username,
            'password': password
        }
        self.session.post(loginurl, data=payload)
        if not self.session.cookies.get('PHPSESSID'):
            raise LoginError()

    def get_submissions(self, specified_langs):
        submurl = 'https://leetcode.com/submissions/'
        strainer = SoupStrainer('tbody')
        memory = defaultdict(dict)
        for i in itertools.count(1):
            url = urljoin(submurl, str(i))
            soup = self.get_soup(url, strainer)
            rowlist = soup.find_all('tr')
            if not rowlist:
                break
            eachpage = defaultdict(dict)
            for row in rowlist:
                _, title, status, _, lang = list(row.stripped_strings)
                if status == 'Accepted':
                    title = title.replace(' ', '_')
                    # Keep only the most recent submission per (title, lang).
                    if not memory[title].get(lang):
                        memory[title][lang] = urljoin(self.BASEURL, row.find_all('a')[1]['href'])
                        eachpage[title][lang] = memory[title][lang]
            info = []
            for title in eachpage.keys():
                for lang in eachpage[title].keys():
                    if lang in specified_langs:
                        info.append((title, eachpage[title][lang], lang))
            yield info
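    # Illustrative note: each yielded value is one results page, a list of
    # (title, url, lang) tuples for newly seen accepted submissions, e.g.
    # [('Two_Sum', 'https://leetcode.com/submissions/detail/12345678/', 'python')]
    # (hypothetical title/URL, shown only to document the shape of the data).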

    def get_table(self, url):
        soup = self.get_soup(url)
        if soup.find(text=re.compile('available')):
            raise PageError('No Such Page', url)

        if '/tag/' in url:
            # Tag pages embed the problem table as a JavaScript array;
            # extract it and massage it into valid JSON.
            pat = re.compile(r'data: (\[.*\])', re.S | re.U)
            raw_script = soup.body.find_all('script')[3].text
            rawjson = pat.findall(raw_script)[0]
            rawjson = re.sub(r',\s*}', '}', rawjson)      # drop trailing commas in objects
            rawjson = re.sub(r'"\s*\+\s*"', '', rawjson)  # join JS string concatenations
            rawjson = ''.join(rawjson.rsplit(',', 1))     # drop the trailing comma in the list
            allproblems = json.loads(rawjson)
            table = list()
            for p in allproblems:
                title, diff, ac_or_not = p['title'], p['difficulty'], p['ac_or_not']
                title, diff, ac_or_not = (BeautifulSoup(title, 'html.parser').a,
                                          BeautifulSoup(diff, 'html.parser').text,
                                          BeautifulSoup(ac_or_not, 'html.parser').span['class'][0])
                ac_rate, idnum = p['ac_rate'], p['id']
                table.append((idnum, title.text, ac_rate, diff, title['href'], ac_or_not))
        else:
            tmp = soup.find(id='problemList').find_all('tr')[1:]
            table = [tuple(i.stripped_strings) + (i.a['href'], i.td.span['class'][0]) for i in tmp]

        return table

    def get_problems_list(self, url):
        content = self.get_table(url)

        if self.DEBUG:
            print("Grasped content:")
            print(content)

        for info in content:
            yield {'id': info[0],
                   'title': info[1].replace(' ', '_'),
                   'acceptance': info[2],
                   'difficulty': info[3].lower(),
                   'url': urljoin(self.BASEURL, info[4]),
                   'ac_or_not': info[5]
                   }


class Writer:
    def __init__(self, debug=False):
        self.DEBUG = debug
        self.BASEDIR = os.path.dirname(__file__)
        self.SAVENAME = {'c': 'solution.c',
                         'cpp': 'solution.cpp',
                         'ruby': 'solution.rb',
                         'javascript': 'solution.js',
                         'csharp': 'solution.cs',
                         'python': 'solution.py',
                         'bash': 'solution.sh',
                         'mysql': 'solution.sql',
                         'java': 'solution.java'}

    def print_to_file(self, text, path):
        with open(path, 'w') as fout:
            print(text.replace('\r\n', os.linesep), file=fout)
        if self.DEBUG:
            print('{} saved.'.format(path))

    def save_submissions(self, spider, info):

        def set_save_path(title, lang):
            if lang == 'bash':
                pdir = os.path.join(self.BASEDIR, 'shell', title)
            elif lang == 'mysql':
                pdir = os.path.join(self.BASEDIR, 'database', title)
            else:
                pdir = os.path.join(self.BASEDIR, 'algorithms', title)
            os.makedirs(pdir, exist_ok=True)
            return os.path.join(pdir, self.SAVENAME[lang])

        def executor(item):
            title, url, lang = item
            page = spider.session.get(url)
            pat = re.compile(r"vm\.code\.{} = '(.+)'".format(lang))
            code = pat.findall(page.text)[0]
            # The scraped code is a JS string literal; run it through the
            # JSON decoder to unescape it.
            jsoncode = json.loads('{"code": "%s"}' % code)
            codepath = set_save_path(title, lang)
            self.print_to_file(jsoncode['code'], codepath)

        with ThreadPoolExecutor(max_workers=15) as pool:
            pool.map(executor, info)

    def save_problems(self, spider, plist, subdir, langlist):

        def save_defaultcode(soup, pdir, langlist):
            tag = soup.find(lambda x: x.has_attr('ng-init'))
            rawjson = tag['ng-init']
            pat = re.compile(r'(\[.+\])')
            raw = pat.findall(rawjson)[0].replace("'", '"')  # ' -> "
            raw = ''.join(raw.rsplit(',', 1))  # remove the last ',' in the json list
            codelist = json.loads(raw)
            codelist = filter(lambda x: x['value'] in langlist, codelist)

            codedict = {i['value']: i['defaultCode'] for i in codelist}

            for lang in codedict.keys():
                codepath = os.path.join(pdir, self.SAVENAME[lang])
                if not os.path.isfile(codepath):
                    self.print_to_file(codedict[lang], codepath)
                elif self.DEBUG:
                    print('{} already exists!'.format(codepath))

        def save_description(soup, pdir):
            descpath = os.path.join(pdir, 'description.txt')
            if not os.path.isfile(descpath):
                desc = soup.find(class_='question-content').text
                self.print_to_file(desc, descpath)
            elif self.DEBUG:
                print('{} already exists!'.format(descpath))

        def executor(info):
            soup = spider.get_soup(info['url'], SoupStrainer(class_='col-md-12'))
            pdir = os.path.join(self.BASEDIR, subdir, info['title'])
            os.makedirs(pdir, exist_ok=True)
            save_description(soup, pdir)
            save_defaultcode(soup, pdir, langlist)

        with ThreadPoolExecutor(max_workers=15) as pool:
            pool.map(executor, plist)
        print('All done!')
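

# A minimal, hypothetical smoke test for this module (not part of grasp.py's
# normal flow): running `python3 crawler.py` anonymously lists the available
# tags. Network access to leetcode.com is assumed.
if __name__ == '__main__':
    c = Crawler(debug=True)
    for tag, (count, url) in sorted(c.get_tags().items()):
        print('{:<25} {:>5}  {}'.format(tag, count, url))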

--------------------------------------------------------------------------------
/grasp.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3 -O
# -*- coding: utf-8 -*-
import os
import sys
import argparse
import crawler
from crawler import ALL_LANGUAGES, ALL_CATEGORIES
from urllib.parse import urljoin

####################
# This piece of code mainly comes from @vamin on Stack Overflow.
# See http://stackoverflow.com/a/25334100/4725840
# Thank him very much! :D
class CustomFormatter(argparse.HelpFormatter):
    def _format_action_invocation(self, action):
        if not action.option_strings:
            metavar, = self._metavar_formatter(action, action.dest)(1)
            return metavar
        else:
            # if the Optional doesn't take a value, format is:
            #    -s, --long
            if action.nargs == 0:
                return ', '.join(action.option_strings)
            # if the Optional takes a value, format is:
            #    -s, --long ARGS
            else:
                default = action.dest.upper()
                args_string = self._format_args(action, default)
                option_string = ', '.join(action.option_strings)
                return '{} {}'.format(option_string, args_string)
####################

def get_filtered_problems(plist, flist):
    # Apply every predicate in flist, keeping only the problems that
    # satisfy all of them.
    for filt in flist:
        plist = filter(filt, plist)
    return list(plist)
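
# For example (hypothetical predicates), keeping only the easy problems that
# have not been accepted yet:
#
#     todo = get_filtered_problems(problems, [lambda p: p['difficulty'] == 'easy',
#                                             lambda p: p['ac_or_not'] != 'ac'])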

def print_problems(spider, items, urllist, filter_list):
    for item, url in zip(items, urllist):
        print('The problems under <{}> are:'.format(item))
        plist = get_filtered_problems(spider.get_problems_list(url), filter_list)
        maxlen = max(len(pro['title']) for pro in plist)
        head = '{0:<10} {1:<5} {2:<{tlen}} {3:<12} {4:<12}'
        print(head.format('ac_or_not', 'id', 'title', 'acceptance', 'difficulty', tlen=maxlen))
        total = len(plist)
        ac, notac = 0, 0
        for pro in plist:
            if pro['ac_or_not'] == 'ac':
                ac += 1
            elif pro['ac_or_not'] == 'notac':
                notac += 1
            print(head.format(pro['ac_or_not'], pro['id'], pro['title'],
                              pro['acceptance'], pro['difficulty'], tlen=maxlen))

        print('AC: {}'.format(ac))
        print('NotAC: {}'.format(notac))
        print('Unknown: {}'.format(total-ac-notac))
        print('Total: {}'.format(total))
        print(os.linesep)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-n', '--number',
                             nargs='+',
                             help="Specify the question numbers")
    base_parser.add_argument('-d', '--difficulty',
                             nargs='+',
                             choices=['easy', 'medium', 'hard'],
                             help="Specify the difficulty.\n"
                                  "If not specified, all problems will be grasped.")
    base_parser.add_argument('-v', '--verbose',
                             action="store_true",
                             default=False,
                             help="Verbose output")
    base_parser.add_argument('--login',
                             action="store_true",
                             default=False,
                             help="Log in and display extra information")

    subparsers = parser.add_subparsers(help='Available commands', dest='command')


    tag_parser = subparsers.add_parser('show_tags',
                                       parents=[base_parser],
                                       formatter_class=CustomFormatter,
                                       help='Display available tags or problems with specified tags')
    tag_parser.add_argument('-t', '--tag',
                            nargs='+',
                            help="Specify the tags")


    cat_parser = subparsers.add_parser('show_cate',
                                       parents=[base_parser],
                                       formatter_class=CustomFormatter,
                                       help='Display available categories or problems in specified categories')
    cat_parser.add_argument('-c', '--category',
                            nargs='+',
                            choices=ALL_CATEGORIES + ['all'],
                            help="Specify the categories")


    sav_parser = subparsers.add_parser('save',
                                       parents=[base_parser],
                                       formatter_class=CustomFormatter,
                                       help='Save filtered problems in the current directory.')
    sav_parser.add_argument('-l', '--language',
                            nargs='+',
                            default=[],
                            choices=['all', 'cpp', 'java', 'python', 'c', 'c#', 'js', 'ruby', 'bash', 'mysql'],
                            help="Specify the languages.\n"
                                 "If not specified, only the descriptions will be saved.")
    sav_group = sav_parser.add_mutually_exclusive_group(required=True)
    sav_group.add_argument('-c', '--category',
                           nargs='+',
                           choices=ALL_CATEGORIES + ['all'],
                           help="Specify the categories")
    sav_group.add_argument('-t', '--tag',
                           nargs='+',
                           help="Specify the tags")


    sav_sub_parser = subparsers.add_parser('save_sub',
                                           formatter_class=CustomFormatter,
                                           help='Save your latest accepted submissions.')
    sav_sub_parser.add_argument('-l', '--language',
                                nargs='+',
                                default=[],
                                choices=['all', 'cpp', 'java', 'python', 'c', 'c#', 'js', 'ruby', 'bash', 'mysql'],
                                help="Specify the languages.\n"
                                     "If not specified, all your latest accepted submissions will be saved.")
    sav_sub_parser.add_argument('-v', '--verbose',
                                action="store_true",
                                default=False,
                                help="Verbose output")

    daemon_parser = subparsers.add_parser('daemon',
                                          formatter_class=CustomFormatter,
                                          help='Daemonized crawler.')
    daemon_parser.add_argument('-v', '--verbose',
                               action="store_true",
                               default=False,
                               help="Verbose output")

    if len(sys.argv) > 1:
        args = parser.parse_args()
    else:
        parser.print_help()
        sys.exit(1)

    filter_list = []

    argsDict = vars(args)

    if argsDict.get('number'):
        specified_numbers = set()
        for n in args.number:
            if n.isdigit():
                specified_numbers.add(n)
            elif '-' in n:
                # Expand a range like '1-5' into the individual numbers.
                b, e = n.split('-')
                specified_numbers.update({str(i) for i in range(int(b), int(e)+1)})
        filter_list.append(lambda x: x['id'] in specified_numbers)

        if args.verbose:
            print('Specified numbers are: {}'.format(specified_numbers))
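        # e.g. `-n 1-5 12 16` makes specified_numbers == {'1','2','3','4','5','12','16'},
        # which is compared against the problem 'id' field (also a string).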

    if argsDict.get('difficulty'):
        filter_list.append(lambda x: x['difficulty'] in args.difficulty)

        if args.verbose:
            print('Specified difficulty is: {}'.format(args.difficulty))

    if argsDict.get('language'):
        specified_langs = []
        for l in set(args.language):
            if l == 'all':
                specified_langs = ALL_LANGUAGES
                break
            elif l == 'c#':
                specified_langs.append('csharp')
            elif l == 'js':
                specified_langs.append('javascript')
            else:
                specified_langs.append(l)
        args.language = specified_langs
        if args.verbose:
            print('Specified languages are: {}'.format(', '.join(specified_langs)))

    c = crawler.Crawler(debug=args.verbose)
    w = crawler.Writer(debug=args.verbose)

    if args.command == 'daemon':
        # Never returns: polls the first submissions page forever.
        c.daemon(w)

    if argsDict.get('login'):
        c.login()

    if argsDict.get('category'):
        if 'all' in args.category:
            args.category = ALL_CATEGORIES
        L = args.category
        urllist = [urljoin(c.BASEURL, i) for i in L]

        if args.verbose:
            print('Specified categories are: {}'.format(args.category))

    elif argsDict.get('tag'):
        w.BASEDIR = os.path.join(w.BASEDIR, 'Tag')
        alltags = c.get_tags()
        if 'all' in args.tag:
            args.tag = list(alltags.keys())
        L = args.tag
        urllist = [alltags[i][1] for i in L]

        if args.verbose:
            print('Specified tags are: {}'.format(args.tag))

    if args.command == 'show_tags':
        if not args.tag:
            print('Available tags are:')
            print(os.linesep.join(sorted(c.get_tags().keys())))
        else:
            print_problems(c, args.tag, urllist, filter_list)

    elif args.command == 'show_cate':
        if not args.category:
            print('Available categories are: {}'.format(', '.join(ALL_CATEGORIES)))
        else:
            print_problems(c, args.category, urllist, filter_list)

    elif args.command == 'save':
        for i, u in zip(L, urllist):
            try:
                plist = get_filtered_problems(c.get_problems_list(u), filter_list)
            except Exception:
                print(sys.exc_info()[2].tb_lineno, sys.exc_info()[1])
                continue

            if args.verbose:
                print('-----------8<---Problems List Begin---8<------------')
                print(plist)
                print('-----------8<---Problems List End-----8<------------')

            w.save_problems(c, plist, i, args.language)

    elif args.command == 'save_sub':
        c.login()
        print('The process may take a while, depending on how many submissions you have.')
        print('Why not take a rest and have a cup of coffee :)')
        if not args.language:
            specified_langs = ALL_LANGUAGES

        def sub_exec(info):
            if args.verbose:
                print('Submissions on this page:')
                print(info)
            w.save_submissions(c, info)

        with crawler.ThreadPoolExecutor(max_workers=5) as pool:
            pool.map(sub_exec, c.get_submissions(specified_langs))
        print('All done!')

--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.4.1
requests==2.9.1
--------------------------------------------------------------------------------