├── .gitignore
├── requirements.txt
├── README.md
├── get_my_packtpub_book.py
├── cn163.py
├── kog_wallpaper.py
└── oreilly_free.py

/.gitignore:
--------------------------------------------------------------------------------
.idea
oreilly
result/
/venv/
*.log
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.8.0
certifi==2019.9.11
chardet==3.0.4
idna==2.8
lxml  # cn163.py and oreilly_free.py use BeautifulSoup's 'lxml' parser
requests==2.22.0
soupsieve==1.9.3
urllib3==1.25.3
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# just-a-spider

* cn163.py
Adding download links to the cloud drive one by one while watching US TV shows got tedious, so I wrote a script that parses the download links directly, then added recursion so it can pull back the links for every show, just in case.

*The results are saved as CSV files, which can easily be opened in Excel-like software to copy download links in bulk.*

* oreilly_free.py
I noticed that O'Reilly had made 22 ebooks free, so I wrote a script to pull them all down at once; it will be handy next time as well.

* get_my_packtpub_book.py
I have claimed too many of Packt's daily free books, so this script scrapes each book's details (title, ISBN, download links, etc.) so they can be searched locally.
> Prerequisite: download the page at https://www.packtpub.com/account/my-ebooks and parse the saved data locally.
--------------------------------------------------------------------------------
/get_my_packtpub_book.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
Created on 22/06/2017

@author: 'Jiezhi.G@gmail.com'

Reference:
"""
import csv
import os

from bs4 import BeautifulSoup as bs


def get_book(path):
    # open() does not expand '~', so expand the user path explicitly
    path = os.path.expanduser(path)
    # make sure the output directory exists before writing the CSV
    if not os.path.exists('result'):
        os.makedirs('result')
    with open(path, 'r') as f, open('result/pack_book.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)

        soup = bs(f, 'html.parser')
        books = soup.find_all('div', {'class': 'product-line unseen'})
        for book in books:
            print('----------------------------')
            title = book['title']
            nid = book['nid']

            paper_book = book.find('div', {'type': 'book'})
            if paper_book:
                isbn = paper_book['isbn']
                paper_nid = paper_book['nid']
                source_file = 'https://www.packtpub.com/code_download/%s' % paper_nid
            else:
                isbn = ''
                paper_nid = ''
                source_file = ''
            pdf_path = 'https://www.packtpub.com/ebook_download/%s/pdf' % nid
            mobi_path = 'https://www.packtpub.com/ebook_download/%s/mobi' % nid
            epub_path = 'https://www.packtpub.com/ebook_download/%s/epub' % nid
            print(title, nid, isbn)
            csvwriter.writerow([title, nid, paper_nid, isbn, pdf_path, mobi_path, epub_path, source_file])


if __name__ == '__main__':
    get_book('~/tmp/packt.html')
--------------------------------------------------------------------------------
/cn163.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Created on 5/8/16

@author: 'Jiezhi.G@gmail.com'


To get the latest code, please visit my github: https://github.com/Jiezhi/just-a-spider

Reference:
"""
import requests
from bs4 import BeautifulSoup
import csv
import time
import os.path


def get_content_from_url(url):
    # the second-to-last path segment is the archive number, e.g. 1316 in /archives/1316/
    number = url.split('/')[-2]
    print(url, number)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    title = soup.title.get_text().split('|')[0].strip()
    file_title = os.path.join('result', number + title.split('/')[0].strip() + '.csv')
    print('parsing %s' % title)

    # skip pages whose result file has already been written
    if os.path.exists(file_title):
        print('%s already parsed' % file_title)
    else:
        # make sure the output directory exists before writing the CSV
        if not os.path.exists('result'):
            os.makedirs('result')
        with open(file_title, 'w') as csvfile:
            csvwriter = csv.writer(csvfile)
            for link in soup.find('div', {'id': 'entry'}).findAll('a'):
                item_name = link.get_text().strip()
                item_value = link.get('href').strip()
                # exclude irrelevant urls that just point back to cn163.net pages
                if item_value.startswith('http://cn163.net'):
                    continue
                csvwriter.writerow([item_name, item_value])
    try:
        next_url = soup.find('a', {'rel': 'next'}).get('href').strip()
    except AttributeError:
        print('it seems there are no more urls to get')
        exit(0)
    time.sleep(5)
    # the recursive crawl of the next page is currently disabled; uncomment to follow it
    # get_content_from_url(next_url)


if __name__ == '__main__':
    # TODO get list from this page: http://cn163.net/archives/
    # first_url = 'http://cn163.net/archives/58/'
    first_url = 'http://cn163.net/archives/1316/'
    get_content_from_url(first_url)
--------------------------------------------------------------------------------
/kog_wallpaper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Created on 11/07/2017

@author: 'Jiezhi.G@gmail.com'

Reference:
"""
import json
import os
import time
from urllib import parse

import requests

url = 'http://pvp.qq.com/web201605/wallpaper.shtml'


def get_kog_wallpaper(path):
    headers = {
        'Referer': 'http://pvp.qq.com/web201605/wallpaper.shtml',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
        'DNT': '1',
    }

    params = (
        ('activityId', '2735'),
        ('sVerifyCode', 'ABCD'),
        ('sDataType', 'JSON'),
        ('iListNum', '200'),
        ('totalpage', '0'),
        ('page', '0'),
        ('iOrder', '0'),
        ('iSortNumClose', '1'),
        ('jsoncallback', 'jQuery17106024178839309091_1568785080871'),
        ('iAMSActivityId', '51991'),
        ('_everyRead', 'true'),
        ('iTypeId', '2'),
        ('iFlowId', '267733'),
        ('iActId', '2735'),
        ('iModuleId', '2735'),
        ('_', int(time.time() * 1000)),
    )

    response = requests.get('http://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi',
                            headers=headers,
                            params=params, verify=False)
    ret = response.text
    # strip the JSONP callback wrapper so only the JSON object remains
    ret = ret[ret.index('{'):ret.rindex('}') + 1]
    # print(ret)
    data = json.loads(ret)
    heros = data['List']
    if not os.path.exists(path):
        os.makedirs(path)
    for hero in heros:
        file_path = os.path.join(path, '{0}.jpg'.format(parse.unquote(hero['sProdName'])))
        if os.path.exists(file_path):
            continue
        with open(file_path, 'wb') as image_file:
            image_url = parse.unquote(hero['sProdImgNo_6'])
            # swap the size segment so the full-size image is fetched instead of the thumbnail
            image_url = image_url.replace("/200", "/0")
            print(image_url)
            image_file.write(requests.get(image_url).content)


if __name__ == '__main__':
    file_dir = 'result/kog/'
    get_kog_wallpaper(file_dir)
--------------------------------------------------------------------------------
/oreilly_free.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Created on 5/21/16

@author: 'Jiezhi.G@gmail.com'

This script parses and downloads free books from O'Reilly (www.oreilly.com/programming/free/)

To get the latest code, please visit my github: https://github.com/Jiezhi/just-a-spider

Reference:
"""
import os
import re
import threading

import requests
from bs4 import BeautifulSoup


def get_keyword(url):
    """
    Return the catalog and book name extracted from a free-book url
    """
    # TODO error handler
    reg = r"http://www\.oreilly\.com/([^/]+)/free/([^.]+)\.csp.*"
    m = re.match(reg, url)
    return m.groups()


def download_file(url):
    """
    Just download a small file by url
    This code snippet comes from http://stackoverflow.com/a/16696317/5425709
    :param url: The file url
    :return: The downloaded file name
    """
    local_filename = url.split('/')[-1]
    dir_name = 'oreilly' + os.path.sep + url.split('/')[-4]
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    local_filename = os.path.join(dir_name, local_filename)
    if os.path.exists(local_filename):
        print('file already downloaded: ', local_filename)
        return local_filename
    print('downloading ', url)
    # NOTE the stream=True parameter
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                # f.flush() commented by recommendation from J.F.Sebastian
    return local_filename


def get_free_book(content, file_format='pdf'):
    """
    Parse free book information from html content
    :param content: the content of the O'Reilly free book web page
    :param file_format: epub, mobi or pdf
    :return:
    """
    soup = BeautifulSoup(content, 'lxml')
    # books = soup.find_all('div', {'class': 'product-row cover-showcase'})
    # TODO handle error
    books = soup.find_all('a', {'data-toggle': 'popover'})
    print('Found %d book(s)...' % len(books))
    for book in books:
        href = book['href']
        if not href or 'player.oreilly.com' in href or '.csp' not in href:
            print("this page will be ignored: ", href)
            continue
        try:
            catalog, book_name = get_keyword(href)
            book_url = 'http://www.oreilly.com/%s/free/files/%s.%s' % (catalog, book_name, file_format)
            t = threading.Thread(target=download_file, args=(book_url,))
            t.start()
        except Exception as e:
            print("Downloading from {} failed".format(href), e)


if __name__ == '__main__':
    free_oreilly = ['http://www.oreilly.com/programming/free/',
                    'http://www.oreilly.com/web-platform/free/',
                    'http://www.oreilly.com/security/free/',
                    'http://www.oreilly.com/business/free/',
                    'http://www.oreilly.com/data/free/',
                    'http://www.oreilly.com/iot/free/',
                    'http://www.oreilly.com/design/free/',
                    'http://www.oreilly.com/webops-perf/free/',
                    ]
    for free in free_oreilly:
        html = requests.get(free)
        get_free_book(html.content)
--------------------------------------------------------------------------------
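The README notes that get_my_packtpub_book.py exists so the scraped book list can be searched locally, but the repository itself stops at writing result/pack_book.csv. A minimal sketch of such a local search is shown below; the search_books helper, the script name, and the example keyword are illustrative assumptions and not part of the repository, while the column order follows the csvwriter.writerow call in get_my_packtpub_book.py.

#!/usr/bin/env python
"""Hypothetical local-search helper for the CSV written by get_my_packtpub_book.py."""
import csv
import sys


def search_books(csv_path, keyword):
    """Yield (title, isbn, pdf_path) for rows whose title contains the keyword."""
    with open(csv_path, 'r') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                continue
            # columns: title, nid, paper_nid, isbn, pdf_path, mobi_path, epub_path, source_file
            title, isbn, pdf_path = row[0], row[3], row[4]
            if keyword.lower() in title.lower():
                yield title, isbn, pdf_path


if __name__ == '__main__':
    keyword = sys.argv[1] if len(sys.argv) > 1 else 'Python'
    for title, isbn, pdf_path in search_books('result/pack_book.csv', keyword):
        print(title, isbn, pdf_path)

Run it only after the scraper has produced result/pack_book.csv, e.g. `python search_pack_book.py docker` (the file name here is purely illustrative).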