├── .gitignore
├── requirements.txt
├── README.md
├── get_my_packtpub_book.py
├── cn163.py
├── kog_wallpaper.py
└── oreilly_free.py

/.gitignore:
--------------------------------------------------------------------------------
.idea
oreilly
result/
/venv/
*.log
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.8.0
certifi==2019.9.11
chardet==3.0.4
idna==2.8
lxml  # cn163.py and oreilly_free.py use BeautifulSoup's 'lxml' parser
requests==2.22.0
soupsieve==1.9.3
urllib3==1.25.3
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# just-a-spider

* cn163.py
Adding download links to the cloud drive one by one while watching US TV shows got tedious, so I wrote a script that parses the download links directly, then added recursion so it can pull back the links for every show, just in case.

*The results are saved as CSV files, which can easily be opened in Excel-like software to copy download links in bulk.*

* oreilly_free.py
I noticed that O'Reilly had made 22 ebooks free, so I wrote a script to pull them all down at once; it will be handy next time as well.

* get_my_packtpub_book.py
I have claimed too many of Packt's daily free books, so this script scrapes each book's details (title, ISBN, download links, etc.) so they can be searched locally.
> Prerequisite: download the page at https://www.packtpub.com/account/my-ebooks and parse the saved data locally.
--------------------------------------------------------------------------------
/get_my_packtpub_book.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
Created on 22/06/2017

@author: 'Jiezhi.G@gmail.com'

Reference:
"""
import csv
import os

from bs4 import BeautifulSoup as bs


def get_book(path):
    # open() does not expand '~', so expand the user path explicitly
    path = os.path.expanduser(path)
    # make sure the output directory exists before writing the CSV
    if not os.path.exists('result'):
        os.makedirs('result')
    with open(path, 'r') as f, open('result/pack_book.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)

        soup = bs(f, 'html.parser')
        books = soup.find_all('div', {'class': 'product-line unseen'})
        for book in books:
            print('----------------------------')
            title = book['title']
            nid = book['nid']

            paper_book = book.find('div', {'type': 'book'})
            if paper_book:
                isbn = paper_book['isbn']
                paper_nid = paper_book['nid']
                source_file = 'https://www.packtpub.com/code_download/%s' % paper_nid
            else:
                isbn = ''
                paper_nid = ''
                source_file = ''
            pdf_path = 'https://www.packtpub.com/ebook_download/%s/pdf' % nid
            mobi_path = 'https://www.packtpub.com/ebook_download/%s/mobi' % nid
            epub_path = 'https://www.packtpub.com/ebook_download/%s/epub' % nid
            print(title, nid, isbn)
            csvwriter.writerow([title, nid, paper_nid, isbn, pdf_path, mobi_path, epub_path, source_file])


if __name__ == '__main__':
    get_book('~/tmp/packt.html')
--------------------------------------------------------------------------------
/cn163.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Created on 5/8/16

@author: 'Jiezhi.G@gmail.com'


To get the latest code, please visit my github: https://github.com/Jiezhi/just-a-spider

Reference:
"""
import requests
from bs4 import BeautifulSoup
import csv
import time
import os.path


def get_content_from_url(url):
    # the second-to-last path segment is the archive number, e.g. 1316 in /archives/1316/
    number = url.split('/')[-2]
    print(url, number)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    title = soup.title.get_text().split('|')[0].strip()
    file_title = os.path.join('result', number + title.split('/')[0].strip() + '.csv')
    print('parsing %s' % title)

    # skip pages whose result file has already been written
    if os.path.exists(file_title):
        print('%s already parsed' % file_title)
    else:
        # make sure the output directory exists before writing the CSV
        if not os.path.exists('result'):
            os.makedirs('result')
        with open(file_title, 'w') as csvfile:
            csvwriter = csv.writer(csvfile)
            for link in soup.find('div', {'id': 'entry'}).findAll('a'):
                item_name = link.get_text().strip()
                item_value = link.get('href').strip()
                # exclude irrelevant urls that just point back to cn163.net pages
                if item_value.startswith('http://cn163.net'):
                    continue
                csvwriter.writerow([item_name, item_value])
    try:
        next_url = soup.find('a', {'rel': 'next'}).get('href').strip()
    except AttributeError:
        print('it seems there are no more urls to get')
        exit(0)
    time.sleep(5)
    # the recursive crawl of the next page is currently disabled; uncomment to follow it
    # get_content_from_url(next_url)


if __name__ == '__main__':
    # TODO get list from this page: http://cn163.net/archives/
    # first_url = 'http://cn163.net/archives/58/'
    first_url = 'http://cn163.net/archives/1316/'
    get_content_from_url(first_url)
--------------------------------------------------------------------------------
/kog_wallpaper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Created on 11/07/2017

@author: 'Jiezhi.G@gmail.com'

Reference:
"""
import json
import os
import time
from urllib import parse

import requests

url = 'http://pvp.qq.com/web201605/wallpaper.shtml'


def get_kog_wallpaper(path):
    headers = {
        'Referer': 'http://pvp.qq.com/web201605/wallpaper.shtml',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
        'DNT': '1',
    }

    params = (
        ('activityId', '2735'),
        ('sVerifyCode', 'ABCD'),
        ('sDataType', 'JSON'),
        ('iListNum', '200'),
        ('totalpage', '0'),
        ('page', '0'),
        ('iOrder', '0'),
        ('iSortNumClose', '1'),
        ('jsoncallback', 'jQuery17106024178839309091_1568785080871'),
        ('iAMSActivityId', '51991'),
        ('_everyRead', 'true'),
        ('iTypeId', '2'),
        ('iFlowId', '267733'),
        ('iActId', '2735'),
        ('iModuleId', '2735'),
        ('_', int(time.time() * 1000)),
    )

    response = requests.get('http://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi',
                            headers=headers,
                            params=params, verify=False)
    ret = response.text
    # strip the JSONP callback wrapper so only the JSON object remains
    ret = ret[ret.index('{'):ret.rindex('}') + 1]
    # print(ret)
    data = json.loads(ret)
    heros = data['List']
    if not os.path.exists(path):
        os.makedirs(path)
    for hero in heros:
        file_path = os.path.join(path, '{0}.jpg'.format(parse.unquote(hero['sProdName'])))
        if os.path.exists(file_path):
            continue
        with open(file_path, 'wb') as image_file:
            image_url = parse.unquote(hero['sProdImgNo_6'])
            # swap the size segment so the full-size image is fetched instead of the thumbnail
            image_url = image_url.replace("/200", "/0")
            print(image_url)
            image_file.write(requests.get(image_url).content)


if __name__ == '__main__':
    file_dir = 'result/kog/'
    get_kog_wallpaper(file_dir)
--------------------------------------------------------------------------------
/oreilly_free.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Created on 5/21/16

@author: 'Jiezhi.G@gmail.com'

This script parses and downloads free books from O'Reilly (www.oreilly.com/programming/free/)

To get the latest code, please visit my github: https://github.com/Jiezhi/just-a-spider

Reference:
"""
import os
import re
import threading

import requests
from bs4 import BeautifulSoup


def get_keyword(url):
    """
    Return the catalog and book name extracted from a free-book url
    """
    # TODO error handler
    reg = r"http://www\.oreilly\.com/([^/]+)/free/([^.]+)\.csp.*"
    m = re.match(reg, url)
    return m.groups()


def download_file(url):
    """
    Just download a small file by url
    This code snippet comes from http://stackoverflow.com/a/16696317/5425709
    :param url: The file url
    :return: The downloaded file name
    """
    local_filename = url.split('/')[-1]
    dir_name = 'oreilly' + os.path.sep + url.split('/')[-4]
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    local_filename = os.path.join(dir_name, local_filename)
    if os.path.exists(local_filename):
        print('file already downloaded: ', local_filename)
        return local_filename
    print('downloading ', url)
    # NOTE the stream=True parameter
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                # f.flush() commented by recommendation from J.F.Sebastian
    return local_filename


def get_free_book(content, file_format='pdf'):
    """
    Parse free book information from html content
    :param content: the content of the O'Reilly free book web page
    :param file_format: epub, mobi or pdf
    :return:
    """
    soup = BeautifulSoup(content, 'lxml')
    # books = soup.find_all('div', {'class': 'product-row cover-showcase'})
    # TODO handle error
    books = soup.find_all('a', {'data-toggle': 'popover'})
    print('Found %d book(s)...' % len(books))
    for book in books:
        href = book['href']
        if not href or 'player.oreilly.com' in href or '.csp' not in href:
            print("this page will be ignored: ", href)
            continue
        try:
            catalog, book_name = get_keyword(href)
            book_url = 'http://www.oreilly.com/%s/free/files/%s.%s' % (catalog, book_name, file_format)
            t = threading.Thread(target=download_file, args=(book_url,))
            t.start()
        except Exception as e:
            print("Downloading from {} failed".format(href), e)


if __name__ == '__main__':
    free_oreilly = ['http://www.oreilly.com/programming/free/',
                    'http://www.oreilly.com/web-platform/free/',
                    'http://www.oreilly.com/security/free/',
                    'http://www.oreilly.com/business/free/',
                    'http://www.oreilly.com/data/free/',
                    'http://www.oreilly.com/iot/free/',
                    'http://www.oreilly.com/design/free/',
                    'http://www.oreilly.com/webops-perf/free/',
                    ]
    for free in free_oreilly:
        html = requests.get(free)
        get_free_book(html.content)
--------------------------------------------------------------------------------
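The README notes that get_my_packtpub_book.py exists so the scraped book list can be searched locally, but the repository itself stops at writing result/pack_book.csv. A minimal sketch of such a local search is shown below; the search_books helper, the script name, and the example keyword are illustrative assumptions and not part of the repository, while the column order follows the csvwriter.writerow call in get_my_packtpub_book.py.

#!/usr/bin/env python
"""Hypothetical local-search helper for the CSV written by get_my_packtpub_book.py."""
import csv
import sys


def search_books(csv_path, keyword):
    """Yield (title, isbn, pdf_path) for rows whose title contains the keyword."""
    with open(csv_path, 'r') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                continue
            # columns: title, nid, paper_nid, isbn, pdf_path, mobi_path, epub_path, source_file
            title, isbn, pdf_path = row[0], row[3], row[4]
            if keyword.lower() in title.lower():
                yield title, isbn, pdf_path


if __name__ == '__main__':
    keyword = sys.argv[1] if len(sys.argv) > 1 else 'Python'
    for title, isbn, pdf_path in search_books('result/pack_book.csv', keyword):
        print(title, isbn, pdf_path)

Run it only after the scraper has produced result/pack_book.csv, e.g. `python search_pack_book.py docker` (the file name here is purely illustrative).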