├── LICENSE ├── NovelSpider.py ├── README.md ├── TorrentSpider_AsianNomosaic.py ├── TorrentSpider_AsianNomosaic_DB.py ├── TorrentSpider_AsianNomosaic_With_Json.py ├── TorrentSpider_EuropeAmerica_DB.py ├── TorrentSpider_JapaneseCavalry_DB.py ├── TorrentSpider_LatestCollection.py ├── config_template.json └── requirements.txt /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Menghui Xie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NovelSpider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: UTF-8 -*- 3 | import urllib 4 | import urllib.parse 5 | import urllib.request 6 | from urllib.request import urlopen 7 | import requests 8 | import threading 9 | from bs4 import BeautifulSoup 10 | import re 11 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 12 | 13 | 14 | # conversion encode 15 | # 转换编码 16 | def encodeConversion(req): 17 | if req.encoding == 'ISO-8859-1': 18 | encodings = requests.utils.get_encodings_from_content(req.text) 19 | if encodings: 20 | encoding = encodings[0] 21 | else: 22 | encoding = req.apparent_encoding 23 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 24 | encode_content = req.content.decode(encoding, 'replace') 25 | return encode_content 26 | else: 27 | return "" 28 | 29 | 30 | # 设置 http 请求的参数 31 | # set the parameters of the http request 32 | def set_query_parameter(url, param_name, param_value): 33 | """Given a URL, set or replace a query parameter and return the 34 | modified URL. 
35 | 36 | >>> set_query_parameter('http://example.com?foo=bar&biz=baz', 'foo', 'stuff') 37 | 'http://example.com?foo=stuff&biz=baz' 38 | 39 | """ 40 | scheme, netloc, path, query_string, fragment = urlsplit(url) 41 | query_params = parse_qs(query_string) 42 | 43 | query_params[param_name] = [param_value] 44 | new_query_string = urlencode(query_params, doseq=True) 45 | 46 | return urlunsplit((scheme, netloc, path, new_query_string, fragment)) 47 | 48 | 49 | # each novel post page 50 | # 每个小说帖子页面 51 | def praseHtml(req_url , headers, path): 52 | try: 53 | # 请求当前章节页面,params 为请求参数 54 | global isProxy 55 | global proxies 56 | if (isProxy == True): 57 | req = requests.get(req_url, params=headers, proxies=proxies) 58 | else: 59 | req = requests.get(req_url, params=headers) 60 | 61 | # 转换编码 62 | encode_content = encodeConversion(req) 63 | # soup转换 64 | soup = BeautifulSoup(encode_content, "html.parser") 65 | # 获取章节名称 66 | section_name = soup.select('#subject_tpc')[0].text 67 | # 获取章节文本 68 | section_text = soup.select('#read_tpc')[0].text 69 | result = section_name + '\n' + section_text 70 | result = result.replace('  ', '\n ') 71 | 72 | if result != "" and section_name != "": 73 | savePath = path + "\\" + str(section_name).replace(u'\0', u'').replace(u'\t', u'') + ".txt" 74 | f = open(savePath, "w", encoding='utf-8') 75 | f.write(result) 76 | except ValueError: 77 | print("ValueError: 传入无效的参数" + req_url) 78 | except IndexError: 79 | print("IndexError: 没有此网页索引:" + req_url) 80 | except IOError: 81 | print("IOError: 没有找到文件或读取文件失败" + req_url) 82 | except Exception as e: 83 | print("Exception: 存在异常" + e + req_url) 84 | else: 85 | # 内容写入文件成功 86 | print(req_url, end='') 87 | f.close() 88 | 89 | 90 | # novel post list page 91 | # 成人小说帖子列表页面 92 | def novelList(directory_url, fid, page , chapter_url, headers, path): 93 | # content_url = directory_url + '?fid='+str(fid)+"&page="+str(page) 94 | 95 | directory_url = set_query_parameter(directory_url, 'fid', fid) 96 | directory_url = set_query_parameter(directory_url, 'page', page) 97 | 98 | print(directory_url + ' start downloading') 99 | 100 | # 请求当前章节页面 params为请求参数 101 | global isProxy 102 | if(isProxy == True): 103 | req = requests.get(directory_url, params=headers, proxies=proxies) 104 | else: 105 | req = requests.get(directory_url, params=headers) 106 | 107 | # 转换编码 108 | encode_content = encodeConversion(req) 109 | 110 | # soup转换 111 | soup = BeautifulSoup(encode_content, "html.parser") 112 | # 获取章节名称 113 | section_list = soup.select('.tr3 h3 a') 114 | section_num = len(section_list) 115 | if section_num == 0: 116 | print("目录页面不正确,无法找到匹配项!") 117 | return -1 118 | for section in section_list: 119 | str_section = str(section) 120 | # php网页的匹配 121 | matchObj_act = re.match(r'(.*)a_ajax_(.*)">(.*?)', str_section, re.M | re.I) 122 | if matchObj_act: 123 | section_sub = matchObj_act.group(2) # 章节的标识 124 | section_name = matchObj_act.group(3) # 章节的名字 125 | global php_chapter_url # php的章节URL 126 | php_chapter_url = set_query_parameter(php_chapter_url, 'tid', section_sub) 127 | php_chapter_url = set_query_parameter(php_chapter_url, 'fpage', page) 128 | praseHtml(php_chapter_url, headers, path) 129 | prase_num = section_list.index(section) + 1 130 | print(' [ ' + "{:.1f}".format(prase_num / section_num * 100) + '% chapter completed ] ') 131 | else: 132 | # html网页的匹配 133 | matchObj = re.match(r'(.*)href="htm_data(.*)" id=(.*)>(.*?)', str_section, re.M | re.I) 134 | if matchObj: 135 | section_sub = matchObj.group(2) # 章节的标识 136 | section_name = matchObj.group(4) # 
章节的名字 137 | # 传入html章节的URL 138 | praseHtml(chapter_url + section_sub, headers, path) 139 | prase_num = section_list.index(section) + 1 140 | print(' [ ' + "{:.1f}".format(prase_num / section_num * 100) + '% chapter completed ] ') 141 | else: 142 | # 匹配失败 143 | print("No match: " + str_section) 144 | return -1 145 | 146 | 147 | # Crawl start page to end page, statistical results 148 | # 爬取开始页到结束页,统计结果 149 | def spider(directory_url, fid, page_start, page_end, chapter_url, novel_list_req_header, path): 150 | page_num = abs(page_end-page_start)+1 151 | for each_page in range(page_start, page_end): 152 | list_return = novelList(directory_url, fid, each_page, chapter_url, novel_list_req_header, path) 153 | if list_return == -1: 154 | break 155 | prase_num = abs(each_page - page_start)+1 156 | print(' [ ' + "{:.1f}".format(prase_num/page_num*100) + '% page completed ] ') 157 | 158 | 159 | if __name__ == "__main__": 160 | # request header 161 | # 请求头字典 162 | novel_list_req_header = { 163 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 164 | 'Accept-Encoding': 'gzip, deflate', 165 | 'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7', 166 | 'Cache-Control': 'no-cache', 167 | 'Connection': 'keep - alive', 168 | 'Cookie':'UM_distinctid=16574ce27ac246-04d3d1f1292635-9393265-1fa400-16574ce27aeff; aafaf_readlog'\ 169 | '=%2C1245721%2C; aafaf_ol_offset=35448165; CNZZDATA1261158850=1879378976-1535261549-%7C1535279419;'\ 170 | ' aafaf_lastpos=F17; aafaf_threadlog=%2C18%2C14%2C15%2C16%2C17%2C; aafaf_lastvisit=7839%09153528353'\ 171 | '2%09%2Fpw%2Fthread.php%3Ffid%3D17%26page%3D2', 172 | 'Host': 'w3.afulyu.pw', 173 | 'Pragma': 'no-cache', 174 | # 'Proxy-Connection': 'keep-alive', 175 | 'Referer': 'http://w3.afulyu.pw/pw/thread.php?fid=17&page=1', 176 | 'Upgrade-Insecure-Requests': '1', 177 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'\ 178 | ' Chrome/68.0.3440.106 Safari/537.36' 179 | } 180 | # proxy request header 181 | # 代理时的请求头字典 182 | proxt_novel_list_req_header = { 183 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 184 | 'Accept-Encoding': 'gzip, deflate', 185 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 186 | 'Cache-Control': 'no-cache', 187 | # 'Connection': 'keep - alive', 188 | 'Cookie': '__cfduid=d7e5c699ef4d6599ef01239424b0e6cd71547705542; aafaf_lastvisit=0%091547705542%09%2Fpw%2Findex.php%3F;' \ 189 | ' UM_distinctid=1685a707030539-0653970bbabd2b-46564b55-1fa400-1685a707031a0a; ' \ 190 | 'CNZZDATA1261158850=317005769-1547705297-%7C1547705297', 191 | 'Host': 'w3.jbzcjsj.pw', 192 | 'Pragma': 'no-cache', 193 | 'Proxy-Connection': 'keep-alive', 194 | 'Referer': 'http://w3.afulyu.pw/pw/thread.php?fid=17&page=1', 195 | 'Upgrade-Insecure-Requests': '1', 196 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 197 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 198 | } 199 | 200 | global php_chapter_url 201 | global isProxy 202 | global proxies 203 | directory_url = "http://w3.afulyu.pw/pw/thread.php" # 小说目录url 204 | html_chapter_url = 'http://w3.afulyu.pw/pw/htm_data' # 每篇小说的html页面 205 | php_chapter_url = 'http://w3.afulyu.pw/pw/read.php' # 每篇小说的php页面 206 | save_path = 'D:\\code\\Pycharm\\1024Spider\\novel' # 保存在本地的路径 207 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } # 代理信息 208 | fid = 17 # 网站帖子类型,17代表小说 209 | page_start = 1 # 小说目录开始页面 210 | 
page_end = 940 # 小说目录结束页面 211 | isProxy = False # 是否设置代理 212 | 213 | spider(directory_url, fid, page_start, page_end, html_chapter_url, proxt_novel_list_req_header, save_path) 214 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 1024 Spiders 2 | 3 | Some 1024 spiders, crawl xp1024 porn information and magnet-links and insert them into the database. 4 | 5 | * Python 3.6 6 | * MySQL 8.0 7 | 8 | ## Deploy 9 | 10 | ### Clone & Install 11 | 12 | ``` 13 | git clone git@github.com:huihut/1024Spiders.git 14 | cd 1024Spiders && pip install -r requirements.txt 15 | ``` 16 | 17 | ### Configure 18 | 19 | * No_Json 20 | 21 | Configure your database, request_header, save_path, etc. in the source code. 22 | 23 | * With_Json 24 | 25 | 1. `mv config_template.json config.json` ( [config_template.json](config_template.json) ) 26 | 2. Configure your database, request_header, save_path, etc. in the `config.json`. 27 | 28 | ### Run front-end process 29 | 30 | ``` 31 | python TorrentSpider_AsianNomosaic_DB.py 32 | ``` 33 | 34 | ### Run background process 35 | 36 | ``` 37 | nohup python -u TorrentSpider_AsianNomosaic_DB.py > TorrentSpider_AsianNomosaic_DB.log 2>&1 & 38 | ``` 39 | 40 | ## Database 41 | 42 | ```mysql 43 | mysql> show tables; 44 | +-------------------------+ 45 | | Tables_in_torrent | 46 | +-------------------------+ 47 | | AsianNomosaic | 48 | | AsianNomosaicPictures | 49 | | EuropeAmerica | 50 | | EuropeAmericaPictures | 51 | | JapaneseCavalry | 52 | | JapaneseCavalryPictures | 53 | +-------------------------+ 54 | 6 rows in set (0.01 sec) 55 | 56 | mysql> desc AsianNomosaic; 57 | +---------+-----------+------+-----+---------+----------------+ 58 | | Field | Type | Null | Key | Default | Extra | 59 | +---------+-----------+------+-----+---------+----------------+ 60 | | id | int(11) | NO | PRI | NULL | auto_increment | # porn id 61 | | data | char(10) | YES | | NULL | | # porn date 62 | | name | char(255) | NO | | NULL | | # porn name 63 | | summary | text | YES | | NULL | | # porn introduction 64 | | magnet | char(255) | NO | | NULL | | # porn magnet-link 65 | +---------+-----------+------+-----+---------+----------------+ 66 | 5 rows in set (0.00 sec) 67 | 68 | mysql> desc AsianNomosaicPictures; 69 | +-------+-----------+------+-----+---------+----------------+ 70 | | Field | Type | Null | Key | Default | Extra | 71 | +-------+-----------+------+-----+---------+----------------+ 72 | | id | int(11) | NO | PRI | NULL | auto_increment | # picture id 73 | | an_id | int(11) | NO | | NULL | | # porn id 74 | | name | char(255) | NO | | NULL | | # picture name 75 | +-------+-----------+------+-----+---------+----------------+ 76 | 3 rows in set (0.00 sec) 77 | ``` -------------------------------------------------------------------------------- /TorrentSpider_AsianNomosaic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import time 12 | import threading 13 | 14 | 15 | # 1024 http request header 16 | # 1024 网站请求头 17 | proxt_1024_req_header = { 18 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 19 
| 'Accept-Encoding': 'gzip, deflate', 20 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 21 | 'Cache-Control': 'max-age=0', 22 | # 'Connection': 'keep - alive', 23 | 'Cookie': '__cfduid=d4e99b476e7372dec9a44b67f533f37aa1548178386; aafaf_lastvisit=0%091548178386%' \ 24 | '09%2Fpw%2Fthread.php%3Ffid-5-page-5.html; aafaf_lastpos=F5; aafaf_threadlog=%2C5%2C; ' \ 25 | 'aafaf_ol_offset=32368318; UM_distinctid=168769f77ac958-0509e825886dfc-46564b55-1fa400-16876' \ 26 | '9f77ad1302; CNZZDATA1261158850=393281613-1548174901-%7C1548174901', 27 | 'Host': 'w3.jbzcjsj.pw', 28 | # 'Pragma': 'no-cache', 29 | 'Proxy-Connection': 'keep-alive', 30 | 'Referer': 'http://w3.jbzcjsj.pw/pw/thread-htm-fid-5-page-5.html', 31 | 'Upgrade-Insecure-Requests': '1', 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' \ 33 | ' Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 34 | } 35 | request_header = proxt_1024_req_header 36 | 37 | # magnet-link website http request header 38 | # 磁力链接网站网站请求头 39 | proxt_torrent_req_header = { 40 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 41 | 'Accept-Encoding': 'gzip, deflate', 42 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 43 | 'Cache-Control': 'no-cache', 44 | # 'Connection': 'keep - alive', 45 | 'Cookie': '__cfduid=d7f5104b5a516916674841b656d67dde31548178497; UM_distinctid=16876a1266dfb2-018bb685bad' \ 46 | '0ed-46564b55-1fa400-16876a1266e1d2; CNZZDATA1273152310=501204684-1548176963-http%253A%2' \ 47 | '52F%252Fw3.jbzcjsj.pw%252F%7C1548176963; _ga=GA1.2.1886522142.1548178499; _gid=GA1.2.16499' \ 48 | '32666.1548178499; _gat=1', 49 | 'Host': 'www1.downsx.com', 50 | 'Pragma': 'no-cache', 51 | 'Proxy-Connection': 'keep-alive', 52 | 'Referer': 'http://w3.jbzcjsj.pw/pw/html_data/5/1901/3863561.html', 53 | 'Upgrade-Insecure-Requests': '1', 54 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 55 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 56 | } 57 | torrent_request_header = proxt_torrent_req_header 58 | opener=urllib.request.build_opener() 59 | opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' \ 60 | ' (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116')] 61 | urllib.request.install_opener(opener) 62 | 63 | # proxy settings 64 | # 代理设置 65 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } 66 | proxies_header = proxies 67 | isProxy = False # 是否设置代理 68 | 69 | base_url = "http://w3.jbzcjsj.pw/pw/" # 基础url 70 | save_path = "D:/code/Pycharm/1024Spider/torrent_asian_nomosaic" # 存储图片路径 71 | fid = 5 # fid=5 表示亚洲无码 72 | page_start = 1 # 爬取的开始页 73 | page_end = 928 # 爬取的结束页 74 | thread_num = 1 # 线程数 75 | 76 | 77 | # conversion encode 78 | # 转换编码 79 | def Encode_Conversion(req): 80 | if req.encoding == 'ISO-8859-1': 81 | encodings = requests.utils.get_encodings_from_content(req.text) 82 | if encodings: 83 | encoding = encodings[0] 84 | else: 85 | encoding = req.apparent_encoding 86 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 87 | encode_content = req.content.decode(encoding, 'replace') 88 | return encode_content 89 | else: 90 | return "" 91 | 92 | 93 | # save [content] to [path] 94 | # 保存文本 95 | def Save_Text(id, path, content): 96 | try: 97 | f = open(path, "w", encoding='utf-8') 98 | f.write(content) 99 | except IOError: 100 | print("[" + str(id) + "] IOError: File open 
failed.") 101 | except Exception as e: 102 | print("Save_Text Exception: " + str(e)) 103 | else: 104 | # 内容写入文件成功 105 | print("[" + str(id) + "] Successfully save the file to " + path) 106 | f.close() 107 | 108 | 109 | # torrent and magnet-link page 110 | # 种子/磁力链接页面 111 | def Prase_Torrent(id, url, folder_path): 112 | try: 113 | if (isProxy == True): 114 | req = requests.get(url, params=torrent_request_header, proxies=proxies_header) 115 | else: 116 | req = requests.get(url, params=torrent_request_header) 117 | 118 | # soup转换 119 | soup = BeautifulSoup(req.content, "html.parser") 120 | 121 | torrent_content = soup.select('.uk-button ') 122 | torrent_content_num = len(torrent_content) 123 | if torrent_content_num == 0: 124 | print("[" + str(id) + "] No match torrent.") 125 | return '' 126 | for content in torrent_content: 127 | str_content = str(content) 128 | # 匹配磁力链接 129 | matchObj = re.search(r'magnet(.*?)"', str_content) 130 | if matchObj: 131 | magnet_link = 'magnet' + matchObj.group(1) 132 | return magnet_link 133 | else: 134 | # 匹配失败 135 | print("[" + str(id) + "] No match: " + str_content) 136 | return '' 137 | except Exception as e: 138 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 139 | 140 | 141 | # each post page 142 | # 每个帖子页面 143 | def Prase_Post(id, url, folder_name): 144 | try: 145 | if (isProxy == True): 146 | req = requests.get(url, params=request_header, proxies=proxies_header) 147 | else: 148 | req = requests.get(url, params=request_header) 149 | 150 | # 转换编码 151 | encode_content = Encode_Conversion(req) 152 | # soup转换 153 | soup = BeautifulSoup(encode_content, "html.parser") 154 | 155 | post_content = soup.select('div[id="read_tpc"]') 156 | post_content_num = len(post_content) 157 | if post_content_num == 0: 158 | print("[" + str(id) + "] No match post.") 159 | return 160 | 161 | # 创建保存的文件夹 162 | folder_path = save_path + '/' + folder_name 163 | folder = os.path.exists(folder_path) 164 | if not folder: 165 | os.makedirs(folder_path) 166 | print("[" + str(id) + "] Created folder " + folder_name) 167 | 168 | # 保存文本内容 169 | result = post_content[0].text 170 | magnet_link = '' 171 | for content in post_content: 172 | str_content = str(content) 173 | 174 | # 匹配种子 175 | matchObj = re.findall(r'href="(.*?)"', str_content) 176 | if matchObj: 177 | for obj in matchObj: 178 | magnet_link = Prase_Torrent(id, obj, folder_path) 179 | else: 180 | # 匹配失败 181 | print("[" + str(id) + "] No match: " + str_content) 182 | 183 | # 匹配图片 184 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 185 | if matchObj: 186 | for obj in matchObj: 187 | objTemp = obj 188 | strlist = objTemp.split('/') 189 | strlen = len(strlist) 190 | if strlen != 0: 191 | img_name = strlist[strlen - 1] 192 | try: 193 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 194 | except Exception as e: 195 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 196 | else: 197 | print("[" + str(id) + "] Successfully save the image to " + folder_path + '/' + img_name) 198 | else: 199 | # 匹配失败 200 | print("[" + str(id) + "] No match: " + str_content) 201 | # 保存到文件 202 | if magnet_link != '': 203 | result = result + '\n\n' + magnet_link 204 | Save_Text(id, folder_path + '/index.txt', result) 205 | except Exception as e: 206 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 207 | 208 | 209 | # post list page 210 | # 帖子列表页面 211 | def Post_list(id, page): 212 | try: 213 | post_url = base_url + 'thread-htm-fid-' + str(fid) + '-page-' + str(page) + '.html' 214 
| print('[' + str(id) + '] clicked: ' + post_url) 215 | 216 | if (isProxy == True): 217 | req = requests.get(post_url, params=request_header, proxies=proxies_header) 218 | else: 219 | req = requests.get(post_url, params=request_header) 220 | 221 | # 转换编码 222 | encode_content = Encode_Conversion(req) 223 | 224 | # soup转换 225 | soup = BeautifulSoup(encode_content, "html.parser") 226 | # 获取章节名称 227 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 228 | post_num = len(post_list) 229 | if post_num == 0: 230 | print("[" + str(id) + "] No match post_list.") 231 | return 232 | for post in post_list: 233 | str_post = str(post) 234 | # html网页的匹配 235 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 236 | if matchObj: 237 | post_url = matchObj.group(2) # URL 238 | post_name = matchObj.group(4) # 文件夹名 239 | if post_name != '': 240 | # 匹配每个帖子 241 | Prase_Post(id, base_url + post_url, 242 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 243 | u'').replace(u'*', 244 | u'')) 245 | else: 246 | # 匹配失败 247 | print("[" + str(id) + "] No match: " + str_post) 248 | except Exception as e: 249 | print("[" + str(id) + "] Post_list Exception." + str(e)) 250 | 251 | 252 | # multi-threaded, the parameter [id] is the thread id 253 | # 多线程,参数 [id] 为线程 id 254 | def Work_thread(id): 255 | try: 256 | if id <= page_end: 257 | prase_num = 0 258 | prase_more_one = 0 259 | page_num = abs(page_end - page_start) + 1 260 | if id <= int(page_num % thread_num): 261 | prase_more_one = 1 262 | page_num_each_thread = int(page_num / thread_num) + prase_more_one 263 | for each_page in range(page_start + id - 1, page_end + 1, thread_num): 264 | Post_list(id, each_page) 265 | prase_num += 1 266 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 267 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 268 | print('[' + str(id) + '] completed !!!!!') 269 | except Exception as e: 270 | print("[" + str(id) + "] Work_thread Exception." 
+ str(e)) 271 | 272 | 273 | if __name__ == "__main__": 274 | # single thread # 单线程 275 | # Work_thread(1) 276 | # multithreading # 多线程 277 | try: 278 | for i in range(1, thread_num + 1): 279 | _thread.start_new_thread(Work_thread, (i,)) 280 | except Exception as e: 281 | print("Start_new_thread Exception: " + str(e)) 282 | while 1: 283 | pass 284 | -------------------------------------------------------------------------------- /TorrentSpider_AsianNomosaic_DB.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import time 12 | import pymysql 13 | 14 | # 1024 http request header 15 | # 1024 网站请求头 16 | proxt_1024_req_header = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 18 | 'Accept-Encoding': 'gzip, deflate', 19 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 20 | 'Cache-Control': 'no-cache', 21 | # 'Connection': 'keep - alive', 22 | 'Cookie': '__cfduid=d7e5c699ef4d6599ef01239424b0e6cd71547705542; aafaf_lastvisit=0%09154' \ 23 | '7705542%09%2Fpw%2Findex.php%3F; UM_distinctid=1685a707030539-0653970bbabd2b-46564b55' \ 24 | '-1fa400-1685a707031a0a; CNZZDATA1261158850=317005769-1547705297-%7C1547705297', 25 | 'Host': 'w3.jbzcjsj.pw', 26 | 'Pragma': 'no-cache', 27 | 'Proxy-Connection': 'keep-alive', 28 | #'Referer': 'http://w3.afulyu.pw/pw/thread.php?fid=17&page=1', 29 | 'Upgrade-Insecure-Requests': '1', 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 31 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 32 | } 33 | request_header = proxt_1024_req_header 34 | 35 | # magnet-link website http request header 36 | # 磁力链接网站网站请求头 37 | proxt_torrent_req_header = { 38 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 39 | 'Accept-Encoding': 'gzip, deflate', 40 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 41 | 'Cache-Control': 'no-cache', 42 | # 'Connection': 'keep - alive', 43 | 'Cookie': '__cfduid=d062c450fc125c2a02de05db8586dc1941547731587; UM_distinctid=1685bfdd4' \ 44 | 'd4854-0edeecf536f3fc-46564b55-1fa400-1685bfdd4d515b4; CNZZDATA1273152310=651528679' \ 45 | '-1547731013-http%253A%252F%252Fw3.jbzcjsj.pw%252F%7C1547731013; _ga=GA1.2.845482462.' 
\ 46 | '1547731588; _gid=GA1.2.2026642011.1547731588', 47 | 'Host': 'www1.downsx.club', 48 | 'Pragma': 'no-cache', 49 | 'Proxy-Connection': 'keep-alive', 50 | 'Referer': 'http://w3.jbzcjsj.pw/pw/html_data/3/1901/3855151.html', 51 | 'Upgrade-Insecure-Requests': '1', 52 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 53 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 54 | } 55 | torrent_request_header = proxt_torrent_req_header 56 | opener=urllib.request.build_opener() 57 | opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' \ 58 | ' (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116')] 59 | urllib.request.install_opener(opener) 60 | 61 | # proxy settings 62 | # 代理设置 63 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } 64 | proxies_header = proxies 65 | isProxy = False # whether to set proxy # 是否设置代理 66 | 67 | base_url = "http://w3.jbzcjsj.pw/pw/" # xp1024's base url # xp1024的基本链接 68 | save_path = "D:/code/Pycharm/1024Spider/torrent_asian_nomosaic" # pictures save path # 图片保存路径 69 | fid = 5 # Fid=5 means Asian porn without mosaics. # Fid=5 表示亚洲无码 70 | page_start = 1 # crawl start page # 爬取的开始页 71 | page_end = 913 # crawl end page # 爬取的结束页 72 | thread_num = 1 # number of threads # 线程数 73 | mySQLCommand = object 74 | 75 | 76 | # Used to execute database commands 77 | # 用于执行数据库命令 78 | class MySQLCommand(object): 79 | # init # 类的初始化 80 | def __init__(self): 81 | self.host = '' # host ip or domain name,local is [127.0.0.1] # 数据库所在的主机 82 | self.port = 3306 # database port # 数据库端口号 83 | self.user = '' # database username # 数据库用户名 84 | self.password = "" # database password # 数据库密码 85 | self.db = "" # database name # 数据库名 86 | self.table_torrent = "AsianNomosaic" # porn information table # 影片信息表 87 | self.table_pictures = "AsianNomosaicPictures" # pictures table # 图片表 88 | 89 | # connect to database 90 | # 连接数据库 91 | def connect_mysql(self): 92 | try: 93 | self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, 94 | passwd=self.password, db=self.db, charset='utf8') 95 | self.cursor = self.conn.cursor() 96 | return 0 97 | except Exception as e: 98 | print('[error] connect mysql error.' 
+ str(e)) 99 | return -1 100 | 101 | # query database table 102 | # 查询表 103 | def query_table(self, tablename): 104 | sql = "SELECT * FROM " + tablename 105 | try: 106 | self.cursor.execute(sql) 107 | row = self.cursor.fetchone() 108 | print(row) 109 | print(self.cursor.rowcount) 110 | except Exception as e: 111 | print("Failed to " + sql + str(e)) 112 | 113 | # query porn information table 114 | # 查询影片信息表 115 | def query_table_torrent(self): 116 | self.query_table(self.table_torrent) 117 | 118 | # query pictures table 119 | # 查询图片表 120 | def query_table_pictures(self): 121 | self.query_table(self.table_pictures) 122 | 123 | # insert into [table_torrent] and return the primary key of the item just inserted 124 | # 插入到 [table_torrent] 返回刚插入的项的主键 125 | def insert_table_torrent(self, data='', name='', summary='', magnet=''): 126 | sql = "INSERT INTO " + self.table_torrent + " (data, name, summary, magnet) VALUES ('" + data + "', '" + \ 127 | name + "', '" + summary + "', '" + magnet + "')" 128 | try: 129 | self.cursor.execute(sql) 130 | self.conn.commit() 131 | print("Successfully insert " + name + " into " + self.table_torrent) 132 | except Exception as e: 133 | print("Failed to " + sql + str(e)) 134 | try: 135 | an_id = -1 136 | an_id = self.cursor.lastrowid 137 | if an_id != -1: 138 | return an_id 139 | except Exception as e: 140 | print("Failed to return last_insert_id." + str(e)) 141 | 142 | # insert into [table_pictures] 143 | # 插入到 table_pictures 144 | def insert_table_pictures(self, an_id='', name=''): 145 | sql = "INSERT INTO " + self.table_pictures + " (an_id, name) VALUES ('" + str(an_id) + "', '" + name + "')" 146 | try: 147 | self.cursor.execute(sql) 148 | self.conn.commit() 149 | print("Successfully insert " + name + " into " + self.table_pictures) 150 | except Exception as e: 151 | print("Failed to " + sql + str(e)) 152 | 153 | # close database 154 | # 关闭数据库连接 155 | def close_mysql(self): 156 | try: 157 | self.cursor.close() 158 | self.conn.close() 159 | except Exception as e: 160 | print("Failed to close mysql." 
+ str(e)) 161 | 162 | 163 | # conversion encode 164 | # 转换编码 165 | def Encode_Conversion(req): 166 | if req.encoding == 'ISO-8859-1': 167 | encodings = requests.utils.get_encodings_from_content(req.text) 168 | if encodings: 169 | encoding = encodings[0] 170 | else: 171 | encoding = req.apparent_encoding 172 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 173 | encode_content = req.content.decode(encoding, 'replace') 174 | return encode_content 175 | else: 176 | return "" 177 | 178 | 179 | # save [content] to [path] 180 | # 保存文本 181 | def Save_Text(id, path, content): 182 | try: 183 | f = open(path, "w", encoding='utf-8') 184 | f.write(content) 185 | except IOError: 186 | print("[" + str(id) + "] IOError: File open failed.") 187 | except Exception as e: 188 | print("Save_Text Exception: " + str(e)) 189 | else: 190 | print("[" + str(id) + "] Successfully save the file to " + path) 191 | f.close() 192 | 193 | 194 | # torrent and magnet-link page 195 | # 种子/磁力链接页面 196 | def Prase_Torrent(id, url): 197 | try: 198 | if (isProxy == True): 199 | req = requests.get(url, params=torrent_request_header, proxies=proxies_header) 200 | else: 201 | req = requests.get(url, params=torrent_request_header) 202 | 203 | soup = BeautifulSoup(req.content, "html.parser") 204 | torrent_content = soup.select('.uk-button ') 205 | torrent_content_num = len(torrent_content) 206 | if torrent_content_num == 0: 207 | print("[" + str(id) + "] No match torrent.") 208 | return '' 209 | for content in torrent_content: 210 | str_content = str(content) 211 | # matching magnet-link 212 | # 匹配磁力链接 213 | matchObj = re.search(r'magnet(.*?)"', str_content) 214 | if matchObj: 215 | magnet_link = 'magnet' + matchObj.group(1) 216 | return magnet_link 217 | else: 218 | # matching magnet-link failed 219 | # 匹配磁力链接失败 220 | print("[" + str(id) + "] No match: " + str_content) 221 | return '' 222 | except Exception as e: 223 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 224 | 225 | 226 | # each post page 227 | # 每个帖子页面 228 | def Prase_Post(id, url, folder_name): 229 | try: 230 | # match data 231 | # 匹配日期 232 | data = '' 233 | matchObj = re.search(r'\[(.*?)\]', folder_name, re.M | re.I) 234 | if matchObj: 235 | data = matchObj.group(1) 236 | else: 237 | # match data failed 238 | # 匹配日期失败 239 | print("[" + str(id) + "] No match: " + folder_name) 240 | 241 | if (isProxy == True): 242 | req = requests.get(url, params=request_header, proxies=proxies_header) 243 | else: 244 | req = requests.get(url, params=request_header) 245 | 246 | encode_content = Encode_Conversion(req) 247 | soup = BeautifulSoup(encode_content, "html.parser") 248 | post_content = soup.select('div[id="read_tpc"]') 249 | post_content_num = len(post_content) 250 | if post_content_num == 0: 251 | print("[" + str(id) + "] No match post.") 252 | return 253 | 254 | # save text content 255 | # 保存文本内容 256 | summary = post_content[0].text 257 | str_content = str(post_content[0]) 258 | 259 | # match magnet-link page 260 | # 匹配磁力 261 | magnet_link = '' 262 | matchObj = re.findall(r'href="(.*?)"', str_content) 263 | if matchObj: 264 | for obj in matchObj: 265 | magnet_link = Prase_Torrent(id, obj) 266 | else: 267 | # match magnet-link page failed 268 | # 匹配磁力失败 269 | print("[" + str(id) + "] No match: " + str_content) 270 | 271 | # insert the [insert_table_torrent] table of the database 272 | # 插入到 [insert_table_torrent] 表 273 | an_id = -1 274 | if folder_name != '' and magnet_link != '': 275 | an_id = 
mySQLCommand.insert_table_torrent(data=data, name=folder_name, summary=summary, magnet=magnet_link) 276 | 277 | if an_id != -1: 278 | # create a folder to save the picture 279 | # 创建保存图片的文件夹 280 | folder_path = save_path + '/' + str(an_id) 281 | folder = os.path.exists(folder_path) 282 | if not folder: 283 | os.makedirs(folder_path) 284 | print("[" + str(id) + "] Created folder " + str(an_id)) 285 | 286 | # match pictures 287 | # 匹配图片 288 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 289 | if matchObj: 290 | for obj in matchObj: 291 | objTemp = obj 292 | strlist = objTemp.split('/') 293 | strlen = len(strlist) 294 | if strlen != 0: 295 | img_name = strlist[strlen - 1] 296 | try: 297 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 298 | except Exception as e: 299 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 300 | else: 301 | print("[" + str(id) + "] Successfully save the image to " + folder_path + '/' + img_name) 302 | # insert the [insert_table_pictures] table of the database 303 | # 插入 [insert_table_pictures] 表 304 | mySQLCommand.insert_table_pictures(an_id=an_id, name=img_name) 305 | else: 306 | # 匹配图片失败 307 | # match pictures failed 308 | print("[" + str(id) + "] No match: " + str_content) 309 | except Exception as e: 310 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 311 | 312 | 313 | # post list page 314 | # 帖子列表页面 315 | def Post_list(id, page): 316 | try: 317 | post_url = base_url + 'thread-htm-fid-' + str(fid) + '-page-' + str(page) + '.html' 318 | print('[' + str(id) + '] clicked: ' + post_url) 319 | 320 | if (isProxy == True): 321 | req = requests.get(post_url, params=request_header, proxies=proxies_header) 322 | else: 323 | req = requests.get(post_url, params=request_header) 324 | 325 | encode_content = Encode_Conversion(req) 326 | soup = BeautifulSoup(encode_content, "html.parser") 327 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 328 | post_num = len(post_list) 329 | if post_num == 0: 330 | print("[" + str(id) + "] No match post_list.") 331 | return 332 | for post in post_list: 333 | str_post = str(post) 334 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 335 | if matchObj: 336 | post_url = matchObj.group(2) 337 | post_name = matchObj.group(4) 338 | if post_name != '': 339 | # match each post page 340 | # 匹配每个帖子 341 | Prase_Post(id, base_url + post_url, 342 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 343 | u'').replace(u'*', 344 | u'')) 345 | else: 346 | # match failed 347 | # 匹配失败 348 | print("[" + str(id) + "] No match: " + str_post) 349 | except Exception as e: 350 | print("[" + str(id) + "] Post_list Exception." + str(e)) 351 | 352 | 353 | # multi-threaded, the parameter [id] is the thread id 354 | # 多线程,参数 [id] 为线程 id 355 | def Work_thread(id): 356 | try: 357 | if id <= page_end: 358 | prase_num = 0 359 | prase_more_one = 0 360 | page_num = abs(page_end - page_start) + 1 361 | if id <= int(page_num % thread_num): 362 | prase_more_one = 1 363 | page_num_each_thread = int(page_num / thread_num) + prase_more_one 364 | for each_page in range(page_start + id - 1, page_end + 1, thread_num): 365 | Post_list(id, each_page) 366 | prase_num += 1 367 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 368 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 369 | print('[' + str(id) + '] completed !!!!!') 370 | except Exception as e: 371 | print("[" + str(id) + "] Work_thread Exception." 
+ str(e)) 372 | 373 | 374 | if __name__ == "__main__": 375 | # database command object 376 | # 数据库命令对象 377 | mySQLCommand = MySQLCommand() 378 | if mySQLCommand.connect_mysql() != -1: 379 | # single thread # 单线程 380 | # Work_thread(1) 381 | # multithreading # 多线程 382 | try: 383 | for i in range(1, thread_num + 1): 384 | _thread.start_new_thread(Work_thread, (i,)) 385 | except Exception as e: 386 | print("Start_new_thread Exception: " + str(e)) 387 | while 1: 388 | pass 389 | mySQLCommand.close_mysql() 390 | -------------------------------------------------------------------------------- /TorrentSpider_AsianNomosaic_With_Json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import json 12 | 13 | 14 | config = object 15 | 16 | 17 | # Read configuration from [config.json] 18 | # 从 [config.json] 读取配置信息 19 | class JsonCommand(object): 20 | def __init__(self): 21 | try: 22 | with open('config.json', encoding='utf-8') as config_file: 23 | _config = json.loads(config_file.read()) 24 | self.request_header = _config["_1024_req_header"] 25 | self.torrent_request_header = _config["_torrent_req_header"] 26 | self.proxies = _config["proxies"] 27 | self.is_proxy = bool(_config["is_proxy"]) 28 | self.fid = int(_config["fid"]) 29 | self.base_url = _config["base_url"] 30 | self.save_path = _config["save_path"] 31 | self.page_start = int(_config["page_start"]) 32 | self.page_end = int(_config["page_end"]) 33 | self.thread_num = int(_config["thread_num"]) 34 | self.user_agent = _config["_1024_req_header"]["User-Agent"] 35 | config_file.close() 36 | except Exception as e: 37 | print("JsonCommand Exception: " + str(e)) 38 | 39 | 40 | # conversion encode 41 | # 转换编码 42 | def Encode_Conversion(req): 43 | if req.encoding == 'ISO-8859-1': 44 | encodings = requests.utils.get_encodings_from_content(req.text) 45 | if encodings: 46 | encoding = encodings[0] 47 | else: 48 | encoding = req.apparent_encoding 49 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 50 | encode_content = req.content.decode(encoding, 'replace') 51 | return encode_content 52 | else: 53 | return "" 54 | 55 | 56 | # save [content] to [path] 57 | # 保存文本 58 | def Save_Text(id, path, content): 59 | try: 60 | f = open(path, "w", encoding='utf-8') 61 | f.write(content) 62 | except IOError: 63 | print("[" + str(id) + "] IOError: File open failed.") 64 | except Exception as e: 65 | print("Save_Text Exception: " + str(e)) 66 | else: 67 | # 内容写入文件成功 68 | print("[" + str(id) + "] Successfully save the file to " + path) 69 | f.close() 70 | 71 | 72 | # torrent and magnet-link page 73 | # 种子/磁力链接页面 74 | def Prase_Torrent(id, url, folder_path): 75 | try: 76 | if (config.is_proxy == True): 77 | req = requests.get(url, params=config.torrent_request_header, proxies=config.proxies) 78 | else: 79 | req = requests.get(url, params=config.torrent_request_header) 80 | 81 | # soup转换 82 | soup = BeautifulSoup(req.content, "html.parser") 83 | 84 | torrent_content = soup.select('.uk-button ') 85 | torrent_content_num = len(torrent_content) 86 | if torrent_content_num == 0: 87 | print("[" + str(id) + "] No match torrent.") 88 | return '' 89 | for content in torrent_content: 90 | str_content = str(content) 91 | # 
匹配磁力链接 92 | matchObj = re.search(r'magnet(.*?)"', str_content) 93 | if matchObj: 94 | magnet_link = 'magnet' + matchObj.group(1) 95 | return magnet_link 96 | else: 97 | # 匹配失败 98 | print("[" + str(id) + "] No match: " + str_content) 99 | return '' 100 | except Exception as e: 101 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 102 | 103 | 104 | # each post page 105 | # 每个帖子页面 106 | def Prase_Post(id, url, folder_name): 107 | try: 108 | if (config.is_proxy == True): 109 | req = requests.get(url, params=config.request_header, proxies=config.proxies) 110 | else: 111 | req = requests.get(url, params=config.request_header) 112 | 113 | # 转换编码 114 | encode_content = Encode_Conversion(req) 115 | # soup转换 116 | soup = BeautifulSoup(encode_content, "html.parser") 117 | 118 | post_content = soup.select('div[id="read_tpc"]') 119 | post_content_num = len(post_content) 120 | if post_content_num == 0: 121 | print("[" + str(id) + "] No match post.") 122 | return 123 | 124 | # 创建保存的文件夹 125 | folder_path = config.save_path + '/' + folder_name 126 | folder = os.path.exists(folder_path) 127 | if not folder: 128 | os.makedirs(folder_path) 129 | print("[" + str(id) + "] Created folder " + folder_name) 130 | 131 | # 保存文本内容 132 | result = post_content[0].text 133 | magnet_link = '' 134 | for content in post_content: 135 | str_content = str(content) 136 | 137 | # 匹配种子 138 | matchObj = re.findall(r'href="(.*?)"', str_content) 139 | if matchObj: 140 | for obj in matchObj: 141 | magnet_link = Prase_Torrent(id, obj, folder_path) 142 | else: 143 | # 匹配失败 144 | print("[" + str(id) + "] No match: " + str_content) 145 | 146 | # 匹配图片 147 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 148 | if matchObj: 149 | for obj in matchObj: 150 | objTemp = obj 151 | strlist = objTemp.split('/') 152 | strlen = len(strlist) 153 | if strlen != 0: 154 | img_name = strlist[strlen - 1] 155 | try: 156 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 157 | except Exception as e: 158 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 159 | else: 160 | print("[" + str(id) + "] Successfully save the picture to " + folder_path + '/' + img_name) 161 | else: 162 | # 匹配失败 163 | print("[" + str(id) + "] No match: " + str_content) 164 | # 保存到文件 165 | if magnet_link != '': 166 | result = result + '\n\n' + magnet_link 167 | Save_Text(id, folder_path + '/index.txt', result) 168 | except Exception as e: 169 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 170 | 171 | 172 | # post list page 173 | # 帖子列表页面 174 | def Post_list(id, page): 175 | try: 176 | post_url = config.base_url + 'thread-htm-fid-' + str(config.fid) + '-page-' + str(page) + '.html' 177 | print('[' + str(id) + '] clicked: ' + post_url) 178 | 179 | if (config.is_proxy == True): 180 | req = requests.get(post_url, params=config.request_header, proxies=config.proxies) 181 | else: 182 | req = requests.get(post_url, params=config.request_header) 183 | 184 | # 转换编码 185 | encode_content = Encode_Conversion(req) 186 | 187 | # soup转换 188 | soup = BeautifulSoup(encode_content, "html.parser") 189 | # 获取章节名称 190 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 191 | post_num = len(post_list) 192 | if post_num == 0: 193 | print("[" + str(id) + "] No match post_list.") 194 | return 195 | for post in post_list: 196 | str_post = str(post) 197 | # html网页的匹配 198 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 199 | if matchObj: 200 | post_url = matchObj.group(2) # URL 201 | post_name = 
matchObj.group(4) # 文件夹名 202 | if post_name != '': 203 | # 匹配每个帖子 204 | Prase_Post(id, config.base_url + post_url, 205 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 206 | u'').replace(u'*', 207 | u'')) 208 | else: 209 | # 匹配失败 210 | print("[" + str(id) + "] No match: " + str_post) 211 | except Exception as e: 212 | print("[" + str(id) + "] Post_list Exception." + str(e)) 213 | 214 | 215 | # multi-threaded, the parameter [id] is the thread id 216 | # 多线程,参数 [id] 为线程 id 217 | def Work_thread(id): 218 | try: 219 | if id <= config.page_end: 220 | prase_num = 0 221 | prase_more_one = 0 222 | page_num = abs(config.page_end - config.page_start) + 1 223 | if id <= int(page_num % config.thread_num): 224 | prase_more_one = 1 225 | page_num_each_thread = int(page_num / config.thread_num) + prase_more_one 226 | for each_page in range(config.page_start + id - 1, config.page_end + 1, config.thread_num): 227 | Post_list(id, each_page) 228 | prase_num += 1 229 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 230 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 231 | print('[' + str(id) + '] completed !!!!!') 232 | except Exception as e: 233 | print("[" + str(id) + "] Work_thread Exception." + str(e)) 234 | 235 | 236 | if __name__ == "__main__": 237 | config = JsonCommand() 238 | opener = urllib.request.build_opener() 239 | opener.addheaders = [(config.user_agent)] 240 | urllib.request.install_opener(opener) 241 | # single thread # 单线程 242 | # Work_thread(1) 243 | # multithreading # 多线程 244 | try: 245 | for i in range(1, config.thread_num + 1): 246 | _thread.start_new_thread(Work_thread, (i,)) 247 | except Exception as e: 248 | print("Start_new_thread Exception: " + str(e)) 249 | while 1: 250 | pass 251 | -------------------------------------------------------------------------------- /TorrentSpider_EuropeAmerica_DB.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import time 12 | import pymysql 13 | 14 | # 1024 http request header 15 | # 1024 网站请求头 16 | proxt_1024_req_header = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 18 | 'Accept-Encoding': 'gzip, deflate', 19 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 20 | 'Cache-Control': 'max-age=0', 21 | # 'Connection': 'keep - alive', 22 | 'Cookie': '__cfduid=d8a8419777cdc090aeacad5676c478c181548136023; UM_distinctid=16874190914afb' \ 23 | '-02debbef036148-46564b55-1fa400-168741909152aa; CNZZDATA1261158850=1725766245-' \ 24 | '1548135728-%7C1548135728; aafaf_threadlog=%2C7%2C5%2C110%2C18%2C106%2C14%2C22%2C; ' \ 25 | 'aafaf_readlog=%2C2024971%2C; aafaf_lastpos=F22; aafaf_lastvisit=2122%091548138145%09' \ 26 | '%2Fpw%2Fthread.php%3Ffid-22-page-1.html; aafaf_ol_offset=32470944', 27 | 'Host': 'h3.cnmbtgf.info', 28 | # 'Pragma': 'no-cache', 29 | 'Proxy-Connection': 'keep-alive', 30 | 'Referer': 'http://h3.cnmbtgf.info/pw/thread-htm-fid-22-page-2.html', 31 | 'Upgrade-Insecure-Requests': '1', 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' \ 33 | ' Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 34 | } 35 | request_header = proxt_1024_req_header 36 | 37 | # magnet-link 
website http request header 38 | # 磁力链接网站网站请求头 39 | proxt_torrent_req_header = { 40 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 41 | 'Accept-Encoding': 'gzip, deflate', 42 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 43 | 'Cache-Control': 'no-cache', 44 | # 'Connection': 'keep - alive', 45 | 'Cookie': '__cfduid=d941de1b4432ad5277d394ccf9eef5a521548136720; UM_distinctid=1687423abf6414' \ 46 | '-0e3d3cc25c160b-46564b55-1fa400-1687423abf7b62; CNZZDATA1273152310=28791063-' \ 47 | '1548133540-http%253A%252F%252Fh3.cnmbtgf.info%252F%7C1548133540; _ga=GA1.2.18' \ 48 | '32968654.1548136721; _gid=GA1.2.1853650139.1548136721', 49 | 'Host': 'www1.downsx.net', 50 | 'Pragma': 'no-cache', 51 | 'Proxy-Connection': 'keep-alive', 52 | 'Referer': 'http://h3.cnmbtgf.info/pw/html_data/22/1901/3863610.html', 53 | 'Upgrade-Insecure-Requests': '1', 54 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 55 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 56 | } 57 | torrent_request_header = proxt_torrent_req_header 58 | opener=urllib.request.build_opener() 59 | opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' \ 60 | ' (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116')] 61 | urllib.request.install_opener(opener) 62 | 63 | # proxy settings 64 | # 代理设置 65 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } 66 | proxies_header = proxies 67 | isProxy = False # 是否设置代理 68 | 69 | base_url = "http://h3.cnmbtgf.info/pw/" # 基础url 70 | save_path = "D:/code/Pycharm/1024Spider/torrent_europe_america" # 存储图片路径 71 | fid = 7 # fid=7 表示欧美 72 | page_start = 1 # 爬取的开始页 73 | page_end = 434 # 爬取的结束页 74 | thread_num = 1 # 线程数 75 | mySQLCommand = object 76 | 77 | 78 | # Used to execute database commands 79 | # 用于执行数据库命令 80 | class MySQLCommand(object): 81 | # init # 类的初始化 82 | def __init__(self): 83 | self.host = '' # 主机,本地填 127.0.0.1 84 | self.port = 3306 # 数据端口号 85 | self.user = '' # 数据库用户名 86 | self.password = "" # 数据库密码 87 | self.db = "" # 数据库名 88 | self.table_torrent = "EuropeAmerica" # 欧美新片信息表 89 | self.table_pictures = "EuropeAmericaPictures" # 欧美新片图片表 90 | 91 | # connect to database 92 | # 连接数据库 93 | def connect_mysql(self): 94 | try: 95 | self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, 96 | passwd=self.password, db=self.db, charset='utf8') 97 | self.cursor = self.conn.cursor() 98 | return 0 99 | except Exception as e: 100 | print('[error] connect mysql error.' 
+ str(e)) 101 | return -1 102 | 103 | # query database table 104 | # 查询表 105 | def query_table(self, tablename): 106 | sql = "SELECT * FROM " + tablename 107 | try: 108 | self.cursor.execute(sql) 109 | row = self.cursor.fetchone() 110 | print(row) 111 | print(self.cursor.rowcount) 112 | except Exception as e: 113 | print("Failed to " + sql + str(e)) 114 | 115 | # query porn information table 116 | # 查询影片信息表 117 | def query_table_torrent(self): 118 | self.query_table(self.table_torrent) 119 | 120 | # query pictures table 121 | # 查询图片表 122 | def query_table_pictures(self): 123 | self.query_table(self.table_pictures) 124 | 125 | # insert into [table_torrent] and return the primary key of the item just inserted 126 | # 插入到 [table_torrent] 返回刚插入的项的主键 127 | def insert_table_torrent(self, data='', name='', summary='', magnet=''): 128 | sql = "INSERT INTO " + self.table_torrent + " (data, name, summary, magnet) VALUES ('" + data + "', '" + \ 129 | name + "', '" + summary + "', '" + magnet + "')" 130 | try: 131 | self.cursor.execute(sql) 132 | self.conn.commit() 133 | print("Successfully insert " + name + " into " + self.table_torrent) 134 | except Exception as e: 135 | print("Failed to " + sql + str(e)) 136 | try: 137 | an_id = -1 138 | an_id = self.cursor.lastrowid 139 | if an_id != -1: 140 | return an_id 141 | except Exception as e: 142 | print("Failed to return last_insert_id." + str(e)) 143 | 144 | # insert into [table_pictures] 145 | # 插入到 table_pictures 146 | def insert_table_pictures(self, an_id='', name=''): 147 | sql = "INSERT INTO " + self.table_pictures + " (an_id, name) VALUES ('" + str(an_id) + "', '" + name + "')" 148 | try: 149 | self.cursor.execute(sql) 150 | self.conn.commit() 151 | print("Successfully insert " + name + " into " + self.table_pictures) 152 | except Exception as e: 153 | print("Failed to " + sql + str(e)) 154 | 155 | # close database 156 | # 关闭数据库连接 157 | def close_mysql(self): 158 | try: 159 | self.cursor.close() 160 | self.conn.close() 161 | except Exception as e: 162 | print("Failed to close mysql." 
+ str(e)) 163 | 164 | 165 | # conversion encode 166 | # 转换编码 167 | def Encode_Conversion(req): 168 | if req.encoding == 'ISO-8859-1': 169 | encodings = requests.utils.get_encodings_from_content(req.text) 170 | if encodings: 171 | encoding = encodings[0] 172 | else: 173 | encoding = req.apparent_encoding 174 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 175 | encode_content = req.content.decode(encoding, 'replace') 176 | return encode_content 177 | else: 178 | return "" 179 | 180 | 181 | # save [content] to [path] 182 | # 保存文本 183 | def Save_Text(id, path, content): 184 | try: 185 | f = open(path, "w", encoding='utf-8') 186 | f.write(content) 187 | except IOError: 188 | print("[" + str(id) + "] IOError: File open failed.") 189 | except Exception as e: 190 | print("Save_Text Exception: " + str(e)) 191 | else: 192 | # 内容写入文件成功 193 | print("[" + str(id) + "] Successfully save the file to " + path) 194 | f.close() 195 | 196 | 197 | # torrent and magnet-link page 198 | # 种子/磁力链接页面 199 | def Prase_Torrent(id, url): 200 | try: 201 | if (isProxy == True): 202 | req = requests.get(url, params=torrent_request_header, proxies=proxies_header) 203 | else: 204 | req = requests.get(url, params=torrent_request_header) 205 | 206 | # soup转换 207 | soup = BeautifulSoup(req.content, "html.parser") 208 | 209 | torrent_content = soup.select('.uk-button ') 210 | torrent_content_num = len(torrent_content) 211 | if torrent_content_num == 0: 212 | print("[" + str(id) + "] No match torrent.") 213 | return '' 214 | for content in torrent_content: 215 | str_content = str(content) 216 | # 匹配磁力链接 217 | matchObj = re.search(r'magnet(.*?)"', str_content) 218 | if matchObj: 219 | magnet_link = 'magnet' + matchObj.group(1) 220 | return magnet_link 221 | else: 222 | # 匹配失败 223 | print("[" + str(id) + "] No match: " + str_content) 224 | return '' 225 | except Exception as e: 226 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 227 | 228 | 229 | # each post page 230 | # 每个帖子页面 231 | def Prase_Post(id, url, folder_name): 232 | try: 233 | # 匹配日期 234 | data = '' 235 | matchObj = re.search(r'\[(.*?)\]', folder_name, re.M | re.I) 236 | if matchObj: 237 | data = matchObj.group(1) # 文件夹名 238 | else: 239 | # 匹配失败 240 | print("[" + str(id) + "] No match: " + folder_name) 241 | 242 | if (isProxy == True): 243 | req = requests.get(url, params=request_header, proxies=proxies_header) 244 | else: 245 | req = requests.get(url, params=request_header) 246 | 247 | # 转换编码 248 | encode_content = Encode_Conversion(req) 249 | # soup转换 250 | soup = BeautifulSoup(encode_content, "html.parser") 251 | 252 | post_content = soup.select('div[id="read_tpc"]') 253 | post_content_num = len(post_content) 254 | if post_content_num == 0: 255 | print("[" + str(id) + "] No match post.") 256 | return 257 | 258 | # 保存文本内容 259 | summary = post_content[0].text 260 | str_content = str(post_content[0]) 261 | 262 | # 匹配种子 263 | magnet_link = '' 264 | matchObj = re.findall(r'href="(.*?)"', str_content) 265 | if matchObj: 266 | for obj in matchObj: 267 | magnet_link = Prase_Torrent(id, obj) 268 | else: 269 | # 匹配种子失败 270 | print("[" + str(id) + "] No match: " + str_content) 271 | 272 | # 插入到数据库:insert_table_torrent 表 273 | an_id = -1 274 | if folder_name != '' and magnet_link != '': 275 | an_id = mySQLCommand.insert_table_torrent(data=data, name=folder_name, summary=summary, magnet=magnet_link) 276 | 277 | if an_id != -1: 278 | # 创建保存图片的文件夹 279 | folder_path = save_path + '/' + str(an_id) 280 | folder = 
os.path.exists(folder_path) 281 | if not folder: 282 | os.makedirs(folder_path) 283 | print("[" + str(id) + "] Created folder " + str(an_id)) 284 | 285 | # 匹配图片 286 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 287 | if matchObj: 288 | for obj in matchObj: 289 | objTemp = obj 290 | strlist = objTemp.split('/') 291 | strlen = len(strlist) 292 | if strlen != 0: 293 | img_name = strlist[strlen - 1] 294 | try: 295 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 296 | except Exception as e: 297 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 298 | else: 299 | print("[" + str(id) + "] Successfully save the image to " + folder_path + '/' + img_name) 300 | # 插入数据库:insert_table_pictures 表 301 | mySQLCommand.insert_table_pictures(an_id=an_id, name=img_name) 302 | else: 303 | # 匹配失败 304 | print("[" + str(id) + "] No match: " + str_content) 305 | except Exception as e: 306 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 307 | 308 | 309 | # post list page 310 | # 帖子列表页面 311 | def Post_list(id, page): 312 | try: 313 | post_url = base_url + 'thread-htm-fid-' + str(fid) + '-page-' + str(page) + '.html' 314 | print('[' + str(id) + '] clicked: ' + post_url) 315 | 316 | if (isProxy == True): 317 | req = requests.get(post_url, params=request_header, proxies=proxies_header) 318 | else: 319 | req = requests.get(post_url, params=request_header) 320 | 321 | # 转换编码 322 | encode_content = Encode_Conversion(req) 323 | 324 | # soup转换 325 | soup = BeautifulSoup(encode_content, "html.parser") 326 | # 获取章节名称 327 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 328 | post_num = len(post_list) 329 | if post_num == 0: 330 | print("[" + str(id) + "] No match post_list.") 331 | return 332 | for post in post_list: 333 | str_post = str(post) 334 | # html网页的匹配 335 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 336 | if matchObj: 337 | post_url = matchObj.group(2) # URL 338 | post_name = matchObj.group(4) # 文件夹名 339 | if post_name != '': 340 | # 匹配每个帖子 341 | Prase_Post(id, base_url + post_url, 342 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 343 | u'').replace(u'*', 344 | u'')) 345 | else: 346 | # 匹配失败 347 | print("[" + str(id) + "] No match: " + str_post) 348 | except Exception as e: 349 | print("[" + str(id) + "] Post_list Exception." + str(e)) 350 | 351 | 352 | # multi-threaded, the parameter [id] is the thread id 353 | # 多线程,参数 [id] 为线程 id 354 | def Work_thread(id): 355 | try: 356 | if id <= page_end: 357 | prase_num = 0 358 | prase_more_one = 0 359 | page_num = abs(page_end - page_start) + 1 360 | if id <= int(page_num % thread_num): 361 | prase_more_one = 1 362 | page_num_each_thread = int(page_num / thread_num) + prase_more_one 363 | for each_page in range(page_start + id - 1, page_end + 1, thread_num): 364 | Post_list(id, each_page) 365 | prase_num += 1 366 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 367 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 368 | print('[' + str(id) + '] completed !!!!!') 369 | except Exception as e: 370 | print("[" + str(id) + "] Work_thread Exception." 
+ str(e)) 371 | 372 | 373 | if __name__ == "__main__": 374 | # database command object 375 | # 数据库命令对象 376 | mySQLCommand = MySQLCommand() 377 | if mySQLCommand.connect_mysql() != -1: 378 | # single thread # 单线程 379 | # Work_thread(1) 380 | # multithreading # 多线程 381 | try: 382 | for i in range(1, thread_num + 1): 383 | _thread.start_new_thread(Work_thread, (i,)) 384 | except Exception as e: 385 | print("Start_new_thread Exception: " + str(e)) 386 | while 1: 387 | pass 388 | mySQLCommand.close_mysql() 389 | -------------------------------------------------------------------------------- /TorrentSpider_JapaneseCavalry_DB.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import time 12 | import pymysql 13 | 14 | # 1024 http request header 15 | # 1024 网站请求头 16 | proxt_1024_req_header = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 18 | 'Accept-Encoding': 'gzip, deflate', 19 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 20 | 'Cache-Control': 'max-age=0', 21 | # 'Connection': 'keep - alive', 22 | 'Cookie': '__cfduid=d8a8419777cdc090aeacad5676c478c181548136023; UM_distinctid=16874190914afb' \ 23 | '-02debbef036148-46564b55-1fa400-168741909152aa; CNZZDATA1261158850=1725766245-' \ 24 | '1548135728-%7C1548135728; aafaf_threadlog=%2C7%2C5%2C110%2C18%2C106%2C14%2C22%2C; ' \ 25 | 'aafaf_readlog=%2C2024971%2C; aafaf_lastpos=F22; aafaf_lastvisit=2122%091548138145%09' \ 26 | '%2Fpw%2Fthread.php%3Ffid-22-page-1.html; aafaf_ol_offset=32470944', 27 | 'Host': 'h3.cnmbtgf.info', 28 | # 'Pragma': 'no-cache', 29 | 'Proxy-Connection': 'keep-alive', 30 | 'Referer': 'http://h3.cnmbtgf.info/pw/thread-htm-fid-22-page-2.html', 31 | 'Upgrade-Insecure-Requests': '1', 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' \ 33 | ' Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 34 | } 35 | request_header = proxt_1024_req_header 36 | 37 | # magnet-link website http request header 38 | # 磁力链接网站网站请求头 39 | proxt_torrent_req_header = { 40 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 41 | 'Accept-Encoding': 'gzip, deflate', 42 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 43 | 'Cache-Control': 'no-cache', 44 | # 'Connection': 'keep - alive', 45 | 'Cookie': '__cfduid=d941de1b4432ad5277d394ccf9eef5a521548136720; UM_distinctid=1687423abf6414' \ 46 | '-0e3d3cc25c160b-46564b55-1fa400-1687423abf7b62; CNZZDATA1273152310=28791063-' \ 47 | '1548133540-http%253A%252F%252Fh3.cnmbtgf.info%252F%7C1548133540; _ga=GA1.2.18' \ 48 | '32968654.1548136721; _gid=GA1.2.1853650139.1548136721', 49 | 'Host': 'www1.downsx.net', 50 | 'Pragma': 'no-cache', 51 | 'Proxy-Connection': 'keep-alive', 52 | 'Referer': 'http://h3.cnmbtgf.info/pw/html_data/22/1901/3863610.html', 53 | 'Upgrade-Insecure-Requests': '1', 54 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 55 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 56 | } 57 | torrent_request_header = proxt_torrent_req_header 58 | opener=urllib.request.build_opener() 59 | opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 
10.0; Win64; x64) AppleWebKit/537.36' \ 60 | ' (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116')] 61 | urllib.request.install_opener(opener) 62 | 63 | # proxy settings 64 | # 代理设置 65 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } 66 | proxies_header = proxies 67 | isProxy = False # 是否设置代理 68 | 69 | base_url = "http://h3.cnmbtgf.info/pw/" # 基础url 70 | save_path = "D:/code/Pycharm/1024Spider/torrent_japanese_cavalry" # 存储图片路径 71 | fid = 22 # fid=22 表示日本骑兵 72 | page_start = 1 # 爬取的开始页 73 | page_end = 1332 # 爬取的结束页 74 | thread_num = 1 # 线程数 75 | mySQLCommand = object 76 | 77 | 78 | # Used to execute database commands 79 | # 用于执行数据库命令 80 | class MySQLCommand(object): 81 | # init # 类的初始化 82 | def __init__(self): 83 | self.host = '' # 主机,本地填 127.0.0.1 84 | self.port = 3306 # 数据端口号 85 | self.user = '' # 数据库用户名 86 | self.password = "" # 数据库密码 87 | self.db = "" # 数据库名 88 | self.table_torrent = "JapaneseCavalry" # 日本骑兵信息表 89 | self.table_pictures = "JapaneseCavalryPictures" # 日本骑兵图片表 90 | 91 | # connect to database 92 | # 连接数据库 93 | def connect_mysql(self): 94 | try: 95 | self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, 96 | passwd=self.password, db=self.db, charset='utf8') 97 | self.cursor = self.conn.cursor() 98 | return 0 99 | except Exception as e: 100 | print('[error] connect mysql error.' + str(e)) 101 | return -1 102 | 103 | # query database table 104 | # 查询表 105 | def query_table(self, tablename): 106 | sql = "SELECT * FROM " + tablename 107 | try: 108 | self.cursor.execute(sql) 109 | row = self.cursor.fetchone() 110 | print(row) 111 | print(self.cursor.rowcount) 112 | except Exception as e: 113 | print("Failed to " + sql + str(e)) 114 | 115 | # query porn information table 116 | # 查询影片信息表 117 | def query_table_torrent(self): 118 | self.query_table(self.table_torrent) 119 | 120 | # query pictures table 121 | # 查询图片表 122 | def query_table_pictures(self): 123 | self.query_table(self.table_pictures) 124 | 125 | # insert into [table_torrent] and return the primary key of the item just inserted 126 | # 插入到 [table_torrent] 返回刚插入的项的主键 127 | def insert_table_torrent(self, data='', name='', summary='', magnet=''): 128 | sql = "INSERT INTO " + self.table_torrent + " (data, name, summary, magnet) VALUES ('" + data + "', '" + \ 129 | name + "', '" + summary + "', '" + magnet + "')" 130 | try: 131 | self.cursor.execute(sql) 132 | self.conn.commit() 133 | print("Successfully insert " + name + " into " + self.table_torrent) 134 | except Exception as e: 135 | print("Failed to " + sql + str(e)) 136 | try: 137 | an_id = -1 138 | an_id = self.cursor.lastrowid 139 | if an_id != -1: 140 | return an_id 141 | except Exception as e: 142 | print("Failed to return last_insert_id." + str(e)) 143 | 144 | # insert into [table_pictures] 145 | # 插入到 table_pictures 146 | def insert_table_pictures(self, an_id='', name=''): 147 | sql = "INSERT INTO " + self.table_pictures + " (an_id, name) VALUES ('" + str(an_id) + "', '" + name + "')" 148 | try: 149 | self.cursor.execute(sql) 150 | self.conn.commit() 151 | print("Successfully insert " + name + " into " + self.table_pictures) 152 | except Exception as e: 153 | print("Failed to " + sql + str(e)) 154 | 155 | # close database 156 | # 关闭数据库连接 157 | def close_mysql(self): 158 | try: 159 | self.cursor.close() 160 | self.conn.close() 161 | except Exception as e: 162 | print("Failed to close mysql." 
+ str(e)) 163 | 164 | 165 | # conversion encode 166 | # 转换编码 167 | def Encode_Conversion(req): 168 | if req.encoding == 'ISO-8859-1': 169 | encodings = requests.utils.get_encodings_from_content(req.text) 170 | if encodings: 171 | encoding = encodings[0] 172 | else: 173 | encoding = req.apparent_encoding 174 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 175 | encode_content = req.content.decode(encoding, 'replace') 176 | return encode_content 177 | else: 178 | return "" 179 | 180 | 181 | # save [content] to [path] 182 | # 保存文本 183 | def Save_Text(id, path, content): 184 | try: 185 | f = open(path, "w", encoding='utf-8') 186 | f.write(content) 187 | except IOError: 188 | print("[" + str(id) + "] IOError: File open failed.") 189 | except Exception as e: 190 | print("Save_Text Exception: " + str(e)) 191 | else: 192 | # 内容写入文件成功 193 | print("[" + str(id) + "] Successfully save the file to " + path) 194 | f.close() 195 | 196 | 197 | # torrent and magnet-link page 198 | # 种子/磁力链接页面 199 | def Prase_Torrent(id, url): 200 | try: 201 | if (isProxy == True): 202 | req = requests.get(url, params=torrent_request_header, proxies=proxies_header) 203 | else: 204 | req = requests.get(url, params=torrent_request_header) 205 | 206 | # soup转换 207 | soup = BeautifulSoup(req.content, "html.parser") 208 | 209 | torrent_content = soup.select('.uk-button ') 210 | torrent_content_num = len(torrent_content) 211 | if torrent_content_num == 0: 212 | print("[" + str(id) + "] No match torrent.") 213 | return '' 214 | for content in torrent_content: 215 | str_content = str(content) 216 | # 匹配磁力链接 217 | matchObj = re.search(r'magnet(.*?)"', str_content) 218 | if matchObj: 219 | magnet_link = 'magnet' + matchObj.group(1) 220 | return magnet_link 221 | else: 222 | # 匹配失败 223 | print("[" + str(id) + "] No match: " + str_content) 224 | return '' 225 | except Exception as e: 226 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 227 | 228 | 229 | # each post page 230 | # 每个帖子页面 231 | def Prase_Post(id, url, folder_name): 232 | try: 233 | # 匹配日期 234 | data = '' 235 | matchObj = re.search(r'\[(.*?)\]', folder_name, re.M | re.I) 236 | if matchObj: 237 | data = matchObj.group(1) # 文件夹名 238 | else: 239 | # 匹配失败 240 | print("[" + str(id) + "] No match: " + folder_name) 241 | 242 | if (isProxy == True): 243 | req = requests.get(url, params=request_header, proxies=proxies_header) 244 | else: 245 | req = requests.get(url, params=request_header) 246 | 247 | # 转换编码 248 | encode_content = Encode_Conversion(req) 249 | # soup转换 250 | soup = BeautifulSoup(encode_content, "html.parser") 251 | 252 | post_content = soup.select('div[id="read_tpc"]') 253 | post_content_num = len(post_content) 254 | if post_content_num == 0: 255 | print("[" + str(id) + "] No match post.") 256 | return 257 | 258 | # 保存文本内容 259 | summary = post_content[0].text 260 | str_content = str(post_content[0]) 261 | 262 | # 匹配种子 263 | magnet_link = '' 264 | matchObj = re.findall(r'href="(.*?)"', str_content) 265 | if matchObj: 266 | for obj in matchObj: 267 | magnet_link = Prase_Torrent(id, obj) 268 | else: 269 | # 匹配种子失败 270 | print("[" + str(id) + "] No match: " + str_content) 271 | 272 | # 插入到数据库:insert_table_torrent 表 273 | an_id = -1 274 | if folder_name != '' and magnet_link != '': 275 | an_id = mySQLCommand.insert_table_torrent(data=data, name=folder_name, summary=summary, magnet=magnet_link) 276 | 277 | if an_id != -1: 278 | # 创建保存图片的文件夹 279 | folder_path = save_path + '/' + str(an_id) 280 | folder = 
os.path.exists(folder_path) 281 | if not folder: 282 | os.makedirs(folder_path) 283 | print("[" + str(id) + "] Created folder " + str(an_id)) 284 | 285 | # 匹配图片 286 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 287 | if matchObj: 288 | for obj in matchObj: 289 | objTemp = obj 290 | strlist = objTemp.split('/') 291 | strlen = len(strlist) 292 | if strlen != 0: 293 | img_name = strlist[strlen - 1] 294 | try: 295 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 296 | except Exception as e: 297 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 298 | else: 299 | print("[" + str(id) + "] Successfully save the image to " + folder_path + '/' + img_name) 300 | # 插入数据库:insert_table_pictures 表 301 | mySQLCommand.insert_table_pictures(an_id=an_id, name=img_name) 302 | else: 303 | # 匹配失败 304 | print("[" + str(id) + "] No match: " + str_content) 305 | except Exception as e: 306 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 307 | 308 | 309 | # post list page 310 | # 帖子列表页面 311 | def Post_list(id, page): 312 | try: 313 | post_url = base_url + 'thread-htm-fid-' + str(fid) + '-page-' + str(page) + '.html' 314 | print('[' + str(id) + '] clicked: ' + post_url) 315 | 316 | if (isProxy == True): 317 | req = requests.get(post_url, params=request_header, proxies=proxies_header) 318 | else: 319 | req = requests.get(post_url, params=request_header) 320 | 321 | # 转换编码 322 | encode_content = Encode_Conversion(req) 323 | 324 | # soup转换 325 | soup = BeautifulSoup(encode_content, "html.parser") 326 | # 获取章节名称 327 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 328 | post_num = len(post_list) 329 | if post_num == 0: 330 | print("[" + str(id) + "] No match post_list.") 331 | return 332 | for post in post_list: 333 | str_post = str(post) 334 | # html网页的匹配 335 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 336 | if matchObj: 337 | post_url = matchObj.group(2) # URL 338 | post_name = matchObj.group(4) # 文件夹名 339 | if post_name != '': 340 | # 匹配每个帖子 341 | Prase_Post(id, base_url + post_url, 342 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 343 | u'').replace(u'*', 344 | u'')) 345 | else: 346 | # 匹配失败 347 | print("[" + str(id) + "] No match: " + str_post) 348 | except Exception as e: 349 | print("[" + str(id) + "] Post_list Exception." + str(e)) 350 | 351 | 352 | # multi-threaded, the parameter [id] is the thread id 353 | # 多线程,参数 [id] 为线程 id 354 | def Work_thread(id): 355 | try: 356 | if id <= page_end: 357 | prase_num = 0 358 | prase_more_one = 0 359 | page_num = abs(page_end - page_start) + 1 360 | if id <= int(page_num % thread_num): 361 | prase_more_one = 1 362 | page_num_each_thread = int(page_num / thread_num) + prase_more_one 363 | for each_page in range(page_start + id - 1, page_end + 1, thread_num): 364 | Post_list(id, each_page) 365 | prase_num += 1 366 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 367 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 368 | print('[' + str(id) + '] completed !!!!!') 369 | except Exception as e: 370 | print("[" + str(id) + "] Work_thread Exception." 
+ str(e)) 371 | 372 | 373 | if __name__ == "__main__": 374 | # database command object 375 | # 数据库命令对象 376 | mySQLCommand = MySQLCommand() 377 | if mySQLCommand.connect_mysql() != -1: 378 | # single thread # 单线程 379 | # Work_thread(1) 380 | # multithreading # 多线程 381 | try: 382 | for i in range(1, thread_num + 1): 383 | _thread.start_new_thread(Work_thread, (i,)) 384 | except Exception as e: 385 | print("Start_new_thread Exception: " + str(e)) 386 | while 1: 387 | pass 388 | mySQLCommand.close_mysql() 389 | -------------------------------------------------------------------------------- /TorrentSpider_LatestCollection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import time 12 | 13 | 14 | # 1024 http request header 15 | # 1024 网站请求头 16 | proxt_1024_req_header = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 18 | 'Accept-Encoding': 'gzip, deflate', 19 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 20 | 'Cache-Control': 'no-cache', 21 | # 'Connection': 'keep - alive', 22 | 'Cookie': '__cfduid=d7e5c699ef4d6599ef01239424b0e6cd71547705542; aafaf_lastvisit=0%09154' \ 23 | '7705542%09%2Fpw%2Findex.php%3F; UM_distinctid=1685a707030539-0653970bbabd2b-46564b55' \ 24 | '-1fa400-1685a707031a0a; CNZZDATA1261158850=317005769-1547705297-%7C1547705297', 25 | 'Host': 'w3.jbzcjsj.pw', 26 | 'Pragma': 'no-cache', 27 | 'Proxy-Connection': 'keep-alive', 28 | #'Referer': 'http://w3.afulyu.pw/pw/thread.php?fid=17&page=1', 29 | 'Upgrade-Insecure-Requests': '1', 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 31 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 32 | } 33 | request_header = proxt_1024_req_header 34 | 35 | # magnet-link website http request header 36 | # 磁力链接网站网站请求头 37 | proxt_torrent_req_header = { 38 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 39 | 'Accept-Encoding': 'gzip, deflate', 40 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 41 | 'Cache-Control': 'no-cache', 42 | # 'Connection': 'keep - alive', 43 | 'Cookie': '__cfduid=d062c450fc125c2a02de05db8586dc1941547731587; UM_distinctid=1685bfdd4' \ 44 | 'd4854-0edeecf536f3fc-46564b55-1fa400-1685bfdd4d515b4; CNZZDATA1273152310=651528679' \ 45 | '-1547731013-http%253A%252F%252Fw3.jbzcjsj.pw%252F%7C1547731013; _ga=GA1.2.845482462.' 
\ 46 | '1547731588; _gid=GA1.2.2026642011.1547731588', 47 | 'Host': 'www1.downsx.club', 48 | 'Pragma': 'no-cache', 49 | 'Proxy-Connection': 'keep-alive', 50 | 'Referer': 'http://w3.jbzcjsj.pw/pw/html_data/3/1901/3855151.html', 51 | 'Upgrade-Insecure-Requests': '1', 52 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 53 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 54 | } 55 | torrent_request_header = proxt_torrent_req_header 56 | opener=urllib.request.build_opener() 57 | opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' \ 58 | ' (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116')] 59 | urllib.request.install_opener(opener) 60 | 61 | # proxy settings 62 | # 代理设置 63 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } 64 | proxies_header = proxies 65 | isProxy = False # 是否设置代理 66 | 67 | base_url = "http://w3.jbzcjsj.pw/pw/" # 基础url 68 | save_path = "D:/code/Pycharm/1024Spider/torrent" # 存储图片路径 69 | fid = 3 # fid=3 表示最新合集 70 | page_start = 1 # 爬取的开始页 71 | page_end = 245 # 爬取的结束页 72 | thread_num = 1 # 线程数 73 | 74 | 75 | # conversion encode 76 | # 转换编码 77 | def Encode_Conversion(req): 78 | if req.encoding == 'ISO-8859-1': 79 | encodings = requests.utils.get_encodings_from_content(req.text) 80 | if encodings: 81 | encoding = encodings[0] 82 | else: 83 | encoding = req.apparent_encoding 84 | 85 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 86 | encode_content = req.content.decode(encoding, 'replace') # 如果设置为replace,则会用?取代非法字符; 87 | return encode_content 88 | else: 89 | return "" 90 | 91 | 92 | # save [content] to [path] 93 | # 保存文本 94 | def Save_Text(id, path, content): 95 | try: 96 | f = open(path, "w", encoding='utf-8') 97 | f.write(content) 98 | except IOError: 99 | print("[" + str(id) + "] IOError: File open failed.") 100 | except Exception as e: 101 | print("Save_Text Exception: " + str(e)) 102 | else: 103 | # 内容写入文件成功 104 | print("[" + str(id) + "] Successfully save the file to " + path) 105 | f.close() 106 | 107 | 108 | # torrent and magnet-link page 109 | # 种子/磁力链接页面 110 | def Prase_Torrent(id, url, folder_path): 111 | try: 112 | if (isProxy == True): 113 | req = requests.get(url, params=torrent_request_header, proxies=proxies_header) 114 | else: 115 | req = requests.get(url, params=torrent_request_header) 116 | 117 | # soup转换 118 | soup = BeautifulSoup(req.content, "html.parser") 119 | 120 | torrent_content = soup.select('.uk-button ') 121 | torrent_content_num = len(torrent_content) 122 | if torrent_content_num == 0: 123 | print("[" + str(id) + "] No match torrent.") 124 | return 125 | for content in torrent_content: 126 | str_content = str(content) 127 | # 匹配磁力链接 128 | matchObj = re.search(r'magnet(.*?)"', str_content) 129 | if matchObj: 130 | magnet_link = 'magnet' + matchObj.group(1) 131 | urlTemp = url 132 | strlist = urlTemp.split('/') 133 | strlen = len(strlist) 134 | if strlen != 0: 135 | torrent_name = strlist[strlen - 1] 136 | if torrent_name != "": 137 | savePath = folder_path + "/" + str(torrent_name).replace(u'\0', u'').replace(u'\t', 138 | u'') + ".txt" 139 | Save_Text(id, savePath, magnet_link) 140 | else: 141 | # 匹配失败 142 | print("[" + str(id) + "] No match: " + str_content) 143 | except Exception as e: 144 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 145 | 146 | 147 | # each post page 148 | # 每个帖子页面 149 | def Prase_Post(id, url, folder_name): 150 | try: 151 | if 
(isProxy == True): 152 | req = requests.get(url, params=request_header, proxies=proxies_header) 153 | else: 154 | req = requests.get(url, params=request_header) 155 | 156 | # 转换编码 157 | encode_content = Encode_Conversion(req) 158 | # soup转换 159 | soup = BeautifulSoup(encode_content, "html.parser") 160 | 161 | post_content = soup.select('div[id="read_tpc"]') 162 | post_content_num = len(post_content) 163 | if post_content_num == 0: 164 | print("[" + str(id) + "] No match post.") 165 | return 166 | 167 | # 创建保存的文件夹 168 | folder_path = save_path + '/' + folder_name 169 | folder = os.path.exists(folder_path) 170 | if not folder: 171 | os.makedirs(folder_path) 172 | print("[" + str(id) + "] Created folder " + folder_name) 173 | 174 | # 保存文本内容 175 | result = post_content[0].text 176 | Save_Text(id, folder_path + '/index.txt', result) 177 | for content in post_content: 178 | str_content = str(content) 179 | 180 | # 匹配种子 181 | matchObj = re.findall(r'href="(.*?)"', str_content) 182 | if matchObj: 183 | for obj in matchObj: 184 | Prase_Torrent(id, obj, folder_path) 185 | else: 186 | # 匹配失败 187 | print("[" + str(id) + "] No match: " + str_content) 188 | 189 | # 匹配图片 190 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 191 | if matchObj: 192 | for obj in matchObj: 193 | objTemp = obj 194 | strlist = objTemp.split('/') 195 | strlen = len(strlist) 196 | if strlen != 0: 197 | img_name = strlist[strlen - 1] 198 | try: 199 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 200 | except Exception as e: 201 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 202 | else: 203 | print("[" + str(id) + "] Successfully save the image to " + folder_path + '/' + img_name) 204 | else: 205 | # 匹配失败 206 | print("[" + str(id) + "] No match: " + str_content) 207 | except Exception as e: 208 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 209 | 210 | 211 | # post list page 212 | # 帖子列表页面 213 | def Post_list(id, page): 214 | try: 215 | post_url = base_url + 'thread-htm-fid-' + str(fid) + '-page-' + str(page) + '.html' 216 | print('[' + str(id) + '] clicked: ' + post_url) 217 | 218 | if (isProxy == True): 219 | req = requests.get(post_url, params=request_header, proxies=proxies_header) 220 | else: 221 | req = requests.get(post_url, params=request_header) 222 | 223 | # 转换编码 224 | encode_content = Encode_Conversion(req) 225 | 226 | # soup转换 227 | soup = BeautifulSoup(encode_content, "html.parser") 228 | # 获取帖子名称 229 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 230 | post_num = len(post_list) 231 | if post_num == 0: 232 | print("[" + str(id) + "] No match post_list.") 233 | return 234 | for post in post_list: 235 | str_post = str(post) 236 | # 帖子网页的匹配 237 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 238 | if matchObj: 239 | post_url = matchObj.group(2) # URL 240 | post_name = matchObj.group(4) # 文件夹名 241 | if post_name != '': 242 | # 匹配每个帖子 243 | Prase_Post(id, base_url + post_url, 244 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 245 | u'').replace(u'*', 246 | u'')) 247 | else: 248 | # 匹配失败 249 | print("[" + str(id) + "] No match: " + str_post) 250 | except Exception as e: 251 | print("[" + str(id) + "] Post_list Exception." 
+ str(e)) 252 | 253 | 254 | # multi-threaded, the parameter [id] is the thread id 255 | # 多线程,参数 [id] 为线程 id 256 | def Work_thread(id): 257 | try: 258 | if id <= page_end: 259 | prase_num = 0 260 | prase_more_one = 0 261 | page_num = abs(page_end - page_start) + 1 262 | if id <= int(page_num % thread_num): 263 | prase_more_one = 1 264 | page_num_each_thread = int(page_num / thread_num) + prase_more_one 265 | for each_page in range(page_start + id - 1, page_end + 1, thread_num): 266 | Post_list(id, each_page) 267 | prase_num += 1 268 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 269 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 270 | print('[' + str(id) + '] completed !!!!!') 271 | except Exception as e: 272 | print("[" + str(id) + "] Work_thread Exception." + str(e)) 273 | 274 | 275 | if __name__ == "__main__": 276 | # single thread # 单线程 277 | # Work_thread(1) 278 | # multithreading # 多线程 279 | try: 280 | for i in range(1, thread_num + 1): 281 | _thread.start_new_thread(Work_thread, (i,)) 282 | except Exception as e: 283 | print("Start_new_thread Exception: " + str(e)) 284 | while 1: 285 | pass 286 | -------------------------------------------------------------------------------- /config_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "db": { 3 | "host": "127.0.0.1", 4 | "port": "3306", 5 | "user": "", 6 | "password": "", 7 | "db": "", 8 | "table_AsianNomosaic": "AsianNomosaic", 9 | "table_AsianNomosaicPictures": "AsianNomosaicPictures" 10 | }, 11 | "_1024_req_header": { 12 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 13 | "Accept-Encoding": "gzip, deflate", 14 | "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7", 15 | "Cache-Control": "no-cache", 16 | "Cookie": "__cfduid=d7e5c699ef4d6599ef01239424b0e6cd71547705542; aafaf_lastvisit=0%091547705542%09%2Fpw%2Findex.php%3F; UM_distinctid=1685a707030539-0653970bbabd2b-46564b55-1fa400-1685a707031a0a; CNZZDATA1261158850=317005769-1547705297-%7C1547705297", 17 | "Host": "w3.jbzcjsj.pw", 18 | "Pragma": "no-cache", 19 | "Proxy-Connection": "keep-alive", 20 | "Upgrade-Insecure-Requests": "1", 21 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116" 22 | }, 23 | "_torrent_req_header": { 24 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 25 | "Accept-Encoding": "gzip, deflate", 26 | "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7", 27 | "Cache-Control": "no-cache", 28 | "Cookie": "__cfduid=d062c450fc125c2a02de05db8586dc1941547731587; UM_distinctid=1685bfdd4d4854-0edeecf536f3fc-46564b55-1fa400-1685bfdd4d515b4; CNZZDATA1273152310=651528679-1547731013-http%253A%252F%252Fw3.jbzcjsj.pw%252F%7C1547731013; _ga=GA1.2.845482462.1547731588; _gid=GA1.2.2026642011.1547731588", 29 | "Host": "w3.jbzcjsj.pw", 30 | "Pragma": "no-cache", 31 | "Proxy-Connection": "keep-alive", 32 | "Referer": "http://w3.jbzcjsj.pw/pw/html_data/3/1901/3855151.html", 33 | "Upgrade-Insecure-Requests": "1", 34 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116" 35 | }, 36 | "proxies": { 37 | "http": "127.0.0.1:1080", 38 | "https": "127.0.0.1:1080" 39 | }, 40 | "is_proxy": "False", 41 | "fid": "5", 42 | "base_url": "http://w3.jbzcjsj.pw/pw/", 43 | "save_path": 
"D:/code/Pycharm/1024Spider/torrent_asian_nomosaic", 44 | "page_start": "1", 45 | "page_end": "913", 46 | "thread_num": "1" 47 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | html5lib 3 | lxml 4 | requests 5 | urllib3 6 | pymysql --------------------------------------------------------------------------------