├── LICENSE ├── NovelSpider.py ├── README.md ├── TorrentSpider_AsianNomosaic.py ├── TorrentSpider_AsianNomosaic_DB.py ├── TorrentSpider_AsianNomosaic_With_Json.py ├── TorrentSpider_EuropeAmerica_DB.py ├── TorrentSpider_JapaneseCavalry_DB.py ├── TorrentSpider_LatestCollection.py ├── config_template.json └── requirements.txt /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Menghui Xie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NovelSpider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: UTF-8 -*- 3 | import urllib 4 | import urllib.parse 5 | import urllib.request 6 | from urllib.request import urlopen 7 | import requests 8 | import threading 9 | from bs4 import BeautifulSoup 10 | import re 11 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 12 | 13 | 14 | # conversion encode 15 | # 转换编码 16 | def encodeConversion(req): 17 | if req.encoding == 'ISO-8859-1': 18 | encodings = requests.utils.get_encodings_from_content(req.text) 19 | if encodings: 20 | encoding = encodings[0] 21 | else: 22 | encoding = req.apparent_encoding 23 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 24 | encode_content = req.content.decode(encoding, 'replace') 25 | return encode_content 26 | else: 27 | return "" 28 | 29 | 30 | # 设置 http 请求的参数 31 | # set the parameters of the http request 32 | def set_query_parameter(url, param_name, param_value): 33 | """Given a URL, set or replace a query parameter and return the 34 | modified URL. 
35 | 36 | >>> set_query_parameter('http://example.com?foo=bar&biz=baz', 'foo', 'stuff') 37 | 'http://example.com?foo=stuff&biz=baz' 38 | 39 | """ 40 | scheme, netloc, path, query_string, fragment = urlsplit(url) 41 | query_params = parse_qs(query_string) 42 | 43 | query_params[param_name] = [param_value] 44 | new_query_string = urlencode(query_params, doseq=True) 45 | 46 | return urlunsplit((scheme, netloc, path, new_query_string, fragment)) 47 | 48 | 49 | # each novel post page 50 | # 每个小说帖子页面 51 | def praseHtml(req_url , headers, path): 52 | try: 53 | # 请求当前章节页面,params 为请求参数 54 | global isProxy 55 | global proxies 56 | if (isProxy == True): 57 | req = requests.get(req_url, params=headers, proxies=proxies) 58 | else: 59 | req = requests.get(req_url, params=headers) 60 | 61 | # 转换编码 62 | encode_content = encodeConversion(req) 63 | # soup转换 64 | soup = BeautifulSoup(encode_content, "html.parser") 65 | # 获取章节名称 66 | section_name = soup.select('#subject_tpc')[0].text 67 | # 获取章节文本 68 | section_text = soup.select('#read_tpc')[0].text 69 | result = section_name + '\n' + section_text 70 | result = result.replace('  ', '\n ') 71 | 72 | if result != "" and section_name != "": 73 | savePath = path + "\\" + str(section_name).replace(u'\0', u'').replace(u'\t', u'') + ".txt" 74 | f = open(savePath, "w", encoding='utf-8') 75 | f.write(result) 76 | except ValueError: 77 | print("ValueError: 传入无效的参数" + req_url) 78 | except IndexError: 79 | print("IndexError: 没有此网页索引:" + req_url) 80 | except IOError: 81 | print("IOError: 没有找到文件或读取文件失败" + req_url) 82 | except Exception as e: 83 | print("Exception: 存在异常" + e + req_url) 84 | else: 85 | # 内容写入文件成功 86 | print(req_url, end='') 87 | f.close() 88 | 89 | 90 | # novel post list page 91 | # 成人小说帖子列表页面 92 | def novelList(directory_url, fid, page , chapter_url, headers, path): 93 | # content_url = directory_url + '?fid='+str(fid)+"&page="+str(page) 94 | 95 | directory_url = set_query_parameter(directory_url, 'fid', fid) 96 | directory_url = set_query_parameter(directory_url, 'page', page) 97 | 98 | print(directory_url + ' start downloading') 99 | 100 | # 请求当前章节页面 params为请求参数 101 | global isProxy 102 | if(isProxy == True): 103 | req = requests.get(directory_url, params=headers, proxies=proxies) 104 | else: 105 | req = requests.get(directory_url, params=headers) 106 | 107 | # 转换编码 108 | encode_content = encodeConversion(req) 109 | 110 | # soup转换 111 | soup = BeautifulSoup(encode_content, "html.parser") 112 | # 获取章节名称 113 | section_list = soup.select('.tr3 h3 a') 114 | section_num = len(section_list) 115 | if section_num == 0: 116 | print("目录页面不正确,无法找到匹配项!") 117 | return -1 118 | for section in section_list: 119 | str_section = str(section) 120 | # php网页的匹配 121 | matchObj_act = re.match(r'(.*)a_ajax_(.*)">(.*?)', str_section, re.M | re.I) 122 | if matchObj_act: 123 | section_sub = matchObj_act.group(2) # 章节的标识 124 | section_name = matchObj_act.group(3) # 章节的名字 125 | global php_chapter_url # php的章节URL 126 | php_chapter_url = set_query_parameter(php_chapter_url, 'tid', section_sub) 127 | php_chapter_url = set_query_parameter(php_chapter_url, 'fpage', page) 128 | praseHtml(php_chapter_url, headers, path) 129 | prase_num = section_list.index(section) + 1 130 | print(' [ ' + "{:.1f}".format(prase_num / section_num * 100) + '% chapter completed ] ') 131 | else: 132 | # html网页的匹配 133 | matchObj = re.match(r'(.*)href="htm_data(.*)" id=(.*)>(.*?)', str_section, re.M | re.I) 134 | if matchObj: 135 | section_sub = matchObj.group(2) # 章节的标识 136 | section_name = matchObj.group(4) # 
章节的名字 137 | # 传入html章节的URL 138 | praseHtml(chapter_url + section_sub, headers, path) 139 | prase_num = section_list.index(section) + 1 140 | print(' [ ' + "{:.1f}".format(prase_num / section_num * 100) + '% chapter completed ] ') 141 | else: 142 | # 匹配失败 143 | print("No match: " + str_section) 144 | return -1 145 | 146 | 147 | # Crawl start page to end page, statistical results 148 | # 爬取开始页到结束页,统计结果 149 | def spider(directory_url, fid, page_start, page_end, chapter_url, novel_list_req_header, path): 150 | page_num = abs(page_end-page_start)+1 151 | for each_page in range(page_start, page_end): 152 | list_return = novelList(directory_url, fid, each_page, chapter_url, novel_list_req_header, path) 153 | if list_return == -1: 154 | break 155 | prase_num = abs(each_page - page_start)+1 156 | print(' [ ' + "{:.1f}".format(prase_num/page_num*100) + '% page completed ] ') 157 | 158 | 159 | if __name__ == "__main__": 160 | # request header 161 | # 请求头字典 162 | novel_list_req_header = { 163 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 164 | 'Accept-Encoding': 'gzip, deflate', 165 | 'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7', 166 | 'Cache-Control': 'no-cache', 167 | 'Connection': 'keep - alive', 168 | 'Cookie':'UM_distinctid=16574ce27ac246-04d3d1f1292635-9393265-1fa400-16574ce27aeff; aafaf_readlog'\ 169 | '=%2C1245721%2C; aafaf_ol_offset=35448165; CNZZDATA1261158850=1879378976-1535261549-%7C1535279419;'\ 170 | ' aafaf_lastpos=F17; aafaf_threadlog=%2C18%2C14%2C15%2C16%2C17%2C; aafaf_lastvisit=7839%09153528353'\ 171 | '2%09%2Fpw%2Fthread.php%3Ffid%3D17%26page%3D2', 172 | 'Host': 'w3.afulyu.pw', 173 | 'Pragma': 'no-cache', 174 | # 'Proxy-Connection': 'keep-alive', 175 | 'Referer': 'http://w3.afulyu.pw/pw/thread.php?fid=17&page=1', 176 | 'Upgrade-Insecure-Requests': '1', 177 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'\ 178 | ' Chrome/68.0.3440.106 Safari/537.36' 179 | } 180 | # proxy request header 181 | # 代理时的请求头字典 182 | proxt_novel_list_req_header = { 183 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 184 | 'Accept-Encoding': 'gzip, deflate', 185 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 186 | 'Cache-Control': 'no-cache', 187 | # 'Connection': 'keep - alive', 188 | 'Cookie': '__cfduid=d7e5c699ef4d6599ef01239424b0e6cd71547705542; aafaf_lastvisit=0%091547705542%09%2Fpw%2Findex.php%3F;' \ 189 | ' UM_distinctid=1685a707030539-0653970bbabd2b-46564b55-1fa400-1685a707031a0a; ' \ 190 | 'CNZZDATA1261158850=317005769-1547705297-%7C1547705297', 191 | 'Host': 'w3.jbzcjsj.pw', 192 | 'Pragma': 'no-cache', 193 | 'Proxy-Connection': 'keep-alive', 194 | 'Referer': 'http://w3.afulyu.pw/pw/thread.php?fid=17&page=1', 195 | 'Upgrade-Insecure-Requests': '1', 196 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 197 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 198 | } 199 | 200 | global php_chapter_url 201 | global isProxy 202 | global proxies 203 | directory_url = "http://w3.afulyu.pw/pw/thread.php" # 小说目录url 204 | html_chapter_url = 'http://w3.afulyu.pw/pw/htm_data' # 每篇小说的html页面 205 | php_chapter_url = 'http://w3.afulyu.pw/pw/read.php' # 每篇小说的php页面 206 | save_path = 'D:\\code\\Pycharm\\1024Spider\\novel' # 保存在本地的路径 207 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } # 代理信息 208 | fid = 17 # 网站帖子类型,17代表小说 209 | page_start = 1 # 小说目录开始页面 210 | 
page_end = 940 # 小说目录结束页面 211 | isProxy = False # 是否设置代理 212 | 213 | spider(directory_url, fid, page_start, page_end, html_chapter_url, proxt_novel_list_req_header, save_path) 214 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 1024 Spiders 2 | 3 | Some 1024 spiders, crawl xp1024 porn information and magnet-links and insert them into the database. 4 | 5 | * Python 3.6 6 | * MySQL 8.0 7 | 8 | ## Deploy 9 | 10 | ### Clone & Install 11 | 12 | ``` 13 | git clone git@github.com:huihut/1024Spiders.git 14 | cd 1024Spiders && pip install -r requirements.txt 15 | ``` 16 | 17 | ### Configure 18 | 19 | * No_Json 20 | 21 | Configure your database, request_header, save_path, etc. in the source code. 22 | 23 | * With_Json 24 | 25 | 1. `mv config_template.json config.json` ( [config_template.json](config_template.json) ) 26 | 2. Configure your database, request_header, save_path, etc. in the `config.json`. 27 | 28 | ### Run front-end process 29 | 30 | ``` 31 | python TorrentSpider_AsianNomosaic_DB.py 32 | ``` 33 | 34 | ### Run background process 35 | 36 | ``` 37 | nohup python -u TorrentSpider_AsianNomosaic_DB.py > TorrentSpider_AsianNomosaic_DB.log 2>&1 & 38 | ``` 39 | 40 | ## Database 41 | 42 | ```mysql 43 | mysql> show tables; 44 | +-------------------------+ 45 | | Tables_in_torrent | 46 | +-------------------------+ 47 | | AsianNomosaic | 48 | | AsianNomosaicPictures | 49 | | EuropeAmerica | 50 | | EuropeAmericaPictures | 51 | | JapaneseCavalry | 52 | | JapaneseCavalryPictures | 53 | +-------------------------+ 54 | 6 rows in set (0.01 sec) 55 | 56 | mysql> desc AsianNomosaic; 57 | +---------+-----------+------+-----+---------+----------------+ 58 | | Field | Type | Null | Key | Default | Extra | 59 | +---------+-----------+------+-----+---------+----------------+ 60 | | id | int(11) | NO | PRI | NULL | auto_increment | # porn id 61 | | data | char(10) | YES | | NULL | | # porn date 62 | | name | char(255) | NO | | NULL | | # porn name 63 | | summary | text | YES | | NULL | | # porn introduction 64 | | magnet | char(255) | NO | | NULL | | # porn magnet-link 65 | +---------+-----------+------+-----+---------+----------------+ 66 | 5 rows in set (0.00 sec) 67 | 68 | mysql> desc AsianNomosaicPictures; 69 | +-------+-----------+------+-----+---------+----------------+ 70 | | Field | Type | Null | Key | Default | Extra | 71 | +-------+-----------+------+-----+---------+----------------+ 72 | | id | int(11) | NO | PRI | NULL | auto_increment | # picture id 73 | | an_id | int(11) | NO | | NULL | | # porn id 74 | | name | char(255) | NO | | NULL | | # picture name 75 | +-------+-----------+------+-----+---------+----------------+ 76 | 3 rows in set (0.00 sec) 77 | ``` -------------------------------------------------------------------------------- /TorrentSpider_AsianNomosaic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import time 12 | import threading 13 | 14 | 15 | # 1024 http request header 16 | # 1024 网站请求头 17 | proxt_1024_req_header = { 18 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 19 
| 'Accept-Encoding': 'gzip, deflate', 20 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 21 | 'Cache-Control': 'max-age=0', 22 | # 'Connection': 'keep - alive', 23 | 'Cookie': '__cfduid=d4e99b476e7372dec9a44b67f533f37aa1548178386; aafaf_lastvisit=0%091548178386%' \ 24 | '09%2Fpw%2Fthread.php%3Ffid-5-page-5.html; aafaf_lastpos=F5; aafaf_threadlog=%2C5%2C; ' \ 25 | 'aafaf_ol_offset=32368318; UM_distinctid=168769f77ac958-0509e825886dfc-46564b55-1fa400-16876' \ 26 | '9f77ad1302; CNZZDATA1261158850=393281613-1548174901-%7C1548174901', 27 | 'Host': 'w3.jbzcjsj.pw', 28 | # 'Pragma': 'no-cache', 29 | 'Proxy-Connection': 'keep-alive', 30 | 'Referer': 'http://w3.jbzcjsj.pw/pw/thread-htm-fid-5-page-5.html', 31 | 'Upgrade-Insecure-Requests': '1', 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' \ 33 | ' Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 34 | } 35 | request_header = proxt_1024_req_header 36 | 37 | # magnet-link website http request header 38 | # 磁力链接网站网站请求头 39 | proxt_torrent_req_header = { 40 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 41 | 'Accept-Encoding': 'gzip, deflate', 42 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 43 | 'Cache-Control': 'no-cache', 44 | # 'Connection': 'keep - alive', 45 | 'Cookie': '__cfduid=d7f5104b5a516916674841b656d67dde31548178497; UM_distinctid=16876a1266dfb2-018bb685bad' \ 46 | '0ed-46564b55-1fa400-16876a1266e1d2; CNZZDATA1273152310=501204684-1548176963-http%253A%2' \ 47 | '52F%252Fw3.jbzcjsj.pw%252F%7C1548176963; _ga=GA1.2.1886522142.1548178499; _gid=GA1.2.16499' \ 48 | '32666.1548178499; _gat=1', 49 | 'Host': 'www1.downsx.com', 50 | 'Pragma': 'no-cache', 51 | 'Proxy-Connection': 'keep-alive', 52 | 'Referer': 'http://w3.jbzcjsj.pw/pw/html_data/5/1901/3863561.html', 53 | 'Upgrade-Insecure-Requests': '1', 54 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 55 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 56 | } 57 | torrent_request_header = proxt_torrent_req_header 58 | opener=urllib.request.build_opener() 59 | opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' \ 60 | ' (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116')] 61 | urllib.request.install_opener(opener) 62 | 63 | # proxy settings 64 | # 代理设置 65 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } 66 | proxies_header = proxies 67 | isProxy = False # 是否设置代理 68 | 69 | base_url = "http://w3.jbzcjsj.pw/pw/" # 基础url 70 | save_path = "D:/code/Pycharm/1024Spider/torrent_asian_nomosaic" # 存储图片路径 71 | fid = 5 # fid=5 表示亚洲无码 72 | page_start = 1 # 爬取的开始页 73 | page_end = 928 # 爬取的结束页 74 | thread_num = 1 # 线程数 75 | 76 | 77 | # conversion encode 78 | # 转换编码 79 | def Encode_Conversion(req): 80 | if req.encoding == 'ISO-8859-1': 81 | encodings = requests.utils.get_encodings_from_content(req.text) 82 | if encodings: 83 | encoding = encodings[0] 84 | else: 85 | encoding = req.apparent_encoding 86 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 87 | encode_content = req.content.decode(encoding, 'replace') 88 | return encode_content 89 | else: 90 | return "" 91 | 92 | 93 | # save [content] to [path] 94 | # 保存文本 95 | def Save_Text(id, path, content): 96 | try: 97 | f = open(path, "w", encoding='utf-8') 98 | f.write(content) 99 | except IOError: 100 | print("[" + str(id) + "] IOError: File open 
failed.") 101 | except Exception as e: 102 | print("Save_Text Exception: " + str(e)) 103 | else: 104 | # 内容写入文件成功 105 | print("[" + str(id) + "] Successfully save the file to " + path) 106 | f.close() 107 | 108 | 109 | # torrent and magnet-link page 110 | # 种子/磁力链接页面 111 | def Prase_Torrent(id, url, folder_path): 112 | try: 113 | if (isProxy == True): 114 | req = requests.get(url, params=torrent_request_header, proxies=proxies_header) 115 | else: 116 | req = requests.get(url, params=torrent_request_header) 117 | 118 | # soup转换 119 | soup = BeautifulSoup(req.content, "html.parser") 120 | 121 | torrent_content = soup.select('.uk-button ') 122 | torrent_content_num = len(torrent_content) 123 | if torrent_content_num == 0: 124 | print("[" + str(id) + "] No match torrent.") 125 | return '' 126 | for content in torrent_content: 127 | str_content = str(content) 128 | # 匹配磁力链接 129 | matchObj = re.search(r'magnet(.*?)"', str_content) 130 | if matchObj: 131 | magnet_link = 'magnet' + matchObj.group(1) 132 | return magnet_link 133 | else: 134 | # 匹配失败 135 | print("[" + str(id) + "] No match: " + str_content) 136 | return '' 137 | except Exception as e: 138 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 139 | 140 | 141 | # each post page 142 | # 每个帖子页面 143 | def Prase_Post(id, url, folder_name): 144 | try: 145 | if (isProxy == True): 146 | req = requests.get(url, params=request_header, proxies=proxies_header) 147 | else: 148 | req = requests.get(url, params=request_header) 149 | 150 | # 转换编码 151 | encode_content = Encode_Conversion(req) 152 | # soup转换 153 | soup = BeautifulSoup(encode_content, "html.parser") 154 | 155 | post_content = soup.select('div[id="read_tpc"]') 156 | post_content_num = len(post_content) 157 | if post_content_num == 0: 158 | print("[" + str(id) + "] No match post.") 159 | return 160 | 161 | # 创建保存的文件夹 162 | folder_path = save_path + '/' + folder_name 163 | folder = os.path.exists(folder_path) 164 | if not folder: 165 | os.makedirs(folder_path) 166 | print("[" + str(id) + "] Created folder " + folder_name) 167 | 168 | # 保存文本内容 169 | result = post_content[0].text 170 | magnet_link = '' 171 | for content in post_content: 172 | str_content = str(content) 173 | 174 | # 匹配种子 175 | matchObj = re.findall(r'href="(.*?)"', str_content) 176 | if matchObj: 177 | for obj in matchObj: 178 | magnet_link = Prase_Torrent(id, obj, folder_path) 179 | else: 180 | # 匹配失败 181 | print("[" + str(id) + "] No match: " + str_content) 182 | 183 | # 匹配图片 184 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 185 | if matchObj: 186 | for obj in matchObj: 187 | objTemp = obj 188 | strlist = objTemp.split('/') 189 | strlen = len(strlist) 190 | if strlen != 0: 191 | img_name = strlist[strlen - 1] 192 | try: 193 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 194 | except Exception as e: 195 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 196 | else: 197 | print("[" + str(id) + "] Successfully save the image to " + folder_path + '/' + img_name) 198 | else: 199 | # 匹配失败 200 | print("[" + str(id) + "] No match: " + str_content) 201 | # 保存到文件 202 | if magnet_link != '': 203 | result = result + '\n\n' + magnet_link 204 | Save_Text(id, folder_path + '/index.txt', result) 205 | except Exception as e: 206 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 207 | 208 | 209 | # post list page 210 | # 帖子列表页面 211 | def Post_list(id, page): 212 | try: 213 | post_url = base_url + 'thread-htm-fid-' + str(fid) + '-page-' + str(page) + '.html' 214 
| print('[' + str(id) + '] clicked: ' + post_url) 215 | 216 | if (isProxy == True): 217 | req = requests.get(post_url, params=request_header, proxies=proxies_header) 218 | else: 219 | req = requests.get(post_url, params=request_header) 220 | 221 | # 转换编码 222 | encode_content = Encode_Conversion(req) 223 | 224 | # soup转换 225 | soup = BeautifulSoup(encode_content, "html.parser") 226 | # 获取章节名称 227 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 228 | post_num = len(post_list) 229 | if post_num == 0: 230 | print("[" + str(id) + "] No match post_list.") 231 | return 232 | for post in post_list: 233 | str_post = str(post) 234 | # html网页的匹配 235 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 236 | if matchObj: 237 | post_url = matchObj.group(2) # URL 238 | post_name = matchObj.group(4) # 文件夹名 239 | if post_name != '': 240 | # 匹配每个帖子 241 | Prase_Post(id, base_url + post_url, 242 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 243 | u'').replace(u'*', 244 | u'')) 245 | else: 246 | # 匹配失败 247 | print("[" + str(id) + "] No match: " + str_post) 248 | except Exception as e: 249 | print("[" + str(id) + "] Post_list Exception." + str(e)) 250 | 251 | 252 | # multi-threaded, the parameter [id] is the thread id 253 | # 多线程,参数 [id] 为线程 id 254 | def Work_thread(id): 255 | try: 256 | if id <= page_end: 257 | prase_num = 0 258 | prase_more_one = 0 259 | page_num = abs(page_end - page_start) + 1 260 | if id <= int(page_num % thread_num): 261 | prase_more_one = 1 262 | page_num_each_thread = int(page_num / thread_num) + prase_more_one 263 | for each_page in range(page_start + id - 1, page_end + 1, thread_num): 264 | Post_list(id, each_page) 265 | prase_num += 1 266 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 267 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 268 | print('[' + str(id) + '] completed !!!!!') 269 | except Exception as e: 270 | print("[" + str(id) + "] Work_thread Exception." 
+ str(e)) 271 | 272 | 273 | if __name__ == "__main__": 274 | # single thread # 单线程 275 | # Work_thread(1) 276 | # multithreading # 多线程 277 | try: 278 | for i in range(1, thread_num + 1): 279 | _thread.start_new_thread(Work_thread, (i,)) 280 | except Exception as e: 281 | print("Start_new_thread Exception: " + str(e)) 282 | while 1: 283 | pass 284 | -------------------------------------------------------------------------------- /TorrentSpider_AsianNomosaic_DB.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import time 12 | import pymysql 13 | 14 | # 1024 http request header 15 | # 1024 网站请求头 16 | proxt_1024_req_header = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 18 | 'Accept-Encoding': 'gzip, deflate', 19 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 20 | 'Cache-Control': 'no-cache', 21 | # 'Connection': 'keep - alive', 22 | 'Cookie': '__cfduid=d7e5c699ef4d6599ef01239424b0e6cd71547705542; aafaf_lastvisit=0%09154' \ 23 | '7705542%09%2Fpw%2Findex.php%3F; UM_distinctid=1685a707030539-0653970bbabd2b-46564b55' \ 24 | '-1fa400-1685a707031a0a; CNZZDATA1261158850=317005769-1547705297-%7C1547705297', 25 | 'Host': 'w3.jbzcjsj.pw', 26 | 'Pragma': 'no-cache', 27 | 'Proxy-Connection': 'keep-alive', 28 | #'Referer': 'http://w3.afulyu.pw/pw/thread.php?fid=17&page=1', 29 | 'Upgrade-Insecure-Requests': '1', 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 31 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 32 | } 33 | request_header = proxt_1024_req_header 34 | 35 | # magnet-link website http request header 36 | # 磁力链接网站网站请求头 37 | proxt_torrent_req_header = { 38 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 39 | 'Accept-Encoding': 'gzip, deflate', 40 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 41 | 'Cache-Control': 'no-cache', 42 | # 'Connection': 'keep - alive', 43 | 'Cookie': '__cfduid=d062c450fc125c2a02de05db8586dc1941547731587; UM_distinctid=1685bfdd4' \ 44 | 'd4854-0edeecf536f3fc-46564b55-1fa400-1685bfdd4d515b4; CNZZDATA1273152310=651528679' \ 45 | '-1547731013-http%253A%252F%252Fw3.jbzcjsj.pw%252F%7C1547731013; _ga=GA1.2.845482462.' 
\ 46 | '1547731588; _gid=GA1.2.2026642011.1547731588', 47 | 'Host': 'www1.downsx.club', 48 | 'Pragma': 'no-cache', 49 | 'Proxy-Connection': 'keep-alive', 50 | 'Referer': 'http://w3.jbzcjsj.pw/pw/html_data/3/1901/3855151.html', 51 | 'Upgrade-Insecure-Requests': '1', 52 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 53 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 54 | } 55 | torrent_request_header = proxt_torrent_req_header 56 | opener=urllib.request.build_opener() 57 | opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' \ 58 | ' (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116')] 59 | urllib.request.install_opener(opener) 60 | 61 | # proxy settings 62 | # 代理设置 63 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } 64 | proxies_header = proxies 65 | isProxy = False # whether to set proxy # 是否设置代理 66 | 67 | base_url = "http://w3.jbzcjsj.pw/pw/" # xp1024's base url # xp1024的基本链接 68 | save_path = "D:/code/Pycharm/1024Spider/torrent_asian_nomosaic" # pictures save path # 图片保存路径 69 | fid = 5 # Fid=5 means Asian porn without mosaics. # Fid=5 表示亚洲无码 70 | page_start = 1 # crawl start page # 爬取的开始页 71 | page_end = 913 # crawl end page # 爬取的结束页 72 | thread_num = 1 # number of threads # 线程数 73 | mySQLCommand = object 74 | 75 | 76 | # Used to execute database commands 77 | # 用于执行数据库命令 78 | class MySQLCommand(object): 79 | # init # 类的初始化 80 | def __init__(self): 81 | self.host = '' # host ip or domain name,local is [127.0.0.1] # 数据库所在的主机 82 | self.port = 3306 # database port # 数据库端口号 83 | self.user = '' # database username # 数据库用户名 84 | self.password = "" # database password # 数据库密码 85 | self.db = "" # database name # 数据库名 86 | self.table_torrent = "AsianNomosaic" # porn information table # 影片信息表 87 | self.table_pictures = "AsianNomosaicPictures" # pictures table # 图片表 88 | 89 | # connect to database 90 | # 连接数据库 91 | def connect_mysql(self): 92 | try: 93 | self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, 94 | passwd=self.password, db=self.db, charset='utf8') 95 | self.cursor = self.conn.cursor() 96 | return 0 97 | except Exception as e: 98 | print('[error] connect mysql error.' 
+ str(e)) 99 | return -1 100 | 101 | # query database table 102 | # 查询表 103 | def query_table(self, tablename): 104 | sql = "SELECT * FROM " + tablename 105 | try: 106 | self.cursor.execute(sql) 107 | row = self.cursor.fetchone() 108 | print(row) 109 | print(self.cursor.rowcount) 110 | except Exception as e: 111 | print("Failed to " + sql + str(e)) 112 | 113 | # query porn information table 114 | # 查询影片信息表 115 | def query_table_torrent(self): 116 | self.query_table(self.table_torrent) 117 | 118 | # query pictures table 119 | # 查询图片表 120 | def query_table_pictures(self): 121 | self.query_table(self.table_pictures) 122 | 123 | # insert into [table_torrent] and return the primary key of the item just inserted 124 | # 插入到 [table_torrent] 返回刚插入的项的主键 125 | def insert_table_torrent(self, data='', name='', summary='', magnet=''): 126 | sql = "INSERT INTO " + self.table_torrent + " (data, name, summary, magnet) VALUES ('" + data + "', '" + \ 127 | name + "', '" + summary + "', '" + magnet + "')" 128 | try: 129 | self.cursor.execute(sql) 130 | self.conn.commit() 131 | print("Successfully insert " + name + " into " + self.table_torrent) 132 | except Exception as e: 133 | print("Failed to " + sql + str(e)) 134 | try: 135 | an_id = -1 136 | an_id = self.cursor.lastrowid 137 | if an_id != -1: 138 | return an_id 139 | except Exception as e: 140 | print("Failed to return last_insert_id." + str(e)) 141 | 142 | # insert into [table_pictures] 143 | # 插入到 table_pictures 144 | def insert_table_pictures(self, an_id='', name=''): 145 | sql = "INSERT INTO " + self.table_pictures + " (an_id, name) VALUES ('" + str(an_id) + "', '" + name + "')" 146 | try: 147 | self.cursor.execute(sql) 148 | self.conn.commit() 149 | print("Successfully insert " + name + " into " + self.table_pictures) 150 | except Exception as e: 151 | print("Failed to " + sql + str(e)) 152 | 153 | # close database 154 | # 关闭数据库连接 155 | def close_mysql(self): 156 | try: 157 | self.cursor.close() 158 | self.conn.close() 159 | except Exception as e: 160 | print("Failed to close mysql." 
+ str(e)) 161 | 162 | 163 | # conversion encode 164 | # 转换编码 165 | def Encode_Conversion(req): 166 | if req.encoding == 'ISO-8859-1': 167 | encodings = requests.utils.get_encodings_from_content(req.text) 168 | if encodings: 169 | encoding = encodings[0] 170 | else: 171 | encoding = req.apparent_encoding 172 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 173 | encode_content = req.content.decode(encoding, 'replace') 174 | return encode_content 175 | else: 176 | return "" 177 | 178 | 179 | # save [content] to [path] 180 | # 保存文本 181 | def Save_Text(id, path, content): 182 | try: 183 | f = open(path, "w", encoding='utf-8') 184 | f.write(content) 185 | except IOError: 186 | print("[" + str(id) + "] IOError: File open failed.") 187 | except Exception as e: 188 | print("Save_Text Exception: " + str(e)) 189 | else: 190 | print("[" + str(id) + "] Successfully save the file to " + path) 191 | f.close() 192 | 193 | 194 | # torrent and magnet-link page 195 | # 种子/磁力链接页面 196 | def Prase_Torrent(id, url): 197 | try: 198 | if (isProxy == True): 199 | req = requests.get(url, params=torrent_request_header, proxies=proxies_header) 200 | else: 201 | req = requests.get(url, params=torrent_request_header) 202 | 203 | soup = BeautifulSoup(req.content, "html.parser") 204 | torrent_content = soup.select('.uk-button ') 205 | torrent_content_num = len(torrent_content) 206 | if torrent_content_num == 0: 207 | print("[" + str(id) + "] No match torrent.") 208 | return '' 209 | for content in torrent_content: 210 | str_content = str(content) 211 | # matching magnet-link 212 | # 匹配磁力链接 213 | matchObj = re.search(r'magnet(.*?)"', str_content) 214 | if matchObj: 215 | magnet_link = 'magnet' + matchObj.group(1) 216 | return magnet_link 217 | else: 218 | # matching magnet-link failed 219 | # 匹配磁力链接失败 220 | print("[" + str(id) + "] No match: " + str_content) 221 | return '' 222 | except Exception as e: 223 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 224 | 225 | 226 | # each post page 227 | # 每个帖子页面 228 | def Prase_Post(id, url, folder_name): 229 | try: 230 | # match data 231 | # 匹配日期 232 | data = '' 233 | matchObj = re.search(r'\[(.*?)\]', folder_name, re.M | re.I) 234 | if matchObj: 235 | data = matchObj.group(1) 236 | else: 237 | # match data failed 238 | # 匹配日期失败 239 | print("[" + str(id) + "] No match: " + folder_name) 240 | 241 | if (isProxy == True): 242 | req = requests.get(url, params=request_header, proxies=proxies_header) 243 | else: 244 | req = requests.get(url, params=request_header) 245 | 246 | encode_content = Encode_Conversion(req) 247 | soup = BeautifulSoup(encode_content, "html.parser") 248 | post_content = soup.select('div[id="read_tpc"]') 249 | post_content_num = len(post_content) 250 | if post_content_num == 0: 251 | print("[" + str(id) + "] No match post.") 252 | return 253 | 254 | # save text content 255 | # 保存文本内容 256 | summary = post_content[0].text 257 | str_content = str(post_content[0]) 258 | 259 | # match magnet-link page 260 | # 匹配磁力 261 | magnet_link = '' 262 | matchObj = re.findall(r'href="(.*?)"', str_content) 263 | if matchObj: 264 | for obj in matchObj: 265 | magnet_link = Prase_Torrent(id, obj) 266 | else: 267 | # match magnet-link page failed 268 | # 匹配磁力失败 269 | print("[" + str(id) + "] No match: " + str_content) 270 | 271 | # insert the [insert_table_torrent] table of the database 272 | # 插入到 [insert_table_torrent] 表 273 | an_id = -1 274 | if folder_name != '' and magnet_link != '': 275 | an_id = 
mySQLCommand.insert_table_torrent(data=data, name=folder_name, summary=summary, magnet=magnet_link) 276 | 277 | if an_id != -1: 278 | # create a folder to save the picture 279 | # 创建保存图片的文件夹 280 | folder_path = save_path + '/' + str(an_id) 281 | folder = os.path.exists(folder_path) 282 | if not folder: 283 | os.makedirs(folder_path) 284 | print("[" + str(id) + "] Created folder " + str(an_id)) 285 | 286 | # match pictures 287 | # 匹配图片 288 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 289 | if matchObj: 290 | for obj in matchObj: 291 | objTemp = obj 292 | strlist = objTemp.split('/') 293 | strlen = len(strlist) 294 | if strlen != 0: 295 | img_name = strlist[strlen - 1] 296 | try: 297 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 298 | except Exception as e: 299 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 300 | else: 301 | print("[" + str(id) + "] Successfully save the image to " + folder_path + '/' + img_name) 302 | # insert the [insert_table_pictures] table of the database 303 | # 插入 [insert_table_pictures] 表 304 | mySQLCommand.insert_table_pictures(an_id=an_id, name=img_name) 305 | else: 306 | # 匹配图片失败 307 | # match pictures failed 308 | print("[" + str(id) + "] No match: " + str_content) 309 | except Exception as e: 310 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 311 | 312 | 313 | # post list page 314 | # 帖子列表页面 315 | def Post_list(id, page): 316 | try: 317 | post_url = base_url + 'thread-htm-fid-' + str(fid) + '-page-' + str(page) + '.html' 318 | print('[' + str(id) + '] clicked: ' + post_url) 319 | 320 | if (isProxy == True): 321 | req = requests.get(post_url, params=request_header, proxies=proxies_header) 322 | else: 323 | req = requests.get(post_url, params=request_header) 324 | 325 | encode_content = Encode_Conversion(req) 326 | soup = BeautifulSoup(encode_content, "html.parser") 327 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 328 | post_num = len(post_list) 329 | if post_num == 0: 330 | print("[" + str(id) + "] No match post_list.") 331 | return 332 | for post in post_list: 333 | str_post = str(post) 334 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 335 | if matchObj: 336 | post_url = matchObj.group(2) 337 | post_name = matchObj.group(4) 338 | if post_name != '': 339 | # match each post page 340 | # 匹配每个帖子 341 | Prase_Post(id, base_url + post_url, 342 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 343 | u'').replace(u'*', 344 | u'')) 345 | else: 346 | # match failed 347 | # 匹配失败 348 | print("[" + str(id) + "] No match: " + str_post) 349 | except Exception as e: 350 | print("[" + str(id) + "] Post_list Exception." + str(e)) 351 | 352 | 353 | # multi-threaded, the parameter [id] is the thread id 354 | # 多线程,参数 [id] 为线程 id 355 | def Work_thread(id): 356 | try: 357 | if id <= page_end: 358 | prase_num = 0 359 | prase_more_one = 0 360 | page_num = abs(page_end - page_start) + 1 361 | if id <= int(page_num % thread_num): 362 | prase_more_one = 1 363 | page_num_each_thread = int(page_num / thread_num) + prase_more_one 364 | for each_page in range(page_start + id - 1, page_end + 1, thread_num): 365 | Post_list(id, each_page) 366 | prase_num += 1 367 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 368 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 369 | print('[' + str(id) + '] completed !!!!!') 370 | except Exception as e: 371 | print("[" + str(id) + "] Work_thread Exception." 
+ str(e)) 372 | 373 | 374 | if __name__ == "__main__": 375 | # database command object 376 | # 数据库命令对象 377 | mySQLCommand = MySQLCommand() 378 | if mySQLCommand.connect_mysql() != -1: 379 | # single thread # 单线程 380 | # Work_thread(1) 381 | # multithreading # 多线程 382 | try: 383 | for i in range(1, thread_num + 1): 384 | _thread.start_new_thread(Work_thread, (i,)) 385 | except Exception as e: 386 | print("Start_new_thread Exception: " + str(e)) 387 | while 1: 388 | pass 389 | mySQLCommand.close_mysql() 390 | -------------------------------------------------------------------------------- /TorrentSpider_AsianNomosaic_With_Json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import json 12 | 13 | 14 | config = object 15 | 16 | 17 | # Read configuration from [config.json] 18 | # 从 [config.json] 读取配置信息 19 | class JsonCommand(object): 20 | def __init__(self): 21 | try: 22 | with open('config.json', encoding='utf-8') as config_file: 23 | _config = json.loads(config_file.read()) 24 | self.request_header = _config["_1024_req_header"] 25 | self.torrent_request_header = _config["_torrent_req_header"] 26 | self.proxies = _config["proxies"] 27 | self.is_proxy = bool(_config["is_proxy"]) 28 | self.fid = int(_config["fid"]) 29 | self.base_url = _config["base_url"] 30 | self.save_path = _config["save_path"] 31 | self.page_start = int(_config["page_start"]) 32 | self.page_end = int(_config["page_end"]) 33 | self.thread_num = int(_config["thread_num"]) 34 | self.user_agent = _config["_1024_req_header"]["User-Agent"] 35 | config_file.close() 36 | except Exception as e: 37 | print("JsonCommand Exception: " + str(e)) 38 | 39 | 40 | # conversion encode 41 | # 转换编码 42 | def Encode_Conversion(req): 43 | if req.encoding == 'ISO-8859-1': 44 | encodings = requests.utils.get_encodings_from_content(req.text) 45 | if encodings: 46 | encoding = encodings[0] 47 | else: 48 | encoding = req.apparent_encoding 49 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 50 | encode_content = req.content.decode(encoding, 'replace') 51 | return encode_content 52 | else: 53 | return "" 54 | 55 | 56 | # save [content] to [path] 57 | # 保存文本 58 | def Save_Text(id, path, content): 59 | try: 60 | f = open(path, "w", encoding='utf-8') 61 | f.write(content) 62 | except IOError: 63 | print("[" + str(id) + "] IOError: File open failed.") 64 | except Exception as e: 65 | print("Save_Text Exception: " + str(e)) 66 | else: 67 | # 内容写入文件成功 68 | print("[" + str(id) + "] Successfully save the file to " + path) 69 | f.close() 70 | 71 | 72 | # torrent and magnet-link page 73 | # 种子/磁力链接页面 74 | def Prase_Torrent(id, url, folder_path): 75 | try: 76 | if (config.is_proxy == True): 77 | req = requests.get(url, params=config.torrent_request_header, proxies=config.proxies) 78 | else: 79 | req = requests.get(url, params=config.torrent_request_header) 80 | 81 | # soup转换 82 | soup = BeautifulSoup(req.content, "html.parser") 83 | 84 | torrent_content = soup.select('.uk-button ') 85 | torrent_content_num = len(torrent_content) 86 | if torrent_content_num == 0: 87 | print("[" + str(id) + "] No match torrent.") 88 | return '' 89 | for content in torrent_content: 90 | str_content = str(content) 91 | # 
匹配磁力链接 92 | matchObj = re.search(r'magnet(.*?)"', str_content) 93 | if matchObj: 94 | magnet_link = 'magnet' + matchObj.group(1) 95 | return magnet_link 96 | else: 97 | # 匹配失败 98 | print("[" + str(id) + "] No match: " + str_content) 99 | return '' 100 | except Exception as e: 101 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 102 | 103 | 104 | # each post page 105 | # 每个帖子页面 106 | def Prase_Post(id, url, folder_name): 107 | try: 108 | if (config.is_proxy == True): 109 | req = requests.get(url, params=config.request_header, proxies=config.proxies) 110 | else: 111 | req = requests.get(url, params=config.request_header) 112 | 113 | # 转换编码 114 | encode_content = Encode_Conversion(req) 115 | # soup转换 116 | soup = BeautifulSoup(encode_content, "html.parser") 117 | 118 | post_content = soup.select('div[id="read_tpc"]') 119 | post_content_num = len(post_content) 120 | if post_content_num == 0: 121 | print("[" + str(id) + "] No match post.") 122 | return 123 | 124 | # 创建保存的文件夹 125 | folder_path = config.save_path + '/' + folder_name 126 | folder = os.path.exists(folder_path) 127 | if not folder: 128 | os.makedirs(folder_path) 129 | print("[" + str(id) + "] Created folder " + folder_name) 130 | 131 | # 保存文本内容 132 | result = post_content[0].text 133 | magnet_link = '' 134 | for content in post_content: 135 | str_content = str(content) 136 | 137 | # 匹配种子 138 | matchObj = re.findall(r'href="(.*?)"', str_content) 139 | if matchObj: 140 | for obj in matchObj: 141 | magnet_link = Prase_Torrent(id, obj, folder_path) 142 | else: 143 | # 匹配失败 144 | print("[" + str(id) + "] No match: " + str_content) 145 | 146 | # 匹配图片 147 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 148 | if matchObj: 149 | for obj in matchObj: 150 | objTemp = obj 151 | strlist = objTemp.split('/') 152 | strlen = len(strlist) 153 | if strlen != 0: 154 | img_name = strlist[strlen - 1] 155 | try: 156 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 157 | except Exception as e: 158 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 159 | else: 160 | print("[" + str(id) + "] Successfully save the picture to " + folder_path + '/' + img_name) 161 | else: 162 | # 匹配失败 163 | print("[" + str(id) + "] No match: " + str_content) 164 | # 保存到文件 165 | if magnet_link != '': 166 | result = result + '\n\n' + magnet_link 167 | Save_Text(id, folder_path + '/index.txt', result) 168 | except Exception as e: 169 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 170 | 171 | 172 | # post list page 173 | # 帖子列表页面 174 | def Post_list(id, page): 175 | try: 176 | post_url = config.base_url + 'thread-htm-fid-' + str(config.fid) + '-page-' + str(page) + '.html' 177 | print('[' + str(id) + '] clicked: ' + post_url) 178 | 179 | if (config.is_proxy == True): 180 | req = requests.get(post_url, params=config.request_header, proxies=config.proxies) 181 | else: 182 | req = requests.get(post_url, params=config.request_header) 183 | 184 | # 转换编码 185 | encode_content = Encode_Conversion(req) 186 | 187 | # soup转换 188 | soup = BeautifulSoup(encode_content, "html.parser") 189 | # 获取章节名称 190 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 191 | post_num = len(post_list) 192 | if post_num == 0: 193 | print("[" + str(id) + "] No match post_list.") 194 | return 195 | for post in post_list: 196 | str_post = str(post) 197 | # html网页的匹配 198 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 199 | if matchObj: 200 | post_url = matchObj.group(2) # URL 201 | post_name = 
matchObj.group(4) # 文件夹名 202 | if post_name != '': 203 | # 匹配每个帖子 204 | Prase_Post(id, config.base_url + post_url, 205 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 206 | u'').replace(u'*', 207 | u'')) 208 | else: 209 | # 匹配失败 210 | print("[" + str(id) + "] No match: " + str_post) 211 | except Exception as e: 212 | print("[" + str(id) + "] Post_list Exception." + str(e)) 213 | 214 | 215 | # multi-threaded, the parameter [id] is the thread id 216 | # 多线程,参数 [id] 为线程 id 217 | def Work_thread(id): 218 | try: 219 | if id <= config.page_end: 220 | prase_num = 0 221 | prase_more_one = 0 222 | page_num = abs(config.page_end - config.page_start) + 1 223 | if id <= int(page_num % config.thread_num): 224 | prase_more_one = 1 225 | page_num_each_thread = int(page_num / config.thread_num) + prase_more_one 226 | for each_page in range(config.page_start + id - 1, config.page_end + 1, config.thread_num): 227 | Post_list(id, each_page) 228 | prase_num += 1 229 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 230 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 231 | print('[' + str(id) + '] completed !!!!!') 232 | except Exception as e: 233 | print("[" + str(id) + "] Work_thread Exception." + str(e)) 234 | 235 | 236 | if __name__ == "__main__": 237 | config = JsonCommand() 238 | opener = urllib.request.build_opener() 239 | opener.addheaders = [(config.user_agent)] 240 | urllib.request.install_opener(opener) 241 | # single thread # 单线程 242 | # Work_thread(1) 243 | # multithreading # 多线程 244 | try: 245 | for i in range(1, config.thread_num + 1): 246 | _thread.start_new_thread(Work_thread, (i,)) 247 | except Exception as e: 248 | print("Start_new_thread Exception: " + str(e)) 249 | while 1: 250 | pass 251 | -------------------------------------------------------------------------------- /TorrentSpider_EuropeAmerica_DB.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import time 12 | import pymysql 13 | 14 | # 1024 http request header 15 | # 1024 网站请求头 16 | proxt_1024_req_header = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 18 | 'Accept-Encoding': 'gzip, deflate', 19 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 20 | 'Cache-Control': 'max-age=0', 21 | # 'Connection': 'keep - alive', 22 | 'Cookie': '__cfduid=d8a8419777cdc090aeacad5676c478c181548136023; UM_distinctid=16874190914afb' \ 23 | '-02debbef036148-46564b55-1fa400-168741909152aa; CNZZDATA1261158850=1725766245-' \ 24 | '1548135728-%7C1548135728; aafaf_threadlog=%2C7%2C5%2C110%2C18%2C106%2C14%2C22%2C; ' \ 25 | 'aafaf_readlog=%2C2024971%2C; aafaf_lastpos=F22; aafaf_lastvisit=2122%091548138145%09' \ 26 | '%2Fpw%2Fthread.php%3Ffid-22-page-1.html; aafaf_ol_offset=32470944', 27 | 'Host': 'h3.cnmbtgf.info', 28 | # 'Pragma': 'no-cache', 29 | 'Proxy-Connection': 'keep-alive', 30 | 'Referer': 'http://h3.cnmbtgf.info/pw/thread-htm-fid-22-page-2.html', 31 | 'Upgrade-Insecure-Requests': '1', 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' \ 33 | ' Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 34 | } 35 | request_header = proxt_1024_req_header 36 | 37 | # magnet-link 
website http request header 38 | # 磁力链接网站网站请求头 39 | proxt_torrent_req_header = { 40 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 41 | 'Accept-Encoding': 'gzip, deflate', 42 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 43 | 'Cache-Control': 'no-cache', 44 | # 'Connection': 'keep - alive', 45 | 'Cookie': '__cfduid=d941de1b4432ad5277d394ccf9eef5a521548136720; UM_distinctid=1687423abf6414' \ 46 | '-0e3d3cc25c160b-46564b55-1fa400-1687423abf7b62; CNZZDATA1273152310=28791063-' \ 47 | '1548133540-http%253A%252F%252Fh3.cnmbtgf.info%252F%7C1548133540; _ga=GA1.2.18' \ 48 | '32968654.1548136721; _gid=GA1.2.1853650139.1548136721', 49 | 'Host': 'www1.downsx.net', 50 | 'Pragma': 'no-cache', 51 | 'Proxy-Connection': 'keep-alive', 52 | 'Referer': 'http://h3.cnmbtgf.info/pw/html_data/22/1901/3863610.html', 53 | 'Upgrade-Insecure-Requests': '1', 54 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 55 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 56 | } 57 | torrent_request_header = proxt_torrent_req_header 58 | opener=urllib.request.build_opener() 59 | opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' \ 60 | ' (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116')] 61 | urllib.request.install_opener(opener) 62 | 63 | # proxy settings 64 | # 代理设置 65 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } 66 | proxies_header = proxies 67 | isProxy = False # 是否设置代理 68 | 69 | base_url = "http://h3.cnmbtgf.info/pw/" # 基础url 70 | save_path = "D:/code/Pycharm/1024Spider/torrent_europe_america" # 存储图片路径 71 | fid = 7 # fid=7 表示欧美 72 | page_start = 1 # 爬取的开始页 73 | page_end = 434 # 爬取的结束页 74 | thread_num = 1 # 线程数 75 | mySQLCommand = object 76 | 77 | 78 | # Used to execute database commands 79 | # 用于执行数据库命令 80 | class MySQLCommand(object): 81 | # init # 类的初始化 82 | def __init__(self): 83 | self.host = '' # 主机,本地填 127.0.0.1 84 | self.port = 3306 # 数据端口号 85 | self.user = '' # 数据库用户名 86 | self.password = "" # 数据库密码 87 | self.db = "" # 数据库名 88 | self.table_torrent = "EuropeAmerica" # 欧美新片信息表 89 | self.table_pictures = "EuropeAmericaPictures" # 欧美新片图片表 90 | 91 | # connect to database 92 | # 连接数据库 93 | def connect_mysql(self): 94 | try: 95 | self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, 96 | passwd=self.password, db=self.db, charset='utf8') 97 | self.cursor = self.conn.cursor() 98 | return 0 99 | except Exception as e: 100 | print('[error] connect mysql error.' 
+ str(e)) 101 | return -1 102 | 103 | # query database table 104 | # 查询表 105 | def query_table(self, tablename): 106 | sql = "SELECT * FROM " + tablename 107 | try: 108 | self.cursor.execute(sql) 109 | row = self.cursor.fetchone() 110 | print(row) 111 | print(self.cursor.rowcount) 112 | except Exception as e: 113 | print("Failed to " + sql + str(e)) 114 | 115 | # query porn information table 116 | # 查询影片信息表 117 | def query_table_torrent(self): 118 | self.query_table(self.table_torrent) 119 | 120 | # query pictures table 121 | # 查询图片表 122 | def query_table_pictures(self): 123 | self.query_table(self.table_pictures) 124 | 125 | # insert into [table_torrent] and return the primary key of the item just inserted 126 | # 插入到 [table_torrent] 返回刚插入的项的主键 127 | def insert_table_torrent(self, data='', name='', summary='', magnet=''): 128 | sql = "INSERT INTO " + self.table_torrent + " (data, name, summary, magnet) VALUES ('" + data + "', '" + \ 129 | name + "', '" + summary + "', '" + magnet + "')" 130 | try: 131 | self.cursor.execute(sql) 132 | self.conn.commit() 133 | print("Successfully insert " + name + " into " + self.table_torrent) 134 | except Exception as e: 135 | print("Failed to " + sql + str(e)) 136 | try: 137 | an_id = -1 138 | an_id = self.cursor.lastrowid 139 | if an_id != -1: 140 | return an_id 141 | except Exception as e: 142 | print("Failed to return last_insert_id." + str(e)) 143 | 144 | # insert into [table_pictures] 145 | # 插入到 table_pictures 146 | def insert_table_pictures(self, an_id='', name=''): 147 | sql = "INSERT INTO " + self.table_pictures + " (an_id, name) VALUES ('" + str(an_id) + "', '" + name + "')" 148 | try: 149 | self.cursor.execute(sql) 150 | self.conn.commit() 151 | print("Successfully insert " + name + " into " + self.table_pictures) 152 | except Exception as e: 153 | print("Failed to " + sql + str(e)) 154 | 155 | # close database 156 | # 关闭数据库连接 157 | def close_mysql(self): 158 | try: 159 | self.cursor.close() 160 | self.conn.close() 161 | except Exception as e: 162 | print("Failed to close mysql." 
+ str(e)) 163 | 164 | 165 | # conversion encode 166 | # 转换编码 167 | def Encode_Conversion(req): 168 | if req.encoding == 'ISO-8859-1': 169 | encodings = requests.utils.get_encodings_from_content(req.text) 170 | if encodings: 171 | encoding = encodings[0] 172 | else: 173 | encoding = req.apparent_encoding 174 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 175 | encode_content = req.content.decode(encoding, 'replace') 176 | return encode_content 177 | else: 178 | return "" 179 | 180 | 181 | # save [content] to [path] 182 | # 保存文本 183 | def Save_Text(id, path, content): 184 | try: 185 | f = open(path, "w", encoding='utf-8') 186 | f.write(content) 187 | except IOError: 188 | print("[" + str(id) + "] IOError: File open failed.") 189 | except Exception as e: 190 | print("Save_Text Exception: " + str(e)) 191 | else: 192 | # 内容写入文件成功 193 | print("[" + str(id) + "] Successfully save the file to " + path) 194 | f.close() 195 | 196 | 197 | # torrent and magnet-link page 198 | # 种子/磁力链接页面 199 | def Prase_Torrent(id, url): 200 | try: 201 | if (isProxy == True): 202 | req = requests.get(url, params=torrent_request_header, proxies=proxies_header) 203 | else: 204 | req = requests.get(url, params=torrent_request_header) 205 | 206 | # soup转换 207 | soup = BeautifulSoup(req.content, "html.parser") 208 | 209 | torrent_content = soup.select('.uk-button ') 210 | torrent_content_num = len(torrent_content) 211 | if torrent_content_num == 0: 212 | print("[" + str(id) + "] No match torrent.") 213 | return '' 214 | for content in torrent_content: 215 | str_content = str(content) 216 | # 匹配磁力链接 217 | matchObj = re.search(r'magnet(.*?)"', str_content) 218 | if matchObj: 219 | magnet_link = 'magnet' + matchObj.group(1) 220 | return magnet_link 221 | else: 222 | # 匹配失败 223 | print("[" + str(id) + "] No match: " + str_content) 224 | return '' 225 | except Exception as e: 226 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 227 | 228 | 229 | # each post page 230 | # 每个帖子页面 231 | def Prase_Post(id, url, folder_name): 232 | try: 233 | # 匹配日期 234 | data = '' 235 | matchObj = re.search(r'\[(.*?)\]', folder_name, re.M | re.I) 236 | if matchObj: 237 | data = matchObj.group(1) # 文件夹名 238 | else: 239 | # 匹配失败 240 | print("[" + str(id) + "] No match: " + folder_name) 241 | 242 | if (isProxy == True): 243 | req = requests.get(url, params=request_header, proxies=proxies_header) 244 | else: 245 | req = requests.get(url, params=request_header) 246 | 247 | # 转换编码 248 | encode_content = Encode_Conversion(req) 249 | # soup转换 250 | soup = BeautifulSoup(encode_content, "html.parser") 251 | 252 | post_content = soup.select('div[id="read_tpc"]') 253 | post_content_num = len(post_content) 254 | if post_content_num == 0: 255 | print("[" + str(id) + "] No match post.") 256 | return 257 | 258 | # 保存文本内容 259 | summary = post_content[0].text 260 | str_content = str(post_content[0]) 261 | 262 | # 匹配种子 263 | magnet_link = '' 264 | matchObj = re.findall(r'href="(.*?)"', str_content) 265 | if matchObj: 266 | for obj in matchObj: 267 | magnet_link = Prase_Torrent(id, obj) 268 | else: 269 | # 匹配种子失败 270 | print("[" + str(id) + "] No match: " + str_content) 271 | 272 | # 插入到数据库:insert_table_torrent 表 273 | an_id = -1 274 | if folder_name != '' and magnet_link != '': 275 | an_id = mySQLCommand.insert_table_torrent(data=data, name=folder_name, summary=summary, magnet=magnet_link) 276 | 277 | if an_id != -1: 278 | # 创建保存图片的文件夹 279 | folder_path = save_path + '/' + str(an_id) 280 | folder = 
os.path.exists(folder_path) 281 | if not folder: 282 | os.makedirs(folder_path) 283 | print("[" + str(id) + "] Created folder " + str(an_id)) 284 | 285 | # 匹配图片 286 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 287 | if matchObj: 288 | for obj in matchObj: 289 | objTemp = obj 290 | strlist = objTemp.split('/') 291 | strlen = len(strlist) 292 | if strlen != 0: 293 | img_name = strlist[strlen - 1] 294 | try: 295 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 296 | except Exception as e: 297 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 298 | else: 299 | print("[" + str(id) + "] Successfully save the image to " + folder_path + '/' + img_name) 300 | # 插入数据库:insert_table_pictures 表 301 | mySQLCommand.insert_table_pictures(an_id=an_id, name=img_name) 302 | else: 303 | # 匹配失败 304 | print("[" + str(id) + "] No match: " + str_content) 305 | except Exception as e: 306 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 307 | 308 | 309 | # post list page 310 | # 帖子列表页面 311 | def Post_list(id, page): 312 | try: 313 | post_url = base_url + 'thread-htm-fid-' + str(fid) + '-page-' + str(page) + '.html' 314 | print('[' + str(id) + '] clicked: ' + post_url) 315 | 316 | if (isProxy == True): 317 | req = requests.get(post_url, params=request_header, proxies=proxies_header) 318 | else: 319 | req = requests.get(post_url, params=request_header) 320 | 321 | # 转换编码 322 | encode_content = Encode_Conversion(req) 323 | 324 | # soup转换 325 | soup = BeautifulSoup(encode_content, "html.parser") 326 | # 获取章节名称 327 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 328 | post_num = len(post_list) 329 | if post_num == 0: 330 | print("[" + str(id) + "] No match post_list.") 331 | return 332 | for post in post_list: 333 | str_post = str(post) 334 | # html网页的匹配 335 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 336 | if matchObj: 337 | post_url = matchObj.group(2) # URL 338 | post_name = matchObj.group(4) # 文件夹名 339 | if post_name != '': 340 | # 匹配每个帖子 341 | Prase_Post(id, base_url + post_url, 342 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 343 | u'').replace(u'*', 344 | u'')) 345 | else: 346 | # 匹配失败 347 | print("[" + str(id) + "] No match: " + str_post) 348 | except Exception as e: 349 | print("[" + str(id) + "] Post_list Exception." + str(e)) 350 | 351 | 352 | # multi-threaded, the parameter [id] is the thread id 353 | # 多线程,参数 [id] 为线程 id 354 | def Work_thread(id): 355 | try: 356 | if id <= page_end: 357 | prase_num = 0 358 | prase_more_one = 0 359 | page_num = abs(page_end - page_start) + 1 360 | if id <= int(page_num % thread_num): 361 | prase_more_one = 1 362 | page_num_each_thread = int(page_num / thread_num) + prase_more_one 363 | for each_page in range(page_start + id - 1, page_end + 1, thread_num): 364 | Post_list(id, each_page) 365 | prase_num += 1 366 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 367 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 368 | print('[' + str(id) + '] completed !!!!!') 369 | except Exception as e: 370 | print("[" + str(id) + "] Work_thread Exception." 
+ str(e)) 371 | 372 | 373 | if __name__ == "__main__": 374 | # database command object 375 | # 数据库命令对象 376 | mySQLCommand = MySQLCommand() 377 | if mySQLCommand.connect_mysql() != -1: 378 | # single thread # 单线程 379 | # Work_thread(1) 380 | # multithreading # 多线程 381 | try: 382 | for i in range(1, thread_num + 1): 383 | _thread.start_new_thread(Work_thread, (i,)) 384 | except Exception as e: 385 | print("Start_new_thread Exception: " + str(e)) 386 | while 1: 387 | pass 388 | mySQLCommand.close_mysql() 389 | -------------------------------------------------------------------------------- /TorrentSpider_JapaneseCavalry_DB.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import time 12 | import pymysql 13 | 14 | # 1024 http request header 15 | # 1024 网站请求头 16 | proxt_1024_req_header = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 18 | 'Accept-Encoding': 'gzip, deflate', 19 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 20 | 'Cache-Control': 'max-age=0', 21 | # 'Connection': 'keep - alive', 22 | 'Cookie': '__cfduid=d8a8419777cdc090aeacad5676c478c181548136023; UM_distinctid=16874190914afb' \ 23 | '-02debbef036148-46564b55-1fa400-168741909152aa; CNZZDATA1261158850=1725766245-' \ 24 | '1548135728-%7C1548135728; aafaf_threadlog=%2C7%2C5%2C110%2C18%2C106%2C14%2C22%2C; ' \ 25 | 'aafaf_readlog=%2C2024971%2C; aafaf_lastpos=F22; aafaf_lastvisit=2122%091548138145%09' \ 26 | '%2Fpw%2Fthread.php%3Ffid-22-page-1.html; aafaf_ol_offset=32470944', 27 | 'Host': 'h3.cnmbtgf.info', 28 | # 'Pragma': 'no-cache', 29 | 'Proxy-Connection': 'keep-alive', 30 | 'Referer': 'http://h3.cnmbtgf.info/pw/thread-htm-fid-22-page-2.html', 31 | 'Upgrade-Insecure-Requests': '1', 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' \ 33 | ' Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 34 | } 35 | request_header = proxt_1024_req_header 36 | 37 | # magnet-link website http request header 38 | # 磁力链接网站网站请求头 39 | proxt_torrent_req_header = { 40 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 41 | 'Accept-Encoding': 'gzip, deflate', 42 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 43 | 'Cache-Control': 'no-cache', 44 | # 'Connection': 'keep - alive', 45 | 'Cookie': '__cfduid=d941de1b4432ad5277d394ccf9eef5a521548136720; UM_distinctid=1687423abf6414' \ 46 | '-0e3d3cc25c160b-46564b55-1fa400-1687423abf7b62; CNZZDATA1273152310=28791063-' \ 47 | '1548133540-http%253A%252F%252Fh3.cnmbtgf.info%252F%7C1548133540; _ga=GA1.2.18' \ 48 | '32968654.1548136721; _gid=GA1.2.1853650139.1548136721', 49 | 'Host': 'www1.downsx.net', 50 | 'Pragma': 'no-cache', 51 | 'Proxy-Connection': 'keep-alive', 52 | 'Referer': 'http://h3.cnmbtgf.info/pw/html_data/22/1901/3863610.html', 53 | 'Upgrade-Insecure-Requests': '1', 54 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 55 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 56 | } 57 | torrent_request_header = proxt_torrent_req_header 58 | opener=urllib.request.build_opener() 59 | opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 
10.0; Win64; x64) AppleWebKit/537.36' \ 60 | ' (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116')] 61 | urllib.request.install_opener(opener) 62 | 63 | # proxy settings 64 | # 代理设置 65 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } 66 | proxies_header = proxies 67 | isProxy = False # 是否设置代理 68 | 69 | base_url = "http://h3.cnmbtgf.info/pw/" # 基础url 70 | save_path = "D:/code/Pycharm/1024Spider/torrent_japanese_cavalry" # 存储图片路径 71 | fid = 22 # fid=22 表示日本骑兵 72 | page_start = 1 # 爬取的开始页 73 | page_end = 1332 # 爬取的结束页 74 | thread_num = 1 # 线程数 75 | mySQLCommand = object 76 | 77 | 78 | # Used to execute database commands 79 | # 用于执行数据库命令 80 | class MySQLCommand(object): 81 | # init # 类的初始化 82 | def __init__(self): 83 | self.host = '' # 主机,本地填 127.0.0.1 84 | self.port = 3306 # 数据端口号 85 | self.user = '' # 数据库用户名 86 | self.password = "" # 数据库密码 87 | self.db = "" # 数据库名 88 | self.table_torrent = "JapaneseCavalry" # 日本骑兵信息表 89 | self.table_pictures = "JapaneseCavalryPictures" # 日本骑兵图片表 90 | 91 | # connect to database 92 | # 连接数据库 93 | def connect_mysql(self): 94 | try: 95 | self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, 96 | passwd=self.password, db=self.db, charset='utf8') 97 | self.cursor = self.conn.cursor() 98 | return 0 99 | except Exception as e: 100 | print('[error] connect mysql error.' + str(e)) 101 | return -1 102 | 103 | # query database table 104 | # 查询表 105 | def query_table(self, tablename): 106 | sql = "SELECT * FROM " + tablename 107 | try: 108 | self.cursor.execute(sql) 109 | row = self.cursor.fetchone() 110 | print(row) 111 | print(self.cursor.rowcount) 112 | except Exception as e: 113 | print("Failed to " + sql + str(e)) 114 | 115 | # query porn information table 116 | # 查询影片信息表 117 | def query_table_torrent(self): 118 | self.query_table(self.table_torrent) 119 | 120 | # query pictures table 121 | # 查询图片表 122 | def query_table_pictures(self): 123 | self.query_table(self.table_pictures) 124 | 125 | # insert into [table_torrent] and return the primary key of the item just inserted 126 | # 插入到 [table_torrent] 返回刚插入的项的主键 127 | def insert_table_torrent(self, data='', name='', summary='', magnet=''): 128 | sql = "INSERT INTO " + self.table_torrent + " (data, name, summary, magnet) VALUES ('" + data + "', '" + \ 129 | name + "', '" + summary + "', '" + magnet + "')" 130 | try: 131 | self.cursor.execute(sql) 132 | self.conn.commit() 133 | print("Successfully insert " + name + " into " + self.table_torrent) 134 | except Exception as e: 135 | print("Failed to " + sql + str(e)) 136 | try: 137 | an_id = -1 138 | an_id = self.cursor.lastrowid 139 | if an_id != -1: 140 | return an_id 141 | except Exception as e: 142 | print("Failed to return last_insert_id." + str(e)) 143 | 144 | # insert into [table_pictures] 145 | # 插入到 table_pictures 146 | def insert_table_pictures(self, an_id='', name=''): 147 | sql = "INSERT INTO " + self.table_pictures + " (an_id, name) VALUES ('" + str(an_id) + "', '" + name + "')" 148 | try: 149 | self.cursor.execute(sql) 150 | self.conn.commit() 151 | print("Successfully insert " + name + " into " + self.table_pictures) 152 | except Exception as e: 153 | print("Failed to " + sql + str(e)) 154 | 155 | # close database 156 | # 关闭数据库连接 157 | def close_mysql(self): 158 | try: 159 | self.cursor.close() 160 | self.conn.close() 161 | except Exception as e: 162 | print("Failed to close mysql." 
+ str(e)) 163 | 164 | 165 | # conversion encode 166 | # 转换编码 167 | def Encode_Conversion(req): 168 | if req.encoding == 'ISO-8859-1': 169 | encodings = requests.utils.get_encodings_from_content(req.text) 170 | if encodings: 171 | encoding = encodings[0] 172 | else: 173 | encoding = req.apparent_encoding 174 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 175 | encode_content = req.content.decode(encoding, 'replace') 176 | return encode_content 177 | else: 178 | return "" 179 | 180 | 181 | # save [content] to [path] 182 | # 保存文本 183 | def Save_Text(id, path, content): 184 | try: 185 | f = open(path, "w", encoding='utf-8') 186 | f.write(content) 187 | except IOError: 188 | print("[" + str(id) + "] IOError: File open failed.") 189 | except Exception as e: 190 | print("Save_Text Exception: " + str(e)) 191 | else: 192 | # 内容写入文件成功 193 | print("[" + str(id) + "] Successfully save the file to " + path) 194 | f.close() 195 | 196 | 197 | # torrent and magnet-link page 198 | # 种子/磁力链接页面 199 | def Prase_Torrent(id, url): 200 | try: 201 | if (isProxy == True): 202 | req = requests.get(url, params=torrent_request_header, proxies=proxies_header) 203 | else: 204 | req = requests.get(url, params=torrent_request_header) 205 | 206 | # soup转换 207 | soup = BeautifulSoup(req.content, "html.parser") 208 | 209 | torrent_content = soup.select('.uk-button ') 210 | torrent_content_num = len(torrent_content) 211 | if torrent_content_num == 0: 212 | print("[" + str(id) + "] No match torrent.") 213 | return '' 214 | for content in torrent_content: 215 | str_content = str(content) 216 | # 匹配磁力链接 217 | matchObj = re.search(r'magnet(.*?)"', str_content) 218 | if matchObj: 219 | magnet_link = 'magnet' + matchObj.group(1) 220 | return magnet_link 221 | else: 222 | # 匹配失败 223 | print("[" + str(id) + "] No match: " + str_content) 224 | return '' 225 | except Exception as e: 226 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 227 | 228 | 229 | # each post page 230 | # 每个帖子页面 231 | def Prase_Post(id, url, folder_name): 232 | try: 233 | # 匹配日期 234 | data = '' 235 | matchObj = re.search(r'\[(.*?)\]', folder_name, re.M | re.I) 236 | if matchObj: 237 | data = matchObj.group(1) # 文件夹名 238 | else: 239 | # 匹配失败 240 | print("[" + str(id) + "] No match: " + folder_name) 241 | 242 | if (isProxy == True): 243 | req = requests.get(url, params=request_header, proxies=proxies_header) 244 | else: 245 | req = requests.get(url, params=request_header) 246 | 247 | # 转换编码 248 | encode_content = Encode_Conversion(req) 249 | # soup转换 250 | soup = BeautifulSoup(encode_content, "html.parser") 251 | 252 | post_content = soup.select('div[id="read_tpc"]') 253 | post_content_num = len(post_content) 254 | if post_content_num == 0: 255 | print("[" + str(id) + "] No match post.") 256 | return 257 | 258 | # 保存文本内容 259 | summary = post_content[0].text 260 | str_content = str(post_content[0]) 261 | 262 | # 匹配种子 263 | magnet_link = '' 264 | matchObj = re.findall(r'href="(.*?)"', str_content) 265 | if matchObj: 266 | for obj in matchObj: 267 | magnet_link = Prase_Torrent(id, obj) 268 | else: 269 | # 匹配种子失败 270 | print("[" + str(id) + "] No match: " + str_content) 271 | 272 | # 插入到数据库:insert_table_torrent 表 273 | an_id = -1 274 | if folder_name != '' and magnet_link != '': 275 | an_id = mySQLCommand.insert_table_torrent(data=data, name=folder_name, summary=summary, magnet=magnet_link) 276 | 277 | if an_id != -1: 278 | # 创建保存图片的文件夹 279 | folder_path = save_path + '/' + str(an_id) 280 | folder = 
os.path.exists(folder_path) 281 | if not folder: 282 | os.makedirs(folder_path) 283 | print("[" + str(id) + "] Created folder " + str(an_id)) 284 | 285 | # 匹配图片 286 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 287 | if matchObj: 288 | for obj in matchObj: 289 | objTemp = obj 290 | strlist = objTemp.split('/') 291 | strlen = len(strlist) 292 | if strlen != 0: 293 | img_name = strlist[strlen - 1] 294 | try: 295 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 296 | except Exception as e: 297 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 298 | else: 299 | print("[" + str(id) + "] Successfully save the image to " + folder_path + '/' + img_name) 300 | # 插入数据库:insert_table_pictures 表 301 | mySQLCommand.insert_table_pictures(an_id=an_id, name=img_name) 302 | else: 303 | # 匹配失败 304 | print("[" + str(id) + "] No match: " + str_content) 305 | except Exception as e: 306 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 307 | 308 | 309 | # post list page 310 | # 帖子列表页面 311 | def Post_list(id, page): 312 | try: 313 | post_url = base_url + 'thread-htm-fid-' + str(fid) + '-page-' + str(page) + '.html' 314 | print('[' + str(id) + '] clicked: ' + post_url) 315 | 316 | if (isProxy == True): 317 | req = requests.get(post_url, params=request_header, proxies=proxies_header) 318 | else: 319 | req = requests.get(post_url, params=request_header) 320 | 321 | # 转换编码 322 | encode_content = Encode_Conversion(req) 323 | 324 | # soup转换 325 | soup = BeautifulSoup(encode_content, "html.parser") 326 | # 获取章节名称 327 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 328 | post_num = len(post_list) 329 | if post_num == 0: 330 | print("[" + str(id) + "] No match post_list.") 331 | return 332 | for post in post_list: 333 | str_post = str(post) 334 | # html网页的匹配 335 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 336 | if matchObj: 337 | post_url = matchObj.group(2) # URL 338 | post_name = matchObj.group(4) # 文件夹名 339 | if post_name != '': 340 | # 匹配每个帖子 341 | Prase_Post(id, base_url + post_url, 342 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 343 | u'').replace(u'*', 344 | u'')) 345 | else: 346 | # 匹配失败 347 | print("[" + str(id) + "] No match: " + str_post) 348 | except Exception as e: 349 | print("[" + str(id) + "] Post_list Exception." + str(e)) 350 | 351 | 352 | # multi-threaded, the parameter [id] is the thread id 353 | # 多线程,参数 [id] 为线程 id 354 | def Work_thread(id): 355 | try: 356 | if id <= page_end: 357 | prase_num = 0 358 | prase_more_one = 0 359 | page_num = abs(page_end - page_start) + 1 360 | if id <= int(page_num % thread_num): 361 | prase_more_one = 1 362 | page_num_each_thread = int(page_num / thread_num) + prase_more_one 363 | for each_page in range(page_start + id - 1, page_end + 1, thread_num): 364 | Post_list(id, each_page) 365 | prase_num += 1 366 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 367 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 368 | print('[' + str(id) + '] completed !!!!!') 369 | except Exception as e: 370 | print("[" + str(id) + "] Work_thread Exception." 
+ str(e)) 371 | 372 | 373 | if __name__ == "__main__": 374 | # database command object 375 | # 数据库命令对象 376 | mySQLCommand = MySQLCommand() 377 | if mySQLCommand.connect_mysql() != -1: 378 | # single thread # 单线程 379 | # Work_thread(1) 380 | # multithreading # 多线程 381 | try: 382 | for i in range(1, thread_num + 1): 383 | _thread.start_new_thread(Work_thread, (i,)) 384 | except Exception as e: 385 | print("Start_new_thread Exception: " + str(e)) 386 | while 1: 387 | pass 388 | mySQLCommand.close_mysql() 389 | -------------------------------------------------------------------------------- /TorrentSpider_LatestCollection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*-coding:utf-8 -*- 3 | import _thread 4 | import re 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlencode, parse_qs, urlsplit, urlunsplit 8 | from urllib.request import urlretrieve 9 | import urllib.request 10 | import os 11 | import time 12 | 13 | 14 | # 1024 http request header 15 | # 1024 网站请求头 16 | proxt_1024_req_header = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 18 | 'Accept-Encoding': 'gzip, deflate', 19 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 20 | 'Cache-Control': 'no-cache', 21 | # 'Connection': 'keep - alive', 22 | 'Cookie': '__cfduid=d7e5c699ef4d6599ef01239424b0e6cd71547705542; aafaf_lastvisit=0%09154' \ 23 | '7705542%09%2Fpw%2Findex.php%3F; UM_distinctid=1685a707030539-0653970bbabd2b-46564b55' \ 24 | '-1fa400-1685a707031a0a; CNZZDATA1261158850=317005769-1547705297-%7C1547705297', 25 | 'Host': 'w3.jbzcjsj.pw', 26 | 'Pragma': 'no-cache', 27 | 'Proxy-Connection': 'keep-alive', 28 | #'Referer': 'http://w3.afulyu.pw/pw/thread.php?fid=17&page=1', 29 | 'Upgrade-Insecure-Requests': '1', 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 31 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 32 | } 33 | request_header = proxt_1024_req_header 34 | 35 | # magnet-link website http request header 36 | # 磁力链接网站网站请求头 37 | proxt_torrent_req_header = { 38 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 39 | 'Accept-Encoding': 'gzip, deflate', 40 | 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 41 | 'Cache-Control': 'no-cache', 42 | # 'Connection': 'keep - alive', 43 | 'Cookie': '__cfduid=d062c450fc125c2a02de05db8586dc1941547731587; UM_distinctid=1685bfdd4' \ 44 | 'd4854-0edeecf536f3fc-46564b55-1fa400-1685bfdd4d515b4; CNZZDATA1273152310=651528679' \ 45 | '-1547731013-http%253A%252F%252Fw3.jbzcjsj.pw%252F%7C1547731013; _ga=GA1.2.845482462.' 
\ 46 | '1547731588; _gid=GA1.2.2026642011.1547731588', 47 | 'Host': 'www1.downsx.club', 48 | 'Pragma': 'no-cache', 49 | 'Proxy-Connection': 'keep-alive', 50 | 'Referer': 'http://w3.jbzcjsj.pw/pw/html_data/3/1901/3855151.html', 51 | 'Upgrade-Insecure-Requests': '1', 52 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ 53 | '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116' 54 | } 55 | torrent_request_header = proxt_torrent_req_header 56 | opener=urllib.request.build_opener() 57 | opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' \ 58 | ' (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116')] 59 | urllib.request.install_opener(opener) 60 | 61 | # proxy settings 62 | # 代理设置 63 | proxies = {'http': '127.0.0.1:1080', "https": "127.0.0.1:1080", } 64 | proxies_header = proxies 65 | isProxy = False # 是否设置代理 66 | 67 | base_url = "http://w3.jbzcjsj.pw/pw/" # 基础url 68 | save_path = "D:/code/Pycharm/1024Spider/torrent" # 存储图片路径 69 | fid = 3 # fid=3 表示最新合集 70 | page_start = 1 # 爬取的开始页 71 | page_end = 245 # 爬取的结束页 72 | thread_num = 1 # 线程数 73 | 74 | 75 | # conversion encode 76 | # 转换编码 77 | def Encode_Conversion(req): 78 | if req.encoding == 'ISO-8859-1': 79 | encodings = requests.utils.get_encodings_from_content(req.text) 80 | if encodings: 81 | encoding = encodings[0] 82 | else: 83 | encoding = req.apparent_encoding 84 | 85 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 86 | encode_content = req.content.decode(encoding, 'replace') # 如果设置为replace,则会用?取代非法字符; 87 | return encode_content 88 | else: 89 | return "" 90 | 91 | 92 | # save [content] to [path] 93 | # 保存文本 94 | def Save_Text(id, path, content): 95 | try: 96 | f = open(path, "w", encoding='utf-8') 97 | f.write(content) 98 | except IOError: 99 | print("[" + str(id) + "] IOError: File open failed.") 100 | except Exception as e: 101 | print("Save_Text Exception: " + str(e)) 102 | else: 103 | # 内容写入文件成功 104 | print("[" + str(id) + "] Successfully save the file to " + path) 105 | f.close() 106 | 107 | 108 | # torrent and magnet-link page 109 | # 种子/磁力链接页面 110 | def Prase_Torrent(id, url, folder_path): 111 | try: 112 | if (isProxy == True): 113 | req = requests.get(url, params=torrent_request_header, proxies=proxies_header) 114 | else: 115 | req = requests.get(url, params=torrent_request_header) 116 | 117 | # soup转换 118 | soup = BeautifulSoup(req.content, "html.parser") 119 | 120 | torrent_content = soup.select('.uk-button ') 121 | torrent_content_num = len(torrent_content) 122 | if torrent_content_num == 0: 123 | print("[" + str(id) + "] No match torrent.") 124 | return 125 | for content in torrent_content: 126 | str_content = str(content) 127 | # 匹配磁力链接 128 | matchObj = re.search(r'magnet(.*?)"', str_content) 129 | if matchObj: 130 | magnet_link = 'magnet' + matchObj.group(1) 131 | urlTemp = url 132 | strlist = urlTemp.split('/') 133 | strlen = len(strlist) 134 | if strlen != 0: 135 | torrent_name = strlist[strlen - 1] 136 | if torrent_name != "": 137 | savePath = folder_path + "/" + str(torrent_name).replace(u'\0', u'').replace(u'\t', 138 | u'') + ".txt" 139 | Save_Text(id, savePath, magnet_link) 140 | else: 141 | # 匹配失败 142 | print("[" + str(id) + "] No match: " + str_content) 143 | except Exception as e: 144 | print("[" + str(id) + "] Prase_Torrent Exception: " + str(e)) 145 | 146 | 147 | # each post page 148 | # 每个帖子页面 149 | def Prase_Post(id, url, folder_name): 150 | try: 151 | if 
(isProxy == True): 152 | req = requests.get(url, params=request_header, proxies=proxies_header) 153 | else: 154 | req = requests.get(url, params=request_header) 155 | 156 | # 转换编码 157 | encode_content = Encode_Conversion(req) 158 | # soup转换 159 | soup = BeautifulSoup(encode_content, "html.parser") 160 | 161 | post_content = soup.select('div[id="read_tpc"]') 162 | post_content_num = len(post_content) 163 | if post_content_num == 0: 164 | print("[" + str(id) + "] No match post.") 165 | return 166 | 167 | # 创建保存的文件夹 168 | folder_path = save_path + '/' + folder_name 169 | folder = os.path.exists(folder_path) 170 | if not folder: 171 | os.makedirs(folder_path) 172 | print("[" + str(id) + "] Created folder " + folder_name) 173 | 174 | # 保存文本内容 175 | result = post_content[0].text 176 | Save_Text(id, folder_path + '/index.txt', result) 177 | for content in post_content: 178 | str_content = str(content) 179 | 180 | # 匹配种子 181 | matchObj = re.findall(r'href="(.*?)"', str_content) 182 | if matchObj: 183 | for obj in matchObj: 184 | Prase_Torrent(id, obj, folder_path) 185 | else: 186 | # 匹配失败 187 | print("[" + str(id) + "] No match: " + str_content) 188 | 189 | # 匹配图片 190 | matchObj = re.findall(r'window.open\(\'(.*?)\'\);', str_content) 191 | if matchObj: 192 | for obj in matchObj: 193 | objTemp = obj 194 | strlist = objTemp.split('/') 195 | strlen = len(strlist) 196 | if strlen != 0: 197 | img_name = strlist[strlen - 1] 198 | try: 199 | urllib.request.urlretrieve(obj, folder_path + '/' + img_name) 200 | except Exception as e: 201 | print("[" + str(id) + "] Download the picture Exception: " + str(e)) 202 | else: 203 | print("[" + str(id) + "] Successfully save the image to " + folder_path + '/' + img_name) 204 | else: 205 | # 匹配失败 206 | print("[" + str(id) + "] No match: " + str_content) 207 | except Exception as e: 208 | print("[" + str(id) + "] Prase_Post Exception: " + str(e)) 209 | 210 | 211 | # post list page 212 | # 帖子列表页面 213 | def Post_list(id, page): 214 | try: 215 | post_url = base_url + 'thread-htm-fid-' + str(fid) + '-page-' + str(page) + '.html' 216 | print('[' + str(id) + '] clicked: ' + post_url) 217 | 218 | if (isProxy == True): 219 | req = requests.get(post_url, params=request_header, proxies=proxies_header) 220 | else: 221 | req = requests.get(post_url, params=request_header) 222 | 223 | # 转换编码 224 | encode_content = Encode_Conversion(req) 225 | 226 | # soup转换 227 | soup = BeautifulSoup(encode_content, "html.parser") 228 | # 获取帖子名称 229 | post_list = soup.select('tr[class="tr3 t_one"] h3 a') 230 | post_num = len(post_list) 231 | if post_num == 0: 232 | print("[" + str(id) + "] No match post_list.") 233 | return 234 | for post in post_list: 235 | str_post = str(post) 236 | # 帖子网页的匹配 237 | matchObj = re.match(r'(.*)href="(.*)" id=(.*)>(.*?)', str_post, re.M | re.I) 238 | if matchObj: 239 | post_url = matchObj.group(2) # URL 240 | post_name = matchObj.group(4) # 文件夹名 241 | if post_name != '': 242 | # 匹配每个帖子 243 | Prase_Post(id, base_url + post_url, 244 | post_name.replace(u'\0', u'').replace(u'/', u'.').replace(u'?', 245 | u'').replace(u'*', 246 | u'')) 247 | else: 248 | # 匹配失败 249 | print("[" + str(id) + "] No match: " + str_post) 250 | except Exception as e: 251 | print("[" + str(id) + "] Post_list Exception." 
+ str(e)) 252 | 253 | 254 | # multi-threaded, the parameter [id] is the thread id 255 | # 多线程,参数 [id] 为线程 id 256 | def Work_thread(id): 257 | try: 258 | if id <= page_end: 259 | prase_num = 0 260 | prase_more_one = 0 261 | page_num = abs(page_end - page_start) + 1 262 | if id <= int(page_num % thread_num): 263 | prase_more_one = 1 264 | page_num_each_thread = int(page_num / thread_num) + prase_more_one 265 | for each_page in range(page_start + id - 1, page_end + 1, thread_num): 266 | Post_list(id, each_page) 267 | prase_num += 1 268 | print('[' + str(id) + '] [ ' + "{:.1f}".format( 269 | prase_num / page_num_each_thread * 100) + '% page completed ] ') 270 | print('[' + str(id) + '] completed !!!!!') 271 | except Exception as e: 272 | print("[" + str(id) + "] Work_thread Exception." + str(e)) 273 | 274 | 275 | if __name__ == "__main__": 276 | # single thread # 单线程 277 | # Work_thread(1) 278 | # multithreading # 多线程 279 | try: 280 | for i in range(1, thread_num + 1): 281 | _thread.start_new_thread(Work_thread, (i,)) 282 | except Exception as e: 283 | print("Start_new_thread Exception: " + str(e)) 284 | while 1: 285 | pass 286 | -------------------------------------------------------------------------------- /config_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "db": { 3 | "host": "127.0.0.1", 4 | "port": "3306", 5 | "user": "", 6 | "password": "", 7 | "db": "", 8 | "table_AsianNomosaic": "AsianNomosaic", 9 | "table_AsianNomosaicPictures": "AsianNomosaicPictures" 10 | }, 11 | "_1024_req_header": { 12 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 13 | "Accept-Encoding": "gzip, deflate", 14 | "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7", 15 | "Cache-Control": "no-cache", 16 | "Cookie": "__cfduid=d7e5c699ef4d6599ef01239424b0e6cd71547705542; aafaf_lastvisit=0%091547705542%09%2Fpw%2Findex.php%3F; UM_distinctid=1685a707030539-0653970bbabd2b-46564b55-1fa400-1685a707031a0a; CNZZDATA1261158850=317005769-1547705297-%7C1547705297", 17 | "Host": "w3.jbzcjsj.pw", 18 | "Pragma": "no-cache", 19 | "Proxy-Connection": "keep-alive", 20 | "Upgrade-Insecure-Requests": "1", 21 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116" 22 | }, 23 | "_torrent_req_header": { 24 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 25 | "Accept-Encoding": "gzip, deflate", 26 | "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7", 27 | "Cache-Control": "no-cache", 28 | "Cookie": "__cfduid=d062c450fc125c2a02de05db8586dc1941547731587; UM_distinctid=1685bfdd4d4854-0edeecf536f3fc-46564b55-1fa400-1685bfdd4d515b4; CNZZDATA1273152310=651528679-1547731013-http%253A%252F%252Fw3.jbzcjsj.pw%252F%7C1547731013; _ga=GA1.2.845482462.1547731588; _gid=GA1.2.2026642011.1547731588", 29 | "Host": "w3.jbzcjsj.pw", 30 | "Pragma": "no-cache", 31 | "Proxy-Connection": "keep-alive", 32 | "Referer": "http://w3.jbzcjsj.pw/pw/html_data/3/1901/3855151.html", 33 | "Upgrade-Insecure-Requests": "1", 34 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 OPR/57.0.3098.116" 35 | }, 36 | "proxies": { 37 | "http": "127.0.0.1:1080", 38 | "https": "127.0.0.1:1080" 39 | }, 40 | "is_proxy": "False", 41 | "fid": "5", 42 | "base_url": "http://w3.jbzcjsj.pw/pw/", 43 | "save_path": 
"D:/code/Pycharm/1024Spider/torrent_asian_nomosaic", 44 | "page_start": "1", 45 | "page_end": "913", 46 | "thread_num": "1" 47 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | html5lib 3 | lxml 4 | requests 5 | urllib3 6 | pymysql --------------------------------------------------------------------------------