├── run.bat
├── 视频转迅雷.bat
├── __pycache__
    ├── common.cpython-36.pyc
    ├── parse_list.cpython-36.pyc
    ├── parse_src.cpython-36.pyc
    └── redisutil.cpython-36.pyc
├── run.py
├── src2file.py
├── redisutil.py
├── README.md
├── common.py
├── parse_list.py
├── parse_src.py
└── download.py


/run.bat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rollingkeyboard/91/HEAD/run.bat


--------------------------------------------------------------------------------
/视频转迅雷.bat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rollingkeyboard/91/HEAD/视频转迅雷.bat


--------------------------------------------------------------------------------
/__pycache__/common.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rollingkeyboard/91/HEAD/__pycache__/common.cpython-36.pyc


--------------------------------------------------------------------------------
/__pycache__/parse_list.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rollingkeyboard/91/HEAD/__pycache__/parse_list.cpython-36.pyc


--------------------------------------------------------------------------------
/__pycache__/parse_src.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rollingkeyboard/91/HEAD/__pycache__/parse_src.cpython-36.pyc


--------------------------------------------------------------------------------
/__pycache__/redisutil.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rollingkeyboard/91/HEAD/__pycache__/redisutil.cpython-36.pyc


--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import parse_list, parse_src, time
 3 | 
# Stage 1: collect video page URLs into Redis (parse_list.start() prompts
# for the number of list pages to crawl).
print("即将启动解析列表程序")
parse_list.start()

# Stage 2: resolve each stored page URL to its video source URL.
# The original note said "start after sleeping 5 minutes"; the pause below
# is disabled, so stage 2 begins as soon as stage 1 returns.
print("即将启动解析视频程序")
#time.sleep(2)
parse_src.start()


--------------------------------------------------------------------------------
/src2file.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import redis, math, common, time
 3 | 
 4 | # 将每个视频的url写入文件，然后用迅雷拖吧
 5 | c = redis.StrictRedis("localhost", 6379)
 6 | lst = c.lrange("91_src", 0, -1)
 7 | 
 8 | total = len(lst)
 9 | count = math.floor(total / 1000) + 1 # 比如 3005个，需要4个文件，每个文件1000个，最后一个文件5个
10 | 
11 | for i in range(1, int(count + 1)):
12 | 	s = "\n\n" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())+ "\n\n"
13 | 	for a in lst[(i - 1) * 1000 : i * 1000]:
14 | 		src = a.decode("utf-8")
15 | 		if src != "None":
16 | 			s += src + "\n"
17 | 			c.lrem(common.KEY_SRC, 1, src)
18 | 			# print("remove from redis ", src)
19 | 
20 | 	with open(common.TORRENT + "/" + str(i) + ".txt", 'a') as f:
21 | 		f.write(s)
22 | 	print("writing file ", i)


--------------------------------------------------------------------------------
/redisutil.py:
--------------------------------------------------------------------------------
 1 | import redis
 2 | 
 3 | def connect():
 4 |     r = redis.StrictRedis(host="localhost", port=6379, db=0, decode_responses=True)
 5 |     return r
 6 | 
 7 | def setredis(url, key):
 8 |     r = connect()
 9 |     if not r.sismember(key, url):
10 |         r.sadd(key, url)
11 | 
def exists(url, key):
    """Return True if ``url`` equals an element of the Redis list at ``key``.

    The whole list is fetched with LRANGE 0 -1 and searched client-side.
    """
    members = connect().lrange(key, 0, -1)
    return url in members
24 | 
def add(url, key):
    """Append ``url`` to the Redis list at ``key`` unless it is already there."""
    if exists(url, key):
        return
    client = connect()
    client.rpush(key, url)
29 | 
def remove(url, key):
    """Delete every occurrence of ``url`` from the Redis list at ``key``.

    Nothing is sent to LREM when ``exists`` reports the value is absent.
    """
    if not exists(url, key):
        return
    client = connect()
    client.lrem(key, 0, url)  # count 0 -> remove all matching elements
34 | 
def total(key):
    """Return the length (LLEN) of the Redis list stored at ``key``."""
    return connect().llen(key)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 91
 2 | [91 porn](http://91.91p17.space/) 是一个知名的自拍视频网站
- 本程序获取所有视频url
 4 | - 然后将视频的下载地址保存到文件，每个文件1000个，可以直接拖到迅雷里下载，一晚上可以下载<label style="color:red"> 300~400GB </label> 没有任何问题
 5 | - 可以输入指定的页码进行抓取
 6 | 
 7 | # 安装
 8 | - [python3](http://www.python.org)
- [redis](https://redis.io)
10 | 
11 | On Windows
12 | 
    redis-server.exe redis.windows.conf
14 | 
15 | Or on Unix-like OS
16 | 
17 |     $ redis-server redis.conf 
18 | 
19 | 安装 requests, 快速获取html
20 | 
21 |     $ pip install requests    
22 | 
23 | 
24 | # 启动
25 | 
26 |     python run.py    
27 | 
28 | 或者
29 | 
30 | ### 双击  <label style="color:red">**run.bat**</label>运行
31 | 
32 | ## 说明
33 | - parse_list.py 将所有视频url下载下来
34 | - parse_src.py 将所有视频的source下载下来，source复制到浏览器可以直接观看
35 | - common.py 通用的文件，保存一些常量
36 | - run.py, run.bat 运行脚本
37 | - src2file.py 将所有视频source复制到文本中，将文本的内容 `Ctrl` + `C`, 然后`Ctrl` + `V` 复制到迅雷里疯狂的下载吧
38 | 
39 | # 忠告
40 |   爱惜自己
41 | 
42 | # 免责声明
43 |   本程序仅做学习交流之使用，如有其它用途并产生其它后果，本人概不负责。
44 | 


--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import requests, re, redisutil, time, random, threading
 3 | from urllib3.util.retry import Retry
 4 | from requests.adapters import HTTPAdapter
 5 | 
 6 | 
 7 | cookies = requests.cookies.RequestsCookieJar()
 8 | cookies.set("language", "cn_CN", domain=".91.91p17.space", path="/")
 9 | 
10 | #--------------------------------------
11 | # 91 的临时站点，可以随时更换
12 | URL = "http://91.91p17.space/"
13 | KEY = "91"
14 | KEY_SRC = "91_src" # 每个视频源url对于的redis key
15 | KEY_NONE = "91_none"
16 | LOG = "f:/log/visit.log"
17 | TORRENT = "f:/sed/"
18 | PARSE_LOG = "f:/log/parse.log"
19 | #----------------------------------------
20 | import os
21 | path = "/".join(LOG.split("/")[0:-1])
22 | 
23 | if not os.path.exists(TORRENT):
24 | 	os.makedirs(TORRENT)
25 | 
26 | if not os.path.exists(path):
27 |     os.makedirs(path)
28 | 
29 | 
def getNumber():
    """Prompt for the total number of pages to crawl and return it as an int.

    The prompt is repeated until the entered text can be converted with
    ``int()``. Only ``ValueError`` from that conversion is treated as bad
    input; any other exception propagates to the caller.

    Returns:
        int: the number the user typed.
    """
    while True:
        num = input("请输入你想抓取的总页数:")
        try:
            return int(num)
        except ValueError:
            print("抱歉，您输入的不是有效的数字, 请重新输入.")
44 | 
def getTime():
    """Prompt for a duration in minutes and return it as an int.

    The prompt is repeated until the entered text can be converted with
    ``int()``. Only ``ValueError`` from that conversion is treated as bad
    input; any other exception propagates to the caller.

    Returns:
        int: the number of minutes the user typed.
    """
    while True:
        num = input("请输入想获取的时长(分钟):")
        try:
            return int(num)
        except ValueError:
            print("抱歉，您输入的不是有效的数字, 请重新输入.")
59 | 
def visit(url, timeout=30):
    """Fetch ``url`` and return the response body as text.

    Each call sends a fixed Firefox User-Agent and a freshly generated
    random dotted-quad in ``X-Forwarded-For``, plus the module-level
    ``cookies`` jar. Responses with status 500/502/503/504 are retried up
    to 5 times with backoff.

    Args:
        url: address to request.
        timeout: seconds passed to requests as the timeout; without one a
            stalled server would block the calling thread forever.

    Returns:
        str: the decoded response body.

    Raises:
        requests.exceptions.RequestException: on connection failures,
            timeouts or exhausted retries.
    """
    randomIP = ".".join(str(random.randint(0, 255)) for _ in range(4))
    retries = Retry(total=5, backoff_factor=10, status_forcelist=[500, 502, 503, 504])
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'X-Forwarded-For': randomIP}
    # The context manager closes the session (and its pooled connections)
    # once the body has been read, instead of leaking one per call.
    with requests.Session() as s:
        adapter = HTTPAdapter(max_retries=retries)
        s.mount('http://', adapter)
        s.mount('https://', adapter)  # same retry policy if URL moves to https
        return s.get(url, headers=headers, cookies=cookies, timeout=timeout).text


--------------------------------------------------------------------------------
/parse_list.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import requests, re, redisutil, time, random, threading
 3 | from urllib3.util.retry import Retry
 4 | from requests.adapters import HTTPAdapter
 5 | import common
 6 | 
 7 | # 将列表页插入redis
 8 | def parseList(url):
 9 |     lst = re.compile(r'http:\/\/91\.91p17\.space\/view_video\.php\?viewkey\=\w+').findall(common.visit(url))
10 |     for a in set(lst):
11 |         if not redisutil.exists(a, common.KEY):
12 |             redisutil.add(a, common.KEY)
13 |             print(threading.current_thread().name, " insert into redis ", a)
14 |         else:
15 |             print(threading.current_thread().name, " redis 已经存在，不再访问 ", a)
16 | 
def enter(**kwargs):
    """Thread entry point: crawl list pages ``start`` up to (not including) ``end``.

    Keyword Args:
        start: first page number to crawl.
        end: page number to stop before.

    A page that fails for any reason is reported, its URL is stored in the
    ``91_error`` Redis list, and crawling continues with the next page.
    When the range is done a completion line is appended to ``common.LOG``.
    """
    worker = threading.current_thread().name
    # range() only accepts integers; coerce in case the caller computed the
    # bounds with true division.
    start = int(kwargs["start"])
    end = int(kwargs["end"])
    # common.URL may end with "/"; strip it so the path has a single slash.
    base = common.URL.rstrip("/")
    for page in range(start, end):
        url = base + "/v.php?next=watch&page=" + str(page)
        try:
            print(worker, " 解析 ", page, " 页 ", url)
            parseList(url)
            time.sleep(random.randint(1, 3))
        except Exception as err:
            # Network errors from requests are not RuntimeError subclasses,
            # so catch broadly at this per-page boundary and keep going.
            print(worker, " visiting page ", page, " occurs some errors ", repr(err))
            redisutil.add(url, "91_error")
    # This thread has finished; record it so completion is easy to spot.
    with open(common.LOG, "a", encoding="utf-8") as f:
        f.write("线程" + worker + " 已经完成抓取 \n")
36 | 
# Entry point of the list crawler.
def start():
    """Ask how many list pages to crawl and crawl them with up to 5 threads.

    Pages 1..total are divided into contiguous integer ranges, one per
    thread. When total is not a multiple of the thread count, the first
    ``total % threads`` threads take one extra page so no page is dropped.
    With 5 pages or fewer, one thread is started per page. Blocks until
    every thread has finished.
    """
    total = common.getNumber()
    # Default is 5 threads; with fewer pages, use one thread per page.
    thread_total = min(5, total)

    thread_list = []
    if thread_total > 0:
        base, extra = divmod(total, thread_total)
        first_page = 1
        for i in range(1, thread_total + 1):
            size = base + (1 if i <= extra else 0)
            # enter() treats "end" as exclusive.
            bounds = {"start": first_page, "end": first_page + size}
            thread_list.append(
                threading.Thread(target=enter, name="a" + str(i), kwargs=bounds)
            )
            first_page += size

    for t in thread_list:
        t.start()

    for t in thread_list:
        t.join()

    print("all thread over")
63 | 


--------------------------------------------------------------------------------
/parse_src.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import requests, re, redis, redisutil, time, random
 3 | from pyquery import PyQuery as pq
 4 | from urllib3.util.retry import Retry
 5 | from requests.adapters import HTTPAdapter
 6 | import threading
 7 | import common
 8 | 
 9 | # 将列表页插入redis
10 | def parse(url, c, ts):
11 |     d = pq(common.visit(url))
12 |     src = d("video").find("source").attr("src")
13 | 
14 |     m = d("#useraction .boxPart").html()
15 |     cn = re.search(u'时长:</span>(.*?)<span', m, re.S).group(1)
16 |     tc = "".join(cn.split())
17 |     t = tc.split(":")
18 |     times = 0
19 |     if len(t)  == 3:
20 |         times = int(t[1]) + 60
21 |     else:
22 |         times = int(t[0])
23 |     ts = int(ts)
24 |     if times < ts:
25 |         pass
26 |         #print( "时长不够不予处理")
27 |     elif src != None:
28 |         print( threading.current_thread().name,  " insert into redis ", src)
29 |         redisutil.add(src, common.KEY_SRC)
30 |         c.lrem(common.KEY, 1, url)
31 |     else:
32 |         print(threading.current_thread().name,  src, "解析为None, 插入 redis_error")
33 |         redisutil.add(src, common.KEY_NONE)
34 | 
def enter(**kwargs):
    """Thread entry point: parse one slice of the ``common.KEY`` list.

    Keyword Args:
        start: first list index handed to LRANGE.
        end: last list index handed to LRANGE.
        ts: minimum duration in minutes, forwarded to ``parse``.

    The slice is read once up front. A URL whose page cannot be parsed is
    reported and skipped so the remaining URLs are still processed. When
    the slice is done a completion line is appended to ``common.PARSE_LOG``.
    """
    worker = threading.current_thread().name
    start = kwargs["start"]
    end = kwargs["end"]
    ts = kwargs["ts"]
    c = redisutil.connect()
    lst = c.lrange(common.KEY, int(start), int(end))

    for a in lst:
        print(worker, " parsing url ", a)
        try:
            parse(a, c, ts)
        except Exception as err:
            # Per-URL boundary: one bad page must not end the whole worker.
            print(worker, " failed to parse ", a, repr(err))
        time.sleep(0.1)
    with open(common.PARSE_LOG, "a", encoding="utf-8") as f:
        f.write(worker + " 已经解析完毕.\n")
48 | 
def start():
    """Parse every URL in the ``common.KEY`` list with up to 5 threads.

    Asks for the minimum duration, then divides list indexes 0..total-1
    into contiguous, non-overlapping slices, one per thread. ``enter``
    passes its bounds to LRANGE, whose indexes are zero-based and whose end
    is inclusive, so each slice is handed over as (first, last). When total
    is not a multiple of the thread count, the first ``total % threads``
    threads take one extra URL. Blocks until every thread has finished.
    """
    total = redisutil.total(common.KEY)
    ts = common.getTime()
    # Default is 5 threads; with fewer URLs, use one thread per URL.
    thread_total = min(5, total)

    thread_list = []
    if thread_total > 0:
        base, extra = divmod(total, thread_total)
        first = 0
        for i in range(1, thread_total + 1):
            size = base + (1 if i <= extra else 0)
            bounds = {"start": first, "end": first + size - 1, "ts": ts}
            thread_list.append(
                threading.Thread(target=enter, name="a" + str(i), kwargs=bounds)
            )
            first += size

    for t in thread_list:
        t.start()

    for t in thread_list:
        t.join()

    print("all thread over")


--------------------------------------------------------------------------------
/download.py:
--------------------------------------------------------------------------------
 1 | import urllib.request as request
 2 | import random, redis, threading
 3 | import ctypes
 4 | import os
 5 | import platform
 6 | import sys
 7 | 
# Module-level Redis client (localhost:6379) used by enter() below.
client = redis.StrictRedis("localhost", 6379)
 9 | 
def disk(folder):
    """Return the free space of the volume containing ``folder``, in MiB.

    The value is in the same unit on every platform, so callers can compare
    it against one threshold (download() compares against 2048).

    Args:
        folder: a path on the volume to inspect, e.g. ``"e:"`` or ``"/"``.

    Returns:
        float: free bytes available to the current user / 1024 / 1024.

    Raises:
        OSError: if ``folder`` cannot be queried.
    """
    # Local import keeps this function self-contained; shutil.disk_usage is
    # the portable stdlib replacement for the per-platform branches.
    import shutil

    return shutil.disk_usage(folder).free / 1024 / 1024
18 | 
def download(url):
    """Download ``url`` into a randomly named ``.mp4`` file.

    The file goes to ``e:/test/`` unless ``disk("e:")`` reports 2048 or
    less (the original note calls this "less than 2 GB"), in which case
    ``d:/test/`` is used. Progress is printed as a fraction of the
    Content-Length when the server sends one.

    Raises:
        KeyboardInterrupt: re-raised after the partial file is deleted.
        IOError: if the connection ends before Content-Length bytes arrive;
            the partial file is deleted first.
    """
    randIP = ".".join(str(random.randint(0, 255)) for _ in range(4))
    req = request.Request(url)
    req.add_header('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0")
    req.add_header('X-Forwarded-For', randIP)

    target_dir = "e:/test/"
    if disk("e:") <= 2048:
        target_dir = "d:/test/"
    os.makedirs(target_dir, exist_ok=True)
    file_path = target_dir + str(random.randint(1, 99999999999999999999)) + ".mp4"

    worker = threading.current_thread().name
    bytes_received = 0
    file_size = 0
    try:
        # Both the HTTP response and the file are closed on every exit path.
        with request.urlopen(req) as response, open(file_path, 'wb') as dst_file:
            length = response.getheader("Content-Length")
            file_size = int(length) if length else 0  # 0 -> size unknown
            while True:
                _buffer = response.read(1024 * 1024)
                if not _buffer:
                    # End of stream; stop even if fewer bytes than announced
                    # arrived, instead of spinning on empty reads.
                    break
                bytes_received += len(_buffer)
                dst_file.write(_buffer)
                if file_size:
                    print(worker + " 已下载 " + str(bytes_received / file_size))
    except KeyboardInterrupt:
        if os.path.exists(file_path):
            os.remove(file_path)
        raise KeyboardInterrupt(
            "Interrupt signal given. Deleting incomplete video.")

    if file_size and bytes_received < file_size:
        # A truncated file must not be reported to the caller as a success.
        os.remove(file_path)
        raise IOError("incomplete download: got %d of %d bytes from %s"
                      % (bytes_received, file_size, url))
44 | 
def enter(**kwargs):
    """Download the sources found at indexes ``start``..``end`` of ``91_src``.

    For each value in ``range(start, end)`` the same LRANGE window is read
    again; every URL in it is downloaded and then removed from the list.

    NOTE(review): the outer loop variable is unused and items are removed
    between reads, so later passes see different elements in the same
    window — confirm whether the repeated read is intended.
    """
    first = kwargs["start"]
    last = kwargs["end"]

    for _ in range(first, last):
        for raw in client.lrange("91_src", first, last):
            # The module-level client returns bytes.
            src = raw.decode("utf-8")
            download(src)
            print(threading.current_thread().name, " 下载 ", src, " 完成， 从redis 删除")
            client.lrem("91_src", 1, src)
56 | 
if __name__ == "__main__":
	# Disabled multi-threaded variant, kept for reference: five threads,
	# each given a window of 4000 list indexes.
	# thread_list = []

	# for i in range(1, 6):
	# 	start = (i - 1) * 4000 + 1
	# 	end = i * 4000 + 1
	# 	t = threading.Thread(target=enter, name="a" + str(i),kwargs={'start':start, 'end':end})
	# 	thread_list.append(t)

	# for t in thread_list:
	# 	t.start()

	# for t in thread_list:
	# 	t.join()

	# print("over")
	# Active path: a single call in the main thread with start=1, end=2.
	enter(start=1, end=2)


--------------------------------------------------------------------------------