import logging
import random
import string
from socket import inet_ntoa
from struct import unpack

# Length of one compact node entry: 20-byte node id + 4-byte IPv4 + 2-byte port.
COMPACT_NODE_INFO_LENGTH = 26
# Length of a node id alone.
COMPACT_NODE_LENGTH = 20


def get_random_id(length):
    """Return a random alphanumeric string of *length* characters.

    Used for DHT node ids and KRPC transaction ids; uniqueness matters
    here, cryptographic strength does not.
    """
    return ''.join(random.choice(string.digits + string.ascii_letters)
                   for _ in range(length))


def parse_nodes(data):
    """Decode a compact ``nodes`` blob into a list of (nid, ip, port).

    Each 26-byte entry is: 20 bytes node id, 4 bytes IPv4 address,
    2 bytes big-endian port.  Returns [] for empty or None input.
    """
    nodes = []
    if data:
        for i in range(0, len(data), COMPACT_NODE_INFO_LENGTH):
            nid = data[i:i + COMPACT_NODE_LENGTH]
            ip = inet_ntoa(data[i + 20:i + 24])
            port = unpack("!H", data[i + 24:i + 26])[0]
            nodes.append((nid, ip, port))
    return nodes


def parse_info_hash(data):
    """Return the raw info_hash bytes as an upper-case hex string."""
    return data.hex().upper()


def get_logger(name, level=logging.INFO):
    """Return a logger that writes to both ``log.log`` and the console.

    Handlers are attached only once per logger name: the original added
    a fresh FileHandler/StreamHandler pair on *every* call, so repeated
    calls duplicated every log line.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)
    if not logger.handlers:  # guard against duplicate handlers on re-use
        formatter = logging.Formatter(
            fmt="[%(asctime)s %(levelname)s] %(name)s %(message)s",
            datefmt="%Y/%m/%d %X")
        fh = logging.FileHandler('log.log', encoding='utf8')
        fh.setFormatter(formatter)
        ch = logging.StreamHandler()
        ch.setFormatter(formatter)
        logger.addHandler(fh)
        logger.addHandler(ch)
    return logger
import argparse
from multiprocessing import Process

from magnet_crawler.crawler import start_multi_server, DEFAULT_SERVER_COUNT, DEFAULT_SERVER_PORT
from magnet_crawler.database import create_tables
from magnet_crawler.magnet2torrent import start_magnet_converter


def start_all(crawler_args, converter_args):
    """Run the DHT crawler and the magnet->torrent converter in parallel.

    :param crawler_args: positional args for start_multi_server
    :param converter_args: positional args for start_magnet_converter
    """
    processes = [
        Process(target=start_multi_server, args=crawler_args),
        Process(target=start_magnet_converter, args=converter_args),
    ]

    for p in processes:
        p.start()

    for p in processes:
        p.join()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='run for magnet-crawler')

    parser.add_argument("runserver", nargs='?', help='启动')
    parser.add_argument("createdatabase", nargs='?', help='创建数据库', default='magnet.db')
    # type=int is required here: without it a CLI-supplied "-c 4" arrives as
    # the string '4' and range(args.count) inside start_multi_server crashes.
    parser.add_argument("-c", "--count", type=int, help="指定爬虫进程数", default=DEFAULT_SERVER_COUNT)
    parser.add_argument("-p", "--port", type=int, help="指定爬虫绑定端口起始位置", default=DEFAULT_SERVER_PORT)
    parser.add_argument("--only-crawler", help="只运行爬虫", action="store_true", dest='crawler')
    parser.add_argument("--only-convert", help="只运行 magnet 转换", action="store_true", dest='convert')

    args = parser.parse_args()

    if args.runserver == 'runserver':
        if args.crawler:
            # crawler only
            start_multi_server(args.count, args.port)
        elif args.convert:
            # converter only
            start_magnet_converter()
        else:
            # everything
            start_all((args.count, args.port,), ())
    elif args.runserver == 'createdatabase':
        create_tables(args.createdatabase)
50 | 51 | ```python 52 | # file magnet_crawler/database.py 53 | 54 | # redis config 55 | REDIS_HOST = '127.0.0.1' 56 | REDIS_PORT = 6379 57 | # 所有的magnet存在这里 58 | REDIS_ALL_KEY = 'all-magnet' 59 | # 进行过转换的magnet 60 | REDIS_USED_KEY = 'used-magnet' 61 | # 能下载的magnet 62 | REDIS_AVAIL_KEY = 'magnet' 63 | ``` 64 | 65 | ## 使用 66 | 67 | ``` 68 | # python run.py -h 69 | 70 | usage: run.py [-h] [-c COUNT] [-p PORT] [--only-crawler] [--only-convert] 71 | [runserver] [createdatabase] 72 | 73 | run for magnet-crawler 74 | 75 | positional arguments: 76 | runserver 启动 77 | createdatabase 创建数据库 78 | 79 | optional arguments: 80 | -h, --help show this help message and exit 81 | -c COUNT, --count COUNT 82 | 指定爬虫进程数 83 | -p PORT, --port PORT 指定爬虫绑定端口起始位置 84 | --only-crawler 只运行爬虫 85 | --only-convert 只运行 magnet 转换 86 | 87 | ``` 88 | 89 | 90 | 91 | ## 快速开始 92 | 93 | ```python 94 | # 创建数据库 95 | python run.py createdatabase 96 | 97 | # 以默认方式启动(启动爬虫和 magnet 下载转换) 98 | python run.py runserver 99 | 100 | # 如果你只是想跑跑看,或者没有下载 redis 和 aria2 可以只启动爬虫 101 | python run.py runserver --only-crawler 102 | 103 | ``` 104 | 105 | ## 问题 106 | 107 | 有任何使用问题或者建议请提 Issue,或者发给我邮件 `liuxingran97@gmail.com`。 108 | 109 | ### 已知问题 110 | 111 | - 在命令行运行该程序无法使用 `Ctrl + C` 退出,只能关闭命令行程序(是因为多进程多线程的原因,正在解决) 112 | - 可能在使用 Aria2 RPC 发送和接收信息过程中出错 113 | 114 | ### 其他问题 115 | 116 | - 如果是在本地(没有公网)运行,有可能会出现一直爬取不到任何信息的问题,需要重新连接网络(重新拨号) 117 | 118 | ## TODO 119 | 120 | - [ ] 修复 bug 121 | - [ ] 优化多进程和多线程 122 | - [ ] 优化 magnet 转 torrent的逻辑,更好地对接 Aria2 和 数据库 123 | - [ ] 分析爬取到的内容 124 | -------------------------------------------------------------------------------- /magnet_crawler/database.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import sqlite3 4 | import string 5 | from datetime import datetime 6 | 7 | import redis 8 | 9 | # redis config 10 | REDIS_HOST = '127.0.0.1' 11 | REDIS_PORT = 6379 12 | # 所有的magnet存在这里 13 | REDIS_ALL_KEY = 'all-magnet' 14 | # 
# redis config
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# set of every magnet the crawler has seen
REDIS_ALL_KEY = 'all-magnet'
# magnets already handed to aria2 for conversion
REDIS_USED_KEY = 'used-magnet'
# magnets whose torrent downloaded successfully
REDIS_AVAIL_KEY = 'magnet'

# mysql config (placeholder — MysqlClient is not implemented yet)
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306

# sqlite3
SQLITE_DATABASE_NAME = 'magnet.db'


class RedisClient:
    """Thin wrapper around redis sets used to store magnet links."""

    def __init__(self, host=REDIS_HOST, port=REDIS_PORT):
        pool = redis.ConnectionPool(host=host, port=port, db=0)
        self.client = redis.Redis(connection_pool=pool)

    def add(self, magnet, key=REDIS_ALL_KEY):
        """Add one magnet to the set *key*."""
        self.client.sadd(key, magnet)

    def count(self, key=REDIS_ALL_KEY):
        """Number of magnets stored under *key*."""
        return self.client.scard(key)

    def get(self, count, keys=REDIS_ALL_KEY):
        """Fetch up to *count* magnets for conversion.

        Pass a tuple such as (REDIS_ALL_KEY, REDIS_USED_KEY) to obtain
        magnets that have not been converted yet; the default samples
        from the full set.  Delegates to :meth:`diff` (the two methods
        previously duplicated the same logic).
        """
        return self.diff(keys, count)

    def diff(self, keys, count):
        """Return up to *count* members of SDIFF over *keys*."""
        diff_set = self.client.sdiff(keys)
        count = min(count, len(diff_set))
        return [diff_set.pop() for _ in range(count)]


class MysqlClient:
    # placeholder for a future MySQL backend
    pass


class SqliteClient:
    """Stores parsed torrent metadata, one table per leading infohash char."""

    def __init__(self, db):
        self.db = db
        # check_same_thread=False: the connection is shared across the
        # converter's worker threads (sqlite3 otherwise raises
        # "SQLite objects created in a thread can only be used in that
        # same thread").
        self.conn = sqlite3.connect(db, check_same_thread=False)

    def insert(self, magnet, data):
        """Insert one record into the table for this magnet's infohash.

        *magnet* is 'magnet:?xt=urn:btih:<40 hex chars>', so magnet[-40]
        is the first character of the infohash and selects the table.
        Errors are reported but never propagated (best-effort store).
        """
        insert_sql = '''
        insert into {table_name}
        (magnet, torrent_name, content, create_date)
        values(?, ?, ?, ?);
        '''
        table_name = 'magnet_{}'.format(magnet[-40].lower())
        cursor = self.conn.cursor()
        try:
            params = (magnet, data.get('name', None), json.dumps(data), datetime.now(),)
            cursor.execute(insert_sql.format(table_name=table_name), params)
            # commit only on success; the original also committed in
            # finally, i.e. even after a failed execute
            self.conn.commit()
        except sqlite3.Error as e:
            print(e)
        finally:
            cursor.close()

    def count(self):
        pass


def create_tables(db):
    """Create tables magnet_0..magnet_9 and magnet_a..magnet_z in *db*.

    Tables that already exist are skipped.  On Ctrl-C the tables created
    during this run are dropped again so the schema is never half-built.
    """
    create_sql = '''
    create table {table_name}
    (
        id integer not null primary key autoincrement,
        magnet varchar(30) not null unique,
        torrent_name varchar(500),
        content text,
        create_date datetime(6)
    );
    '''
    drop_sql = '''
    drop table {table_name};
    '''
    table_names = ['magnet_' + c for c in string.digits + string.ascii_lowercase]
    created_tables = []

    conn = sqlite3.connect(db)
    cursor = conn.cursor()
    try:
        for name in table_names:
            try:
                cursor.execute(create_sql.format(table_name=name))
                created_tables.append(name)
                conn.commit()
                print('table {} created successful'.format(name))
            except sqlite3.OperationalError as e:
                if 'already exists' in str(e):
                    print(e)
                    continue
                # re-raise the original error; the previous bare
                # ``raise Exception`` discarded the failure reason AND
                # escaped the OperationalError handler below
                raise
    except sqlite3.OperationalError as e:
        logging.exception(e)
    except KeyboardInterrupt:
        for name in created_tables:
            cursor.execute(drop_sql.format(table_name=name))
        conn.commit()
    finally:
        cursor.close()
        conn.commit()
        conn.close()


if __name__ == '__main__':
    create_tables('magnet.db')
import bencoder
import json
from pprint import pprint

from magnet_crawler.database import SqliteClient, SQLITE_DATABASE_NAME


def parse_torrent():
    """Ad-hoc debug helper: dump the top-level keys of two local test torrents."""
    with open('../test.torrent', 'rb') as f:
        info = bencoder.bdecode(f.read())
    print(info.keys())

    with open('../test2.torrent', 'rb') as f:
        info = bencoder.bdecode(f.read())
    print(info.keys())
    print(info[b'nodes'])


class TorrentParser:
    """Extracts name/size information from a .torrent file.

    All raw values coming out of ``bencoder`` are bytes; they are
    decoded using the torrent's declared encoding (default utf-8).
    """

    def __init__(self, torrent):
        self.torrent = self.decode_torrent(torrent)  # all values are bytes
        self.info = dict()
        self.encoding = self.torrent.get(b'encoding', b'utf-8').decode()
        # NOTE(review): this client is never used inside the class —
        # kept only for attribute compatibility; confirm before removing
        self.sqlite3 = SqliteClient(SQLITE_DATABASE_NAME)

    def decode_torrent(self, torrent):
        """Read the file at path *torrent* and bdecode it."""
        with open(torrent, 'rb') as f:
            return bencoder.bdecode(f.read())

    def get_creation_info(self):
        """Return the 'created by' and 'creation date' (unix timestamp) fields."""
        created_by = self.torrent.get(b'created by', b'').decode()
        creation_date = self.torrent.get(b'creation date', None)  # timestamp
        return {'created by': created_by,
                'creation date': creation_date}

    def is_dir(self):
        """True when the torrent is a multi-file (directory) layout."""
        return b'files' in self.torrent.get(b'info').keys()

    def _name_from(self, mapping):
        """Shared name lookup: prefer b'name.utf-8', fall back to b'name'
        decoded with the torrent's encoding, else None.  (The original
        duplicated this three-way branch in two places.)"""
        if b'name.utf-8' in mapping.keys():
            return mapping.get(b'name.utf-8').decode()
        if b'name' in mapping.keys():
            return self.decode_all(mapping.get(b'name'), self.encoding)
        return None

    def get_files_info(self):
        """Return a list of {'length': int, 'name': str} entries.

        Without b'files' the torrent is a single file and 'name' is the
        file name; with it, 'name' is the directory name and every entry
        of b'files' carries its own path.
        """
        files = []
        tor_info = self.torrent[b'info']
        # torrent display name
        self.info.update(name=self._name_from(tor_info))

        if self.is_dir():
            for file in tor_info.get(b'files', []):
                # TODO: nested directories deeper than one level are not handled
                length = file.get(b'length', None)
                if b'path.utf-8' in file.keys():
                    name = file.get(b'path.utf-8')[0].decode()
                elif b'path' in file.keys():
                    path = file.get(b'path')
                    # skip the extra wrapping directory when present
                    target = path[0] if len(path) == 1 else path[1]
                    name = self.decode_all(target, self.encoding)
                else:
                    name = None
                files.append({'length': length, 'name': name})
        else:
            files.append({
                'length': tor_info.get(b'length', None),
                'name': self._name_from(tor_info),
            })

        return files

    def decode_all(self, data, encoding=None):
        """Decode *data* trying *encoding* (default: torrent encoding),
        then the other of utf-8/gbk, and finally utf-8 with replacement
        characters so a bad name never aborts parsing or leaks raw bytes
        (the original could return undecoded bytes or raise on the
        second attempt)."""
        encoding = encoding if encoding else self.encoding
        try:
            return data.decode(encoding)
        except UnicodeDecodeError as e:
            fallback = 'gbk' if 'utf-8' in str(e) else 'utf-8'
            try:
                return data.decode(fallback)
            except UnicodeDecodeError:
                return data.decode('utf-8', errors='replace')

    def filter_file(self):
        pass

    def get_torrent_info(self):
        """Aggregate creation info and file list into one dict."""
        self.info.update(self.get_creation_info())
        self.info.update(files=self.get_files_info())
        return self.info


if __name__ == '__main__':
    tor_parser = TorrentParser('torrents/7c2fa8f559d38e61e8f23d9a2b2728e11173948f.torrent')
    pprint(tor_parser.get_torrent_info())
    print(json.loads(json.dumps(tor_parser.get_torrent_info())))
import socket
import time
from bencoder import bdecode, bencode
from collections import deque
from multiprocessing import Process
from os import cpu_count
from threading import Thread

from magnet_crawler.database import RedisClient
from magnet_crawler.utils import get_random_id, parse_nodes, parse_info_hash, get_logger

BOOTSTRAP_NODES = [
    ("router.bittorrent.com", 6881),
    ("dht.transmissionbt.com", 6881),
    ("router.utorrent.com", 6881),
]
MAX_NODES_SIZE = 10000
BUFSIZE = 10240
# pause between outgoing find_node packets
SLEEP_TIME = 1e-6
MAGNET_TEMPLATE = "magnet:?xt=urn:btih:{}"
SERVER_HOST = '0.0.0.0'
DEFAULT_SERVER_PORT = 10086
DEFAULT_SERVER_COUNT = cpu_count()
TIMER_WAIT_TIME = 60


class DHTNode:
    """One remote DHT node: 20-byte node id plus its UDP endpoint."""
    # __slots__: tens of thousands of these live in the node deque
    __slots__ = ('nid', 'ip', 'port')

    def __init__(self, nid, ip, port):
        self.nid = nid
        self.ip = ip
        self.port = port


class DHTServer:
    """Fake DHT node: joins the network, floods find_node queries, and
    harvests info_hashes from incoming get_peers/announce_peer requests."""

    def __init__(self, bind_ip, bind_port, name):
        """
        :param bind_ip: ip to bind
        :param bind_port: udp port to bind
        :param name: display name of this server (also the logger name)
        """
        # we are a DHT node ourselves
        self.node = DHTNode(get_random_id(20), bind_ip, bind_port)
        # DHT speaks KRPC over UDP
        self.udp_socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
        self.udp_socket.bind((bind_ip, bind_port))
        # discovered nodes; bounded deque keeps memory flat
        self.nodes = deque(maxlen=MAX_NODES_SIZE)
        self.magnets = set()
        self.redis_client = RedisClient()
        self.logger = get_logger(name)
        self.logger.info("I am {}, I'm bound at port:{}.".format(name, bind_port))

    def join_dht(self):
        """(Re)join the DHT via the well-known bootstrap nodes."""
        for addr in BOOTSTRAP_NODES:
            self.send_find_node_request(addr)
            time.sleep(SLEEP_TIME)

    def send_find_node_request(self, address, nid=None):
        """Send a find_node query so the peer answers with more nodes.

        Payload fields:
          't' — transaction id chosen by us,
          'y' — 'q' marks a query,
          'q' — the method name ('find_node'),
          'a' — {'id': our node id, 'target': random id being "looked up"}.
        """
        nid = nid if nid else self.node.nid
        data = {
            't': get_random_id(4),
            'y': 'q',
            'q': 'find_node',
            'a': {
                'id': nid,
                'target': get_random_id(20)
            }
        }
        self.send_krpc(data, address)

    def send_krpc(self, data, address):
        """bencode *data* and send it; a send error must not kill the loop."""
        self.logger.debug("I'm sending to {}".format(address))
        try:
            self.udp_socket.sendto(bencode(data), address)
        except Exception:
            # log the real traceback; the original passed the Exception
            # *class* to logger.exception, losing all detail
            self.logger.exception('send_krpc to {} failed'.format(address))

    def handle_receive_things(self, data, address):
        """Dispatch one decoded KRPC message.

        'y' == 'r' marks a response (find_node answers); 'y' == 'q'
        marks a request from a peer (get_peers / announce_peer).
        """
        try:
            y = data.get(b'y')
            if y == b'r':
                if data.get(b'r'):
                    self.handle_find_node_response(data)
            elif y == b'q':
                q = data.get(b'q')
                if q == b'get_peers':
                    if data.get(b'a'):
                        self.handle_get_peers_request(data, address)
                elif q == b'announce_peer':
                    if data.get(b'a'):
                        self.handle_announce_peer_request(data, address)
        except KeyError:
            pass

    def handle_find_node_response(self, data):
        """Collect nodes from a find_node response.

        'r': {'id': sender's node id, 'nodes': compact info of the k
        nodes closest to the target}.
        """
        self.logger.debug("I'm handling find_node_response")
        try:
            nodes = parse_nodes(data.get(b'r').get(b'nodes'))
            for nid, ip, port in nodes:
                # keep only well-formed ids and skip our own bind address
                if len(nid) == 20 and ip != SERVER_HOST:
                    self.nodes.append(DHTNode(nid, ip, port))
        except KeyError:
            pass

    def handle_get_peers_request(self, data, address):
        """Turn the info_hash of an incoming get_peers request into a magnet.

        'a': {'id': requester's node id, 'info_hash': requested resource}.
        """
        self.logger.debug("I'm handling get_peers_request")
        try:
            info_hash = data.get(b'a').get(b'info_hash')
            # TODO: store info_hash AND send a proper reply to the peer
            self.save_magnet(parse_info_hash(info_hash))
        except KeyError:
            pass

    def handle_announce_peer_request(self, data, address):
        """Turn the info_hash of an announce_peer request into a magnet.

        Same 'a' shape as get_peers; the stray debug print(data) of the
        original has been removed.
        """
        self.logger.debug("I'm handling announce_peer_request")
        try:
            info_hash = data.get(b'a').get(b'info_hash')
            # TODO: store info_hash AND send a proper reply to the peer
            self.save_magnet(parse_info_hash(info_hash))
        except KeyError:
            pass

    def receive_forever(self):
        """Receive loop: decode every datagram and dispatch it.

        Malformed packets are routine on the public DHT, so decode and
        handler errors are deliberately swallowed (best-effort).
        """
        self.logger.info('start receive forever...')
        while True:
            try:
                data, addr = self.udp_socket.recvfrom(BUFSIZE)
                self.handle_receive_things(bdecode(data), addr)
            except Exception:
                pass

    def send_forever(self):
        """Send loop: keep asking known nodes for more nodes; when the
        queue runs dry, re-bootstrap."""
        self.logger.info('start send forever...')
        while True:
            try:
                node = self.nodes.popleft()
            except IndexError:
                self.join_dht()
            else:
                self.send_find_node_request((node.ip, node.port), node.nid)
                time.sleep(SLEEP_TIME)

    def save_magnet(self, magnet):
        """Log one magnet and persist it to the in-memory set and redis."""
        link = MAGNET_TEMPLATE.format(magnet)  # format once, not three times
        self.logger.info(link)
        self.magnets.add(link)
        self.redis_client.add(link)

    def reporter(self):
        """Periodic status line; also re-bootstraps to keep traffic flowing."""
        while True:
            time.sleep(TIMER_WAIT_TIME)
            self.join_dht()
            self.logger.info('当前有{}个节点, 有{}个磁力链接'.format(len(self.nodes), len(self.magnets)))


def start_server(index=0, bind_port=DEFAULT_SERVER_PORT):
    """Run one DHTServer with its send/receive/report threads (blocking)."""
    dht_s = DHTServer(SERVER_HOST, bind_port, 'SERVER{}'.format(index))
    threads = [
        Thread(target=dht_s.send_forever),
        Thread(target=dht_s.receive_forever),
        Thread(target=dht_s.reporter)
    ]

    for t in threads:
        t.start()

    for t in threads:
        t.join()


def start_multi_server(count=DEFAULT_SERVER_COUNT, origin_bind_port=DEFAULT_SERVER_PORT):
    """Spawn *count* DHTServer processes on consecutive ports."""
    processes = []
    try:
        for i in range(count):
            p = Process(target=start_server, args=(i, origin_bind_port + i,))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    except KeyboardInterrupt:
        print('退出')


if __name__ == '__main__':
    start_multi_server()
import json
import os
import tempfile
import time
import xmlrpc.client
from threading import Thread

import websocket

from magnet_crawler.database import RedisClient, REDIS_USED_KEY, REDIS_AVAIL_KEY, SqliteClient, SQLITE_DATABASE_NAME
from magnet_crawler.parse_torrent import TorrentParser
from magnet_crawler.utils import get_logger

RPC_SERVER = "http://localhost:6800/rpc"
RPC_WEBSOCKET = "ws://localhost:6800/jsonrpc"
RPC_SECRET = "token:abcdefg"
# aria2 bt-stop-timeout (seconds)
BT_STOP_TIMEOUT = 600
# aria2 max concurrent downloads
MAX_DOWNLOADS = 32
# aria2 download directory
DIR_PATH = os.path.abspath('./torrents')
# how many magnets to fetch from redis per round
FETCH_MAGNET_COUNT = 32
# pause after a full round of submissions
WAITING_NEXT_TIME = 120
# pause between individual submissions
SINGLE_DOWNLOAD_WAIT_TIME = 10


def magnet_to_torrent(magnet):
    """Stand-alone libtorrent-based converter (not used by the pipeline).

    NOTE(review): this helper fetches the metadata but never writes the
    .torrent file — *output* is computed and then dropped.  Finish it or
    remove it.
    """
    import libtorrent as lt
    ses = lt.session()
    tempdir = tempfile.mkdtemp()
    params = {
        'save_path': tempdir,
        'storage_mode': lt.storage_mode_t(2),
        'paused': False,
        'auto_managed': True,
        'duplicate_is_error': True
    }
    handle = lt.add_magnet_uri(ses, magnet, params)
    while not handle.has_metadata():
        try:
            time.sleep(1)
        except KeyboardInterrupt:
            print("Aborting...")
            ses.pause()
    ses.pause()
    print("Done")

    torinfo = handle.get_torrent_info()
    print(torinfo)
    torfile = lt.create_torrent(torinfo)

    output = os.path.abspath(torinfo.name() + ".torrent")


class Aria2MagnetConverter:
    """Feeds magnets to aria2 over XML-RPC and records finished torrents."""

    def __init__(self, server, secret=None, **kwargs):
        self.client = xmlrpc.client.ServerProxy(server)
        self.secret = secret
        self.redis_client = RedisClient()
        self.logger = get_logger(kwargs.get('logger_name', 'ARIA2'))
        # gid -> magnet, bucketed by download state
        self.download_info = dict({'all': dict(),
                                   'start': dict(),
                                   'complete': dict(),
                                   'error': dict(),
                                   })
        self.sqlite = SqliteClient(SQLITE_DATABASE_NAME)
        if not os.path.exists(DIR_PATH):
            os.mkdir(DIR_PATH)

    def magnet_to_torrent(self, magnet, dir_path=None, **kwargs):
        """Submit one magnet to aria2 (metadata only); return its gid or None.

        Extra aria2 options may be passed through **kwargs.
        """
        # TODO: pull a fresh tracker list from github
        ops = {
            'bt-metadata-only': 'true',  # fetch only the torrent metadata
            'bt-stop-timeout': str(BT_STOP_TIMEOUT),  # give up after this
            # 'bt-tracker': "udp://tracker.coppersurfer.tk:6969/announce",
        }
        if dir_path:
            ops.update(dir=dir_path)
        if kwargs:
            ops.update(kwargs)
        gid = None
        try:
            gid = self.client.aria2.addUri(self.secret, [magnet], ops)
        except Exception:
            # log the real traceback (the original passed the class object)
            self.logger.exception('aria2.addUri failed')

        return gid

    def magnet_to_torrent_forever(self):
        """Main producer loop: batch magnets from redis into aria2 forever."""
        self.logger.warning(
            'set max-download={}, start to download torrent and store to database...'.format(MAX_DOWNLOADS))
        # TODO: query the current download count before submitting more
        global_ops = {
            'max-concurrent-downloads': str(MAX_DOWNLOADS),
        }
        try:
            if self.secret:
                r = self.client.aria2.changeGlobalOption(self.secret, global_ops)
            else:
                r = self.client.aria2.changeGlobalOption(global_ops)
            if r != 'OK':
                raise Exception('设置失败')
        except Exception:
            self.logger.exception('changeGlobalOption failed')

        while True:
            for mgn in self.get_magnets(FETCH_MAGNET_COUNT):
                gid = self.magnet_to_torrent(mgn, DIR_PATH)
                # fixed format string: the original had one placeholder but
                # passed two arguments, silently dropping the gid
                self.logger.info('sending <{}> {}'.format(mgn.decode(), gid))
                self.save_magnet(mgn, REDIS_USED_KEY)
                self.download_info.get('all').update({gid: mgn})
                time.sleep(SINGLE_DOWNLOAD_WAIT_TIME)
            time.sleep(WAITING_NEXT_TIME)

    def get_magnets(self, count):
        """Fetch up to *count* magnets (bytes) from redis."""
        return self.redis_client.get(count)

    def save_magnet(self, magnet, key):
        """Record *magnet* under the given redis set *key*."""
        self.redis_client.add(magnet, key)

    def receive_aria2_notifications(self):
        """Consumer loop: read aria2 event notifications over websocket."""
        socket_client = websocket.WebSocket()
        socket_client.connect(RPC_WEBSOCKET)
        while True:
            resp = socket_client.recv()
            resp = json.loads(resp)
            self.handle_aria2_notifications(resp)

    def handle_aria2_notifications(self, data):
        """React to one aria2 event, e.g.
        {'jsonrpc': '2.0', 'method': 'aria2.onDownloadStart',
         'params': [{'gid': '88d5dff6df0c610f'}]}"""
        method = data.get('method')
        gid = data.get('params')[0].get('gid')
        # give magnet_to_torrent_forever a moment to record gid->magnet in
        # download_info so the lookup below succeeds
        time.sleep(1)
        magnet = self.download_info['all'].get(gid, b'')
        if not magnet:
            # a task that predates this process: recover the magnet from
            # aria2's own status.  Guard against None so .decode() below
            # cannot crash (the original could).
            magnet = self.extract_magnet_from_status(gid) or b''
        # the original log calls below all dropped the gid because the
        # format strings were missing a second placeholder
        if method == 'aria2.onDownloadStart':
            self.logger.info('start <{}> {}'.format(magnet.decode(), gid))
            self.download_info.get('start').update({gid: magnet})
            # mark as handed to aria2
            self.redis_client.add(magnet, REDIS_USED_KEY)
        elif method == 'aria2.onDownloadComplete':
            self.logger.info('complete <{}> {}'.format(magnet.decode(), gid))
            self.download_info.get('complete').update({gid: magnet})
            # mark as downloadable
            self.redis_client.add(magnet, REDIS_AVAIL_KEY)
            self.save_to_sqlite(magnet.decode())
        elif method in ['aria2.onDownloadError', 'aria2.onDownloadStop']:
            self.logger.warning('error <{}> {}'.format(magnet.decode(), gid))
            self.download_info.get('error').update({gid: magnet})
            # a stopped task restarts with aria2, so drop its result record
            self.remove_download_result(gid)
        else:
            pass

    def remove_download_result(self, gid):
        """Drop aria2's result record for *gid* (uses self.secret, not the
        module global, for consistency with the rest of the class)."""
        r = self.client.aria2.removeDownloadResult(self.secret, gid)
        if r != 'OK':
            self.logger.warning('没有成功删除下载信息!')

    def purge_download_result(self):
        """Drop all of aria2's completed/error/removed result records."""
        r = self.client.aria2.purgeDownloadResult(self.secret)
        if r != 'OK':
            self.logger.warning('没有成功清除所有下载信息!')

    def save_to_sqlite(self, magnet):
        """Parse the downloaded .torrent for *magnet* and store it."""
        if not magnet:
            return
        torrent = os.path.join(DIR_PATH, magnet[-40:] + '.torrent')
        if os.path.exists(torrent):
            self.logger.info('save {} to database'.format(magnet))
            parser = TorrentParser(torrent)
            self.sqlite.insert(magnet, parser.get_torrent_info())
        else:
            self.logger.error('不存在该文件 {}'.format(torrent))

    def extract_magnet_from_status(self, gid):
        """Rebuild the magnet (bytes) from aria2's infoHash for *gid*;
        returns None when aria2 reports no infoHash."""
        r = self.client.aria2.tellStatus(self.secret, gid, ['infoHash'])
        info_hash = r.get('infoHash', None)
        magnet = None
        if info_hash:
            magnet = ('magnet:?xt=urn:btih:' + info_hash.upper()).encode()
        return magnet


def start_magnet_converter():
    """Run the submit loop and the notification listener as threads (blocking)."""
    converter = Aria2MagnetConverter(RPC_SERVER, secret=RPC_SECRET)
    threads = [
        Thread(target=converter.magnet_to_torrent_forever),
        Thread(target=converter.receive_aria2_notifications),
    ]

    for t in threads:
        t.start()

    for t in threads:
        t.join()


if __name__ == '__main__':
    start_magnet_converter()