├── magnet_crawler
    ├── __init__.py
    ├── utils.py
    ├── database.py
    ├── parse_torrent.py
    ├── crawler.py
    └── magnet2torrent.py
├── doc
    └── run1.gif
├── requirements.txt
├── .gitignore
├── run.py
└── README.md


/magnet_crawler/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/doc/run1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cyrus97/magnet-crawler/HEAD/doc/run1.gif


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | redis==2.10.6
2 | bencoder.pyx==1.2.1
3 | websocket_client==0.53.0
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # ide
 2 | .idea
 3 | 
 4 | # log file
 5 | *.log
 6 | */*.log
 7 | 
 8 | # torrent
 9 | *.torrent
10 | torrents/
11 | */torrents/
12 | 
13 | # db
14 | *.db


--------------------------------------------------------------------------------
/magnet_crawler/utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import random
 3 | import string
 4 | from _socket import inet_ntoa
 5 | 
 6 | from struct import unpack
 7 | 
 8 | # 每个node节点信息的长度
 9 | COMPACT_NODE_INFO_LENGTH = 26
10 | # 每个node节点长度
11 | COMPACT_NODE_LENGTH = 20
12 | 
13 | 
14 | def get_random_id(length):
15 |     str_list = [random.choice(string.digits + string.ascii_letters) for i in range(length)]
16 |     random_str = ''.join(str_list)
17 |     return random_str
18 | 
19 | 
20 | def parse_nodes(data):
21 |     nodes = []
22 |     if data:
23 |         length = len(data)
24 |         # 每个node信息，20：nid 4：ip 2：port
25 |         for i in range(0, length, COMPACT_NODE_INFO_LENGTH):
26 |             nid = data[i:i + 20]
27 |             ip = inet_ntoa(data[i + 20:i + 24])
28 |             port = unpack("!H", data[i + 24:i + 26])[0]
29 |             nodes.append((nid, ip, port))
30 | 
31 |     return nodes
32 | 
33 | 
34 | def parse_info_hash(data):
35 |     # info_hash 以16进制储存
36 |     magnet = data.hex().upper()
37 |     # info_hash = codecs.getencoder("hex")(data)[0].decode().upper()
38 |     return magnet
39 | 
40 | 
41 | def get_logger(name, level=logging.INFO):
42 |     logger = logging.getLogger(name)
43 |     logger.setLevel(level)
44 |     fh = logging.FileHandler('log.log', encoding='utf8')
45 |     ch = logging.StreamHandler()
46 |     formatter = logging.Formatter(fmt="[%(asctime)s %(levelname)s]  %(name)s  %(message)s", datefmt="%Y/%m/%d %X")
47 |     fh.setFormatter(formatter)
48 |     ch.setFormatter(formatter)
49 |     logger.addHandler(fh)
50 |     logger.addHandler(ch)
51 | 
52 |     return logger
53 | 


--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from multiprocessing import Process
 3 | 
 4 | from magnet_crawler.crawler import start_multi_server, DEFAULT_SERVER_COUNT, DEFAULT_SERVER_PORT
 5 | from magnet_crawler.database import create_tables
 6 | from magnet_crawler.magnet2torrent import start_magnet_converter
 7 | 
 8 | 
 9 | def start_all(crawler_args, converter_args):
10 |     processes = [
11 |         Process(target=start_multi_server, args=crawler_args),
12 |         Process(target=start_magnet_converter, args=converter_args),
13 |     ]
14 | 
15 |     for p in processes:
16 |         p.start()
17 | 
18 |     for p in processes:
19 |         p.join()
20 | 
21 | 
22 | if __name__ == '__main__':
23 |     parser = argparse.ArgumentParser(description='run for magnet-crawler')
24 | 
25 |     parser.add_argument("runserver", nargs='?', help='启动')
26 |     parser.add_argument("createdatabase", nargs='?', help='创建数据库', default='magnet.db')
27 |     parser.add_argument("-c", "--count", help="指定爬虫进程数", default=DEFAULT_SERVER_COUNT)
28 |     parser.add_argument("-p", "--port", type=int, help="指定爬虫绑定端口起始位置", default=DEFAULT_SERVER_PORT)
29 |     parser.add_argument("--only-crawler", help="只运行爬虫", action="store_true", dest='crawler')
30 |     parser.add_argument("--only-convert", help="只运行 magnet 转换", action="store_true", dest='convert')
31 | 
32 |     args = parser.parse_args()
33 | 
34 |     if args.runserver == 'runserver':
35 |         if args.crawler:
36 |             # 只启动爬虫
37 |             start_multi_server(args.count, args.port)
38 |         elif args.convert:
39 |             # 只启动转换
40 |             start_magnet_converter()
41 |         else:
42 |             # 全部启动
43 |             crawler_args = (args.count, args.port,)
44 |             converter_args = ()
45 |             start_all(crawler_args, converter_args)
46 |     elif args.runserver == 'createdatabase':
47 |         create_tables(args.createdatabase)
48 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # magnet-crawler 磁力链接收集器
  2 | 
  3 | ## 简介
  4 | 
  5 | 这是一个 magnet 爬虫，通过伪装成一个 DHT 节点，接收其他节点发过来的请求信息，提取相关的 magnet。
  6 | 然后通过 [Aria2](https://aria2.github.io/) 的 RPC 特性调用下载，把 magnet 下载为种子(torrent)，
  7 | 并解析种子文件信息，把文件信息存入数据库。
  8 | 
  9 | ![运行界面](doc/run1.gif)
 10 | 
 11 | ## 环境依赖
 12 | 
 13 | - Python 3
 14 | - Redis
 15 | - Aria2
 16 | 
 17 | ## 安装
 18 | 
 19 | ```
 20 | git clone https://github.com/Cyrus97/magnet-crawler.git
 21 | cd magnet-crawler
 22 | pip install -r requirements.txt
 23 | ```
 24 | 
 25 | ## 配置
 26 | 
 27 | ### Aria2 RPC
 28 | 
 29 | 需要开启 Aria2 的 RPC 功能，默认使用的是 6800 端口。关于 Aria2 的相关设置，
 30 | 请到文件 [magnet2torrent.py](magnet_crawler/magnet2torrent.py) 中同步修改。
 31 | 
 32 | ```python
 33 | # file magnet_crawler/magnet2torrent.py
 34 | 
 35 | RPC_SERVER = "http://localhost:6800/rpc"
 36 | RPC_WEBSOCKET = "ws://localhost:6800/jsonrpc"
 37 | # 把 token: 后面换成你的 secret
 38 | RPC_SECRET = "token:abcdefg"
 39 | # aria2 下载timeout
 40 | BT_STOP_TIMEOUT = 600
 41 | # aria2 最大下载数量
 42 | MAX_DOWNLOADS = 32
 43 | # aria2 下载路径
 44 | DIR_PATH = os.path.abspath('./torrents')
 45 | ```
 46 | 
 47 | ### Redis
 48 | 
 49 | 使用默认的 6379 端口，要修改请到文件 [database.py](magnet_crawler/database.py) 中修改。
 50 | 
 51 | ```python
 52 | # file magnet_crawler/database.py
 53 | 
 54 | # redis config
 55 | REDIS_HOST = '127.0.0.1'
 56 | REDIS_PORT = 6379
 57 | # 所有的magnet存在这里
 58 | REDIS_ALL_KEY = 'all-magnet'
 59 | # 进行过转换的magnet
 60 | REDIS_USED_KEY = 'used-magnet'
 61 | # 能下载的magnet
 62 | REDIS_AVAIL_KEY = 'magnet'
 63 | ```
 64 | 
 65 | ## 使用
 66 | 
 67 | ```
 68 | # python run.py -h
 69 | 
 70 | usage: run.py [-h] [-c COUNT] [-p PORT] [--only-crawler] [--only-convert]
 71 |               [runserver] [createdatabase]
 72 | 
 73 | run for magnet-crawler
 74 | 
 75 | positional arguments:
 76 |   runserver             启动
 77 |   createdatabase        创建数据库
 78 | 
 79 | optional arguments:
 80 |   -h, --help            show this help message and exit
 81 |   -c COUNT, --count COUNT
 82 |                         指定爬虫进程数
 83 |   -p PORT, --port PORT  指定爬虫绑定端口起始位置
 84 |   --only-crawler        只运行爬虫
 85 |   --only-convert        只运行 magnet 转换
 86 | 
 87 | ```
 88 | 
 89 | 
 90 | 
 91 | ## 快速开始
 92 | 
 93 | ```python
 94 | # 创建数据库
 95 | python run.py createdatabase
 96 | 
 97 | # 以默认方式启动（启动爬虫和 magnet 下载转换）
 98 | python run.py runserver
 99 | 
100 | # 如果你只是想跑跑看，或者没有下载 redis 和 aria2 可以只启动爬虫
101 | python run.py runserver --only-crawler
102 | 
103 | ```
104 | 
105 | ## 问题
106 | 
107 | 有任何使用问题或者建议请提 Issue，或者发给我邮件 `liuxingran97@gmail.com`。
108 | 
109 | ### 已知问题
110 | 
111 | - 在命令行运行该程序无法使用 `Ctrl + C` 退出，只能关闭命令行程序（是因为多进程多线程的原因，正在解决）
112 | - 可能在使用 Aria2 RPC 发送和接收信息过程中出错
113 | 
114 | ### 其他问题
115 | 
116 | - 如果是在本地（没有公网）运行，有可能会出现一直爬取不到任何信息的问题，需要重新连接网络（重新拨号）
117 | 
118 | ## TODO
119 | 
120 | - [ ] 修复 bug
121 | - [ ] 优化多进程和多线程
122 | - [ ] 优化 magnet 转 torrent的逻辑，更好地对接 Aria2 和 数据库
123 | - [ ] 分析爬取到的内容
124 | 


--------------------------------------------------------------------------------
/magnet_crawler/database.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | import sqlite3
  4 | import string
  5 | from datetime import datetime
  6 | 
  7 | import redis
  8 | 
  9 | # redis config
 10 | REDIS_HOST = '127.0.0.1'
 11 | REDIS_PORT = 6379
 12 | # 所有的magnet存在这里
 13 | REDIS_ALL_KEY = 'all-magnet'
 14 | # 进行过转换的magnet
 15 | REDIS_USED_KEY = 'used-magnet'
 16 | # 能下载的magnet
 17 | REDIS_AVAIL_KEY = 'magnet'
 18 | 
 19 | # mysql config
 20 | MYSQL_HOST = '127.0.0.1'
 21 | MYSQL_PORT = 3306
 22 | 
 23 | # sqlite3
 24 | SQLITE_DATABASE_NAME = 'magnet.db'
 25 | 
 26 | 
 27 | class RedisClient:
 28 |     def __init__(self, host=REDIS_HOST, port=REDIS_PORT):
 29 |         pool = redis.ConnectionPool(host=host, port=port, db=0)
 30 |         self.client = redis.Redis(connection_pool=pool)
 31 | 
 32 |     def add(self, magnet, key=REDIS_ALL_KEY):
 33 |         self.client.sadd(key, magnet)
 34 | 
 35 |     def count(self, key=REDIS_ALL_KEY):
 36 |         return self.client.scard(key)
 37 | 
 38 |     def get(self, count, keys=REDIS_ALL_KEY):
 39 |         """用于转换magnet"""
 40 |         # keys = (REDIS_ALL_KEY, REDIS_USED_KEY)
 41 |         diff_set = self.client.sdiff(keys)
 42 |         length = len(diff_set)
 43 |         count = count if length > count else length
 44 |         magnets = [diff_set.pop() for _ in range(count)]
 45 |         return magnets
 46 | 
 47 |     def diff(self, keys, count):
 48 |         diff_set = self.client.sdiff(keys)
 49 |         length = len(diff_set)
 50 |         count = count if length > count else length
 51 |         magnets = [diff_set.pop() for _ in range(count)]
 52 |         return magnets
 53 | 
 54 | 
 55 | class MysqlClient:
 56 |     pass
 57 | 
 58 | 
 59 | class SqliteClient:
 60 |     def __init__(self, db):
 61 |         self.db = db
 62 |         # sqlite3.ProgrammingError: SQLite objects created in a thread can only be used in that same thread.
 63 |         self.conn = sqlite3.connect(db, check_same_thread=False)
 64 | 
 65 |     def insert(self, magnet, data):
 66 |         insert_sql = '''
 67 |                 insert into {table_name}
 68 |                 (magnet, torrent_name, content, create_date)
 69 |                 values(?, ?, ?, ?);
 70 |                 '''
 71 |         cursor = self.conn.cursor()
 72 |         table_name = 'magnet_{}'.format(magnet[-40].lower())
 73 |         # table_name = 'magnet_{}'.format(magnet[0])
 74 |         try:
 75 |             params = (magnet, data.get('name', None), json.dumps(data), datetime.now(),)
 76 |             cursor.execute(insert_sql.format(table_name=table_name), params)
 77 |             self.conn.commit()
 78 |         except sqlite3.Error as e:
 79 |             print(e)
 80 |         finally:
 81 |             cursor.close()
 82 |             self.conn.commit()
 83 | 
 84 |     def count(self):
 85 |         pass
 86 | 
 87 | 
 88 | def create_tables(db):
 89 |     create_sql = '''
 90 |     create table {table_name}
 91 |     (
 92 |       id integer not null primary key autoincrement,
 93 |       magnet varchar(30) not null unique,
 94 |       torrent_name varchar(500),
 95 |       content text,
 96 |       create_date datetime(6)
 97 |     );
 98 |     '''
 99 |     drop_sql = '''
100 |     drop table {table_name};
101 |     '''
102 |     table_names = ['magnet_' + i for i in string.digits + string.ascii_lowercase]
103 |     exec_tables = []
104 | 
105 |     conn = sqlite3.connect(db)
106 |     cursor = conn.cursor()
107 |     try:
108 |         for name in table_names:
109 |             try:
110 |                 cursor.execute(create_sql.format(table_name=name))
111 |                 exec_tables.append(name)
112 |                 conn.commit()
113 |                 print('table {} created successful'.format(name))
114 |             except sqlite3.OperationalError as e:
115 |                 # logging.exception(e)
116 |                 if 'already exists' in str(e):
117 |                     print(e)
118 |                     continue
119 |                 else:
120 |                     raise Exception
121 |     except sqlite3.OperationalError as e:
122 |         logging.exception(e)
123 |     except KeyboardInterrupt:
124 |         for name in exec_tables:
125 |             cursor.execute(drop_sql.format(table_name=name))
126 |             conn.commit()
127 |     finally:
128 |         cursor.close()
129 |         conn.commit()
130 |         conn.close()
131 | 
132 | 
133 | if __name__ == '__main__':
134 |     # rc = RedisClient()
135 |     # magnets = rc.get(20)
136 |     # print(magnets)
137 |     # print(len(magnets))
138 |     create_tables('magnet.db')
139 | 
140 | 


--------------------------------------------------------------------------------
/magnet_crawler/parse_torrent.py:
--------------------------------------------------------------------------------
  1 | import bencoder
  2 | import json
  3 | from pprint import pprint
  4 | 
  5 | from magnet_crawler.database import SqliteClient, SQLITE_DATABASE_NAME
  6 | 
  7 | 
  8 | def parse_torrent():
  9 |     with open('../test.torrent', 'rb') as f:
 10 |         info = bencoder.bdecode(f.read())
 11 |     print(info.keys())
 12 |     # for key, value in info.get(b'info').items():
 13 |     #     print(key)
 14 |     #     print(value)
 15 | 
 16 |     with open('../test2.torrent', 'rb') as f:
 17 |         info = bencoder.bdecode(f.read())
 18 |     print(info.keys())
 19 |     print(info[b'nodes'])
 20 |     # for key, value in info.get(b'info').items():
 21 |     #     print(key)
 22 |     #     print(value)
 23 | 
 24 | 
 25 | class TorrentParser:
 26 |     def __init__(self, torrent):
 27 |         self.torrent = self.decode_torrent(torrent)  # all data is byte
 28 |         self.info = dict()
 29 |         self.encoding = self.torrent.get(b'encoding', b'utf-8').decode()
 30 |         self.sqlite3 = SqliteClient(SQLITE_DATABASE_NAME)
 31 | 
 32 |     def decode_torrent(self, torrent):
 33 |         with open(torrent, 'rb') as f:
 34 |             content = bencoder.bdecode(f.read())
 35 |         return content
 36 | 
 37 |     def get_creation_info(self):
 38 |         created_by = self.torrent.get(b'created by', b'').decode()
 39 |         creation_date = self.torrent.get(b'creation date', None)  # timestamp
 40 |         data = {'created by': created_by,
 41 |                 'creation date': creation_date}
 42 |         return data
 43 | 
 44 |     def is_dir(self):
 45 |         return b'files' in self.torrent.get(b'info').keys()
 46 | 
 47 |     def get_files_info(self):
 48 |         files = []
 49 |         # 如果没有 'files', 表示下载的是单文件, name 是文件名称
 50 |         # 如果有, 'name' 是文件夹名称, 文件全部在 'files' 里
 51 |         tor_info = self.torrent[b'info']
 52 |         # 作为种子文件的名称
 53 |         if b'name.utf-8' in tor_info.keys():
 54 |             name = tor_info.get(b'name.utf-8').decode()
 55 |         elif b'name' in tor_info.keys():
 56 |             name = self.decode_all(tor_info.get(b'name'), self.encoding)
 57 |         else:
 58 |             name = None
 59 |         # name = tor_info.get(b'name.utf-8', tor_info.get(b'name', b'').decode(self.encoding).encode()).decode()
 60 |         # print(name)
 61 |         self.info.update(name=name)
 62 | 
 63 |         if self.is_dir():
 64 |             for file in tor_info.get(b'files', []):
 65 |                 # TODO: 可能有多层文件夹
 66 |                 length = file.get(b'length', None)
 67 |                 if b'path.utf-8' in file.keys():
 68 |                     name = file.get(b'path.utf-8')[0].decode()
 69 |                 elif b'path' in file.keys():
 70 |                     # 处理多出的一层文件夹
 71 |                     if len(file.get(b'path')) == 1:
 72 |                         name = self.decode_all(file.get(b'path')[0], self.encoding)
 73 |                     else:
 74 |                         name = self.decode_all(file.get(b'path')[1], self.encoding)
 75 |                 else:
 76 |                     name = None
 77 |                 new_file = {
 78 |                     'length': length,
 79 |                     'name': name,
 80 |                 }
 81 | 
 82 |                 # print(new_file)
 83 |                 files.append(new_file)
 84 |         else:
 85 |             length = tor_info.get(b'length', None)
 86 |             if b'name.utf-8' in tor_info.keys():
 87 |                 name = tor_info.get(b'name.utf-8').decode()
 88 |             elif b'name' in tor_info.keys():
 89 |                 name = self.decode_all(tor_info.get(b'name'), self.encoding)
 90 |             else:
 91 |                 name = None
 92 |             file = {
 93 |                 'length': length,
 94 |                 'name': name,
 95 |             }
 96 |             files.append(file)
 97 | 
 98 |         return files
 99 | 
100 |     def decode_all(self, data, encoding=None):
101 |         encoding = encoding if encoding else self.encoding
102 |         dec_data = data
103 |         try:
104 |             dec_data = data.decode(encoding)
105 |         except UnicodeDecodeError as e:
106 |             if 'utf-8' in str(e):
107 |                 dec_data = data.decode('gbk')
108 |             elif 'gbk' in str(e):
109 |                 dec_data = data.decode('utf-8')
110 |         return dec_data
111 | 
112 |     def filter_file(self):
113 |         pass
114 | 
115 |     def get_torrent_info(self):
116 |         self.info.update(self.get_creation_info())
117 |         self.info.update(files=self.get_files_info())
118 |         return self.info
119 | 
120 | 
121 | if __name__ == '__main__':
122 |     # parse_torrent()
123 |     tor_parser = TorrentParser('torrents/7c2fa8f559d38e61e8f23d9a2b2728e11173948f.torrent')
124 |     # print(tor_parser.torrent[b'encoding'])
125 |     # print(tor_parser.torrent[b'info'][b'pieces'].decode())
126 |     # print(tor_parser.torrent[b'info'].keys())
127 |     # pprint(tor_parser.get_files_info())
128 |     pprint(tor_parser.get_torrent_info())
129 |     print(json.loads(json.dumps(tor_parser.get_torrent_info())))


--------------------------------------------------------------------------------
/magnet_crawler/crawler.py:
--------------------------------------------------------------------------------
  1 | import socket
  2 | import time
  3 | from bencoder import bdecode, bencode
  4 | from collections import deque
  5 | from multiprocessing import Process
  6 | from os import cpu_count
  7 | from threading import Thread
  8 | 
  9 | from magnet_crawler.database import RedisClient
 10 | from magnet_crawler.utils import get_random_id, parse_nodes, parse_info_hash, get_logger
 11 | 
 12 | BOOTSTRAP_NODES = [
 13 |     ("router.bittorrent.com", 6881),
 14 |     ("dht.transmissionbt.com", 6881),
 15 |     ("router.utorrent.com", 6881),
 16 | ]
 17 | MAX_NODES_SIZE = 10000
 18 | BUFSIZE = 10240
 19 | # 发送间隔时间
 20 | SLEEP_TIME = 1e-6
 21 | MAGNET_TEMPLATE = "magnet:?xt=urn:btih:{}"
 22 | SERVER_HOST = '0.0.0.0'
 23 | DEFAULT_SERVER_PORT = 10086
 24 | DEFAULT_SERVER_COUNT = cpu_count()
 25 | TIMER_WAIT_TIME = 60
 26 | 
 27 | 
 28 | class DHTNode:
 29 |     def __init__(self, nid, ip, port):
 30 |         self.nid = nid
 31 |         self.ip = ip
 32 |         self.port = port
 33 | 
 34 | 
 35 | class DHTServer:
 36 |     def __init__(self, bind_ip, bind_port, name):
 37 |         """
 38 | 
 39 |         :param bind_ip: 绑定的ip
 40 |         :param bind_port: 绑定的端口
 41 |         :param name: 该server的名字
 42 |         """
 43 |         # 自己也是一个node
 44 |         self.node = DHTNode(get_random_id(20), bind_ip, bind_port)
 45 |         # 使用udp
 46 |         self.udp_socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
 47 |         self.udp_socket.bind((bind_ip, bind_port))
 48 |         # 存放发现的nodes
 49 |         self.nodes = deque(maxlen=MAX_NODES_SIZE)
 50 |         self.magnets = set()
 51 |         self.redis_client = RedisClient()
 52 |         self.logger = get_logger(name)
 53 |         self.logger.info("I'am {}, I'm bound at port:{}.".format(name, bind_port))
 54 | 
 55 |     def join_dht(self):
 56 |         """从本地提供的节点加入 DHT 网络"""
 57 |         for addr in BOOTSTRAP_NODES:
 58 |             self.send_find_node_request(addr)
 59 |             time.sleep(SLEEP_TIME)
 60 | 
 61 |     def send_find_node_request(self, address, nid=None):
 62 |         """
 63 |         发送 find_node 请求，以便收到更多的 DHT 节点
 64 | 
 65 |         对发送数据的说明
 66 |         ===============
 67 | 
 68 |         't': transaction ID, 由请求节点产生，长度不定，目的是定位当前信息的唯一性，1byte-->对应2^8个请求
 69 | 
 70 |         'y': 'q', 代表当前是请求
 71 | 
 72 |         'q': 'find_node', 请求方法(get_peers, announce_peer)
 73 | 
 74 |         'a': {
 75 |                 'id': node_id, 请求节点的 id
 76 | 
 77 |                 'target': node_id, 正在查找的节点 id
 78 |             }
 79 | 
 80 |         """
 81 |         nid = nid if nid else self.node.nid
 82 |         tid = get_random_id(4)
 83 |         target = get_random_id(20)
 84 |         data = {
 85 |             't': tid,
 86 |             'y': 'q',
 87 |             'q': 'find_node',
 88 |             'a': {
 89 |                 'id': nid,
 90 |                 'target': target
 91 |             }
 92 |         }
 93 |         self.send_krpc(data, address)
 94 | 
 95 |     def send_krpc(self, data, address):
 96 |         """发送 krpc 信息"""
 97 |         self.logger.debug("I'm sending to {}".format(address))
 98 |         try:
 99 |             self.udp_socket.sendto(bencode(data), address)
100 |         except Exception:
101 |             self.logger.exception(Exception)
102 | 
103 |     def handle_receive_things(self, data, address):
104 |         """处理接收到的所有信息"""
105 |         try:
106 |             y = data.get(b'y')
107 |             # 关键字 y = 'r', 表示当前是回复
108 |             # y = 'q', 表示当前是请求
109 |             if y == b'r':
110 |                 if data.get(b'r'):
111 |                     self.handle_find_node_response(data)
112 |             elif y == b'q':
113 |                 # 关键字 q , 表示当前请求的方法名
114 |                 q = data.get(b'q')
115 |                 if q == b'get_peers':
116 |                     if data.get(b'a'):
117 |                         self.handle_get_peers_request(data, address)
118 |                 elif q == b'announce_peer':
119 |                     if data.get(b'a'):
120 |                         self.handle_announce_peer_request(data, address)
121 |         except KeyError:
122 |             pass
123 |             # self.logger.exception(KeyError)
124 | 
125 |     def handle_find_node_response(self, data):
126 |         """
127 |         处理 find_node 的回复
128 | 
129 |         ‘r': {
130 |                 'id': 发送方的 node id
131 | 
132 |                 'nodes': 离 target 最近的 k 个节点
133 |             }
134 |         """
135 |         self.logger.debug("I'm handling find_node_response")
136 |         try:
137 |             tid = data.get(b't')
138 |             nid = data.get(b'r').get(b'id')
139 |             nodes = data.get(b'r').get(b'nodes')
140 |             nodes = parse_nodes(nodes)
141 |             for node in nodes:
142 |                 nid, ip, port = node
143 |                 if len(nid) == 20 and ip != SERVER_HOST:
144 |                     self.nodes.append(DHTNode(nid, ip, port))
145 |         except KeyError:
146 |             pass
147 |             # self.logger.exception(KeyError)
148 | 
149 |     def handle_get_peers_request(self, data, address):
150 |         """
151 |         处理外部发来的 get_peers 请求，使用 info_hash 转为 magnet
152 | 
153 |         'a': {
154 |                 'id': node_id, 请求节点的 id
155 | 
156 |                 'info_hash': 请求的资源的 info_hash
157 |             }
158 |         """
159 |         self.logger.debug("I'm handling get_peers_request")
160 |         try:
161 |             tid = data.get(b't')
162 |             nid = data.get(b'a').get(b'id')
163 |             info_hash = data.get(b'a').get(b'info_hash')
164 |             magnet = parse_info_hash(info_hash)
165 |             # TODO: 储存 info_hash, 并回复
166 |             self.save_magnet(magnet)
167 |         except KeyError:
168 |             pass
169 |             # self.logger.exception(KeyError)
170 | 
171 |     def handle_announce_peer_request(self, data, address):
172 |         """
173 |         处理外部发来的 announce_peer 请求，使用 info_hash 转为 magnet
174 | 
175 |         'a': {
176 |                 'id': node_id, 请求节点的 id
177 | 
178 |                 'info_hash': 请求的资源的 info_hash
179 |             }
180 |         """
181 |         self.logger.debug("I'm handling announce_peer_request")
182 |         print(data)
183 |         try:
184 |             tid = data.get(b't')
185 |             nid = data.get(b'a').get(b'id')
186 |             info_hash = data.get(b'a').get(b'info_hash')
187 |             magnet = parse_info_hash(info_hash)
188 |             # TODO: 储存 info_hash, 并回复
189 |             self.save_magnet(magnet)
190 |         except KeyError:
191 |             pass
192 |             # self.logger.exception(KeyError)
193 | 
194 |     def receive_forever(self):
195 |         """一直接收外部发来的信息"""
196 |         self.logger.info('start receive forever...')
197 |         while True:
198 |             try:
199 |                 data, addr = self.udp_socket.recvfrom(BUFSIZE)
200 |                 self.handle_receive_things(bdecode(data), addr)
201 |             except Exception:
202 |                 pass
203 |                 # self.logger.exception(Exception)
204 | 
205 |     def send_forever(self):
206 |         """一直对外发送信息，即发送 find_node"""
207 |         self.logger.info('start send forever...')
208 |         while True:
209 |             try:
210 |                 node = self.nodes.popleft()
211 |                 self.send_find_node_request((node.ip, node.port), node.nid)
212 |                 time.sleep(SLEEP_TIME)
213 |             except IndexError:
214 |                 self.join_dht()
215 | 
216 |     def save_magnet(self, magnet):
217 |         self.logger.info(MAGNET_TEMPLATE.format(magnet))
218 |         self.magnets.add(MAGNET_TEMPLATE.format(magnet))
219 |         self.redis_client.add(MAGNET_TEMPLATE.format(magnet))
220 | 
221 |     def reporter(self):
222 |         """定时报告当前状况"""
223 |         while True:
224 |             time.sleep(TIMER_WAIT_TIME)
225 |             self.join_dht()
226 |             self.logger.info('当前有{}个节点, 有{}个磁力链接'.format(len(self.nodes), len(self.magnets)))
227 | 
228 | 
229 | def start_server(index=0, bind_port=DEFAULT_SERVER_PORT):
230 |     dht_s = DHTServer(SERVER_HOST, bind_port, 'SERVER{}'.format(index))
231 |     threads = [
232 |         Thread(target=dht_s.send_forever),
233 |         Thread(target=dht_s.receive_forever),
234 |         Thread(target=dht_s.reporter)
235 |     ]
236 | 
237 |     for t in threads:
238 |         t.start()
239 | 
240 |     for t in threads:
241 |         t.join()
242 | 
243 | 
244 | def start_multi_server(count=DEFAULT_SERVER_COUNT, origin_bind_port=DEFAULT_SERVER_PORT):
245 |     # signal.signal(signal.SIGINT, handler)
246 |     # signal.signal(signal.SIGTERM, handler)
247 | 
248 |     processes = []
249 |     try:
250 |         for i in range(count):
251 |             p = Process(target=start_server, args=(i, origin_bind_port + i,))
252 |             p.start()
253 |             processes.append(p)
254 | 
255 |         for p in processes:
256 |             p.join()
257 |     except KeyboardInterrupt:
258 |         print('退出')
259 | 
260 | 
261 | if __name__ == '__main__':
262 |     start_multi_server()
263 | 


--------------------------------------------------------------------------------
/magnet_crawler/magnet2torrent.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import tempfile
  4 | import time
  5 | import xmlrpc.client
  6 | from threading import Thread
  7 | 
  8 | import websocket
  9 | 
 10 | from magnet_crawler.database import RedisClient, REDIS_USED_KEY, REDIS_AVAIL_KEY, SqliteClient, SQLITE_DATABASE_NAME
 11 | from magnet_crawler.parse_torrent import TorrentParser
 12 | from magnet_crawler.utils import get_logger
 13 | 
 14 | RPC_SERVER = "http://localhost:6800/rpc"
 15 | RPC_WEBSOCKET = "ws://localhost:6800/jsonrpc"
 16 | RPC_SECRET = "token:abcdefg"
 17 | # aria2 下载timeout
 18 | BT_STOP_TIMEOUT = 600
 19 | # aria2 最大下载数量
 20 | MAX_DOWNLOADS = 32
 21 | # aria2 下载路径
 22 | DIR_PATH = os.path.abspath('./torrents')
 23 | # 从 redis 每次取出 magnet 的数量
 24 | FETCH_MAGNET_COUNT = 32
 25 | # 完成一轮下载的等待时间
 26 | WAITING_NEXT_TIME = 120
 27 | # 每次投入下载的等待时间
 28 | SINGLE_DOWNLOAD_WAIT_TIME = 10
 29 | 
 30 | 
 31 | def magnet_to_torrent(magnet):
 32 |     import libtorrent as lt
 33 |     ses = lt.session()
 34 |     tempdir = tempfile.mkdtemp()
 35 |     params = {
 36 |         'save_path': tempdir,
 37 |         'storage_mode': lt.storage_mode_t(2),
 38 |         'paused': False,
 39 |         'auto_managed': True,
 40 |         'duplicate_is_error': True
 41 |     }
 42 |     handle = lt.add_magnet_uri(ses, magnet, params)
 43 |     while not handle.has_metadata():
 44 |         try:
 45 |             time.sleep(1)
 46 |         except KeyboardInterrupt:
 47 |             print("Aborting...")
 48 |             ses.pause()
 49 |     ses.pause()
 50 |     print("Done")
 51 | 
 52 |     torinfo = handle.get_torrent_info()
 53 |     print(torinfo)
 54 |     torfile = lt.create_torrent(torinfo)
 55 | 
 56 |     output = os.path.abspath(torinfo.name() + ".torrent")
 57 | 
 58 | 
 59 | class Aria2MagnetConverter:
 60 |     def __init__(self, server, secret=None, **kwargs):
 61 |         self.client = xmlrpc.client.ServerProxy(server)
 62 |         self.secret = secret
 63 |         self.redis_client = RedisClient()
 64 |         self.logger = get_logger(kwargs.get('logger_name', 'ARIA2'))
 65 |         self.download_info = dict({'all': dict(),
 66 |                                    'start': dict(),
 67 |                                    'complete': dict(),
 68 |                                    'error': dict(),
 69 |                                    })
 70 |         self.sqlite = SqliteClient(SQLITE_DATABASE_NAME)
 71 |         if not os.path.exists(DIR_PATH):
 72 |             os.mkdir(DIR_PATH)
 73 | 
 74 |     def magnet_to_torrent(self, magnet, dir_path=None, **kwargs):
 75 |         # TODO: 从github拉取tracker
 76 |         ops = {
 77 |             'bt-metadata-only': 'true',  # 只下载种子
 78 |             'bt-stop-timeout': str(BT_STOP_TIMEOUT),  # 下载停止超时
 79 |             # 'bt-tracker': "udp://tracker.coppersurfer.tk:6969/announce",
 80 |         }
 81 |         if dir_path:
 82 |             ops.update(dir=dir_path)
 83 |         if kwargs:
 84 |             ops.update(kwargs)
 85 |         r = None
 86 |         try:
 87 |             r = self.client.aria2.addUri(self.secret, [magnet], ops)
 88 |             # client.aria2.addTorrent(RPC_SECRET, xmlrpc.client.Binary(open('../test2.torrent', 'rb').read()))
 89 |         except Exception:
 90 |             self.logger.exception(Exception)
 91 | 
 92 |         return r
 93 | 
 94 |     def magnet_to_torrent_forever(self):
 95 |         self.logger.warning(
 96 |             'set max-download={}, start to download torrent and store to database...'.format(MAX_DOWNLOADS))
 97 |         # TODO：查询当前下载数量，然后再投放下载任务
 98 |         global_ops = {
 99 |             'max-concurrent-downloads': str(MAX_DOWNLOADS),
100 |         }
101 |         try:
102 |             if self.secret:
103 |                 r = self.client.aria2.changeGlobalOption(self.secret, global_ops)
104 |             else:
105 |                 r = self.client.aria2.changeGlobalOption(global_ops)
106 |             if r != 'OK':
107 |                 raise Exception('设置失败')
108 |         except Exception:
109 |             self.logger.exception(Exception)
110 | 
111 |         while True:
112 |             for mgn in self.get_magnets(FETCH_MAGNET_COUNT):
113 |                 gid = self.magnet_to_torrent(mgn, DIR_PATH)
114 |                 self.logger.info('sending  <{}>  <gid, {}>'.format(mgn.decode(), gid))
115 |                 self.save_magnet(mgn, REDIS_USED_KEY)
116 |                 self.download_info.get('all').update({gid: mgn})
117 |                 time.sleep(SINGLE_DOWNLOAD_WAIT_TIME)
118 |             time.sleep(WAITING_NEXT_TIME)
119 | 
120 |     def get_magnets(self, count):
121 |         return self.redis_client.get(count)
122 | 
123 |     def save_magnet(self, magnet, key):
124 |         self.redis_client.add(magnet, key)
125 | 
126 |     def receive_aria2_notifications(self):
127 |         socket_client = websocket.WebSocket()
128 |         socket_client.connect(RPC_WEBSOCKET)
129 |         # jsonreq = json.dumps({'jsonrpc': '2.0', 'id': 'qwer',
130 |         #                       'method': 'aria2.addUri',
131 |         #                       'params': [RPC_SECRET, ['http://example.org/file'], {}],
132 |         #                       })
133 |         # r = socket_client.send(jsonreq)
134 |         # return an int
135 |         while True:
136 |             resp = socket_client.recv()
137 |             resp = json.loads(resp)
138 |             self.handle_aria2_notifications(resp)
139 | 
140 |     def handle_aria2_notifications(self, data):
141 |         # {'jsonrpc': '2.0', 'method': 'aria2.onDownloadStart', 'params': [{'gid': '88d5dff6df0c610f'}]}
142 |         # print(data)
143 |         method = data.get('method')
144 |         gid = data.get('params')[0].get('gid')
145 |         # sleep 可以保证 magnet_to_torrent 把 <gid, magnet> 储存在 download_info 里
146 |         # 保证了下面可以正确取到 magnet
147 |         time.sleep(1)
148 |         magnet = self.download_info['all'].get(gid, b'')
149 |         # 如果是之前就有的下载任务, 直接去查找下载信息得到 magnet
150 |         if not magnet:
151 |             magnet = self.extract_magnet_from_status(gid)
152 |         if method == 'aria2.onDownloadStart':
153 |             self.logger.info('start  <{}>  <gid, {}>'.format(magnet.decode(), gid))
154 |             self.download_info.get('start').update({gid: magnet})
155 |             # 加入已使用 magnet
156 |             self.redis_client.add(magnet, REDIS_USED_KEY)
157 |         elif method == 'aria2.onDownloadComplete':
158 |             self.logger.info('complete  <{}>  <gid, {}>'.format(magnet.decode(), gid))
159 |             self.download_info.get('complete').update({gid: magnet})
160 |             # 加入可用 magnet
161 |             self.redis_client.add(magnet, REDIS_AVAIL_KEY)
162 |             # 储存到数据库
163 |             self.save_to_sqlite(magnet.decode())
164 |         elif method in ['aria2.onDownloadError', 'aria2.onDownloadStop']:
165 |             self.logger.warning('error  <{}>  <gid, {}>'.format(magnet.decode(), gid))
166 |             self.download_info.get('error').update({gid: magnet})
167 |             # 因为 stop 的时候，aria2重启时还会开始这个任务，所以要主动删除信息
168 |             self.remove_download_result(gid)
169 |         else:
170 |             pass
171 | 
172 |     def remove_download_result(self, gid):
173 |         r = self.client.aria2.removeDownloadResult(RPC_SECRET, gid)
174 |         if r != 'OK':
175 |             self.logger.warning('没有成功删除下载信息！')
176 | 
177 |     def purge_download_result(self):
178 |         r = self.client.aria2.purgeDownloadResult(RPC_SECRET)
179 |         if r != 'OK':
180 |             self.logger.warning('没有成功清除所有下载信息！')
181 | 
182 |     def save_to_sqlite(self, magnet):
183 |         if not magnet:
184 |             return
185 |         torrent = os.path.join(DIR_PATH, magnet[-40:] + '.torrent')
186 |         if os.path.exists(torrent):
187 |             self.logger.info('save {} to database'.format(magnet))
188 |             parser = TorrentParser(torrent)
189 |             data = parser.get_torrent_info()
190 |             self.sqlite.insert(magnet, data)
191 |         else:
192 |             self.logger.error('不存在该文件 {}'.format(torrent))
193 | 
194 |     def extract_magnet_from_status(self, gid):
195 |         r = self.client.aria2.tellStatus(RPC_SECRET, gid, ['infoHash'])
196 |         info_hash = r.get('infoHash', None)
197 |         magnet = None
198 |         if info_hash:
199 |             magnet = 'magnet:?xt=urn:btih:' + info_hash.upper()
200 |             magnet = magnet.encode()
201 |         # 也可以这样
202 |         # r = self.client.aria2.getFiles(RPC_SECRET, gid)[0]
203 |         # path = r.get('path', None)
204 |         # magnet = None
205 |         # if path:
206 |         #     magnet = 'magnet:?xt=urn:btih:' + path[-40:]
207 |         #     magnet = magnet.encode()
208 |         return magnet
209 | 
210 | 
211 | def start_magnet_converter():
212 |     converter = Aria2MagnetConverter(RPC_SERVER, secret=RPC_SECRET)
213 |     threads = [
214 |         Thread(target=converter.magnet_to_torrent_forever),
215 |         Thread(target=converter.receive_aria2_notifications),
216 |     ]
217 | 
218 |     for t in threads:
219 |         t.start()
220 | 
221 |     for t in threads:
222 |         t.join()
223 | 
224 | 
225 | if __name__ == '__main__':
226 |     start_magnet_converter()
227 | 


--------------------------------------------------------------------------------