├── .gitignore ├── requirements.txt ├── app ├── nico │ ├── config.py │ ├── __init__.py │ └── crawler.py ├── gelbooru │ ├── config.py │ ├── __init__.py │ └── crawler.py ├── twitter │ ├── config.py │ ├── model.py │ ├── __init__.py │ └── crawler.py ├── pixiv │ ├── config.py │ ├── __init__.py │ ├── pixiv_auth.py │ ├── crawler.py │ └── model.py └── models.py ├── config.py ├── run.sh ├── run_crawlers.py ├── README.md └── main.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | **/*.log 3 | test*.py 4 | **/*.sqlite3 5 | *.sqlite3-shm 6 | *.sqlite3-wal 7 | *.out 8 | # config.py 9 | # **/config.py 10 | # run.sh 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # pre 2 | python-telegram-bot==20.0a2 3 | aiohttp 4 | python-dotenv 5 | pytz 6 | lxml 7 | xmltodict 8 | tortoise-orm 9 | pydantic 10 | tweepy 11 | pixivpy-async -------------------------------------------------------------------------------- /app/nico/config.py: -------------------------------------------------------------------------------- 1 | # 要推送的图片 tags 2 | NICO_TAGS = ['a', 'b'] 3 | 4 | # user_sess 同名 cookie,推送原图时需要 5 | NICO_USER_SESS = '' 6 | 7 | # 爬取页数 8 | NICO_PAGE_NUM = 1 9 | 10 | # 用户黑名单,该名单内的用户的作品不会被推送 11 | NICO_USER_BLACKLIST = [] 12 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # bot token 2 | TOKEN = 'xxxx' 3 | 4 | # 消息发送失败的最大重试次数 5 | MESSAGE_MAX_RETRY = 12 6 | 7 | # group/user/channel id 列表 8 | CHAD_ID_LIST = ['1234567'] 9 | 10 | # 代理地址(本地使用需要设置) 11 | PROXY = None # 'http://127.0.0.1:1081' 12 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | cd /root/telegram-bot/th-telegram-image-bot 2 | LOCK="LOCK" 3 | 4 | if [ ! 
-f "$LOCK" ]; then 5 | touch $LOCK 6 | python run_crawlers.py 7 | sleep 1 8 | python main.py 9 | rm -rf $LOCK 10 | else 11 | ts=`stat -c %Y LOCK` 12 | now=`date +%s` 13 | if [ $[ $now - $ts ] -gt 1800 ]; then 14 | rm -rf $LOCK 15 | echo "Lock expired, deleted" 16 | fi 17 | fi 18 | -------------------------------------------------------------------------------- /app/gelbooru/config.py: -------------------------------------------------------------------------------- 1 | # 推送图片的 tags 2 | GELBOORU_TAGS = ['a', 'b'] 3 | 4 | # 爬取页数 5 | GELBOORU_PAGE_NUM = 1 6 | 7 | # 图片质量 8 | # file_url: 原图 9 | # sample_url: 压缩图 10 | GELBOORU_IMAGE_QUALITY = 'file_url' # or sample_url 11 | 12 | # 因为 Gelbooru 限制了 API 请求次数(目测是 110 次每天),所以需要减少请求次数 13 | # 每隔多少分钟请求 Gelbooru,取值为 1 - 59 14 | REQ_INTERVAL = 20 15 | 16 | # API KEY 17 | # 在 https://gelbooru.com/index.php?page=account&s=options 中 18 | API_KEY = '' 19 | USER_ID = '' -------------------------------------------------------------------------------- /app/twitter/config.py: -------------------------------------------------------------------------------- 1 | # 目标列表Id 2 | # 关于 LIST 参考:https://help.twitter.com/en/using-twitter/twitter-lists 3 | TWITTER_LIST_ID = ['123456789'] 4 | # 爬取页数 5 | TWITTER_PAGE_NUM = 5 6 | # 只推送含图片的动态 7 | TWITTER_ONLY_IMAGE = True 8 | 9 | # KEY 相关,生成 Authentication Tokens 10 | # https://developer.twitter.com/en/portal/dashboard 11 | # 获取方法: 12 | # https://developer.twitter.com/en/docs/authentication/oauth-2-0/bearer-tokens 13 | TWITTER_API_KEY = '' 14 | TWITTER_API_KEY_SECRET = '' 15 | TWITTER_ACCESS_TOKEN = '' 16 | TWITTER_ACCESS_SECRET = '' 17 | -------------------------------------------------------------------------------- /app/twitter/model.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from datetime import datetime 3 | 4 | from typing import List 5 | 6 | 7 | class TimeLine(BaseModel): 8 | author_name: str 9 | created_at: datetime 10 | text: str 11 | id: str 12 | photos: List[str] 13 | 14 | def __str__(self): 15 | author = self.author_name 16 | created_at = self.created_at 17 | text = self.text 18 | id = self.id 19 | 20 | return id + '\n\n' + 'author: ' + author + '\n' + 'created_at(utc+8):\n' + str(created_at) + '\n\n' + text 21 | -------------------------------------------------------------------------------- /app/pixiv/config.py: -------------------------------------------------------------------------------- 1 | # 必须配置 PIXIV_TAGS 和 PIXIV_REFRESH_TOKEN,其他配置可默认 2 | 3 | # 推送图片的 tags 4 | PIXIV_TAGS = [ 5 | 'a', 6 | { 7 | 'tag': 'b', 8 | 'score': 1000 9 | } 10 | ] 11 | 12 | # refresh token 13 | # 获取方法:https://gist.github.com/ZipFile/c9ebedb224406f4f11845ab700124362 14 | PIXIV_REFRESH_TOKEN = 'dF8jpliqu0UlM519k09ICNLQQYnbLL6MBO9XuqDGEBPFAC' 15 | 16 | # 用户白名单,填入 uid,在该名单内的用户的作品会被立即推送(无视分数) 17 | PIXIV_USER_WHITELIST = [] 18 | 19 | # 每个 tag 爬取的最大页数 20 | PIXIV_MAX_PAGE = 1 21 | 22 | # 推送的投稿的最小分数,用于排除低质量的投稿 23 | # 计算公式 view + bookmark * 5 24 | PIXIV_MIN_POST_SCORE = 300 25 | 26 | # 是否推送关注的作者 27 | PIXIV_FOLLOW = False 28 | 29 | # 反代 i.pixiv.re,目前不需要 30 | PIXIV_REVERSE_PROXY = '' 31 | -------------------------------------------------------------------------------- /app/twitter/__init__.py: -------------------------------------------------------------------------------- 1 | from app.models import ImageDB, ImagePD 2 | from config import PROXY 3 | from .config import * 4 | 5 | from .crawler import TwitterListCrawler 6 | 7 | api_config = { 8 | 'api_key': TWITTER_API_KEY, 
9 | 'api_key_secret': TWITTER_API_KEY_SECRET, 10 | 'access_token': TWITTER_ACCESS_TOKEN, 11 | 'access_secret': TWITTER_ACCESS_SECRET 12 | } 13 | 14 | 15 | async def run(): 16 | crawler = TwitterListCrawler(proxy=PROXY, **api_config) 17 | for list_id in TWITTER_LIST_ID: 18 | for timeline in crawler.get_timeline(list_id, pages=TWITTER_PAGE_NUM, only_img=TWITTER_ONLY_IMAGE): 19 | dct = { 20 | 'original_site': 'twitter', 21 | 'original_id': timeline.id, 22 | 'pic_hash_list': timeline.photos 23 | } 24 | 25 | if not await ImageDB.filter(**dct): 26 | pd = ImagePD(content=str(timeline), **dct) 27 | await ImageDB.create(**pd.dict()) 28 | 29 | yield pd 30 | -------------------------------------------------------------------------------- /run_crawlers.py: -------------------------------------------------------------------------------- 1 | """ 2 | 运行全部爬虫,入库数据 3 | """ 4 | import asyncio 5 | 6 | from tortoise import Tortoise, run_async 7 | 8 | from app.gelbooru import run as run_gel 9 | from app.nico import run as run_nico 10 | from app.pixiv import run as run_pixiv 11 | from app.twitter import run as run_twitter 12 | 13 | 14 | async def init_db(): 15 | db_url = 'sqlite://db.sqlite3' 16 | 17 | await Tortoise.init( 18 | db_url=db_url, 19 | modules={'models': ['app.models']} 20 | ) 21 | 22 | await Tortoise.generate_schemas() 23 | 24 | 25 | funcs = { 26 | 'Gelbooru': run_gel, 27 | 'Nico': run_nico, 28 | 'Twitter': run_twitter, 29 | 'Pixiv': run_pixiv 30 | } 31 | 32 | 33 | async def main(): 34 | await init_db() 35 | 36 | for name, func in funcs.items(): 37 | print('=' * 30) 38 | print(f'开始执行 {name} 爬虫') 39 | try: 40 | async for img in func(): 41 | print(img) 42 | except Exception as e: 43 | print(f'{name} 爬虫失败:', e) 44 | print('=' * 30) 45 | print() 46 | 47 | 48 | if __name__ == '__main__': 49 | run_async(main()) 50 | -------------------------------------------------------------------------------- /app/gelbooru/__init__.py: -------------------------------------------------------------------------------- 1 | from app.models import ImageDB, ImagePD 2 | from config import PROXY 3 | from datetime import datetime 4 | 5 | from .config import * 6 | from .crawler import GelbooruCrawler 7 | 8 | 9 | def parse_item(item, tag): 10 | content = f'[Gel #{tag}]\nid={item["id"]}' 11 | pics = [item[GELBOORU_IMAGE_QUALITY]] 12 | return ImagePD(content=content, pic_hash_list=pics, original_site='gelbooru', original_id=item['id']) 13 | 14 | 15 | async def run(): 16 | minute = datetime.now().minute 17 | 18 | if minute % REQ_INTERVAL != 0: 19 | return 20 | 21 | async with GelbooruCrawler(proxy=PROXY, api_key=API_KEY, user_id=USER_ID) as crawler: 22 | for tag in GELBOORU_TAGS: 23 | async for data in crawler.get_many_pages(tag, begin=0, end=GELBOORU_PAGE_NUM): 24 | for item in data['post']: 25 | pd = parse_item(item, tag) 26 | dct = pd.dict() 27 | if not await ImageDB.filter(original_site=pd.original_site, original_id=pd.original_id): 28 | await ImageDB.create(**dct) 29 | yield pd 30 | -------------------------------------------------------------------------------- /app/models.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from pydantic import BaseModel 4 | from tortoise import fields, models 5 | 6 | 7 | class ImageDB(models.Model): 8 | id = fields.IntField(pk=True) 9 | content = fields.CharField(255) 10 | pic_hash_list = fields.JSONField() 11 | original_site = fields.CharField(max_length=10, index=True) 12 | original_id = 
fields.CharField(max_length=25, index=True) 13 | send_at = fields.DatetimeField(auto_now_add=True) 14 | send_successed = fields.BooleanField(default=False) 15 | retry = fields.IntField(default=0) 16 | reason = fields.CharField(max_length=50, default='') 17 | 18 | def __str__(self): 19 | return f'{self.content}' 20 | 21 | def __repr__(self): 22 | return f'{self.content}\n\n{self.original_site} - {self.original_id}\n{ self.send_at}\n{self.send_successed}' 23 | 24 | # pydantic creator doesn't work on pyright lint, so just written a pydantic model of ImageDB 25 | 26 | 27 | class ImagePD(BaseModel): 28 | content: str 29 | pic_hash_list: List[str] 30 | original_site: str 31 | original_id: str 32 | 33 | def __str__(self): 34 | pic_number = len(self.pic_hash_list) 35 | return self.content + f'\npic number: {pic_number}\n' 36 | 37 | # TODO: 添加一个新表/字段,实现记录推送成功的人/群的ID,以避免多消息推送其中部分失败无法重发的问题 38 | -------------------------------------------------------------------------------- /app/pixiv/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from app.models import ImageDB, ImagePD 3 | 4 | from config import PROXY 5 | 6 | from .config import * 7 | from .crawler import PixivCrawler 8 | from .model import Illust 9 | 10 | 11 | async def run(): 12 | illust_list: List[Illust] = [] 13 | 14 | async with PixivCrawler(PIXIV_REFRESH_TOKEN, PROXY) as crawler: 15 | for tag_info in PIXIV_TAGS: 16 | min_score = PIXIV_MIN_POST_SCORE 17 | tag = tag_info 18 | 19 | if isinstance(tag_info, dict): 20 | tag, min_score = tag_info['tag'], tag_info['score'] 21 | 22 | async for illust in crawler.search_illust(tag, PIXIV_MAX_PAGE): 23 | if illust.score >= min_score or (illust.user_id in PIXIV_USER_WHITELIST): 24 | illust_list.append(illust) 25 | 26 | if PIXIV_FOLLOW: 27 | async for illust in crawler.craw_follow(PIXIV_MAX_PAGE): 28 | illust_list.append(illust) 29 | 30 | for illust in illust_list: 31 | dct = { 32 | 'original_site': 'pixiv', 33 | 'original_id': illust.illust_id, 34 | 'pic_hash_list': illust.images 35 | } 36 | 37 | if not await ImageDB.filter(**dct): 38 | pd = ImagePD(content=str(illust)[:254], **dct) 39 | await ImageDB.create(**pd.dict()) 40 | yield pd 41 | -------------------------------------------------------------------------------- /app/pixiv/pixiv_auth.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from typing import Optional 3 | 4 | 5 | import requests 6 | 7 | # Latest app version can be found using GET /v1/application-info/android 8 | USER_AGENT = "PixivAndroidApp/5.0.234 (Android 11; Pixel 5)" 9 | REDIRECT_URI = "https://app-api.pixiv.net/web/v1/users/auth/pixiv/callback" 10 | LOGIN_URL = "https://app-api.pixiv.net/web/v1/login" 11 | AUTH_TOKEN_URL = "https://oauth.secure.pixiv.net/auth/token" 12 | CLIENT_ID = "MOBrBDS8blbauoSck0ZfDbtuzpyT" 13 | CLIENT_SECRET = "lsACyCD94FhDUtGTXi3QzcFE2uU1hqtDaKeqrdwj" 14 | 15 | 16 | def auth_token_response(response): 17 | data = response.json() 18 | 19 | try: 20 | access_token = data["access_token"] 21 | refresh_token = data["refresh_token"] 22 | return access_token, refresh_token 23 | except KeyError: 24 | print("error:") 25 | pprint(data) 26 | exit(1) 27 | 28 | 29 | def refresh(refresh_token: str, proxy = ''): 30 | response = requests.post( 31 | AUTH_TOKEN_URL, 32 | data={ 33 | "client_id": CLIENT_ID, 34 | "client_secret": CLIENT_SECRET, 35 | "grant_type": "refresh_token", 36 | "include_policy": "true", 37 | "refresh_token": 
refresh_token, 38 | }, 39 | headers={"User-Agent": USER_AGENT}, 40 | proxies={ 41 | 'http': proxy, 42 | 'https': proxy 43 | } 44 | ) 45 | 46 | return auth_token_response(response) 47 | -------------------------------------------------------------------------------- /app/gelbooru/crawler.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import aiohttp 4 | from aiohttp import TCPConnector 5 | 6 | 7 | class GelbooruCrawler(object): 8 | def __init__(self, proxy: Optional[str] = '', api_key: Optional[str] = '', user_id: Optional[str] = ''): 9 | self.proxy = proxy 10 | self.api_key = api_key 11 | self.user_id = user_id 12 | self.session = aiohttp.ClientSession(connector=TCPConnector(ssl=False)) 13 | 14 | async def __aenter__(self): 15 | return self 16 | 17 | async def close(self): 18 | await self.session.close() 19 | 20 | async def __aexit__(self, ex1, e2, e3): 21 | await self.close() 22 | 23 | async def get_one_page(self, tags: str, page: int, limit: int = 42): 24 | api = 'https://gelbooru.com/index.php?page=dapi&s=post&q=index&json=1' 25 | 26 | params = { 27 | 'tags': tags, 28 | 'pid': page, 29 | 'limit': limit, 30 | 'api_key': self.api_key, 31 | 'user_id': self.user_id 32 | } 33 | 34 | headers = { 35 | 'user-agent': 'wasp', 36 | 'cookie': 'fringeBenefits=yup' 37 | } 38 | 39 | async with self.session.get(api, params=params, headers=headers, proxy=self.proxy) as resp: 40 | return await resp.json() 41 | 42 | async def get_many_pages(self, tags: str, begin: int, end: int, limit: int = 42): 43 | for page in range(begin, end): 44 | yield await self.get_one_page(tags, page, limit) 45 | -------------------------------------------------------------------------------- /app/nico/__init__.py: -------------------------------------------------------------------------------- 1 | from app.models import ImageDB, ImagePD 2 | from config import PROXY 3 | 4 | from .config import * 5 | from .crawler import NSCrawler 6 | 7 | 8 | def parse_message(info): 9 | username = info['nickname'] 10 | title = info['title'] 11 | description = info['description'][:150] 12 | created = '投稿时间' + info['created'] 13 | sid = info['id'] 14 | tags = ['#' + tag['name'] for tag in info['tag_list']['tag']] 15 | 16 | content = 'im' + sid + '\n' + title + '\n' 17 | content += '投稿者: ' + username + '\n' 18 | content += description + '\n\n' 19 | content += ' | '.join(tags) + '\n\n' + created 20 | 21 | return content 22 | 23 | 24 | async def run(): 25 | async with NSCrawler(NICO_USER_SESS, proxy=PROXY) as crawler: 26 | for tag in NICO_TAGS: 27 | async for imlist in crawler.get_many_pages(tag, 1, NICO_PAGE_NUM): 28 | for im in imlist: 29 | dct = { 30 | 'original_site': 'nico', 31 | 'original_id': im, 32 | } 33 | 34 | if not await ImageDB.filter(**dct): 35 | info = await crawler.get_info(im) 36 | 37 | if info['user_id'] in NICO_USER_BLACKLIST: # type: ignore 38 | continue 39 | 40 | content = parse_message(info)[:250] 41 | pd = ImagePD(content=content, pic_hash_list=[], **dct) 42 | await ImageDB.create(**pd.dict()) 43 | 44 | yield pd 45 | -------------------------------------------------------------------------------- /app/pixiv/crawler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from random import randint 3 | 4 | from pixivpy_async import AppPixivAPI, PixivClient 5 | 6 | from .model import parse_illust 7 | from .pixiv_auth import refresh 8 | 9 | 10 | class PixivCrawler(object): 11 | def __init__(self, 
refresh_token, proxy='') -> None: 12 | self.client = PixivClient(proxy=proxy) 13 | self.aapi = AppPixivAPI(client=self.client.start()) 14 | 15 | ACCESS_TOKEN, _ = refresh(refresh_token, proxy) 16 | self.aapi.set_auth(ACCESS_TOKEN, refresh_token) 17 | 18 | async def __aenter__(self): 19 | return self 20 | 21 | async def __aexit__(self, ex, e1, e2): 22 | await self.client.close() 23 | 24 | async def search_illust(self, tag: str, max_page=3): 25 | """抓取指定 Tag 的内容并返回一个生成器 26 | :param max_page: 抓取的最大页数 27 | """ 28 | offset = 0 29 | page_idx = 1 30 | 31 | while page_idx <= max_page: 32 | detail = await self.aapi.search_illust(tag, offset=offset) 33 | 34 | offset += len(detail['illusts']) 35 | page_idx += 1 36 | 37 | for item in detail['illusts']: 38 | illust = parse_illust(item) 39 | yield illust 40 | 41 | # if page_idx <= max_page: 42 | # await asyncio.sleep(randint(1, 3)) 43 | 44 | async def craw_follow(self, max_page=3): 45 | for page in range(max_page): 46 | for illust in (await self.aapi.illust_follow(offset=page * 30))['illusts']: 47 | yield parse_illust(illust) 48 | 49 | # if page != max_page - 1: 50 | # await asyncio.sleep(randint(1, 3)) 51 | -------------------------------------------------------------------------------- /app/pixiv/model.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class Illust(BaseModel): 7 | title: str 8 | description: str 9 | illust_id: str 10 | tags: List[str] 11 | create_date: str 12 | user_id: str 13 | username: str 14 | images: List[str] 15 | score: int 16 | 17 | def __str__(self): 18 | pid = f'Pixiv ID: {self.illust_id}\n\n' 19 | title = f'{self.title}\n' if self.title else '(无题)' 20 | description = f'{self.description}\n\n' if self.description else '' 21 | poster = f'投稿者(id={self.user_id}): {self.username}\n\n' 22 | tags = f'{" | ".join(self.tags)}\n\n' 23 | post_at = f'投稿时间: {self.create_date}' 24 | 25 | return ''.join([pid, title, description, poster, tags, post_at]) 26 | 27 | 28 | def parse_illust(illust: dict) -> Illust: 29 | title = illust['title'] 30 | description = illust['caption'][:50] 31 | illust_id = illust['id'] 32 | tags = ['#' + tag['name'] for tag in illust['tags']] 33 | create_date = illust['create_date'] 34 | 35 | view = illust['total_view'] 36 | bookmark = illust['total_bookmarks'] 37 | 38 | user = illust['user'] 39 | user_id = user['id'] 40 | name = user['name'] 41 | 42 | meta_single_page = illust['meta_single_page'] 43 | meta_pages = illust['meta_pages'] 44 | 45 | if 'original_image_url' in meta_single_page: 46 | images = [meta_single_page['original_image_url']] 47 | else: 48 | images = [image['image_urls']['original'] for image in meta_pages] 49 | 50 | illust_model = Illust(**{ 51 | 'title': title, 52 | 'description': description, 53 | 'illust_id': illust_id, 54 | 'tags': tags, 55 | 'create_date': create_date, 56 | 'user_id': user_id, 57 | 'username': name, 58 | 'images': images, 59 | 'score': view + bookmark * 5 60 | }) 61 | 62 | return illust_model 63 | -------------------------------------------------------------------------------- /app/twitter/crawler.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | import os 3 | 4 | import tweepy 5 | 6 | from .model import TimeLine 7 | 8 | """ 9 | from datetime import timedelta, datetime 10 | import pytz 11 | utc8now = datetime.now(tz=pytz.timezone( 12 | 'Asia/Shanghai')).replace(tzinfo=None) 13 | 14 | 
date_interval = utc8now - \ 15 | timedelta(minutes=config.INTERVAL) 16 | 17 | date_interval = date_interval.replace(tzinfo=None) 18 | 19 | if date_interval > created_at: 20 | continue 21 | 22 | for media in entities['media']: 23 | if media['type'] == 'photo': 24 | photos.append(media['media_url_https'] + ':orig') 25 | 26 | 27 | async def parse_message(): 28 | timeline = await get_formatted_timeline() 29 | 30 | for item in timeline: 31 | author_name = item.author_name 32 | created_at = item.created_at 33 | text = item.text 34 | id = item.id 35 | photos = item.photos 36 | 37 | content = ('author: ' + author_name + '\n' + 38 | 'created_at(utc+8):\n' + str(created_at) + '\n\n' + 39 | text + '\n' + 'id=' + id + '\n' + 40 | ''.join([f'[CQ:image,file={photo}]' for photo in photos]) 41 | ) 42 | 43 | yield content, item 44 | """ 45 | 46 | 47 | def parse_timeline_item(item, only_img): 48 | author_name = item.author.name 49 | # 转为 utc+8 50 | created_at = item.created_at + timedelta(hours=8) 51 | created_at = created_at.replace(tzinfo=None) 52 | text = item.text 53 | id = item.id_str 54 | photos = [] 55 | 56 | has_entities = hasattr(item, 'extended_entities') 57 | 58 | # 如果没有 entities 59 | if not has_entities: 60 | # 且指定只返回包含图片的 twitter,直接结束函数 61 | if only_img: 62 | return 63 | # 否则赋值一个空 entities 64 | else: 65 | entities = {'media': []} 66 | else: 67 | entities = item.extended_entities 68 | 69 | for media in entities['media']: 70 | if media['type'] == 'photo': 71 | photos.append(media['media_url_https']) 72 | 73 | if not photos: 74 | return 75 | 76 | return TimeLine(author_name=author_name, 77 | created_at=created_at, text=text, id=id, photos=photos) 78 | 79 | 80 | class TwitterListCrawler(object): 81 | def __init__(self, api_key, api_key_secret, access_token, access_secret, proxy='') -> None: 82 | self.api_key = api_key 83 | self.api_key_secret = api_key_secret 84 | self.access_token = access_token 85 | self.access_secret = access_secret 86 | self.proxy = proxy 87 | 88 | def get_timeline(self, list_id: str, count=200, pages=1, include_rts=False, only_img=True): 89 | """获取 count 条列表时间线数据,支持翻页""" 90 | if self.proxy: 91 | os.environ['http_proxy'] = self.proxy 92 | os.environ['https_proxy'] = self.proxy 93 | 94 | auth = tweepy.OAuthHandler(self.api_key, self.api_key_secret) 95 | auth.set_access_token(self.access_token, self.access_secret) 96 | 97 | api = tweepy.API(auth) 98 | 99 | for data in tweepy.Cursor(api.list_timeline, list_id=list_id, count=count, include_rts=include_rts, include_entities={'extended_entities': True}).pages(pages): 100 | for item in data: 101 | twitter = parse_timeline_item(item, only_img) 102 | 103 | if twitter: 104 | yield twitter 105 | -------------------------------------------------------------------------------- /app/nico/crawler.py: -------------------------------------------------------------------------------- 1 | # import logging 2 | from typing import List, Optional 3 | 4 | from aiohttp import ClientSession, TCPConnector 5 | from lxml.html import fromstring 6 | 7 | # logging.Logger = 8 | 9 | def get_id(sid: str): 10 | if sid.startswith('im'): 11 | sid = sid[2:] 12 | 13 | return sid 14 | 15 | 16 | class NSCrawler(): 17 | def __init__(self, user_sess: Optional[str] = '', session: Optional[ClientSession] = None, proxy: Optional[str] = None): 18 | self.session = session or ClientSession( 19 | connector=TCPConnector(ssl=False)) 20 | self.proxy = proxy 21 | self.headers = { 22 | # 'Host': 'seiga.nicovideo.jp', 23 | 'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64)' 24 
| ' AppleWebKit/537.36 (KHTML, like Gecko) ' 25 | 'Chrome/87.0.4280.141 Safari/537.36'), 26 | # image_search_sort item 指定按投稿时间排序 27 | 'Cookie': f'user_session={user_sess}; sort_search_image_keyword=image_created;image_search_sort=image_created;' 28 | } 29 | 30 | async def __aenter__(self): 31 | return self 32 | 33 | async def close(self): 34 | await self.session.close() 35 | 36 | async def __aexit__(self, ex1, e2, e3): 37 | await self.close() 38 | 39 | async def request(self, url): 40 | async with self.session.get(url, proxy=self.proxy, headers=self.headers) as resp: 41 | if 'content-type' in resp.headers: 42 | ctype = resp.headers['content-type'] 43 | else: 44 | # print(resp.headers) 45 | ctype = '' 46 | 47 | if 'application/json' in ctype: 48 | return await resp.json() 49 | 50 | return await resp.text() 51 | 52 | async def get_one_page(self, tags: str, pn: int) -> List[str]: 53 | """ 54 | :return: 55 | [1, 2, ...] 56 | """ 57 | 58 | url = f'https://seiga.nicovideo.jp/tag/{tags}?page={pn}' 59 | 60 | try: 61 | html = await self.request(url) 62 | es = fromstring(html) 63 | im_xpath = '//li[@class="list_item list_no_trim2"]/a/@href' 64 | 65 | return [item.replace('/seiga/im', '') for item in es.xpath(im_xpath)] 66 | except Exception as e: 67 | # logging.error(f'[NICO] {tags} 请求第 {pn} 页数据出错:', e) 68 | return [] 69 | 70 | async def get_many_pages(self, tags: str, begin: int, end: int): 71 | if end == begin: 72 | end += 1 73 | 74 | for pn in range(begin, end): 75 | im_list = await self.get_one_page(tags, pn) 76 | 77 | if not im_list: 78 | return 79 | 80 | yield im_list 81 | 82 | async def get_info(self, sid: str): 83 | id = get_id(sid) 84 | 85 | api = f'https://sp.seiga.nicovideo.jp/ajax/seiga?id={id}' 86 | 87 | json_data = await self.request(api) 88 | 89 | if 'errors' in json_data: 90 | # logging.error('[NICO]%s 请求图片信息出错: %s', id, json_data['errors']) 91 | return [] 92 | 93 | return json_data['target_image'] 94 | 95 | async def get_source_url(self, sid: str): 96 | id = get_id(sid) 97 | # 如果用户未登录默认使用缩略图 98 | source_url = f'https://lohas.nicoseiga.jp/thumb/{id}i' 99 | api = f'https://seiga.nicovideo.jp/image/source?id={id}' 100 | 101 | async with self.session.get(api, headers=self.headers, proxy=self.proxy, allow_redirects=False) as resp: 102 | if 'location' in resp.headers: 103 | location = resp.headers['location'] 104 | if 'lohas.nicoseiga.jp' in location: 105 | source_url = location.replace('/o/', '/priv/') 106 | 107 | return source_url 108 | 109 | async def get_tag_list(self, sid: str): 110 | id = get_id(sid) 111 | api = f'https://seiga.nicovideo.jp/ajax/illust/tag/list?id={id}' 112 | json_data = await self.request(api) 113 | 114 | if 'errors' in json_data: 115 | # logging.error('[NICO]%s 请求 tag list 失败: %s', id, json_data['errors']) 116 | return [] 117 | 118 | return json_data['tag_list'] 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Telegram 图片推送 bot 2 | 3 | **支持多种网站推送** 4 | - [Pixiv](https://www.pixiv.net/) 5 | - [Nico Seiga](http://seiga.nicovideo.jp/) 6 | - [Twitter](https://twitter.com/home) 7 | - [Gelbooru](https://gelbooru.com/) 8 | 9 | **其它功能** 10 | * 消息自动重发 11 | * 当图片数量过多时支持图片分段发送 12 | * 判断图片大小以选择不同的消息类型 13 | 14 | ## 开始 15 | 16 | ### 0. 依赖 17 | 18 | 1. 需要 Python3.7+ 19 | 2. 运行 `pip install -r requirements.txt` 安装依赖 20 | 3. 一个能用的 TG 账号 21 | 22 | ### 1. 创建bot 23 | 24 | 1. 
前往 [BotFather](https://core.telegram.org/bots#3-how-do-i-create-a-bot) 创建自己的bot 25 | 2. 获取 BotFather 给出的 token 26 | 27 | ### 2. 推送配置 28 | 29 | **获取 chat id** 30 | 31 | 参考:[Get the Telegram channel ID (github.com)](https://gist.github.com/mraaroncruz/e76d19f7d61d59419002db54030ebe35) 32 | 33 | 如果运行 bot 时提示 chat not found,可以试试不加 `-100` 前缀。 34 | 35 | **推送前需要把bot加入到你的频道/群,也可以私聊推送,具体取决于传入的 chat id 是哪一种。** 36 | 37 | 修改项目根目录的全局配置 `vim config.py` 38 | 39 | ```python 40 | # bot token 41 | TOKEN = 'xxxx' 42 | 43 | # 消息发送失败时的重试次数 44 | MESSAGE_MAX_RETRY = 12 45 | 46 | # group/user/channel id 列表 47 | CHAD_ID_LIST = ['1234567'] 48 | 49 | # 代理地址(本地使用需要设置) 50 | PROXY = None # 'http://127.0.0.1:1081' 51 | ``` 52 | 53 | > 推送网站均为可选,可根据实际需求配置自己需要推送的网站 54 | 55 | **Gelbooru 配置** 56 | 57 | `vim app/gelbooru/config.py` 58 | 59 | ```python 60 | # 推送图片的 tags 61 | GELBOORU_TAGS = ['a', 'b'] 62 | 63 | # 爬取页数 64 | GELBOORU_PAGE_NUM = 1 65 | 66 | # 图片质量 67 | # file_url: 原图 68 | # sample_url: 压缩图 69 | GELBOORU_IMAGE_QUALITY = 'file_url' # or sample_url 70 | 71 | # 因为 Gelbooru 限制了 API 请求次数(目测是 24 次每天),所以需要减少请求次数 72 | # 当前时间的分钟数 % REQ_INTERVAL == 0 的时候执行爬虫 73 | # e.g.: REQ_INTERVAL = 20,表示每个小时的 0、20、40 分钟会执行爬虫 74 | # 具体能不能执行取决于 cron 的执行时间 75 | REQ_INTERVAL = 20 76 | 77 | # API KEY(可选;如果被限制请求频率使用 API_KEY 可解除) 78 | # 在 https://gelbooru.com/index.php?page=account&s=options 中 79 | API_KEY = '' 80 | USER_ID = '' 81 | ``` 82 | 83 | **Pixiv 配置** 84 | 85 | `vim app/pixiv/config.py` 86 | 87 | ```python 88 | # 必须配置 PIXIV_TAGS 和 PIXIV_REFRESH_TOKEN,其他配置可默认 89 | 90 | # 推送图片的 tags 91 | PIXIV_TAGS = ['a', 'b'] 92 | 93 | # refresh token 94 | # 获取方法:https://gist.github.com/ZipFile/c9ebedb224406f4f11845ab700124362 95 | PIXIV_REFRESH_TOKEN = 'dF8jpliqu0UlM519k09ICNLQQYnbLL6MBO9XuqDGEBPFAC' 96 | 97 | # 用户白名单,填入 uid,在该名单内的用户的作品会被立即推送(无视分数) 98 | PIXIV_USER_WHITELIST = [] 99 | 100 | # 每个 tag 爬取的最大页数 101 | PIXIV_MAX_PAGE = 1 102 | 103 | # 推送的投稿的最小分数,用于排除低质量的投稿 104 | # 计算公式 view + bookmark * 5 105 | PIXIV_MIN_POST_SCORE = 300 106 | 107 | # 是否推送关注的作者 108 | PIXIV_FOLLOW = False 109 | 110 | # 反代 i.pixiv.re,目前不需要 111 | PIXIV_REVERSE_PROXY = '' 112 | ``` 113 | 114 | **Twitter 配置** 115 | 116 | `vim app/twitter/config.py` 117 | 118 | ```python 119 | # 目标列表Id 120 | # 关于 LIST 参考:https://help.twitter.com/en/using-twitter/twitter-lists 121 | TWITTER_LIST_ID = ['123456789'] 122 | # 爬取页数 123 | TWITTER_PAGE_NUM = 5 124 | # 只推送含图片的动态 125 | TWITTER_ONLY_IMAGE = True 126 | 127 | # KEY 相关,生成 Authentication Tokens 128 | # https://developer.twitter.com/en/portal/dashboard 129 | # 获取方法: 130 | # https://developer.twitter.com/en/docs/authentication/oauth-2-0/bearer-tokens 131 | TWITTER_API_KEY = '' 132 | TWITTER_API_KEY_SECRET = '' 133 | TWITTER_ACCESS_TOKEN = '' 134 | TWITTER_ACCESS_SECRET = '' 135 | ``` 136 | 137 | **N静配置** 138 | 139 | `vim app/nico/config.py` 140 | 141 | ```python 142 | # 要推送的图片 tags 143 | NICO_TAGS = ['a', 'b'] 144 | 145 | # user_sess 同名 cookie,推送原图时需要 146 | NICO_USER_SESS = '' 147 | 148 | # 爬取页数 149 | NICO_PAGE_NUM = 1 150 | 151 | # 用户黑名单,该名单内的用户的作品不会被推送 152 | NICO_USER_BLACKLIST = [] 153 | ``` 154 | 155 | ### 3. 运行 156 | 157 | 1. `python run_crawlers.py` 运行爬虫 158 | 2. `python main.py` 发送消息 159 | 160 | 服务器端可设置 cron 定时运行,只需编辑 `run.sh` 第一行 cd 的目录为项目在服务器的目录,然后 `sh 项目目录/run.sh` 即可。 161 | 162 | **run.sh** 163 | 164 | 提供一个运行脚本,该脚本先执行图片爬取功能,然后执行推送,并且实现脚本单例运行。 165 | 166 | ```bash 167 | # 修改这行为你项目的根目录地址 168 | cd /root/telegram-bot/th-telegram-image-bot 169 | 170 | LOCK="LOCK" 171 | 172 | if [ ! 
-f "$LOCK" ]; then 173 | touch $LOCK 174 | python run_crawlers.py 175 | sleep 1 176 | python main.py 177 | rm -rf $LOCK 178 | else 179 | ts=`stat -c %Y LOCK` 180 | now=`date +%s` 181 | if [ $[ $now - $ts ] -gt 1800 ]; then 182 | rm -rf $LOCK 183 | echo "Lock expired, deleted" 184 | fi 185 | fi 186 | 187 | ``` 188 | 189 | ## 4. 配置本地 API server(可选) 190 | 191 | 配置 TG 的本地 API server,可以把上传图片的大小提升至 2000MB,因此发送图片大于 10 MB 的消息时不会发送文件形式的图片。 192 | 193 | [官方地址](https://github.com/tdlib/telegram-bot-api) 中提供了安装方法,不过这里推荐用 docker 安装,更加方便且不会污染服务器环境。 194 | 195 | 如果你还没安装 docker,去 [官方文档](https://docs.docker.com/) 找自己系统的安装方式,之后使用 [docker 镜像安装](https://hub.docker.com/r/aiogram/telegram-bot-api),可以看到容器需要两个环境变量,具体的获取方式在文档中有写,需要注意申请 API Key 的时候关闭 AdBlock 之类的插件,以避免申请失败。 196 | 197 | ## TODOs 198 | 199 | 1. 只发送发送失败的消息 - group message 发送失败视为整体发送失败 200 | 2. 只有一张表 201 | 3. 爬虫提供一个 yield 方法获取来获取数据 202 | 4. 爬虫 yield 的 model 应该符合数据表的 model 203 | 5. 发送成功 204 | 6. 可在爬虫 insert 部分添加时间判断 205 | 206 | 数据库表结构 - 同 qq 推送bot -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | from typing import List, Optional, Union 5 | import imghdr 6 | 7 | import httpx 8 | from telegram import Bot, InputMediaDocument, InputMediaPhoto, InputFile 9 | from telegram.error import BadRequest, RetryAfter, TimedOut 10 | from tortoise import run_async 11 | from tortoise.expressions import F 12 | 13 | from app.models import ImageDB 14 | from app.nico import config as nico_config 15 | from app.nico.crawler import NSCrawler 16 | from app.pixiv.config import PIXIV_REVERSE_PROXY 17 | from config import * 18 | from run_crawlers import init_db 19 | 20 | 21 | logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 22 | level=logging.INFO) 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | logger.setLevel(logging.INFO) 27 | 28 | 29 | # 发现 tg 有提供常量... 
30 | FILE_MAX_SIZE = 50
31 | HTTP_FILE_MAXSIZE = 5
32 | DOWNLOAD_FILE_MAXSIZE = 10
33 | PER_MESSAGE_MAX_IMAGE_COUNT = 10
34 | 
35 | 
36 | if PROXY:
37 |     os.environ['HTTP_PROXY'] = PROXY
38 |     os.environ['HTTPS_PROXY'] = PROXY
39 | 
40 | 
41 | def get_headers(url: str):
42 |     headers = {}
43 | 
44 |     if url.find('i.pximg.net') != -1:
45 |         headers['Referer'] = 'https://app-api.pixiv.net/'
46 | 
47 |     return headers
48 | 
49 | 
50 | def get_filesizeMB(url: str):
51 |     # 可以添加返回文件类型功能
52 |     proxies = PROXY or None
53 |     headers = get_headers(url)
54 | 
55 |     try:
56 |         resp = httpx.head(url, headers=headers, proxies=proxies)
57 |     except (httpx.TimeoutException, TimeoutError):  # httpx timeouts do not inherit the builtin TimeoutError, so catch both
58 |         logger.error('获取文件大小请求超时')
59 |         return -1
60 | 
61 |     if 'content-length' in resp.headers:
62 |         length = int(resp.headers['content-length'])
63 |         return round(length / 1024 / 1024, 2)
64 |     else:
65 |         return -1
66 | 
67 | 
68 | def get_file_size_type(url: str):
69 |     filesize = get_filesizeMB(url)
70 | 
71 |     if filesize < HTTP_FILE_MAXSIZE:
72 |         return 'photo'
73 |     elif filesize < DOWNLOAD_FILE_MAXSIZE:
74 |         return 'download'
75 |     elif filesize > HTTP_FILE_MAXSIZE and filesize < FILE_MAX_SIZE:
76 |         return 'document'
77 |     elif filesize >= FILE_MAX_SIZE:
78 |         return 'exceed'
79 | 
80 | 
81 | def download_media(url: str):
82 |     headers = get_headers(url)
83 | 
84 |     return httpx.get(url, headers=headers, timeout=30).read()
85 | 
86 | 
87 | def get_media_list(urls: List[str], caption):
88 |     """获取 PhotoMediaList 或者 DocumentMediaList
89 |     当所有文件大小 < 5MB 时,全为 PhotoMedia
90 |     当有文件大小 >= 5MB 且 < 50 MB 时,全为 DocumentMedia
91 |     当有文件大小 >= 50MB 时,跳过该文件
92 |     """
93 |     media_list = []
94 |     section = 1
95 |     document = False
96 | 
97 |     for idx, url in enumerate(urls):
98 |         # only the first photo of each PER_MESSAGE_MAX_IMAGE_COUNT-sized chunk gets the caption
99 |         title = None
100 |         # idx = 0, true; idx = 10, true; idx = 20, true; ...
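        # NOTE: Telegram appears to show an album-level caption only when exactly one
        # item in a media group carries a caption, so just the first photo of every
        # chunk is captioned (plus a SECTION counter when the album has to be split).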
101 | # 第 idx + 1 张图片 102 | if idx % PER_MESSAGE_MAX_IMAGE_COUNT == 0: 103 | title = caption 104 | if len(urls) > PER_MESSAGE_MAX_IMAGE_COUNT: 105 | title = title + '\n\n' + f'SECTION: {section}' 106 | section += 1 107 | 108 | ft = get_file_size_type(url) 109 | 110 | if ft == 'exceed': 111 | continue 112 | elif ft == 'document': 113 | document = True 114 | 115 | filename = None 116 | 117 | if document: 118 | filename = url[url.rfind('/') + 1:] 119 | 120 | media = { 121 | 'media': url, 122 | 'caption': title, 123 | 'filename': filename 124 | } 125 | 126 | media_list.append(media) 127 | 128 | media_method = InputMediaDocument if document else InputMediaPhoto 129 | 130 | def List2InputMedia(li): 131 | return media_method(**li) 132 | 133 | media_list = list(map(List2InputMedia, media_list)) 134 | 135 | for idx in range(0, len(media_list), PER_MESSAGE_MAX_IMAGE_COUNT): 136 | yield media_list[idx:idx+PER_MESSAGE_MAX_IMAGE_COUNT] 137 | 138 | 139 | async def do_send_message(bot: Bot, chat_id: str, photos, reply_message_id: int = None, retry=1): 140 | timeout = { 141 | 'read_timeout': 30, 142 | 'write_timeout': 30, 143 | 'connect_timeout': 30, 144 | 'pool_timeout': 30 145 | } 146 | 147 | if retry > 6: 148 | return 149 | 150 | try: 151 | # type: ignore 152 | return await bot.send_media_group(chat_id, photos, reply_to_message_id=reply_message_id, **timeout) 153 | except RetryAfter as e: 154 | after = e.retry_after 155 | logger.info('发送消息过于频繁,将于 %s 秒后进行第 %s 尝试' % (after, retry)) 156 | await asyncio.sleep(after) 157 | return await do_send_message(bot, chat_id, photos, reply_message_id, retry + 1) 158 | except BadRequest as e: 159 | exstr = str(e) 160 | errors = ['wrong file', 'wrong type', 161 | 'photo_invalid_dimensions', 162 | 'failed to get http url content', 163 | 'image_process_failed'] 164 | 165 | logger.info('Bad Request %s', e) 166 | 167 | if list(filter(lambda e: exstr.find(e) != -1, errors)): 168 | downloaded_photos = [] 169 | logger.info('发送图片失败,尝试下载后发送 %s', retry) 170 | 171 | for photo in photos: 172 | obj = photo.media 173 | caption = photo.caption 174 | media_obj = None 175 | filename = None 176 | 177 | if isinstance(photo.media, str): 178 | url = photo.media 179 | obj = download_media(photo.media) 180 | filename = url[url.rfind('/') + 1:] 181 | elif isinstance(photo.media, bytes): 182 | filename = 'wth.' 
+ str(imghdr.what('', photo.media)) 183 | 184 | if isinstance(photo, InputMediaDocument) or exstr.find('photo_invalid_dimensions') != -1: 185 | # filename arg doesn't work 186 | media_obj = InputMediaDocument(obj, caption=caption) 187 | elif isinstance(photo, InputMediaPhoto): 188 | media_obj = InputMediaPhoto(obj, caption=caption) 189 | 190 | if media_obj: 191 | downloaded_photos.append(media_obj) 192 | 193 | return await do_send_message(bot, chat_id, downloaded_photos, reply_message_id, retry + 1) 194 | else: 195 | raise 196 | 197 | 198 | async def send_message(bot: Bot, chat_id, message: str, urls: Optional[List[str]] = None, document=False, download=False): 199 | """发送消息,多块消息只要有一个被发送成功则视整个消息发送成功""" 200 | 201 | if not urls: 202 | await bot.send_message(chat_id, message) 203 | return 204 | 205 | reply_message_id = None 206 | for data in get_media_list(urls, message): 207 | try: 208 | msg_objs = await do_send_message(bot, chat_id, data, reply_message_id) 209 | if msg_objs: 210 | reply_message_id = msg_objs[0].id 211 | except Exception as e: 212 | logger.error('下载失败 %s' % e) 213 | message_with_error = str( 214 | message) + '\n\n发送图片失败: TG 无法处理图片 URL,请点击下面的链接访问原图。\n' + '\n'.join(urls) + '\n\n' + str(e) 215 | 216 | msg_objs = await bot.send_message(chat_id, message_with_error) 217 | 218 | if msg_objs: 219 | reply_message_id = msg_objs.id 220 | 221 | 222 | async def preprocess_message(message: ImageDB) -> List[str]: 223 | """预处理数据库数据 224 | :return: img list""" 225 | if message.original_site == 'nico': 226 | async with NSCrawler(nico_config.NICO_USER_SESS, proxy=PROXY) as crw: 227 | img_list = [await crw.get_source_url(message.original_id)] 228 | elif message.original_site == 'pixiv': 229 | if PIXIV_REVERSE_PROXY: 230 | img_list = list(map(lambda url: url.replace( 231 | 'i.pximg.net', PIXIV_REVERSE_PROXY), message.pic_hash_list)) 232 | else: 233 | img_list = message.pic_hash_list 234 | else: 235 | img_list = message.pic_hash_list 236 | 237 | return img_list 238 | 239 | 240 | async def send_message_and_update_db(bot: Bot, chat_id: str, message: ImageDB): 241 | """包装发送消息方法并更新数据库""" 242 | img_list = await preprocess_message(message) 243 | 244 | # 图片超过 60 张直接退出(避免消息太长出错) 245 | if len(img_list) > 60: 246 | return 247 | 248 | original_site = message.original_site 249 | original_id = message.original_id 250 | orm = ImageDB.filter( 251 | original_site=original_site, original_id=original_id) 252 | 253 | # 发送时标记该消息已被发送 254 | await orm.update(send_successed=True) 255 | 256 | logger.info(f'{original_site} {original_id} 开始发送') 257 | 258 | reason = '' 259 | 260 | try: 261 | await send_message(bot, chat_id, str(message), img_list) 262 | except TimedOut: 263 | reason = 'time out' 264 | 265 | if reason: 266 | await orm.update(retry=F('retry') + 1, send_successed=False, reason=reason) 267 | logger.info('发送失败: %s', reason) 268 | else: 269 | await orm.update(retry=F('retry') + 1, send_successed=True, reason='') 270 | logger.info('发送成功') 271 | 272 | print() 273 | 274 | 275 | async def main(): 276 | logger.info('开始启动推送程序...') 277 | 278 | await init_db() 279 | bot = Bot(TOKEN) 280 | 281 | async with bot: 282 | logger.info('启动完成!') 283 | 284 | async for message in ImageDB.filter(send_successed=False, retry__lt=MESSAGE_MAX_RETRY): 285 | for chat_id in CHAD_ID_LIST: 286 | await send_message_and_update_db(bot, chat_id, message) 287 | 288 | 289 | if __name__ == '__main__': 290 | run_async(main()) 291 | --------------------------------------------------------------------------------