├── .gitignore ├── LICENSE ├── README.md ├── reachee-example.json ├── reachee.py ├── reachee.service ├── sender ├── telegram.py ├── telegram_summary.py └── webhook.py └── vpnfix.py /.gitignore: -------------------------------------------------------------------------------- 1 | .reachee 2 | *.json 3 | !*-example.json 4 | __pycache__/ 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 2 | Version 2, December 2004 3 | 4 | Copyright (C) 2004 Sam Hocevar 5 | 6 | Everyone is permitted to copy and distribute verbatim or modified 7 | copies of this license document, and changing it is allowed as long 8 | as the name is changed. 9 | 10 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 11 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 12 | 13 | 0. You just DO WHAT THE FUCK YOU WANT TO. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reachee 2 | 3 | 吉林大学通知爬虫,发送到 Telegram 频道、webhook 或自定义目的地。 4 | 5 | 以 WTFPL 授权开源。 6 | 7 | ## 使用说明 8 | 9 | 需要 Python 3.6+ ,先 `pip3 install requests beautifulsoup4` 。 10 | 11 | 请参照 `reachee-example.json` 建立配置文件 `reachee.json` 。 12 | 13 | 支持: 14 | - 使用直接连接或 VPNS 连接 OA 15 | - 发送格式化的通知到 Telegram 会话 16 | - 以 JSON 或 form 形式发送通知到 Webhook 17 | - 跳过标题含有关键字的通知 18 | - 隐去含有关键字的通知内容 19 | - 指定获取其他通知频道 20 | - 通过 `reachee.service` 使用 systemd 管理 21 | 22 | ## 联系 23 | 24 | 欢迎开 issue 、pr ,或者到 [Telegram@JLULUG](https://t.me/JLULUG) 转转。 25 | -------------------------------------------------------------------------------- /reachee-example.json: -------------------------------------------------------------------------------- 1 | { 2 | "daemon": true, 3 | "debug": false, 4 | "interval": 300, 5 | "channel": 179577, 6 | "vpns": { 7 | "account": "zhaoyy2119", 8 | "password": "PASSWORD" 9 | }, 10 | "senders": { 11 | "telegram": [{ 12 | "token": "1089092646:", 13 | "chat": "@JLUNews", 14 | "maxlength": 1000, 15 | "skip": [], 16 | "censor": ["先进技术研究院"] 17 | }, 18 | { 19 | "token": "1089092646:", 20 | "chat": "@JLUNewsFiltered", 21 | "maxlength": 1000, 22 | "skip": ["聘", "任免", "任职", "招生宣传", "讲话精神", "申报项目公示", "技术转移公示"], 23 | "censor": ["先进技术研究院"] 24 | } 25 | ], 26 | "telegram_summary": [{ 27 | "token": "1089092646:", 28 | "chat": "@JLUNewsSummary", 29 | "skip": ["聘", "任免", "任职", "招生宣传", "讲话精神", "申报项目公示", "技术转移公示"] 30 | }], 31 | "webhook": [{ 32 | "url": "http://localhost/post_api_or_whatever", 33 | "json": true 34 | }] 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /reachee.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | Please create reachee.json from reachee-example.json 6 | or specifie config file in command line like 7 | 8 | python3 reachee.py /path/to/config.json 9 | ''' 10 | CONFIG = 'reachee.json' 11 | 12 | import re 13 | import json 14 | import logging as log 15 | from sys import argv 16 | from time import sleep 17 | from importlib import import_module 18 | import requests 19 | import bs4 20 | 21 | # 0 load config 22 | if len(argv)<2: argv.append(CONFIG) 23 | c = { 24 | 'daemon': True, 25 | 'debug': False, 26 | 'interval': 300, 27 | 'channel': 179577, 28 | 'senders': {} 29 | } 30 | c.update(json.load(open(argv[1]))) 31 | 32 | # 0 set logging 33 | log.basicConfig( 34 | format='%(asctime)s %(levelname)s %(message)s', 35 | level=(log.DEBUG if c['debug'] else log.INFO) 36 | ) 37 | log.debug(f'Config: {c}') 38 | 39 | # 0 load post record 40 | try: 41 | with open('.reachee','r') as f: 42 | posted = eval(f.read()) 43 | if not isinstance(posted, list): raise Exception 44 | except: 45 | log.error('Posted records not found!') 46 | posted = [] 47 | 48 | # 0 set variables 49 | baseLAN = 'https://oa.jlu.edu.cn/defaultroot' 50 | baseVPN = 'https://vpn.jlu.edu.cn/https/44696469646131313237446964696461a579b2620fdde512c84ea96fd9/defaultroot' 51 | baseURL = baseVPN if ('vpns' in c) else baseLAN 52 | probing = 1 if posted else 0 53 | page = 1 54 | 55 | # 0 main loop 56 | log.warning('Started') 57 | while True: 58 | try: 59 | log.info(f'Checking page {page}...') 60 | s = requests.Session() 61 | 62 | # 1 login to vpns if required 63 | if 'vpns' in c: s.post('https://vpn.jlu.edu.cn/do-login', data={ 64 | 'auth_type': 'local', 65 | 'username': c['vpns']['account'], 66 | 'password': c['vpns']['password'] 67 | }, timeout=5) 68 | 69 | # 1 fetch channel posts list 70 | r = s.get(f'{baseURL}/PortalInformation!jldxList.action?channelId={c["channel"]}&startPage={page}', timeout=5) 71 | # 1 match all links 72 | posts = bs4.BeautifulSoup(r.content, 'html.parser').find_all(name='a', class_='font14') 73 | # 1 extract post id 74 | posts = list(map((lambda x : int(re.search(r'id=(\d+)',x['href'])[1])), posts)) 75 | posts.reverse() 76 | # 1 reorder posted records 77 | posted = [x for x in posted if x not in posts] + [x for x in posted if x in posts] 78 | # 1 filter posts against posted records 79 | posts = [x for x in posts if x not in posted] 80 | log.debug(f'Posts: {posts}') 81 | 82 | # 1 probing logic 83 | if probing and posts: 84 | if page<10: 85 | log.info(f'[Probing] page {page} have news, getting earlier...') 86 | page = page+1 87 | continue 88 | else: 89 | log.info(f'[Probing] reached maximum probing page(10)') 90 | probing = False 91 | 92 | # 1 process posts 93 | for pid in posts: 94 | log.info(f'New Post: {pid}') 95 | # 2 fetch post content 96 | r = s.get(f'{baseURL}/PortalInformation!getInformation.action?id={pid}', timeout=5) 97 | # 2 extract post text 98 | dom = bs4.BeautifulSoup(r.content, 'html.parser').find(class_='content') 99 | log.debug(f'DOM: {dom}') 100 | title = dom.find(class_='content_t').text 101 | log.info(f'Title: {title}') 102 | time = dom.find(class_='content_time').contents[0].strip() 103 | dept = dom.find(class_='content_time').find('span').text 104 | def innerText(tag): 105 | if not isinstance(tag, bs4.element.Tag): return str(tag) 106 | if tag.name in ['style', 'script']: return '' 107 | result = ''.join([ innerText(x) for x in tag.contents ]) 108 | if tag.name in ['p', 'br', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']: 109 | result = f'\n{result}\n' 110 | return re.sub(r'(\s*\n\s*)+', '\n', result) 111 | content = innerText(dom.find(class_='content_font')).strip() 112 | # 2 dispatch to senders 113 | for (name, configs) in c['senders'].items(): 114 | for config in configs: 115 | import_module(f'sender.{name}').send(config, { 116 | 'pid': pid, 117 | 'title': title, 118 | 'time': time, 119 | 'dept': dept, 120 | 'content': content, 121 | 'linkLAN': f'{baseLAN}/PortalInformation!getInformation.action?id={pid}', 122 | 'linkVPN': f'{baseVPN}/PortalInformation!getInformation.action?id={pid}' 123 | }) 124 | # 2 save post record 125 | posted = (posted+[pid])[-100:] 126 | try: 127 | with open('.reachee','w') as f: 128 | f.write(repr(posted)) 129 | except: 130 | log.error('Unable to write record file!') 131 | 132 | # 1 catch-up logic 133 | if page>1: 134 | log.info(f'[Catch-Up] finished with page {page}, moving on...') 135 | page = page-1 136 | else: 137 | if not c['daemon']: exit(0) 138 | sleep(c['interval']) 139 | 140 | except requests.exceptions.RequestException as e: 141 | log.info('Network error') 142 | log.info(repr(e)) 143 | sleep(5) 144 | except Exception as e: 145 | log.error('Unexpected error') 146 | log.error(repr(e)) 147 | sleep(60) 148 | -------------------------------------------------------------------------------- /reachee.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Reachee 3 | After=network.target 4 | 5 | [Service] 6 | Type=simple 7 | User=user 8 | ExecStart=/usr/bin/env python3 /home/user/Reachee/reachee.py 9 | WorkingDirectory=/home/user/Reachee 10 | Restart=on-failure 11 | 12 | [Install] 13 | WantedBy=multi-user.target 14 | -------------------------------------------------------------------------------- /sender/telegram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import logging as log 6 | from time import sleep 7 | from html import escape 8 | import requests 9 | 10 | class FloodException(Exception): pass 11 | 12 | def send(config, post): 13 | # prepare config 14 | c = {'token': '', 'chat': '', 'maxlength': 1000, 'skip': [], 'censor': []} 15 | c.update(config) 16 | if not c['token'] or not c['chat']: 17 | raise Exception('[Telegram] missing parameter') 18 | 19 | # check key words 20 | if any((x in post['title']) for x in c['skip']): 21 | log.info('[Telegram] skip word hit') 22 | return 23 | if any((x in post['content']) for x in c['censor']): 24 | log.info('[Telegram] censor word hit') 25 | post['content'] = '' 26 | 27 | # fixes email addresses and links 28 | post['content'] = re.sub(r'([!-~]+\@[!-~]+)', ' \\1 ', post['content']) 29 | post['content'] = re.sub(r'(https?://[!-~]+)', '\\1 ', post['content']) 30 | 31 | # form message 32 | html = f'{escape(post["title"])}\n' 33 | html += f'{post["time"]} #{post["dept"]}\n' 34 | html += f'校内链接 VPN链接\n\n' 35 | html += f'{escape(post["content"])}' 36 | if len(html) > c['maxlength']: html = html[:c['maxlength']] + '...' 37 | log.debug(f'[Telegram] html: {html}') 38 | 39 | # call HTTP API 40 | while True: 41 | try: 42 | r = requests.post('https://api.telegram.org/bot'+c['token']+'/sendMessage', json={ 43 | 'chat_id': c['chat'], 44 | 'text': html, 45 | 'parse_mode': 'HTML', 46 | 'disable_web_page_preview': True 47 | }, timeout=5) 48 | log.debug(f'[Telegram] response: {r.text}') 49 | if not r.json()['ok']: 50 | if r.json()['error_code'] == 429: 51 | raise FloodException() 52 | else: 53 | raise Exception(r.json()['error_code']) 54 | break 55 | except FloodException: 56 | log.warning('[Telegram] hit rate limit!') 57 | sleep(30) 58 | except requests.exceptions.RequestException as e: 59 | log.info('[Telegram] network error') 60 | log.info(repr(e)) 61 | sleep(5) 62 | except Exception as e: 63 | log.error('[Telegram] unknown error') 64 | log.error(repr(e)) 65 | sleep(60) 66 | -------------------------------------------------------------------------------- /sender/telegram_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import atexit 6 | import logging as log 7 | from time import sleep 8 | from html import escape 9 | from datetime import date 10 | import requests 11 | 12 | def flush(config): 13 | # prepare config 14 | c = {'token': '', 'chat': '', 'caption': '\'通知摘要\''} 15 | c.update(config) 16 | if not c['token'] or not c['chat']: 17 | raise Exception('[Telegram Summary] missing parameter') 18 | 19 | c['_buffer'] = f'{date.today().strftime("%Y%m%d")} {eval(c["caption"])}\n\n' + c['_buffer'] 20 | while True: 21 | try: 22 | r = requests.post('https://api.telegram.org/bot'+c['token']+'/sendMessage', json={ 23 | 'chat_id': c['chat'], 24 | 'text': c['_buffer'], 25 | 'parse_mode': 'HTML', 26 | 'disable_web_page_preview': True 27 | }, timeout=5) 28 | log.debug(f'[Telegram Summary] response: {r.text}') 29 | if not r.json()['ok']: raise Exception(r.json()['error_code']) 30 | break 31 | except requests.exceptions.RequestException as e: 32 | log.info('[Telegram Summary] network error') 33 | log.info(repr(e)) 34 | sleep(5) 35 | except Exception as e: 36 | log.error('[Telegram Summary] unknown error') 37 | log.error(repr(e)) 38 | sleep(60) 39 | 40 | def send(config, post): 41 | # check key words 42 | if any((x in post['title']) for x in config.get('skip', [])): 43 | log.info('[Telegram Summary] skip word hit') 44 | return 45 | 46 | # form message 47 | html = f'{escape(post["title"])} [OA / VPN]\n{post["time"][5:]} {post["dept"]}\n\n' 48 | log.debug(f'[Telegram Summary] html: {html}') 49 | 50 | # save buffer 51 | if '_buffer' not in config: atexit.register(flush, config) 52 | config['_buffer'] = config.get('_buffer', '') + html 53 | -------------------------------------------------------------------------------- /sender/webhook.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging as log 5 | from time import sleep 6 | import requests 7 | 8 | def send(config, post): 9 | c = {'url': ''} 10 | c.update(config) 11 | if not c['url']: 12 | raise Exception('[Webhook] missing parameter') 13 | 14 | while True: 15 | try: 16 | if c.get('json', False): 17 | r = requests.post(c['url'], json=post, timeout=10) 18 | else: 19 | r = requests.post(c['url'], data=post, timeout=10) 20 | log.debug(f'[Webhook] response: {r.text}') 21 | r.raise_for_status() 22 | except requests.exceptions.RequestException as e: 23 | log.info('[Webhook] network error') 24 | log.info(repr(e)) 25 | sleep(5) 26 | except Exception as e: 27 | log.error('[Webhook] Unknown Error') 28 | log.error(repr(e)) 29 | sleep(60) 30 | -------------------------------------------------------------------------------- /vpnfix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from pyrogram import Client, enums, types 4 | 5 | app = Client( 6 | 'JLUNewsBot', 7 | 0, '', 8 | bot_token='', 9 | no_updates=True, sleep_threshold=24*60*60, 10 | ) 11 | OLD_PREFIXES = [ 12 | 'https://vpns.jlu.edu.cn/https/77726476706e69737468656265737421fff60f962b2526557a1dc7af96/', # wrdvpnisthebest! 13 | 'https://webvpn.jlu.edu.cn/https/77726476706e69737468656265737421fff60f962b2526557a1dc7af96/', 14 | 'https://vpn.jlu.edu.cn/https/6a6c7576706e6973746865676f6f64215ebd458ea69e85a6228e6380fc/', # jluvpnisthegood! 15 | ] 16 | NEW_PREFIX = 'https://vpn.jlu.edu.cn/https/44696469646131313237446964696461a579b2620fdde512c84ea96fd9/' # Didida1127Didida 17 | CHANNEL_ID = 'JLUNews' 18 | MESSAGES = range(13190, 0, -1) 19 | LINK_TEXT = ['VPN链接', 'VPNS', 'VPN'] 20 | 21 | 22 | async def main(): 23 | async with app: 24 | for msgid in MESSAGES: 25 | msg: types.Message = await app.get_messages(CHANNEL_ID, msgid) 26 | if msg.empty or msg.service or msg.forward_date: 27 | continue 28 | print(f'processing message {msgid}') 29 | #print(msg) 30 | need_fix = False 31 | if not msg.entities: 32 | print(msg) 33 | for entity in msg.entities: 34 | if entity.type == enums.MessageEntityType.TEXT_LINK: 35 | if msg.text[entity.offset:entity.offset+entity.length] not in LINK_TEXT: 36 | continue 37 | for prefix in OLD_PREFIXES: 38 | if entity.url.startswith(prefix): 39 | entity.url = NEW_PREFIX + entity.url.removeprefix(prefix) 40 | need_fix = True 41 | if need_fix: 42 | await app.edit_message_text( 43 | CHANNEL_ID, 44 | msgid, msg.text, entities=msg.entities, 45 | disable_web_page_preview=True 46 | ) 47 | print(f'message {msgid} fixed') 48 | 49 | 50 | app.run(main()) 51 | --------------------------------------------------------------------------------