├── feeds └── .gitkeep ├── messages └── .gitkeep ├── .gitignore ├── .gitmodules ├── config.json.example ├── sql └── 000-init.sql ├── README.md └── main.py /feeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /messages/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | messages/* 2 | feeds/* 3 | cache/* 4 | !.gitkeep 5 | config.json 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "WechatSogou"] 2 | path = WechatSogou 3 | url = github:Chyroc/WechatSogou 4 | -------------------------------------------------------------------------------- /config.json.example: -------------------------------------------------------------------------------- 1 | { 2 | "db_host": "localhost", 3 | "db_user": "root", 4 | "db_password": "*******", 5 | "db_database": "wechat", 6 | "message_path": "messages/", 7 | "message_types": ["TEXT", "IMAGE", "VOICE","POST", "VIDEO"], 8 | "feed_path": "feeds/", 9 | "feed_max": 40, 10 | "feed_ignore_check": false 11 | } 12 | -------------------------------------------------------------------------------- /sql/000-init.sql: -------------------------------------------------------------------------------- 1 | 2 | DROP TABLE IF EXISTS `wechat`.`messages`; 3 | DROP TABLE IF EXISTS `wechat`.`accounts`; 4 | 5 | CREATE TABLE `wechat`.`accounts` ( 6 | `id` varchar(32) NOT NULL COMMENT 'Wechat official account.', 7 | `name` varchar(256) NOT NULL COMMENT 'Account name.', 8 | `auth` varchar(256) COMMENT 'Authorization of account.', 9 | `intro` text CHARACTER SET utf8 COLLATE 
utf8_general_ci COMMENT 'Short description of account.', 10 | `image` varchar(512) COMMENT 'Account image.', 11 | PRIMARY KEY (`id`) 12 | ) 13 | CHARACTER SET utf8 COLLATE utf8_general_ci 14 | COMMENT = 'Storage of posts in Wechat subscriptions.'; 15 | 16 | CREATE TABLE `wechat`.`messages` ( 17 | `id` varchar(32) NOT NULL COMMENT 'Group message ID.', 18 | `wechat_id` varchar(100) NOT NULL COMMENT 'Wechat ID of subscription post.', 19 | `datetime` int(10) NOT NULL COMMENT 'Message timestamp.', 20 | `type` enum('TEXT', 'IMAGE', 'VOICE', 'POST', 'VIDEO') NOT NULL COMMENT 'Message type', 21 | PRIMARY KEY (`wechat_id`, `id`), 22 | CONSTRAINT `fk_wechat_id` FOREIGN KEY `fk_wechat_id` (`wechat_id`) 23 | REFERENCES `wechat`.`accounts` (`id`) 24 | ON DELETE RESTRICT 25 | ON UPDATE RESTRICT 26 | ) 27 | CHARACTER SET utf8 COLLATE utf8_general_ci 28 | COMMENT = 'Storage of posts in Wechat subscriptions.'; 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## RSS Feed Generator for Wechat Official Accounts 2 | 3 | > Python3 only! Sogou crawler credit to [Chyroc/WechatSogou](https://github.com/Chyroc/WechatSogou/)! 4 | 5 | ### Installation: 6 | 7 | ```shell 8 | # Install dependencies 9 | pip3 install lxml feedgen python-dateutil requests Werkzeug PyMySQL 10 | 11 | # Clone repo 12 | git clone https://github.com/dearrrfish/wechat-subscriptions-rss wrss 13 | cd wrss 14 | 15 | # Initialize database 16 | # Create a database named `wechat`, then: 17 | mysql -u root -p < sql/000-init.sql 18 | 19 | # Copy `config.json` from the example, then edit it to your preferences 20 | cp config.json.example config.json 21 | vim config.json 22 | 23 | # Run to test 24 | python3 main.py dapapi 25 | ``` 26 | 27 | 28 | 29 | ### Syntax: 30 | 31 | `python3 main.py [-options] wechat_ids...` 32 | 33 | #### Options: 34 | 35 | - `-c|--config` - given path of custom config file, eg. 
#!/usr/bin/env python3
"""RSS feed generator for Wechat official accounts.

Usage:
    python3 main.py [-options] wechat_ids...

Options:
    -c, --config PATH        path of a custom JSON config file,
                             e.g. -c ~/config.wrss.json
    --db-host, --db-user, --db-password, --db-database
                             override database parameters
    --message-path PATH      location of the JSON files holding message
                             details (default: messages/)
    --message-types TYPES    comma-separated message types included in the
                             feed (unfinished, the feed is forced to POST)
    --message-ignore-check   skip fetching new messages (dev use)
    --feed-path PATH         location of the generated RSS XML file
                             (default: feeds/)
    --feed-max N             max number of messages added to the feed
                             (default: 20)
    --feed-ignore-check      generate the feed even without new messages
    --syslog                 also output log messages to syslog

Example:
    python3 main.py --feed-path /var/www/wrss/ --feed-ignore-check --syslog dsmovie
"""

import getopt
import json
import os
import re
import sys
import syslog
from datetime import datetime
from os import path

# Set to True by main() when --syslog is given; read by console._print().
syslog_enabled = False
cur_dir = os.getcwd()
script_dir, script_filename = path.split(path.abspath(sys.argv[0]))
# The WechatSogou crawler is expected in a directory next to this script.
sys.path.append(path.join(script_dir, 'WechatSogou'))

# Database connection and crawler API object; both are created by main().
conn = None
wechats = None

# Removes "amp;" fragments, data-* attributes and inline line-height styles
# from post HTML before the content is put into the feed.
_CONTENT_CLEANUP_RE = re.compile(
    r'(amp;|\s*data-[\w-]+="[^"]*"|\s*line-height:[^;]*;)', re.IGNORECASE)


class console:
    """Small colored logger writing to stdout and, optionally, to syslog.

    Every method is static and is called on the class itself, for example
    ``console.warn('...')``. ``wrap`` is either a pair ``(prefix, suffix)``
    or a string that is split in the middle, so ``wrap='[]'`` yields
    ``[msg]``.
    """

    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

    @staticmethod
    def log(msg, end='\n', wrap=('', '')):
        """Print ``msg`` without any color."""
        console._print(console._wrap(msg, wrap), end=end)

    @staticmethod
    def success(msg, end='\n', wrap=('', '')):
        """Print ``msg`` in green."""
        console._emit(console.OKGREEN, msg, end, wrap)

    @staticmethod
    def warn(msg, end='\n', wrap=('', '')):
        """Print ``msg`` in yellow."""
        console._emit(console.WARNING, msg, end, wrap)

    @staticmethod
    def error(msg, end='\n', wrap=('', '')):
        """Print ``msg`` in red."""
        console._emit(console.FAIL, msg, end, wrap)

    @staticmethod
    def _emit(color, msg, end, wrap):
        """Wrap ``msg``, surround it with ``color`` and the reset code, print."""
        console._print(color + console._wrap(msg, wrap) + console.ENDC, end=end)

    @staticmethod
    def _print(msg, end='\n'):
        """Print ``msg``; additionally send it to syslog when enabled."""
        if syslog_enabled:
            syslog.syslog(msg)
        print(msg, end=end)

    @staticmethod
    def _wrap(msg, wrap=('', '')):
        """Return ``msg`` surrounded by the prefix and suffix from ``wrap``."""
        if isinstance(wrap, str):
            mid_index = len(wrap) // 2
            wrap = (wrap[:mid_index], wrap[mid_index:])
        return wrap[0] + msg + wrap[1]


def _message_filename(wid, mid, config):
    """Return the path of the JSON file holding the details of one message."""
    message_dir = _get_abspath(config.get('message_path', 'messages/'), cur_dir)
    return path.join(message_dir, wid + '_' + mid + '.json')


def _parse_message(m):
    """Translate one raw crawler message into ``(type_name, details)``.

    For a post (type ``'49'``) the article is downloaded through the global
    ``wechats`` object. Returns ``None`` when the message type is unknown.
    Raises ``KeyError`` when ``m`` has no ``'type'`` entry.
    """
    mtype = m['type']
    if mtype == '1':
        return 'TEXT', {'content': m.get('content', '')}
    if mtype == '3':
        return 'IMAGE', {'url': m.get('img_url', '')}
    if mtype == '34':
        return 'VOICE', {
            'length': m.get('play_length', ''),
            'fileId': m.get('fileid', ''),
            'src': m.get('audio_src', ''),
        }
    if mtype == '49':
        # Retrieve HTML content and permanent link of the post.
        post = wechats.deal_article(m.get('content_url', ''))
        return 'POST', {
            'main': m.get('main', ''),
            'title': m.get('title', ''),
            'digest': m.get('digest', ''),
            'fileId': m.get('fileid', ''),
            'author': m.get('author', ''),
            'cover': m.get('cover', ''),
            'content': post['content_html'],
            'url': post['yuan'],
        }
    if mtype == '62':
        return 'VIDEO', {
            'videoId': m.get('cnd_videoid', ''),
            'thumb': m.get('thumb', ''),
            'src': m.get('video_src', ''),
        }
    return None


def retrieve_messages(wid, config):
    """Fetch recent messages of one account and store the new ones.

    The account row is inserted into ``accounts`` on first sight. Each new
    message is written as ``<wid>_<message id>.json`` into the message
    directory and recorded in the ``messages`` table. Uses the global
    ``conn`` and ``wechats`` objects created by ``main()``.

    Args:
        wid: Wechat id of the official account.
        config: configuration dict; ``message_path`` is read from it.

    Returns:
        True if at least one message was stored during this call.
    """
    has_new_messages = False

    console.log("Looking for existing account in database...", end='')
    with conn.cursor() as c:
        c.execute('SELECT id FROM `accounts` WHERE id = %s', (wid,))
        exists_account = c.fetchone() is not None

    if not exists_account:
        console.error('NOT FOUND', wrap='[]')
        console.log('Retrieving account info and recent messages...', end='')
        # Retrieve both account and recent messages in one request.
        messages_and_info = wechats.get_gzh_message_and_info(wechatid=wid)
        messages = messages_and_info['gzh_messages']
        gzh_info = messages_and_info['gzh_info']
        console.success('READY', wrap='[]')

        # Store account info in database.
        with conn.cursor() as c:
            c.execute(
                'INSERT INTO `accounts` (id, name, auth, intro, image) '
                'VALUES (%s, %s, %s, %s, %s)',
                (wid, gzh_info['name'], gzh_info['renzhen'],
                 gzh_info['jieshao'], gzh_info['img'])
            )
    else:
        console.success('FOUND', wrap='[]')
        console.log("Retrieving recent messages...", end='')
        messages = wechats.get_gzh_message(wechatid=wid)
        console.success('READY', wrap='[]')

    console.log('Processing %d messages...' % len(messages))

    # A custom --message-path may point to a directory that does not exist yet.
    os.makedirs(_get_abspath(config.get('message_path', 'messages/'), cur_dir),
                exist_ok=True)

    for m in messages:
        mid = "%s-%s-%s" % (m['qunfa_id'], m.get('main', 0), m.get('fileid', 0))
        with conn.cursor() as c:
            # Lookup in database for existing message.
            c.execute('SELECT id FROM `messages` WHERE wechat_id=%s AND id=%s',
                      (wid, mid))
            exists_message = c.fetchone() is not None

        filename = _message_filename(wid, mid, config)

        # A known message is processed again when its JSON file is missing.
        if exists_message and path.isfile(filename):
            console.warn('[%s] message exists, skip.' % mid)
            continue

        try:
            mdatetime = m['datetime']
            parsed = _parse_message(m)
        except Exception as err:
            # One broken message (e.g. a failed article download) must not
            # abort the remaining ones; report the cause and move on.
            console.error('[%s] !! Failed to parse message: %s' % (mid, err))
            continue

        if parsed is None:
            console.error('[%s] !! Unsupported message type: %s' % (mid, m['type']))
            continue
        message_type, message = parsed

        with open(filename, 'w', encoding='utf-8') as fd:
            json.dump(message, fd)

        # Store in database.
        with conn.cursor() as c:
            c.execute(
                'INSERT INTO `messages` (id, wechat_id, datetime, type) '
                'VALUES (%s, %s, %s, %s) ' +
                'ON DUPLICATE KEY UPDATE wechat_id=wechat_id, datetime=datetime, type=type',
                (mid, wid, mdatetime, message_type)
            )

        has_new_messages = True
        console.log("[%s] >> %s" % (mid, filename))

    return has_new_messages


def generate_feed(wid, config):
    """Write the RSS file ``wechat-<wid>.xml`` for one account.

    Reads the account row and the newest ``POST`` messages from the
    database (global ``conn``), loads each message's JSON file and adds it
    as a feed entry. Messages whose JSON file is missing, unreadable or
    empty are reported and skipped.

    Args:
        wid: Wechat id of the official account.
        config: configuration dict; ``feed_max``, ``message_path`` and
            ``feed_path`` are read from it.
    """
    # Imported lazily: only feed generation needs these third-party packages.
    import dateutil.tz
    from feedgen.feed import FeedGenerator

    with conn.cursor() as c:
        c.execute('SELECT * FROM accounts WHERE `id`=%s LIMIT 1', (wid,))
        account = c.fetchone()

    if account is None:
        # Happens e.g. with --message-ignore-check for a never-fetched account.
        console.error('Account %s not found in database, feed not generated.' % wid)
        return

    with conn.cursor() as c:
        c.execute(
            'SELECT * FROM messages WHERE `wechat_id`=%s AND `type` IN %s '
            'ORDER BY datetime DESC, id LIMIT %s',
            # TODO Support other message types
            (wid, ['POST'], config.get('feed_max', 20))
        )
        messages = c.fetchall()

    fg = FeedGenerator()
    fg.id('wechat-%s' % wid)
    fg.title(account['name'])
    fg.subtitle(account['intro'])
    fg.link(href='http://feeds.feedburner.com/wechat-%s' % wid, rel='self')
    fg.logo(account['image'])

    tz = dateutil.tz.gettz(name='Asia/Shanghai')

    for message in messages:
        mid = message['id']
        filename = _message_filename(wid, mid, config)
        try:
            with open(filename, 'r', encoding='utf-8') as fd:
                message_details = json.load(fd)
        except (OSError, ValueError):
            # Missing or corrupt file: treat like an absent message.
            message_details = None

        if not message_details:
            console.log("[%s] message does not exist << %s" % (mid, filename))
            continue

        fe = fg.add_entry()
        fe.id(message_details['url'])
        fe.title(message_details['title'])
        fe.author(name=wid, email=message_details['author'])
        fe.link(href=message_details['url'])

        # Lazy-loaded images keep their address in data-src; promote it to
        # src before the remaining data-* attributes are stripped.
        content = _CONTENT_CLEANUP_RE.sub(
            '', message_details['content'].replace('data-src', 'src'))
        # NOTE(review): the prefix below is an empty string in this copy of
        # the script, so a cover never changes the content. Presumably an
        # image tag built from message_details['cover'] was intended --
        # confirm before restoring it.
        if message_details.get('cover', '') != '':
            content = '' + content
        fe.content(content)

        fe.pubdate(datetime.fromtimestamp(message['datetime'], tz))

    feed_dir = _get_abspath(config.get('feed_path', 'feeds/'), cur_dir)
    # A custom --feed-path may point to a directory that does not exist yet.
    os.makedirs(feed_dir, exist_ok=True)
    rss_filename = path.join(feed_dir, 'wechat-' + wid + '.xml')
    fg.rss_file(rss_filename)
    console.log('Output RSS feed to %s' % rss_filename)


def _parse_argv(argv=None):
    """Build the configuration from config files and command-line options.

    Precedence, lowest to highest: ``config.json`` next to the script, a
    file given with ``-c/--config``, then the individual options.

    Args:
        argv: list of arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        ``(config, wids)``: the configuration dict and the list of wechat
        ids given as positional arguments.

    Exits with status 2 (through ``_show_help``) on invalid options, an
    unreadable custom config file, or when no wechat id is given.
    """
    if argv is None:
        argv = sys.argv[1:]

    config = {}

    # Load configuration from the default config file if it exists.
    default_config_path = path.join(script_dir, 'config.json')
    if path.isfile(default_config_path):
        with open(default_config_path, 'r', encoding='utf-8') as fd:
            config.update(json.load(fd))

    try:
        opts, wids = getopt.getopt(
            argv,
            'hc:',
            ['help', 'config=', 'db-host=', 'db-user=', 'db-password=', 'db-database=',
             'message-path=', 'message-types=', 'message-ignore-check',
             'feed-max=', 'feed-path=', 'feed-ignore-check',
             'syslog']
        )
    except getopt.GetoptError as err:
        _show_help(str(err))

    # Options copied verbatim, resolved as paths, or stored as True flags.
    plain_options = {
        '--db-host': 'db_host',
        '--db-user': 'db_user',
        '--db-password': 'db_password',
        '--db-database': 'db_database',
    }
    path_options = {
        '--message-path': 'message_path',
        '--feed-path': 'feed_path',
    }
    flag_options = {
        '--message-ignore-check': 'message_ignore_check',
        '--feed-ignore-check': 'feed_ignore_check',
        '--syslog': 'syslog',
    }

    argv_config = {}
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            _show_help()
        elif opt in ('-c', '--config'):
            try:
                with open(arg, 'r', encoding='utf-8') as fd:
                    config.update(json.load(fd))
            except (OSError, ValueError, TypeError):
                _show_help('Failed to load custom configurations from given path.')
        elif opt in plain_options:
            argv_config[plain_options[opt]] = arg
        elif opt in path_options:
            argv_config[path_options[opt]] = _get_abspath(arg, cur_dir)
        elif opt in flag_options:
            argv_config[flag_options[opt]] = True
        elif opt == '--message-types':
            argv_config['message_types'] = arg.upper().split(',')
        elif opt == '--feed-max':
            try:
                argv_config['feed_max'] = int(arg)
            except ValueError:
                _show_help('--feed-max expects an integer.')

    if len(wids) == 0:
        _show_help('No wechat id was given.')

    # Command-line options win over anything read from config files.
    config.update(argv_config)
    config['cur_dir'] = cur_dir
    config['script_dir'] = script_dir

    return config, wids


def _get_abspath(p, base_dir):
    """Return ``p`` unchanged if it is absolute, else joined onto ``base_dir``."""
    if path.isabs(p):
        return p
    return path.join(base_dir, p)


def _show_help(msg=''):
    """Print the optional error ``msg`` and the usage line, then exit with 2."""
    if msg != '':
        console.error(msg)
    console.log('main.py -[hc] ')
    sys.exit(2)


def main():
    """Fetch messages and regenerate feeds for every given wechat id."""
    global conn, wechats, syslog_enabled

    config, wids = _parse_argv()
    if config.get('syslog'):
        syslog_enabled = True
        syslog.openlog('wechat-rss', syslog.LOG_PID)

    # Imported here rather than at module level so that argument parsing and
    # --help do not require the database driver or the crawler.
    import pymysql.cursors
    from wechatsogou import WechatSogouApi

    conn = pymysql.connect(
        host=config.get('db_host', 'localhost'),
        user=config.get('db_user', 'root'),
        password=config.get('db_password', ''),
        database=config.get('db_database', 'wechat'),
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,
        autocommit=True
    )

    try:
        wechats = WechatSogouApi()
        for wid in wids:
            has_new_messages = (config.get('message_ignore_check', False)
                                or retrieve_messages(wid, config))
            if has_new_messages or config.get('feed_ignore_check', False):
                console.log("Generating feed for wechat_id=%s..." % wid)
                generate_feed(wid, config)
            else:
                console.success("No new messages.")
    finally:
        conn.close()
        if syslog_enabled:
            syslog.closelog()


if __name__ == '__main__':
    main()