├── feeds
└── .gitkeep
├── messages
└── .gitkeep
├── .gitignore
├── .gitmodules
├── config.json.example
├── sql
└── 000-init.sql
├── README.md
└── main.py
/feeds/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/messages/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | messages/*
2 | feeds/*
3 | cache/*
4 | !.gitkeep
5 | config.json
6 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "WechatSogou"]
2 | path = WechatSogou
3 | url = github:Chyroc/WechatSogou
4 |
--------------------------------------------------------------------------------
/config.json.example:
--------------------------------------------------------------------------------
1 | {
2 | "db_host": "localhost",
3 | "db_user": "root",
4 | "db_password": "*******",
5 | "db_database": "wechat",
6 | "message_path": "messages/",
7 | "message_types": ["TEXT", "IMAGE", "VOICE","POST", "VIDEO"],
8 | "feed_path": "feeds/",
9 | "feed_max": 40,
10 | "feed_ignore_check": false
11 | }
12 |
--------------------------------------------------------------------------------
/sql/000-init.sql:
--------------------------------------------------------------------------------
1 |
-- Schema for the Wechat RSS generator.
-- `messages` is dropped first because it holds a foreign key into `accounts`.
DROP TABLE IF EXISTS `wechat`.`messages`;
DROP TABLE IF EXISTS `wechat`.`accounts`;

-- One row per Wechat official account.
CREATE TABLE `wechat`.`accounts` (
  `id` varchar(32) NOT NULL COMMENT 'Wechat official account.',
  `name` varchar(256) NOT NULL COMMENT 'Account name.',
  `auth` varchar(256) COMMENT 'Authorization of account.',
  `intro` text CHARACTER SET utf8 COLLATE utf8_general_ci COMMENT 'Short description of account.',
  `image` varchar(512) COMMENT 'Account image.',
  PRIMARY KEY (`id`)
)
CHARACTER SET utf8 COLLATE utf8_general_ci
COMMENT = 'Storage of Wechat official accounts.';

-- One row per message of an account. Only the id, owning account, timestamp
-- and type are stored here.
CREATE TABLE `wechat`.`messages` (
  `id` varchar(32) NOT NULL COMMENT 'Group message ID.',
  `wechat_id` varchar(100) NOT NULL COMMENT 'Wechat ID of subscription post.',
  `datetime` int(10) NOT NULL COMMENT 'Message timestamp.',
  `type` enum('TEXT', 'IMAGE', 'VOICE', 'POST', 'VIDEO') NOT NULL COMMENT 'Message type',
  PRIMARY KEY (`wechat_id`, `id`),
  -- RESTRICT on both actions: an account that still has messages can be
  -- neither deleted nor re-keyed.
  CONSTRAINT `fk_wechat_id` FOREIGN KEY `fk_wechat_id` (`wechat_id`)
    REFERENCES `wechat`.`accounts` (`id`)
    ON DELETE RESTRICT
    ON UPDATE RESTRICT
)
CHARACTER SET utf8 COLLATE utf8_general_ci
COMMENT = 'Storage of posts in Wechat subscriptions.';
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## RSS Feed Generator for Wechat Official Accounts
2 |
3 | > Python3 only! Sogou crawler credit to [Chyroc/WechatSogou](https://github.com/Chyroc/WechatSogou/)!
4 |
5 | ### Installation:
6 |
7 | ```shell
8 | # Install dependencies
9 | pip3 install lxml feedgen python-dateutil requests Werkzeug PyMySQL
10 |
11 | # Clone repo
12 | git clone https://github.com/dearrrfish/wechat-subscriptions-rss wrss
13 | cd wrss
14 |
15 | # Initialize database
16 | # Create a database named `wechat`, then:
17 | mysql -u root -p < sql/000-init.sql
18 |
19 | # Copy the example config to `config.json`, then edit it to match your setup
20 | cp config.json.example config.json
21 | vim config.json
22 |
23 | # Run to test
24 | python3 main.py dapapi
25 | ```
26 |
27 |
28 |
29 | ### Syntax:
30 |
31 | `python3 main.py [-options] wechat_ids...`
32 |
33 | #### Options:
34 |
35 | - `-c|--config` - path to a custom config file, e.g. `-c ~/config.wrss.json`
36 | - `--db-host, --db-user, --db-password, --db-database` - override database parameters
37 | - `--message-path` - custom location for the JSON files holding message details, default: `messages/`
38 | - `--message-types` - message types included in the final feed (unfinished; currently forced to `POST`)
39 | - `--message-ignore-check` - skip fetching new messages (for development use)
40 | - `--feed-path` - custom location for the generated RSS feed XML file, default: `feeds/`
41 | - `--feed-max` - maximum number of messages added to the feed, default: 20
42 | - `--feed-ignore-check` - skip the new-message check and always generate the feed
43 | - `--syslog` - output log messages to syslog
44 |
45 |
46 |
47 | #### Examples:
48 |
49 | ```shell
50 | python3 main.py --feed-path /var/www/wrss/ --feed-ignore-check --syslog dsmovie sensualguru
51 | ```
52 |
53 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sys, getopt, syslog
4 | from os import path
5 | import os
6 | import json, re
7 | import pymysql.cursors
8 | from datetime import datetime
9 | import dateutil.tz
10 | from feedgen.feed import FeedGenerator
11 |
# Whether log lines are mirrored to syslog; the entry point turns this on
# when the `syslog` config flag is set.
syslog_enabled = False

# Directory the script was invoked from.
cur_dir = os.getcwd()

# Location and file name of the running script.
script_dir = path.dirname(path.abspath(sys.argv[0]))
script_filename = path.basename(path.abspath(sys.argv[0]))

# Make the `WechatSogou` directory next to the script importable.
sys.path.append(path.join(script_dir, 'WechatSogou'))
16 |
17 | from wechatsogou.tools import *
18 | from wechatsogou import *
19 |
class console:
    """Namespace of small logging helpers that print to stdout.

    Every helper is a static method, so the class works both as a plain
    namespace (``console.log(...)``) and through an instance. When the
    module-level ``syslog_enabled`` flag is true, each message is also
    sent to syslog.
    """

    # ANSI escape sequences used to colour terminal output.
    _OKGREEN = '\033[92m'
    _WARNING = '\033[93m'
    _FAIL = '\033[91m'
    _ENDC = '\033[0m'

    @staticmethod
    def log(msg, end='\n', wrap=('', '')):
        """Print *msg* uncoloured, optionally surrounded by *wrap*."""
        console._print(console._wrap(msg, wrap), end=end)

    @staticmethod
    def success(msg, end='\n', wrap=('', '')):
        """Print *msg* in green, optionally surrounded by *wrap*."""
        console._print(console._OKGREEN + console._wrap(msg, wrap) + console._ENDC, end=end)

    @staticmethod
    def warn(msg, end='\n', wrap=('', '')):
        """Print *msg* in yellow, optionally surrounded by *wrap*."""
        console._print(console._WARNING + console._wrap(msg, wrap) + console._ENDC, end=end)

    @staticmethod
    def error(msg, end='\n', wrap=('', '')):
        """Print *msg* in red, optionally surrounded by *wrap*."""
        console._print(console._FAIL + console._wrap(msg, wrap) + console._ENDC, end=end)

    @staticmethod
    def _print(msg, end='\n'):
        """Write *msg* to stdout and, if enabled, to syslog as well."""
        # `syslog_enabled` is a module-level flag; it is only read here.
        if syslog_enabled:
            syslog.syslog(msg)
        print(msg, end=end)

    @staticmethod
    def _wrap(msg, wrap=('', '')):
        """Surround *msg* with a prefix and a suffix.

        *wrap* is either a two-item sequence ``(prefix, suffix)`` or a
        single string that is split in the middle, e.g. ``'[]'`` becomes
        ``('[', ']')``. For odd lengths the extra character goes to the
        suffix.
        """
        if isinstance(wrap, str):
            mid_index = len(wrap) // 2
            wrap = (wrap[:mid_index], wrap[mid_index:])
        return wrap[0] + msg + wrap[1]
63 |
def retrieve_messages(wid, config):
    """Fetch the recent messages of one account and persist the new ones.

    If the account is not yet in the `accounts` table, its info is fetched
    together with the messages and inserted first. Each message is written
    as a JSON file under the configured `message_path` and recorded in the
    `messages` table. A message is skipped only when both its database row
    and its JSON file already exist.

    Args:
        wid: Wechat id of the account.
        config: Configuration dict; only `message_path` is read here.

    Returns:
        True if at least one message was stored during this call.
    """
    has_new_messages = False

    console.log("Looking for existing account in database...", end='')
    with conn.cursor() as c:
        c.execute('SELECT id FROM `accounts` WHERE id = %s', (wid,))
        exists_account = c.fetchone() is not None

    if not exists_account:
        console.error('NOT FOUND', wrap='[]')
        console.log('Retrieving account info and recent messages...', end='')
        # Retrieve both account info and recent messages in one call
        messages_and_info = wechats.get_gzh_message_and_info(wechatid=wid)
        messages = messages_and_info['gzh_messages']
        gzh_info = messages_and_info['gzh_info']
        console.success('READY', wrap='[]')

        # Store account info in database
        with conn.cursor() as c:
            c.execute(
                'INSERT INTO `accounts` (id, name, auth, intro, image) VALUES (%s, %s, %s, %s, %s)',
                (wid, gzh_info['name'], gzh_info['renzhen'], gzh_info['jieshao'], gzh_info['img'])
            )
    else:
        console.success('FOUND', wrap='[]')
        console.log("Retrieving recent messages...", end='')
        messages = wechats.get_gzh_message(wechatid=wid)
        console.success('READY', wrap='[]')

    console.log('Processing %d messages...' % len(messages))

    # The target directory does not change between messages; resolve it once.
    message_dir = _get_abspath(config.get('message_path', 'messages/'), cur_dir)

    for m in messages:
        # Message ID
        mid = "%s-%s-%s" % (m['qunfa_id'], m.get('main', 0), m.get('fileid', 0))
        with conn.cursor() as c:
            # Lookup in database for existing message
            c.execute('SELECT id FROM `messages` WHERE wechat_id=%s AND id=%s', (wid, mid))
            exists_message = c.fetchone() is not None

        filename = path.join(message_dir, wid + '_' + mid + '.json')

        if exists_message and path.isfile(filename):
            console.warn('[%s] message exists, skip.' % mid)
            continue

        # New message (or one whose JSON file has gone missing)
        message = {}
        try:
            mdatetime = m['datetime']
            mtype = m['type']
            if mtype == '1':
                message_type = 'TEXT'
                message['content'] = m.get('content', '')
            elif mtype == '3':
                message_type = 'IMAGE'
                message['url'] = m.get('img_url', '')
            elif mtype == '34':
                message_type = 'VOICE'
                message['length'] = m.get('play_length', '')
                message['fileId'] = m.get('fileid', '')
                message['src'] = m.get('audio_src', '')
            elif mtype == '49':
                message_type = 'POST'
                message['main'] = m.get('main', '')
                message['title'] = m.get('title', '')
                message['digest'] = m.get('digest', '')
                message['fileId'] = m.get('fileid', '')
                message['author'] = m.get('author', '')
                message['cover'] = m.get('cover', '')
                # Retrieve HTML content and permanent link of post
                post = wechats.deal_article(m.get('content_url', ''))
                message['content'] = post['content_html']
                message['url'] = post['yuan']
            elif mtype == '62':
                message_type = 'VIDEO'
                message['videoId'] = m.get('cnd_videoid', '')
                message['thumb'] = m.get('thumb', '')
                message['src'] = m.get('video_src', '')
            else:
                console.error('[%s] !! Unsupported message type: %s' % (mid, mtype))
                continue
        except Exception:
            # Best effort: one malformed message must not abort the batch.
            # `Exception` (not a bare except) lets Ctrl-C and sys.exit through.
            console.error('[%s] !! Failed to parse message.' % mid)
            continue

        with open(filename, 'w', encoding='utf-8') as fd:
            json.dump(message, fd)

        # Store in database. The ON DUPLICATE KEY clause assigns each column
        # to itself, so re-inserting an existing row changes nothing instead
        # of raising a duplicate-key error.
        with conn.cursor() as c:
            c.execute(
                'INSERT INTO `messages` (id, wechat_id, datetime, type) VALUES (%s, %s, %s, %s) ' +
                'ON DUPLICATE KEY UPDATE wechat_id=wechat_id, datetime=datetime, type=type',
                (mid, wid, mdatetime, message_type)
            )

        has_new_messages = True
        console.log("[%s] >> %s" % (mid, filename))

    return has_new_messages
175 |
176 |
def generate_feed(wid, config):
    """Build the RSS feed file for one account from its stored messages.

    Reads the account row and up to `feed_max` (default 20) of its newest
    `POST` messages from the database, loads each message's JSON file from
    `message_path`, and writes `wechat-<wid>.xml` under `feed_path`.

    Args:
        wid: Wechat id of the account.
        config: Configuration dict; `feed_max`, `message_path` and
            `feed_path` are read here.

    Returns:
        None. If the account is not in the database an error is logged and
        no feed is written.
    """
    with conn.cursor() as c:
        c.execute('SELECT * FROM accounts WHERE `id`=%s LIMIT 1', (wid,))
        account = c.fetchone()

    if account is None:
        console.error('Account %s not found in database, feed not generated.' % wid)
        return

    with conn.cursor() as c:
        c.execute(
            'SELECT * FROM messages WHERE `wechat_id`=%s AND `type` IN %s ORDER BY datetime DESC, id LIMIT %s',
            # TODO Support other message types
            (wid, ['POST'], config.get('feed_max', 20))
        )
        messages = c.fetchall()

    fg = FeedGenerator()
    fg.id('wechat-%s' % wid)
    fg.title(account['name'])
    fg.subtitle(account['intro'])
    fg.link(href='http://feeds.feedburner.com/wechat-%s' % wid, rel='self')
    fg.logo(account['image'])

    message_dir = _get_abspath(config.get('message_path', 'messages/'), cur_dir)
    tz = dateutil.tz.gettz(name='Asia/Shanghai')

    for message in messages:
        mid = message['id']
        filename = path.join(message_dir, wid + '_' + mid + '.json')

        # A missing or unreadable file is treated like an empty message so
        # that one bad entry does not abort the whole feed.
        try:
            with open(filename, 'r', encoding='utf-8') as fd:
                message_details = json.load(fd)
        except (OSError, ValueError):
            message_details = None

        if not message_details:
            console.log("[%s] message does not exist << %s" % (mid, filename))
            continue

        fe = fg.add_entry()
        fe.id(message_details['url'])
        fe.title(message_details['title'])
        fe.author(name=wid, email=message_details['author'])
        fe.link(href=message_details['url'])

        # Drop 'amp;' fragments, data-* attributes and inline line-height
        # rules, after renaming `data-src` to `src`.
        content = re.sub(r'(amp;|\s*data-[\w-]+="[^"]*"|\s*line-height:[^;]*;)', '',
                         message_details['content'].replace('data-src', 'src'),
                         flags=re.IGNORECASE)
        cover = message_details.get('cover', '')
        if cover != '':
            # NOTE(review): the markup literal prepended here was lost (the
            # original line was a broken string); an <img> tag showing the
            # cover is assumed -- confirm against the upstream file.
            content = ('<img src="%s" /><br />' % cover) + content
        fe.content(content)

        fe.pubdate(datetime.fromtimestamp(message['datetime'], tz))

    rss_filename = path.join(_get_abspath(config.get('feed_path', 'feeds/'), cur_dir),
                             'wechat-' + wid + '.xml')
    fg.rss_file(rss_filename)
    console.log('Output RSS feed to %s' % rss_filename)
232 |
233 |
def _parse_argv():
    """Build the runtime configuration from config files and `sys.argv`.

    Precedence, lowest to highest: `config.json` next to the script (if it
    exists), any file given with `-c/--config`, then individual
    command-line options.

    Returns:
        A `(config, wids)` tuple: the merged configuration dict and the
        list of positional wechat ids.

    Exits through `_show_help` (status 2) on `-h`, on invalid options or
    values, on an unreadable `-c` file, or when no wechat id is given.
    """
    config = {}

    # load configuration from default config file if exists
    default_config_path = path.join(script_dir, 'config.json')
    if path.isfile(default_config_path):
        with open(default_config_path, 'r') as fd:
            config.update(json.load(fd))

    try:
        opts, wids = getopt.getopt(
            sys.argv[1:],
            'hc:',
            ['help', 'config=', 'db-host=', 'db-user=', 'db-password=', 'db-database=',
             'message-path=', 'message-types=', 'message-ignore-check',
             'feed-max=', 'feed-path=', 'feed-ignore-check',
             'syslog']
        )
    except getopt.GetoptError as err:
        _show_help(str(err))

    # Default message directory. This used to be stored under the unread
    # key 'message_type'; 'message_path' is the key the rest of the script
    # looks up. `setdefault` keeps a value from the default config file.
    config.setdefault('message_path', path.join(cur_dir, 'messages/'))

    # Options grouped by how their argument is turned into a config value.
    string_opts = {
        '--db-host': 'db_host',
        '--db-user': 'db_user',
        '--db-password': 'db_password',
        '--db-database': 'db_database',
    }
    path_opts = {
        '--message-path': 'message_path',
        '--feed-path': 'feed_path',
    }
    flag_opts = {
        '--message-ignore-check': 'message_ignore_check',
        '--feed-ignore-check': 'feed_ignore_check',
        '--syslog': 'syslog',
    }

    argv_config = {}
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            _show_help()
        elif opt in ('-c', '--config'):
            try:
                with open(arg, 'r') as fd:
                    config.update(json.load(fd))
            except (OSError, ValueError, TypeError):
                # Unreadable file, invalid JSON, or JSON that is not an object.
                _show_help('Failed to load custom configurations from given path.')
        elif opt in string_opts:
            argv_config[string_opts[opt]] = arg
        elif opt in path_opts:
            argv_config[path_opts[opt]] = _get_abspath(arg, cur_dir)
        elif opt in flag_opts:
            argv_config[flag_opts[opt]] = True
        elif opt == '--message-types':
            argv_config['message_types'] = arg.upper().split(',')
        elif opt == '--feed-max':
            try:
                argv_config['feed_max'] = int(arg)
            except ValueError:
                _show_help('--feed-max expects an integer, got %r.' % arg)

    if not wids:
        _show_help('No wechat id was given.')

    # Command-line options win over anything loaded from config files.
    config.update(argv_config)
    config['cur_dir'] = cur_dir
    config['script_dir'] = script_dir

    return config, wids
303 |
304 |
305 | def _get_abspath(p, base_dir):
306 | if p.startswith('/'):
307 | return p
308 | else:
309 | return path.join(base_dir, p)
310 |
311 |
def _show_help(msg=''):
    """Print an optional error message and the usage line, then exit.

    Args:
        msg: Error text shown (in red) before the usage line; nothing is
            shown when empty.

    Raises:
        SystemExit: always, with status 2.
    """
    if msg:
        console.error(msg)
    # The previous usage text ('main.py -[hc] ') named neither the options
    # nor the required positional wechat ids.
    console.log(
        'Usage: main.py [-h] [-c CONFIG] [--db-host HOST] [--db-user USER] '
        '[--db-password PASSWORD] [--db-database NAME] [--message-path DIR] '
        '[--message-types TYPES] [--message-ignore-check] [--feed-path DIR] '
        '[--feed-max N] [--feed-ignore-check] [--syslog] wechat_ids...'
    )
    sys.exit(2)
317 |
318 |
if __name__ == '__main__':
    # Entry point: parse options, open the shared database connection, then
    # refresh messages and regenerate the feed for each requested account.
    config, wids = _parse_argv()
    if config.get('syslog'):
        syslog_enabled = True
        syslog.openlog('wechat-rss', syslog.LOG_PID)

    # `password`/`database` are the canonical PyMySQL keyword names; the
    # `passwd`/`db` spellings are deprecated aliases.
    conn = pymysql.connect(
        host=config.get('db_host', 'localhost'),
        user=config.get('db_user', 'root'),
        password=config.get('db_password', ''),
        database=config.get('db_database', 'wechat'),
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,
        autocommit=True
    )

    try:
        # Created inside the try block so the connection is still closed if
        # constructing the crawler client fails.
        wechats = WechatSogouApi()

        for wid in wids:
            # With `message_ignore_check` set, fetching is skipped and the
            # feed is regenerated from what is already stored.
            has_new_messages = config.get('message_ignore_check', False) or retrieve_messages(wid, config)
            if has_new_messages or config.get('feed_ignore_check', False):
                console.log("Generating feed for wechat_id=%s..." % wid)
                generate_feed(wid, config)
            else:
                console.success("No new messages.")

    finally:
        conn.close()
        if syslog_enabled:
            syslog.closelog()
350 |
--------------------------------------------------------------------------------