├── .gitignore
├── LICENSE
├── README.md
├── appserve.py
├── chatdig.py
├── config.sample.json
├── digest.py
├── templates
    ├── digest.css
    ├── digest.html
    ├── index.html
    └── stat.html
├── tools
    └── dbselect.cgi
├── truecaser.py
└── vendor
    ├── chinesename.py
    ├── common_surnames.py
    ├── convertbdf.py
    ├── figchar.py
    ├── learnctx.py
    ├── logcutfilter.py
    ├── lookuptable.py
    ├── mbox.conf
    ├── modelzhc.json
    ├── modelzhm.json
    ├── mosesproxy.py
    ├── pangu.py
    ├── repl.py
    ├── say.py
    ├── seccomp.py
    ├── simpcalc.py
    ├── stopwords.txt
    ├── umsgpack.py
    ├── updatelm.sh
    └── zhutil.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
58 | 
59 | *.build/
60 | *.exe
61 | 
62 | old/
63 | experiment/
64 | *.log
65 | *.db
66 | *.sh
67 | config.json
68 | cmdbot.json
69 | vendor/*.binlm
70 | vendor/*.lm
71 | vendor/chatdict.txt
72 | vendor/chatlogf.txt
73 | vendor/*.pkl
74 | vendor/namemodel.m
75 | vendor/libirc.py
76 | vendor/*.dawg
77 | vendor/pinyinlookup.py
78 | vendor/simpleime.py
79 | vendor/zhcdict.json
80 | vendor/zhconv.py
81 | vendor/bf.py
82 | vendor/lispy.py
83 | vendor/brainfuck
84 | vendor/truecase.txt
85 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Dingyuan Wang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # tg-chatdig
 2 | Dig into long and boring Telegram group chat logs.
 3 | 
For a simpler Telegram-IRC relay bot, see [tg-irc-relay](https://github.com/gumblex/tg-irc-relay).

**Deprecated**: Version 2 has been renamed and is published as [orizonhub](https://github.com/gumblex/orizonhub).
 6 | 
 7 | ## chatdig.py
 8 | 
 9 | Main script, handles a lot of commands. Uses a SQLite 3 database to store messages.
10 | 
11 | ## tglog-import.py
12 | 
13 | Executes `telegram-cli` and fetches history messages.
14 | 
15 | ## digest.py
16 | 
17 | Generate daily digest from the message database.
18 | 
19 | `python3 digest.py path [days=1] [update=0]`
20 | 
21 | ## vendor/
22 | 
23 | Some interesting functions.
24 | 
25 | ### say.py
26 | 
27 | Randomly writes out sentences according to the language model.
28 | 
29 | Depends on [jieba](https://github.com/fxsjy/jieba), [kenlm](https://github.com/kpu/kenlm).
30 | 
31 | See `vendor/updatelm.sh` for building language models.
32 | 
33 | ### seccomp.py
34 | 
Evaluates user input and prints the result safely. Originally written by David Wilson.
36 | 
37 | See [dw/scratch/seccomp.py](https://github.com/dw/scratch/blob/master/seccomp.py)
38 | 
39 | ### fparser.py
40 | 
41 | See [gumblex/fxcalc](https://github.com/gumblex/fxcalc)
42 | 
43 | ### External Plugins
44 | 
45 | The following components are not in this repo:
46 | 
47 | * `/bf` bf.py: [Brainf*ck interpreter](http://www.cs.princeton.edu/~ynaamad/misc/bf.htm)
48 | * `/lisp` lispy.py: [Scheme-like interpreter](http://norvig.com/lispy.html)
49 | * `/name` , namemodel.m: Part of [Chinese name generator](https://github.com/gumblex/chinesename)
50 | * `/ime` simpleime.py, pinyinlookup.py, \*.dawg: [Simple Pinyin IME](https://github.com/gumblex/simpleime)
51 | * zhconv.py, zhcdict.json: [Simplified-Traditional Chinese converter](https://github.com/gumblex/zhconv)
52 | * vendor/libirc.py: [libirc](https://github.com/m13253/libirc)
53 | 


--------------------------------------------------------------------------------
/appserve.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import os
  5 | import sys
  6 | import json
  7 | import queue
  8 | import tempfile
  9 | import resource
 10 | import threading
 11 | import traceback
 12 | import subprocess
 13 | import collections
 14 | import concurrent.futures
 15 | 
 16 | from vendor import zhutil
 17 | from vendor import zhconv
 18 | from vendor import figchar
 19 | from vendor import simpcalc
 20 | from vendor import simpleime
 21 | from vendor import mosesproxy
 22 | from vendor import chinesename
 23 | 
# Set the (soft, hard) RLIMIT_RSS limit for this server process itself.
# NOTE(review): setrlimit values are normally byte counts, so these numbers
# look like they were meant as KiB; RLIMIT_RSS may also not be enforced on
# current Linux kernels -- TODO confirm the intended effect.
resource.setrlimit(resource.RLIMIT_RSS, (131072, 262144))
 25 | 
 26 | def setsplimits(cputime, memory):
 27 |     def _setlimits():
 28 |         resource.setrlimit(resource.RLIMIT_CPU, cputime)
 29 |         resource.setrlimit(resource.RLIMIT_RSS, memory)
 30 |         resource.setrlimit(resource.RLIMIT_NPROC, (1024, 1024))
 31 |     return _setlimits
 32 | 
 33 | # {"id": 1, "cmd": "bf", "args": [",[.,]", "asdasdf"]}
 34 | 
 35 | def docommands():
 36 |     global MSG_Q
 37 |     while 1:
 38 |         obj = MSG_Q.get()
 39 |         executor.submit(async_command, obj)
 40 | 
 41 | def async_command(obj):
 42 |     sys.stdout.buffer.write(json.dumps(process(obj)).encode('utf-8') + b'\n')
 43 |     sys.stdout.flush()
 44 | 
 45 | def getsaying():
 46 |     global SAY_P, SAY_Q
 47 |     while 1:
 48 |         say = getsayingbytext(mode='')
 49 |         SAY_Q.put(say)
 50 | 
 51 | def getsayingbytext(text='', mode='r'):
 52 |     global SAY_P
 53 |     with SAY_LCK:
 54 |         text = (mode + ' '.join(mosesproxy.cut(zhconv.convert(text, 'zh-hans'), HMM=False)[:60]).strip()).encode('utf-8') + b'\n'
 55 |         try:
 56 |             SAY_P.stdin.write(text)
 57 |             SAY_P.stdin.flush()
 58 |             say = SAY_P.stdout.readline().strip().decode('utf-8')
 59 |         except BrokenPipeError:
 60 |             SAY_P = subprocess.Popen(SAY_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd='vendor')
 61 |             SAY_P.stdin.write(text)
 62 |             SAY_P.stdin.flush()
 63 |             say = SAY_P.stdout.readline().strip().decode('utf-8')
 64 |     return say
 65 | 
 66 | def process(obj):
 67 |     ret, exc = None, None
 68 |     try:
 69 |         ret = COMMANDS[obj['cmd']](*obj['args'])
 70 |     except Exception:
 71 |         exc = traceback.format_exc()
 72 |     return {'id': obj['id'], 'ret': ret, 'exc': exc}
 73 | 
 74 | def cmd_calc(expr):
 75 |     '''/calc <expr> Calculate <expr>.'''
 76 |     r = calculator.pretty(expr)
 77 |     if len(r) > 200:
 78 |         r = r[:200] + '...'
 79 |     return r or 'Nothing'
 80 | 
 81 | def cmd_py(expr):
 82 |     proc = subprocess.Popen(EVIL_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd='vendor', preexec_fn=setsplimits((4, 5), (8192, 16384)))
 83 |     try:
 84 |         result, errs = proc.communicate(expr.strip().encode('utf-8'), timeout=5)
 85 |     except Exception: # TimeoutExpired
 86 |         proc.kill()
 87 |         result, errs = proc.communicate()
 88 |     finally:
 89 |         if proc.poll() is None:
 90 |             proc.terminate()
 91 |     result = result.strip().decode('utf-8', errors='replace')
 92 |     return result or 'None or error occurred.'
 93 | 
 94 | def cmd_bf(expr, datain=''):
 95 |     fd, fpath = tempfile.mkstemp()
 96 |     with os.fdopen(fd, 'wb') as temp_bf:
 97 |         temp_bf.write(''.join(c for c in expr if c in '-[>.<]+,').encode('latin_1'))
 98 |     proc = subprocess.Popen(BF_CMD + (fpath,), stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=setsplimits((1, 1), (1024, 2048)))
 99 |     datain = datain.encode('utf-8')
100 |     try:
101 |         result, errs = proc.communicate(datain, timeout=1)
102 |     except Exception: # TimeoutExpired
103 |         proc.kill()
104 |         result, errs = proc.communicate()
105 |     finally:
106 |         if proc.poll() is None:
107 |             proc.terminate()
108 |         os.remove(fpath)
109 |     if len(result) > 1000:
110 |         result = result[:1000] + b'...'
111 |     result = result.decode('latin_1').encode('unicode_escape').decode('latin_1').replace('\\t', '\t').replace('\\n', '\n')
112 |     if len(result) > 1000:
113 |         result = result[:1000] + '...'
114 |     return result or 'None or error occurred.'
115 | 
def cmd_lisp(expr):
    '''Feed `expr` to lispy.py in a resource-limited child and return its stdout.

    The child gets 5 seconds; on any failure while communicating it is
    killed and whatever output it produced is used.  An empty result is
    replaced by a fixed notice.
    '''
    limiter = setsplimits((4, 5), (8192, 16384))
    proc = subprocess.Popen(LISP_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd='vendor', preexec_fn=limiter)
    try:
        out, _err = proc.communicate(expr.strip().encode('utf-8'), timeout=5)
    except Exception: # TimeoutExpired
        proc.kill()
        out, _err = proc.communicate()
    finally:
        if proc.poll() is None:
            proc.terminate()
    answer = out.strip().decode('utf-8', errors='replace')
    return answer if answer else 'None or error occurred.'
128 | 
def cmd_name(expr):
    '''Return up to ten surname and ten given-name suggestions for `expr`.

    Each non-empty list becomes one labelled, comma-separated line.
    '''
    surnames, names = namemodel.processinput(expr, 10)
    labelled = (('姓：', surnames), ('名：', names))
    return '\n'.join(
        label + ', '.join(items[:10])
        for label, items in labelled if items
    )
137 | 
def cmd_ime(expr):
    '''Run lower-cased `expr` through the pinyin IME and convert the result to zh-hans.'''
    typed = simpleime.pinyininput(expr.lower())
    return zhconv.convert(typed, 'zh-hans')
140 | 
def cmd_fig(expr):
    '''Render `expr` with the block-character generator.

    Returns the rendering when its first line is shorter than 12 characters
    and it has fewer than 15 lines; otherwise returns a notice string.
    '''
    rendered = fcgen.render(expr)
    rows = rendered.splitlines()
    if not rendered:
        return 'Missing glyph(s).'
    if len(rows[0]) >= 12 or len(rows) >= 15:
        return 'Figure too big.'
    return rendered
150 | 
def cmd_cc(expr):
    '''Convert `expr` to zh-hant if zhconv reports it as simplified, else to zh-hans.'''
    target = 'zh-hant' if zhconv.issimp(expr) else 'zh-hans'
    return zhconv.convert(expr, target)
156 | 
def cmd_cut(tinput, lang):
    '''Segment `tinput` and return the tokens joined by spaces.

    lang == 'c' selects mosesproxy.jiebazhc.cut; anything else uses
    mosesproxy.cut.  HMM is disabled in both cases.
    '''
    cutter = mosesproxy.jiebazhc.cut if lang == 'c' else mosesproxy.cut
    return ' '.join(cutter(tinput, HMM=False))
162 | 
def cmd_wyw(tinput, lang):
    '''Translate `tinput` through mosesproxy in direction `lang`.

    The special input '$name' returns the model name instead.  When `lang`
    is None the direction is derived from the two zhutil text scores:
    'c2m' if the text is classified as 'c', otherwise 'm2c'; equal scores
    leave the text untranslated.
    '''
    if tinput == '$name':
        return mosesproxy.modelname()
    if lang is None:
        cscore, mscore = zhutil.calctxtstat(tinput)
        if cscore != mscore:
            lang = 'c2m' if zhutil.checktxttype(cscore, mscore) == 'c' else 'm2c'
    if not lang:
        return tinput
    return mosesproxy.translate(tinput, lang, 0, 0, 0)
178 | 
def cmd_say():
    '''Take one pre-generated sentence from SAY_Q, or a notice if it is empty.'''
    sentence = SAY_Q.get()
    return sentence if sentence else 'ERROR_BRAIN_NOT_CONNECTED'
181 | 
def cmd_mgw():
    '''Request one line from the MGW helper process by sending it `b`.

    If the pipe is broken the helper is started again and the request is
    sent once more.  MGW_LCK serialises access to the helper.
    '''
    global MGW_P

    def _ask(proc):
        # One request/response round trip over the helper's pipes.
        proc.stdin.write(b'b\n')
        proc.stdin.flush()
        return proc.stdout.readline().strip().decode('utf-8')

    with MGW_LCK:
        try:
            say = _ask(MGW_P)
        except BrokenPipeError:
            MGW_P = subprocess.Popen(MGW_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd='vendor')
            say = _ask(MGW_P)
    return say
195 | 
def cmd_reply(expr):
    '''Ask the say.py helper in mode 'r' for `expr`; return a notice if it answers nothing.'''
    answer = getsayingbytext(expr, 'r')
    return answer if answer else 'ERROR_BRAIN_NOT_CONNECTED'
198 | 
def cmd_cont(expr):
    '''Ask the say.py helper in mode 'c' for `expr`; return a notice if it answers nothing.'''
    answer = getsayingbytext(expr, 'c')
    return answer if answer else 'ERROR_BRAIN_NOT_CONNECTED'
201 | 
# Dispatch table used by process(): request "cmd" name -> handler function.
COMMANDS = collections.OrderedDict((
('calc', cmd_calc),
('py', cmd_py),
('bf', cmd_bf),
('lisp', cmd_lisp),
('name', cmd_name),
('ime', cmd_ime),
('fig', cmd_fig),
('cc', cmd_cc),
('wyw', cmd_wyw),
('cut', cmd_cut),
('say', cmd_say),
('mgw', cmd_mgw),
('reply', cmd_reply),
('cont', cmd_cont)
))

# MSG_Q: requests read from stdin, consumed by docommands().
MSG_Q = queue.Queue()
# SAY_Q: sentences produced by getsaying(); bounded, so the producer blocks
# once 50 sentences are waiting.
SAY_Q = queue.Queue(maxsize=50)
# Locks serialising access to the two long-lived helper processes below.
SAY_LCK = threading.Lock()
MGW_LCK = threading.Lock()

# Long-lived say.py helper processes, started in the vendor/ directory and
# talked to over stdin/stdout pipes.
SAY_CMD = ('python3', 'say.py', 'chat.binlm', 'chatdict.txt', 'context.pkl')
SAY_P = subprocess.Popen(SAY_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd='vendor')
MGW_CMD = ('python3', 'say.py', 'mgw.binlm', 'mgwdict.txt')
MGW_P = subprocess.Popen(MGW_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd='vendor')

# Command lines for the per-request child processes (cmd_py, cmd_bf, cmd_lisp).
EVIL_CMD = ('python', 'seccomp.py')
BF_CMD = ('vendor/brainfuck',)
LISP_CMD = ('python', 'lispy.py')

# Worker pool (5 threads) that runs the commands, plus the daemon thread
# feeding it from MSG_Q.
executor = concurrent.futures.ThreadPoolExecutor(5)
cmdthr = threading.Thread(target=docommands)
cmdthr.daemon = True
cmdthr.start()

# Daemon thread that keeps SAY_Q filled.
saythr = threading.Thread(target=getsaying)
saythr.daemon = True
saythr.start()

# Shared objects used by cmd_calc, cmd_name, cmd_ime and cmd_fig.
calculator = simpcalc.Calculator('ans', True)
namemodel = chinesename.NameModel('vendor/namemodel.m')
simpleime.loaddict('vendor/pyindex.dawg', 'vendor/essay.dawg')
fcgen = figchar.BlockGenerator('vendor/wqy.pkl', '🌝🌚')
246 | 
# Main loop: every stdin line is one JSON request, e.g.
# {"id": 1, "cmd": "bf", "args": [",[.,]", "asdasdf"]}; queue it for the
# dispatcher thread.
try:
    for ln in sys.stdin.buffer:
        upd = json.loads(ln.decode('utf-8'))
        MSG_Q.put(upd)
finally:
    # Stop both say.py helper processes so that neither outlives this
    # server; the nested finally guarantees the second one is attempted
    # even if terminating the first raises.
    try:
        SAY_P.terminate()
    finally:
        MGW_P.terminate()
253 | 


--------------------------------------------------------------------------------
/chatdig.py:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/env python3
   2 | # -*- coding: utf-8 -*-
   3 | 
   4 | import os
   5 | import re
   6 | import sys
   7 | import math
   8 | import time
   9 | import json
  10 | import queue
  11 | import signal
  12 | import socket
  13 | import random
  14 | import logging
  15 | import sqlite3
  16 | import threading
  17 | import functools
  18 | import subprocess
  19 | import collections
  20 | import unicodedata
  21 | import concurrent.futures
  22 | 
  23 | import requests
  24 | from vendor import libirc
  25 | 
__version__ = '1.4'

# Message keys that mark a message as carrying media or a service event.
MEDIA_TYPES = frozenset(('audio', 'document', 'photo', 'sticker', 'video', 'voice', 'contact', 'location', 'new_chat_participant', 'left_chat_participant', 'new_chat_title', 'new_chat_photo', 'delete_chat_photo', 'group_chat_created'))
# Same set plus '_ircuser', the key getircupd() adds to messages coming from IRC.
EXT_MEDIA_TYPES = frozenset(('audio', 'document', 'photo', 'sticker', 'video', 'voice', 'contact', 'location', 'new_chat_participant', 'left_chat_participant', 'new_chat_title', 'new_chat_photo', 'delete_chat_photo', 'group_chat_created', '_ircuser'))

# Passing -d as the last command-line argument enables debug logging.
loglevel = logging.DEBUG if sys.argv[-1] == '-d' else logging.INFO

logging.basicConfig(stream=sys.stdout, format='# %(asctime)s [%(levelname)s] %(message)s', level=loglevel)

# Default timeout (seconds) for sockets created without an explicit one.
socket.setdefaulttimeout(60)

# Shared HTTP session; its User-Agent is prefixed with the bot name and version.
HSession = requests.Session()
USERAGENT = 'TgChatDiggerBot/%s %s' % (__version__, HSession.headers["User-Agent"])
HSession.headers["User-Agent"] = USERAGENT

# Message store. Note that `conn` is a cursor on `db`, not a connection.
db = sqlite3.connect('chatlog.db')
conn = db.cursor()
conn.execute('''CREATE TABLE IF NOT EXISTS messages (
id INTEGER PRIMARY KEY,
src INTEGER,
text TEXT,
media TEXT,
date INTEGER,
fwd_src INTEGER,
fwd_date INTEGER,
reply_id INTEGER
)''')
conn.execute('''CREATE TABLE IF NOT EXISTS users (
id INTEGER PRIMARY KEY,
username TEXT,
first_name TEXT,
last_name TEXT
)''')
conn.execute('CREATE TABLE IF NOT EXISTS config (id INTEGER PRIMARY KEY, val INTEGER)')
# conn.execute('CREATE TABLE IF NOT EXISTS words (word TEXT PRIMARY KEY, count INTEGER)')

# Matches an IRC CTCP ACTION ("/me") payload; group 1 is the action text.
re_ircaction = re.compile('^\x01ACTION (.*)\x01$')
# Matches text relayed by the bot: "[nick] text" (groups 1, 2) or
# "** nick text **" (groups 3, 4).
re_ircforward = re.compile(r'^\[(.+?)\] (.*)$|^\*\* ([^ ]+) (.*) \*\*$')
  64 | 
  65 | class LRUCache:
  66 | 
  67 |     def __init__(self, maxlen):
  68 |         self.capacity = maxlen
  69 |         self.cache = collections.OrderedDict()
  70 | 
  71 |     def __getitem__(self, key):
  72 |         value = self.cache.pop(key)
  73 |         self.cache[key] = value
  74 |         return value
  75 | 
  76 |     def get(self, key, default=None):
  77 |         try:
  78 |             value = self.cache.pop(key)
  79 |             self.cache[key] = value
  80 |             return value
  81 |         except KeyError:
  82 |             return default
  83 | 
  84 |     def __setitem__(self, key, value):
  85 |         try:
  86 |             self.cache.pop(key)
  87 |         except KeyError:
  88 |             if len(self.cache) >= self.capacity:
  89 |                 self.cache.popitem(last=False)
  90 |         self.cache[key] = value
  91 | 
  92 | def async_func(func):
  93 |     @functools.wraps(func)
  94 |     def wrapped(*args, **kwargs):
  95 |         def func_noerr(*args, **kwargs):
  96 |             try:
  97 |                 func(*args, **kwargs)
  98 |             except Exception:
  99 |                 logging.exception('Async function failed.')
 100 |         executor.submit(func_noerr, *args, **kwargs)
 101 |     return wrapped
 102 | 
 103 | def _raise_ex(ex):
 104 |     raise ex
 105 | 
 106 | ### Polling
 107 | 
 108 | def getupdates():
 109 |     global OFFSET, MSG_Q
 110 |     while 1:
 111 |         try:
 112 |             updates = bot_api('getUpdates', offset=OFFSET, timeout=10)
 113 |         except Exception as ex:
 114 |             logging.exception('Get updates failed.')
 115 |             continue
 116 |         if updates:
 117 |             logging.debug('Messages coming.')
 118 |             OFFSET = updates[-1]["update_id"] + 1
 119 |             for upd in updates:
 120 |                 MSG_Q.put(upd)
 121 |         time.sleep(.2)
 122 | 
 123 | def checkappproc():
 124 |     global APP_P
 125 |     if APP_P.poll() is not None:
 126 |         APP_P = subprocess.Popen(APP_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
 127 | 
 128 | def runapptask(cmd, args, sendargs):
 129 |     '''`sendargs` should be (chatid, replyid)'''
 130 |     global APP_P, APP_LCK, APP_TASK
 131 |     with APP_LCK:
 132 |         # Prevent float problems
 133 |         tid = str(time.time())
 134 |         text = json.dumps({"cmd": cmd, "args": args, "id": tid})
 135 |         APP_TASK[tid] = sendargs
 136 |         try:
 137 |             APP_P.stdin.write(text.strip().encode('utf-8') + b'\n')
 138 |             APP_P.stdin.flush()
 139 |         except BrokenPipeError:
 140 |             checkappproc()
 141 |             APP_P.stdin.write(text.strip().encode('utf-8') + b'\n')
 142 |             APP_P.stdin.flush()
 143 |         logging.debug('Wrote to APP_P: ' + text)
 144 | 
 145 | def getappresult():
 146 |     global APP_P, APP_TASK
 147 |     while 1:
 148 |         try:
 149 |             result = APP_P.stdout.readline().strip().decode('utf-8')
 150 |         except BrokenPipeError:
 151 |             checkappproc()
 152 |             result = APP_P.stdout.readline().strip().decode('utf-8')
 153 |         logging.debug('Got from APP_P: ' + result)
 154 |         if result:
 155 |             obj = json.loads(result)
 156 |             if obj['exc']:
 157 |                 logging.error('Remote app server error.\n' + obj['exc'])
 158 |             sargs = APP_TASK.get(obj['id'])
 159 |             if sargs:
 160 |                 sendmsg(obj['ret'] or 'Empty.', sargs[0], sargs[1])
 161 |                 del APP_TASK[obj['id']]
 162 |             else:
 163 |                 logging.error('Task ID %s not found.' % obj['id'])
 164 | 
 165 | def checkircconn():
 166 |     global ircconn
 167 |     if not ircconn or not ircconn.sock:
 168 |         ircconn = libirc.IRCConnection()
 169 |         ircconn.connect((CFG['ircserver'], CFG['ircport']), use_ssl=CFG['ircssl'])
 170 |         if CFG.get('ircpass'):
 171 |             ircconn.setpass(CFG['ircpass'])
 172 |         ircconn.setnick(CFG['ircnick'])
 173 |         ircconn.setuser(CFG['ircnick'], CFG['ircnick'])
 174 |         ircconn.join(CFG['ircchannel'])
 175 |         logging.info('IRC (re)connected.')
 176 | 
 177 | def getircupd():
 178 |     global MSG_Q, IRCOFFSET
 179 |     while 1:
 180 |         checkircconn()
 181 |         line = ircconn.parse(block=False)
 182 |         if line and line["cmd"] == "PRIVMSG":
 183 |             if line["dest"] != CFG['ircnick'] and not re.match(CFG['ircignore'], line["nick"]):
 184 |                 msg = {
 185 |                     'message_id': IRCOFFSET,
 186 |                     'from': {'id': CFG['ircbotid'], 'first_name': CFG['ircbotname'], 'username': 'orzirc_bot'},
 187 |                     'date': int(time.time()),
 188 |                     'chat': {'id': -CFG['groupid'], 'title': CFG['ircchannel']},
 189 |                     'text': line["msg"].strip(),
 190 |                     '_ircuser': line["nick"]
 191 |                 }
 192 |                 MSG_Q.put({'update_id': IRCOFFSET, 'message': msg})
 193 |                 IRCOFFSET += 1
 194 |         time.sleep(.5)
 195 | 
 196 | def ircconn_say(dest, msg, sendnow=True):
 197 |     MIN_INT = 0.2
 198 |     if not ircconn:
 199 |         return
 200 |     curtime = time.time()
 201 |     delta = curtime - ircconn_say.lasttime
 202 |     if delta < MIN_INT:
 203 |         time.sleep(MIN_INT - delta)
 204 |     ircconn.say(dest, msg, sendnow)
 205 |     ircconn_say.lasttime = time.time()
 206 | ircconn_say.lasttime = 0
 207 | 
 208 | def irc_send(text='', reply_to_message_id=None, forward_message_id=None):
 209 |     if ircconn:
 210 |         checkircconn()
 211 |         if reply_to_message_id:
 212 |             m = MSG_CACHE.get(reply_to_message_id, {})
 213 |             logging.debug('Got reply message: ' + str(m))
 214 |             if '_ircuser' in m:
 215 |                 text = "%s: %s" % (m['_ircuser'], text)
 216 |             elif 'from' in m:
 217 |                 src = smartname(m['from'])
 218 |                 if m['from']['id'] in (CFG['botid'], CFG['ircbotid']):
 219 |                     rnmatch = re_ircforward.match(m.get('text', ''))
 220 |                     if rnmatch:
 221 |                         src = rnmatch.group(1) or src
 222 |                 text = "%s: %s" % (src, text)
 223 |         elif forward_message_id:
 224 |             # not async, so no sqlite3.ProgrammingError in db_*
 225 |             m = db_getmsg(forward_message_id)
 226 |             if m:
 227 |                 text = "Fwd %s: %s" % (smartname(m[1], True), m[2])
 228 |         lines = text.splitlines()
 229 |         if len(lines) < 3:
 230 |             text = ' '.join(lines)
 231 |         else:
 232 |             text = lines[0] + ' [...] ' + lines[-1]
 233 |         ircconn_say(CFG['ircchannel'], text)
 234 | 
 235 | @async_func
 236 | def irc_forward(msg):
 237 |     if not ircconn:
 238 |         return
 239 |     try:
 240 |         if msg['from']['id'] == CFG['ircbotid']:
 241 |             return
 242 |         checkircconn()
 243 |         text = msg.get('text', '')
 244 |         mkeys = tuple(msg.keys() & MEDIA_TYPES)
 245 |         if mkeys:
 246 |             if text:
 247 |                 text += ' ' + servemedia(msg)
 248 |             else:
 249 |                 text = servemedia(msg)
 250 |         if text and not text.startswith('@@@'):
 251 |             if 'forward_from' in msg:
 252 |                 fwdname = ''
 253 |                 if msg['forward_from']['id'] in (CFG['botid'], CFG['ircbotid']):
 254 |                     rnmatch = re_ircforward.match(msg.get('text', ''))
 255 |                     if rnmatch:
 256 |                         fwdname = rnmatch.group(1) or rnmatch.group(3)
 257 |                         text = rnmatch.group(2) or rnmatch.group(4)
 258 |                 fwdname = fwdname or smartname(msg['forward_from'])
 259 |                 text = "Fwd %s: %s" % (fwdname, text)
 260 |             elif 'reply_to_message' in msg:
 261 |                 replname = ''
 262 |                 replyu = msg['reply_to_message']['from']
 263 |                 if replyu['id'] in (CFG['botid'], CFG['ircbotid']):
 264 |                     rnmatch = re_ircforward.match(msg['reply_to_message'].get('text', ''))
 265 |                     if rnmatch:
 266 |                         replname = rnmatch.group(1) or rnmatch.group(3)
 267 |                 replname = replname or smartname(replyu)
 268 |                 text = "%s: %s" % (replname, text)
 269 |             # ignore blank lines
 270 |             text = list(filter(lambda s: s.strip(), text.splitlines()))
 271 |             if len(text) > 3:
 272 |                 text = text[:3]
 273 |                 text[-1] += ' [...]'
 274 |             for ln in text[:3]:
 275 |                 ircconn_say(CFG['ircchannel'], '[%s] %s' % (smartname(msg['from']), ln))
 276 |     except Exception:
 277 |         logging.exception('Forward a message to IRC failed.')
 278 | 
 279 | ### DB import
 280 | 
 281 | def mediaformatconv(media=None, action=None):
 282 |     type_map = {
 283 |     # media
 284 |     'photo': 'photo',
 285 |     'document': 'document',
 286 |     'unsupported': 'document',
 287 |     'geo': 'location',
 288 |     'venue': 'location',
 289 |     'contact': 'contact',
 290 |     # action
 291 |     'chat_add_user': 'new_chat_participant',
 292 |     'chat_add_user_link': 'new_chat_participant',
 293 |     'chat_del_user': 'left_chat_participant',
 294 |     'chat_rename': 'new_chat_title',
 295 |     'chat_change_photo': 'new_chat_photo',
 296 |     'chat_delete_photo': 'delete_chat_photo',
 297 |     'chat_created': 'group_chat_created'
 298 |     }
 299 |     d = {}
 300 |     caption = None
 301 |     if media:
 302 |         media = json.loads(media)
 303 |     if action:
 304 |         action = json.loads(action)
 305 |     if media and 'type' in media:
 306 |         media = media.copy()
 307 |         if media['type'] == 'photo':
 308 |             caption = media['caption']
 309 |             d['photo'] = []
 310 |         elif media['type'] in ('document', 'unsupported'):
 311 |             d['document'] = {}
 312 |         elif 'longitude' in media:
 313 |             # 'type' may be the name of the place
 314 |             d['location'] = {
 315 |                 'longitude': media['longitude'],
 316 |                 'latitude': media['latitude']
 317 |             }
 318 |         elif media['type'] == 'contact':
 319 |             del media['type']
 320 |             media['phone_number'] = media.pop('phone')
 321 |             d['contact'] = media
 322 |         # ignore other undefined types to Bot API
 323 |     if action and 'type' in action:
 324 |         newname = type_map.get(action['type'])
 325 |         if newname.endswith('chat_participant'):
 326 |             d[newname] = {
 327 |                 'id': action['user']['id'],
 328 |                 'first_name': action['user'].get('first_name', ''),
 329 |                 'last_name': action['user'].get('last_name', ''),
 330 |                 'username': action['user'].get('username', '')
 331 |             }
 332 |         elif newname == 'new_chat_title':
 333 |             d[newname] = action['title']
 334 |         elif newname == 'new_chat_photo':
 335 |             d[newname] = []
 336 |         elif newname in ('delete_chat_photo', 'group_chat_created'):
 337 |             d[newname] = True
 338 |         # ignore other undefined types to Bot API
 339 |     return json.dumps(d) if d else None, caption
 340 | 
 341 | def importdb(filename):
 342 |     logging.info('Import DB...')
 343 |     if not os.path.isfile(filename):
 344 |         logging.warning('DB not found.')
 345 |         return
 346 |     db_s = sqlite3.connect(filename)
 347 |     conn_s = db_s.cursor()
 348 |     for vals in conn_s.execute('SELECT id, src, text, media, date, fwd_src, fwd_date, reply_id, action FROM messages WHERE dest = ?', (CFG['groupid'],)):
 349 |         vals = list(vals)
 350 |         vals[0] = -250000 + vals[0]
 351 |         vals[3], caption = mediaformatconv(vals[3], vals.pop())
 352 |         vals[2] = vals[2] or caption
 353 |         conn.execute('INSERT OR IGNORE INTO messages (id, src, text, media, date, fwd_src, fwd_date, reply_id) VALUES (?,?,?,?, ?,?,?,?)', vals)
 354 |     for vals in conn_s.execute('SELECT id, username, first_name, last_name FROM users'):
 355 |         conn.execute('INSERT OR IGNORE INTO users (id, username, first_name, last_name) VALUES (?,?,?,?)', vals)
 356 |     db.commit()
 357 |     logging.info('DB import done.')
 358 | 
 359 | def importupdates(offset, number=5000):
 360 |     off = OFFSET - number
 361 |     updates = bot_api('getUpdates', offset=off, limit=100)
 362 |     while updates:
 363 |         logging.info('Imported %s - %s' % (off, updates[-1]["update_id"]))
 364 |         off = updates[-1]["update_id"] + 1
 365 |         for d in updates:
 366 |             if 'message' in d:
 367 |                 msg = d['message']
 368 |                 cls = classify(msg)
 369 |                 if cls == 0 and msg['chat']['id'] == -CFG['groupid']:
 370 |                     logmsg(msg, True)
 371 |                 elif cls == 1:
 372 |                     logmsg(msg, True)
 373 |         time.sleep(.1)
 374 |         updates = bot_api('getUpdates', offset=off, limit=100)
 375 | 
 376 | def importfixservice(filename):
 377 |     logging.info('Updating DB...')
 378 |     if not os.path.isfile(filename):
 379 |         logging.warning('DB not found.')
 380 |         return
 381 |     db_s = sqlite3.connect(filename)
 382 |     conn_s = db_s.cursor()
 383 |     for mid, text, media, action in conn_s.execute('SELECT id, text, media, action FROM messages WHERE dest = ?', (CFG['groupid'],)):
 384 |         mid -= 250000
 385 |         media, caption = mediaformatconv(media, action)
 386 |         text = text or caption
 387 |         conn.execute('UPDATE messages SET text=?, media=? WHERE id=?', (text, media, mid))
 388 |     db.commit()
 389 |     logging.info('Fix DB media column done.')
 390 | 
 391 | ### API Related
 392 | 
 393 | class BotAPIFailed(Exception):
 394 |     pass
 395 | 
 396 | def change_session():
 397 |     global HSession
 398 |     HSession.close()
 399 |     HSession = requests.Session()
 400 |     HSession.headers["User-Agent"] = USERAGENT
 401 |     logging.warning('Session changed.')
 402 | 
 403 | def bot_api(method, **params):
 404 |     for att in range(3):
 405 |         try:
 406 |             req = HSession.get(URL + method, params=params, timeout=45)
 407 |             retjson = req.content
 408 |             ret = json.loads(retjson.decode('utf-8'))
 409 |             break
 410 |         except Exception as ex:
 411 |             if att < 1:
 412 |                 time.sleep((att+1) * 2)
 413 |                 change_session()
 414 |             else:
 415 |                 raise ex
 416 |     if not ret['ok']:
 417 |         raise BotAPIFailed(repr(ret))
 418 |     return ret['result']
 419 | 
 420 | def bot_api_noerr(method, **params):
 421 |     try:
 422 |         bot_api(method, **params)
 423 |     except Exception:
 424 |         logging.exception('Async bot API failed.')
 425 | 
 426 | def sync_sendmsg(text, chat_id, reply_to_message_id=None):
 427 |     global LOG_Q
 428 |     text = text.strip()
 429 |     if not text:
 430 |         logging.warning('Empty message ignored: %s, %s' % (chat_id, reply_to_message_id))
 431 |         return
 432 |     logging.info('sendMessage(%s): %s' % (len(text), text[:20]))
 433 |     if len(text) > 2000:
 434 |         text = text[:1999] + '…'
 435 |     reply_id = reply_to_message_id
 436 |     if reply_to_message_id and reply_to_message_id < 0:
 437 |         reply_id = None
 438 |     m = bot_api('sendMessage', chat_id=chat_id, text=text, reply_to_message_id=reply_id)
 439 |     if chat_id == -CFG['groupid']:
 440 |         MSG_CACHE[m['message_id']] = m
 441 |         # IRC messages
 442 |         if reply_to_message_id is not None:
 443 |             LOG_Q.put(m)
 444 |             irc_send(text, reply_to_message_id)
 445 |     return m
 446 | 
# sendmsg is sync_sendmsg wrapped by async_func (defined elsewhere in this
# file). NOTE(review): presumably the wrapper runs the call in the background
# and does not hand back the sent message -- confirm before using its result.
sendmsg = async_func(sync_sendmsg)
 448 | 
 449 | #@async_func
 450 | def forward(message_id, chat_id, reply_to_message_id=None):
 451 |     global LOG_Q
 452 |     logging.info('forwardMessage: %r' % message_id)
 453 |     try:
 454 |         if message_id < 0:
 455 |             raise ValueError('Invalid message id')
 456 |         r = bot_api('forwardMessage', chat_id=chat_id, from_chat_id=-CFG['groupid'], message_id=message_id)
 457 |         logging.debug('Forwarded: %s' % message_id)
 458 |     except (ValueError, BotAPIFailed) as ex:
 459 |         m = db_getmsg(message_id)
 460 |         if m:
 461 |             r = sendmsg('[%s] %s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(m[4] + CFG['timezone'] * 3600)), db_getufname(m[1]), m[2]), chat_id, reply_to_message_id)
 462 |             logging.debug('Manually forwarded: %s' % message_id)
 463 |     if chat_id == -CFG['groupid']:
 464 |         LOG_Q.put(r)
 465 |         irc_send(forward_message_id=message_id)
 466 | 
 467 | #@async_func
 468 | def forwardmulti(message_ids, chat_id, reply_to_message_id=None):
 469 |     failed = False
 470 |     message_ids = tuple(message_ids)
 471 |     for message_id in message_ids:
 472 |         logging.info('forwardMessage: %r' % message_id)
 473 |         try:
 474 |             if message_id < 0:
 475 |                 raise ValueError('Invalid message id')
 476 |             r = bot_api('forwardMessage', chat_id=chat_id, from_chat_id=-CFG['groupid'], message_id=message_id)
 477 |             logging.debug('Forwarded: %s' % message_id)
 478 |             if chat_id == -CFG['groupid']:
 479 |                 LOG_Q.put(r)
 480 |         except (ValueError, BotAPIFailed) as ex:
 481 |             failed = True
 482 |             break
 483 |     if failed:
 484 |         forwardmulti_t(message_ids, chat_id, reply_to_message_id)
 485 |         logging.debug('Manually forwarded: %s' % (message_ids,))
 486 |     elif chat_id == -CFG['groupid']:
 487 |         for message_id in message_ids:
 488 |             irc_send(forward_message_id=message_id)
 489 | 
 490 | #@async_func
 491 | def forwardmulti_t(message_ids, chat_id, reply_to_message_id=None):
 492 |     text = []
 493 |     for message_id in message_ids:
 494 |         m = db_getmsg(message_id)
 495 |         if m:
 496 |             text.append('[%s] %s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(m[4] + CFG['timezone'] * 3600)), db_getufname(m[1]), m[2]))
 497 |     sendmsg('\n'.join(text) or 'Message(s) not found.', chat_id, reply_to_message_id)
 498 | 
 499 | @async_func
 500 | def typing(chat_id):
 501 |     logging.info('sendChatAction: %r' % chat_id)
 502 |     bot_api('sendChatAction', chat_id=chat_id, action='typing')
 503 | 
 504 | def getfile(file_id):
 505 |     logging.info('getFile: %r' % file_id)
 506 |     return bot_api('getFile', file_id=file_id)
 507 | 
 508 | def retrieve(url, filename, raisestatus=True):
 509 |     # NOTE the stream=True parameter
 510 |     r = requests.get(url, stream=True)
 511 |     if raisestatus:
 512 |         r.raise_for_status()
 513 |     with open(filename, 'wb') as f:
 514 |         for chunk in r.iter_content(chunk_size=1024):
 515 |             if chunk: # filter out keep-alive new chunks
 516 |                 f.write(chunk)
 517 |         f.flush()
 518 |     return r.status_code
 519 | 
 520 | #def extract_tag(s):
 521 |     #words = []
 522 |     #tags = []
 523 |     #for frag in s.split():
 524 |         #if frag[0] == '#':
 525 |             ## Should simulate Telegram behavior
 526 |             #tags.append(frag[1:])
 527 |             #words.extend(jieba.cut(frag[1:]))
 528 |         #elif frag[0] == '@':
 529 |             #pass
 530 |         #else:
 531 |             #words.extend(jieba.cut(frag))
 532 |     ## counting frequency in a short sentence makes no sense
 533 |     #return (words, set(tags))
 534 | 
 535 | def daystart(sec=None):
 536 |     if not sec:
 537 |         sec = time.time()
 538 |     return (sec + CFG["timezone"]*3600)//86400 * 86400 - CFG["timezone"]*3600
 539 | 
 540 | def uniq(seq): # Dave Kirby
 541 |     # Order preserving
 542 |     seen = set()
 543 |     return [x for x in seq if x not in seen and not seen.add(x)]
 544 | 
 545 | def classify(msg):
 546 |     '''
 547 |     Classify message type:
 548 | 
 549 |     - Command: (0)
 550 |             All messages that start with a slash ‘/’ (see Commands above)
 551 |             Messages that @mention the bot by username
 552 |             Replies to the bot's own messages
 553 | 
 554 |     - Group message (1)
 555 |     - IRC message (2)
 556 |     - new_chat_participant (3)
 557 |     - Ignored message (10)
 558 |     - Invalid calling (-1)
 559 |     '''
 560 |     chat = msg['chat']
 561 |     text = msg.get('text', '').strip()
 562 |     if text:
 563 |         if text[0] in "/'" or ('@' + CFG['botname']) in text:
 564 |             return 0
 565 |         elif 'first_name' in chat:
 566 |             return 0
 567 |         else:
 568 |             reply = msg.get('reply_to_message')
 569 |             if reply and reply['from']['id'] == CFG['botid']:
 570 |                 return 0
 571 | 
 572 |     # If not enabled, there won't be this kind of msg
 573 |     ircu = msg.get('_ircuser')
 574 |     if ircu and ircu != CFG['ircnick']:
 575 |         return 2
 576 | 
 577 |     if 'title' in chat:
 578 |         # Group chat
 579 |         if chat['id'] == -CFG['groupid']:
 580 |             if msg['from']['id'] == CFG['botid']:
 581 |                 return 10
 582 |             elif 'new_chat_participant' in msg:
 583 |                 return 3
 584 |             else:
 585 |                 return 1
 586 |         else:
 587 |             return 10
 588 |     else:
 589 |         return -1
 590 | 
 591 | def command(text, chatid, replyid, msg):
 592 |     try:
 593 |         t = text.strip().split(' ')
 594 |         if not t:
 595 |             return
 596 |         if t[0][0] in "/'":
 597 |             cmd = t[0][1:].lower().replace('@' + CFG['botname'], '')
 598 |             if cmd in COMMANDS:
 599 |                 if chatid > 0 or chatid == -CFG['groupid'] or cmd in PUBLIC:
 600 |                     expr = ' '.join(t[1:]).strip()
 601 |                     logging.info('Command: /%s %s' % (cmd, expr[:20]))
 602 |                     COMMANDS[cmd](expr, chatid, replyid, msg)
 603 |                 elif chatid < 0 and chatid != -CFG['groupid'] and cmd not in PUBLIC:
 604 |                     sendmsg('This command is not available for this group. Send /help for available commands.', chatid, replyid)
 605 |             elif chatid > 0:
 606 |                 sendmsg('Invalid command. Send /help for help.', chatid, replyid)
 607 |         # 233333
 608 |         #elif all(n.isdigit() for n in t):
 609 |             #COMMANDS['m'](' '.join(t), chatid, replyid, msg)
 610 |         elif chatid > 0:
 611 |             t = ' '.join(t).strip()
 612 |             logging.info('Reply: ' + t[:20])
 613 |             COMMANDS['reply'](t, chatid, replyid, msg)
 614 |     except Exception:
 615 |         logging.exception('Excute command failed.')
 616 | 
def processmsg():
    '''
    Take one update from MSG_Q and act on it according to classify().

    Only updates that contain a 'message' are handled. The message is cached
    in MSG_CACHE, optionally relayed with irc_forward(), then logged and/or
    answered depending on its class. At the end at most one pending item from
    LOG_Q is written to the log.
    '''
    # NOTE(review): presumably blocks until an update is available -- MSG_Q's
    # type is not visible here.
    d = MSG_Q.get()
    logging.debug('Msg arrived: %r' % d)
    if 'message' in d:
        msg = d['message']
        # Replace non-breaking spaces; a caption stands in for missing text.
        if 'text' in msg:
            msg['text'] = msg['text'].replace('\xa0', ' ')
        elif 'caption' in msg:
            msg['text'] = msg['caption'].replace('\xa0', ' ')
        MSG_CACHE[msg['message_id']] = msg
        cls = classify(msg)
        logging.debug('Classified as: %s', cls)
        # Messages of the configured group are relayed when 't2i' is set.
        if msg['chat']['id'] == -CFG['groupid'] and CFG.get('t2i'):
            irc_forward(msg)
        if cls == 0:
            # Command: logged only when it was sent in the configured group.
            if msg['chat']['id'] == -CFG['groupid']:
                logmsg(msg)
            rid = msg['message_id']
            # A command carrying '_ircuser' is first echoed into the chat;
            # the command's answer then replies to that echo.
            if CFG.get('i2t') and '_ircuser' in msg:
                rid = sync_sendmsg('[%s] %s' % (msg['_ircuser'], msg['text']), msg['chat']['id'])['message_id']
            command(msg['text'], msg['chat']['id'], rid, msg)
        elif cls == 1:
            # Ordinary group message.
            logmsg(msg)
        elif cls == 2:
            # IRC message: log it and, with 'i2t' set, repeat it in the chat.
            logmsg(msg)
            if CFG.get('i2t'):
                # Text matching re_ircaction is shown as "** nick action **".
                act = re_ircaction.match(msg['text'])
                if act:
                    sendmsg('** %s %s **' % (msg['_ircuser'], act.group(1)), msg['chat']['id'])
                else:
                    sendmsg('[%s] %s' % (msg['_ircuser'], msg['text']), msg['chat']['id'])
        elif cls == 3:
            # New chat participant: log and run the welcome command.
            logmsg(msg)
            cmd__welcome('', msg['chat']['id'], msg['message_id'], msg)
        elif cls == -1:
            sendmsg('Wrong usage', msg['chat']['id'], msg['message_id'])
        # Bracket auto-closing applies to non-forwarded group/IRC messages.
        if cls in (1, 2) and CFG.get('autoclose') and 'forward_from' not in msg:
            autoclose(msg)
        # Log at most one queued outgoing message per processed update.
        try:
            logmsg(LOG_Q.get_nowait())
        except queue.Empty:
            pass
 659 | 
 660 | def cachemedia(msg):
 661 |     '''
 662 |     Download specified media if not exist.
 663 |     '''
 664 |     mt = msg.keys() & frozenset(('audio', 'document', 'sticker', 'video', 'voice'))
 665 |     file_ext = ''
 666 |     if mt:
 667 |         mt = mt.pop()
 668 |         file_id = msg[mt]['file_id']
 669 |         file_size = msg[mt].get('file_size')
 670 |         if mt == 'sticker':
 671 |             file_ext = '.webp'
 672 |     elif 'photo' in msg:
 673 |         photo = max(msg['photo'], key=lambda x: x['width'])
 674 |         file_id = photo['file_id']
 675 |         file_size = photo.get('file_size')
 676 |         file_ext = '.jpg'
 677 |     fp = getfile(file_id)
 678 |     file_size = fp.get('file_size') or file_size
 679 |     file_path = fp.get('file_path')
 680 |     if not file_path:
 681 |         raise BotAPIFailed("can't get file_path for " + file_id)
 682 |     file_ext = os.path.splitext(file_path)[1] or file_ext
 683 |     cachename = file_id + file_ext
 684 |     fpath = os.path.join(CFG['cachepath'], cachename)
 685 |     try:
 686 |         if os.path.isfile(fpath) and os.path.getsize(fpath) == file_size:
 687 |             return (cachename, 304)
 688 |     except Exception:
 689 |         pass
 690 |     return (cachename, retrieve(URL_FILE + file_path, fpath))
 691 | 
 692 | def timestring_a(seconds):
 693 |     m, s = divmod(seconds, 60)
 694 |     h, m = divmod(m, 60)
 695 |     return '%d:%02d:%02d' % (h, m, s)
 696 | 
 697 | def servemedia(msg):
 698 |     '''
 699 |     Reply type and link of media. This only generates links for photos.
 700 |     '''
 701 |     keys = tuple(msg.keys() & MEDIA_TYPES)
 702 |     if not keys:
 703 |         return ''
 704 |     ret = '<%s>' % keys[0]
 705 |     if 'photo' in msg:
 706 |         servemode = CFG.get('servemedia')
 707 |         if servemode:
 708 |             fname, code = cachemedia(msg)
 709 |             if servemode == 'self':
 710 |                 ret += ' %s%s' % (CFG['serveurl'], fname)
 711 |             elif servemode == 'vim-cn':
 712 |                 r = requests.post('http://img.vim-cn.com/', files={'name': open(os.path.join(CFG['cachepath'], fname), 'rb')})
 713 |                 ret += ' ' + r.text
 714 |     elif 'sticker' in msg:
 715 |         if CFG.get('servemedia') == 'self':
 716 |             fname, code = cachemedia(msg)
 717 |             ret += ' %s%s' % (CFG['serveurl'], fname)
 718 |         if msg['sticker'].get('emoji'):
 719 |             ret = msg['sticker']['emoji'] + ' ' + ret
 720 |     elif 'document' in msg:
 721 |         ret += ' %s' % (msg['document'].get('file_name', ''))
 722 |         if CFG.get('servemedia') == 'self' and msg['document'].get('file_size', 0) <= CFG.get('servemaxsize', 1048576):
 723 |             fname, code = cachemedia(msg)
 724 |             ret += ' %s%s' % (CFG['serveurl'], fname)
 725 |     elif 'video' in msg:
 726 |         ret += ' ' + timestring_a(msg['video'].get('duration', 0))
 727 |         if CFG.get('servemedia') == 'self' and msg['video'].get('file_size', 0) <= CFG.get('servemaxsize', 1048576):
 728 |             fname, code = cachemedia(msg)
 729 |             ret += ' %s%s' % (CFG['serveurl'], fname)
 730 |     elif 'voice' in msg:
 731 |         ret += ' ' + timestring_a(msg['voice'].get('duration', 0))
 732 |         if CFG.get('servemedia') == 'self' and msg['voice'].get('file_size', 0) <= CFG.get('servemaxsize', 1048576):
 733 |             fname, code = cachemedia(msg)
 734 |             ret += ' %s%s' % (CFG['serveurl'], fname)
 735 |     elif 'new_chat_title' in msg:
 736 |         ret += ' ' + msg['new_chat_title']
 737 |     return ret
 738 | 
 739 | def autoclose(msg):
 740 |     openbrckt = ('([{（［｛⦅〚⦃“‘‹«「〈《【〔⦗『〖〘｢⟦⟨⟪⟮⟬⌈⌊⦇⦉❛❝❨❪❴❬❮❰❲'
 741 |                  '⏜⎴⏞〝︵⏠﹁﹃︹︻︗︿︽﹇︷〈⦑⧼﹙﹛﹝⁽₍⦋⦍⦏⁅⸢⸤⟅⦓⦕⸦⸨｟⧘⧚⸜⸌⸂⸄⸉᚛༺༼')
 742 |     clozbrckt = (')]}）］｝⦆〛⦄”’›»」〉》】〕⦘』〗〙｣⟧⟩⟫⟯⟭⌉⌋⦈⦊❜❞❩❫❵❭❯❱❳'
 743 |                  '⏝⎵⏟〞︶⏡﹂﹄︺︼︘﹀︾﹈︸〉⦒⧽﹚﹜﹞⁾₎⦌⦎⦐⁆⸣⸥⟆⦔⦖⸧⸩｠⧙⧛⸝⸍⸃⸅⸊᚜༻༽')
 744 |     stack = []
 745 |     for ch in msg.get('text', ''):
 746 |         index = openbrckt.find(ch)
 747 |         if index >= 0:
 748 |             stack.append(index)
 749 |             continue
 750 |         index = clozbrckt.find(ch)
 751 |         if index >= 0:
 752 |             if stack and stack[-1] == index:
 753 |                 stack.pop()
 754 |     closed = ''.join(reversed(tuple(map(clozbrckt.__getitem__, stack))))
 755 |     if closed:
 756 |         if len(closed) > 20:
 757 |             closed = closed[:20] + '…'
 758 |         sendmsg(closed, msg['chat']['id'], msg['message_id'])
 759 | 
 760 | def db_adduser(d):
 761 |     user = (d['id'], d.get('username'), d.get('first_name'), d.get('last_name'))
 762 |     conn.execute('REPLACE INTO users (id, username, first_name, last_name) VALUES (?, ?, ?, ?)', user)
 763 |     USER_CACHE[d['id']] = (d.get('username'), d.get('first_name'), d.get('last_name'))
 764 |     return user
 765 | 
 766 | def db_getuser(uid):
 767 |     r = USER_CACHE.get(uid)
 768 |     if r is None:
 769 |         r = conn.execute('SELECT username, first_name, last_name FROM users WHERE id = ?', (uid,)).fetchone() or (None, None, None)
 770 |         USER_CACHE[uid] = r
 771 |     return r
 772 | 
 773 | def db_getufname(uid):
 774 |     name, last = db_getuser(uid)[1:]
 775 |     if last:
 776 |         name += ' ' + last
 777 |     return name
 778 | 
 779 | def dc_getufname(user, maxlen=100):
 780 |     USER_CACHE[user['id']] = (user.get('username'), user.get('first_name'), user.get('last_name'))
 781 |     name = user['first_name']
 782 |     if 'last_name' in user:
 783 |         name += ' ' + user['last_name']
 784 |     if len(name) > maxlen:
 785 |         name = name[:maxlen] + '…'
 786 |     return name
 787 | 
 788 | def smartname(user, db=False, limit=20):
 789 |     if db:
 790 |         first, last = db_getuser(user)[1:]
 791 |     else:
 792 |         USER_CACHE[user['id']] = (user.get('username'), user.get('first_name'), user.get('last_name'))
 793 |         first, last = user.get('first_name', ''), user.get('last_name', '')
 794 |     if not first:
 795 |         return '<%s>' % 'Unknown'[:limit-2]
 796 |     pn = first
 797 |     if last:
 798 |         pn += ' ' + last
 799 |     if len(pn) > limit:
 800 |         if len(first) > limit:
 801 |             return first.split(None, 1)[0][:limit]
 802 |         else:
 803 |             return first[:limit]
 804 |     else:
 805 |         return pn
 806 | 
 807 | @functools.lru_cache(maxsize=10)
 808 | def db_getmsg(mid):
 809 |     return conn.execute('SELECT * FROM messages WHERE id = ?', (mid,)).fetchone()
 810 | 
 811 | @functools.lru_cache(maxsize=10)
 812 | def db_getuidbyname(username):
 813 |     if username.startswith('#'):
 814 |         try:
 815 |             return int(username[1:])
 816 |         except ValueError:
 817 |             return None
 818 |     else:
 819 |         uid = conn.execute('SELECT id FROM users WHERE username LIKE ?', (username,)).fetchone()
 820 |         if uid:
 821 |             return uid[0]
 822 | 
 823 | 
 824 | def logmsg(d, iorignore=False):
 825 |     src = db_adduser(d['from'])[0]
 826 |     text = d.get('text') or d.get('caption', '')
 827 |     media = {k:d[k] for k in EXT_MEDIA_TYPES.intersection(d.keys())}
 828 |     fwd_src = db_adduser(d['forward_from'])[0] if 'forward_from' in d else None
 829 |     reply_id = d['reply_to_message']['message_id'] if 'reply_to_message' in d else None
 830 |     into = 'INSERT OR IGNORE INTO' if iorignore else 'REPLACE INTO'
 831 |     conn.execute(into + ' messages (id, src, text, media, date, fwd_src, fwd_date, reply_id) VALUES (?,?,?,?, ?,?,?,?)',
 832 |                  (d['message_id'], src, text, json.dumps(media) if media else None, d['date'], fwd_src, d.get('forward_date'), reply_id))
 833 |     logging.info('Logged %s: %s', d['message_id'], d.get('text', '')[:15])
 834 |     db.commit()
 835 | 
 836 | ### Commands
 837 | 
 838 | def cmd_getmsg(expr, chatid, replyid, msg):
 839 |     '''/m <message_id> [...] Get specified message(s) by ID(s).'''
 840 |     try:
 841 |         if not expr:
 842 |             if 'reply_to_message' in msg:
 843 |                 sendmsg('Message ID: %d' % msg['reply_to_message']['message_id'], chatid, replyid)
 844 |             else:
 845 |                 raise ValueError
 846 |         mids = tuple(map(int, expr.split()))
 847 |     except Exception:
 848 |         sendmsg('Syntax error. Usage: ' + cmd_getmsg.__doc__, chatid, replyid)
 849 |         return
 850 |     forwardmulti(mids, chatid, replyid)
 851 | 
 852 | def cmd_context(expr, chatid, replyid, msg):
 853 |     '''/context <message_id> [number=2] Show the specified message and its context. max=10'''
 854 |     expr = expr.split(' ')
 855 |     try:
 856 |         if len(expr) > 1:
 857 |             mid = max(int(expr[0]), 1)
 858 |             limit = max(min(int(expr[1]), 10), 1)
 859 |         else:
 860 |             mid, limit = int(expr[0]), 2
 861 |     except Exception:
 862 |         sendmsg('Syntax error. Usage: ' + cmd_context.__doc__, chatid, replyid)
 863 |         return
 864 |     typing(chatid)
 865 |     forwardmulti_t(range(mid - limit, mid + limit + 1), chatid, replyid)
 866 | 
 867 | def cmd_quote(expr, chatid, replyid, msg):
 868 |     '''/quote Send a today's random message.'''
 869 |     typing(chatid)
 870 |     sec = daystart()
 871 |     msg = conn.execute('SELECT id FROM messages WHERE date >= ? AND date < ? ORDER BY RANDOM() LIMIT 1', (sec, sec + 86400)).fetchone()
 872 |     if msg is None:
 873 |         msg = conn.execute('SELECT id FROM messages ORDER BY RANDOM() LIMIT 1').fetchone()
 874 |     #forwardmulti((msg[0]-1, msg[0], msg[0]+1), chatid, replyid)
 875 |     forward(msg[0], chatid, replyid)
 876 | 
 877 | def ellipsisresult(s, find, maxctx=50):
 878 |     if find:
 879 |         try:
 880 |             lnid = s.lower().index(find.lower())
 881 |             r = s[max(0, lnid - maxctx):min(len(s), lnid + maxctx)].strip()
 882 |             if len(r) < len(s):
 883 |                 r = '… %s …' % r
 884 |             return r
 885 |         except ValueError:
 886 |             return s
 887 |     else:
 888 |         return s
 889 | 
# Paging argument of /search: "<number>" or "<number>,<offset>".
# The pattern is not anchored at the end, so .match() also accepts a token
# that merely starts with digits.
re_search_number = re.compile(r'([0-9]+)(,[0-9]+)?')
 891 | 
 892 | def cmd_search(expr, chatid, replyid, msg):
 893 |     '''/search|/s [@username] [keyword] [number=5|number,offset] Search the group log for recent messages. max(number)=20'''
 894 |     username, uid, limit, offset = None, None, 5, 0
 895 |     if expr:
 896 |         expr = expr.split(' ')
 897 |         if len(expr) > 1:
 898 |             ma = re_search_number.match(expr[-1])
 899 |             if ma:
 900 |                 expr = expr[:-1]
 901 |                 limit = max(min(int(ma.group(1)), 20), 1)
 902 |                 offset = int(ma.group(2)[1:]) if ma.group(2) else 0
 903 |         if expr[0][0] == '@':
 904 |             username = expr[0][1:]
 905 |             keyword = ' '.join(expr[1:])
 906 |         else:
 907 |             keyword = ' '.join(expr)
 908 |     else:
 909 |         keyword = ''
 910 |     if username:
 911 |         uid = db_getuidbyname(username)
 912 |     typing(chatid)
 913 |     if uid is None:
 914 |         keyword = ' '.join(expr)
 915 |         sqr = conn.execute("SELECT id, src, text, date FROM messages WHERE text LIKE ? ORDER BY date DESC LIMIT ? OFFSET ?", ('%' + keyword + '%', limit, offset)).fetchall()
 916 |     else:
 917 |         sqr = conn.execute("SELECT id, src, text, date FROM messages WHERE src = ? AND text LIKE ? ORDER BY date DESC LIMIT ? OFFSET ?", (uid, '%' + keyword + '%', limit, offset)).fetchall()
 918 |     result = []
 919 |     for mid, fr, text, date in sqr:
 920 |         text = ellipsisresult(text, keyword)
 921 |         if len(text) > 100:
 922 |             text = text[:100] + '…'
 923 |         if uid:
 924 |             result.append('[%d|%s] %s' % (mid, time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(date + CFG['timezone'] * 3600)), text))
 925 |         else:
 926 |             result.append('[%d|%s] %s: %s' % (mid, time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(date + CFG['timezone'] * 3600)), db_getufname(fr), text))
 927 |     sendmsg('\n'.join(result) or 'Found nothing.', chatid, replyid)
 928 | 
 929 | def cmd_mention(expr, chatid, replyid, msg):
 930 |     '''/mention Show last mention of you.'''
 931 |     if msg['chat']['id'] != -CFG['groupid']:
 932 |         sendmsg("This command can't be used in this chat.", chatid, replyid)
 933 |         return
 934 |     uid = msg['from']['id']
 935 |     user = db_getuser(uid)
 936 |     if user[0]:
 937 |         res = conn.execute("SELECT * FROM messages WHERE (text LIKE ? OR reply_id IN (SELECT id FROM messages WHERE src = ?)) AND src != ? ORDER BY date DESC LIMIT 1", ('%@' + user[0] + '%', uid, CFG['botid'])).fetchone()
 938 |         userat = '@' + user[0] + ' '
 939 |     else:
 940 |         res = conn.execute("SELECT * FROM messages WHERE reply_id IN (SELECT id FROM messages WHERE src = ?) AND src != ? ORDER BY date DESC LIMIT 1", (uid, CFG['botid'])).fetchone()
 941 |         userat = ''
 942 |     if res:
 943 |         reid = res[0]
 944 |         if reid > 0:
 945 |             sendmsg(userat + 'You were mentioned in this message.', chatid, reid)
 946 |         else:
 947 |             forward(reid, chatid, replyid)
 948 |     else:
 949 |         sendmsg('No mention found.', chatid, replyid)
 950 | 
 951 | def timestring(minutes):
 952 |     h, m = divmod(minutes, 60)
 953 |     d, h = divmod(h, 24)
 954 |     return (' %d 天' % d if d else '') + (' %d 小时' % h if h else '') + (' %d 分钟' % m if m else '')
 955 | 
 956 | def cmd_uinfo(expr, chatid, replyid, msg):
 957 |     '''/user|/uinfo [@username] [minutes=1440] Show information about <@username>.'''
 958 |     if 'reply_to_message' in msg:
 959 |         uid = msg['reply_to_message']['from']['id']
 960 |     else:
 961 |         uid = None
 962 |     if expr:
 963 |         expr = expr.split(' ')
 964 |         username = expr[0]
 965 |         if not username.startswith('@'):
 966 |             uid = uid or msg['from']['id']
 967 |             try:
 968 |                 minutes = min(max(int(expr[0]), 1), 3359733)
 969 |             except Exception:
 970 |                 minutes = 1440
 971 |         else:
 972 |             uid = db_getuidbyname(username[1:])
 973 |             if not uid:
 974 |                 sendmsg('User not found.', chatid, replyid)
 975 |                 return
 976 |             try:
 977 |                 minutes = min(max(int(expr[1]), 1), 3359733)
 978 |             except Exception:
 979 |                 minutes = 1440
 980 |     else:
 981 |         uid = uid or msg['from']['id']
 982 |         minutes = 1440
 983 |     user = db_getuser(uid)
 984 |     uinfoln = []
 985 |     if user[0]:
 986 |         uinfoln.append('@' + user[0])
 987 |     uinfoln.append(db_getufname(uid))
 988 |     uinfoln.append('ID: %s' % uid)
 989 |     result = [', '.join(uinfoln)]
 990 |     if msg['chat']['id'] == -CFG['groupid']:
 991 |         r = conn.execute('SELECT src FROM messages WHERE date > ?', (time.time() - minutes * 60,)).fetchall()
 992 |         timestr = timestring(minutes)
 993 |         if r:
 994 |             ctr = collections.Counter(i[0] for i in r)
 995 |             if uid in ctr:
 996 |                 rank = sorted(ctr, key=ctr.__getitem__, reverse=True).index(uid) + 1
 997 |                 result.append('在最近%s内发了 %s 条消息，占 %.2f%%，位列第 %s。' % (timestr, ctr[uid], ctr[uid]/len(r)*100, rank))
 998 |             else:
 999 |                 result.append('在最近%s内没发消息。' % timestr)
1000 |         else:
1001 |             result.append('在最近%s内没发消息。' % timestr)
1002 |     sendmsg('\n'.join(result), chatid, replyid)
1003 | 
def cmd_stat(expr, chatid, replyid, msg):
    '''/stat [minutes=1440] Show statistics.'''
    # Reports the message count of the period, the five most active senders
    # and a summary line for the rest.
    try:
        minutes = min(max(int(expr), 1), 3359733)
    except Exception:
        minutes = 1440
    rows = conn.execute('SELECT src FROM messages WHERE date > ?', (time.time() - minutes * 60,)).fetchall()
    timestr = timestring(minutes)
    if not rows:
        sendmsg('在最近%s内无消息。' % timestr, chatid, replyid)
        return
    ctr = collections.Counter(row[0] for row in rows)
    top = ctr.most_common(5)
    total = len(rows)
    lines = ['在最近%s内有 %s 条消息，平均每分钟 %.2f 条。' % (timestr, total, total/minutes)]
    for sender, sent in top:
        lines.append('%s: %s 条，%.2f%%' % (db_getufname(sender), sent, sent/total*100))
    top_total = sum(sent for _, sent in top)
    lines.append('其他用户 %s 条，人均 %.2f 条' % (total - top_total, total / len(ctr)))
    sendmsg('\n'.join(lines), chatid, replyid)
1022 | 
def cmd_digest(expr, chatid, replyid, msg):
    # Placeholder command: only tells the user that it does nothing yet.
    notice = 'Not implemented.'
    sendmsg(notice, chatid, replyid)
1025 | 
def cmd_calc(expr, chatid, replyid, msg):
    '''/calc <expr> Calculate <expr>.'''
    # The docstring above doubles as the usage text sent to the user.
    if not expr:
        sendmsg('Syntax error. Usage: ' + cmd_calc.__doc__, chatid, replyid)
        return
    runapptask('calc', (expr,), (chatid, replyid))
1032 | 
def cmd_py(expr, chatid, replyid, msg):
    '''/py <expr> Evaluate Python 2 expression <expr>.'''
    # The docstring above doubles as the usage text sent to the user.
    if not expr:
        sendmsg('Syntax error. Usage: ' + cmd_py.__doc__, chatid, replyid)
        return
    # Expressions longer than 1000 characters are refused.
    if len(expr) > 1000:
        sendmsg('Expression too long.', chatid, replyid)
        return
    runapptask('py', (expr,), (chatid, replyid))
1042 | 
def cmd_bf(expr, chatid, replyid, msg):
    '''/bf <expr> [|<input>] Evaluate Brainf*ck expression <expr> (with <input>).'''
    # The docstring above doubles as the usage text sent to the user.
    if not expr:
        sendmsg('Syntax error. Usage: ' + cmd_bf.__doc__, chatid, replyid)
        return
    # Everything after the first "|" is the program's input.
    code, _, inpt = expr.partition('|')
    runapptask('bf', (code, inpt), (chatid, replyid))
1051 | 
def cmd_lisp(expr, chatid, replyid, msg):
    '''/lisp <expr> Evaluate Lisp(Scheme)-like expression <expr>.'''
    # The docstring above doubles as the usage text sent to the user.
    if not expr:
        sendmsg('Syntax error. Usage: ' + cmd_lisp.__doc__, chatid, replyid)
        return
    runapptask('lisp', (expr,), (chatid, replyid))
1058 | 
def cmd_name(expr, chatid, replyid, msg):
    '''/name [pinyin] Get a Chinese name.'''
    # The argument is optional, so it is passed on even when empty.
    task_args = (expr,)
    reply_target = (chatid, replyid)
    runapptask('name', task_args, reply_target)
1062 | 
def cmd_cc(expr, chatid, replyid, msg):
    '''/cc <Chinese> Simplified-Traditional Chinese conversion.'''
    # Input is the argument, or else the text of the replied-to message.
    tinput = ''
    if 'reply_to_message' in msg:
        tinput = msg['reply_to_message'].get('text', '')
    tinput = (expr or tinput).strip()
    if not tinput:
        # Same handling of missing input as /ime, /cut and /wyw.
        sendmsg('Syntax error. Usage: ' + cmd_cc.__doc__, chatid, replyid)
        return
    runapptask('cc', (tinput,), (chatid, replyid))
1070 | 
def cmd_ime(expr, chatid, replyid, msg):
    '''/ime [pinyin] Simple Pinyin IME.'''
    # Input is the argument, or else the text of the replied-to message.
    if 'reply_to_message' in msg:
        quoted = msg['reply_to_message'].get('text', '')
    else:
        quoted = ''
    tinput = (expr or quoted).strip()
    # Input is capped at 200 characters.
    if len(tinput) > 200:
        tinput = tinput[:200] + '…'
    if tinput:
        runapptask('ime', (tinput,), (chatid, replyid))
    else:
        sendmsg('Syntax error. Usage: ' + cmd_ime.__doc__, chatid, replyid)
1083 | 
def cmd_cut(expr, chatid, replyid, msg):
    '''/cut [c|m] <something> Segment <something>.'''
    # An optional leading "c" or "m" selects the language and is removed
    # from the text to segment.
    flag = expr[:2].strip()
    lang = flag if flag in ('c', 'm') else None
    if lang is not None:
        expr = expr[2:]
    # Input is the argument, or else the text of the replied-to message.
    if 'reply_to_message' in msg:
        quoted = msg['reply_to_message'].get('text', '')
    else:
        quoted = ''
    tinput = (expr or quoted).strip()
    # Input is capped at 1000 characters.
    if len(tinput) > 1000:
        tinput = tinput[:1000] + '……'
    if tinput:
        runapptask('cut', (tinput, lang), (chatid, replyid))
    else:
        sendmsg('Syntax error. Usage: ' + cmd_cut.__doc__, chatid, replyid)
1104 | 
def cmd_wyw(expr, chatid, replyid, msg):
    '''/wyw [c|m] <something> Translate something to or from classical Chinese.'''
    # An optional leading "c" or "m" fixes the direction and is removed from
    # the text to translate; without it the direction is left to the task.
    lang = {'c': 'c2m', 'm': 'm2c'}.get(expr[:2].strip())
    if lang is not None:
        expr = expr[2:]
    # Input is the argument, or else the text of the replied-to message.
    if 'reply_to_message' in msg:
        quoted = msg['reply_to_message'].get('text', '')
    else:
        quoted = ''
    tinput = (expr or quoted).strip()
    # Input is capped at 1000 characters.
    if len(tinput) > 1000:
        tinput = tinput[:1000] + '……'
    if not tinput:
        sendmsg('Syntax error. Usage: ' + cmd_wyw.__doc__, chatid, replyid)
        return
    typing(chatid)
    runapptask('wyw', (tinput, lang), (chatid, replyid))
1126 | 
def cmd_say(expr, chatid, replyid, msg):
    '''/say Say something interesting.'''
    #typing(chatid)
    # With an argument this behaves like the 'reply' task on that text;
    # without one it runs the argument-less 'say' task.
    task, args = ('reply', (expr,)) if expr else ('say', ())
    runapptask(task, args, (chatid, replyid))
1134 | 
def cmd_mgw(expr, chatid, replyid, msg):
    # No docstring on purpose of keeping behaviour: cmd_help lists every
    # command that has a __doc__.
    # Only served in chats whose id is not negative.
    if chatid >= 0:
        runapptask('mgw', (), (chatid, replyid))
1139 | 
def cmd_reply(expr, chatid, replyid, msg):
    '''/reply [question] Reply to the conversation.'''
    # Forwarded commands are ignored in chats with a negative id.
    if 'forward_from' in msg and msg['chat']['id'] < 0:
        return
    typing(chatid)
    # Source priority: explicit argument, then the replied-to message,
    # then the two most recent logged messages.
    text = expr.strip()
    if not text and 'reply_to_message' in msg:
        text = msg['reply_to_message'].get('text', '')
    if not text:
        rows = conn.execute("SELECT text FROM messages ORDER BY date DESC LIMIT 2").fetchall()
        # A row's text can be NULL (None), which str.join cannot handle;
        # skip such rows instead of raising TypeError.
        text = ' '.join(row[0] for row in rows if row[0])
    runapptask('reply', (text.replace('\n', ' '),), (chatid, replyid))
1150 | 
def cmd_cont(expr, chatid, replyid, msg):
    '''/cont [sentence] Complete the sentence.'''
    # Forwarded commands are ignored in chats with a negative id.
    if 'forward_from' in msg and msg['chat']['id'] < 0:
        return
    typing(chatid)
    # Source priority: explicit argument, then the replied-to message,
    # then the most recent logged message.
    text = expr.strip()
    if not text and 'reply_to_message' in msg:
        text = msg['reply_to_message'].get('text', '')
    if not text:
        row = conn.execute("SELECT text FROM messages ORDER BY date DESC LIMIT 1").fetchone()
        # fetchone() returns None on an empty table and the text column
        # may be NULL; fall back to an empty string in both cases.
        text = (row[0] if row else None) or ''
    runapptask('cont', (text.replace('\n', ' '),), (chatid, replyid))
1161 | 
def cmd_echo(expr, chatid, replyid, msg):
    '''/echo Parrot back.'''
    if 'ping' in expr.lower():
        answer = 'pong'
    else:
        # Echo the argument; an empty argument is answered with 'ping'.
        answer = expr or 'ping'
    sendmsg(answer, chatid, replyid)
1170 | 
def cmd_do(expr, chatid, replyid, msg):
    # Keep this function docstring-free unless it should show up in /help:
    # cmd_help lists every command that has a __doc__.
    #
    # Resolution order for the argument:
    #   1. a named canned action (case-insensitive),
    #   2. 'help' -> list of action names,
    #   3. a Unicode character name -> the character,
    #   4. a single character -> its Unicode name.
    actions = collections.OrderedDict((
        ('shrug', '¯\\_(ツ)_/¯'),
        ('lenny', '( ͡° ͜ʖ ͡°)'),
        ('flip', '（╯°□°）╯︵ ┻━┻'),
        ('homo', '┌（┌　＾o＾）┐'),
        ('look', 'ಠ_ಠ'),
        ('cn', '[citation needed]'),
        ('boom', '💥'),
        ('tweet', '🐦'),
        ('blink', '👀'),
        ('see-no-evil', '🙈'),
        ('hear-no-evil', '🙉'),
        ('speak-no-evil', '🙊'),
        ('however', ('不要怪我们没有警告过你\n我们都有不顺利的时候\n'
                     'Something happened\n这真是让人尴尬\n'
                     '请坐和放宽，滚回以前的版本\n这就是你的人生\n是的，你的人生'))
    ))
    key = expr.lower()
    res = actions.get(key)
    if res:
        sendmsg(res, chatid, replyid)
        return
    if key == 'help':
        sendmsg(', '.join(actions.keys()), chatid, replyid)
        return
    try:
        res = unicodedata.lookup(key)
    except KeyError:
        pass
    else:
        sendmsg(res, chatid, replyid)
        return
    # Use the argument as typed, not the lower-cased key: lower-casing
    # would report e.g. 'A' as LATIN SMALL LETTER A.
    if len(expr) == 1:
        try:
            res = unicodedata.name(expr)
        except ValueError:
            res = 'Character not found in Unicode %s' % unicodedata.unidata_version
        sendmsg(res, chatid, replyid)
    else:
        sendmsg('Something happened.', chatid, replyid)
1210 | 
def cmd_t2i(expr, chatid, replyid, msg):
    # Switch Telegram -> IRC forwarding: 'on' / 'off' set the state
    # explicitly, anything else toggles it.  Only honoured when the
    # message comes from the configured group.
    if msg['chat']['id'] != -CFG['groupid']:
        return
    if expr == 'off':
        enable = False
    elif expr == 'on':
        # Previously an explicit 'on' while already enabled fell into the
        # "disable" branch; an explicit request now always wins.
        enable = True
    else:
        enable = not CFG.get('t2i')
    CFG['t2i'] = enable
    if enable:
        sendmsg('Telegram to IRC forwarding enabled.', chatid, replyid)
    else:
        sendmsg('Telegram to IRC forwarding disabled.', chatid, replyid)
1220 | 
def cmd_i2t(expr, chatid, replyid, msg):
    # Switch IRC -> Telegram forwarding: 'on' / 'off' set the state
    # explicitly, anything else toggles it.  Only honoured when the
    # message comes from the configured group.
    if msg['chat']['id'] != -CFG['groupid']:
        return
    if expr == 'off':
        enable = False
    elif expr == 'on':
        # Previously an explicit 'on' while already enabled fell into the
        # "disable" branch; an explicit request now always wins.
        enable = True
    else:
        enable = not CFG.get('i2t')
    CFG['i2t'] = enable
    if enable:
        sendmsg('IRC to Telegram forwarding enabled.', chatid, replyid)
    else:
        sendmsg('IRC to Telegram forwarding disabled.', chatid, replyid)
1230 | 
def cmd_autoclose(expr, chatid, replyid, msg):
    global CFG
    # Flip the 'autoclose' flag; only honoured in the configured group.
    if msg['chat']['id'] != -CFG['groupid']:
        return
    enabled = not CFG.get('autoclose')
    CFG['autoclose'] = enabled
    notice = 'Auto closing brackets enabled.' if enabled else 'Auto closing brackets disabled.'
    sendmsg(notice, chatid, replyid)
1240 | 
def cmd_cancel(expr, chatid, replyid, msg):
    '''/cancel Hide keyboard and interrupt current session.'''
    # reply_markup is passed as a pre-serialised JSON string.
    params = {
        'chat_id': chatid,
        'text': 'Cancelled.',
        'reply_to_message_id': replyid,
        'reply_markup': '{"hide_keyboard": true}',
    }
    bot_api('sendMessage', **params)
1244 | 
def cmd__cmd(expr, chatid, replyid, msg):
    global SAY_P, APP_P
    # Maintenance commands; refused in chats with a negative id.
    if chatid < 0:
        return
    if expr == 'commit':
        # Log everything still waiting in LOG_Q, then commit the database.
        while True:
            try:
                pending = LOG_Q.get_nowait()
            except queue.Empty:
                break
            logmsg(pending)
        db.commit()
        sendmsg('DB committed.', chatid, replyid)
        logging.info('DB committed upon user request.')
    elif expr == 'killserver':
        # Replace the helper process with a fresh one.
        APP_P.terminate()
        APP_P = subprocess.Popen(APP_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        checkappproc()
        sendmsg('Server restarted.', chatid, replyid)
        logging.info('Server restarted upon user request.')
    #elif expr == 'raiseex':  # For debug
        #async_func(_raise_ex)(Exception('/_cmd raiseex'))
    #else:
        #sendmsg('ping', chatid, replyid)
1268 | 
def cmd__welcome(expr, chatid, replyid, msg):
    # Greets a new participant; only in the configured group.
    if chatid == -CFG['groupid']:
        newcomer = msg["new_chat_participant"]
        # Remember (username, first_name, last_name) for later lookups.
        USER_CACHE[newcomer["id"]] = tuple(
            newcomer.get(field) for field in ("username", "first_name", "last_name"))
        sendmsg('欢迎 %s 加入本群！' % dc_getufname(newcomer), chatid, replyid)
1275 | 
def facescore(x, y):
    # 100 * Phi((x - y/2) / (sqrt(y)/2)): the normal CDF with mean y/2 and
    # standard deviation sqrt(y)/2, evaluated at x and scaled to a percentage.
    return 1/2*math.erfc((0.5*y-x)/(2**0.5*(0.5*y**0.5)))*100


# facescore for every count 0..100 out of 100.
fstable = [facescore(count, 100) for count in range(101)]


def revface(x):
    # Index of the fstable entry closest to x (lowest index on ties).
    return min((abs(x-score), idx) for idx, score in enumerate(fstable))[1]
1280 | 
def cmd_233(expr, chatid, replyid, msg):
    # Sends `num` random moon faces (1..100, default 1) laid out in rows of
    # ceil(sqrt(num)), followed by statistics for larger grids.
    try:
        num = max(min(int(expr), 100), 1)
    except Exception:
        num = 1
    w = math.ceil(num ** .5)
    h, rem = divmod(num, w)
    rows = [''.join(srandom.choice('🌝🌚') for _col in range(w)) for _row in range(h)]
    txt = '\n'.join(rows)
    if rem:
        # Partial last row.
        txt += '\n' + ''.join(srandom.choice('🌝🌚') for _col in range(rem))
    # Count before the summary line is appended (it contains a 🌝 too).
    wcount = txt.count('🌝')
    if num > 9:
        summary = '(🌝%d/🌚%d' % (wcount, num - wcount)
        if num > 41:
            summary += ', 🌝%.2f%%' % facescore(wcount, num)
        txt += '\n' + summary + ')'
    sendmsg(txt, chatid, replyid)
1298 | 
def cmd_fig(expr, chatid, replyid, msg):
    '''/fig <char> Make figure out of moon faces.'''
    if not expr:
        # Nothing to draw: answer with one random moon face.
        sendmsg(srandom.choice('🌝🌚'), chatid, replyid)
        return
    runapptask('fig', (expr,), (chatid, replyid))
1305 | 
def cmd_start(expr, chatid, replyid, msg):
    # The introduction is not sent in the configured group itself.
    if chatid == -CFG['groupid']:
        return
    sendmsg('This is Orz Digger. It can help you search the long and boring chat log of the ##Orz group.\nSend me /help for help.', chatid, replyid)
1309 | 
def cmd_help(expr, chatid, replyid, msg):
    '''/help Show usage.'''
    if expr:
        # Help for one specific command.
        if expr not in COMMANDS:
            answer = 'Command not found.'
        else:
            answer = COMMANDS[expr].__doc__ or ('Help is not available for ' + expr)
    elif chatid == -CFG['groupid']:
        answer = 'Full help disabled in this group.'
    elif chatid > 0:
        # Positive chat id: every documented command.
        answer = '\n'.join(uniq(cmd.__doc__ for cmd in COMMANDS.values() if cmd.__doc__))
    else:
        # Other chats: documented commands that are also listed in PUBLIC.
        answer = '\n'.join(uniq(cmd.__doc__ for cmdname, cmd in COMMANDS.items() if cmd.__doc__ and cmdname in PUBLIC))
    sendmsg(answer, chatid, replyid)
1327 | 
def sig_commit(signum, frame):
    # Signal handler: commit the database and log which signal asked for it.
    db.commit()
    logging.info('DB committed upon signal %s', signum)
1331 | 
# Command name -> handler.  Usage text lives in each handler's docstring:
# cmd_help shows it, and handlers without one are left out of the listing.
COMMANDS = collections.OrderedDict([
    ('m', cmd_getmsg),
    ('context', cmd_context),
    ('s', cmd_search),
    ('search', cmd_search),
    ('mention', cmd_mention),
    ('user', cmd_uinfo),
    ('uinfo', cmd_uinfo),
    ('digest', cmd_digest),
    ('stat', cmd_stat),
    ('calc', cmd_calc),
    #('calc', cmd_py),
    ('py', cmd_py),
    ('bf', cmd_bf),
    ('lisp', cmd_lisp),
    ('name', cmd_name),
    ('ime', cmd_ime),
    ('fig', cmd_fig),
    ('cc', cmd_cc),
    ('quote', cmd_quote),
    ('wyw', cmd_wyw),
    ('cut', cmd_cut),
    ('mgw', cmd_mgw),
    ('say', cmd_say),
    ('reply', cmd_reply),
    #('cont', cmd_cont),
    #('echo', cmd_echo),
    ('do', cmd_do),
    ('t2i', cmd_t2i),
    ('i2t', cmd_i2t),
    ('autoclose', cmd_autoclose),
    ('233', cmd_233),
    ('start', cmd_start),
    ('help', cmd_help),
    ('cancel', cmd_cancel),
    ('_cmd', cmd__cmd),
])

# Command names that cmd_help lists outside private chats.
PUBLIC = {
    'user',
    'calc',
    'py',
    'bf',
    'lisp',
    'name',
    'ime',
    'fig',
    'cc',
    'wyw',
    'cut',
    'say',
    'reply',
    #'cont',
    #'echo',
    'do',
    '233',
    'start',
    'cancel',
    'help',
}
1393 | 
srandom = random.SystemRandom()

# Offsets persisted in the config table (rows 0 and 1); the defaults apply
# when the rows do not exist yet.
OFFSET = conn.execute('SELECT val FROM config WHERE id = 0').fetchone()
OFFSET = OFFSET[0] if OFFSET else 0
IRCOFFSET = conn.execute('SELECT val FROM config WHERE id = 1').fetchone()
IRCOFFSET = IRCOFFSET[0] if IRCOFFSET else -1000000
USER_CACHE = LRUCache(20)
MSG_CACHE = LRUCache(10)
# Close the config file as soon as it has been parsed instead of leaving
# the handle to the garbage collector.
with open('config.json') as _cfg_file:
    CFG = json.load(_cfg_file)
URL = 'https://api.telegram.org/bot%s/' % CFG['token']
URL_FILE = 'https://api.telegram.org/file/bot%s/' % CFG['token']

# Initialize messages in database

#importdb('telegram-history.db')
#importupdates(OFFSET, 2000)
#importfixservice('telegram-history.db')
#sys.exit(0)

# SIGUSR1 triggers a database commit (see sig_commit).
signal.signal(signal.SIGUSR1, sig_commit)

MSG_Q = queue.Queue()
LOG_Q = queue.Queue()
APP_TASK = {}
APP_LCK = threading.Lock()
# Helper process, talked to over its stdin/stdout pipes.
APP_CMD = ('python3', 'appserve.py')
APP_P = subprocess.Popen(APP_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
executor = concurrent.futures.ThreadPoolExecutor(10)

# Background daemon threads: they do not keep the process alive on exit.
pollthr = threading.Thread(target=getupdates)
pollthr.daemon = True
pollthr.start()

appthr = threading.Thread(target=getappresult)
appthr.daemon = True
appthr.start()

# The IRC side is only started when an IRC server is configured.
ircconn = None
if 'ircserver' in CFG:
    checkircconn()
    ircthr = threading.Thread(target=getircupd)
    ircthr.daemon = True
    ircthr.start()

# fx233es = fparser.Parser(numtype='decimal')

logging.info('Satellite launched.')
1441 | 
try:
    # Main loop: a failure while handling one message is logged and the
    # loop carries on with the next one.
    while 1:
        try:
            processmsg()
        except Exception:
            logging.exception('Failed to process a message.')
finally:
    # Shutdown: log what is still queued, persist offsets and config,
    # commit, and stop the helper process.
    while 1:
        try:
            logmsg(LOG_Q.get_nowait())
        except queue.Empty:
            break
    conn.execute('REPLACE INTO config (id, val) VALUES (0, ?)', (OFFSET,))
    conn.execute('REPLACE INTO config (id, val) VALUES (1, ?)', (IRCOFFSET,))
    # Use a context manager so the rewritten config is flushed and closed
    # deterministically rather than whenever the handle is collected.
    with open('config.json', 'w') as _cfg_file:
        json.dump(CFG, _cfg_file, sort_keys=True, indent=4)
    db.commit()
    APP_P.terminate()
    logging.info('Shut down cleanly.')
1461 | 


--------------------------------------------------------------------------------
/config.sample.json:
--------------------------------------------------------------------------------
1 | {
2 | 	"token": "",
3 | 	"botname": "",
4 | 	"botid": 123456789,
5 | 	"groupid": 12345678,
6 | 	"groupname": "",
7 | 	"timezone": 8
8 | }
9 | 


--------------------------------------------------------------------------------
/digest.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import os
  5 | import re
  6 | import sys
  7 | import time
  8 | import math
  9 | import json
 10 | import shutil
 11 | import sqlite3
 12 | import operator
 13 | import itertools
 14 | import collections
 15 | 
 16 | import jinja2
 17 | import truecaser
 18 | 
 19 | #import jieba
 20 | from vendor import mosesproxy as jieba
 21 | from vendor import zhconv
 22 | 
 23 | NAME = '##Orz'
 24 | TITLE = '##Orz 分部喵'
 25 | TIMEZONE = 8 * 3600
 26 | CUTWINDOW = (0 * 3600, 6 * 3600)
 27 | LINKWINDOW = 120
 28 | CHUNKINTERV = 120
 29 | 
 30 | CFG = json.load(open('config.json'))
 31 | db = sqlite3.connect('chatlog.db')
 32 | conn = db.cursor()
 33 | 
 34 | USER_CACHE = {}
 35 | 
# A run of word characters.
re_word = re.compile(r"\w+", re.UNICODE)
# A hashtag: '#' followed by word characters.
re_tag = re.compile(r"#\w+", re.UNICODE)
# An @-mention: '@', one ASCII letter, then 4 to 31 letters, digits or '_'.
re_at = re.compile('@[A-Za-z][A-Za-z0-9_]{4,31}')
# An http(s) URL (group 2) that follows the start of the text or one of a
# set of whitespace/punctuation characters (group 1) and is followed by the
# end of the text or a delimiter (lookahead, not consumed).
re_url = re.compile(r"(^|[\s.:;?\-\]<\(])(https?://[-\w;/?:@&=+$\|\_.!~*\|'()\[\]%#,]+[\w/#](\(\))?)(?=$|[\s',\|\(\).:;?\-\[\]>\)])")
# A whole message of the form \x01ACTION ...\x01; group 1 is the payload.
re_ircaction = re.compile('^\x01ACTION (.*)\x01$')
# Fetches item 1 of a sequence; used as a sort key / projection.
_ig1 = operator.itemgetter(1)

# Media key -> label shown in the statistics page.
MEDIA_TYPES = {
'text': '文本',
'audio': '声音',
'document': '文件',
'photo': '图片',
'sticker': '贴纸',
'video': '视频',
'voice': '语音',
'contact': '名片',
'location': '位置',
'service': '服务'
}

# Media keys that StatComposer counts under the 'service' type.
SERVICE = frozenset(('new_chat_participant', 'left_chat_participant', 'new_chat_title', 'new_chat_photo', 'delete_chat_photo', 'group_chat_created'))
 57 | 
 58 | def daystart(sec=None):
 59 |     if not sec:
 60 |         sec = time.time()
 61 |     return int((sec + TIMEZONE) // 86400 * 86400 - TIMEZONE)
 62 | 
 63 | def uniq(seq, key=None): # Dave Kirby
 64 |     # Order preserving
 65 |     seen = set()
 66 |     if key:
 67 |         return [x for x in seq if key(x) not in seen and not seen.add(key(x))]
 68 |     else:
 69 |         return [x for x in seq if x not in seen and not seen.add(x)]
 70 | 
 71 | def db_getuser(uid):
 72 |     r = USER_CACHE.get(uid)
 73 |     if r is None:
 74 |         r = conn.execute('SELECT username, first_name, last_name FROM users WHERE id = ?', (uid,)).fetchone() or (None, None, None)
 75 |         USER_CACHE[uid] = r
 76 |     return r
 77 | 
 78 | def db_isbot(uid):
 79 |     return (db_getuser(uid)[0] or '').lower().endswith('bot')
 80 | 
 81 | def db_getufname(uid, mmedia=None):
 82 |     if uid == CFG['ircbotid']:
 83 |         if mmedia and '_ircuser' in mmedia:
 84 |             return mmedia['_ircuser']
 85 |         else:
 86 |             return '<IRC 用户>'
 87 |     else:
 88 |         name, last = db_getuser(uid)[1:]
 89 |         if last:
 90 |             name += ' ' + last
 91 |         return name or '<未知>'
 92 | 
 93 | def db_getfirstname(uid, mmedia=None):
 94 |     if uid == CFG['ircbotid']:
 95 |         if mmedia and '_ircuser' in mmedia:
 96 |             return mmedia['_ircuser']
 97 |         else:
 98 |             return '<IRC 用户>'
 99 |     else:
100 |         fn = db_getufname(uid)
101 |         return fn.split()[0]
102 | 
def strftime(fmt, t=None):
    '''
    Format timestamp `t` (default: now) with `fmt`, shifted by TIMEZONE.
    '''
    stamp = time.time() if t is None else t
    return time.strftime(fmt, time.gmtime(stamp + TIMEZONE))
108 | 
def getwday(t=None):
    '''
    Return the Chinese weekday name of timestamp `t` (default: now),
    shifted by TIMEZONE.
    '''
    names = ('周一','周二','周三','周四','周五','周六','周日')
    stamp = time.time() if t is None else t
    return names[time.gmtime(stamp + TIMEZONE).tm_wday]
114 | 
def stripreaction(text):
    '''
    Return the payload of an IRC "\\x01ACTION ...\\x01" message, or `text`
    unchanged when it is not one.
    '''
    action = re_ircaction.match(text)
    return action.group(1) if action else text
121 | 
class DirectWeightedGraph:
    '''
    Weighted directed graph with an iterative, damped ranking of its nodes.
    '''
    # Damping factor: each score is (1 - d) + d * (weighted neighbour sum).
    d = 0.85

    def __init__(self):
        # node -> list of (neighbour, weight)
        self.graph = collections.defaultdict(list)

    def add_edge(self, start, end, weight):
        '''Record an edge from `start` to `end` carrying `weight`.'''
        self.graph[start].append((end, weight))

    def rank(self):
        '''
        Return a mapping node -> score for every node that has outgoing
        edges.  Scores are rescaled so that the best node gets 1.0.
        An empty graph yields an empty mapping.
        '''
        ws = collections.defaultdict(float)
        outSum = collections.defaultdict(float)

        # Every node starts with the same score.
        wsdef = 1.0 / (len(self.graph) or 1.0)
        for n, out in self.graph.items():
            ws[n] = wsdef
            outSum[n] = sum((e[1] for e in out), 0.0)

        # this line for build stable iteration
        sorted_keys = sorted(self.graph.keys())
        for x in range(10):  # 10 iters
            for n in sorted_keys:
                s = 0
                for e in self.graph[n]:
                    # Neighbours without outgoing edges contribute nothing.
                    if outSum[e[0]] and ws[e[0]]:
                        s += e[1] / outSum[e[0]] * ws[e[0]]
                ws[n] = (1 - self.d) + self.d * s

        if not ws:
            return ws

        # Take the true extremes.  The former if/elif scan skipped the
        # maximum update whenever a value lowered the minimum, so scores
        # arriving in decreasing order left max_rank at its tiny initial
        # value and produced negative normalised weights.
        min_rank = min(ws.values())
        max_rank = max(ws.values())

        for n, w in ws.items():
            # to unify the weights, don't *100.
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

        return ws
163 | 
164 | 
class DigestComposer:
    '''
    Collects the messages of one day and renders them with the digest
    template (keywords and top messages per busy period, title changes,
    general statistics).
    '''

    def __init__(self, date):
        '''
        Load the truecasing dictionary and stop words, then fetch and index
        the messages of the day containing timestamp `date`.

        Raises ValueError (from fetchmsg) when no message is found.
        '''
        self.template = 'digest.html'
        self.date = date
        self.title = ''
        # Both resource files are closed as soon as they have been read.
        with open('vendor/truecase.txt', 'rb') as tcfile:
            self.tc = truecaser.Truecaser(truecaser.loaddict(tcfile))
        with open('vendor/stopwords.txt', 'r', encoding='utf-8') as swfile:
            self.stopwords = frozenset(map(str.strip, swfile))
        # IRC nicks treated as bots by classify().
        self.ircbots = re.compile(r'(titlbot|varia|Akarin).*')
        self.fetchmsg(date)
        self.msgindex()

    def fetchmsg(self, date):
        '''
        Fetch messages that best fits in a day.

        Messages are read from the start of the opening cut window to the
        end of the closing cut window (the same window one day later).  The
        digest starts at the message that follows the longest silence in
        the opening window and ends at the message that precedes the
        longest silence in the closing window.

        Sets self.start, self.end (timestamps) and self.msgs, an
        OrderedDict mid -> (src, text, date, fwd_src, fwd_date, reply_id,
        media).  Raises ValueError when the range holds no message.
        '''
        start = (daystart(date) + CUTWINDOW[0], daystart(date) + CUTWINDOW[1])
        end = (daystart(date) + 86400 +
               CUTWINDOW[0], daystart(date) + 86400 + CUTWINDOW[1])
        last, lastid = start[0], 0
        msgs = collections.OrderedDict()
        # intervals[0]: (silence before message, that message's id)
        # intervals[1]: (silence before message, previous message's id)
        intervals = ([], [])
        # The row's date is bound to `mdate` so that it does not shadow the
        # `date` parameter.
        for mid, src, text, mdate, fwd_src, fwd_date, reply_id, media in conn.execute('SELECT id, src, text, date, fwd_src, fwd_date, reply_id, media FROM messages WHERE date >= ? AND date < ? ORDER BY date ASC, id ASC', (start[0], end[1])):
            msgs[mid] = (src, text or '', mdate, fwd_src, fwd_date, reply_id, media)
            if start[0] <= mdate < start[1]:
                intervals[0].append((mdate - last, mid))
            elif last < start[1] <= mdate:
                # First message after the opening window.
                intervals[0].append((mdate - last, mid))
            elif end[0] <= mdate < end[1]:
                if last < end[0]:
                    # Only the part of the silence inside the window counts.
                    last = end[0]
                # NOTE(review): lastid is still 0 here when the very first
                # fetched message lies in the closing window; if that entry
                # wins the max() below, msgs[0] is looked up -- confirm.
                intervals[1].append((mdate - last, lastid))
            last = mdate
            lastid = mid
        # Silence between the last message and the end of the range.
        intervals[1].append((end[1] - last, lastid))
        if not msgs:
            raise ValueError('Not enough messages in (%s, %s)' % (start[0], end[1]))
        self.start = startd = msgs[max(intervals[0] or ((0, tuple(msgs.keys())[0]),))[1]][2]
        self.end = endd = msgs[max(intervals[1] or ((0, tuple(msgs.keys())[-1]),))[1]][2]
        self.msgs = collections.OrderedDict(
            filter(lambda x: startd <= x[1][2] <= endd, msgs.items()))

    def msgpreprocess(self, text):
        '''
        Yield the tokens of `text`: a lone '@' token is glued to the token
        that follows it, and stop words (compared in lower case) are
        dropped.
        '''
        at = False
        for t in jieba.cut(text, HMM=False):
            if t == '@':
                at = True
            elif at:
                yield '@' + t
                at = False
            elif t.lower() not in self.stopwords:
                # t.isidentifier() and 
                yield t

    def msgindex(self):
        '''
        Build the per-day indexes:
        self.fwd_lookup  (src, date) -> mid
        self.msgtok      mid -> tuple of tokens
        self.words       lower-cased token -> number of messages containing it
        '''
        self.fwd_lookup = {}
        self.words = collections.Counter()
        self.msgtok = {}
        for mid, value in self.msgs.items():
            src, text, date, fwd_src, fwd_date, reply_id, media = value
            self.fwd_lookup[(src, date)] = mid
            # Tokens come from the text with IRC action wrapper and URLs
            # removed, truecased and converted with 'zh-hans'.
            tok = self.msgtok[mid] = tuple(self.msgpreprocess(zhconv.convert(self.tc.truecase(re_url.sub('', stripreaction(text))), 'zh-hans')))
            for w in frozenset(t.lower() for t in tok):
                self.words[w] += 1
        self.words = dict(self.words)

    def chunker(self):
        '''
        Split the day into chunks of message ids, starting a new chunk
        after every silence longer than CHUNKINTERV seconds.  Returns the
        chunks sorted by size, largest first.
        '''
        results = []
        chunk = []
        last = 0
        for mid, value in self.msgs.items():
            src, text, date, fwd_src, fwd_date, reply_id, media = value
            if date - last > CHUNKINTERV and chunk:
                results.append(chunk)
                chunk = []
            last = date
            chunk.append(mid)
        if chunk:
            results.append(chunk)
        return sorted(results, key=len, reverse=True)

    def tfidf(self, term, text):
        '''
        TF-IDF of `term` within the token sequence `text`: relative
        frequency in `text` times log(messages today / messages containing
        the term).  `text` must not be empty.
        '''
        return text.count(term) / len(text) * math.log(len(self.msgs) / self.words.get(term, 1))

    def tfidf_kwd(self, toks, topK=15):
        '''
        Return up to `topK` tokens of `toks` (longer than one character)
        with the highest TF-IDF, best first.
        '''
        toks = tuple(filter(lambda x: len(x) > 1, toks))
        toklen = len(toks)
        msglen = len(self.msgs)
        # Scores are negated so that an ascending sort puts the best first.
        return tuple(map(_ig1, sorted((-count / toklen * math.log(msglen / self.words.get(term, 1)), term) for term, count in collections.Counter(toks).items())))[:topK]

    def tr_kwd(self, toks, topK=15):
        '''Return keywords of `toks` as produced by jieba.analyse.textrank.'''
        return jieba.analyse.textrank(' '.join(toks), topK, False, ('n', 'ns', 'nr', 'vn', 'v', 'eng'))

    def cosinesimilarity(self, a, b):
        '''
        Cosine similarity of the TF-IDF vectors of messages `a` and `b`
        (message ids); 0 when either vector has zero length.
        '''
        msga = self.msgtok[a]
        msgb = self.msgtok[b]
        vcta = {w:self.tfidf(w.lower(), msga) for w in frozenset(msga)}
        vctb = {w:self.tfidf(w.lower(), msgb) for w in frozenset(msgb)}
        keys = vcta.keys() & vctb.keys()
        ma = sum(i**2 for i in vcta.values())**.5
        mb = sum(i**2 for i in vctb.values())**.5
        return (sum(vcta[i]*vctb[i] for i in keys) /
                ma / mb) if (ma and mb) else 0

    def classify(self, mid):
        '''
        0 - Normal messages sent by users
        1 - Interesting messages sent by the bots
        2 - Boring messages sent by users
        3 - Boring messages sent by the bots
        '''
        src, text, date, fwd_src, fwd_date, reply_id, media = self.msgs[mid]
        if src == CFG['botid']:
            # The bot's own message is interesting only as an answer to
            # /say or /reply.
            repl = self.msgs.get(reply_id)
            if repl and repl[1].startswith(('/say', '/reply')):
                return 1
            else:
                return 3
        elif src == CFG['ircbotid']:
            mmedia = json.loads(media or '{}')
            if self.ircbots.match(mmedia.get('_ircuser', '')):
                return 3
            else:
                return 0
        elif db_isbot(fwd_src) and len(text or '') > 75:
            # Long text forwarded from a bot account.
            return 3
        elif not text or text.startswith('/'):
            return 2
        else:
            return 0

    def hotrank(self, chunk):
        '''
        Rank the messages of `chunk` (a list of ids).  Messages are linked
        to the message they forward or reply to and to every message sent
        less than LINKWINDOW seconds before them, weighted by cosine
        similarity.  Returns (mid, score) pairs, best first.
        '''
        graph = DirectWeightedGraph()
        edges = {}
        similarity = self.cosinesimilarity
        for mid in chunk:
            src, text, date, fwd_src, fwd_date, reply_id, media = self.msgs[mid]
            if self.classify(mid) > 1:
                # Boring messages do not take part.
                continue
            backlink = self.fwd_lookup.get((fwd_src, fwd_date)) or reply_id
            if (backlink in self.msgs and (mid, backlink) not in edges):
                edges[(mid, backlink)] = similarity(mid, backlink)
            for mid2, value2 in self.msgs.items():
                if 0 < date - value2[2] < LINKWINDOW:
                    # Reuse a weight computed for either direction.
                    w = edges.get((mid, mid2)) or edges.get((mid2, mid)) or similarity(mid, mid2)
                    edges[(mid, mid2)] = w
                    edges[(mid2, mid)] = w
        for key, weight in edges.items():
            if weight:
                graph.add_edge(key[0], key[1], weight)
        del edges
        return sorted(graph.rank().items(), key=_ig1, reverse=True)

    def hotchunk(self):
        '''
        For each of the five largest chunks yield (keywords, hotmsg), where
        hotmsg lists up to ten messages as
        (mid, text, src, short sender name, 'HH:MM:SS').
        '''
        for chunk in self.chunker()[:5]:
            kwds = self.tfidf_kwd(itertools.chain.from_iterable(self.msgtok[mid] for mid in chunk if self.classify(mid) < 2))
            hotmsg = []
            wordinmsg = lambda x: re_word.search(self.msgs[x][1])
            # A ranked message that forwards one of today's messages is
            # replaced by the id of that original.
            origin = lambda x: self.fwd_lookup.get(operator.itemgetter(3, 4)(self.msgs[x[0]]), x[0])
            candidates = uniq(filter(wordinmsg, map(origin, self.hotrank(chunk))))
            # Drop repeats of the same (truecased) text; fall back to the
            # chunk itself when ranking produced nothing.
            ranked = uniq(candidates, key=lambda x: self.tc.truecase(self.msgs[x][1])) or list(filter(wordinmsg, chunk)) or chunk
            for mid in (ranked[:10] or chunk[:10]):
                msg = self.msgs[mid]
                text = msg[1]
                if len(text) > 500:
                    text = text[:500] + '…'
                hotmsg.append((mid, stripreaction(text), msg[0], db_getfirstname(msg[0], json.loads(msg[6] or '{}')), strftime('%H:%M:%S', msg[2])))
            yield (kwds, hotmsg)

    def tags(self):
        '''
        Return (tag, [mid, ...]) pairs for the hashtags used today, most
        used first, ties in tag order.
        '''
        tags = collections.defaultdict(list)
        for mid, value in self.msgs.items():
            text = value[1] or ''
            for tag in re_tag.findall(text):
                tags[self.tc.truecase(tag)].append(mid)
        return sorted(tags.items(), key=lambda x: (-len(x[1]), x[0]))

    def tc_preprocess(self):
        '''
        For every title change of the day yield (mid, parts), where the
        concatenation of `parts` is the new title, split along what it has
        in common with the previous titles.
        '''
        titles = []
        for mid, value in self.msgs.items():
            media = json.loads(value[6] or '{}')
            if 'new_chat_title' in media:
                titles.append((mid, media['new_chat_title']))
        if titles:
            prefix = [os.path.commonprefix([text for mid, text in titles])]
        else:
            prefix = [self.title]
        for mid, text in titles:
            # Keep the longest leading run of parts that still prefixes the
            # new title, then append the remainder as a new part.
            for k in range(len(prefix), -1, -1):
                pf = ''.join(prefix[:k])
                if text.startswith(pf):
                    text = text[len(pf):]
                    prefix = prefix[:k]
                    prefix.append(text)
                    break
            yield (mid, prefix)

    def titlechange(self):
        '''
        Yield the title changes as a stream for the template: HTML list
        tags as strings, interleaved with (mid, part) tuples for inherited
        parts and (mid, part, src, short sender name, 'HH:MM:SS') tuples
        for the part a message introduced.
        '''
        last = []
        for mid, prefix in self.tc_preprocess():
            comm = os.path.commonprefix((last, prefix))
            if len(prefix) == len(last) == len(comm) + 1:
                # Same depth, only the last part differs: a sibling item.
                yield '<li>'
                msg = self.msgs[mid]
                yield (mid, prefix[-1], msg[0], db_getfirstname(msg[0]), strftime('%H:%M:%S', msg[2]))
                yield '</li>'
            else:
                # Close the levels no longer shared, then open new ones.
                for k in range(len(last) - len(comm)):
                    yield '</ul>'
                for item in prefix[len(comm):-1]:
                    yield '<ul><li>'
                    yield (mid, item)
                    yield '</li>'
                yield '<ul><li>'
                msg = self.msgs[mid]
                yield (mid, prefix[-1], msg[0], db_getfirstname(msg[0]), strftime('%H:%M:%S', msg[2]))
                yield '</li>'
            last = prefix
        for item in last:
            yield '</ul>'

    def generalinfo(self):
        '''
        Return the statistics dict used by the template: time span, message
        count, messages per minute, the five most active senders with their
        share, top tags, the share of everyone else and the average number
        of messages per sender.
        '''
        ctr = collections.Counter(i[0] for i in self.msgs.values())
        mcomm = ctr.most_common(5)
        count = len(self.msgs)
        others = count - sum(v for k, v in mcomm)
        delta = self.end - self.start
        stat = {
            'start': strftime('%d 日 %H:%M:%S', self.start),
            'end': strftime('%d 日 %H:%M:%S', self.end),
            'count': count,
            'freq': '%.2f' % (count * 60 / delta) if delta else 'N/A',
            'flooder': tuple(((k, db_getufname(k)), v, '%.2f%%' % (v/count*100)) for k, v in mcomm),
            'tags': self.tags()[:6],
            'others': (others, '%.2f%%' % (others/count*100)),
            'avg': '%.2f' % (count / len(ctr))
        }
        return stat

    def render(self):
        '''Render the digest template and return the resulting text.'''
        kvars = {
            'name': NAME,
            'date': strftime('%Y-%m-%d', self.date),
            'wday': getwday(self.date),
            'info': self.generalinfo(),
            'hotchunk': tuple(self.hotchunk()),
            'titlechange': tuple(self.titlechange()),
            'gentime': strftime('%Y-%m-%d %H:%M:%S')
        }
        template = jinja2.Environment(loader=jinja2.FileSystemLoader('templates')).get_template(self.template)
        return template.render(**kvars)
414 | 
class StatComposer:
    """Compose the overall statistics page from every row of ``messages``."""

    def __init__(self):
        self.template = 'stat.html'
        # Close the dictionary file once it has been loaded instead of
        # leaking the handle for the lifetime of the process.
        with open('vendor/truecase.txt', 'rb') as f:
            self.tc = truecaser.Truecaser(truecaser.loaddict(f))

    def fetchmsgstat(self):
        """Scan all messages once and collect the raw counters.

        Sets ``self.msglen`` (number of rows), ``self.start`` and
        ``self.end`` (smallest / largest ``date`` seen; all three stay 0
        when the table is empty).

        Returns a tuple ``(hourctr, types, tags, usrctr)``:

        * ``hourctr`` -- list of 24 message counts indexed by hour of day
          after adding ``TIMEZONE`` to ``date``
          (assumes ``date`` is a timestamp in seconds -- TODO confirm);
        * ``types`` -- ``(MEDIA_TYPES label, 'xx.xx%')`` pairs, most common
          first;
        * ``tags`` -- ``(tag, count)`` pairs for tags seen more than twice,
          sorted by descending count and then by tag;
        * ``usrctr`` -- ``Counter`` of messages per ``src``.
        """
        self.msglen = self.start = self.end = 0
        hourctr = [0] * 24
        mediactr = collections.Counter()
        usrctr = collections.Counter()
        tags = collections.Counter()
        for mid, src, text, date, media in conn.execute('SELECT id, src, text, date, media FROM messages ORDER BY date ASC, id ASC'):
            text = text or ''
            if not self.start:
                self.start = date
            self.start = min(self.start, date)
            # Rows arrive in ascending date order, so this running maximum
            # ends up as the date of the last row.  (The former post-loop
            # ``self.end = date`` raised NameError on an empty table.)
            self.end = max(self.end, date)
            for tag in re_tag.findall(text):
                # Normalise letter case so differently-cased spellings of a
                # tag are counted together.
                tags[self.tc.truecase(tag)] += 1
            media = json.loads(media or '{}')
            mt = media.keys() & MEDIA_TYPES.keys()
            if mt:
                t = tuple(mt)[0]
            elif media.keys() & SERVICE:
                t = 'service'
            else:
                t = 'text'
            hourctr[int(((date + TIMEZONE) // 3600) % 24)] += 1
            mediactr[t] += 1
            usrctr[src] += 1
            self.msglen += 1
        typesum = sum(mediactr.values())
        types = [(MEDIA_TYPES[k], '%.2f%%' % (v * 100 / typesum)) for k, v in mediactr.most_common()]
        tags = sorted(filter(lambda x: x[1] > 2, tags.items()), key=lambda x: (-x[1], x[0]))
        return hourctr, types, tags, usrctr

    def generalinfo(self):
        """Return the ``info`` dictionary consumed by ``stat.html``.

        Ratios whose denominator is zero (no messages, or all messages
        sharing one timestamp) are reported as ``'N/A'`` / ``'0.00%'``
        instead of raising ZeroDivisionError.
        """
        hours, types, tags, usrctr = self.fetchmsgstat()
        hsum = sum(hours)
        hourdist = ['%.2f%%' % (h * 100 / hsum if hsum else 0) for h in hours]
        mcomm = usrctr.most_common()
        count = self.msglen
        delta = self.end - self.start
        stat = {
            'start': strftime('%Y-%m-%d %H:%M:%S', self.start),
            'end': strftime('%Y-%m-%d %H:%M:%S', self.end),
            'count': count,
            'freq': '%.2f' % (count * 60 / delta) if delta else 'N/A',
            # Only users with more than two messages are listed.
            'flooder': tuple(((k, db_getufname(k)), db_getuser(k)[0] or '', '%.2f%%' % (v/count*100)) for k, v in mcomm if v > 2),
            'hours': hourdist,
            'types': types,
            'tags': tags,
            'avg': '%.2f' % (count / len(usrctr)) if usrctr else 'N/A'
        }
        return stat

    def render(self):
        """Render the statistics page and return it as an HTML string."""
        kvars = {
            'name': NAME,
            'info': self.generalinfo(),
            'gentime': strftime('%Y-%m-%d %H:%M:%S')
        }
        template = jinja2.Environment(loader=jinja2.FileSystemLoader('templates')).get_template(self.template)
        return template.render(**kvars)
480 | 
# File names of generated digest pages: ``<year>-<month>-<day>.html``.
# The dot is escaped so that only a literal '.' before ``html`` matches.
re_digest = re.compile(r'^(\d+)-(\d+)-(\d+)\.html$')
482 | 
class DigestManager:
    """Write digest, statistics and index pages into an output directory."""

    def __init__(self, path='.'):
        """Remember the output directory *path* (defaults to the cwd)."""
        self.template = 'index.html'
        self.path = path

    def copyresource(self):
        """Copy static resources from ``templates`` into the output directory."""
        for filename in ('digest.css',):
            src = os.path.join('templates', filename)
            dst = os.path.join(self.path, filename)
            shutil.copyfile(src, dst)
            shutil.copystat(src, dst)

    def writenewdigest(self, date=None, update=False):
        """Write the digest page for the day containing *date*.

        *date* defaults to 24 hours before now.  An existing page is left
        untouched unless *update* is true.  Nothing is written when
        ``DigestComposer`` raises ValueError for that date.
        """
        date = date or (time.time() - 86400)
        filename = os.path.join(self.path, strftime('%Y-%m-%d.html', date))
        if not update and os.path.isfile(filename):
            return
        try:
            dc = DigestComposer(date)
        except ValueError:
            return
        dc.title = TITLE
        # Render before opening the file: a failing render must not leave an
        # empty page behind, which would make later runs skip this date.
        html = dc.render()
        # The templates declare UTF-8, so do not depend on the locale.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def writenewstat(self):
        """Regenerate ``stat.html`` in the output directory."""
        html = StatComposer().render()
        with open(os.path.join(self.path, 'stat.html'), 'w', encoding='utf-8') as f:
            f.write(html)

    def genindex(self):
        """Return ``(filename, label)`` pairs for the digest pages found in
        the output directory, sorted by file name in descending order."""
        index = []
        for filename in sorted(os.listdir(self.path), reverse=True):
            fn = re_digest.match(filename)
            if fn:
                index.append((filename, '%s 年 %s 月 %s 日' % fn.groups()))
        return index

    def render(self):
        """Render the archive index and return it as an HTML string."""
        kvars = {
            'name': NAME,
            'index': self.genindex(),
            'gentime': strftime('%Y-%m-%d %H:%M:%S')
        }
        template = jinja2.Environment(loader=jinja2.FileSystemLoader('templates')).get_template(self.template)
        return template.render(**kvars)

    def writenewindex(self):
        """Regenerate ``index.html`` in the output directory."""
        html = self.render()
        with open(os.path.join(self.path, 'index.html'), 'w', encoding='utf-8') as f:
            f.write(html)
536 | 
537 | 
def argtobool(value):
    """Interpret a command-line flag string as a boolean.

    Empty strings and the usual negative spellings ('0', 'false', 'no',
    'off', case-insensitive) are False; everything else is True.  A plain
    ``bool(value)`` would treat '0' and 'false' as True.
    """
    return value.strip().lower() not in ('', '0', 'false', 'no', 'off')


if __name__ == '__main__':
    # Usage: digest.py [output_dir [days [update]]]

    path = '.'
    version = 1
    update = False

    if len(sys.argv) > 1:
        path = sys.argv[1]
    if len(sys.argv) > 2:
        # Number of past days (counting back from yesterday) to generate.
        version = int(sys.argv[2])
    if len(sys.argv) > 3:
        # Whether existing digest pages are regenerated.
        update = argtobool(sys.argv[3])

    start = time.time()
    dm = DigestManager(path)
    dm.copyresource()
    for i in range(1, version+1):
        dm.writenewdigest(start - 86400 * i, update)
    dm.writenewstat()
    dm.writenewindex()
    sys.stderr.write('Done in %.4gs.\n' % (time.time() - start))
559 | 


--------------------------------------------------------------------------------
/templates/digest.css:
--------------------------------------------------------------------------------
/* Base typography shared by the page body and form controls. */
body,
textarea,
input,
select {
    font-family: Roboto, Arial, sans-serif;
    font-size: 15px;
    line-height:1.25em;
}
/* Utility: animate every property change over 0.2s. */
.smooth {
    transition: all .2s
}
.btn {
    text-decoration: none
}
/* Centered content wrapper, selected by class name. */
.container {
    margin: 0 1.25em;
    max-width: 75em;
    width: auto
}
label>* {
    display: inline
}
/* Vertical spacing between direct children of a form. */
form>* {
    margin-bottom: .625em
}
/* Buttons: grey default, darkening on hover and on active/focus. */
.btn {
    background: #999;
    border-radius: 2px;
    border: 0;
    color: #fff;
    cursor: pointer;
    display: inline-block;
    padding: .4em 1em;
    font-size: 1em;
}
.btn:hover {
    background: #888
}
.btn:active,
.btn:focus {
    background: #777;
    outline: 0
}
/* Colour variant a (blue). */
.btn-a {
    background: #0ae
}
.btn-a:hover,
.btn-a:focus {
    background: #09d
}
.btn-a:active {
    background: #08b
}
/* Colour variant b (green). */
.btn-b {
    background: #3c5
}
.btn-b:hover,
.btn-b:focus {
    background: #2b4
}
.btn-b:active {
    background: #2a4
}
/* Colour variant c (red). */
.btn-c {
    background: #d33
}
.btn-c:hover,
.btn-c:focus {
    background: #c22
}
.btn-c:active {
    background: #b22
}
.btn-sm {
    border-radius: 2px;
}
/* Float-based grid: .row holds floated .col children; .cN sets the width
   to N twelfths of the row. */
.row {
    overflow: auto
}
.col {
    float: left
}
.table,
.c12 {
    width: 100%
}
.c11 {
    width: 91.66%
}
.c10 {
    width: 83.33%
}
.c9 {
    width: 75%
}
.c8 {
    width: 66.66%
}
.c7 {
    width: 58.33%
}
.c6 {
    width: 50%
}
.c5 {
    width: 41.66%
}
.c4 {
    width: 33.33%
}
.c3 {
    width: 25%
}
.c2 {
    width: 16.66%
}
.c1 {
    width: 8.33%
}
/* Form controls. */
fieldset, button {
    margin: 0;
    padding: 0.35em 0 0.75em;
    border: 0;
}
.btn-sm,
.nav {
    font-size: .875em;
}
textarea,
input,
select,
button {
    padding: .2em .3em;
    outline: 0;
    font-size: 100%;
}
textarea,
input,
select {
    border: 1px solid #ccc
}
textarea:focus,
input:focus,
select:focus {
    border-color: #19E
}
textarea,
input[type=text] {
    -webkit-appearance: none;
    width: 13em;
    box-sizing: border-box;
}
/* Narrow viewports: columns of a .row.rmd stack at full width. */
@media(max-width:48em) {
    .row.rmd > .col {
        width: 100%;
        float: none;
    }
}
/*.table th,
.table td {
    padding: .5em;
    text-align: left
}*/
/* Shade odd-numbered children (1st, 3rd, ...) of every table body. */
table tbody>:nth-child(2n-1) {
    background: whitesmoke;
}
/* Highlighted message box. */
.msg {
    padding: 1.5em;
    background: #def;
    border-left: 5px solid #59d
}
/* Page-level styles for the digest, index and statistics pages. */
body {
    margin: .5em 2em;
    background-color: whitesmoke;
}
a {
    color: #425e5e;
    padding: .25em;
}
a:visited {
    color: #233;
}
a:hover, a:focus {
    color: #4e7a7a;
}
header {
    margin-top: 0;
    margin-bottom: 1em;
}
h1 {
    font-size: 1.5em;
    color: #0B3861;
    padding-top: .2em;
}
h2 {
    font-size: 1.25em;
    margin: 0 0 .5em;
}
p {
    margin: 0.5em 0;
}
td {
    text-overflow: ellipsis;
    word-wrap: break-word;
}
/* Small grey annotation shown next to a message. */
.meta {
    padding-left: .5em;
    font-size: .8em;
    color: grey;
    display: inline-block;
    word-wrap: break-word;
}
/* Each section is a white card on the whitesmoke page background. */
section {
    background-color: white;
    padding: 1.5em;
    margin: 1em 0;
    word-wrap: break-word;
    box-sizing: border-box;
}
/* Right-aligned numeric table cell. */
.num {
    text-align: right;
    min-width: 2em;
}
/* Percentage bar; its width is set inline by the templates and the label
   is allowed to overflow a narrow bar without wrapping. */
.bar {
    text-align: left;
    background-color: #81c3ff;
    overflow-x: visible;
    overflow-wrap: normal;
    word-wrap: normal;
}
.info, footer {
    font-size: .9em;
    color: #233;
}
footer {
    text-align: right;
}
#titlechange ul {
    padding-left: 1.2em;
}
.topic {
    border-top: 1px solid #DDD;
    padding: .5em 0;
}
/* Column sizing for the tables on the statistics page. */
#fldrankb {
    width: 100%;
}
.fullname {
    width: 45%;
}
.tag {
    width: 80%;
    word-break: break-all;
}
.username {
    min-width: 3em;
    word-break: break-all;
}
.hour {
    width: 2em;
    font-weight: bold;
    text-align: center;
}
.msgtype {
    width: 3em;
    text-align: center;
}
268 | 


--------------------------------------------------------------------------------
/templates/digest.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html lang="zh-cn">
{# Daily digest page.
   Context: name, date, wday, info, hotchunk, titlechange, gentime.
   info provides start, end, count, freq, avg, flooder, tags and others. #}
{# user(uid, nick): span carrying the user id, with the HTML-escaped nick as text. #}
{% macro user(uid, nick) -%}
<span class="user" data-uid="{{ uid }}">{{ nick|e }}</span>
{%- endmacro %}
{# msgwmeta(mid, text, uid, nick, time): one escaped message; the author and
   time are appended only when uid is truthy. #}
{% macro msgwmeta(mid, text, uid=0, nick='', time='') -%}
<span class="msgwrap" data-mid="{{ mid }}">
<span class="mtext">{{ text|e }}</span>
{% if uid %}<span class="meta">{{ user(uid, nick) }}, {{ time }}</span>
{% endif %}
</span>
{%- endmacro %}
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{{ name }} 日报 - {{ date }}</title>
<link rel="stylesheet" href="digest.css">
</head>
<body>
<div id="container">
<header>
<h1 id="pagetitle">{{ name }} 日报<span class="info"> - {{ date }} {{ wday }}</span></h1>
<div class="info">
    <p>开始：{{ info.start }}，结束：{{ info.end }}；<wbr>
    总计 {{ info.count }} 条，每分钟 {{ info.freq }} 条，人均 {{ info.avg }} 条</p>
</div>
</header>
<div class="row rmd">
{# Each info.flooder entry is ((uid, full name), message count, percentage string);
   info.others is (message count, percentage string) for all remaining users. #}
<section id="flooder" class="col c6">
    <h2>水王榜</h2>
    <table id="fldrank">
    <thead><tr><th>全名</th><th>消息</th><th>占比</th></tr></thead>
    <tbody>
    {% for u in info.flooder %}
    <tr><td>{{ user(*u[0]) }}</td><td class="num">{{ u[1] }}</td>
    <td><div class="bar" style="width: {{ u[2] }}">{{ u[2] }}</div></td></tr>
    {% endfor %}
    <tr>
    <td>&lt;其他用户&gt;</td>
    <td class="num">{{ info.others[0] }}</td>
    <td><div class="bar" style="width: {{ info.others[1] }}">{{ info.others[1] }}</div></td>
    </tr>
    </tbody>
    </table>
</section>
{# Each info.tags entry is (tag, sequence); the sequence is joined into data-mid
   and its length is shown as the count. #}
<section id="tags" class="col c6">
    <h2>标签</h2>
    <table id="tagrank">
    <thead><tr><th>标签</th><th>数量</th></tr></thead>
    <tbody>
    {% for t in info.tags %}
    <tr><td>{{ t[0]|e }}</td><td class="num" data-mid="{{ t[1]|join(',') }}">{{ t[1]|length }}</td></tr>
    {% endfor %}
    </tbody>
    </table>
</section>
</div>
{# Each hotchunk entry is (keywords, messages); every message is an argument
   tuple for msgwmeta. #}
<section id="topics">
    <h2>今日热点</h2>
    {% for chunk in hotchunk -%}
    <div class="topic">
    <p><strong>关键词：</strong><span class="keywords">{{ chunk[0]|join(', ')|e }}</span></p>
    <ol>{% for msg in chunk[1] %}<li>{{ msgwmeta(*msg) }}</li>{% endfor %}
    </ol>
    </div>
    {%- endfor %}
</section>
{# titlechange items are either strings, emitted as-is without escaping, or
   argument tuples for msgwmeta. #}
<section id="titlechange">
    <h2>改名部</h2>
    {% for item in titlechange -%}
    {% if item is string %}{{ item }}
    {% else %}{{ msgwmeta(*item) }}
    {% endif %}
    {%- endfor %}
</section>
<footer>
<a href="index.html">存档</a> - 更新时间：{{ gentime }}
</footer>
</div>
</body>
</html>
82 | 


--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html lang="zh-cn">
{# Archive index page.
   Context: name, index (a sequence of (file name, label) pairs), gentime. #}
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{{ name }} 日报</title>
<link rel="stylesheet" href="digest.css">
</head>
<body>
<div id="container">
<header>
<h1 id="pagetitle">{{ name }} 日报</h1>
</header>
<section id="index">
    <h2>存档</h2>
    <ul>
    {% for item in index %}
    <li><a href="{{ item[0] }}">{{ item[1] }}</a></li>
    {% endfor %}
    </ul>
</section>
<section id="stat">
    <h2><a href="stat.html">统计</a></h2>
</section>
<footer>
更新时间：{{ gentime }}
</footer>
</div>
</body>
</html>
31 | 


--------------------------------------------------------------------------------
/templates/stat.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="zh-cn">
 3 | {% macro user(uid, nick) -%}
 4 | <span class="user" data-uid="{{ uid }}">{{ nick|e }}</span>
 5 | {%- endmacro %}
 6 | <head>
 7 | <meta http-equiv="content-type" content="text/html; charset=UTF-8">
 8 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
 9 | <title>{{ name }} 统计</title>
10 | <link rel="stylesheet" href="digest.css">
11 | </head>
12 | <body>
13 | <div id="container">
14 | <header>
15 | <h1 id="pagetitle">{{ name }} 统计</h1>
16 | <div class="info">
17 |     <p>自 {{ info.start }} 到 {{ info.end }}
18 |     总计 {{ info.count }} 条，每分钟 {{ info.freq }} 条，人均 {{ info.avg }} 条。</p>
19 | </div>
20 | </header>
21 | <section id="flooder">
22 |     <h2>用户发言</h2>
23 |     <table id="fldrankb" class="table">
24 |     <thead><tr><th>排名</th><th>全名</th><th>用户名</th><th>占比</th></tr></thead>
25 |     <tbody>
26 |     {% for u in info.flooder %}
27 |     <tr><td class="num">{{ loop.index }}</td><td class="fullname">{{ user(*u[0]) }}</td><td class="username">{{ u[1] }}</td>
28 |     <td><div class="bar" style="width: {{ u[2] }}">{{ u[2] }}</div></td></tr>
29 |     {% endfor %}
30 |     </tbody>
31 |     </table>
32 | </section>
33 | <div class="row rmd">
34 | <section id="tags" class="col c8">
35 |     <h2>话题标签</h2>
36 |     <table id="typerank">
37 |     <thead><tr><th>标签</th><th>数量</th></tr></thead>
38 |     <tbody>
39 |     {% for t in info.tags %}
40 |     <tr><td class="tag">{{ t[0] }}</td><td class="num">{{ t[1] }}</td></tr>
41 |     {% endfor %}
42 |     <tr><td class="tag">&lt;1~2 略&gt;</td><td class="num"></td></tr>
43 |     </tbody>
44 |     </table>
45 | </section>
46 | <div class="col c4">
47 | <section id="hourdist">
48 |     <h2>时间分布</h2>
49 |     <table id="hdtable" class="table">
50 |     <thead><tr><th>小时</th><th>占比</th></tr></thead>
51 |     <tbody>
52 |     {% for t in info.hours %}
53 |     <tr><td class="hour">{{ loop.index0 }}</td>
54 |     <td><div class="bar" style="width: {{ t }}">{{ t }}</div></td></tr>
55 |     {% endfor %}
56 |     </tbody>
57 |     </table>
58 | </section>
59 | <section id="types">
60 |     <h2>消息类型</h2>
61 |     <table id="typerank" class="table">
62 |     <thead><tr><th>类型</th><th>占比</th></tr></thead>
63 |     <tbody>
64 |     {% for t in info.types %}
65 |     <tr><td class="msgtype">{{ t[0] }}</td>
66 |     <td><div class="bar" style="width: {{ t[1] }}">{{ t[1] }}</div></td></tr>
67 |     {% endfor %}
68 |     </tbody>
69 |     </table>
70 | </section>
71 | </div>
72 | </div>
73 | <footer>
74 | <a href="index.html">存档</a> - 更新时间：{{ gentime }}
75 | </footer>
76 | </div>
77 | </body>
78 | </html>
79 | 


--------------------------------------------------------------------------------
/tools/dbselect.cgi:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os
 5 | import sys
 6 | import cgi
 7 | import json
 8 | import sqlite3
 9 | import calendar
10 | from email.utils import formatdate, parsedate
11 | 
# SQLite database file, given as a relative path.
DB_FILE = 'chatlog.db'

# Modification time of the database file, used below for the
# If-Modified-Since check and the Last-Modified response header.
MTIME = os.path.getmtime(DB_FILE)
# Module-wide connection used by do_query().
CONN = sqlite3.connect(DB_FILE)
16 | 
def auth(sqltype, arg1, arg2, dbname, source):
    """Allow only read-type SQLite actions.

    Returns ``sqlite3.SQLITE_OK`` when *sqltype* is one of SQLITE_READ,
    SQLITE_SELECT or SQLITE_FUNCTION and ``sqlite3.SQLITE_DENY`` for any
    other action code.  The remaining arguments are ignored; the five
    parameters match what sqlite3 passes to an authorizer callback.
    """
    permitted = (sqlite3.SQLITE_READ, sqlite3.SQLITE_SELECT, sqlite3.SQLITE_FUNCTION)
    return sqlite3.SQLITE_OK if sqltype in permitted else sqlite3.SQLITE_DENY
22 | 
def do_query(form):
    """Run the SQL statement from the ``q`` form field.

    Returns ``(status, body)``: ``status`` is the HTTP status text and
    ``body`` the UTF-8 encoded JSON reply.  On success the reply holds the
    column names (``description``) and all result rows (``rows``); any
    failure -- missing field, SQL error, statement rejected by the
    authorizer, rows that are not JSON-serialisable -- yields a 400 reply
    with the error message.
    """
    try:
        sql = form['q'].value
        # The statement comes straight from the request, so enforce the
        # read-only policy in auth() before executing it.  Without the
        # authorizer any statement SQLite accepts would be executed.
        CONN.set_authorizer(auth)
        cur = CONN.cursor()
        cur.execute(sql)
        return '200 OK', json.dumps({
            'ret': 200,
            'description': [desc[0] for desc in cur.description],
            'rows': cur.fetchall()
        }).encode('utf-8')
    except Exception as ex:
        return '400 Bad Request', json.dumps({
            'ret': 400,
            'error': str(ex)
        }).encode('utf-8')
38 | 
form = cgi.FieldStorage()
try:
    # Last-Modified is sent with whole-second resolution, so the value a
    # client echoes back equals the truncated mtime.  Compare against
    # int(MTIME); comparing against the float would never match when the
    # file's mtime has a fractional part.
    if calendar.timegm(parsedate(os.environ['HTTP_IF_MODIFIED_SINCE'])) >= int(MTIME):
        print("Status: 304 Not Modified")
        print()
        sys.exit(0)
except Exception:
    # Header missing or unparsable: fall through and answer the query.
    # (SystemExit is not an Exception subclass, so the exit above works.)
    pass

status, reply = do_query(form)
print("Status: " + status)
print("Content-Type: application/json; charset=utf-8")
print("Content-Length: %d" % len(reply))
print("Last-Modified: " + formatdate(MTIME, False, True))
print("Connection: close")
print()
# Flush the text-mode headers before writing the body to the byte stream.
sys.stdout.flush()
sys.stdout.buffer.write(reply)
sys.stdout.buffer.flush()


--------------------------------------------------------------------------------
/truecaser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import re
 5 | import sys
 6 | import collections
 7 | 
# A run of ASCII letters.  The capturing group makes re_eng.split() keep the
# matched runs in its result alongside the text between them.
re_eng = re.compile('([A-Za-z]+)')
 9 | 
def dumpdict(d, fp):
    """Write *d* to the binary file *fp*, one UTF-8 encoded
    ``key<TAB>value`` line per entry, ordered by key."""
    for key, value in sorted(d.items()):
        line = '%s\t%s\n' % (key, value)
        fp.write(line.encode('utf-8'))
13 | 
def loaddict(fp):
    """Read ``key<TAB>value`` lines from the binary file *fp* into a dict.

    Each line is stripped and decoded as UTF-8; lines that do not split
    into exactly two tab-separated fields are ignored.  When a key occurs
    more than once the last occurrence wins.
    """
    mapping = {}
    for raw in fp:
        fields = raw.strip().decode('utf-8').split('\t')
        if len(fields) != 2:
            continue
        key, value = fields
        mapping[key] = value
    return mapping
21 | 
def train(iterable):
    """Learn the dominant spelling of each word from lines of text.

    Every ASCII-letter token of 2 to 24 characters is counted under its
    lower-cased form.  Words seen more than once in total map to their most
    frequent original spelling; words seen only once are dropped.
    """
    counts = collections.defaultdict(collections.Counter)
    for line in iterable:
        for token in re_eng.split(line):
            if not 1 < len(token) < 25:
                continue
            if re_eng.match(token):
                counts[token.lower()][token] += 1
    model = {}
    for lowered, spellings in counts.items():
        if sum(spellings.values()) > 1:
            model[lowered] = spellings.most_common(1)[0][0]
    return model
34 | 
class Truecaser:
    """Restore letter case using a ``lower-cased word -> spelling`` mapping."""

    def __init__(self, wmap):
        self.wmap = wmap

    def truecase(self, text):
        """Return *text* with every ASCII-letter run that has an entry in
        the mapping (looked up by its lower-cased form) replaced by the
        mapped spelling; all other text is kept unchanged."""
        lookup = self.wmap.get
        return ''.join(lookup(piece.lower(), piece) for piece in re_eng.split(text))
44 | 
if __name__ == '__main__':
    # The last argument is always the dictionary file.  With more than one
    # argument a dictionary is trained from stdin and written to that file;
    # with a single argument stdin is truecased to stdout using it.
    filename = sys.argv[-1]

    if len(sys.argv) > 2:
        d = train(sys.stdin)
        # Close (and thereby flush) the output file deterministically.
        with open(filename, 'wb') as f:
            dumpdict(d, f)
    else:
        with open(filename, 'rb') as f:
            tc = Truecaser(loaddict(f))
        for ln in sys.stdin:
            sys.stdout.write(tc.truecase(ln))
55 | 


--------------------------------------------------------------------------------
/vendor/chinesename.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import sys
  5 | import pickle
  6 | import random
  7 | import bisect
  8 | import operator
  9 | import functools
 10 | import itertools
 11 | from math import log
 12 | from .common_surnames import d as common_surnames
 13 | from .lookuptable import chrevlookup, pinyintrie, surnamerev
 14 | 
 15 | for py in tuple(chrevlookup.keys()):
 16 |     for ch in range(len(py)):
 17 |         frag = py[:ch+1]
 18 |         if frag not in chrevlookup:
 19 |             chrevlookup[frag] = ''
 20 | 
 21 | logtotal = log(sum(len(s) for s in chrevlookup.values()))
 22 | 
 23 | ig1 = operator.itemgetter(1)
 24 | 
 25 | phonetic_symbol = {
 26 | "ā": "a",
 27 | "á": "a",
 28 | "ǎ": "a",
 29 | "à": "a",
 30 | "ē": "e",
 31 | "é": "e",
 32 | "ě": "e",
 33 | "è": "e",
 34 | "ō": "o",
 35 | "ó": "o",
 36 | "ǒ": "o",
 37 | "ò": "o",
 38 | "ī": "i",
 39 | "í": "i",
 40 | "ǐ": "i",
 41 | "ì": "i",
 42 | "ū": "u",
 43 | "ú": "u",
 44 | "ǔ": "u",
 45 | "ù": "u",
 46 | "ü": "v",
 47 | "ǖ": "v",
 48 | "ǘ": "v",
 49 | "ǚ": "v",
 50 | "ǜ": "v",
 51 | "ń": "n",
 52 | "ň": "n",
 53 | "": "m"
 54 | }
 55 | 
 56 | 
 57 | def untone(text):
 58 |     # This is a limited version only for entities defined in xml_escape_table
 59 |     for k, v in phonetic_symbol.items():
 60 |         text = text.replace(k, v)
 61 |     return text
 62 | 
 63 | 
 64 | class WeightedRandomGenerator(object):
 65 | 
 66 |     def __init__(self, weights):
 67 |         self.totals = list(itertools.accumulate(weights))
 68 |         self.total = self.totals[-1]
 69 | 
 70 |     def __iter__(self):
 71 |         return self
 72 | 
 73 |     def __next__(self):
 74 |         rnd = random.random() * self.total
 75 |         return bisect.bisect_right(self.totals, rnd)
 76 | 
 77 |     def __call__(self):
 78 |         return self.__next__()
 79 | 
 80 | 
def _pyword_tokenize(word):
    """Split one unbroken run of letters into fragments found in chrevlookup.

    First records, for every start index, where fragments with a non-empty
    chrevlookup entry end; then a right-to-left pass scores each possible
    split and the best-scoring one is returned as a list of substrings that
    concatenate back to *word*.
    """
    # DAG[k]: end indices i such that word[k:i+1] has a non-empty entry.
    DAG = {}
    N = len(word)
    for k in range(N):
        tmplist = []
        i = k
        frag = word[k]
        # Keep extending while the fragment is still a key; keys whose value
        # is empty are not accepted as fragments themselves.
        while i < N and frag in chrevlookup:
            if chrevlookup[frag]:
                tmplist.append(i)
            i += 1
            frag = word[k:i + 1]
        # Nothing matched: fall back to the single character at k.
        if not tmplist:
            tmplist.append(k)
        DAG[k] = tmplist
    # route[idx] = (best score for word[idx:], end index of its first
    # fragment).  A fragment scores log(number of characters in its entry,
    # or 1 when it has none) minus logtotal; ties go to the larger end index
    # because max() compares the tuples element by element.
    route = {N: (0, 0)}
    for idx in range(N - 1, -1, -1):
        route[idx] = max((log(len(chrevlookup.get(word[idx:x + 1], '')) or 1) -
                          logtotal + route[x + 1][0], x) for x in DAG[idx])
    # Walk the chosen end indices from the start to cut out the fragments.
    result = []
    x = 0
    while x < N:
        y = route[x][1] + 1
        result.append(word[x:y])
        x = y
    return result
107 | 
def pytokenize(s):
    """Tokenize a romanized string: apostrophes become spaces, the text is
    lower-cased and split on whitespace, and each word is split further by
    ``_pyword_tokenize``.  Returns one flat list of fragments."""
    words = s.replace("'", ' ').lower().split()
    return list(itertools.chain.from_iterable(_pyword_tokenize(w) for w in words))
109 | 
def surnamesortkey(n):
    """Sort key that puts surnames with a larger ``common_surnames`` value
    first; surnames missing from the table count as 0.00001."""
    return -common_surnames.get(n, 0.00001)
111 | 
class NameModel(object):
    """Random Chinese name generator and pinyin-to-name guesser.

    The model file unpickles to a pair of mappings ``(firstchar,
    secondchar)`` whose values are used as weights for the first part and
    the remainder of a given name.
    """

    def __init__(self, modelname):
        # NOTE: pickle.load can execute arbitrary code contained in the
        # file; only load model files from a trusted source.
        with open(modelname, 'rb') as f:
            self.firstchar, self.secondchar = pickle.load(f)

        # The '' entry is excluded from the second-part table.  pop() keeps
        # a model without that entry from raising KeyError.
        self.secondchar.pop('', None)
        self.snlst, snprb = tuple(zip(*common_surnames.items()))
        self.fclst, fcprb = tuple(zip(*self.firstchar.items()))
        self.sclst, scprb = tuple(zip(*self.secondchar.items()))
        self.sngen = WeightedRandomGenerator(snprb)
        self.fcgen = WeightedRandomGenerator(fcprb)
        self.scgen = WeightedRandomGenerator(scprb)

    @functools.lru_cache(maxsize=10)
    def initlookup(self, ch):
        """Return the distinct characters reachable through every
        ``pinyintrie`` entry of *ch*, or *ch* itself when it has none."""
        if ch not in pinyintrie:
            return ch
        return ''.join(set(''.join(chrevlookup[p] for p in pinyintrie.get(ch))))

    def lookupsurname(self, pychars):
        """Return candidate surnames for the pinyin tokens *pychars*.

        A lone one-letter token is expanded through ``pinyintrie`` (or
        returned as-is when unknown); anything else is looked up in
        ``surnamerev`` under the space-joined tokens.
        """
        if len(pychars) == 1 and len(pychars[0]) == 1:
            initial = pychars[0]
            if initial in pinyintrie:
                return list(itertools.chain.from_iterable(
                    surnamerev.get(p, ()) for p in pinyintrie[initial]))
            return [initial]
        return surnamerev.get(' '.join(pychars), [])

    def lookupchar(self, ch):
        """Return candidate characters for one pinyin token, falling back
        to its first letter when the full token has no entry."""
        if len(ch) == 1:
            return self.initlookup(ch)
        return chrevlookup.get(ch) or self.initlookup(ch[0])

    def fullnamesortkey(self, n):
        """Sort key for a full name (one-character surname first): the
        negated product of the three weights."""
        return -common_surnames.get(n[0], 0.00001)*self.firstchar.get(n[1])*self.secondchar.get(n[2:])

    def namesortkey(self, n):
        """Sort key for a given name: the negated product of its weights."""
        return -self.firstchar.get(n[0])*self.secondchar.get(n[1:])

    def splitname(self, romanization):
        """Split *romanization* into ``(surnames, name tokens)``.

        The surname is tried as the first word, then the last word, then
        (for more than two words) the first two words.  When no surname is
        recognised the surname list is empty and all tokens are returned
        as the name.
        """
        words = romanization.split()
        tok = name = pytokenize(romanization)
        if not name:
            return [], []
        if len(words) == 1:
            # A single unbroken word: treat each pinyin token as a word.
            words = name
        surnames = self.lookupsurname(pytokenize(words[0]))
        name = pytokenize(' '.join(words[1:]))
        if not surnames:
            surnames = self.lookupsurname(pytokenize(words[-1]))
            name = pytokenize(' '.join(words[:-1]))
            if len(words) > 2 and not surnames:
                surnames = self.lookupsurname(pytokenize(' '.join(words[:2])))
                name = pytokenize(' '.join(words[2:]))
        if surnames:
            surnames = sorted(frozenset(surnames), key=surnamesortkey)
        else:
            name = tok
        return surnames, name

    def _rankchars(self, freqs, syllable):
        """Return ``(char, weight)`` candidates for *syllable*, heaviest
        first.  Characters missing from *freqs* get 1e-10 when their code
        point lies in 0x4E00..0x9FCC and are dropped otherwise."""
        scored = (
            (n, freqs.get(n, 1e-10 if 0x4E00 <= ord(n) < 0x9FCD else 0))
            for n in self.lookupchar(syllable)
        )
        return sorted(filter(ig1, scored), key=ig1, reverse=True)

    def selectname(self, name, num=10):
        """Return up to *num* given names for the pinyin tokens *name*,
        ordered by descending product of the character weights."""
        if not name:
            return []
        # Candidates kept per position after the first.
        evalnum = int(num ** (1/len(name))) + 1
        namechars = [self._rankchars(self.firstchar, name[0])]
        namechars.extend(self._rankchars(self.secondchar, l)[:evalnum] for l in name[1:])
        # Drop positions without candidates and cap the name length at 10.
        namechars = list(filter(None, namechars))[:10]
        if not namechars:
            return []
        candidates = []
        for group in itertools.product(*namechars):
            gz = tuple(zip(*group))
            gname = ''.join(gz[0])
            gfreq = functools.reduce(operator.mul, gz[1])
            candidates.append((gname, gfreq))
        candidates.sort(key=ig1, reverse=True)
        return [x[0] for x in candidates][:num]

    def processinput(self, userinput, num=10):
        """Return ``(surnames, names)`` for a romanized name.

        With empty input the surname list is empty and *num* random full
        names are returned.  When no given name can be derived from the
        input, *num* random given names are returned instead.  Exceptions
        from the lookup helpers propagate to the caller.
        """
        if not userinput:
            return [], [self.getname() for i in range(num)]
        surnames, names = self.splitname(untone(userinput).lower())
        names = self.selectname(names, num=num)
        if not names:
            names = [self.fclst[self.fcgen()] + self.sclst[self.scgen()] for i in range(num)]
        return surnames, names

    def getname(self):
        """Return one random full name: surname + first part + second part."""
        return self.snlst[self.sngen()] + self.fclst[self.fcgen()] + self.sclst[self.scgen()]

    __call__ = getname
191 | 
if __name__ == '__main__':
    # Load the model once; it does not change between iterations.
    nm = NameModel('namemodel.m')
    # Print random full names until interrupted.
    while 1:
        print(nm.getname())
199 | 


--------------------------------------------------------------------------------
/vendor/common_surnames.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
# 2012 data
# Surname -> weight.
# NOTE(review): the keys 章, 路, 盛, 欧 and 阳 each appear twice below.  In a
# dict literal the later entry silently replaces the earlier value, so the
# effective weights are 章 = 0.0011, 路 = 0.0002 and 盛 = 0.0001 -- confirm
# which values are intended.
d = {
'李': 0.0794,
'王': 0.0741,
'张': 0.0707,
'刘': 0.0538,
'陈': 0.0453,
'杨': 0.0308,
'赵': 0.0229,
'黄': 0.0223,
'周': 0.0212,
'吴': 0.0205,
'徐': 0.0173,
'孙': 0.0152,
'胡': 0.0131,
'朱': 0.0126,
'高': 0.0121,
'林': 0.0118,
'何': 0.0117,
'郭': 0.0115,
'马': 0.0105,
'罗': 0.0086,
'梁': 0.0084,
'宋': 0.0081,  # i
'郑': 0.0077,  # i
'谢': 0.0074,  # i
'韩': 0.0071,  # i
'唐': 0.0067,  # i
'冯': 0.0064,
'于': 0.0063,  # i
'董': 0.0061,
'萧': 0.0059,  # i
'程': 0.0057,
'曹': 0.0057,
'袁': 0.0056,  # i
'邓': 0.0054,  # i
'许': 0.0053,  # i
'傅': 0.0051,
'沈': 0.0050,
'曾': 0.0050,  # i
'彭': 0.0049,
'吕': 0.0047,
'苏': 0.0047,  # i
'卢': 0.0047,
'蒋': 0.0047,
'蔡': 0.0046,
'贾': 0.0042,
'丁': 0.0042,
# interpolated
'魏': 0.0042,
'薛': 0.0042,
'叶': 0.0041,
'阎': 0.0040,
'余': 0.0039,
'潘': 0.0039,
'杜': 0.0038,
'戴': 0.0038,
'夏': 0.0037,
'钟': 0.0036,
'汪': 0.0036,
'田': 0.0035,
'任': 0.0034,
'姜': 0.0034,
'范': 0.0033,
'方': 0.0033,
'石': 0.0032,
'姚': 0.0032,
'谭': 0.0031,
'盛': 0.0031,
'邹': 0.0030,
'熊': 0.0030,
'金': 0.0029,
'陆': 0.0029,
'郝': 0.0028,
'孔': 0.0028,
'白': 0.0027,
'崔': 0.0027,
'康': 0.0026,
'毛': 0.0026,
'邱': 0.0025,
'秦': 0.0025,
'江': 0.0024,
'史': 0.0024,
'顾': 0.0024,
'侯': 0.0023,
'邵': 0.0023,
'孟': 0.0022,
'龙': 0.0022,
'万': 0.0022,
'段': 0.0021,
'章': 0.0021,
'钱': 0.0020,
'汤': 0.0020,
'尹': 0.0020,
'黎': 0.0019,
'易': 0.0019,
'常': 0.0019,
'武': 0.0018,
'乔': 0.0018,
'贺': 0.0017,
'赖': 0.0017,
'龚': 0.0017,
'文': 0.0016,
'庞': 0.0016,
'樊': 0.0016,
'兰': 0.0015,
'殷': 0.0015,
'施': 0.0015,
'陶': 0.0014,
'洪': 0.0014,
'翟': 0.0014,
'安': 0.0013,
'颜': 0.0013,
'倪': 0.0013,
'严': 0.0012,
'牛': 0.0012,
'温': 0.0012,
'芦': 0.0012,
'季': 0.0011,
'俞': 0.0011,
# NOTE(review): duplicate key; replaces the 0.0021 given for 章 above.
'章': 0.0011,
'鲁': 0.0010,
'葛': 0.0010,
'伍': 0.0010,
'韦': 0.0010,
'申': 0.0009,
'尤': 0.0009,
'毕': 0.0009,
'聂': 0.0008,
'丛': 0.0008,
'焦': 0.0008,
'向': 0.0008,
'柳': 0.0007,
'邢': 0.0007,
'路': 0.0007,
'岳': 0.0007,
'齐': 0.0006,
'沿': 0.0006,
'梅': 0.0006,
'莫': 0.0006,
'庄': 0.0005,
'辛': 0.0005,
'管': 0.0005,
'祝': 0.0005,
'左': 0.0004,
'涂': 0.0004,
'谷': 0.0004,
'祁': 0.0004,
'时': 0.0003,
'舒': 0.0003,
'耿': 0.0003,
'牟': 0.0003,
'卜': 0.0002,
# NOTE(review): duplicate key; replaces the 0.0007 given for 路 above.
'路': 0.0002,
'詹': 0.0002,
'关': 0.0002,
'苗': 0.0002,
'凌': 0.0001,
'费': 0.0001,
'纪': 0.0001,
'靳': 0.0001,
# NOTE(review): duplicate key; replaces the 0.0031 given for 盛 above.
'盛': 0.0001,
'童': 0.0001,
'欧': 0.0001,
'甄': 0.0001,
'项': 0.0001,
'曲': 0.0001,
'成': 0.0001,
'游': 0.0001,
'阳': 0.0001,
'裴': 0.0001,
'席': 0.0001,
'卫': 0.0001,
'查': 0.0001,
'屈': 0.0001,
'鲍': 0.0001,
'位': 0.0001,
'覃': 0.0001,
'霍': 0.0001,
'翁': 0.0001,
'隋': 0.0001,
'植': 0.0001,
'甘': 0.0001,
'景': 0.0001,
'薄': 0.0001,
'单': 0.0001,
'包': 0.0001,
'司': 0.0001,
'柏': 0.0001,
'宁': 0.0001,
'柯': 0.0001,
'阮': 0.0001,
'桂': 0.0001,
'闵': 0.0001,
# NOTE(review): duplicate keys (same value as the earlier 欧 / 阳 entries).
'欧': 0.0001,
'阳': 0.0001,
'解': 0.0001,
'强': 0.0001,
'柴': 0.0001,
'华': 0.0001,
'车': 0.0001,
'冉': 0.0001,
'房': 0.0001,
}
208 | 


--------------------------------------------------------------------------------
/vendor/convertbdf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import sys
 5 | import pickle
 6 | import bdflib
 7 | 
 8 | def packrow(iterable):
 9 |     v = 0
10 |     for bit in iterable:
11 |         v = (v<<1) | bit
12 |     return v
13 | 
def loadfrombdf(filename):
    """Load a BDF font and return its glyphs as a tuple indexed by code point.

    Each present glyph is stored as ``(row_width, row0, row1, ...)`` where every
    row is the bit-packed integer produced by :func:`packrow`.  Code points
    without a glyph hold ``None``.  An empty font yields an empty tuple.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename, 'r') as srcfile:
        fontd = bdflib.read_bdf(srcfile)
    glyphs = {}
    for codepoint, glyph in fontd.glyphs_by_codepoint.items():
        # NOTE(review): assumes bitmap() returns a non-empty sequence of
        # equal-length rows of 0/1 values -- confirm against the bdflib in use.
        rows = glyph.bitmap()
        glyphs[codepoint] = (len(rows[0]),) + tuple(packrow(row) for row in rows)
    if not glyphs:
        # max() of an empty mapping would raise ValueError
        return ()
    return tuple(glyphs.get(cp) for cp in range(max(glyphs) + 1))
24 | 
# Usage: convertbdf.py FONT.bdf OUTPUT.pkl
glyphs = loadfrombdf(sys.argv[1])
# Close the output explicitly so the pickle is fully flushed to disk.
with open(sys.argv[2], 'wb') as outfile:
    pickle.dump(glyphs, outfile)
27 | 


--------------------------------------------------------------------------------
/vendor/figchar.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import pickle
 5 | import collections
 6 | 
 7 | 
 8 | class TextBlock:
 9 | 
10 |     def __init__(self, block='', blank=' '):
11 |         self.lines = []
12 |         self.blank = blank
13 |         for l in block.splitlines():
14 |             self.lines.append(l)
15 |         if not self.lines:
16 |             self.lines.append('')
17 |         self.width = max(map(len, self.lines))
18 |         self.height = len(self.lines)
19 |         self.lines = collections.deque(l.ljust(self.width) for l in self.lines)
20 | 
21 |     def hcat(self, other, justify=0):
22 |         delta = other.height - self.height
23 |         start = 0
24 |         if delta > 0:
25 |             if justify > 0:
26 |                 self.lines.extendleft(
27 |                     self.blank * self.width for i in range(delta))
28 |             elif justify < 0:
29 |                 self.lines.extend(
30 |                     self.blank * self.width for i in range(delta))
31 |             else:
32 |                 top = delta // 2
33 |                 self.lines.extendleft(
34 |                     self.blank * self.width for i in range(top))
35 |                 self.lines.extend(
36 |                     self.blank * self.width for i in range(delta - top))
37 |             self.height = other.height
38 |         elif delta < 0:
39 |             if justify > 0:
40 |                 start = -delta
41 |             elif justify == 0:
42 |                 start = -delta // 2
43 |         for ln in range(start):
44 |             self.lines[ln] += self.blank * other.width
45 |         for ln in range(other.height):
46 |             self.lines[ln + start] += other.lines[ln]
47 |         for ln in range(start + other.height, self.height):
48 |             self.lines[ln] += self.blank * other.width
49 |         self.width += other.width
50 | 
51 |     def __str__(self):
52 |         return '\n'.join(self.lines)
53 | 
54 | 
class BlockGenerator:
    """Render text as block art using a pickled bitmap font.

    The font is a sequence indexed by code point; each entry is either ``None``
    or ``(width, row0, row1, ...)`` with rows stored as bit-packed integers.
    ``fillchar`` supplies two characters: index 0 for a cleared pixel and
    index 1 for a set pixel.
    """

    def __init__(self, fontfile, fillchar=' █'):
        # SECURITY: pickle.load can execute arbitrary code; only load font
        # files that come from a trusted source.
        with open(fontfile, 'rb') as fontfp:
            self.font = pickle.load(fontfp)
        self.fillchar = fillchar

    def renderchar(self, c):
        """Return the multi-line rendering of character ``c``.

        Returns an empty string when the character has no usable glyph.
        """
        try:
            g = self.font[ord(c)]
            width = g[0]
            return '\n'.join(
                ''.join(self.fillchar[int(b)]
                        for b in bin(l)[2:].zfill(width))
                for l in g[1:])
        except (LookupError, TypeError, ValueError):
            # Missing code point, ``None`` entry, or malformed glyph data.
            return ''

    def render(self, s):
        """Render every line of ``s`` and return the result joined by newlines."""
        lines = []
        pad = self.fillchar[0]
        for ln in s.splitlines():
            blk = TextBlock(blank=pad)
            first = True
            for c in ln:
                if not first:
                    # one blank column between adjacent glyphs
                    blk.hcat(TextBlock(pad, blank=pad), 1)
                first = False
                blk.hcat(TextBlock(self.renderchar(c), blank=pad), 1)
            lines.append(str(blk))
        return '\n'.join(lines)
89 | 
if __name__ == '__main__':
    # Usage: figchar.py FONTFILE [FILLCHAR] < text
    import sys
    generator = BlockGenerator(*sys.argv[1:])
    source_text = sys.stdin.read()
    print(generator.render(source_text))
94 | 


--------------------------------------------------------------------------------
/vendor/learnctx.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import sys
 5 | import jieba
 6 | import pickle
 7 | import struct
 8 | import functools
 9 | import collections
10 | 
def loaddict(fn):
    """Return the sorted vocabulary read from dictionary file ``fn``.

    The first whitespace-separated field of every non-blank line is taken as a
    word.  A fixed set of Chinese punctuation marks is always included.
    """
    dic = set('、，。；？！：')
    # Read as UTF-8 explicitly, matching how stopwords.txt is opened in this
    # script, instead of depending on the locale's default encoding.
    with open(fn, encoding='utf-8') as f:
        for ln in f:
            fields = ln.split()
            if fields:
                dic.add(fields[0])
    return sorted(dic)
20 | 
# Sorted word list built from the dictionary file named by the first
# command-line argument; a word's position in this list is its numeric id.
wl = loaddict(sys.argv[1])
22 | 
@functools.lru_cache(maxsize=200)
def indexword(word):
    """Return the position of ``word`` in the global list ``wl``, or None if absent."""
    try:
        position = wl.index(word)
    except ValueError:
        position = None
    return position
29 | 
def packvals(values):
    """Pack a sequence of ints as big-endian unsigned 16-bit values."""
    return struct.pack('>' + 'H'*len(values), *values)


# Word ids of the stop words (None is included if a stop word is not in wl).
with open('stopwords.txt', 'r', encoding='utf-8') as stopfile:
    stopwords = frozenset(map(indexword, map(str.strip, stopfile)))

# wd maps a word id to the set of word ids seen on the same input line.
wd = collections.defaultdict(set)
for ln in sys.stdin:
    # Compare against None explicitly: a plain truthiness filter would also
    # discard the legitimate word id 0.
    ln = set(idx for idx in map(indexword, jieba.cut(ln.strip()))
             if idx is not None)
    for word in ln.difference(stopwords):
        wd[word] |= ln

# One packed id list per vocabulary entry, in vocabulary order.
with open('context.pkl', 'wb') as outfile:
    pickle.dump(
        tuple(packvals(sorted(wd.get(k, ()))) for k in range(len(wl))),
        outfile)
40 | 


--------------------------------------------------------------------------------
/vendor/logcutfilter.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
 3 | 
 4 | import sys, os
 5 | import re
 6 | import jieba
 7 | #from zhconv import convert_for_mw
 8 | from zhutil import *
 9 | 
# ASCII, CJK and full-width punctuation characters.
punctstr = (
    '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~¢£¥·ˇˉ―‖‘’“”•′‵、。々'
    '〈〉《》「」『』【】〔〕〖〗〝〞︰︱︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄'
    '﹏﹐﹒﹔﹕﹖﹗﹙﹚﹛﹜﹝﹞！（），．：；？［｛｜｝～､￠￡￥')


# Set of code points this script counts as CJK when deciding whether text is
# Chinese (see `notchinese` and the main loop).
# NOTE(review): `itertools` is not imported in this file; presumably it comes
# in through `from zhutil import *` -- confirm.
ucjk = frozenset(itertools.chain(
    range(0x1100, 0x11FF + 1),
    range(0x2E80, 0xA4CF + 1),
    range(0xA840, 0xA87F + 1),
    range(0xAC00, 0xD7AF + 1),
    range(0xF900, 0xFAFF + 1),
    range(0xFE30, 0xFE4F + 1),
    range(0xFF65, 0xFFDC + 1),
    range(0xFF01, 0xFF0F + 1),
    range(0xFF1A, 0xFF20 + 1),
    range(0xFF3B, 0xFF40 + 1),
    range(0xFF5B, 0xFF60 + 1),
    range(0x1F000, 0x2FFFF + 1)
))

# Matches a parenthesised span containing no CJK ideographs, or a
# double-quoted span.  Not referenced elsewhere in this file.
RE_BRACKET = re.compile(' ?[（(][^\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\U0001F000-\U0001F8AD\U00020000-\U0002A6D6)）]*[)）]|"[^"]+"')

# Opening/closing bracket characters laid out as consecutive pairs.
brackets = '()（）[]""‘’“”{}〈〉《》「」『』【】〔〕〖〗'

# Resolve `path` relative to the directory containing this script.
_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(),
                                                 os.path.dirname(__file__), path))

# Second tokenizer backed by its own dictionary file and cache file.
jiebazhc = jieba.Tokenizer(_get_module_path('zhcdict.txt'))
jiebazhc.cache_file = "jiebazhc.cache"

#RE_BRACKETS = re.compile(' ?\((.*?)\)| ?\((.*?)\)')
# One alternative per bracket pair: optional leading space, the opener, a
# non-greedy body, then the matching closer.
RE_BRACKETS = re.compile('|'.join(' ?%s.*?%s' % (re.escape(brackets[i]), re.escape(brackets[i+1])) for i in range(0, len(brackets), 2)))

# Characters that, when they end an input line, make the main loop join that
# line with the following one instead of emitting it.
tailp = frozenset("""([{£¥`〈《「『【〔〖（［｛￡￥〝︵︷︹︻︽︿﹁﹃﹙﹛﹝（｛"'“‘""")
# Remove ASCII and ideographic spaces.
stripblank = lambda s: s.replace(' ', '').replace('\u3000', '')

# Select the tokenizer from the first command-line argument:
#   'noop'      -> no segmentation; ideographic spaces become ASCII spaces
#   anything    -> the jiebazhc tokenizer
#   no argument -> the default jieba tokenizer
if len(sys.argv) > 1:
	if sys.argv[1] == 'noop':
		cut = lambda s: (s,)
		stripblank = lambda s: s.replace('\u3000', ' ')
	else:
		cut = lambda s: jiebazhc.cut(s, HMM=False)
else:
	cut = lambda s: jieba.cut(s, HMM=False)

# True for empty text or text where more than half the characters are outside ucjk.
notchinese = lambda l: not l or sum((ord(i) not in ucjk) for i in l) > .5 * len(l)
# Replacement callback for RE_BRACKETS: delete the match when the text between
# its first and last character is not Chinese, otherwise keep it unchanged.
# NOTE(review): a match may start with the optional space, in which case
# [1:-1] drops the space rather than the opening bracket -- confirm intent.
brcksub = lambda matchobj: '' if notchinese(matchobj.group(0)[1:-1]) else matchobj.group(0)
58 | 
def cutandsplit(s):
	"""Yield one space-joined token string per Chinese sentence found in ``s``.

	Sentences come from ``filterlist(splitsentence(...))`` after blanks are
	stripped.  Non-Chinese bracketed spans are removed, corner quotes become
	curly quotes, and sentences that are not mostly Chinese are skipped.
	"""
	quote_pairs = (('「', '“'), ('」', '”'), ('『', '‘'), ('』', '’'))
	for sentence in filterlist(splitsentence(stripblank(s))):
		cleaned = RE_BRACKETS.sub(brcksub, sentence.strip())
		if not notchinese(cleaned):
			for corner, curly in quote_pairs:
				cleaned = cleaned.replace(corner, curly)
			trimmed = cleaned.lstrip(tailpunct).rstrip(headpunct)
			yield ' '.join(cut(trimmed))
65 | 
def cutfilter(s):
	"""Remove ASCII spaces from ``s``, tokenize it and join the stripped tokens."""
	return ' '.join(i.strip() for i in cut(s.replace(' ', '')))


def _emit(text):
	"""Write the processed sentences of ``text`` to stdout, one per line."""
	sys.stdout.write('\n'.join(cutandsplit(text)) + '\n')


# Text carried over from lines that ended with an opening bracket or quote.
lastline = ''

for ln in sys.stdin:
	l = ln.strip(' \t\n\r\x0b\x0c\u3000=[]')
	has_cjk = any(ord(i) in ucjk for i in l)
	has_ctrl = any(ord(i) < 32 for i in l)
	# Skip lines that are empty, have no CJK at all, or contain control characters.
	if not has_cjk or has_ctrl:
		continue
	if l[-1] in tailp:
		# The line ends mid-bracket/quote: hold it and join with the next one.
		lastline += l
		continue
	_emit(lastline + l)
	lastline = ''

if lastline:
	_emit(lastline)
86 | 


--------------------------------------------------------------------------------
/vendor/mbox.conf:
--------------------------------------------------------------------------------
 1 | [fs]
 2 |     hide: /
 3 |     hide: /home
 4 |     hide: /tmp
 5 |     hide: /dev
 6 |     hide: /proc
 7 |     hide: /sys
 8 |     hide: /run
 9 |     hide: /mnt
10 |     hide: /media
11 |     allow: .
12 |     allow: /lib
13 |     allow: /usr/bin
14 |     allow: /usr/lib
15 |     allow: /usr/share
16 |     allow: /usr/local/lib
17 | [network]
18 |     block: 0.0.0.0
19 | 


--------------------------------------------------------------------------------
/vendor/mosesproxy.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | import os
  4 | import sys
  5 | import json
  6 | import socket
  7 | 
  8 | address = ('172.20.1.3', 13332)
  9 | 
 10 | dumpsjson = lambda x: json.dumps(x).encode('utf-8')
 11 | loadsjson = lambda x: json.loads(x.decode('utf-8'))
 12 | 
 13 | def recvall(sock, buf=1024):
 14 |     data = sock.recv(buf)
 15 |     alldata = [data]
 16 |     while data and data[-1] != 10:
 17 |         data = sock.recv(buf)
 18 |         alldata.append(data)
 19 |     return b''.join(alldata)[:-1]
 20 | 
 21 | 
 22 | def sendall(sock, data):
 23 |     sock.sendall(data + b'\n')
 24 | 
 25 | 
 26 | def receive(data, autorestart=None):
 27 |     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 28 |     try:
 29 |         sock.connect(address)
 30 |         sendall(sock, data)
 31 |     except (ConnectionRefusedError, BrokenPipeError) as ex:
 32 |         raise ex
 33 |     received = recvall(sock)
 34 |     sock.close()
 35 |     return received
 36 | 
 37 | 
 38 | def translate(text, mode, withcount=False, withinput=True, align=True):
 39 |     return loadsjson(receive(dumpsjson((mode, text, withcount, withinput, align))))
 40 | 
 41 | 
 42 | def rawtranslate(text, mode, withcount=False):
 43 |     return loadsjson(receive(dumpsjson((mode + '.raw', text))))
 44 | 
 45 | 
 46 | def modelname():
 47 |     return loadsjson(receive(dumpsjson(('modelname',))))
 48 | 
 49 | 
 50 | def cut(*args, **kwargs):
 51 |     return loadsjson(receive(dumpsjson(('cut', args, kwargs))))
 52 | 
 53 | 
 54 | def cut_for_search(*args, **kwargs):
 55 |     return loadsjson(receive(dumpsjson(('cut_for_search', args, kwargs))))
 56 | 
 57 | 
 58 | def tokenize(*args, **kwargs):
 59 |     return loadsjson(receive(dumpsjson(('tokenize', args, kwargs))))
 60 | 
 61 | 
 62 | class jiebazhc:
 63 | 
 64 |     @staticmethod
 65 |     def cut(*args, **kwargs):
 66 |         return loadsjson(receive(dumpsjson(('jiebazhc.cut', args, kwargs))))
 67 | 
 68 |     @staticmethod
 69 |     def cut_for_search(*args, **kwargs):
 70 |         return loadsjson(receive(dumpsjson(('jiebazhc.cut_for_search', args, kwargs))))
 71 | 
 72 |     @staticmethod
 73 |     def tokenize(*args, **kwargs):
 74 |         return loadsjson(receive(dumpsjson(('jiebazhc.tokenize', args, kwargs))))
 75 | 
 76 | 
 77 | def add_word(*args, **kwargs):
 78 |     receive(dumpsjson(('add_word', args, kwargs)))
 79 | 
 80 | 
 81 | def load_userdict(*args):
 82 |     receive(dumpsjson(('load_userdict', args)))
 83 | 
 84 | 
 85 | def set_dictionary(*args):
 86 |     receive(dumpsjson(('set_dictionary', args)))
 87 | 
 88 | 
 89 | def stopserver():
 90 |     receive(dumpsjson(('stopserver',)), False)
 91 | 
 92 | 
 93 | def ping(autorestart=False):
 94 |     try:
 95 |         result = receive(dumpsjson(('ping',)), autorestart)
 96 |         return result == b'pong'
 97 |     except Exception:
 98 |         return False
 99 | 
if __name__ == '__main__':
    # Command-line front end.  With no argument or with 'ping' the exit status
    # reports whether the server is reachable.  Unknown commands do nothing.
    command = sys.argv[1] if len(sys.argv) > 1 else None
    if command == 'stop':
        # only ask a live server to stop
        if ping():
            stopserver()
    elif command is None or command in (
            'ping', 'c2m', 'm2c', 'c2m.raw', 'm2c.raw', 'modelname'):
        if not ping():
            sys.exit(1)
        if command in ('c2m', 'm2c'):
            sys.stdout.write(
                translate(sys.stdin.read(), command, 0, 0, 0) + '\n')
        elif command in ('c2m.raw', 'm2c.raw'):
            # the raw modes pass the mode string through translate() unchanged
            sys.stdout.write(translate(sys.stdin.read(), command) + '\n')
        elif command == 'modelname':
            sys.stdout.write((modelname() or '') + '\n')
131 | 


--------------------------------------------------------------------------------
/vendor/pangu.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | # Copyright (c) 2013 Vinta
 4 | # 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | # this software and associated documentation files (the "Software"), to deal in
 7 | # the Software without restriction, including without limitation the rights to
 8 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | # the Software, and to permit persons to whom the Software is furnished to do so,
10 | # subject to the following conditions:
11 | # 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | # 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 
22 | from __future__ import unicode_literals
23 | import re
24 | import sys
25 | 
26 | 
_py_version = sys.version_info
is_py2 = (_py_version[0] == 2)

__version__ = '2.5.6.3'
__all__ = ['spacing', 'text_spacing']

# Building blocks shared by the patterns below.  Concatenating them yields
# exactly the same pattern text as spelling each regex out in full.
_CJK = r'([\u2e80-\u2eff\u2f00-\u2fdf\u3040-\u309f\u30a0-\u30ff\u3100-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff])'
_ALNUM = r'([A-Za-z0-9])'
_OPERATOR = r'([\+\-\*\/=&\\|<>])'

CJK_QUOTE_RE = re.compile(_CJK + r'(["\'])')
QUOTE_CJK_RE = re.compile(r'(["\'])([\u3040-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff])')
FIX_QUOTE_RE = re.compile(r'(["\'\(\[\{<\u201c]+)(\s*)(.+?)(\s*)(["\'\)\]\}>\u201d]+)')
FIX_SINGLE_QUOTE_RE = re.compile(_CJK + r'( )(\')([A-Za-z])')

CJK_HASH_RE = re.compile(_CJK + r'(#(\S+))')
HASH_CJK_RE = re.compile(r'((\S+)#)' + _CJK)

CJK_OPERATOR_ANS_RE = re.compile(_CJK + _OPERATOR + _ALNUM)
ANS_OPERATOR_CJK_RE = re.compile(_ALNUM + _OPERATOR + _CJK)

CJK_BRACKET_CJK_RE = re.compile(_CJK + r'([\(\[\{<\u201c]+(.*?)[\)\]\}>\u201d]+)' + _CJK)
CJK_BRACKET_RE = re.compile(_CJK + r'([\(\[\{<\u201c>])')
BRACKET_CJK_RE = re.compile(r'([\)\]\}>\u201d<])' + _CJK)
FIX_BRACKET_RE = re.compile(r'([\(\[\{<\u201c]+)(\s*)(.+?)(\s*)([\)\]\}>\u201d]+)')

FIX_SYMBOL_RE = re.compile(_CJK + r'([~!;:,\.\?\u2026])' + _ALNUM)

CJK_ANS_RE = re.compile(_CJK + r'([A-Za-z0-9`\$%\^&\*\-=\+\\\|/@\u00a1-\u00ff\u2022\u2027\u2150-\u218f])')
ANS_CJK_RE = re.compile(r'([A-Za-z0-9`~\$%\^&\*\-=\+\\\|/!;:,\.\?\u00a1-\u00ff\u2022\u2026\u2027\u2150-\u218f])' + _CJK)

# (pattern, replacement) pairs applied in order before the bracket handling.
_RULES_BEFORE_BRACKETS = (
    (CJK_QUOTE_RE, r'\1 \2'),
    (QUOTE_CJK_RE, r'\1 \2'),
    (FIX_QUOTE_RE, r'\1\3\5'),
    (FIX_SINGLE_QUOTE_RE, r'\1\3\4'),
    (CJK_HASH_RE, r'\1 \2'),
    (HASH_CJK_RE, r'\1 \3'),
    (CJK_OPERATOR_ANS_RE, r'\1 \2 \3'),
    (ANS_OPERATOR_CJK_RE, r'\1 \2 \3'),
)

# (pattern, replacement) pairs applied in order after the bracket handling.
_RULES_AFTER_BRACKETS = (
    (FIX_SYMBOL_RE, r'\1\2 \3'),
    (CJK_ANS_RE, r'\1 \2'),
    (ANS_CJK_RE, r'\1 \2'),
)


def text_spacing(text):
    """
    Insert spaces between CJK text and adjacent Latin letters, digits and
    symbols. Always return Unicode.
    """
    if is_py2 and isinstance(text, str):
        text = text.decode('utf-8')

    # a single character (or nothing) has no boundary to space out
    if len(text) < 2:
        return text

    for pattern, replacement in _RULES_BEFORE_BRACKETS:
        text = pattern.sub(replacement, text)

    # Prefer spacing a bracket group enclosed by CJK on both sides; only when
    # that changes nothing fall back to spacing each bracket side separately.
    spaced = CJK_BRACKET_CJK_RE.sub(r'\1 \2 \4', text)
    if spaced == text:
        spaced = CJK_BRACKET_RE.sub(r'\1 \2', spaced)
        spaced = BRACKET_CJK_RE.sub(r'\1 \2', spaced)
    text = FIX_BRACKET_RE.sub(r'\1\3\5', spaced)

    for pattern, replacement in _RULES_AFTER_BRACKETS:
        text = pattern.sub(replacement, text)

    return text


spacing = text_spacing
94 | 


--------------------------------------------------------------------------------
/vendor/repl.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os
 5 | import sys
 6 | 
 7 | import re
 8 | import math
 9 | 
# Send error output to stdout so callers capture everything on one stream.
sys.stderr = sys.stdout
# Read the whole program text from standard input.
with sys.stdin as r:
    prog = r.read()

# SECURITY: eval/exec run whatever arrives on stdin with this process's full
# privileges; nothing in this script restricts what the code can do.
try:
    # First try the input as a single expression and print a non-None result.
    ret = eval(prog)
    if ret is not None:
        print(ret)
except SyntaxError:
    # Not a valid expression: run it as statements instead.
    # NOTE(review): a SyntaxError raised while evaluating a valid expression
    # also lands here and causes the input to be executed again -- confirm
    # this is acceptable.
    exec(prog)
20 | 


--------------------------------------------------------------------------------
/vendor/say.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import re
  5 | import sys
  6 | import kenlm
  7 | import pangu
  8 | import pickle
  9 | import struct
 10 | import random
 11 | import itertools
 12 | import functools
 13 | import collections
 14 | 
# OS-backed random source used by weighted_choice_king.
srandom = random.SystemRandom()

# Matches a run of one or more CJK ideographs in the listed ranges.
RE_UCJK = re.compile(
    '([\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff'
    '\U0001F000-\U0001F8AD\U00020000-\U0002A6D6]+)')

# Matches a single ASCII letter, digit or underscore.
RE_EN = re.compile('[a-zA-Z0-9_]')

# ASCII, CJK and full-width punctuation characters.
punct = frozenset(
    '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~¢£¥·ˇˉ―‖‘’“”•′‵、。々'
    '〈〉《》「」『』【】〔〕〖〗〝〞︰︱︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄'
    '﹏﹐﹒﹔﹕﹖﹗﹙﹚﹛﹜﹝﹞！（），．：；？［｛｜｝～､￠￡￥')

# Unpack a bytes object into a tuple of big-endian unsigned 16-bit integers.
unpackvals = lambda b: struct.unpack('>' + 'H' * (len(b) // 2), b)
# Return (index, weight) of the largest weight.
sel_best = lambda weights: max(enumerate(weights), key=lambda x: x[1])
 30 | 
 31 | 
 32 | class LRUCache:
 33 | 
 34 |     def __init__(self, maxlen):
 35 |         self.capacity = maxlen
 36 |         self.cache = collections.OrderedDict()
 37 | 
 38 |     def __getitem__(self, key):
 39 |         value = self.cache.pop(key)
 40 |         self.cache[key] = value
 41 |         return value
 42 | 
 43 |     def get(self, key, default=None):
 44 |         try:
 45 |             value = self.cache.pop(key)
 46 |             self.cache[key] = value
 47 |             return value
 48 |         except KeyError:
 49 |             return default
 50 | 
 51 |     def __setitem__(self, key, value):
 52 |         try:
 53 |             self.cache.pop(key)
 54 |         except KeyError:
 55 |             if len(self.cache) >= self.capacity:
 56 |                 self.cache.popitem(last=False)
 57 |         self.cache[key] = value
 58 | 
 59 |     def __contains__(self, item):
 60 |         return item in self.cache
 61 | 
 62 | 
 63 | def weighted_choice_king(weights):
 64 |     total = 0
 65 |     winner = 0
 66 |     winweight = 0
 67 |     for i, w in enumerate(weights):
 68 |         total += w
 69 |         if srandom.random() * total < w:
 70 |             winner = i
 71 |             winweight = w
 72 |     return winner, winweight
 73 | 
 74 | 
 75 | def _get_indexword(model):
 76 |     @functools.lru_cache(maxsize=50)
 77 |     def indexword(word):
 78 |         try:
 79 |             return model.voc.index(word)
 80 |         except ValueError:
 81 |             return None
 82 |     return indexword
 83 | 
 84 | 
 85 | def joinword(words):
 86 |     last = False
 87 |     for w in words:
 88 |         if last and RE_EN.match(w[0]):
 89 |             yield ' '
 90 |         yield w
 91 |         if RE_EN.match(w[-1]):
 92 |             last = True
 93 | 
 94 | 
 95 | class SimpleModel:
 96 | 
 97 |     def __init__(self, lm, dictfile, ctxmodel=None, dictinit=''):
 98 |         self.lm = kenlm.LanguageModel(lm)
 99 |         self.voc = []
100 |         self._vocid = LRUCache(64)
101 |         self.ctx = pickle.load(open(ctxmodel, 'rb')) if ctxmodel else {}
102 |         self.stopfn = lambda s: len(s) > 40 or len(s) > 3 and all(i == s[-1] for i in s[-3:])
103 |         self.loaddict(dictfile, dictinit, True)
104 | 
105 |     def add_word(self, word):
106 |         if word not in self.dic:
107 |             self.dic.append(word)
108 | 
109 |     def loaddict(self, fn, init='', withsp=False):
110 |         dic = set(init)
111 |         with open(fn) as f:
112 |             for ln in f:
113 |                 ln = ln.strip()
114 |                 if not ln:
115 |                     continue
116 |                 dic.add(ln if withsp else ln.split()[0])
117 |         self.voc = sorted(dic)
118 | 
119 |     def indexword(self, word):
120 |         if word not in self._vocid:
121 |             try:
122 |                 self._vocid[word] = self.voc.index(word)
123 |             except ValueError:
124 |                 self._vocid[word] = None
125 |         return self._vocid[word]
126 | 
127 |     def say(self, context=(), continuewords=()):
128 |         context = context or continuewords
129 |         ctxvoc = list(frozenset(self.voc).intersection(map(self.voc.__getitem__, frozenset(itertools.chain.from_iterable(map(unpackvals, map(self.ctx.__getitem__, filter(None, map(self.indexword, frozenset(context)))))))))) or self.voc if context else self.voc
130 |         out = []
131 |         stack = list(continuewords)
132 |         if stack:
133 |             history = ' '.join(stack) + ' '
134 |             idx, w = weighted_choice_king(
135 |                 10**self.lm.score(history + c, 1, 0) for c in ctxvoc)
136 |         else:
137 |             idx, w = weighted_choice_king(
138 |                 10**self.lm.score(c, 1, 0) for c in ctxvoc)
139 |         out.append(ctxvoc[idx])
140 |         stack.append(ctxvoc[idx])
141 |         while 1:
142 |             bos = (len(stack) <= self.lm.order + 2)
143 |             history = ' '.join(stack[-self.lm.order - 2:]) + ' '
144 |             idx, w = weighted_choice_king(
145 |                 10**self.lm.score(history + ctxvoc[k // 2], bos, k % 2) for k in range(len(ctxvoc) * 2))
146 |             c = ctxvoc[idx // 2]
147 |             out.append(c)
148 |             stack.append(c)
149 |             if idx % 2 or self.stopfn(out):
150 |                 break
151 |         return pangu.spacing(''.join(joinword(out)))
152 | 
153 | 
class POSModel:
    """Sentence generator that first samples a tag sequence, then fills in words.

    A tag sequence is sampled from ``poslm``; each tag is then replaced by a
    word sampled from ``lm`` among the dictionary words carrying that tag.
    """

    # Tags and punctuation marks the tag-level model samples from.
    allpos = (
        'a', 'ad', 'ag', 'an', 'b', 'c', 'd', 'df', 'dg', 'e', 'f', 'g', 'h', 'i',
        'j', 'k', 'l', 'm', 'mg', 'mq', 'n', 'ng', 'nr', 'ns', 'nt', 'nz', 'o',
        'p', 'q', 'r', 'rg', 'rr', 'rz', 's', 't', 'tg', 'u', 'ud', 'ug', 'uj',
        'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vi', 'vn', 'vq', 'x', 'y', 'z', 'zg',
        '“', '”', '、', '。', '！', '，', '．', '：', '；', '？'
    )

    def __init__(self, lm, poslm, dictfile):
        self.lm = kenlm.LanguageModel(lm)
        self.poslm = kenlm.LanguageModel(poslm)
        # tag (at most two characters) -> list of words carrying that tag
        self.posvoc = {}
        # tags that terminate a generated sequence
        self.end = frozenset('。！？”')
        self.loaddict(dictfile)

    def loaddict(self, fn):
        """Fill ``posvoc`` from a dictionary file of ``word freq tag`` lines.

        Lines that do not have exactly three whitespace-separated fields are
        skipped, as are words that do not start with a CJK ideograph.  Tags
        are truncated to two characters.
        """
        with open(fn, encoding='utf-8') as dictfile:
            for ln in dictfile:
                fields = ln.split()
                if len(fields) != 3:
                    continue
                word, _freq, pos = fields
                if RE_UCJK.match(word):
                    self.posvoc.setdefault(pos[:2], []).append(word)

    def generate_pos(self):
        """Yield a randomly sampled sequence of tags from ``allpos``.

        The sequence ends after an end-of-sentence pick or a tag in ``self.end``.
        """
        out = []
        idx, w = weighted_choice_king(
            10**self.poslm.score(c, 1, 0) for c in self.allpos)
        out.append(self.allpos[idx])
        yield self.allpos[idx]
        while 1:
            bos = (len(out) <= self.poslm.order + 2)
            history = ' '.join(out[-self.poslm.order - 2:]) + ' '
            # Each tag is scored twice: even k without the end-of-sentence
            # flag, odd k with it.
            idx, w = weighted_choice_king(
                10**self.poslm.score(history + self.allpos[k // 2], bos, k % 2) for k in range(len(self.allpos) * 2))
            c = self.allpos[idx // 2]
            out.append(c)
            yield c
            if idx % 2 or c in self.end:
                break

    def say(self):
        """Generate one sentence and return it as a spaced string."""
        orderlm = self.lm.order
        out = []
        for pos in self.generate_pos():
            if pos in punct or pos not in self.posvoc:
                # punctuation, or a tag with no known words, is emitted as is
                out.append(pos)
                continue
            bos = (len(out) <= orderlm + 2)
            history = ' '.join(out[-orderlm - 2:]) + ' '
            availvoc = self.posvoc[pos]
            idx, w = weighted_choice_king(
                10**self.lm.score(history + c, bos, 0) for c in availvoc)
            out.append(availvoc[idx])
        return pangu.spacing(''.join(joinword(out)))
222 | 
223 | 
if __name__ == '__main__':
    # Usage: say.py LM DICTFILE [CTXMODEL [DICTINIT]]
    # Each stdin line produces one generated sentence on stdout.
    model = SimpleModel(*sys.argv[1:])
    for raw in sys.stdin:
        stripped = raw.strip()
        # The first character is a mode flag (not used at present); the rest
        # of the line is a whitespace-separated list of context words.
        mode = stripped[:1]
        words = stripped[1:].split() if stripped else ()
        print(model.say(words))
        sys.stdout.flush()

    # Alternative driver kept for reference:
    #model = POSModel(*sys.argv[1:])
    #while 1:
        #print(model.say())
        #sys.stdout.flush()
240 | 


--------------------------------------------------------------------------------
/vendor/seccomp.py:
--------------------------------------------------------------------------------
  1 | # The MIT License (MIT)
  2 | # 
  3 | # Copyright (c) 2015 David Wison
  4 | # 
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | # 
 12 | # The above copyright notice and this permission notice shall be included in
 13 | # all copies or substantial portions of the Software.
 14 | # 
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 21 | # THE SOFTWARE.
 22 | 
 23 | # pip install python-prctl cffi
 24 | 
 25 | from __future__ import division
 26 | 
 27 | import os
 28 | import sys
 29 | import signal
 30 | import socket
 31 | import struct
 32 | import marshal
 33 | import resource
 34 | 
 35 | import cffi
 36 | import prctl
 37 | 
 38 | import re
 39 | import math
 40 | import cmath
 41 | import itertools
 42 | 
# Force the process-wide default string encoding to UTF-8.
# NOTE(review): `reload` as a bare builtin and `sys.setdefaultencoding` look
# like Python 2-only idioms -- confirm the interpreter this script targets.
reload(sys)
sys.setdefaultencoding("utf-8")

# Bind the C-level _exit() through cffi so it can be called directly (see the
# _exit() wrapper defined right after this setup).
_ffi = cffi.FFI()
_ffi.cdef('void _exit(int);')
# NOTE(review): dlopen(None) presumably resolves symbols from the already
# loaded C library -- confirm against the cffi documentation.
_libc = _ffi.dlopen(None)
 49 | 
 50 | def _exit(n=1):
 51 |     """Invoke _exit(2) system call."""
 52 |     _libc._exit(n)
 53 | 
 54 | def read_exact(fp, n):
 55 |     buf = ''
 56 |     while len(buf) < n:
 57 |         buf2 = os.read(fp.fileno(), n)
 58 |         if not buf2:
 59 |             _exit(233)
 60 |         buf += buf2
 61 |     return buf2
 62 | 
 63 | def write_exact(fp, s):
 64 |     done = 0
 65 |     while done < len(s):
 66 |         written = os.write(fp.fileno(), s[done:])
 67 |         if not written:
 68 |             _exit(233)
 69 |         done += written
 70 | 
 71 | class SecureEvalHost(object):
 72 |     def __init__(self):
 73 |         self.host, self.child = socket.socketpair()
 74 |         self.pid = None
 75 |         self.child_globals = {"__builtins__": __builtins__}
 76 | 
 77 |     def start_child(self):
 78 |         assert not self.pid
 79 |         self.pid = os.fork()
 80 |         if not self.pid:
 81 |             self._child_main()
 82 |         self.child.close()
 83 | 
 84 |     def kill_child(self):
 85 |         assert self.pid
 86 |         pid, status = os.waitpid(self.pid, os.WNOHANG)
 87 |         os.kill(self.pid, signal.SIGKILL)
 88 | 
 89 |     def do_eval(self, msg):
 90 |         try:
 91 |             return {'result': str(eval(msg['body'], self.child_globals, {}))}
 92 |         except Exception as ex:
 93 |             return {'result': repr(ex)}
 94 | 
 95 |     def _child_main(self):
 96 |         self.host.close()
 97 |         for fd in map(int, os.listdir('/proc/self/fd')):
 98 |             if fd != self.child.fileno():
 99 |                 try:
100 |                     os.close(fd)
101 |                 except OSError:
102 |                     pass
103 | 
104 |         resource.setrlimit(resource.RLIMIT_CPU, (1, 1))
105 |         prctl.set_seccomp(True)
106 |         while True:
107 |             sz, = struct.unpack('>L', read_exact(self.child, 4))
108 |             doc = marshal.loads(read_exact(self.child, sz))
109 |             if doc['cmd'] == 'eval':
110 |                 resp = self.do_eval(doc)
111 |             elif doc['cmd'] == 'exit':
112 |                 _exit(0)
113 |             goobs = marshal.dumps(resp)
114 |             write_exact(self.child, struct.pack('>L', len(goobs)))
115 |             write_exact(self.child, goobs)
116 | 
117 |     def eval(self, s):
118 |         msg = marshal.dumps({'cmd': 'eval', 'body': s})
119 |         write_exact(self.host, struct.pack('>L', len(msg)))
120 |         write_exact(self.host, msg)
121 |         sz, = struct.unpack('>L', read_exact(self.host, 4))
122 |         goobs = marshal.loads(read_exact(self.host, sz))
123 |         return goobs['result']
124 | 
125 | 
def go():
    """Evaluate the expression read from stdin in a sandboxed child.

    The child gets the re, math, cmath and itertools modules in its globals.
    The result text is written to stdout followed by a newline, and the child
    is killed afterwards whether or not the evaluation succeeded.
    """
    sandbox = SecureEvalHost()
    sandbox.child_globals.update(
        re=re, math=math, cmath=cmath, itertools=itertools)
    sandbox.start_child()
    try:
        answer = sandbox.eval(sys.stdin.read())
        sys.stdout.write(answer + '\n')
    finally:
        sandbox.kill_child()
134 | 
# Script entry point: evaluate the expression supplied on stdin.
if __name__ == '__main__':
    go()
137 | 


--------------------------------------------------------------------------------
/vendor/simpcalc.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import re
  5 | import math
  6 | import cmath
  7 | import random
  8 | import operator
  9 | import collections
 10 | 
 11 | 
 12 | class CalculatorError(Exception):
 13 |     pass
 14 | 
 15 | 
 16 | class MathError(CalculatorError):
 17 |     '''The Math Error type.'''
 18 | 
 19 |     def __init__(self, pos=0, length=1):
 20 |         super().__init__(self)
 21 |         self.pos = pos
 22 |         self.length = length
 23 | 
 24 |     def __repr__(self):
 25 |         return 'MathError(%s)' % self.pos
 26 | 
 27 | 
 28 | class SyntaxError(CalculatorError):
 29 |     '''The Syntax Error type.'''
 30 | 
 31 |     def __init__(self, pos=0, length=1):
 32 |         super().__init__(self)
 33 |         self.pos = pos
 34 |         self.length = length
 35 | 
 36 |     def __repr__(self):
 37 |         return 'SyntaxError(%s)' % self.pos
 38 | 
 39 | 
 40 | class KbdBreak(CalculatorError):
 41 |     '''The Keyboard Break Error type.'''
 42 | 
 43 |     def __init__(self, pos=0, length=1):
 44 |         super().__init__(self)
 45 |         self.pos = pos
 46 |         self.length = length
 47 | 
 48 |     def __repr__(self):
 49 |         return 'KbdBreak(%s)' % self.pos
 50 | 
 51 | 
 52 | class Token:
 53 | 
 54 |     def __init__(self, name, pos, type, priority=0, argnum=0, value=None):
 55 |         self.name = name
 56 |         self.pos = pos
 57 |         self.type = type
 58 |         self.priority = priority
 59 |         self.argnum = argnum
 60 |         self.value = value
 61 | 
 62 |     def __repr__(self):
 63 |         return 'Token(%s)' % ', '.join(map(
 64 |             repr, (self.name, self.pos, self.type, self.priority, self.argnum, self.value)))
 65 | 
 66 | 
 67 | def adapt_cmath(funcname):
 68 |     def wrapped(x):
 69 |         if isinstance(x, complex):
 70 |             return getattr(cmath, funcname)(x)
 71 |         else:
 72 |             try:
 73 |                 return getattr(math, funcname)(x)
 74 |             except Exception:
 75 |                 # sqrt etc.
 76 |                 return getattr(cmath, funcname)(x)
 77 |     return wrapped
 78 | 
 79 | 
 80 | def gcd(*numbers):
 81 |     """Calculate the Greatest Common Divisor of the numbers."""
 82 |     if len(numbers) == 2:
 83 |         a, b = numbers
 84 |         while b:
 85 |             a, b = b, a % b
 86 |         return a
 87 |     elif len(numbers) < 2:
 88 |         raise TypeError(
 89 |             'gcd expected at least 2 arguments, got ' + str(len(numbers)))
 90 |     else:
 91 |         val = numbers[0]
 92 |         for i in numbers[1:]:
 93 |             while i:
 94 |                 val, i = i, val % i
 95 |         return val
 96 | 
 97 | 
 98 | def lcm(*numbers):
 99 |     """Calculate the Lowest Common Multiple of the numbers."""
100 |     if len(numbers) == 2:
101 |         return numbers[0] * numbers[1] // gcd(numbers[0], numbers[1])
102 |     elif len(numbers) < 2:
103 |         raise TypeError(
104 |             'lcm expected at least 2 arguments, got ' + str(len(numbers)))
105 |     else:
106 |         val = numbers[0]
107 |         for i in numbers[1:]:
108 |             val = val * i // gcd(val, i)
109 |         return val
110 | 
111 | 
def resplit(regex, string):
    """Split ``string`` on ``regex`` matches, keeping the matches themselves.

    Yields, in order, every stretch of text between matches (when non-empty)
    and every matched substring, followed by any text left after the last
    match.
    """
    cursor = 0
    for match in regex.finditer(string):
        begin, end = match.span()
        if begin != cursor:
            yield string[cursor:begin]
        yield match.group(0)
        cursor = end
    if cursor < len(string):
        yield string[cursor:]
121 | 
122 | 
class Calculator:
    """Infix expression calculator.

    An expression is processed in three stages: splitexpr() turns the text
    into Token objects, torpn() reorders them into Reverse Polish Notation,
    and evalrpn() evaluates that sequence.  eval() chains the three and
    stores the result in the answer variable; pretty() additionally formats
    the result or an error report as text.
    """

    # Operator/delimiter text -> (token type, priority, argument count).
    # A smaller priority number binds tighter (see torpn()).  'op_l' and
    # 'op_r' mark left- and right-associative operators.
    operators = collections.OrderedDict((
        (" ", ('ws', 1, 1)),
        ("\t", ('ws', 1, 1)),
        ("(", ('(', 1, 1)),
        (",", (',', 1, 2)),
        ("!", ('op_l', 2, 1)),
        ("^", ('op_r', 3, 2)),
        ("**", ('op_r', 3, 2)),
        # recognize on parsing
        # ("pos", ('op_r', 4, 1)),
        # ("neg", ('op_r', 4, 1)),
        ("*", ('op_l', 5, 2)),
        ("×", ('op_l', 5, 2)),
        ("/", ('op_l', 5, 2)),
        ("÷", ('op_l', 5, 2)),
        ("\\", ('op_l', 5, 2)),
        ("%", ('op_l', 5, 2)),
        ("+", ('op_l', 6, 2)),
        ("-", ('op_l', 6, 2)),
        (")", (')', 7, 1))
    ))

    # Named constants usable in expressions.
    const = {
        "i": 1j,
        "pi": math.pi,
        "π": math.pi,
        "e": math.e
    }

    # Operator/function name -> (callable, number of arguments popped from
    # the value stack when it is applied).
    functions = {
        "!": (math.factorial, 1),
        "^": (operator.pow, 2),
        "**": (operator.pow, 2),
        "*": (operator.mul, 2),
        "×": (operator.mul, 2),
        "/": (operator.truediv, 2),
        "÷": (operator.truediv, 2),
        "\\": (operator.floordiv, 2),
        "%": (operator.mod, 2),
        "+": (operator.add, 2),
        "-": (operator.sub, 2),
        "pos": (operator.pos, 1),
        "neg": (operator.neg, 1),
        "abs": (abs, 1),
        "bool": (bool, 1),
        "float": (float, 1),
        "int": (int, 1),
        "max": (max, 2),
        "min": (min, 2),
        "pow": (pow, 2),
        "round": (round, 1),
        "ceil": (math.ceil, 1),
        "copysign": (math.copysign, 2),
        "fabs": (math.fabs, 1),
        "factorial": (math.factorial, 1),
        "floor": (math.floor, 1),
        # math.fmod(x, y), math.ldexp(x, i) and cmath.rect(r, phi) all take
        # two arguments; with an arity of 1 they could never be evaluated.
        "fmod": (math.fmod, 2),
        "gcd": (gcd, 2),
        "lcm": (lcm, 2),
        "ldexp": (math.ldexp, 2),
        "trunc": (math.trunc, 1),
        "real": (operator.attrgetter("real"), 1),
        "imag": (operator.attrgetter("imag"), 1),
        "exp": (adapt_cmath("exp"), 1),
        "log": (adapt_cmath("log"), 1),
        "ln": (adapt_cmath("log"), 1),
        "log10": (adapt_cmath("log10"), 1),
        "lg": (adapt_cmath("log10"), 1),
        "sqrt": (adapt_cmath("sqrt"), 1),
        "√": (adapt_cmath("sqrt"), 1),
        "acos": (adapt_cmath("acos"), 1),
        "asin": (adapt_cmath("asin"), 1),
        "atan": (adapt_cmath("atan"), 1),
        "cos": (adapt_cmath("cos"), 1),
        "sin": (adapt_cmath("sin"), 1),
        "tan": (adapt_cmath("tan"), 1),
        "atan2": (math.atan2, 2),
        "hypot": (math.hypot, 2),
        "degrees": (math.degrees, 1),
        "radians": (math.radians, 1),
        "acosh": (adapt_cmath("acosh"), 1),
        "asinh": (adapt_cmath("asinh"), 1),
        "atanh": (adapt_cmath("atanh"), 1),
        "cosh": (adapt_cmath("cosh"), 1),
        "sinh": (adapt_cmath("sinh"), 1),
        "tanh": (adapt_cmath("tanh"), 1),
        "erf": (math.erf, 1),
        "erfc": (math.erfc, 1),
        "gamma": (math.gamma, 1),
        "lgamma": (math.lgamma, 1),
        "phase": (cmath.phase, 1),
        "rect": (cmath.rect, 2),
        "inv": (operator.inv, 1),
        "and": (operator.and_, 2),
        "or": (operator.or_, 2),
        "xor": (operator.xor, 2),
        "rand": (random.random, 0),
        "randrng": (random.uniform, 2),
    }

    # Default name of the variable holding the previous result.
    ansvar = '_'

    # A numeric literal, optionally with exponent and imaginary suffix 'i'.
    re_float = re.compile(r'([0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?i?)')
    # Any operator or delimiter from `operators`.
    re_delim = re.compile(
        '(%s)' % ('|'.join(map(re.escape, operators.keys()))))
    # Tokenizer pattern.  The leading identifier alternative keeps names that
    # contain digits (log10, atan2) in one piece; without it the number
    # alternative would cut them into a name and a literal.
    re_split = re.compile(
        r'([^\W\d]\w*|[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?i?|%s)' % (
            '|'.join(map(re.escape, operators.keys()))))

    def __init__(self, ansvar=None, autoclose=False):
        """Create a calculator.

        Args:
            ansvar: name of the previous-result variable; falls back to the
                class default when empty or None.
            autoclose: when true, unclosed left parentheses are tolerated at
                the end of an expression.
        """
        self.ansvar = ansvar or self.ansvar
        self.vars = {self.ansvar: 0}
        self.autoclose = autoclose

    def splitexpr(self, expr):
        """Tokenize ``expr`` and yield Token objects.

        Token text is lower-cased; whitespace produces no token.  Each
        token's ``pos`` is the offset of its first character in ``expr``.

        Raises:
            SyntaxError: a piece of text is not a number, operator, constant,
                variable or function name.
        """
        pos = 0
        for piece in resplit(self.re_split, expr):
            s = piece.lower()
            if not s.strip():
                pass
            elif self.re_float.match(s):
                factor = 1
                if s[-1] == 'i':
                    # Imaginary literal: strip the suffix for conversion.
                    factor = 1j
                    s = s[:-1]
                if '.' in s or 'e' in s:
                    yield Token(s, pos, 'num', value=float(s) * factor)
                else:
                    yield Token(s, pos, 'num', value=int(s) * factor)
            elif self.re_delim.match(s):
                val = self.functions[s][0] if s in self.functions else None
                yield Token(s, pos, *self.operators[s], value=val)
            elif s in self.const:
                yield Token(s, pos, 'const', value=self.const[s])
            elif s in self.vars:
                yield Token(s, pos, 'var')
            elif s in self.functions:
                fn = self.functions[s]
                yield Token(s, pos, 'fn', argnum=fn[1], value=fn[0])
            else:
                raise SyntaxError(pos, len(s))
            # Advance by the width of the source text, not of the possibly
            # shortened token (the 'i' suffix is stripped above), so later
            # positions keep pointing at the right character.
            pos += len(piece)

    def torpn(self, lstin):
        """Reorder infix tokens into Reverse Polish Notation (shunting-yard).

        A '+' or '-' at the start, or right after '(', ',' or another
        prefix/binary operator, is rewritten into the unary 'pos'/'neg'
        operator.

        Raises:
            SyntaxError: unbalanced ',' or ')', a function with arguments not
                followed by '(', or (unless autoclose is set) an unclosed '('.
        """
        opstack = []
        lastt = None
        for token in lstin:
            if token.type == '(':
                opstack.append(token)
            elif token.type.startswith('op'):
                # A sign is unary unless it follows an operand.  A postfix
                # operator (left-associative with one argument, i.e. '!')
                # completes an operand, so a sign after it stays binary.
                after_operand = lastt is not None and (
                    lastt.type not in ('(', 'op_l', 'op_r', ',') or
                    (lastt.type == 'op_l' and lastt.argnum == 1))
                if token.name in ('+', '-') and not after_operand:
                    if token.name == '+':
                        token.name = 'pos'
                        token.value = operator.pos
                    else:
                        token.name = 'neg'
                        token.value = operator.neg
                    token.type = 'op_r'
                    token.priority = 0
                    token.argnum = 1
                if opstack:
                    tok2 = opstack[-1]
                    # Emit stacked operators that bind at least as tightly
                    # (strictly tighter for right-associative tokens).
                    while (tok2.type.startswith('op') and
                           (token.type[-1] == 'l' and token.priority >= tok2.priority or
                            token.type[-1] == 'r' and token.priority > tok2.priority)):
                        yield opstack.pop()
                        if opstack:
                            tok2 = opstack[-1]
                        else:
                            break
                opstack.append(token)
            elif token.type == ',':
                try:
                    while opstack[-1].name != '(':
                        yield opstack.pop()
                except IndexError:
                    # Report the character offset so the error caret lines
                    # up with the expression text.
                    raise SyntaxError(token.pos, len(token.name))
            elif token.type == ')':
                try:
                    while opstack[-1].name != '(':
                        yield opstack.pop()
                except IndexError:
                    raise SyntaxError(token.pos, len(token.name))
                opstack.pop()  # discard the matching '('
                if opstack and opstack[-1].type == 'fn':
                    yield opstack.pop()
            elif token.type in ('const', 'var'):
                yield token
            elif token.type == 'fn':
                opstack.append(token)
            else:
                yield token
            # check function brackets
            if lastt and token.type != '(' and lastt.type == 'fn' and lastt.argnum:
                raise SyntaxError(lastt.pos, len(lastt.name))
            lastt = token
        while opstack:
            op = opstack.pop()
            if op.type != '(':
                yield op
            # If self.autoclose then ignored right parenthesis is allowed.
            elif not self.autoclose:
                raise SyntaxError(op.pos, len(op.name))

    def evalrpn(self, lstin):
        '''Evaluates the Reverse Polish Expression.

        Returns:
            The single value left on the stack, or None for an empty input.

        Raises:
            SyntaxError: an operator lacks operands, or values are left over.
            MathError: the operator/function raised while being applied.
            KbdBreak: evaluation was interrupted from the keyboard.
        '''
        numstack = []
        for token in lstin:
            if token.type in ('num', 'const'):
                numstack.append(token.value)
            elif token.type == 'var':
                numstack.append(self.vars[token.name])
            elif token.type in ('op_l', 'op_r', 'fn'):
                try:
                    args = [numstack.pop() for i in range(token.argnum)]
                except IndexError:
                    raise SyntaxError(token.pos, len(token.name))
                try:
                    # Arguments were popped last-first; restore call order.
                    numstack.append(token.value(*reversed(args)))
                except KeyboardInterrupt:
                    raise KbdBreak(token.pos, len(token.name))
                except Exception:
                    raise MathError(token.pos, len(token.name))
            else:
                # Logic error in program
                raise AssertionError('token %r appears in RPN' % token)
        if len(numstack) > 1:
            raise SyntaxError(token.pos, len(token.name))
        elif numstack:
            return numstack.pop()
        else:
            return None

    def eval(self, expr):
        """Evaluate ``expr``, remember the result as the answer variable and return it."""
        ret = self.evalrpn(self.torpn(self.splitexpr(expr)))
        self.vars[self.ansvar] = ret
        return ret

    def format(self, ret):
        """Render a result as text.

        None becomes the empty string and any falsy number '0'.  Complex
        values omit a zero real part and print an imaginary part of +1/-1 as
        a bare 'i' / '-i'.
        """
        if ret is None:
            return ''
        elif isinstance(ret, complex):
            s = str(ret.real) if ret.real else ''
            if ret.imag:
                sign = '+' if ret.imag > 0 and s else ''
                if ret.imag == 1:
                    imag = ''
                elif ret.imag == -1:
                    imag = '-'
                else:
                    imag = str(ret.imag)
                s += sign + imag + 'i'
            elif not ret:
                s = '0'
            return s
        elif ret:
            return str(ret)
        else:
            return '0'

    def pretty(self, expr):
        """Evaluate ``expr`` and return the formatted result.

        Calculator errors are turned into a three-line report: the error
        kind, the expression, and a caret line marking the failing token.
        """
        try:
            return self.format(self.eval(expr))
        except MathError as ex:
            return "Math Error:\n %s\n %s" % (
                expr, ' ' * ex.pos + '^' * ex.length)
        except SyntaxError as ex:
            return "Syntax Error:\n %s\n %s" % (
                expr, ' ' * ex.pos + '^' * ex.length)
        except KbdBreak as ex:
            return "Keyboard Break:\n %s\n %s" % (
                expr, ' ' * ex.pos + '^' * ex.length)
397 | 
398 | 
def main():
    """Run the interactive prompt until Ctrl-C or end of input.

    Every line is evaluated with Calculator.pretty(); non-empty output is
    printed.  Returns 0.
    """
    calc = Calculator()
    while True:
        try:
            line = input("> ")
        except (KeyboardInterrupt, EOFError):
            break
        output = calc.pretty(line)
        if output:
            print(output)
    print("\b\b", end='')
    return 0
412 | 
# Script entry point.
if __name__ == '__main__':
    try:
        # Optional: importing readline gives input() line editing and
        # history where the module is available; its absence is ignored.
        import readline
    except ImportError:
        pass
    main()
419 | 


--------------------------------------------------------------------------------
/vendor/stopwords.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | about
  3 | above
  4 | after
  5 | again
  6 | against
  7 | all
  8 | am
  9 | an
 10 | and
 11 | any
 12 | are
 13 | as
 14 | at
 15 | be
 16 | because
 17 | been
 18 | before
 19 | being
 20 | below
 21 | between
 22 | both
 23 | but
 24 | by
 25 | cannot
 26 | com
 27 | could
 28 | did
 29 | do
 30 | does
 31 | doing
 32 | down
 33 | during
 34 | each
 35 | few
 36 | for
 37 | from
 38 | further
 39 | had
 40 | has
 41 | have
 42 | having
 43 | he
 44 | her
 45 | here
 46 | hers
 47 | herself
 48 | him
 49 | himself
 50 | his
 51 | how
 52 | http
 53 | https
 54 | i
 55 | if
 56 | in
 57 | into
 58 | is
 59 | it
 60 | its
 61 | itself
 62 | me
 63 | more
 64 | most
 65 | my
 66 | myself
 67 | no
 68 | nor
 69 | not
 70 | of
 71 | off
 72 | on
 73 | once
 74 | only
 75 | or
 76 | other
 77 | ought
 78 | our
 79 | ours
 80 | ourselves
 81 | out
 82 | over
 83 | own
 84 | same
 85 | she
 86 | should
 87 | so
 88 | some
 89 | such
 90 | than
 91 | that
 92 | the
 93 | their
 94 | theirs
 95 | them
 96 | themselves
 97 | then
 98 | there
 99 | these
100 | they
101 | this
102 | those
103 | through
104 | to
105 | too
106 | under
107 | until
108 | up
109 | very
110 | was
111 | we
112 | were
113 | what
114 | when
115 | where
116 | which
117 | while
118 | who
119 | whom
120 | why
121 | with
122 | would
123 | www
124 | you
125 | your
126 | yours
127 | yourself
128 | yourselves
129 | 阿
130 | 啊
131 | 哎
132 | 哎呀
133 | 哎哟
134 | 唉
135 | 嗳
136 | 安全
137 | 俺
138 | 俺们
139 | 按
140 | 按照
141 | 吧
142 | 吧哒
143 | 把
144 | 罢了
145 | 呗
146 | 帮助
147 | 保持
148 | 被
149 | 本
150 | 本着
151 | 彼
152 | 彼此
153 | 比
154 | 比方
155 | 比较
156 | 比如
157 | 鄙人
158 | 必然
159 | 必须
160 | 必要
161 | 避免
162 | 边
163 | 变成
164 | 表明
165 | 表示
166 | 别
167 | 别的
168 | 别说
169 | 并
170 | 并不
171 | 并不是
172 | 并没有
173 | 并且
174 | 不比
175 | 不变
176 | 不成
177 | 不单
178 | 不但
179 | 不得
180 | 不独
181 | 不断
182 | 不敢
183 | 不够
184 | 不管
185 | 不光
186 | 不过
187 | 不会
188 | 不仅
189 | 不久
190 | 不拘
191 | 不可
192 | 不论
193 | 不能
194 | 不怕
195 | 不然
196 | 不如
197 | 不是
198 | 不特
199 | 不同
200 | 不惟
201 | 不问
202 | 不要
203 | 不一
204 | 不只
205 | 不足
206 | 部分
207 | 采取
208 | 曾经
209 | 产生
210 | 常常
211 | 彻底
212 | 趁
213 | 趁着
214 | 乘
215 | 成为
216 | 充分
217 | 冲
218 | 出来
219 | 出去
220 | 出现
221 | 除
222 | 除此之外
223 | 除非
224 | 除了
225 | 处理
226 | 此
227 | 此间
228 | 此时
229 | 此外
230 | 从
231 | 从而
232 | 从事
233 | 促进
234 | 啐
235 | 存在
236 | 达到
237 | 打
238 | 大大
239 | 大多数
240 | 大家
241 | 大力
242 | 大量
243 | 大批
244 | 大约
245 | 代替
246 | 待
247 | 但
248 | 但是
249 | 当
250 | 当前
251 | 当然
252 | 当时
253 | 当着
254 | 到
255 | 得
256 | 得出
257 | 得到
258 | 的
259 | 的话
260 | 等
261 | 等等
262 | 地
263 | 第
264 | 叮咚
265 | 咚
266 | 对
267 | 对应
268 | 对于
269 | 多
270 | 多次
271 | 多少
272 | 多数
273 | 呃
274 | 而
275 | 而况
276 | 而且
277 | 而是
278 | 而外
279 | 而言
280 | 而已
281 | 尔后
282 | 反过来
283 | 反过来说
284 | 反应
285 | 反映
286 | 反之
287 | 范围
288 | 方便
289 | 方面
290 | 防止
291 | 非常
292 | 非但
293 | 非徒
294 | 分别
295 | 丰富
296 | 否则
297 | 复杂
298 | 附近
299 | 嘎
300 | 嘎登
301 | 该
302 | 赶
303 | 高兴
304 | 个
305 | 个别
306 | 个人
307 | 各
308 | 各地
309 | 各个
310 | 各级
311 | 各人
312 | 各位
313 | 各种
314 | 各自
315 | 给
316 | 根本
317 | 根据
318 | 跟
319 | 更加
320 | 巩固
321 | 共同
322 | 构成
323 | 固然
324 | 故
325 | 故此
326 | 关于
327 | 管
328 | 广大
329 | 广泛
330 | 归
331 | 规定
332 | 果然
333 | 果真
334 | 过
335 | 过来
336 | 过去
337 | 哈
338 | 哈哈
339 | 咳
340 | 还是
341 | 还有
342 | 行动
343 | 行为
344 | 毫不
345 | 好的
346 | 好象
347 | 呵
348 | 嗬
349 | 何
350 | 何处
351 | 何况
352 | 何时
353 | 合理
354 | 和
355 | 嘿
356 | 哼
357 | 哼唷
358 | 后来
359 | 后面
360 | 後来
361 | 後面
362 | 乎
363 | 呼哧
364 | 互相
365 | 哗
366 | 欢迎
367 | 换句话说
368 | 换言之
369 | 或
370 | 或是
371 | 或者
372 | 获得
373 | 基本
374 | 积极
375 | 即
376 | 即便
377 | 即或
378 | 即令
379 | 即若
380 | 即使
381 | 及
382 | 及其
383 | 及时
384 | 及至
385 | 极了
386 | 集中
387 | 几
388 | 几乎
389 | 几时
390 | 己
391 | 既
392 | 既然
393 | 既是
394 | 继而
395 | 继续
396 | 加强
397 | 加入
398 | 加以
399 | 加之
400 | 假如
401 | 假若
402 | 假使
403 | 坚持
404 | 坚决
405 | 鉴于
406 | 将
407 | 叫
408 | 叫做
409 | 较
410 | 较之
411 | 接着
412 | 接著
413 | 结果
414 | 结合
415 | 借
416 | 今后
417 | 今後
418 | 今年
419 | 今天
420 | 紧接着
421 | 尽
422 | 尽管
423 | 进步
424 | 进而
425 | 进行
426 | 进入
427 | 经
428 | 经常
429 | 经过
430 | 就
431 | 就是
432 | 就是说
433 | 举行
434 | 具体
435 | 具体地说
436 | 具体说来
437 | 具有
438 | 巨大
439 | 据
440 | 决定
441 | 绝对
442 | 觉得
443 | 开始
444 | 开外
445 | 开展
446 | 看出
447 | 看到
448 | 看见
449 | 看看
450 | 看来
451 | 考虑
452 | 靠
453 | 可
454 | 可见
455 | 可能
456 | 可是
457 | 可以
458 | 况且
459 | 扩大
460 | 啦
461 | 来
462 | 来着
463 | 了
464 | 了解
465 | 离
466 | 哩
467 | 里面
468 | 例如
469 | 立即
470 | 联系
471 | 连
472 | 连同
473 | 练习
474 | 良好
475 | 两者
476 | 临
477 | 另
478 | 另外
479 | 另一方面
480 | 论
481 | 吗
482 | 嘛
483 | 满足
484 | 慢说
485 | 漫说
486 | 冒
487 | 么
488 | 没有
489 | 每
490 | 每当
491 | 每个
492 | 每年
493 | 每天
494 | 们
495 | 密切
496 | 明确
497 | 明显
498 | 莫若
499 | 某
500 | 某个
501 | 某些
502 | 目前
503 | 拿
504 | 哪
505 | 哪边
506 | 哪儿
507 | 哪个
508 | 哪里
509 | 哪年
510 | 哪怕
511 | 哪天
512 | 哪些
513 | 哪样
514 | 那
515 | 那边
516 | 那儿
517 | 那个
518 | 那会儿
519 | 那里
520 | 那么
521 | 那么些
522 | 那么样
523 | 那时
524 | 那些
525 | 那样
526 | 乃
527 | 乃至
528 | 呢
529 | 能
530 | 能否
531 | 能够
532 | 嗯
533 | 你
534 | 你的
535 | 你们
536 | 您
537 | 宁
538 | 宁可
539 | 宁肯
540 | 宁愿
541 | 喏
542 | 喔唷
543 | 哦
544 | 呕
545 | 啪达
546 | 旁人
547 | 呸
548 | 凭
549 | 凭借
550 | 普遍
551 | 普通
552 | 其
553 | 其次
554 | 其二
555 | 其实
556 | 其他
557 | 其它
558 | 其一
559 | 其余
560 | 其中
561 | 企图
562 | 岂但
563 | 起
564 | 起见
565 | 起来
566 | 恰恰相反
567 | 前后
568 | 前进
569 | 前面
570 | 前者
571 | 强调
572 | 强烈
573 | 且
574 | 清楚
575 | 取得
576 | 全部
577 | 全面
578 | 却不
579 | 确定
580 | 然而
581 | 然后
582 | 然後
583 | 然则
584 | 让
585 | 人家
586 | 人们
587 | 任
588 | 任何
589 | 任凭
590 | 任务
591 | 认识
592 | 认为
593 | 认真
594 | 仍然
595 | 容易
596 | 如
597 | 如此
598 | 如果
599 | 如何
600 | 如其
601 | 如若
602 | 如上所述
603 | 如下
604 | 若
605 | 若非
606 | 若是
607 | 啥
608 | 上来
609 | 上面
610 | 上去
611 | 上升
612 | 上述
613 | 上下
614 | 尚且
615 | 少数
616 | 设若
617 | 设使
618 | 深入
619 | 甚而
620 | 甚么
621 | 甚至
622 | 省得
623 | 失去
624 | 什么
625 | 什么样
626 | 十分
627 | 实际
628 | 实现
629 | 时候
630 | 使得
631 | 使用
632 | 是
633 | 是不是
634 | 是的
635 | 是否
636 | 适当
637 | 适应
638 | 适用
639 | 首先
640 | 受到
641 | 属于
642 | 双方
643 | 谁
644 | 谁知
645 | 顺
646 | 顺着
647 | 说明
648 | 说说
649 | 似的
650 | 似乎
651 | 虽
652 | 虽然
653 | 虽说
654 | 虽则
655 | 随
656 | 随着
657 | 随著
658 | 所
659 | 所谓
660 | 所以
661 | 所有
662 | 他
663 | 他的
664 | 他们
665 | 他人
666 | 她
667 | 她的
668 | 她们
669 | 它
670 | 它的
671 | 它们
672 | 它们的
673 | 倘
674 | 倘或
675 | 倘然
676 | 倘若
677 | 倘使
678 | 特别是
679 | 特点
680 | 特殊
681 | 腾
682 | 替
683 | 通常
684 | 通过
685 | 同
686 | 同时
687 | 同样
688 | 同一
689 | 突出
690 | 突然
691 | 哇
692 | 完成
693 | 完全
694 | 万一
695 | 往
696 | 往往
697 | 望
698 | 为
699 | 为何
700 | 为了
701 | 为什么
702 | 为着
703 | 为主
704 | 维持
705 | 伟大
706 | 喂
707 | 问题
708 | 嗡嗡
709 | 我
710 | 我的
711 | 我们
712 | 乌乎
713 | 呜
714 | 呜呼
715 | 无法
716 | 无论
717 | 无宁
718 | 毋宁
719 | 兮
720 | 嘻
721 | 下来
722 | 下列
723 | 下面
724 | 下去
725 | 吓
726 | 先后
727 | 先後
728 | 先生
729 | 显然
730 | 显著
731 | 现代
732 | 现在
733 | 限制
734 | 相当
735 | 相等
736 | 相对
737 | 相对而言
738 | 相反
739 | 相似
740 | 相同
741 | 相信
742 | 相应
743 | 像
744 | 向
745 | 向着
746 | 心里
747 | 形成
748 | 嘘
749 | 需要
750 | 许多
751 | 宣布
752 | 迅速
753 | 呀
754 | 焉
755 | 严格
756 | 严重
757 | 沿
758 | 沿着
759 | 要
760 | 要不
761 | 要不然
762 | 要不是
763 | 要么
764 | 要求
765 | 要是
766 | 也
767 | 也罢
768 | 也好
769 | 也是
770 | 一
771 | 一般
772 | 一边
773 | 一次
774 | 一旦
775 | 一定
776 | 一方面
777 | 一来
778 | 一面
779 | 一片
780 | 一起
781 | 一切
782 | 一时
783 | 一天
784 | 一下
785 | 一些
786 | 一样
787 | 一则
788 | 一直
789 | 一致
790 | 依
791 | 依靠
792 | 依照
793 | 咦
794 | 移动
795 | 以
796 | 以便
797 | 以后
798 | 以後
799 | 以及
800 | 以来
801 | 以免
802 | 以前
803 | 以上
804 | 以外
805 | 以为
806 | 以下
807 | 以至
808 | 以至于
809 | 以致
810 | 已经
811 | 矣
812 | 意思
813 | 抑或
814 | 因
815 | 因此
816 | 因而
817 | 因为
818 | 引起
819 | 应当
820 | 应该
821 | 应用
822 | 哟
823 | 用
824 | 尤其
825 | 由
826 | 由此可见
827 | 由于
828 | 有
829 | 有的
830 | 有点
831 | 有关
832 | 有利
833 | 有力
834 | 有时
835 | 有所
836 | 有效
837 | 有些
838 | 有着
839 | 有著
840 | 又
841 | 于
842 | 于是
843 | 于是乎
844 | 与
845 | 与此同时
846 | 与否
847 | 与其
848 | 遇到
849 | 原来
850 | 愿意
851 | 越是
852 | 云云
853 | 允许
854 | 运用
855 | 咋
856 | 哉
857 | 再说
858 | 再者
859 | 在
860 | 在下
861 | 咱
862 | 咱们
863 | 遭到
864 | 造成
865 | 则
866 | 怎
867 | 怎么
868 | 怎么办
869 | 怎么样
870 | 怎样
871 | 战斗
872 | 掌握
873 | 朝
874 | 朝着
875 | 召开
876 | 照
877 | 照着
878 | 者
879 | 这
880 | 这边
881 | 这点
882 | 这儿
883 | 这个
884 | 这会儿
885 | 这就是说
886 | 这里
887 | 这么
888 | 这么点儿
889 | 这么些
890 | 这么样
891 | 这时
892 | 这些
893 | 这样
894 | 这种
895 | 着
896 | 着呢
897 | 真是
898 | 真正
899 | 争取
900 | 整个
901 | 正常
902 | 正如
903 | 正在
904 | 之
905 | 之后
906 | 之後
907 | 之类
908 | 之前
909 | 之所以
910 | 之一
911 | 吱
912 | 知道
913 | 直到
914 | 直接
915 | 只是
916 | 只限
917 | 只要
918 | 只有
919 | 至
920 | 至于
921 | 中间
922 | 中小
923 | 重大
924 | 重新
925 | 重要
926 | 周围
927 | 诸位
928 | 逐步
929 | 逐渐
930 | 主要
931 | 主张
932 | 注意
933 | 专门
934 | 转变
935 | 转动
936 | 转贴
937 | 准备
938 | 自
939 | 自从
940 | 自个儿
941 | 自各儿
942 | 自己
943 | 自家
944 | 自身
945 | 综上所述
946 | 总的来看
947 | 总的来说
948 | 总的说来
949 | 总而言之
950 | 总结
951 | 总是
952 | 总之
953 | 纵
954 | 纵令
955 | 纵然
956 | 纵使
957 | 组成
958 | 最大
959 | 最高
960 | 最好
961 | 最后
962 | 最後
963 | 最近
964 | 遵照
965 | 左右
966 | 作为
967 | 做到
968 | 


--------------------------------------------------------------------------------
/vendor/umsgpack.py:
--------------------------------------------------------------------------------
  1 | # u-msgpack-python v2.0 - vsergeev at gmail
  2 | # https://github.com/vsergeev/u-msgpack-python
  3 | #
  4 | # u-msgpack-python is a lightweight MessagePack serializer and deserializer
  5 | # module, compatible with both Python 2 and 3, as well as CPython and PyPy
  6 | # implementations of Python. u-msgpack-python is fully compliant with the
  7 | # latest MessagePack specification
  8 | # (https://github.com/msgpack/msgpack/blob/master/spec.md). In particular, it
  9 | # supports the new binary, UTF-8 string, and application ext types.
 10 | #
 11 | # MIT License
 12 | #
 13 | # Copyright (c) 2013-2014 Ivan A. Sergeev
 14 | #
 15 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 16 | # of this software and associated documentation files (the "Software"), to deal
 17 | # in the Software without restriction, including without limitation the rights
 18 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 19 | # copies of the Software, and to permit persons to whom the Software is
 20 | # furnished to do so, subject to the following conditions:
 21 | #
 22 | # The above copyright notice and this permission notice shall be included in
 23 | # all copies or substantial portions of the Software.
 24 | #
 25 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 26 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 27 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 28 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 29 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 30 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 31 | # THE SOFTWARE.
 32 | #
 33 | """
 34 | u-msgpack-python v2.0 - vsergeev at gmail
 35 | https://github.com/vsergeev/u-msgpack-python
 36 | 
 37 | u-msgpack-python is a lightweight MessagePack serializer and deserializer
 38 | module, compatible with both Python 2 and 3, as well as CPython and PyPy
 39 | implementations of Python. u-msgpack-python is fully compliant with the
 40 | latest MessagePack specification
 41 | (https://github.com/msgpack/msgpack/blob/master/spec.md). In particular, it
 42 | supports the new binary, UTF-8 string, and application ext types.
 43 | 
 44 | License: MIT
 45 | """
 46 | 
 47 | version = (2,0)
 48 | "Module version tuple"
 49 | 
 50 | import struct
 51 | import collections
 52 | import sys
 53 | import io
 54 | 
 55 | ################################################################################
 56 | ### Ext Class
 57 | ################################################################################
 58 | 
 59 | # Extension type for application-defined types and data
 60 | class Ext:
 61 |     """
 62 |     The Ext class facilitates creating a serializable extension object to store
 63 |     an application-defined type and data byte array.
 64 |     """
 65 | 
 66 |     def __init__(self, type, data):
 67 |         """
 68 |         Construct a new Ext object.
 69 | 
 70 |         Args:
 71 |             type: application-defined type integer from 0 to 127
 72 |             data: application-defined data byte array
 73 | 
 74 |         Raises:
 75 |             TypeError:
 76 |                 Specified ext type is outside of 0 to 127 range.
 77 | 
 78 |         Example:
 79 |         >>> foo = umsgpack.Ext(0x05, b"\x01\x02\x03")
 80 |         >>> umsgpack.packb({u"special stuff": foo, u"awesome": True})
 81 |         '\x82\xa7awesome\xc3\xadspecial stuff\xc7\x03\x05\x01\x02\x03'
 82 |         >>> bar = umsgpack.unpackb(_)
 83 |         >>> print(bar["special stuff"])
 84 |         Ext Object (Type: 0x05, Data: 01 02 03)
 85 |         >>>
 86 |         """
 87 |         # Application ext type should be 0 <= type <= 127
 88 |         if not isinstance(type, int) or not (type >= 0 and type <= 127):
 89 |             raise TypeError("ext type out of range")
 90 |         # Check data is type bytes
 91 |         elif sys.version_info[0] == 3 and not isinstance(data, bytes):
 92 |             raise TypeError("ext data is not type \'bytes\'")
 93 |         elif sys.version_info[0] == 2 and not isinstance(data, str):
 94 |             raise TypeError("ext data is not type \'str\'")
 95 |         self.type = type
 96 |         self.data = data
 97 | 
 98 |     def __eq__(self, other):
 99 |         """
100 |         Compare this Ext object with another for equality.
101 |         """
102 |         return (isinstance(other, self.__class__) and
103 |                 self.type == other.type and
104 |                 self.data == other.data)
105 | 
106 |     def __ne__(self, other):
107 |         """
108 |         Compare this Ext object with another for inequality.
109 |         """
110 |         return not self.__eq__(other)
111 | 
112 |     def __str__(self):
113 |         """
114 |         String representation of this Ext object.
115 |         """
116 |         s = "Ext Object (Type: 0x%02x, Data: " % self.type
117 |         for i in range(min(len(self.data), 8)):
118 |             if i > 0:
119 |                 s += " "
120 |             if isinstance(self.data[i], int):
121 |                 s += "%02x" % (self.data[i])
122 |             else:
123 |                 s += "%02x" % ord(self.data[i])
124 |         if len(self.data) > 8:
125 |             s += " ..."
126 |         s += ")"
127 |         return s
128 | 
129 | ################################################################################
130 | ### Exceptions
131 | ################################################################################
132 | 
# Root exception types: one for serialization, one for deserialization
class PackException(Exception):
    """Common ancestor of every error raised while packing."""


class UnpackException(Exception):
    """Common ancestor of every error raised while unpacking."""


# Raised by the packer
class UnsupportedTypeException(PackException):
    """The object's type (or size) cannot be represented when packing."""


# Raised by the unpacker
class InsufficientDataException(UnpackException):
    """The input ended before the encoded object was complete."""


class InvalidStringException(UnpackException):
    """A string payload was not valid UTF-8."""


class ReservedCodeException(UnpackException):
    """A reserved format code was found in the input."""


class UnhashableKeyException(UnpackException):
    """
    A map key that cannot be hashed was found in the input.
    Such a map cannot be represented as a Python dictionary.
    """


class DuplicateKeyException(UnpackException):
    """The same key occurred more than once in a serialized map."""


# Older names kept as aliases for backwards compatibility
KeyNotPrimitiveException = UnhashableKeyException
KeyDuplicateException = DuplicateKeyException
169 | 
170 | ################################################################################
171 | ### Exported Functions and Globals
172 | ################################################################################
173 | 
# Public entry points of the module. They are only placeholders here:
# __init(), called at the bottom of the module, rebinds each of them to the
# Python 2 or Python 3 implementation. dump/dumps/load/loads are bound to the
# same functions as pack/packb/unpack/unpackb.
pack = None
packb = None
unpack = None
unpackb = None
dump = None
dumps = None
load = None
loads = None

# Module-wide switch read by the pack functions and by _unpack_string();
# described in the string that follows.
compatibility = False
"""
Compatibility mode boolean.

When compatibility mode is enabled, u-msgpack-python will serialize both
unicode strings and bytes into the old "raw" msgpack type, and deserialize the
"raw" msgpack type into bytes. This provides backwards compatibility with the
old MessagePack specification.

Example:
>>> umsgpack.compatibility = True
>>>
>>> umsgpack.packb([u"some string", b"some bytes"])
b'\x92\xabsome string\xaasome bytes'
>>> umsgpack.unpackb(_)
[b'some string', b'some bytes']
>>>
"""
202 | 
203 | ################################################################################
204 | ### Packing
205 | ################################################################################
206 | 
207 | # You may notice struct.pack("B", obj) instead of the simpler chr(obj) in the
208 | # code below. This is to allow for seamless Python 2 and 3 compatibility, as
209 | # chr(obj) has a str return type instead of bytes in Python 3, and
210 | # struct.pack(...) has the right return type in both versions.
211 | 
def _pack_integer(obj, fp):
    """
    Write integer `obj` to `fp` using the smallest MessagePack int format
    that can hold it.

    Raises:
        UnsupportedTypeException:
            Value is below -2**63 or above 2**64-1.
    """
    if obj < 0:
        # (lowest value representable, format marker, struct format),
        # narrowest encoding first; the first entry is the negative fixint.
        candidates = (
            (-32, b"", "b"),
            (-2**(8-1), b"\xd0", "b"),
            (-2**(16-1), b"\xd1", ">h"),
            (-2**(32-1), b"\xd2", ">i"),
            (-2**(64-1), b"\xd3", ">q"),
        )
        for lowest, marker, fmt in candidates:
            if obj >= lowest:
                fp.write(marker + struct.pack(fmt, obj))
                return
        raise UnsupportedTypeException("huge signed int")

    # (highest value representable, format marker, struct format),
    # narrowest encoding first; the first entry is the positive fixint.
    candidates = (
        (127, b"", "B"),
        (2**8-1, b"\xcc", "B"),
        (2**16-1, b"\xcd", ">H"),
        (2**32-1, b"\xce", ">I"),
        (2**64-1, b"\xcf", ">Q"),
    )
    for highest, marker, fmt in candidates:
        if obj <= highest:
            fp.write(marker + struct.pack(fmt, obj))
            return
    raise UnsupportedTypeException("huge unsigned int")
239 | 
def _pack_nil(obj, fp):
    """Write the nil format byte to `fp`; `obj` itself is not inspected."""
    nil_marker = b"\xc0"
    fp.write(nil_marker)
242 | 
def _pack_boolean(obj, fp):
    """Write the true (0xc3) or false (0xc2) format byte for `obj` to `fp`."""
    if obj:
        fp.write(b"\xc3")
    else:
        fp.write(b"\xc2")
245 | 
def _pack_float(obj, fp):
    """
    Write float `obj` to `fp` as a big-endian float64 when the module-level
    _float_size is 64, otherwise as a big-endian float32.
    """
    use_double = (_float_size == 64)
    marker = b"\xcb" if use_double else b"\xca"
    fmt = ">d" if use_double else ">f"
    fp.write(marker + struct.pack(fmt, obj))
251 | 
def _pack_string(obj, fp):
    """
    Encode text `obj` as UTF-8 and write it to `fp` in the shortest
    MessagePack str format (fixstr, str 8, str 16 or str 32).

    Raises:
        UnsupportedTypeException:
            Encoded text is longer than 2**32-1 bytes.
    """
    payload = obj.encode('utf-8')
    size = len(payload)
    if size <= 31:
        # fixstr: the length lives in the low five bits of the format byte
        header = struct.pack("B", 0xa0 | size)
    elif size <= 2**8-1:
        header = b"\xd9" + struct.pack("B", size)
    elif size <= 2**16-1:
        header = b"\xda" + struct.pack(">H", size)
    elif size <= 2**32-1:
        header = b"\xdb" + struct.pack(">I", size)
    else:
        raise UnsupportedTypeException("huge string")
    fp.write(header + payload)
264 | 
def _pack_binary(obj, fp):
    """
    Write byte string `obj` to `fp` in the shortest MessagePack bin format
    (bin 8, bin 16 or bin 32).

    Raises:
        UnsupportedTypeException:
            Data is longer than 2**32-1 bytes.
    """
    size = len(obj)
    if size <= 2**8-1:
        header = b"\xc4" + struct.pack("B", size)
    elif size <= 2**16-1:
        header = b"\xc5" + struct.pack(">H", size)
    elif size <= 2**32-1:
        header = b"\xc6" + struct.pack(">I", size)
    else:
        raise UnsupportedTypeException("huge binary string")
    fp.write(header + obj)
274 | 
def _pack_oldspec_raw(obj, fp):
    """
    Write byte string `obj` to `fp` using the "raw" formats of the old
    specification: fixraw, raw 16 or raw 32 (the 0xd9 8-bit form is never
    emitted here).

    Raises:
        UnsupportedTypeException:
            Data is longer than 2**32-1 bytes.
    """
    size = len(obj)
    if size <= 31:
        header = struct.pack("B", 0xa0 | size)
    elif size <= 2**16-1:
        header = b"\xda" + struct.pack(">H", size)
    elif size <= 2**32-1:
        header = b"\xdb" + struct.pack(">I", size)
    else:
        raise UnsupportedTypeException("huge raw string")
    fp.write(header + obj)
284 | 
def _pack_ext(obj, fp):
    """
    Write Ext-like object `obj` (attributes `type` and `data`) to `fp`.

    Payloads of exactly 1, 2, 4, 8 or 16 bytes use the fixext formats; other
    lengths use ext 8, ext 16 or ext 32. The type is written as one byte,
    masked with 0xff.

    Raises:
        UnsupportedTypeException:
            Data is longer than 2**32-1 bytes.
    """
    payload = obj.data
    size = len(payload)
    type_byte = obj.type & 0xff

    # fixext formats carry no explicit length field
    fixext_markers = {
        1: b"\xd4",
        2: b"\xd5",
        4: b"\xd6",
        8: b"\xd7",
        16: b"\xd8",
    }

    if size in fixext_markers:
        header = fixext_markers[size] + struct.pack("B", type_byte)
    elif size <= 2**8-1:
        header = b"\xc7" + struct.pack("BB", size, type_byte)
    elif size <= 2**16-1:
        header = b"\xc8" + struct.pack(">HB", size, type_byte)
    elif size <= 2**32-1:
        header = b"\xc9" + struct.pack(">IB", size, type_byte)
    else:
        raise UnsupportedTypeException("huge ext data")
    fp.write(header + payload)
304 | 
def _pack_array(obj, fp):
    """
    Write sequence `obj` to `fp`: an array header (fixarray, array 16 or
    array 32) followed by every element serialized with pack().

    Raises:
        UnsupportedTypeException:
            Sequence has more than 2**32-1 elements.
    """
    count = len(obj)
    if count <= 15:
        # fixarray: the length lives in the low four bits of the format byte
        header = struct.pack("B", 0x90 | count)
    elif count <= 2**16-1:
        header = b"\xdc" + struct.pack(">H", count)
    elif count <= 2**32-1:
        header = b"\xdd" + struct.pack(">I", count)
    else:
        raise UnsupportedTypeException("huge array")
    fp.write(header)

    for element in obj:
        pack(element, fp)
317 | 
def _pack_map(obj, fp):
    """
    Write dictionary `obj` to `fp`: a map header (fixmap, map 16 or map 32)
    followed by each key and its value, both serialized with pack().

    Raises:
        UnsupportedTypeException:
            Dictionary has more than 2**32-1 entries.
    """
    count = len(obj)
    if count <= 15:
        # fixmap: the entry count lives in the low four bits of the format byte
        fp.write(struct.pack("B", 0x80 | count))
    elif count <= 2**16-1:
        fp.write(b"\xde" + struct.pack(">H", count))
    elif count <= 2**32-1:
        fp.write(b"\xdf" + struct.pack(">I", count))
    else:
        # The message previously said "huge array", copied from _pack_array.
        raise UnsupportedTypeException("huge map")

    for k, v in obj.items():
        pack(k, fp)
        pack(v, fp)
331 | 
332 | ########################################
333 | 
334 | # Pack for Python 2, with 'unicode' type, 'str' type, and 'long' type
def _pack2(obj, fp):
    """
    Serialize a Python object into MessagePack bytes.

    This is the Python 2 implementation: it relies on the 'unicode' and
    'long' built-ins, and treats 'str' as binary data.

    Args:
        obj: a Python object
        fp: a .write()-supporting file-like object

    Returns:
        None.

    Raises:
        UnsupportedTypeException(PackException):
            Object type not supported for packing.

    Example:
    >>> f = open('test.bin', 'wb')
    >>> umsgpack.pack({u"compact": True, u"schema": 0}, f)
    >>>
    """
    if obj is None:
        _pack_nil(obj, fp)
    # bool must be tested before int: bool is a subclass of int
    elif isinstance(obj, bool):
        _pack_boolean(obj, fp)
    elif isinstance(obj, (int, long)):
        _pack_integer(obj, fp)
    elif isinstance(obj, float):
        _pack_float(obj, fp)
    elif compatibility and isinstance(obj, unicode):
        # Encode explicitly as UTF-8. bytes(obj) on a Python 2 unicode object
        # performs an implicit ASCII encode and fails on non-ASCII text.
        _pack_oldspec_raw(obj.encode('utf-8'), fp)
    elif compatibility and isinstance(obj, bytes):
        _pack_oldspec_raw(obj, fp)
    elif isinstance(obj, unicode):
        _pack_string(obj, fp)
    elif isinstance(obj, str):
        _pack_binary(obj, fp)
    elif isinstance(obj, (list, tuple)):
        _pack_array(obj, fp)
    elif isinstance(obj, dict):
        _pack_map(obj, fp)
    elif isinstance(obj, Ext):
        _pack_ext(obj, fp)
    else:
        raise UnsupportedTypeException("unsupported type: %s" % str(type(obj)))
382 | 
383 | # Pack for Python 3, with unicode 'str' type, 'bytes' type, and no 'long' type
def _pack3(obj, fp):
    """
    Serialize a Python object into MessagePack bytes.

    This is the Python 3 implementation: 'str' is text and 'bytes' is binary
    data. With the module-level compatibility flag set, both are written in
    the old "raw" format instead.

    Args:
        obj: a Python object
        fp: a .write()-supporting file-like object

    Returns:
        None.

    Raises:
        UnsupportedTypeException(PackException):
            Object type not supported for packing.

    Example:
    >>> f = open('test.bin', 'wb')
    >>> umsgpack.pack({u"compact": True, u"schema": 0}, f)
    >>>
    """
    if obj is None:
        _pack_nil(obj, fp)
        return
    # bool must be tested before int: bool is a subclass of int
    if isinstance(obj, bool):
        _pack_boolean(obj, fp)
        return
    if isinstance(obj, int):
        _pack_integer(obj, fp)
        return
    if isinstance(obj, float):
        _pack_float(obj, fp)
        return

    if isinstance(obj, str):
        if compatibility:
            _pack_oldspec_raw(obj.encode('utf-8'), fp)
        else:
            _pack_string(obj, fp)
        return
    if isinstance(obj, bytes):
        if compatibility:
            _pack_oldspec_raw(obj, fp)
        else:
            _pack_binary(obj, fp)
        return

    if isinstance(obj, (list, tuple)):
        _pack_array(obj, fp)
    elif isinstance(obj, dict):
        _pack_map(obj, fp)
    elif isinstance(obj, Ext):
        _pack_ext(obj, fp)
    else:
        raise UnsupportedTypeException("unsupported type: %s" % str(type(obj)))
430 | 
def _packb2(obj):
    """
    Serialize a Python object into MessagePack bytes (Python 2 variant).

    Args:
        obj: a Python object

    Returns:
        A 'str' containing serialized MessagePack bytes.

    Raises:
        UnsupportedTypeException(PackException):
            Object type not supported for packing.

    Example:
    >>> umsgpack.packb({u"compact": True, u"schema": 0})
    '\x82\xa7compact\xc3\xa6schema\x00'
    >>>
    """
    # Pack into an in-memory buffer and hand back everything written to it
    buf = io.BytesIO()
    _pack2(obj, buf)
    packed = buf.getvalue()
    return packed
453 | 
def _packb3(obj):
    """
    Serialize a Python object into MessagePack bytes (Python 3 variant).

    Args:
        obj: a Python object

    Returns:
        A 'bytes' containing serialized MessagePack bytes.

    Raises:
        UnsupportedTypeException(PackException):
            Object type not supported for packing.

    Example:
    >>> umsgpack.packb({u"compact": True, u"schema": 0})
    b'\x82\xa7compact\xc3\xa6schema\x00'
    >>>
    """
    # Pack into an in-memory buffer and hand back everything written to it
    buf = io.BytesIO()
    _pack3(obj, buf)
    packed = buf.getvalue()
    return packed
476 | 
477 | ################################################################################
478 | ### Unpacking
479 | ################################################################################
480 | 
def _read_except(fp, n):
    """
    Read exactly `n` bytes from `fp` and return them.

    Raises:
        InsufficientDataException:
            `fp` yielded fewer than `n` bytes.
    """
    chunk = fp.read(n)
    if len(chunk) >= n:
        return chunk
    raise InsufficientDataException()
486 | 
def _unpack_integer(code, fp):
    """
    Decode the integer introduced by format byte `code`.

    Fixints are decoded from `code` itself; the sized formats read their
    1, 2, 4 or 8 big-endian payload bytes from `fp`.

    Raises:
        Exception: `code` is not an integer format byte (dispatch error).
    """
    tag = ord(code)

    # Negative fixint: the format byte itself is the signed value
    if (tag & 0xe0) == 0xe0:
        return struct.unpack("b", code)[0]

    # Formats with a separate payload: (struct format, payload size)
    sized_formats = {
        b'\xd0': ("b", 1),
        b'\xd1': (">h", 2),
        b'\xd2': (">i", 4),
        b'\xd3': (">q", 8),
        b'\xcc': ("B", 1),
        b'\xcd': (">H", 2),
        b'\xce': (">I", 4),
        b'\xcf': (">Q", 8),
    }
    if code in sized_formats:
        fmt, width = sized_formats[code]
        return struct.unpack(fmt, _read_except(fp, width))[0]

    # Positive fixint: the format byte itself is the unsigned value
    if (tag & 0x80) == 0x00:
        return struct.unpack("B", code)[0]

    raise Exception("logic error, not int: 0x%02x" % tag)
509 | 
def _unpack_reserved(code, fp):
    """
    Handler for the reserved format byte 0xc1; never returns a value.

    Raises:
        ReservedCodeException: `code` is 0xc1.
        Exception: `code` is anything else (dispatch error).
    """
    if code != b'\xc1':
        raise Exception("logic error, not reserved code: 0x%02x" % ord(code))
    raise ReservedCodeException("encountered reserved code: 0x%02x" % ord(code))
514 | 
def _unpack_nil(code, fp):
    """
    Decode the nil format byte (0xc0) to None.

    Raises:
        Exception: `code` is not 0xc0 (dispatch error).
    """
    if code != b'\xc0':
        raise Exception("logic error, not nil: 0x%02x" % ord(code))
    return None
519 | 
def _unpack_boolean(code, fp):
    """
    Decode format byte 0xc2 to False and 0xc3 to True.

    Raises:
        Exception: `code` is neither of the two (dispatch error).
    """
    truth_values = {b'\xc2': False, b'\xc3': True}
    if code in truth_values:
        return truth_values[code]
    raise Exception("logic error, not boolean: 0x%02x" % ord(code))
526 | 
def _unpack_float(code, fp):
    """
    Decode a big-endian float32 (0xca) or float64 (0xcb) read from `fp`.

    Raises:
        Exception: `code` is not a float format byte (dispatch error).
    """
    # format byte -> (struct format, payload size)
    layouts = {b'\xca': (">f", 4), b'\xcb': (">d", 8)}
    if code not in layouts:
        raise Exception("logic error, not float: 0x%02x" % ord(code))
    fmt, width = layouts[code]
    return struct.unpack(fmt, _read_except(fp, width))[0]
533 | 
def _unpack_string(code, fp):
    """
    Decode the string introduced by format byte `code` from `fp`.

    Returns the raw bytes when the module-level compatibility flag is set,
    otherwise the payload decoded as UTF-8 text.

    Raises:
        InvalidStringException: payload is not valid UTF-8.
        Exception: `code` is not a string format byte (dispatch error).
    """
    tag = ord(code)
    # format byte -> (struct format, size of the length field)
    length_fields = {b'\xd9': ("B", 1), b'\xda': (">H", 2), b'\xdb': (">I", 4)}

    if (tag & 0xe0) == 0xa0:
        # fixstr: the length is the low five bits of the format byte
        length = tag & ~0xe0
    elif code in length_fields:
        fmt, width = length_fields[code]
        length = struct.unpack(fmt, _read_except(fp, width))[0]
    else:
        raise Exception("logic error, not string: 0x%02x" % tag)

    payload = _read_except(fp, length)

    # Compatibility mode hands back the undecoded bytes
    if compatibility:
        return payload

    try:
        return bytes.decode(payload, 'utf-8')
    except UnicodeDecodeError:
        raise InvalidStringException("unpacked string is not utf-8")
555 | 
def _unpack_binary(code, fp):
    """
    Decode a bin 8 / bin 16 / bin 32 object from `fp` and return its bytes.

    Raises:
        Exception: `code` is not a bin format byte (dispatch error).
    """
    # format byte -> (struct format, size of the length field)
    length_fields = {b'\xc4': ("B", 1), b'\xc5': (">H", 2), b'\xc6': (">I", 4)}
    if code not in length_fields:
        raise Exception("logic error, not binary: 0x%02x" % ord(code))

    fmt, width = length_fields[code]
    length = struct.unpack(fmt, _read_except(fp, width))[0]
    return _read_except(fp, length)
567 | 
def _unpack_ext(code, fp):
    """
    Decode a fixext or ext 8/16/32 object from `fp` into an Ext instance.

    The type byte is read as an unsigned value (0-255) and passed to Ext()
    unchanged.
    NOTE(review): the Ext constructor in this module rejects types above
    127 with TypeError, so such input surfaces as TypeError here.

    Raises:
        Exception: `code` is not an ext format byte (dispatch error).
    """
    # fixext formats imply their payload length
    implied_lengths = {
        b'\xd4': 1,
        b'\xd5': 2,
        b'\xd6': 4,
        b'\xd7': 8,
        b'\xd8': 16,
    }
    # format byte -> (struct format, size of the length field)
    length_fields = {b'\xc7': ("B", 1), b'\xc8': (">H", 2), b'\xc9': (">I", 4)}

    if code in implied_lengths:
        length = implied_lengths[code]
    elif code in length_fields:
        fmt, width = length_fields[code]
        length = struct.unpack(fmt, _read_except(fp, width))[0]
    else:
        raise Exception("logic error, not ext: 0x%02x" % ord(code))

    # The type byte precedes the payload in the stream
    ext_type = ord(_read_except(fp, 1))
    payload = _read_except(fp, length)
    return Ext(ext_type, payload)
589 | 
def _unpack_array(code, fp):
    """
    Decode a fixarray / array 16 / array 32 from `fp` into a list, unpacking
    each element with _unpack().

    Raises:
        Exception: `code` is not an array format byte (dispatch error).
    """
    tag = ord(code)
    if (tag & 0xf0) == 0x90:
        # fixarray: the length is the low four bits of the format byte
        length = tag & ~0xf0
    elif code == b'\xdc':
        length = struct.unpack(">H", _read_except(fp, 2))[0]
    elif code == b'\xdd':
        length = struct.unpack(">I", _read_except(fp, 4))[0]
    else:
        raise Exception("logic error, not array: 0x%02x" % tag)

    elements = []
    for _ in range(length):
        elements.append(_unpack(fp))
    return elements
601 | 
def _deep_list_to_tuple(obj):
    """
    Return `obj` with lists converted to tuples, recursing into nested lists.

    Anything that is not a list is returned unchanged, including containers
    that themselves hold lists.
    """
    if not isinstance(obj, list):
        return obj
    return tuple(map(_deep_list_to_tuple, obj))
606 | 
def _unpack_map(code, fp):
    """
    Decode a fixmap / map 16 / map 32 from `fp` into a dictionary.

    Keys that unpack as lists are converted (recursively) to tuples so they
    can be used as dictionary keys.

    Raises:
        UnhashableKeyException: a key cannot be hashed.
        DuplicateKeyException: a key occurs more than once.
        Exception: `code` is not a map format byte (dispatch error).
    """
    if (ord(code) & 0xf0) == 0x80:
        # fixmap: the entry count is the low four bits of the format byte
        length = (ord(code) & ~0xf0)
    elif code == b'\xde':
        length = struct.unpack(">H", _read_except(fp, 2))[0]
    elif code == b'\xdf':
        length = struct.unpack(">I", _read_except(fp, 4))[0]
    else:
        raise Exception("logic error, not map: 0x%02x" % ord(code))

    d = {}
    for _ in range(length):
        # Unpack key
        k = _unpack(fp)

        if isinstance(k, list):
            # Attempt to convert list into a hashable tuple
            k = _deep_list_to_tuple(k)

        # The membership test hashes the key, so it detects unhashable keys
        # (TypeError) without collections.Hashable, which Python 3.10
        # removed. It also runs for converted list keys, whose duplicates
        # were previously overwritten silently.
        try:
            duplicate = k in d
        except TypeError:
            raise UnhashableKeyException("encountered unhashable key: %s, %s" % (str(k), str(type(k))))
        if duplicate:
            raise DuplicateKeyException("encountered duplicate key: %s, %s" % (str(k), str(type(k))))

        # Unpack value
        d[k] = _unpack(fp)
    return d
638 | 
def _unpack(fp):
    """
    Read one format byte from `fp` and decode the object it introduces,
    using the handler registered for that byte in _unpack_dispatch_table.
    """
    marker = _read_except(fp, 1)
    handler = _unpack_dispatch_table[marker]
    return handler(marker, fp)
642 | 
643 | ########################################
644 | 
def _unpack2(fp):
    """
    Deserialize MessagePack bytes into a Python object.

    Bound to umsgpack.unpack and umsgpack.load on Python 2; delegates to
    _unpack().

    Args:
        fp: a .read()-supporting file-like object

    Returns:
        A Python object.

    Raises:
        InsufficientDataException(UnpackException):
            Insufficient data to unpack the encoded object.
        InvalidStringException(UnpackException):
            Invalid UTF-8 string encountered during unpacking.
        ReservedCodeException(UnpackException):
            Reserved code encountered during unpacking.
        UnhashableKeyException(UnpackException):
            Unhashable key encountered during map unpacking.
            The serialized map cannot be deserialized into a Python dictionary.
        DuplicateKeyException(UnpackException):
            Duplicate key encountered during map unpacking.

    Example:
    >>> f = open("test.bin", "rb")
    >>> umsgpack.unpack(f)
    {u'compact': True, u'schema': 0}
    >>>
    """
    return _unpack(fp)
675 | 
def _unpack3(fp):
    """
    Deserialize MessagePack bytes into a Python object.

    Bound to umsgpack.unpack and umsgpack.load on Python 3; delegates to
    _unpack().

    Args:
        fp: a .read()-supporting file-like object

    Returns:
        A Python object.

    Raises:
        InsufficientDataException(UnpackException):
            Insufficient data to unpack the encoded object.
        InvalidStringException(UnpackException):
            Invalid UTF-8 string encountered during unpacking.
        ReservedCodeException(UnpackException):
            Reserved code encountered during unpacking.
        UnhashableKeyException(UnpackException):
            Unhashable key encountered during map unpacking.
            The serialized map cannot be deserialized into a Python dictionary.
        DuplicateKeyException(UnpackException):
            Duplicate key encountered during map unpacking.

    Example:
    >>> f = open("test.bin", "rb")
    >>> umsgpack.unpack(f)
    {'compact': True, 'schema': 0}
    >>>
    """
    return _unpack(fp)
706 | 
707 | # For Python 2, expects a str object
def _unpackb2(s):
    """
    Deserialize MessagePack bytes into a Python object (Python 2 variant).

    Args:
        s: a 'str' containing serialized MessagePack bytes

    Returns:
        A Python object.

    Raises:
        TypeError:
            Packed data is not type 'str'.
        InsufficientDataException(UnpackException):
            Insufficient data to unpack the encoded object.
        InvalidStringException(UnpackException):
            Invalid UTF-8 string encountered during unpacking.
        ReservedCodeException(UnpackException):
            Reserved code encountered during unpacking.
        UnhashableKeyException(UnpackException):
            Unhashable key encountered during map unpacking.
            The serialized map cannot be deserialized into a Python dictionary.
        DuplicateKeyException(UnpackException):
            Duplicate key encountered during map unpacking.

    Example:
    >>> umsgpack.unpackb(b'\x82\xa7compact\xc3\xa6schema\x00')
    {u'compact': True, u'schema': 0}
    >>>
    """
    if isinstance(s, str):
        # Wrap the data in a stream so the file-based unpacker can read it
        stream = io.BytesIO(s)
        return _unpack(stream)
    raise TypeError("packed data is not type 'str'")
741 | 
742 | # For Python 3, expects a bytes object
def _unpackb3(s):
    """
    Deserialize MessagePack bytes into a Python object (Python 3 variant).

    Args:
        s: a 'bytes' containing serialized MessagePack bytes

    Returns:
        A Python object.

    Raises:
        TypeError:
            Packed data is not type 'bytes'.
        InsufficientDataException(UnpackException):
            Insufficient data to unpack the encoded object.
        InvalidStringException(UnpackException):
            Invalid UTF-8 string encountered during unpacking.
        ReservedCodeException(UnpackException):
            Reserved code encountered during unpacking.
        UnhashableKeyException(UnpackException):
            Unhashable key encountered during map unpacking.
            The serialized map cannot be deserialized into a Python dictionary.
        DuplicateKeyException(UnpackException):
            Duplicate key encountered during map unpacking.

    Example:
    >>> umsgpack.unpackb(b'\x82\xa7compact\xc3\xa6schema\x00')
    {'compact': True, 'schema': 0}
    >>>
    """
    if isinstance(s, bytes):
        # Wrap the data in a stream so the file-based unpacker can read it
        stream = io.BytesIO(s)
        return _unpack(stream)
    raise TypeError("packed data is not type 'bytes'")
776 | 
777 | ################################################################################
778 | ### Module Initialization
779 | ################################################################################
780 | 
def __init():
    """
    Initialize module state at import time.

    Resets the compatibility flag, records the platform float width, binds
    the public pack/unpack names (and their dump/load aliases) to the
    implementations for the running Python major version, and builds the
    table that maps every possible format byte to its unpack handler.
    """
    global pack, packb, unpack, unpackb
    global dump, dumps, load, loads
    global compatibility
    global _float_size
    global _unpack_dispatch_table

    # Compatibility mode for handling strings/bytes with the old specification
    compatibility = False

    # Auto-detect system float precision (a 53-bit mantissa means doubles)
    _float_size = 64 if sys.float_info.mant_dig == 53 else 32

    # Map the public names to the appropriate version
    if sys.version_info[0] == 3:
        pack = dump = _pack3
        packb = dumps = _packb3
        unpack = load = _unpack3
        unpackb = loads = _unpackb3
    else:
        pack = dump = _pack2
        packb = dumps = _packb2
        unpack = load = _unpack2
        unpackb = loads = _unpackb2

    # Dispatch table for fast lookup of the unpacking function:
    # (first format byte, last format byte, handler), ascending byte order
    code_ranges = (
        (0x00, 0x7f, _unpack_integer),   # positive fixint
        (0x80, 0x8f, _unpack_map),       # fixmap
        (0x90, 0x9f, _unpack_array),     # fixarray
        (0xa0, 0xbf, _unpack_string),    # fixstr
        (0xc0, 0xc0, _unpack_nil),       # nil
        (0xc1, 0xc1, _unpack_reserved),  # reserved
        (0xc2, 0xc3, _unpack_boolean),   # false, true
        (0xc4, 0xc6, _unpack_binary),    # bin 8/16/32
        (0xc7, 0xc9, _unpack_ext),       # ext 8/16/32
        (0xca, 0xcb, _unpack_float),     # float 32/64
        (0xcc, 0xcf, _unpack_integer),   # uint 8/16/32/64
        (0xd0, 0xd3, _unpack_integer),   # int 8/16/32/64
        (0xd4, 0xd8, _unpack_ext),       # fixext 1/2/4/8/16
        (0xd9, 0xdb, _unpack_string),    # str 8/16/32
        (0xdc, 0xdd, _unpack_array),     # array 16/32
        (0xde, 0xdf, _unpack_map),       # map 16/32
        (0xe0, 0xff, _unpack_integer),   # negative fixint
    )
    table = {}
    for first, last, handler in code_ranges:
        for value in range(first, last + 1):
            table[struct.pack("B", value)] = handler
    _unpack_dispatch_table = table

__init()
877 | 


--------------------------------------------------------------------------------
/vendor/updatelm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #### Edit paths before using
 4 | 
 5 | sqlite3 ../chatlog.db <<< 'select text from messages where text is not null and text != "" and text not like "/%" and src != 120400693;' | tee chatlog.txt | python3 ../truecaser.py -t truecase.txt
 6 | pv chatlog.txt | python3 ../truecaser.py truecase.txt | perl -p -e 's|^[^\n ]+] ||' | python3 logcutfilter.py | opencc -c t2s.json | awk '!seen[$0]++' | tee chatlogf.txt | sed 's/“//g;s/”//g;s/  / /g;s/ /\n/g' | awk '{seen[$0]++} END {for (i in seen) {if (seen[i] > 5) print i}}' > chatdict.txt
 7 | rm chatlog.txt
 8 | 
 9 | ~/software/moses/bin/lmplz -o 6 --prune 0 0 0 0 0 1 -S 50% --text chatlogf.txt --arpa chat.lm
10 | ~/software/moses/bin/build_binary trie chat.lm chat.binlm
11 | 
12 | rm chat.lm
13 | pv chatlogf.txt | pypy3 learnctx.py chatdict.txt
14 | 


--------------------------------------------------------------------------------
/vendor/zhutil.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import itertools
  4 | 
# ASCII punctuation marks, stored as one-character strings.
halfwidth = frozenset('!(),:;?')
# Code points (integers, not characters) taken from the U+FF00 fullwidth
# block: U+FF02-FF07, a list of individual symbols, U+FF10-FF19,
# U+FF20-FF3A and U+FF41-FF5A.
# NOTE(review): unlike `halfwidth`, this set holds ints, so membership
# tests need ord(ch) — confirm callers do that.
fullwidth = frozenset(itertools.chain(
    range(0xFF02, 0xFF07 + 1),
    (0xFF0A, 0xFF0B, 0xFF0E, 0xFF0F, 0xFF1C, 0xFF1D,
     0xFF1E, 0xFF3C, 0xFF3E, 0xFF3F, 0xFF40),
    range(0xFF10, 0xFF19 + 1),
    range(0xFF20, 0xFF3A + 1),
    range(0xFF41, 0xFF5A + 1)))
# Matches (as one group) either a sentence-ending mark followed by up to two
# closing quotes, or a fullwidth colon that is followed by one or two opening
# quotes or by the end of the string. The name suggests it is used to split
# text into sentences — confirm at the call site.
resentencesp = re.compile('([﹒﹔﹖﹗．；。！？]["’”」』]{0,2}|：(?=["‘“「『]{1,2}|$))')
# Matches a leading span that ends in a closing quote and is not followed by
# another quote or punctuation mark. Presumably used to repair text with a
# missing opening quote — verify against the caller.
refixmissing = re.compile(
    '(^[^"‘“「『’”」』，；。！？]+["’”」』]|^["‘“「『]?[^"‘“「『’”」』]+[，；。！？][^"‘“「『‘“「『]*["’”」』])(?!["‘“「『’”」』，；。！？])')
 16 | 
# Every character treated as punctuation: ASCII punctuation followed by
# CJK, vertical-form, small-form and fullwidth marks.
punctstr = (
    '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~¢£¥·ˇˉ―‖‘’“”•′‵、。々'
    '〈〉《》「」『』【】〔〕〖〗〝〞︰︱︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄'
    '﹏﹐﹒﹔﹕﹖﹗﹙﹚﹛﹜﹝﹞！（），．：；？［｛｜｝～､￠￡￥')

# The same characters as a set, for membership tests.
punct = frozenset(punctstr)

# ASCII whitespace plus the ideographic space U+3000.
whitespace = ' \t\n\r\x0b\x0c\u3000'

# Matches a single punctuation character and captures it, so re.split()
# keeps the punctuation marks in its result.
resplitpunct = re.compile('([%s])' % re.escape(punctstr))
 27 | 
 28 | tailpunct = ('''!),-.:;?]}¢·ˇˉ―‖’”•′■□△○●'''
 29 |              '''、。々〉》」』】〕〗〞︰︱︳︴︶︸︺︼︾﹀﹂﹄﹏'''
 30 |              '''﹐﹒﹔﹕﹖﹗﹚﹜﹞！），．：；？｜］｝～､￠''') + whitespace
 31 | headpunct = ('''([`{£¥‘“〈《「『【〔〖〝'''
 32 |              '''︵︷︹︻︽︿﹁﹃﹙﹛﹝（［｛￡￥''') + whitespace
 33 | 
# Parallel tables of opening and closing brackets/quotes: addwallzone() looks
# a token up in `openbrckt` and takes the character at the same index in
# `clozbrckt` as its expected closer, so the two strings must stay aligned.
openbrckt = ('([{（［｛⦅〚⦃“‘‹«「〈《【〔⦗『〖〘｢⟦⟨⟪⟮⟬⌈⌊⦇⦉❛❝❨❪❴❬❮❰❲'
             '⏜⎴⏞〝︵⏠﹁﹃︹︻︗︿︽﹇︷〈⦑⧼﹙﹛﹝⁽₍⦋⦍⦏⁅⸢⸤⟅⦓⦕⸦⸨｟⧘⧚⸜⸌⸂⸄⸉᚛༺༼')
clozbrckt = (')]}）］｝⦆〛⦄”’›»」〉》】〕⦘』〗〙｣⟧⟩⟫⟯⟭⌉⌋⦈⦊❜❞❩❫❵❭❯❱❳'
             '⏝⎵⏟〞︶⏡﹂﹄︺︼︘﹀︾﹈︸〉⦒⧽﹚﹜﹞⁾₎⦌⦎⦐⁆⸣⸥⟆⦔⦖⸧⸩｠⧙⧛⸝⸍⸃⸅⸊᚜༻༽')
 38 | 
 39 | ucjk = frozenset(itertools.chain(
 40 |     range(0x1100, 0x11FF + 1),
 41 |     range(0x2E80, 0xA4CF + 1),
 42 |     range(0xA840, 0xA87F + 1),
 43 |     range(0xAC00, 0xD7AF + 1),
 44 |     range(0xF900, 0xFAFF + 1),
 45 |     range(0xFE30, 0xFE4F + 1),
 46 |     range(0xFF65, 0xFFDC + 1),
 47 |     range(0xFF01, 0xFF0F + 1),
 48 |     range(0xFF1A, 0xFF20 + 1),
 49 |     range(0xFF3B, 0xFF40 + 1),
 50 |     range(0xFF5B, 0xFF60 + 1),
 51 |     range(0x20000, 0x2FFFF + 1)
 52 | ))
 53 | 
# Score tables read by calctxtstat(); they stay None until that function
# loads them from modelzhc.json / modelzhm.json on first use.
zhcmodel = None
zhmmodel = None
# Normalized directory of this module, resolved against the working directory
# at import time; calctxtstat() looks for the model files here.
_curpath = os.path.normpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))
 58 | 
 59 | RE_WS_IN_FW = re.compile(
 60 |     r'([‘’“”…─\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\ufe30-\ufe57\uff00-\uffef\U00020000-\U0002A6D6])\s+(?=[‘’“”…\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\ufe30-\ufe57\uff00-\uffef\U00020000-\U0002A6D6])')
 61 | 
 62 | RE_FW = re.compile(
 63 |     '([\u2018\u2019\u201c\u201d\u2026\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\ufe30-\ufe57\uff00-\uffef\U00020000-\U0002A6D6]+)')
 64 | 
 65 | RE_UCJK = re.compile(
 66 |     '([\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\U00020000-\U0002A6D6]+)')
 67 | 
 68 | # Detokenization function for Chinese.
 69 | detokenize = lambda s: RE_WS_IN_FW.sub(r'\1', s).strip()
 70 | 
 71 | 
 72 | def splitsentence(sentence):
 73 |     '''Split a piece of Chinese into sentences.'''
 74 |     # s = ''.join((chr(ord(ch)+0xFEE0) if ch in halfwidth else ch) for ch in sentence)
 75 |     s = sentence
 76 |     slist = []
 77 |     for i in resentencesp.split(s):
 78 |         if resentencesp.match(i) and slist:
 79 |             slist[-1] += i
 80 |         elif i:
 81 |             slist.append(i)
 82 |     return slist
 83 | 
 84 | 
 85 | def splithard(sentence, maxchar=None):
 86 |     '''Forcely split a piece of Chinese into sentences with the limit of max sentence length.'''
 87 |     slist = splitsentence(sentence)
 88 |     if maxchar is None:
 89 |         return slist
 90 |     slist1 = []
 91 |     for sent in slist:
 92 |         if len(sent) > maxchar:
 93 |             for i in resplitpunct.split(sent):
 94 |                 if resplitpunct.match(i) and slist1:
 95 |                     slist1[-1] += i
 96 |                 elif i:
 97 |                     slist1.append(i)
 98 |         else:
 99 |             slist1.append(sent)
100 |     slist = slist1
101 |     slist1 = []
102 |     for sent in slist:
103 |         if len(sent) > maxchar:
104 |             slist1.extend(sent[i:i + maxchar]
105 |                           for i in range(0, len(sent), maxchar))
106 |         else:
107 |             slist1.append(sent)
108 |     slist = slist1
109 |     return slist
110 | 
111 | 
def fixmissing(slist):
    '''Fix missing quotes.

    Each string in `slist` is split with `refixmissing`; the non-empty pieces
    of all strings are returned, in order, as one flat list.
    '''
    return [piece
            for sent in slist
            for piece in refixmissing.split(sent)
            if piece]
118 | 
119 | 
def filterlist(slist):
    '''Yield the meaningful sentences of `slist`.

    Leading `tailpunct` characters and trailing `headpunct` characters are
    stripped from every sentence; only results longer than one character are
    yielded.
    '''
    for sent in slist:
        trimmed = sent.lstrip(tailpunct)
        trimmed = trimmed.rstrip(headpunct)
        if len(trimmed) >= 2:
            yield trimmed
126 | 
127 | 
def addwalls(tokiter):
    '''Yield the tokens of `tokiter` with Moses walls around punctuation.

    Every token found in `punct` is followed by '<wall />' and is also
    preceded by one unless the previously yielded item was already a wall.
    Other tokens pass through untouched.
    '''
    after_wall = False
    for tok in tokiter:
        if tok not in punct:
            yield tok
            after_wall = False
            continue
        if not after_wall:
            yield '<wall />'
        yield tok
        yield '<wall />'
        after_wall = True
141 | 
142 | 
def addwallzone(tokiter):
    '''Add walls and zones between punctuations for Moses.

    Works like addwalls(), but additionally tracks the most recent opening
    bracket (a token found in `openbrckt`).  When its matching closer from
    `clozbrckt` shows up, the wall in front of the opener is turned into
    '<zone>' and '</zone>' is placed after the closer.  A wall at the very
    start or very end of the output is removed.

    Returns the resulting tokens as a list.
    '''
    wall = '<wall />'
    result = []
    closer = None        # closing bracket we are currently waiting for
    zone_start = None    # index in `result` of the wall before the opener
    for tok in tokiter:
        if tok not in punct:
            result.append(tok)
            continue
        # Every punctuation token is preceded by exactly one wall.
        if not result or result[-1] != wall:
            result.append(wall)
        if tok == closer:
            result[zone_start] = '<zone>'
            result.extend((tok, '</zone>'))
            closer = zone_start = None
            continue
        pos = openbrckt.find(tok)
        if pos != -1:
            # Remember the newest opener; an earlier unmatched one is forgotten.
            closer = clozbrckt[pos]
            zone_start = len(result) - 1
        result.extend((tok, wall))
    if result and result[0] == wall:
        del result[0]
    if result and result[-1] == wall:
        del result[-1]
    return result
171 | 
172 | 
def calctxtstat(s):
    '''Score a string against the two per-character models.

    The models are loaded on first use from modelzhc.json and modelzhm.json
    in this module's directory and cached in the module globals `zhcmodel`
    and `zhmmodel`.  Only characters in U+4E00..U+9FCC contribute; each adds
    the model entry at index (code point - 0x4E00) to the respective score.

    Returns a tuple (cscore, mscore) suitable for checktxttype().
    NOTE(review): 'c'/'m' presumably stand for classical/modern Chinese.
    '''
    global zhcmodel, zhmmodel
    if zhcmodel is None or zhmmodel is None:
        import json
        # Read both files before publishing either, so a failure on the second
        # one cannot leave the cache half-initialized; `with` closes the files.
        with open(os.path.join(_curpath, 'modelzhc.json'), 'r',
                  encoding='utf-8') as f:
            cmodel = json.load(f)
        with open(os.path.join(_curpath, 'modelzhm.json'), 'r',
                  encoding='utf-8') as f:
            mmodel = json.load(f)
        zhcmodel, zhmmodel = cmodel, mmodel
    cscore = 0
    mscore = 0
    for ch in s:
        ordch = ord(ch)
        if 0x4E00 <= ordch < 0x9FCD:
            cscore += zhcmodel[ordch - 0x4E00]
            mscore += zhmmodel[ordch - 0x4E00]
    return (cscore, mscore)
190 | 
191 | 
def checktxttype(cscore, mscore):
    '''Compare the two scores returned by calctxtstat().

    Returns 'c' when `cscore` is the larger one, 'm' when `mscore` is the
    larger one, and None when neither is greater than the other.
    '''
    if cscore > mscore:
        return 'c'
    if cscore < mscore:
        return 'm'
    return None
199 | 
200 | 
def num2chinese(num, big=False, simp=True, o=False, twoalt=False):
    """
    Converts numbers to Chinese representations.

    `num`   : an int, a float or a numeric string; it is processed through
              its `str()` form (optional sign, digits, optional fraction).
    `big`   : use financial characters.
    `simp`  : use simplified characters instead of traditional characters.
    `o`     : use 〇 for zero.
    `twoalt`: use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` are ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Returns the Chinese numeral as a string.  Raises ValueError when the
    magnitude is 1e48 or more, or when `str(num)` is in scientific notation.
    """
    # check num first
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError('number out of range')
    elif 'e' in nd.lower():
        raise ValueError('scientific notation is not supported')
    c_symbol = '正负点' if simp else '正負點'
    if o:  # formal
        twoalt = False
    if big:
        c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
        c_unit1 = '拾佰仟'
        c_twoalt = '贰' if simp else '貳'
    else:
        c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
        c_unit1 = '十百千'
        if twoalt:
            c_twoalt = '两' if simp else '兩'
        else:
            c_twoalt = '二'
    c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'
    zero = c_basic[0]

    def revjoin(parts):
        # `parts` is least-significant first.  Join it most-significant first
        # and squeeze every run of the zero character down to one, on the
        # joined text, so that zeros meeting across two parts also collapse
        # (100000001 -> 一亿零一, not 一亿零零一).
        joined = ''.join(reversed(parts))
        return ''.join(
            key if key == zero else ''.join(run)
            for key, run in itertools.groupby(joined))

    result = []
    if nd[0] == '+':
        result.append(c_symbol[0])
    elif nd[0] == '-':
        result.append(c_symbol[1])
    if '.' in nd:
        integer, remainder = nd.lstrip('+-').split('.')
    else:
        integer, remainder = nd.lstrip('+-'), None
    if int(integer):
        # Groups of four digits, least-significant group first.
        splitted = [integer[max(i - 4, 0):i]
                    for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(zero)
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                # Keep the placeholder zero the general path would produce
                # (100020000 -> 一亿零二万); a leading one is stripped below.
                intresult.append(zero + c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == '0':
                    if ulist:  # ???0
                        ulist.append(zero)
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == '1' and unit[1] == '0':
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == '2':
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = revjoin(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        # Placeholder zeros at either end carry no meaning.
        result.append(revjoin(intresult).strip(zero))
    else:
        result.append(zero)
    if remainder:
        # Fractional digits are read out one by one; zeros are kept as is.
        result.append(c_symbol[2])
        result.append(''.join(c_basic[int(ch)] for ch in remainder))
    return ''.join(result)
286 | 
287 | 
def stripquotes(s):
    '''Strip opening quotes from the start and closing quotes from the end.'''
    without_head = s.lstrip('"‘“「『')
    return without_head.rstrip('"’”」』')


def fw2hw(s):
    '''Replace every character whose code point is in `fullwidth` with the
    character 0xFEE0 code points below it; other characters are kept.'''
    return ''.join(
        ch if ord(ch) not in fullwidth else chr(ord(ch) - 0xFEE0)
        for ch in s)


def hw2fw(s):
    '''Replace every character in `halfwidth` with the character 0xFEE0 code
    points above it; other characters are kept.'''
    return ''.join(
        ch if ch not in halfwidth else chr(ord(ch) + 0xFEE0)
        for ch in s)
293 | 
294 | 
def _test_fixsplit():
    '''Manual smoke test: print fixmissing(splitsentence(...)) for each line
    of a built-in sample text.  Nothing is asserted.'''
    samples = """从高祖父到曾孙称为“九族”。这“九族”代表着长幼尊卑秩序和家族血统的承续关系。
《诗》、《书》、《易》、《礼》、《春秋》，再加上《乐》称“六经”，这是中国古代儒家的重要经典，应当仔细阅读。
这就是：宇宙间万事万物循环变化的道理的书籍。
《连山》、《归藏》、《周易》，是我国古代的三部书，这三部书合称“三易”，“三易”是用“卦”的形式来说明宇宙间万事万物循环变化的道理的书籍。
登楼而望，慨然而叹曰：“容容其山，旅旅其石，与地终也!吁嗟人乎!病之蚀气也，如水浸火。
吾闻老聃多寿，尝读其书曰：‘吾惟无身，是以无患。’盖欲窃之而未能也”齐宣王见孟子于雪宫。
“昔者齐景公问于晏子曰：‘吾欲观于转附、朝舞，遵海而南，放于琅邪。吾何修而可以比于先王观也？’
高祖说：“该怎样对付呢？”陈平说：“古代天子有巡察天下，召集诸侯。南方有云梦这个地方，陛下只管假装外出巡游云梦，在陈地召集诸侯。陈地在楚国的西边边境上，韩信听说天子因为爱好外出巡游，看形势必然没有什么大事，就会到国境外来拜见陛下。拜见，陛下趁机抓住他，这只是一个力士的事情而已。”“不知道。”高祖认为有道理。
。他们就是这样的。
""".strip().split('\n')
    for line in samples:
        sentences = splitsentence(line)
        print(fixmissing(sentences))
308 | 
if __name__ == '__main__':
    # Manual demo: run the sentence-splitting smoke test, then show what
    # addwallzone() produces for a sample that contains an unclosed '('.
    import sys  # only needed by the disabled line at the bottom
    _test_fixsplit()
    _demo = '《连山》、《归藏》、《周易》，是我国古代的三部书，这三部书合称“三易”，“三易”是用“卦”的形式来说明(宇宙间万事万物循环变化的道理的书籍。'
    print(' '.join(addwallzone(_demo)))
    # print(checktxttype(sys.stdin.read()))
314 | 


--------------------------------------------------------------------------------