├── .gitignore ├── LICENSE ├── README.md ├── appserve.py ├── chatdig.py ├── config.sample.json ├── digest.py ├── templates ├── digest.css ├── digest.html ├── index.html └── stat.html ├── tools └── dbselect.cgi ├── truecaser.py └── vendor ├── chinesename.py ├── common_surnames.py ├── convertbdf.py ├── figchar.py ├── learnctx.py ├── logcutfilter.py ├── lookuptable.py ├── mbox.conf ├── modelzhc.json ├── modelzhm.json ├── mosesproxy.py ├── pangu.py ├── repl.py ├── say.py ├── seccomp.py ├── simpcalc.py ├── stopwords.txt ├── umsgpack.py ├── updatelm.sh └── zhutil.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | *.build/ 60 | *.exe 61 | 62 | old/ 63 | experiment/ 64 | *.log 65 | *.db 66 | *.sh 67 | config.json 68 | cmdbot.json 69 | vendor/*.binlm 70 | vendor/*.lm 71 | vendor/chatdict.txt 72 | vendor/chatlogf.txt 73 | vendor/*.pkl 74 | vendor/namemodel.m 75 | vendor/libirc.py 76 | vendor/*.dawg 77 | vendor/pinyinlookup.py 78 | vendor/simpleime.py 79 | vendor/zhcdict.json 80 | vendor/zhconv.py 81 | vendor/bf.py 82 | vendor/lispy.py 83 | vendor/brainfuck 84 | vendor/truecase.txt 85 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Dingyuan Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tg-chatdig 2 | Dig into long and boring Telegram group chat logs. 3 | 4 | For a simpler Telegram-IRC relay bot, see [tg-irc-relay](https://github.com/gumblex/tg-irc-relay) 5 | **Deprecated**: Version 2 has been renamed and is published at [orizonhub](https://github.com/gumblex/orizonhub) 6 | 7 | ## chatdig.py 8 | 9 | Main script, handles a lot of commands. Uses a SQLite 3 database to store messages. 10 | 11 | ## tglog-import.py 12 | 13 | Executes `telegram-cli` and fetches history messages. 14 | 15 | ## digest.py 16 | 17 | Generates a daily digest from the message database. 18 | 19 | `python3 digest.py path [days=1] [update=0]` 20 | 21 | ## vendor/ 22 | 23 | Some interesting functions. 24 | 25 | ### say.py 26 | 27 | Randomly writes out sentences according to the language model. 28 | 29 | Depends on [jieba](https://github.com/fxsjy/jieba), [kenlm](https://github.com/kpu/kenlm). 30 | 31 | See `vendor/updatelm.sh` for building language models. 32 | 33 | ### seccomp.py 34 | 35 | Evaluates user input and prints out the result safely. Originally written by David Wilson. 
def setsplimits(cputime, memory):
    """Build a hook that applies resource limits to the calling process.

    The returned callable takes no arguments; elsewhere in this file it is
    handed to ``subprocess.Popen`` as ``preexec_fn``.

    `cputime` and `memory` are passed unchanged to ``resource.setrlimit``
    as the limits for ``RLIMIT_CPU`` and ``RLIMIT_RSS``.  ``RLIMIT_NPROC``
    is always set to ``(1024, 1024)``.
    """
    def _setlimits():
        # Built at call time so the RLIMIT_* constants are looked up only
        # when the hook actually runs.
        limits = (
            (resource.RLIMIT_CPU, cputime),
            (resource.RLIMIT_RSS, memory),
            (resource.RLIMIT_NPROC, (1024, 1024)),
        )
        for which, bounds in limits:
            resource.setrlimit(which, bounds)
    return _setlimits
def process(obj):
    """Run one decoded command request and package the outcome.

    `obj` is a request such as
    ``{"id": 1, "cmd": "bf", "args": [",[.,]", "asdasdf"]}``.  The handler
    is looked up in ``COMMANDS`` by ``obj['cmd']`` and called with
    ``*obj['args']``.

    Returns a dict ``{'id': ..., 'ret': ..., 'exc': ...}``: ``ret`` is the
    handler's return value (``None`` on failure) and ``exc`` is the
    formatted traceback of any exception raised while dispatching
    (``None`` on success).  ``id`` echoes ``obj['id']`` or is ``None`` when
    the request carries no usable id.
    """
    ret, exc = None, None
    try:
        ret = COMMANDS[obj['cmd']](*obj['args'])
    except Exception:
        exc = traceback.format_exc()
    # The id lookup used to sit outside any handler, so a request without
    # an 'id' (or one that is not a dict at all) raised from here and the
    # caller never got a reply.  Fall back to None instead.
    try:
        req_id = obj['id']
    except (KeyError, IndexError, TypeError):
        req_id = None
    return {'id': req_id, 'ret': ret, 'exc': exc}
def cmd_bf(expr, datain=''):
    """Run a Brainfuck program in the external interpreter and return its output.

    `expr` is the program text; every character that is not one of
    ``-[>.<]+,`` is dropped before it is written to a temporary file whose
    path is appended to ``BF_CMD``.  `datain` is fed to the interpreter's
    stdin encoded as UTF-8.

    The child runs with the limits from ``setsplimits((1, 1), (1024, 2048))``
    and is killed if it has not finished within one second.  Output is cut
    to 1000 bytes, rendered with ``unicode_escape`` (tabs and newlines kept
    literal), and cut to 1000 characters again.  An empty result yields
    ``'None or error occurred.'``.
    """
    fd, fpath = tempfile.mkstemp()
    # mkstemp() leaves deletion to the caller.  Everything that can fail
    # (writing the program, spawning the interpreter, talking to it) runs
    # inside this try so the file is removed on every path.
    try:
        with os.fdopen(fd, 'wb') as temp_bf:
            program = ''.join(c for c in expr if c in '-[>.<]+,')
            temp_bf.write(program.encode('latin_1'))
        datain = datain.encode('utf-8')
        proc = subprocess.Popen(BF_CMD + (fpath,), stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=setsplimits((1, 1), (1024, 2048)))
        try:
            result, errs = proc.communicate(datain, timeout=1)
        except Exception: # TimeoutExpired
            proc.kill()
            result, errs = proc.communicate()
        finally:
            if proc.poll() is None:
                proc.terminate()
    finally:
        os.remove(fpath)
    if len(result) > 1000:
        result = result[:1000] + b'...'
    # latin_1 maps every byte to one code point; unicode_escape then makes
    # non-printable bytes visible, after which tab/newline are restored.
    result = result.decode('latin_1').encode('unicode_escape').decode('latin_1').replace('\\t', '\t').replace('\\n', '\n')
    if len(result) > 1000:
        result = result[:1000] + '...'
    return result or 'None or error occurred.'
def cmd_wyw(tinput, lang):
    """Translate `tinput` through ``mosesproxy`` in the direction `lang`.

    The literal input ``'$name'`` returns ``mosesproxy.modelname()``
    instead of a translation.

    When `lang` is ``None`` the direction is guessed from the two scores
    returned by ``zhutil.calctxtstat``: equal scores mean no translation,
    otherwise ``zhutil.checktxttype`` decides between ``'c2m'`` (when it
    answers ``'c'``) and ``'m2c'``.  Any falsy direction returns `tinput`
    unchanged.
    """
    if tinput == '$name':
        return mosesproxy.modelname()
    direction = lang
    if direction is None:
        cscore, mscore = zhutil.calctxtstat(tinput)
        if cscore != mscore:
            kind = zhutil.checktxttype(cscore, mscore)
            direction = 'c2m' if kind == 'c' else 'm2c'
    if not direction:
        return tinput
    return mosesproxy.translate(tinput, direction, 0, 0, 0)
# Dispatch table: request "cmd" name -> handler function.
COMMANDS = collections.OrderedDict((
    ('calc', cmd_calc),
    ('py', cmd_py),
    ('bf', cmd_bf),
    ('lisp', cmd_lisp),
    ('name', cmd_name),
    ('ime', cmd_ime),
    ('fig', cmd_fig),
    ('cc', cmd_cc),
    ('wyw', cmd_wyw),
    ('cut', cmd_cut),
    ('say', cmd_say),
    ('mgw', cmd_mgw),
    ('reply', cmd_reply),
    ('cont', cmd_cont)
))

# Requests read from stdin, consumed by docommands().
MSG_Q = queue.Queue()
# Pre-generated sentences, filled by getsaying() and drained by cmd_say().
SAY_Q = queue.Queue(maxsize=50)
# Each lock serialises access to the matching long-lived child's pipes.
SAY_LCK = threading.Lock()
MGW_LCK = threading.Lock()

SAY_CMD = ('python3', 'say.py', 'chat.binlm', 'chatdict.txt', 'context.pkl')
SAY_P = subprocess.Popen(SAY_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd='vendor')
MGW_CMD = ('python3', 'say.py', 'mgw.binlm', 'mgwdict.txt')
MGW_P = subprocess.Popen(MGW_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd='vendor')

# Command lines for the per-request interpreters.
EVIL_CMD = ('python', 'seccomp.py')
BF_CMD = ('vendor/brainfuck',)
LISP_CMD = ('python', 'lispy.py')

executor = concurrent.futures.ThreadPoolExecutor(5)
cmdthr = threading.Thread(target=docommands)
cmdthr.daemon = True
cmdthr.start()

saythr = threading.Thread(target=getsaying)
saythr.daemon = True
saythr.start()

calculator = simpcalc.Calculator('ans', True)
namemodel = chinesename.NameModel('vendor/namemodel.m')
simpleime.loaddict('vendor/pyindex.dawg', 'vendor/essay.dawg')
fcgen = figchar.BlockGenerator('vendor/wqy.pkl', '🌝🌚')

# Main loop: one JSON request per stdin line, handed to the worker queue.
try:
    for ln in sys.stdin.buffer:
        upd = json.loads(ln.decode('utf-8'))
        MSG_Q.put(upd)
finally:
    # Both long-lived children are started above, so both are stopped
    # here; previously only SAY_P was terminated.  The globals are read at
    # this point because the workers may have replaced a crashed child.
    for child in (SAY_P, MGW_P):
        child.terminate()
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import re 6 | import sys 7 | import math 8 | import time 9 | import json 10 | import queue 11 | import signal 12 | import socket 13 | import random 14 | import logging 15 | import sqlite3 16 | import threading 17 | import functools 18 | import subprocess 19 | import collections 20 | import unicodedata 21 | import concurrent.futures 22 | 23 | import requests 24 | from vendor import libirc 25 | 26 | __version__ = '1.4' 27 | 28 | MEDIA_TYPES = frozenset(('audio', 'document', 'photo', 'sticker', 'video', 'voice', 'contact', 'location', 'new_chat_participant', 'left_chat_participant', 'new_chat_title', 'new_chat_photo', 'delete_chat_photo', 'group_chat_created')) 29 | EXT_MEDIA_TYPES = frozenset(('audio', 'document', 'photo', 'sticker', 'video', 'voice', 'contact', 'location', 'new_chat_participant', 'left_chat_participant', 'new_chat_title', 'new_chat_photo', 'delete_chat_photo', 'group_chat_created', '_ircuser')) 30 | 31 | loglevel = logging.DEBUG if sys.argv[-1] == '-d' else logging.INFO 32 | 33 | logging.basicConfig(stream=sys.stdout, format='# %(asctime)s [%(levelname)s] %(message)s', level=loglevel) 34 | 35 | socket.setdefaulttimeout(60) 36 | 37 | HSession = requests.Session() 38 | USERAGENT = 'TgChatDiggerBot/%s %s' % (__version__, HSession.headers["User-Agent"]) 39 | HSession.headers["User-Agent"] = USERAGENT 40 | 41 | db = sqlite3.connect('chatlog.db') 42 | conn = db.cursor() 43 | conn.execute('''CREATE TABLE IF NOT EXISTS messages ( 44 | id INTEGER PRIMARY KEY, 45 | src INTEGER, 46 | text TEXT, 47 | media TEXT, 48 | date INTEGER, 49 | fwd_src INTEGER, 50 | fwd_date INTEGER, 51 | reply_id INTEGER 52 | )''') 53 | conn.execute('''CREATE TABLE IF NOT EXISTS users ( 54 | id INTEGER PRIMARY KEY, 55 | username TEXT, 56 | first_name TEXT, 57 | last_name TEXT 58 | )''') 59 | conn.execute('CREATE TABLE IF NOT 
class LRUCache:
    """Mapping-like cache holding at most `maxlen` entries.

    Entries live in an ``OrderedDict`` ordered from least to most recently
    used.  Reading or writing a key moves it to the most-recent end; adding
    a new key while the cache is full evicts the least recently used one.
    """

    def __init__(self, maxlen):
        self.capacity = maxlen
        self.cache = collections.OrderedDict()

    def __getitem__(self, key):
        # move_to_end raises KeyError for a missing key, like the lookup.
        self.cache.move_to_end(key)
        return self.cache[key]

    def get(self, key, default=None):
        """Return the value for `key` (marking it recently used) or `default`."""
        if key not in self.cache:
            return default
        self.cache.move_to_end(key)
        return self.cache[key]

    def __setitem__(self, key, value):
        if key in self.cache:
            self.cache.move_to_end(key)
        elif len(self.cache) >= self.capacity:
            # Full and the key is new: drop the oldest entry first.
            self.cache.popitem(last=False)
        self.cache[key] = value
def getappresult():
    """Read replies from the app server forever and deliver them to chats.

    Each non-empty line on ``APP_P.stdout`` is parsed as a JSON object with
    the keys ``id``, ``ret`` and ``exc``.  A non-empty ``exc`` is logged as
    a remote error.  The ``id`` is looked up in ``APP_TASK`` (filled by
    ``runapptask``) to get the ``(chatid, replyid)`` pair; ``ret`` (or
    ``'Empty.'``) is sent there and the task entry is removed.  Unknown
    ids are logged.

    This function never returns.
    """
    global APP_P, APP_TASK
    while 1:
        try:
            raw = APP_P.stdout.readline()
        except BrokenPipeError:
            checkappproc()
            raw = APP_P.stdout.readline()
        if not raw:
            # readline() returns b'' only at EOF, i.e. the app server has
            # closed its stdout.  It does not raise BrokenPipeError here,
            # so without this branch the loop would spin on empty reads
            # and never restart the process.  Sleep briefly in case the
            # child has not been reaped yet.
            checkappproc()
            time.sleep(.2)
            continue
        result = raw.strip().decode('utf-8')
        logging.debug('Got from APP_P: ' + result)
        if not result:
            continue
        try:
            obj = json.loads(result)
        except ValueError:
            # One malformed line must not kill the reader thread.
            logging.exception('Invalid response from app server.')
            continue
        if obj['exc']:
            logging.error('Remote app server error.\n' + obj['exc'])
        sargs = APP_TASK.get(obj['id'])
        if sargs:
            sendmsg(obj['ret'] or 'Empty.', sargs[0], sargs[1])
            del APP_TASK[obj['id']]
        else:
            logging.error('Task ID %s not found.' % obj['id'])
'')) 220 | if rnmatch: 221 | src = rnmatch.group(1) or src 222 | text = "%s: %s" % (src, text) 223 | elif forward_message_id: 224 | # not async, so no sqlite3.ProgrammingError in db_* 225 | m = db_getmsg(forward_message_id) 226 | if m: 227 | text = "Fwd %s: %s" % (smartname(m[1], True), m[2]) 228 | lines = text.splitlines() 229 | if len(lines) < 3: 230 | text = ' '.join(lines) 231 | else: 232 | text = lines[0] + ' [...] ' + lines[-1] 233 | ircconn_say(CFG['ircchannel'], text) 234 | 235 | @async_func 236 | def irc_forward(msg): 237 | if not ircconn: 238 | return 239 | try: 240 | if msg['from']['id'] == CFG['ircbotid']: 241 | return 242 | checkircconn() 243 | text = msg.get('text', '') 244 | mkeys = tuple(msg.keys() & MEDIA_TYPES) 245 | if mkeys: 246 | if text: 247 | text += ' ' + servemedia(msg) 248 | else: 249 | text = servemedia(msg) 250 | if text and not text.startswith('@@@'): 251 | if 'forward_from' in msg: 252 | fwdname = '' 253 | if msg['forward_from']['id'] in (CFG['botid'], CFG['ircbotid']): 254 | rnmatch = re_ircforward.match(msg.get('text', '')) 255 | if rnmatch: 256 | fwdname = rnmatch.group(1) or rnmatch.group(3) 257 | text = rnmatch.group(2) or rnmatch.group(4) 258 | fwdname = fwdname or smartname(msg['forward_from']) 259 | text = "Fwd %s: %s" % (fwdname, text) 260 | elif 'reply_to_message' in msg: 261 | replname = '' 262 | replyu = msg['reply_to_message']['from'] 263 | if replyu['id'] in (CFG['botid'], CFG['ircbotid']): 264 | rnmatch = re_ircforward.match(msg['reply_to_message'].get('text', '')) 265 | if rnmatch: 266 | replname = rnmatch.group(1) or rnmatch.group(3) 267 | replname = replname or smartname(replyu) 268 | text = "%s: %s" % (replname, text) 269 | # ignore blank lines 270 | text = list(filter(lambda s: s.strip(), text.splitlines())) 271 | if len(text) > 3: 272 | text = text[:3] 273 | text[-1] += ' [...]' 274 | for ln in text[:3]: 275 | ircconn_say(CFG['ircchannel'], '[%s] %s' % (smartname(msg['from']), ln)) 276 | except Exception: 277 | 
def mediaformatconv(media=None, action=None):
    """Convert imported `media` / `action` JSON into this bot's media column.

    `media` and `action` are JSON strings (or falsy) as stored in the
    database being imported -- presumably a telegram-cli style dump; confirm
    against the importer.  Each is decoded and mapped onto the key names
    used by the Bot API.

    Returns ``(media_json, caption)``.  ``media_json`` is a JSON string of
    the converted dict, or ``None`` when nothing was recognised.
    ``caption`` is the photo caption if the media is a photo that has one,
    else ``None``.

    Types with no Bot API counterpart are ignored.
    """
    type_map = {
        # media
        'photo': 'photo',
        'document': 'document',
        'unsupported': 'document',
        'geo': 'location',
        'venue': 'location',
        'contact': 'contact',
        # action
        'chat_add_user': 'new_chat_participant',
        'chat_add_user_link': 'new_chat_participant',
        'chat_del_user': 'left_chat_participant',
        'chat_rename': 'new_chat_title',
        'chat_change_photo': 'new_chat_photo',
        'chat_delete_photo': 'delete_chat_photo',
        'chat_created': 'group_chat_created'
    }
    d = {}
    caption = None
    if media:
        media = json.loads(media)
    if action:
        action = json.loads(action)
    if media and 'type' in media:
        media = media.copy()
        if media['type'] == 'photo':
            # A photo need not carry a caption; indexing raised KeyError.
            caption = media.get('caption')
            d['photo'] = []
        elif media['type'] in ('document', 'unsupported'):
            d['document'] = {}
        elif 'longitude' in media:
            # 'type' may be the name of the place
            d['location'] = {
                'longitude': media['longitude'],
                'latitude': media['latitude']
            }
        elif media['type'] == 'contact':
            del media['type']
            media['phone_number'] = media.pop('phone')
            d['contact'] = media
        # ignore other undefined types to Bot API
    if action and 'type' in action:
        # Default to '' so an unmapped action type falls through every
        # branch below instead of raising AttributeError on None.
        newname = type_map.get(action['type'], '')
        if newname.endswith('chat_participant'):
            user = action['user']
            d[newname] = {
                'id': user['id'],
                'first_name': user.get('first_name', ''),
                'last_name': user.get('last_name', ''),
                'username': user.get('username', '')
            }
        elif newname == 'new_chat_title':
            d[newname] = action['title']
        elif newname == 'new_chat_photo':
            d[newname] = []
        elif newname in ('delete_chat_photo', 'group_chat_created'):
            d[newname] = True
        # ignore other undefined types to Bot API
    return json.dumps(d) if d else None, caption
def bot_api(method, **params):
    """Call Bot API `method` with `params` and return its ``result`` field.

    The request is a GET to ``URL + method`` through the shared
    ``HSession`` with a 45 second timeout; the body is decoded as UTF-8
    JSON.  If the first attempt raises anything, the function sleeps two
    seconds, replaces the session via ``change_session()`` and tries once
    more; a second failure is re-raised.

    Raises ``BotAPIFailed`` (carrying ``repr`` of the whole reply) when the
    decoded reply's ``ok`` field is falsy.
    """
    # NOTE(review): the previous loop iterated range(3) but always left on
    # its second pass, so exactly two attempts were ever made.  That count
    # is preserved here; confirm whether three were intended.
    attempt = 0
    while True:
        try:
            resp = HSession.get(URL + method, params=params, timeout=45)
            ret = json.loads(resp.content.decode('utf-8'))
        except Exception as ex:
            if attempt >= 1:
                raise ex
            time.sleep((attempt + 1) * 2)
            change_session()
            attempt += 1
            continue
        break
    if ret['ok']:
        return ret['result']
    raise BotAPIFailed(repr(ret))
def forward(message_id, chat_id, reply_to_message_id=None):
    """Forward group message `message_id` to `chat_id`.

    First tries the Bot API ``forwardMessage`` call from the configured
    group.  If the id is negative or the API call fails with
    ``BotAPIFailed``, the message is looked up with ``db_getmsg`` and
    re-sent as plain text through ``sendmsg``, replying to
    `reply_to_message_id`.  If it is not in the database either, nothing
    is sent.

    When the destination is the configured group, the forwarded message is
    queued on ``LOG_Q`` and relayed with ``irc_send``.

    Runs synchronously: it reads the database via ``db_getmsg``.
    """
    global LOG_Q
    logging.info('forwardMessage: %r' % message_id)
    # Message returned by the Bot API, if any.  It stays None on the
    # text fallback because sendmsg is the async wrapper and returns
    # nothing.
    r = None
    try:
        if message_id < 0:
            raise ValueError('Invalid message id')
        r = bot_api('forwardMessage', chat_id=chat_id, from_chat_id=-CFG['groupid'], message_id=message_id)
        logging.debug('Forwarded: %s' % message_id)
    except (ValueError, BotAPIFailed):
        m = db_getmsg(message_id)
        if not m:
            # Nothing was forwarded.  This path used to continue with
            # `r` unbound and raise UnboundLocalError for the group chat.
            logging.debug('Message to forward not found: %s' % message_id)
            return
        # presumably m[4] is the message date, m[1] the sender id and
        # m[2] the text -- verify against db_getmsg
        stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(m[4] + CFG['timezone'] * 3600))
        sendmsg('[%s] %s: %s' % (stamp, db_getufname(m[1]), m[2]), chat_id, reply_to_message_id)
        logging.debug('Manually forwarded: %s' % message_id)
    if chat_id == -CFG['groupid']:
        if r is not None:
            # Only real message objects are queued; None used to be put
            # on LOG_Q on the fallback path.
            LOG_Q.put(r)
        irc_send(forward_message_id=message_id)
def uniq(seq): # Dave Kirby
    """Return the items of `seq` as a list without duplicates.

    The first occurrence of each item is kept and the original order is
    preserved.  Items must be hashable.
    """
    seen = set()
    kept = []
    for item in seq:
        if item in seen:
            continue
        seen.add(item)
        kept.append(item)
    return kept
def command(text, chatid, replyid, msg):
    '''
    Parse a message text and dispatch it to the matching COMMANDS handler.

    A text starting with "/" or "'" is treated as a command; an optional
    "@botname" suffix on the command word is ignored.  Commands outside the
    PUBLIC set only run in private chats (chatid > 0) and in the main group.
    Any other text received in a private chat is handed to the "reply"
    handler.  Handler exceptions are logged, never propagated.
    '''
    try:
        text = text.strip()
        if not text:
            # str.split(' ') never returns an empty list, so the old
            # "if not t" guard could not fire and an empty text ended in an
            # IndexError.
            return
        if text[0] in "/'":
            # Split on any whitespace so "/cmd\narg" is recognised as well.
            parts = text.split(None, 1)
            # The command word is lowercased, so the suffix must be lowercased
            # too, otherwise a mixed-case bot name is never stripped.
            cmd = parts[0][1:].lower().replace('@' + CFG['botname'].lower(), '')
            if cmd in COMMANDS:
                if chatid > 0 or chatid == -CFG['groupid'] or cmd in PUBLIC:
                    expr = parts[1].strip() if len(parts) > 1 else ''
                    logging.info('Command: /%s %s' % (cmd, expr[:20]))
                    COMMANDS[cmd](expr, chatid, replyid, msg)
                elif chatid < 0 and chatid != -CFG['groupid'] and cmd not in PUBLIC:
                    sendmsg('This command is not available for this group. Send /help for available commands.', chatid, replyid)
            elif chatid > 0:
                sendmsg('Invalid command. Send /help for help.', chatid, replyid)
        elif chatid > 0:
            logging.info('Reply: ' + text[:20])
            COMMANDS['reply'](text, chatid, replyid, msg)
    except Exception:
        logging.exception('Excute command failed.')
def timestring_a(seconds):
    '''
    Format a duration given in seconds as H:MM:SS.

    Hours are not wrapped at 24, so long durations simply show more hours.
    '''
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return '%d:%02d:%02d' % (hours, minutes, secs)
700 | ''' 701 | keys = tuple(msg.keys() & MEDIA_TYPES) 702 | if not keys: 703 | return '' 704 | ret = '<%s>' % keys[0] 705 | if 'photo' in msg: 706 | servemode = CFG.get('servemedia') 707 | if servemode: 708 | fname, code = cachemedia(msg) 709 | if servemode == 'self': 710 | ret += ' %s%s' % (CFG['serveurl'], fname) 711 | elif servemode == 'vim-cn': 712 | r = requests.post('http://img.vim-cn.com/', files={'name': open(os.path.join(CFG['cachepath'], fname), 'rb')}) 713 | ret += ' ' + r.text 714 | elif 'sticker' in msg: 715 | if CFG.get('servemedia') == 'self': 716 | fname, code = cachemedia(msg) 717 | ret += ' %s%s' % (CFG['serveurl'], fname) 718 | if msg['sticker'].get('emoji'): 719 | ret = msg['sticker']['emoji'] + ' ' + ret 720 | elif 'document' in msg: 721 | ret += ' %s' % (msg['document'].get('file_name', '')) 722 | if CFG.get('servemedia') == 'self' and msg['document'].get('file_size', 0) <= CFG.get('servemaxsize', 1048576): 723 | fname, code = cachemedia(msg) 724 | ret += ' %s%s' % (CFG['serveurl'], fname) 725 | elif 'video' in msg: 726 | ret += ' ' + timestring_a(msg['video'].get('duration', 0)) 727 | if CFG.get('servemedia') == 'self' and msg['video'].get('file_size', 0) <= CFG.get('servemaxsize', 1048576): 728 | fname, code = cachemedia(msg) 729 | ret += ' %s%s' % (CFG['serveurl'], fname) 730 | elif 'voice' in msg: 731 | ret += ' ' + timestring_a(msg['voice'].get('duration', 0)) 732 | if CFG.get('servemedia') == 'self' and msg['voice'].get('file_size', 0) <= CFG.get('servemaxsize', 1048576): 733 | fname, code = cachemedia(msg) 734 | ret += ' %s%s' % (CFG['serveurl'], fname) 735 | elif 'new_chat_title' in msg: 736 | ret += ' ' + msg['new_chat_title'] 737 | return ret 738 | 739 | def autoclose(msg): 740 | openbrckt = ('([{([{⦅〚⦃“‘‹«「〈《【〔⦗『〖〘「⟦⟨⟪⟮⟬⌈⌊⦇⦉❛❝❨❪❴❬❮❰❲' 741 | '⏜⎴⏞〝︵⏠﹁﹃︹︻︗︿︽﹇︷〈⦑⧼﹙﹛﹝⁽₍⦋⦍⦏⁅⸢⸤⟅⦓⦕⸦⸨⦅⧘⧚⸜⸌⸂⸄⸉᚛༺༼') 742 | clozbrckt = (')]})]}⦆〛⦄”’›»」〉》】〕⦘』〗〙」⟧⟩⟫⟯⟭⌉⌋⦈⦊❜❞❩❫❵❭❯❱❳' 743 | '⏝⎵⏟〞︶⏡﹂﹄︺︼︘﹀︾﹈︸〉⦒⧽﹚﹜﹞⁾₎⦌⦎⦐⁆⸣⸥⟆⦔⦖⸧⸩⦆⧙⧛⸝⸍⸃⸅⸊᚜༻༽') 744 | stack = [] 745 | for 
def db_getufname(uid):
    '''
    Return the display name ("first last") of the user with id uid.

    db_getuser yields (None, None, None) for users that are not stored, and
    either name part can be missing.  Missing parts are skipped and a
    placeholder string is returned when nothing is known, so callers can
    always format or join the result as text.
    '''
    first, last = db_getuser(uid)[1:]
    name = ' '.join(part for part in (first, last) if part)
    return name or '<Unknown>'
@functools.lru_cache(maxsize=10)
def db_getuidbyname(username):
    '''
    Resolve a username (without the leading "@") to a numeric user id.

    "#<digits>" is taken as a literal user id.  Anything else is looked up in
    the users table, ignoring ASCII case.  Returns None when the id is not a
    number or no user matches.
    '''
    # NOTE(review): results, including misses, are memoised by lru_cache, so
    # a user who is stored after a failed lookup may stay "not found" until
    # the entry is evicted.
    if username.startswith('#'):
        try:
            return int(username[1:])
        except ValueError:
            return None
    # Compare with "=" under NOCASE instead of LIKE: "_" and "%" are LIKE
    # wildcards and "_" is common in usernames, so LIKE could match a
    # different user.
    row = conn.execute('SELECT id FROM users WHERE username = ? COLLATE NOCASE', (username,)).fetchone()
    if row:
        return row[0]
    return None
def ellipsisresult(s, find, maxctx=50):
    '''
    Return the part of s around the first case-insensitive match of find.

    Up to maxctx characters are kept before the match start and at least the
    whole match (or maxctx characters, whichever is longer) from the match
    start on.  An ellipsis is added only on a side where text was actually
    cut.  s is returned unchanged when find is empty or does not occur.
    '''
    if not find:
        return s
    pos = s.lower().find(find.lower())
    if pos < 0:
        return s
    start = max(0, pos - maxctx)
    # Never cut inside the match itself, even if it is longer than maxctx.
    end = min(len(s), pos + max(maxctx, len(find)))
    r = s[start:end].strip()
    if start > 0:
        r = '… ' + r
    if end < len(s):
        r += ' …'
    return r
max(number)=20''' 894 | username, uid, limit, offset = None, None, 5, 0 895 | if expr: 896 | expr = expr.split(' ') 897 | if len(expr) > 1: 898 | ma = re_search_number.match(expr[-1]) 899 | if ma: 900 | expr = expr[:-1] 901 | limit = max(min(int(ma.group(1)), 20), 1) 902 | offset = int(ma.group(2)[1:]) if ma.group(2) else 0 903 | if expr[0][0] == '@': 904 | username = expr[0][1:] 905 | keyword = ' '.join(expr[1:]) 906 | else: 907 | keyword = ' '.join(expr) 908 | else: 909 | keyword = '' 910 | if username: 911 | uid = db_getuidbyname(username) 912 | typing(chatid) 913 | if uid is None: 914 | keyword = ' '.join(expr) 915 | sqr = conn.execute("SELECT id, src, text, date FROM messages WHERE text LIKE ? ORDER BY date DESC LIMIT ? OFFSET ?", ('%' + keyword + '%', limit, offset)).fetchall() 916 | else: 917 | sqr = conn.execute("SELECT id, src, text, date FROM messages WHERE src = ? AND text LIKE ? ORDER BY date DESC LIMIT ? OFFSET ?", (uid, '%' + keyword + '%', limit, offset)).fetchall() 918 | result = [] 919 | for mid, fr, text, date in sqr: 920 | text = ellipsisresult(text, keyword) 921 | if len(text) > 100: 922 | text = text[:100] + '…' 923 | if uid: 924 | result.append('[%d|%s] %s' % (mid, time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(date + CFG['timezone'] * 3600)), text)) 925 | else: 926 | result.append('[%d|%s] %s: %s' % (mid, time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(date + CFG['timezone'] * 3600)), db_getufname(fr), text)) 927 | sendmsg('\n'.join(result) or 'Found nothing.', chatid, replyid) 928 | 929 | def cmd_mention(expr, chatid, replyid, msg): 930 | '''/mention Show last mention of you.''' 931 | if msg['chat']['id'] != -CFG['groupid']: 932 | sendmsg("This command can't be used in this chat.", chatid, replyid) 933 | return 934 | uid = msg['from']['id'] 935 | user = db_getuser(uid) 936 | if user[0]: 937 | res = conn.execute("SELECT * FROM messages WHERE (text LIKE ? OR reply_id IN (SELECT id FROM messages WHERE src = ?)) AND src != ? 
def timestring(minutes):
    '''
    Render a number of minutes as a Chinese "days hours minutes" phrase.

    Each non-zero unit is emitted with a leading space; zero units are left
    out, so 0 minutes gives an empty string.
    '''
    total_hours, mins = divmod(minutes, 60)
    days, hours = divmod(total_hours, 24)
    pieces = []
    for amount, template in ((days, ' %d 天'), (hours, ' %d 小时'), (mins, ' %d 分钟')):
        if amount:
            pieces.append(template % amount)
    return ''.join(pieces)
| if r: 994 | ctr = collections.Counter(i[0] for i in r) 995 | if uid in ctr: 996 | rank = sorted(ctr, key=ctr.__getitem__, reverse=True).index(uid) + 1 997 | result.append('在最近%s内发了 %s 条消息,占 %.2f%%,位列第 %s。' % (timestr, ctr[uid], ctr[uid]/len(r)*100, rank)) 998 | else: 999 | result.append('在最近%s内没发消息。' % timestr) 1000 | else: 1001 | result.append('在最近%s内没发消息。' % timestr) 1002 | sendmsg('\n'.join(result), chatid, replyid) 1003 | 1004 | def cmd_stat(expr, chatid, replyid, msg): 1005 | '''/stat [minutes=1440] Show statistics.''' 1006 | try: 1007 | minutes = min(max(int(expr), 1), 3359733) 1008 | except Exception: 1009 | minutes = 1440 1010 | r = conn.execute('SELECT src FROM messages WHERE date > ?', (time.time() - minutes * 60,)).fetchall() 1011 | timestr = timestring(minutes) 1012 | if not r: 1013 | sendmsg('在最近%s内无消息。' % timestr, chatid, replyid) 1014 | return 1015 | ctr = collections.Counter(i[0] for i in r) 1016 | mcomm = ctr.most_common(5) 1017 | count = len(r) 1018 | msg = ['在最近%s内有 %s 条消息,平均每分钟 %.2f 条。' % (timestr, count, count/minutes)] 1019 | msg.extend('%s: %s 条,%.2f%%' % (db_getufname(k), v, v/count*100) for k, v in mcomm) 1020 | msg.append('其他用户 %s 条,人均 %.2f 条' % (count - sum(v for k, v in mcomm), count / len(ctr))) 1021 | sendmsg('\n'.join(msg), chatid, replyid) 1022 | 1023 | def cmd_digest(expr, chatid, replyid, msg): 1024 | sendmsg('Not implemented.', chatid, replyid) 1025 | 1026 | def cmd_calc(expr, chatid, replyid, msg): 1027 | '''/calc Calculate .''' 1028 | if expr: 1029 | runapptask('calc', (expr,), (chatid, replyid)) 1030 | else: 1031 | sendmsg('Syntax error. Usage: ' + cmd_calc.__doc__, chatid, replyid) 1032 | 1033 | def cmd_py(expr, chatid, replyid, msg): 1034 | '''/py Evaluate Python 2 expression .''' 1035 | if expr: 1036 | if len(expr) > 1000: 1037 | sendmsg('Expression too long.', chatid, replyid) 1038 | else: 1039 | runapptask('py', (expr,), (chatid, replyid)) 1040 | else: 1041 | sendmsg('Syntax error. 
def cmd_cc(expr, chatid, replyid, msg):
    '''/cc  Simplified-Traditional Chinese conversion.'''
    # The docstring above doubles as the usage text sent to users.
    # Input is the command argument, or else the text of the replied-to
    # message.
    tinput = expr.strip()
    if not tinput and 'reply_to_message' in msg:
        tinput = msg['reply_to_message'].get('text', '').strip()
    if not tinput:
        # Same handling as cmd_ime / cmd_cut / cmd_wyw: with nothing to
        # convert, answer with the usage line instead of submitting an empty
        # task.
        sendmsg('Syntax error. Usage: ' + cmd_cc.__doc__, chatid, replyid)
        return
    runapptask('cc', (tinput,), (chatid, replyid))
def cmd_cut(expr, chatid, replyid, msg):
    '''/cut [c|m]  Segment .'''
    # The docstring above doubles as the usage text sent to users.
    # An optional leading "c" or "m" word selects the language and is
    # removed from the text to segment.
    selector = expr[:2].strip()
    if selector in ('c', 'm'):
        lang = selector
        expr = expr[2:]
    else:
        lang = None
    # Fall back to the replied-to message when no text was given inline.
    if 'reply_to_message' in msg:
        replied = msg['reply_to_message'].get('text', '')
    else:
        replied = ''
    tinput = (expr or replied).strip()
    if len(tinput) > 1000:
        tinput = tinput[:1000] + '……'
    if tinput:
        runapptask('cut', (tinput, lang), (chatid, replyid))
    else:
        sendmsg('Syntax error. Usage: ' + cmd_cut.__doc__, chatid, replyid)
def cmd_cont(expr, chatid, replyid, msg):
    '''/cont [sentence] Complete the sentence.'''
    # The docstring above doubles as the usage text sent to users.
    # Forwarded messages in group chats are ignored.
    if 'forward_from' in msg and msg['chat']['id'] < 0:
        return
    typing(chatid)
    # Source text, in order of preference: the command argument, the
    # replied-to message, the most recent logged message.
    text = expr.strip()
    if not text and 'reply_to_message' in msg:
        text = msg['reply_to_message'].get('text', '')
    if not text:
        row = conn.execute("SELECT text FROM messages ORDER BY date DESC LIMIT 1").fetchone()
        # fetchone() returns None on an empty table; indexing it directly
        # raised TypeError.  A NULL text column is treated as empty too.
        text = (row[0] if row else '') or ''
    runapptask('cont', (text.replace('\n', ' '),), (chatid, replyid))
def _set_forwarding(key, label, expr, chatid, replyid):
    '''
    Switch the CFG[key] forwarding flag and report the new state.

    "on" / "off" set the flag explicitly; any other argument toggles it.
    '''
    choice = expr.strip().lower()
    if choice == 'on':
        enabled = True
    elif choice == 'off':
        enabled = False
    else:
        enabled = not CFG.get(key)
    CFG[key] = enabled
    sendmsg('%s forwarding %s.' % (label, 'enabled' if enabled else 'disabled'), chatid, replyid)

def cmd_t2i(expr, chatid, replyid, msg):
    # Telegram -> IRC forwarding switch, main group only.  Previously
    # "/t2i on" while forwarding was already on fell into the "off" branch
    # and disabled it.
    if msg['chat']['id'] == -CFG['groupid']:
        _set_forwarding('t2i', 'Telegram to IRC', expr, chatid, replyid)

def cmd_i2t(expr, chatid, replyid, msg):
    # IRC -> Telegram forwarding switch, main group only; same semantics as
    # cmd_t2i.
    if msg['chat']['id'] == -CFG['groupid']:
        _set_forwarding('i2t', 'IRC to Telegram', expr, chatid, replyid)
def cmd__cmd(expr, chatid, replyid, msg):
    # Maintenance command, accepted in private chats only.  It has no
    # docstring on purpose so it does not show up in the /help listing.
    #   killserver  restart the app server subprocess
    #   commit      flush queued log entries and commit the database
    # NOTE(review): no sender check is visible here, so any private chat can
    # trigger these actions - confirm that this is intended.
    global APP_P
    if chatid < 0:
        return
    if expr == 'killserver':
        old = APP_P
        old.terminate()
        # Reap the old child so it does not linger as a zombie; escalate to
        # kill() if it ignores the termination request.
        try:
            old.wait(timeout=5)
        except subprocess.TimeoutExpired:
            old.kill()
            old.wait()
        APP_P = subprocess.Popen(APP_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        checkappproc()
        sendmsg('Server restarted.', chatid, replyid)
        logging.info('Server restarted upon user request.')
    elif expr == 'commit':
        while 1:
            try:
                logmsg(LOG_Q.get_nowait())
            except queue.Empty:
                break
        db.commit()
        sendmsg('DB committed.', chatid, replyid)
        logging.info('DB committed upon user request.')
def facescore(x, y):
    '''
    Score x out of y on a 0-100 scale using the complementary error function.

    The score is 50 when x equals y / 2 and grows with x.
    '''
    centre = 0.5*y
    spread = 0.5*y**0.5
    z = (centre-x)/(2**0.5*spread)
    return 1/2*math.erfc(z)*100

# facescore for every count 0..100 out of 100, indexed by count.
fstable = [facescore(count, 100) for count in range(101)]

def revface(x):
    '''
    Return the index into fstable whose score is closest to x.

    Ties are resolved in favour of the lowest index.
    '''
    return min(range(len(fstable)), key=lambda k: (abs(x-fstable[k]), k))
def cmd_help(expr, chatid, replyid, msg):
    '''/help  Show usage.'''
    # The docstring above doubles as the usage text sent to users.
    if expr:
        # Accept the forms users actually type ("/search", "Search"): strip
        # the command prefix characters and lowercase, as command() does.
        name = expr.strip().lstrip("/'").lower()
        handler = COMMANDS.get(name)
        if handler is None:
            sendmsg('Command not found.', chatid, replyid)
        elif handler.__doc__:
            sendmsg(handler.__doc__, chatid, replyid)
        else:
            sendmsg('Help is not available for ' + expr, chatid, replyid)
    elif chatid == -CFG['groupid']:
        sendmsg('Full help disabled in this group.', chatid, replyid)
    elif chatid > 0:
        # Private chat: list every documented command once (aliases share a
        # docstring, hence uniq).
        sendmsg('\n'.join(uniq(cmd.__doc__ for cmd in COMMANDS.values() if cmd.__doc__)), chatid, replyid)
    else:
        # Other groups only get the commands that are allowed there.
        sendmsg('\n'.join(uniq(cmd.__doc__ for cmdname, cmd in COMMANDS.items() if cmd.__doc__ and cmdname in PUBLIC)), chatid, replyid)
# Commands that may be used in any group chat, not only in private chats and
# the main group.  'cont' and 'echo' are intentionally left out.
PUBLIC = {
    'user',
    'calc',
    'py',
    'bf',
    'lisp',
    'name',
    'ime',
    'fig',
    'cc',
    'wyw',
    'cut',
    'say',
    'reply',
    'do',
    '233',
    'start',
    'cancel',
    'help',
}
# Main loop: handle queued messages until the process is interrupted, then
# persist state.  A failure while handling one message is logged and the loop
# carries on.
try:
    while 1:
        try:
            processmsg()
        except Exception:
            logging.exception('Failed to process a message.')
finally:
    # Write out whatever is still waiting in the log queue.  A failure here
    # must not prevent the offsets and the config below from being saved.
    try:
        while 1:
            try:
                logmsg(LOG_Q.get_nowait())
            except queue.Empty:
                break
    except Exception:
        logging.exception('Failed to flush the log queue on shutdown.')
    conn.execute('REPLACE INTO config (id, val) VALUES (0, ?)', (OFFSET,))
    conn.execute('REPLACE INTO config (id, val) VALUES (1, ?)', (IRCOFFSET,))
    # Close the file explicitly so the config is fully written before exit.
    with open('config.json', 'w') as cfgfile:
        json.dump(CFG, cfgfile, sort_keys=True, indent=4)
    db.commit()
    APP_P.terminate()
    logging.info('Shut down cleanly.')
| re_url = re.compile(r"(^|[\s.:;?\-\]<\(])(https?://[-\w;/?:@&=+$\|\_.!~*\|'()\[\]%#,]+[\w/#](\(\))?)(?=$|[\s',\|\(\).:;?\-\[\]>\)])") 40 | re_ircaction = re.compile('^\x01ACTION (.*)\x01$') 41 | _ig1 = operator.itemgetter(1) 42 | 43 | MEDIA_TYPES = { 44 | 'text': '文本', 45 | 'audio': '声音', 46 | 'document': '文件', 47 | 'photo': '图片', 48 | 'sticker': '贴纸', 49 | 'video': '视频', 50 | 'voice': '语音', 51 | 'contact': '名片', 52 | 'location': '位置', 53 | 'service': '服务' 54 | } 55 | 56 | SERVICE = frozenset(('new_chat_participant', 'left_chat_participant', 'new_chat_title', 'new_chat_photo', 'delete_chat_photo', 'group_chat_created')) 57 | 58 | def daystart(sec=None): 59 | if not sec: 60 | sec = time.time() 61 | return int((sec + TIMEZONE) // 86400 * 86400 - TIMEZONE) 62 | 63 | def uniq(seq, key=None): # Dave Kirby 64 | # Order preserving 65 | seen = set() 66 | if key: 67 | return [x for x in seq if key(x) not in seen and not seen.add(key(x))] 68 | else: 69 | return [x for x in seq if x not in seen and not seen.add(x)] 70 | 71 | def db_getuser(uid): 72 | r = USER_CACHE.get(uid) 73 | if r is None: 74 | r = conn.execute('SELECT username, first_name, last_name FROM users WHERE id = ?', (uid,)).fetchone() or (None, None, None) 75 | USER_CACHE[uid] = r 76 | return r 77 | 78 | def db_isbot(uid): 79 | return (db_getuser(uid)[0] or '').lower().endswith('bot') 80 | 81 | def db_getufname(uid, mmedia=None): 82 | if uid == CFG['ircbotid']: 83 | if mmedia and '_ircuser' in mmedia: 84 | return mmedia['_ircuser'] 85 | else: 86 | return '' 87 | else: 88 | name, last = db_getuser(uid)[1:] 89 | if last: 90 | name += ' ' + last 91 | return name or '<未知>' 92 | 93 | def db_getfirstname(uid, mmedia=None): 94 | if uid == CFG['ircbotid']: 95 | if mmedia and '_ircuser' in mmedia: 96 | return mmedia['_ircuser'] 97 | else: 98 | return '' 99 | else: 100 | fn = db_getufname(uid) 101 | return fn.split()[0] 102 | 103 | def strftime(fmt, t=None): 104 | if t is None: 105 | t = time.time() 106 | t += TIMEZONE 
107 | return time.strftime(fmt, time.gmtime(t)) 108 | 109 | def getwday(t=None): 110 | if t is None: 111 | t = time.time() 112 | t += TIMEZONE 113 | return ('周一','周二','周三','周四','周五','周六','周日')[time.gmtime(t)[6]] 114 | 115 | def stripreaction(text): 116 | act = re_ircaction.match(text) 117 | if act: 118 | return act.group(1) 119 | else: 120 | return text 121 | 122 | class DirectWeightedGraph: 123 | d = 0.85 124 | 125 | def __init__(self): 126 | self.graph = collections.defaultdict(list) 127 | 128 | def add_edge(self, start, end, weight): 129 | self.graph[start].append((end, weight)) 130 | 131 | def rank(self): 132 | ws = collections.defaultdict(float) 133 | outSum = collections.defaultdict(float) 134 | 135 | wsdef = 1.0 / (len(self.graph) or 1.0) 136 | for n, out in self.graph.items(): 137 | ws[n] = wsdef 138 | outSum[n] = sum((e[1] for e in out), 0.0) 139 | 140 | # this line for build stable iteration 141 | sorted_keys = sorted(self.graph.keys()) 142 | for x in range(10): # 10 iters 143 | for n in sorted_keys: 144 | s = 0 145 | for e in self.graph[n]: 146 | if outSum[e[0]] and ws[e[0]]: 147 | s += e[1] / outSum[e[0]] * ws[e[0]] 148 | ws[n] = (1 - self.d) + self.d * s 149 | 150 | (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3]) 151 | 152 | for w in ws.values(): 153 | if w < min_rank: 154 | min_rank = w 155 | elif w > max_rank: 156 | max_rank = w 157 | 158 | for n, w in ws.items(): 159 | # to unify the weights, don't *100. 
160 | ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) 161 | 162 | return ws 163 | 164 | 165 | class DigestComposer: 166 | 167 | def __init__(self, date): 168 | self.template = 'digest.html' 169 | self.date = date 170 | self.title = '' 171 | self.tc = truecaser.Truecaser(truecaser.loaddict(open('vendor/truecase.txt', 'rb'))) 172 | self.stopwords = frozenset(map(str.strip, open('vendor/stopwords.txt', 'r', encoding='utf-8'))) 173 | self.ircbots = re.compile(r'(titlbot|varia|Akarin).*') 174 | self.fetchmsg(date) 175 | self.msgindex() 176 | 177 | def fetchmsg(self, date): 178 | ''' 179 | Fetch messages that best fits in a day. 180 | ''' 181 | start = (daystart(date) + CUTWINDOW[0], daystart(date) + CUTWINDOW[1]) 182 | end = (daystart(date) + 86400 + 183 | CUTWINDOW[0], daystart(date) + 86400 + CUTWINDOW[1]) 184 | last, lastid = start[0], 0 185 | msgs = collections.OrderedDict() 186 | intervals = ([], []) 187 | for mid, src, text, date, fwd_src, fwd_date, reply_id, media in conn.execute('SELECT id, src, text, date, fwd_src, fwd_date, reply_id, media FROM messages WHERE date >= ? AND date < ? 
ORDER BY date ASC, id ASC', (start[0], end[1])): 188 | msgs[mid] = (src, text or '', date, fwd_src, fwd_date, reply_id, media) 189 | if start[0] <= date < start[1]: 190 | intervals[0].append((date - last, mid)) 191 | elif last < start[1] <= date: 192 | intervals[0].append((date - last, mid)) 193 | elif end[0] <= date < end[1]: 194 | if last < end[0]: 195 | last = end[0] 196 | intervals[1].append((date - last, lastid)) 197 | last = date 198 | lastid = mid 199 | intervals[1].append((end[1] - last, lastid)) 200 | if not msgs: 201 | raise ValueError('Not enough messages in (%s, %s)' % (start[0], end[1])) 202 | self.start = startd = msgs[max(intervals[0] or ((0, tuple(msgs.keys())[0]),))[1]][2] 203 | self.end = endd = msgs[max(intervals[1] or ((0, tuple(msgs.keys())[-1]),))[1]][2] 204 | self.msgs = collections.OrderedDict( 205 | filter(lambda x: startd <= x[1][2] <= endd, msgs.items())) 206 | 207 | def msgpreprocess(self, text): 208 | at = False 209 | for t in jieba.cut(text, HMM=False): 210 | if t == '@': 211 | at = True 212 | elif at: 213 | yield '@' + t 214 | at = False 215 | elif t.lower() not in self.stopwords: 216 | # t.isidentifier() and 217 | yield t 218 | 219 | def msgindex(self): 220 | self.fwd_lookup = {} 221 | self.words = collections.Counter() 222 | self.msgtok = {} 223 | for mid, value in self.msgs.items(): 224 | src, text, date, fwd_src, fwd_date, reply_id, media = value 225 | self.fwd_lookup[(src, date)] = mid 226 | tok = self.msgtok[mid] = tuple(self.msgpreprocess(zhconv.convert(self.tc.truecase(re_url.sub('', stripreaction(text))), 'zh-hans'))) 227 | for w in frozenset(t.lower() for t in tok): 228 | self.words[w] += 1 229 | self.words = dict(self.words) 230 | 231 | def chunker(self): 232 | results = [] 233 | chunk = [] 234 | last = 0 235 | for mid, value in self.msgs.items(): 236 | src, text, date, fwd_src, fwd_date, reply_id, media = value 237 | if date - last > CHUNKINTERV and chunk: 238 | results.append(chunk) 239 | chunk = [] 240 | last = date 241 
| chunk.append(mid) 242 | if chunk: 243 | results.append(chunk) 244 | return sorted(results, key=len, reverse=True) 245 | 246 | def tfidf(self, term, text): 247 | return text.count(term) / len(text) * math.log(len(self.msgs) / self.words.get(term, 1)) 248 | 249 | def tfidf_kwd(self, toks, topK=15): 250 | toks = tuple(filter(lambda x: len(x) > 1, toks)) 251 | toklen = len(toks) 252 | msglen = len(self.msgs) 253 | return tuple(map(_ig1, sorted((-count / toklen * math.log(msglen / self.words.get(term, 1)), term) for term, count in collections.Counter(toks).items())))[:topK] 254 | 255 | def tr_kwd(self, toks, topK=15): 256 | return jieba.analyse.textrank(' '.join(toks), topK, False, ('n', 'ns', 'nr', 'vn', 'v', 'eng')) 257 | 258 | def cosinesimilarity(self, a, b): 259 | msga = self.msgtok[a] 260 | msgb = self.msgtok[b] 261 | vcta = {w:self.tfidf(w.lower(), msga) for w in frozenset(msga)} 262 | vctb = {w:self.tfidf(w.lower(), msgb) for w in frozenset(msgb)} 263 | keys = vcta.keys() & vctb.keys() 264 | ma = sum(i**2 for i in vcta.values())**.5 265 | mb = sum(i**2 for i in vctb.values())**.5 266 | return (sum(vcta[i]*vctb[i] for i in keys) / 267 | ma / mb) if (ma and mb) else 0 268 | 269 | def classify(self, mid): 270 | ''' 271 | 0 - Normal messages sent by users 272 | 1 - Interesting messages sent by the bots 273 | 2 - Boring messages sent by users 274 | 3 - Boring messages sent by the bots 275 | ''' 276 | src, text, date, fwd_src, fwd_date, reply_id, media = self.msgs[mid] 277 | if src == CFG['botid']: 278 | repl = self.msgs.get(reply_id) 279 | if repl and (repl[1].startswith('/say') or repl[1].startswith('/reply')): 280 | return 1 281 | else: 282 | return 3 283 | elif src == CFG['ircbotid']: 284 | mmedia = json.loads(media or '{}') 285 | if self.ircbots.match(mmedia.get('_ircuser', '')): 286 | return 3 287 | else: 288 | return 0 289 | elif db_isbot(fwd_src) and len(text or '') > 75: 290 | return 3 291 | elif not text or text.startswith('/'): 292 | return 2 293 | else: 
294 | return 0 295 | 296 | def hotrank(self, chunk): 297 | graph = DirectWeightedGraph() 298 | edges = {} 299 | similarity = self.cosinesimilarity 300 | for mid in chunk: 301 | src, text, date, fwd_src, fwd_date, reply_id, media = self.msgs[mid] 302 | if self.classify(mid) > 1: 303 | continue 304 | backlink = self.fwd_lookup.get((fwd_src, fwd_date)) or reply_id 305 | if (backlink in self.msgs and (mid, backlink) not in edges): 306 | edges[(mid, backlink)] = similarity(mid, backlink) 307 | for mid2, value2 in self.msgs.items(): 308 | if 0 < date - value2[2] < LINKWINDOW: 309 | w = edges.get((mid, mid2)) or edges.get((mid2, mid)) or similarity(mid, mid2) 310 | edges[(mid, mid2)] = w 311 | edges[(mid2, mid)] = w 312 | for key, weight in edges.items(): 313 | if weight: 314 | graph.add_edge(key[0], key[1], weight) 315 | del edges 316 | return sorted(graph.rank().items(), key=_ig1, reverse=True) 317 | 318 | def hotchunk(self): 319 | for chunk in self.chunker()[:5]: 320 | kwds = self.tfidf_kwd(itertools.chain.from_iterable(self.msgtok[mid] for mid in chunk if self.classify(mid) < 2)) 321 | hotmsg = [] 322 | wordinmsg = lambda x: re_word.search(self.msgs[x][1]) 323 | ranked = uniq(uniq(filter(wordinmsg, map(lambda x: self.fwd_lookup.get(operator.itemgetter(3, 4)(self.msgs[x[0]]), x[0]), self.hotrank(chunk)))), key=lambda x: self.tc.truecase(self.msgs[x][1])) or list(filter(wordinmsg, chunk)) or chunk 324 | for mid in (ranked[:10] or chunk[:10]): 325 | msg = self.msgs[mid] 326 | text = msg[1] 327 | if len(text) > 500: 328 | text = text[:500] + '…' 329 | hotmsg.append((mid, stripreaction(text), msg[0], db_getfirstname(msg[0], json.loads(msg[6] or '{}')), strftime('%H:%M:%S', msg[2]))) 330 | yield (kwds, hotmsg) 331 | 332 | def tags(self): 333 | tags = collections.defaultdict(list) 334 | for mid, value in self.msgs.items(): 335 | text = value[1] or '' 336 | for tag in re_tag.findall(text): 337 | tags[self.tc.truecase(tag)].append(mid) 338 | return sorted(tags.items(), 
key=lambda x: (-len(x[1]), x[0])) 339 | 340 | def tc_preprocess(self): 341 | titles = [] 342 | for mid, value in self.msgs.items(): 343 | media = json.loads(value[6] or '{}') 344 | if 'new_chat_title' in media: 345 | titles.append((mid, media['new_chat_title'])) 346 | if titles: 347 | prefix = [os.path.commonprefix([text for mid, text in titles])] 348 | else: 349 | prefix = [self.title] 350 | for mid, text in titles: 351 | for k in range(len(prefix), -1, -1): 352 | pf = ''.join(prefix[:k]) 353 | if text.startswith(pf): 354 | text = text[len(pf):] 355 | prefix = prefix[:k] 356 | prefix.append(text) 357 | break 358 | yield (mid, prefix) 359 | 360 | def titlechange(self): 361 | last = [] 362 | for mid, prefix in self.tc_preprocess(): 363 | comm = os.path.commonprefix((last, prefix)) 364 | if len(prefix) == len(last) == len(comm) + 1: 365 | yield '
  • ' 366 | msg = self.msgs[mid] 367 | yield (mid, prefix[-1], msg[0], db_getfirstname(msg[0]), strftime('%H:%M:%S', msg[2])) 368 | yield '
  • ' 369 | else: 370 | for k in range(len(last) - len(comm)): 371 | yield '' 372 | for item in prefix[len(comm):-1]: 373 | yield '
    • ' 374 | yield (mid, item) 375 | yield '
    • ' 376 | yield '
      • ' 377 | msg = self.msgs[mid] 378 | yield (mid, prefix[-1], msg[0], db_getfirstname(msg[0]), strftime('%H:%M:%S', msg[2])) 379 | yield '
      • ' 380 | last = prefix 381 | for item in last: 382 | yield '
      ' 383 | 384 | def generalinfo(self): 385 | ctr = collections.Counter(i[0] for i in self.msgs.values()) 386 | mcomm = ctr.most_common(5) 387 | count = len(self.msgs) 388 | others = count - sum(v for k, v in mcomm) 389 | delta = self.end - self.start 390 | stat = { 391 | 'start': strftime('%d 日 %H:%M:%S', self.start), 392 | 'end': strftime('%d 日 %H:%M:%S', self.end), 393 | 'count': count, 394 | 'freq': '%.2f' % (count * 60 / delta) if delta else 'N/A', 395 | 'flooder': tuple(((k, db_getufname(k)), v, '%.2f%%' % (v/count*100)) for k, v in mcomm), 396 | 'tags': self.tags()[:6], 397 | 'others': (others, '%.2f%%' % (others/count*100)), 398 | 'avg': '%.2f' % (count / len(ctr)) 399 | } 400 | return stat 401 | 402 | def render(self): 403 | kvars = { 404 | 'name': NAME, 405 | 'date': strftime('%Y-%m-%d', self.date), 406 | 'wday': getwday(self.date), 407 | 'info': self.generalinfo(), 408 | 'hotchunk': tuple(self.hotchunk()), 409 | 'titlechange': tuple(self.titlechange()), 410 | 'gentime': strftime('%Y-%m-%d %H:%M:%S') 411 | } 412 | template = jinja2.Environment(loader=jinja2.FileSystemLoader('templates')).get_template(self.template) 413 | return template.render(**kvars) 414 | 415 | class StatComposer: 416 | 417 | def __init__(self): 418 | self.template = 'stat.html' 419 | self.tc = truecaser.Truecaser(truecaser.loaddict(open('vendor/truecase.txt', 'rb'))) 420 | 421 | def fetchmsgstat(self): 422 | self.msglen = self.start = self.end = 0 423 | hourctr = [0] * 24 424 | mediactr = collections.Counter() 425 | usrctr = collections.Counter() 426 | tags = collections.Counter() 427 | for mid, src, text, date, media in conn.execute('SELECT id, src, text, date, media FROM messages ORDER BY date ASC, id ASC'): 428 | text = text or '' 429 | if not self.start: 430 | self.start = date 431 | self.start = min(self.start, date) 432 | self.end = max(self.end, date) 433 | for tag in re_tag.findall(text): 434 | tags[self.tc.truecase(tag)] += 1 435 | media = json.loads(media or '{}') 436 | 
mt = media.keys() & MEDIA_TYPES.keys() 437 | if mt: 438 | t = tuple(mt)[0] 439 | elif media.keys() & SERVICE: 440 | t = 'service' 441 | else: 442 | t = 'text' 443 | hourctr[int(((date + TIMEZONE) // 3600) % 24)] += 1 444 | mediactr[t] += 1 445 | usrctr[src] += 1 446 | self.msglen += 1 447 | self.end = date 448 | typesum = sum(mediactr.values()) 449 | types = [(MEDIA_TYPES[k], '%.2f%%' % (v * 100 / typesum)) for k, v in mediactr.most_common()] 450 | tags = sorted(filter(lambda x: x[1] > 2, tags.items()), key=lambda x: (-x[1], x[0])) 451 | return hourctr, types, tags, usrctr 452 | 453 | def generalinfo(self): 454 | hours, types, tags, usrctr = self.fetchmsgstat() 455 | hsum = sum(hours) 456 | hourdist = ['%.2f%%' % (h * 100 / hsum) for h in hours] 457 | mcomm = usrctr.most_common() 458 | count = self.msglen 459 | stat = { 460 | 'start': strftime('%Y-%m-%d %H:%M:%S', self.start), 461 | 'end': strftime('%Y-%m-%d %H:%M:%S', self.end), 462 | 'count': count, 463 | 'freq': '%.2f' % (count * 60 / (self.end - self.start)), 464 | 'flooder': tuple(((k, db_getufname(k)), db_getuser(k)[0] or '', '%.2f%%' % (v/count*100)) for k, v in mcomm if v > 2), 465 | 'hours': hourdist, 466 | 'types': types, 467 | 'tags': tags, 468 | 'avg': '%.2f' % (count / len(usrctr)) 469 | } 470 | return stat 471 | 472 | def render(self): 473 | kvars = { 474 | 'name': NAME, 475 | 'info': self.generalinfo(), 476 | 'gentime': strftime('%Y-%m-%d %H:%M:%S') 477 | } 478 | template = jinja2.Environment(loader=jinja2.FileSystemLoader('templates')).get_template(self.template) 479 | return template.render(**kvars) 480 | 481 | re_digest = re.compile(r'^(\d+)-(\d+)-(\d+).html$') 482 | 483 | class DigestManager: 484 | 485 | def __init__(self, path='.'): 486 | self.template = 'index.html' 487 | self.path = path 488 | 489 | def copyresource(self): 490 | for filename in ('digest.css',): 491 | src = os.path.join('templates', filename) 492 | dst = os.path.join(self.path, filename) 493 | shutil.copyfile(src, dst) 494 | 
shutil.copystat(src, dst) 495 | 496 | def writenewdigest(self, date=None, update=False): 497 | date = date or (time.time() - 86400) 498 | filename = os.path.join(self.path, strftime('%Y-%m-%d.html', date)) 499 | if not update and os.path.isfile(filename): 500 | return 501 | try: 502 | dc = DigestComposer(date) 503 | except ValueError: 504 | return 505 | dc.title = TITLE 506 | with open(filename, 'w') as f: 507 | f.write(dc.render()) 508 | del dc 509 | 510 | def writenewstat(self): 511 | sc = StatComposer() 512 | with open(os.path.join(self.path, 'stat.html'), 'w') as f: 513 | f.write(sc.render()) 514 | del sc 515 | 516 | def genindex(self): 517 | index = [] 518 | for filename in sorted(os.listdir(self.path), reverse=True): 519 | fn = re_digest.match(filename) 520 | if fn: 521 | index.append((filename, '%s 年 %s 月 %s 日' % fn.groups())) 522 | return index 523 | 524 | def render(self): 525 | kvars = { 526 | 'name': NAME, 527 | 'index': self.genindex(), 528 | 'gentime': strftime('%Y-%m-%d %H:%M:%S') 529 | } 530 | template = jinja2.Environment(loader=jinja2.FileSystemLoader('templates')).get_template(self.template) 531 | return template.render(**kvars) 532 | 533 | def writenewindex(self): 534 | with open(os.path.join(self.path, 'index.html'), 'w') as f: 535 | f.write(self.render()) 536 | 537 | 538 | if __name__ == '__main__': 539 | 540 | path = '.' 
541 | version = 1 542 | update = False 543 | 544 | if len(sys.argv) > 1: 545 | path = sys.argv[1] 546 | if len(sys.argv) > 2: 547 | version = int(sys.argv[2]) 548 | if len(sys.argv) > 3: 549 | update = bool(sys.argv[3]) 550 | 551 | start = time.time() 552 | dm = DigestManager(path) 553 | dm.copyresource() 554 | for i in range(1, version+1): 555 | dm.writenewdigest(start - 86400 * i, update) 556 | dm.writenewstat() 557 | dm.writenewindex() 558 | sys.stderr.write('Done in %.4gs.\n' % (time.time() - start)) 559 | -------------------------------------------------------------------------------- /templates/digest.css: -------------------------------------------------------------------------------- 1 | body, 2 | textarea, 3 | input, 4 | select { 5 | font-family: Roboto, Arial, sans-serif; 6 | font-size: 15px; 7 | line-height:1.25em; 8 | } 9 | .smooth { 10 | transition: all .2s 11 | } 12 | .btn { 13 | text-decoration: none 14 | } 15 | .container { 16 | margin: 0 1.25em; 17 | max-width: 75em; 18 | width: auto 19 | } 20 | label>* { 21 | display: inline 22 | } 23 | form>* { 24 | margin-bottom: .625em 25 | } 26 | .btn { 27 | background: #999; 28 | border-radius: 2px; 29 | border: 0; 30 | color: #fff; 31 | cursor: pointer; 32 | display: inline-block; 33 | padding: .4em 1em; 34 | font-size: 1em; 35 | } 36 | .btn:hover { 37 | background: #888 38 | } 39 | .btn:active, 40 | .btn:focus { 41 | background: #777; 42 | outline: 0 43 | } 44 | .btn-a { 45 | background: #0ae 46 | } 47 | .btn-a:hover, 48 | .btn-a:focus { 49 | background: #09d 50 | } 51 | .btn-a:active { 52 | background: #08b 53 | } 54 | .btn-b { 55 | background: #3c5 56 | } 57 | .btn-b:hover, 58 | .btn-b:focus { 59 | background: #2b4 60 | } 61 | .btn-b:active { 62 | background: #2a4 63 | } 64 | .btn-c { 65 | background: #d33 66 | } 67 | .btn-c:hover, 68 | .btn-c:focus { 69 | background: #c22 70 | } 71 | .btn-c:active { 72 | background: #b22 73 | } 74 | .btn-sm { 75 | border-radius: 2px; 76 | } 77 | .row { 78 | overflow: 
auto 79 | } 80 | .col { 81 | float: left 82 | } 83 | .table, 84 | .c12 { 85 | width: 100% 86 | } 87 | .c11 { 88 | width: 91.66% 89 | } 90 | .c10 { 91 | width: 83.33% 92 | } 93 | .c9 { 94 | width: 75% 95 | } 96 | .c8 { 97 | width: 66.66% 98 | } 99 | .c7 { 100 | width: 58.33% 101 | } 102 | .c6 { 103 | width: 50% 104 | } 105 | .c5 { 106 | width: 41.66% 107 | } 108 | .c4 { 109 | width: 33.33% 110 | } 111 | .c3 { 112 | width: 25% 113 | } 114 | .c2 { 115 | width: 16.66% 116 | } 117 | .c1 { 118 | width: 8.33% 119 | } 120 | fieldset, button { 121 | margin: 0; 122 | padding: 0.35em 0 0.75em; 123 | border: 0; 124 | } 125 | .btn-sm, 126 | .nav { 127 | font-size: .875em; 128 | } 129 | textarea, 130 | input, 131 | select, 132 | button { 133 | padding: .2em .3em; 134 | outline: 0; 135 | font-size: 100%; 136 | } 137 | textarea, 138 | input, 139 | select { 140 | border: 1px solid #ccc 141 | } 142 | textarea:focus, 143 | input:focus, 144 | select:focus { 145 | border-color: #19E 146 | } 147 | textarea, 148 | input[type=text] { 149 | -webkit-appearance: none; 150 | width: 13em; 151 | box-sizing: border-box; 152 | } 153 | @media(max-width:48em) { 154 | .row.rmd > .col { 155 | width: 100%; 156 | float: none; 157 | } 158 | } 159 | /*.table th, 160 | .table td { 161 | padding: .5em; 162 | text-align: left 163 | }*/ 164 | table tbody>:nth-child(2n-1) { 165 | background: whitesmoke; 166 | } 167 | .msg { 168 | padding: 1.5em; 169 | background: #def; 170 | border-left: 5px solid #59d 171 | } 172 | body { 173 | margin: .5em 2em; 174 | background-color: whitesmoke; 175 | } 176 | a { 177 | color: #425e5e; 178 | padding: .25em; 179 | } 180 | a:visited { 181 | color: #233; 182 | } 183 | a:hover, a:focus { 184 | color: #4e7a7a; 185 | } 186 | header { 187 | margin-top: 0; 188 | margin-bottom: 1em; 189 | } 190 | h1 { 191 | font-size: 1.5em; 192 | color: #0B3861; 193 | padding-top: .2em; 194 | } 195 | h2 { 196 | font-size: 1.25em; 197 | margin: 0 0 .5em; 198 | } 199 | p { 200 | margin: 0.5em 0; 201 
| } 202 | td { 203 | text-overflow: ellipsis; 204 | word-wrap: break-word; 205 | } 206 | .meta { 207 | padding-left: .5em; 208 | font-size: .8em; 209 | color: grey; 210 | display: inline-block; 211 | word-wrap: break-word; 212 | } 213 | section { 214 | background-color: white; 215 | padding: 1.5em; 216 | margin: 1em 0; 217 | word-wrap: break-word; 218 | box-sizing: border-box; 219 | } 220 | .num { 221 | text-align: right; 222 | min-width: 2em; 223 | } 224 | .bar { 225 | text-align: left; 226 | background-color: #81c3ff; 227 | overflow-x: visible; 228 | overflow-wrap: normal; 229 | word-wrap: normal; 230 | } 231 | .info, footer { 232 | font-size: .9em; 233 | color: #233; 234 | } 235 | footer { 236 | text-align: right; 237 | } 238 | #titlechange ul { 239 | padding-left: 1.2em; 240 | } 241 | .topic { 242 | border-top: 1px solid #DDD; 243 | padding: .5em 0; 244 | } 245 | #fldrankb { 246 | width: 100%; 247 | } 248 | .fullname { 249 | width: 45%; 250 | } 251 | .tag { 252 | width: 80%; 253 | word-break: break-all; 254 | } 255 | .username { 256 | min-width: 3em; 257 | word-break: break-all; 258 | } 259 | .hour { 260 | width: 2em; 261 | font-weight: bold; 262 | text-align: center; 263 | } 264 | .msgtype { 265 | width: 3em; 266 | text-align: center; 267 | } 268 | -------------------------------------------------------------------------------- /templates/digest.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% macro user(uid, nick) -%} 4 | {{ nick|e }} 5 | {%- endmacro %} 6 | {% macro msgwmeta(mid, text, uid=0, nick='', time='') -%} 7 | 8 | {{ text|e }} 9 | {% if uid %}{{ user(uid, nick) }}, {{ time }} 10 | {% endif %} 11 | 12 | {%- endmacro %} 13 | 14 | 15 | 16 | {{ name }} 日报 - {{ date }} 17 | 18 | 19 | 20 |
      21 |
      22 |

      {{ name }} 日报 - {{ date }} {{ wday }}

      23 |
      24 |

      开始:{{ info.start }},结束:{{ info.end }}; 25 | 总计 {{ info.count }} 条,每分钟 {{ info.freq }} 条,人均 {{ info.avg }} 条

      26 |
      27 |
      28 |
      29 |
      30 |

      水王榜

      31 | 32 | 33 | 34 | {% for u in info.flooder %} 35 | 36 | 37 | {% endfor %} 38 | 39 | 40 | 41 | 42 | 43 | 44 |
      全名消息占比
      {{ user(*u[0]) }}{{ u[1] }}
      {{ u[2] }}
      <其他用户>{{ info.others[0] }}
      {{ info.others[1] }}
      45 |
      46 |
      47 |

      标签

      48 | 49 | 50 | 51 | {% for t in info.tags %} 52 | 53 | {% endfor %} 54 | 55 |
      标签数量
      {{ t[0]|e }}{{ t[1]|length }}
      56 |
      57 |
      58 |
      59 |

      今日热点

      60 | {% for chunk in hotchunk -%} 61 |
      62 |

      关键词:{{ chunk[0]|join(', ')|e }}

      63 |
        {% for msg in chunk[1] %}
      1. {{ msgwmeta(*msg) }}
      2. {% endfor %} 64 |
      65 |
      66 | {%- endfor %} 67 |
      68 |
      69 |

      改名部

      70 | {% for item in titlechange -%} 71 | {% if item is string %}{{ item }} 72 | {% else %}{{ msgwmeta(*item) }} 73 | {% endif %} 74 | {%- endfor %} 75 |
      76 |
      77 | 存档 - 更新时间:{{ gentime }} 78 |
      79 |
      80 | 81 | 82 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | {{ name }} 日报 7 | 8 | 9 | 10 |
      11 |
      12 |

      {{ name }} 日报

      13 |
      14 |
      15 |

      存档

      16 |
        17 | {% for item in index %} 18 |
      • {{ item[1] }}
      • 19 | {% endfor %} 20 |
      21 |
      22 |
      23 |

      统计

      24 |
      25 |
      26 | 更新时间:{{ gentime }} 27 |
      28 |
      29 | 30 | 31 | -------------------------------------------------------------------------------- /templates/stat.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% macro user(uid, nick) -%} 4 | {{ nick|e }} 5 | {%- endmacro %} 6 | 7 | 8 | 9 | {{ name }} 统计 10 | 11 | 12 | 13 |
      14 |
      15 |

      {{ name }} 统计

      16 |
      17 |

      自 {{ info.start }} 到 {{ info.end }} 18 | 总计 {{ info.count }} 条,每分钟 {{ info.freq }} 条,人均 {{ info.avg }} 条。

      19 |
      20 |
      21 |
      22 |

      用户发言

      23 | 24 | 25 | 26 | {% for u in info.flooder %} 27 | 28 | 29 | {% endfor %} 30 | 31 |
      排名全名用户名占比
      {{ loop.index }}{{ user(*u[0]) }}{{ u[1] }}
      {{ u[2] }}
      32 |
      33 |
      34 |
      35 |

      话题标签

      36 | 37 | 38 | 39 | {% for t in info.tags %} 40 | 41 | {% endfor %} 42 | 43 | 44 |
      标签数量
      {{ t[0] }}{{ t[1] }}
      <1~2 略>
      45 |
      46 |
      47 |
      48 |

      时间分布

      49 | 50 | 51 | 52 | {% for t in info.hours %} 53 | 54 | 55 | {% endfor %} 56 | 57 |
      小时占比
      {{ loop.index0 }}
      {{ t }}
      58 |
      59 |
      60 |

      消息类型

      61 | 62 | 63 | 64 | {% for t in info.types %} 65 | 66 | 67 | {% endfor %} 68 | 69 |
      类型占比
      {{ t[0] }}
      {{ t[1] }}
      70 |
      71 |
      72 |
      73 |
      74 | 存档 - 更新时间:{{ gentime }} 75 |
      76 |
      77 | 78 | 79 | -------------------------------------------------------------------------------- /tools/dbselect.cgi: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import cgi 7 | import json 8 | import sqlite3 9 | import calendar 10 | from email.utils import formatdate, parsedate 11 | 12 | DB_FILE = 'chatlog.db' 13 | 14 | MTIME = os.path.getmtime(DB_FILE) 15 | CONN = sqlite3.connect(DB_FILE) 16 | 17 | def auth(sqltype, arg1, arg2, dbname, source): 18 | if sqltype in (sqlite3.SQLITE_READ, sqlite3.SQLITE_SELECT, sqlite3.SQLITE_FUNCTION): 19 | return sqlite3.SQLITE_OK 20 | else: 21 | return sqlite3.SQLITE_DENY 22 | 23 | def do_query(form): 24 | try: 25 | sql = form['q'].value 26 | cur = CONN.cursor() 27 | cur.execute(sql) 28 | return '200 OK', json.dumps({ 29 | 'ret': 200, 30 | 'description': [desc[0] for desc in cur.description], 31 | 'rows': cur.fetchall() 32 | }).encode('utf-8') 33 | except Exception as ex: 34 | return '400 Bad Request', json.dumps({ 35 | 'ret': 400, 36 | 'error': str(ex) 37 | }).encode('utf-8') 38 | 39 | form = cgi.FieldStorage() 40 | try: 41 | if calendar.timegm(parsedate(os.environ['HTTP_IF_MODIFIED_SINCE'])) >= MTIME: 42 | print("Status: 304 Not Modified") 43 | print() 44 | sys.exit(0) 45 | except Exception: 46 | pass 47 | 48 | status, reply = do_query(form) 49 | print("Status: " + status) 50 | print("Content-Type: application/json; charset=utf-8") 51 | print("Content-Length: %d" % len(reply)) 52 | print("Last-Modified: " + formatdate(MTIME, False, True)) 53 | print("Connection: close") 54 | print() 55 | sys.stdout.flush() 56 | sys.stdout.buffer.write(reply) 57 | sys.stdout.buffer.flush() 58 | -------------------------------------------------------------------------------- /truecaser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 
# truecaser.py: frequency-based truecasing of the ASCII words found in text.

import re
import sys
import collections

# The capturing group makes re.split() keep the matched words, so joining the
# pieces back together reproduces the input text exactly.
re_eng = re.compile('([A-Za-z]+)')


def dumpdict(d, fp):
    """Write mapping *d* to binary file *fp* as sorted ``key<TAB>value`` lines.

    Each line is encoded as UTF-8.
    """
    for k in sorted(d):
        fp.write(('%s\t%s\n' % (k, d[k])).encode('utf-8'))


def loaddict(fp):
    """Read a ``key<TAB>value`` mapping from binary file *fp*.

    Lines are stripped and decoded as UTF-8; any line that does not split
    into exactly two tab-separated fields is ignored.
    """
    d = {}
    for ln in fp:
        fields = ln.strip().decode('utf-8').split('\t')
        if len(fields) == 2:
            d[fields[0]] = fields[1]
    return d


def train(iterable):
    """Learn the dominant spelling of every ASCII word in *iterable*.

    Only words of 2 to 24 letters are counted.  A word is kept only if it was
    seen more than once in total (all casings together).

    Returns a dict mapping the lower-cased word to its most frequent casing.
    """
    counts = collections.defaultdict(collections.Counter)
    for ln in iterable:
        for tok in re_eng.split(ln):
            if 1 < len(tok) < 25 and re_eng.match(tok):
                counts[tok.lower()][tok] += 1
    return {
        word: forms.most_common(1)[0][0]
        for word, forms in counts.items()
        if sum(forms.values()) > 1
    }


class Truecaser:
    """Rewrite ASCII words using a ``lowercase -> preferred casing`` map."""

    def __init__(self, wmap):
        self.wmap = wmap

    def truecase(self, text):
        """Return *text* with every known word replaced by its mapped casing.

        Pieces that are not in the map (unknown words and all non-letter
        text) are passed through unchanged.
        """
        return ''.join(
            self.wmap.get(tok.lower(), tok) for tok in re_eng.split(text))


if __name__ == '__main__':
    filename = sys.argv[-1]

    if len(sys.argv) > 2:
        # Any extra argument selects training mode: learn from stdin and
        # write the dictionary to the file named by the last argument.
        d = train(sys.stdin)
        with open(filename, 'wb') as f:
            dumpdict(d, f)
    else:
        with open(filename, 'rb') as f:
            tc = Truecaser(loaddict(f))
        for ln in sys.stdin:
            sys.stdout.write(tc.truecase(ln))

# --------------------------------------------------------------------------------
# /vendor/chinesename.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import pickle
import random
import bisect
import operator
import functools
import itertools
from math import log
from .common_surnames import d as common_surnames
from .lookuptable import chrevlookup, pinyintrie, surnamerev

# Register every proper prefix of each key in chrevlookup with an empty value,
# so that a tokenizer can keep extending a fragment while it is still a prefix
# of some key (an empty value marks "prefix only, no candidates").
for py in tuple(chrevlookup.keys()):
    for ch in range(len(py)):
        frag = py[:ch+1]
        if frag not in chrevlookup:
            chrevlookup[frag] = ''
# Log of the total number of candidate characters over all chrevlookup entries;
# used to turn per-fragment candidate counts into log-scores.
logtotal = log(sum(len(s) for s in chrevlookup.values()))

ig1 = operator.itemgetter(1)

# Tone-marked pinyin letter -> plain ASCII letter ("v" stands for u-umlaut).
phonetic_symbol = {
    "ā": "a",
    "á": "a",
    "ǎ": "a",
    "à": "a",
    "ē": "e",
    "é": "e",
    "ě": "e",
    "è": "e",
    "ō": "o",
    "ó": "o",
    "ǒ": "o",
    "ò": "o",
    "ī": "i",
    "í": "i",
    "ǐ": "i",
    "ì": "i",
    "ū": "u",
    "ú": "u",
    "ǔ": "u",
    "ù": "u",
    "ü": "v",
    "ǖ": "v",
    "ǘ": "v",
    "ǚ": "v",
    "ǜ": "v",
    "ń": "n",
    "ň": "n",
    # NOTE(review): this key showed up as an empty/invisible string, which
    # would make str.replace("", "m") insert "m" between all characters.
    # It is presumably m-with-acute: U+1E3F, or the private-use code point
    # U+E7C7 that older GBK/GB18030 mappings used for it -- confirm.
    "\u1e3f": "m",
    "\ue7c7": "m",
}

# Every key above is a single code point, so one translate() pass suffices.
_UNTONE_TABLE = str.maketrans(phonetic_symbol)


def untone(text):
    """Return *text* with the letters in ``phonetic_symbol`` made plain ASCII.

    Only the precomposed characters listed in the table are handled.
    """
    return text.translate(_UNTONE_TABLE)


class WeightedRandomGenerator(object):
    """Draw random indices with probability proportional to given weights.

    Both calling the instance and ``next()`` on it return one index.
    Construction raises IndexError when *weights* is empty.
    """

    def __init__(self, weights):
        self.totals = list(itertools.accumulate(weights))
        self.total = self.totals[-1]

    def __iter__(self):
        return self

    def __next__(self):
        rnd = random.random() * self.total
        # The product can round up to exactly ``total``; clamp so that the
        # result is always a valid index into the weight list.
        return min(bisect.bisect_right(self.totals, rnd), len(self.totals) - 1)

    def __call__(self):
        return self.__next__()


def _pyword_tokenize(word):
    """Split one lower-case romanized word into fragments.

    Uses the keys of ``chrevlookup``.  The split maximizes the sum of
    ``log(candidate count or 1) - logtotal`` over the chosen fragments.
    An empty word yields an empty list.
    """
    DAG = {}
    N = len(word)
    # DAG[k]: end indices i such that word[k:i+1] has a non-empty entry.
    for k in range(N):
        tmplist = []
        i = k
        frag = word[k]
        # Keep extending while the fragment is a known key (or key prefix).
        while i < N and frag in chrevlookup:
            if chrevlookup[frag]:
                tmplist.append(i)
            i += 1
            frag = word[k:i + 1]
        if not tmplist:
            # Nothing matched: fall back to a one-character fragment.
            tmplist.append(k)
        DAG[k] = tmplist
    # route[idx] = (best score for word[idx:], end index of first fragment),
    # filled from the end of the word backwards.
    route = {N: (0, 0)}
    for idx in range(N - 1, -1, -1):
        route[idx] = max((log(len(chrevlookup.get(word[idx:x + 1], '')) or 1) -
                          logtotal + route[x + 1][0], x) for x in DAG[idx])
    result = []
    x = 0
    while x < N:
        y = route[x][1] + 1
        result.append(word[x:y])
        x = y
    return result


def pytokenize(s):
    """Tokenize the romanized string *s* into a flat list of fragments.

    Apostrophes are treated as spaces and the text is lower-cased before each
    whitespace-separated word is split by ``_pyword_tokenize``.
    """
    words = s.replace("'", ' ').lower().split()
    return list(itertools.chain.from_iterable(
        _pyword_tokenize(w) for w in words))

class NameModel(object):
    """Random Chinese name generator and pinyin-to-name candidate lookup.

    The model file is a pickle holding a pair ``(firstchar, secondchar)`` of
    mappings from given-name text to a weight.  Surnames and their weights
    come from the module-level ``common_surnames`` table.
    """

    def __init__(self, modelname):
        """Load the pickled model *modelname* and build the samplers.

        NOTE: pickle.load can execute arbitrary code from the file; only
        load model files from a trusted source.
        """
        with open(modelname, 'rb') as f:
            self.firstchar, self.secondchar = pickle.load(f)

        # The empty-string entry is excluded from lookup and sampling.
        # pop() instead of del so a model without that entry still loads.
        self.secondchar.pop('', None)
        self.snlst, snprb = tuple(zip(*common_surnames.items()))
        self.fclst, fcprb = tuple(zip(*self.firstchar.items()))
        self.sclst, scprb = tuple(zip(*self.secondchar.items()))
        self.sngen = WeightedRandomGenerator(snprb)
        self.fcgen = WeightedRandomGenerator(fcprb)
        self.scgen = WeightedRandomGenerator(scprb)

    # NOTE(review): the cache key includes ``self``, so the cache keeps up
    # to 10 (instance, ch) pairs alive; kept as in the original.
    @functools.lru_cache(maxsize=10)
    def initlookup(self, ch):
        """Return the characters reachable from pinyin prefix *ch*.

        When *ch* is a key of ``pinyintrie`` the result is the de-duplicated
        concatenation of ``chrevlookup[p]`` for every ``p`` listed under it
        (in arbitrary order); otherwise *ch* itself is returned.
        """
        if ch not in pinyintrie:
            return ch
        return ''.join(set(''.join(chrevlookup[p] for p in pinyintrie.get(ch))))

    def lookupsurname(self, pychars):
        """Return candidate surnames for the pinyin tokens *pychars*.

        A single one-letter token is treated as an initial and expanded
        through ``pinyintrie``; anything else is looked up in ``surnamerev``
        under the space-joined tokens.
        """
        if len(pychars) == 1 and len(pychars[0]) == 1:
            initial = pychars[0]
            if initial in pinyintrie:
                return list(itertools.chain.from_iterable(
                    surnamerev.get(p, ()) for p in pinyintrie[initial]))
            return [initial]
        return surnamerev.get(' '.join(pychars), [])

    def lookupchar(self, ch):
        """Return candidate characters for one pinyin token *ch*.

        One-letter tokens go through :meth:`initlookup`; longer tokens use
        ``chrevlookup`` and fall back to the first letter when that lookup
        yields nothing.
        """
        if len(ch) == 1:
            return self.initlookup(ch)
        return chrevlookup.get(ch) or self.initlookup(ch[0])

    def fullnamesortkey(self, n):
        """Sort key (more likely first) for full name *n*: surname + given name.

        Unknown given-name parts count as weight 0 instead of raising
        TypeError on a missing table entry.
        """
        return (-common_surnames.get(n[0], 0.00001)
                * self.firstchar.get(n[1], 0)
                * self.secondchar.get(n[2:], 0))

    def namesortkey(self, n):
        """Sort key (more likely first) for given name *n*; unknown parts weigh 0."""
        return -self.firstchar.get(n[0], 0) * self.secondchar.get(n[1:], 0)

    def splitname(self, romanization):
        """Split *romanization* into ``(surnames, given-name tokens)``.

        The surname is tried as the first word, then the last word, then
        (for more than two words) the first two words.  When nothing
        matches, ``surnames`` is empty and all tokens are returned as the
        given name.
        """
        words = romanization.split()
        tok = name = pytokenize(romanization)
        if not name:
            return [], []
        if len(words) == 1:
            # No spaces in the input: work on the pinyin tokens instead.
            words = name
        surnames = self.lookupsurname(pytokenize(words[0]))
        name = pytokenize(' '.join(words[1:]))
        if not surnames:
            surnames = self.lookupsurname(pytokenize(words[-1]))
            name = pytokenize(' '.join(words[:-1]))
        if len(words) > 2 and not surnames:
            surnames = self.lookupsurname(pytokenize(' '.join(words[:2])))
            name = pytokenize(' '.join(words[2:]))
        if surnames:
            surnames = sorted(frozenset(surnames), key=surnamesortkey)
        else:
            name = tok
        return surnames, name

    def selectname(self, name, num=10):
        """Return up to *num* given-name candidates for pinyin tokens *name*.

        Each token is expanded to candidate characters, weighted with
        ``firstchar`` (first token) or ``secondchar`` (later tokens).  All
        combinations are scored by the product of their weights and returned
        best first.
        """
        if not name:
            return []
        freq = operator.itemgetter(1)
        # Per-position cut-off for the non-leading tokens.
        evalnum = int(num ** (1 / len(name))) + 1

        def ranked(pinyin, table):
            # Characters missing from the table get a tiny weight when they
            # fall in 0x4E00-0x9FCC (CJK Unified Ideographs range) and are
            # dropped (weight 0) otherwise.
            scored = (
                (ch, table.get(ch, 1e-10 if 0x4E00 <= ord(ch) < 0x9FCD else 0))
                for ch in self.lookupchar(pinyin))
            return sorted(filter(freq, scored), key=freq, reverse=True)

        namechars = [ranked(name[0], self.firstchar)]
        namechars.extend(
            ranked(syllable, self.secondchar)[:evalnum] for syllable in name[1:])
        # Positions without any candidate are skipped; at most 10 are used.
        namechars = list(filter(None, namechars))[:10]
        if not namechars:
            return []
        candidates = []
        for group in itertools.product(*namechars):
            chars, freqs = zip(*group)
            candidates.append(
                (''.join(chars), functools.reduce(operator.mul, freqs)))
        candidates.sort(key=freq, reverse=True)
        return [text for text, _ in candidates[:num]]

    def processinput(self, userinput, num=10):
        """Return ``(surnames, names)`` suggestions for *userinput*.

        Empty input yields no surnames and *num* random full names.
        Otherwise the input is untoned, lower-cased and split; when no
        given-name candidate is found, *num* random given names are
        returned instead.  Exceptions from the lookup propagate to the
        caller, as before.
        """
        if not userinput:
            return [], [self.getname() for _ in range(num)]
        surnames, names = self.splitname(untone(userinput).lower())
        names = self.selectname(names, num=num)
        if not names:
            names = [self.fclst[self.fcgen()] + self.sclst[self.scgen()]
                     for _ in range(num)]
        return surnames, names

    def getname(self):
        """Return one random full name: surname + first part + second part."""
        return (self.snlst[self.sngen()]
                + self.fclst[self.fcgen()]
                + self.sclst[self.scgen()])

    __call__ = getname
-------------------------------------------------------------------------------- /vendor/common_surnames.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # 2012 data 5 | d = { 6 | '李': 0.0794, 7 | '王': 0.0741, 8 | '张': 0.0707, 9 | '刘': 0.0538, 10 | '陈': 0.0453, 11 | '杨': 0.0308, 12 | '赵': 0.0229, 13 | '黄': 0.0223, 14 | '周': 0.0212, 15 | '吴': 0.0205, 16 | '徐': 0.0173, 17 | '孙': 0.0152, 18 | '胡': 0.0131, 19 | '朱': 0.0126, 20 | '高': 0.0121, 21 | '林': 0.0118, 22 | '何': 0.0117, 23 | '郭': 0.0115, 24 | '马': 0.0105, 25 | '罗': 0.0086, 26 | '梁': 0.0084, 27 | '宋': 0.0081, # i 28 | '郑': 0.0077, # i 29 | '谢': 0.0074, # i 30 | '韩': 0.0071, # i 31 | '唐': 0.0067, # i 32 | '冯': 0.0064, 33 | '于': 0.0063, # i 34 | '董': 0.0061, 35 | '萧': 0.0059, # i 36 | '程': 0.0057, 37 | '曹': 0.0057, 38 | '袁': 0.0056, # i 39 | '邓': 0.0054, # i 40 | '许': 0.0053, # i 41 | '傅': 0.0051, 42 | '沈': 0.0050, 43 | '曾': 0.0050, # i 44 | '彭': 0.0049, 45 | '吕': 0.0047, 46 | '苏': 0.0047, # i 47 | '卢': 0.0047, 48 | '蒋': 0.0047, 49 | '蔡': 0.0046, 50 | '贾': 0.0042, 51 | '丁': 0.0042, 52 | # interpolated 53 | '魏': 0.0042, 54 | '薛': 0.0042, 55 | '叶': 0.0041, 56 | '阎': 0.0040, 57 | '余': 0.0039, 58 | '潘': 0.0039, 59 | '杜': 0.0038, 60 | '戴': 0.0038, 61 | '夏': 0.0037, 62 | '钟': 0.0036, 63 | '汪': 0.0036, 64 | '田': 0.0035, 65 | '任': 0.0034, 66 | '姜': 0.0034, 67 | '范': 0.0033, 68 | '方': 0.0033, 69 | '石': 0.0032, 70 | '姚': 0.0032, 71 | '谭': 0.0031, 72 | '盛': 0.0031, 73 | '邹': 0.0030, 74 | '熊': 0.0030, 75 | '金': 0.0029, 76 | '陆': 0.0029, 77 | '郝': 0.0028, 78 | '孔': 0.0028, 79 | '白': 0.0027, 80 | '崔': 0.0027, 81 | '康': 0.0026, 82 | '毛': 0.0026, 83 | '邱': 0.0025, 84 | '秦': 0.0025, 85 | '江': 0.0024, 86 | '史': 0.0024, 87 | '顾': 0.0024, 88 | '侯': 0.0023, 89 | '邵': 0.0023, 90 | '孟': 0.0022, 91 | '龙': 0.0022, 92 | '万': 0.0022, 93 | '段': 0.0021, 94 | '章': 0.0021, 95 | '钱': 0.0020, 96 | '汤': 0.0020, 97 | '尹': 0.0020, 98 | '黎': 0.0019, 99 | 
'易': 0.0019, 100 | '常': 0.0019, 101 | '武': 0.0018, 102 | '乔': 0.0018, 103 | '贺': 0.0017, 104 | '赖': 0.0017, 105 | '龚': 0.0017, 106 | '文': 0.0016, 107 | '庞': 0.0016, 108 | '樊': 0.0016, 109 | '兰': 0.0015, 110 | '殷': 0.0015, 111 | '施': 0.0015, 112 | '陶': 0.0014, 113 | '洪': 0.0014, 114 | '翟': 0.0014, 115 | '安': 0.0013, 116 | '颜': 0.0013, 117 | '倪': 0.0013, 118 | '严': 0.0012, 119 | '牛': 0.0012, 120 | '温': 0.0012, 121 | '芦': 0.0012, 122 | '季': 0.0011, 123 | '俞': 0.0011, 124 | '章': 0.0011, 125 | '鲁': 0.0010, 126 | '葛': 0.0010, 127 | '伍': 0.0010, 128 | '韦': 0.0010, 129 | '申': 0.0009, 130 | '尤': 0.0009, 131 | '毕': 0.0009, 132 | '聂': 0.0008, 133 | '丛': 0.0008, 134 | '焦': 0.0008, 135 | '向': 0.0008, 136 | '柳': 0.0007, 137 | '邢': 0.0007, 138 | '路': 0.0007, 139 | '岳': 0.0007, 140 | '齐': 0.0006, 141 | '沿': 0.0006, 142 | '梅': 0.0006, 143 | '莫': 0.0006, 144 | '庄': 0.0005, 145 | '辛': 0.0005, 146 | '管': 0.0005, 147 | '祝': 0.0005, 148 | '左': 0.0004, 149 | '涂': 0.0004, 150 | '谷': 0.0004, 151 | '祁': 0.0004, 152 | '时': 0.0003, 153 | '舒': 0.0003, 154 | '耿': 0.0003, 155 | '牟': 0.0003, 156 | '卜': 0.0002, 157 | '路': 0.0002, 158 | '詹': 0.0002, 159 | '关': 0.0002, 160 | '苗': 0.0002, 161 | '凌': 0.0001, 162 | '费': 0.0001, 163 | '纪': 0.0001, 164 | '靳': 0.0001, 165 | '盛': 0.0001, 166 | '童': 0.0001, 167 | '欧': 0.0001, 168 | '甄': 0.0001, 169 | '项': 0.0001, 170 | '曲': 0.0001, 171 | '成': 0.0001, 172 | '游': 0.0001, 173 | '阳': 0.0001, 174 | '裴': 0.0001, 175 | '席': 0.0001, 176 | '卫': 0.0001, 177 | '查': 0.0001, 178 | '屈': 0.0001, 179 | '鲍': 0.0001, 180 | '位': 0.0001, 181 | '覃': 0.0001, 182 | '霍': 0.0001, 183 | '翁': 0.0001, 184 | '隋': 0.0001, 185 | '植': 0.0001, 186 | '甘': 0.0001, 187 | '景': 0.0001, 188 | '薄': 0.0001, 189 | '单': 0.0001, 190 | '包': 0.0001, 191 | '司': 0.0001, 192 | '柏': 0.0001, 193 | '宁': 0.0001, 194 | '柯': 0.0001, 195 | '阮': 0.0001, 196 | '桂': 0.0001, 197 | '闵': 0.0001, 198 | '欧': 0.0001, 199 | '阳': 0.0001, 200 | '解': 0.0001, 201 | '强': 0.0001, 202 | '柴': 0.0001, 203 | '华': 0.0001, 204 | '车': 
class TextBlock:
    """A rectangular block of text lines that can be joined side by side."""

    def __init__(self, block='', blank=' '):
        """Build a block from the lines of *block*.

        Short lines are padded on the right with *blank* so every line has
        the width of the longest one.  An empty *block* gives one empty line.
        """
        self.blank = blank
        rows = block.splitlines() or ['']
        self.width = max(map(len, rows))
        self.height = len(rows)
        # Pad with the configured blank (not a hard-coded space) so the
        # result matches the padding hcat() adds.
        self.lines = collections.deque(
            row + blank * (self.width - len(row)) for row in rows)

    def hcat(self, other, justify=0):
        """Append *other* to the right of this block, in place.

        *justify* controls vertical alignment of the shorter block:
        positive aligns it to the bottom, negative to the top, zero centres
        it.  Missing cells are filled with this block's ``blank``.
        """
        delta = other.height - self.height
        start = 0  # row of self on which the first row of other lands
        if delta > 0:
            # self is shorter: grow it with blank rows above and/or below.
            filler = self.blank * self.width
            if justify > 0:
                top = delta
            elif justify < 0:
                top = 0
            else:
                top = delta // 2
            self.lines.extendleft(filler for _ in range(top))
            self.lines.extend(filler for _ in range(delta - top))
            self.height = other.height
        elif delta < 0:
            # other is shorter: shift it down inside self.
            if justify > 0:
                start = -delta
            elif justify == 0:
                start = -delta // 2
        gap = self.blank * other.width
        for idx in range(self.height):
            offset = idx - start
            if 0 <= offset < other.height:
                self.lines[idx] += other.lines[offset]
            else:
                self.lines[idx] += gap
        self.width += other.width

    def __str__(self):
        return '\n'.join(self.lines)


class BlockGenerator:
    """Render text as large block letters using a pickled bitmap font."""

    def __init__(self, fontfile, fillchar=' █'):
        """Load the font from *fontfile*.

        ``fillchar[0]`` is drawn for a cleared pixel and ``fillchar[1]`` for
        a set pixel.

        NOTE: pickle.load can execute arbitrary code from the file; only
        load font files from a trusted source.
        """
        with open(fontfile, 'rb') as fp:
            self.font = pickle.load(fp)
        self.fillchar = fillchar

    def renderchar(self, c):
        """Return the multi-line picture of character *c*, or '' on failure.

        A glyph is a sequence ``(width, row, row, ...)`` indexed by code
        point, each row being an integer bit mask.
        """
        try:
            glyph = self.font[ord(c)]
            width = glyph[0]
            return '\n'.join(
                ''.join(self.fillchar[int(bit)]
                        for bit in bin(row)[2:].zfill(width))
                for row in glyph[1:])
        except Exception:
            # Deliberate best effort: a missing or malformed glyph renders
            # as nothing instead of aborting the whole text.
            return ''

    def render(self, s):
        """Render every line of *s*; glyphs are separated by one blank column."""
        blank = self.fillchar[0]
        rendered = []
        for ln in s.splitlines():
            blk = TextBlock(blank=blank)
            for idx, c in enumerate(ln):
                if idx:
                    # One-column spacer between neighbouring glyphs.
                    blk.hcat(TextBlock(blank, blank=blank), 1)
                blk.hcat(TextBlock(self.renderchar(c), blank=blank), 1)
            rendered.append(str(blk))
        return '\n'.join(rendered)


if __name__ == '__main__':
    import sys
    bg = BlockGenerator(*sys.argv[1:])
    print(bg.render(sys.stdin.read()))
def loaddict(fn):
    """Return the sorted word list read from dictionary file *fn*.

    The first whitespace-separated field of every non-blank line is taken
    as a word.  A fixed set of punctuation marks is always included.
    """
    dic = set('、,。;?!:')
    # Read as UTF-8 explicitly instead of relying on the locale default;
    # the rest of this module (stopwords.txt) is read as UTF-8 too.
    with open(fn, encoding='utf-8') as f:
        for ln in f:
            fields = ln.split()
            if fields:
                dic.add(fields[0])
    return sorted(dic)
def cutandsplit(s):
    """Yield the space-joined segmentation of every usable sentence in *s*.

    *s* is passed through ``stripblank``, ``splitsentence`` and
    ``filterlist``.  For each resulting sentence, bracketed spans are
    handled by ``RE_BRACKETS`` / ``brcksub``; sentences that ``notchinese``
    rejects are skipped; corner quotes are converted to curly quotes; and
    the text is trimmed with ``tailpunct`` / ``headpunct`` before being
    segmented by ``cut``.

    NOTE(review): ``tailpunct``, ``headpunct``, ``splitsentence`` and
    ``filterlist`` are not defined in the visible part of this file;
    presumably they come from ``from zhutil import *`` -- confirm.
    """
    # One translation pass instead of four chained replace() calls.
    quotes = str.maketrans('「」『』', '“”‘’')
    for sentence in filterlist(splitsentence(stripblank(s))):
        cleaned = RE_BRACKETS.sub(brcksub, sentence.strip())
        if not notchinese(cleaned):
            normalized = cleaned.translate(quotes)
            trimmed = normalized.lstrip(tailpunct).rstrip(headpunct)
            yield ' '.join(cut(trimmed))
def recvall(sock, buf=1024):
    """Read one newline-terminated message from *sock*.

    Reads chunks of up to *buf* bytes until a chunk ends with a newline or
    the peer closes the connection.  Returns the message without its
    trailing newline.  When the connection closes before a newline arrives,
    everything received is returned unchanged (previously the last byte was
    always cut off).
    """
    chunks = []
    while True:
        data = sock.recv(buf)
        if not data:
            # Empty read: the peer closed the connection.
            break
        chunks.append(data)
        if data.endswith(b'\n'):
            break
    message = b''.join(chunks)
    return message[:-1] if message.endswith(b'\n') else message


def sendall(sock, data):
    """Send *data* on *sock*, followed by the newline that ends a message."""
    sock.sendall(data + b'\n')


def receive(data, autorestart=None):
    """Send one request to the server at ``address`` and return the reply.

    Opens a new TCP connection per call.  Connection errors (for example
    ConnectionRefusedError or BrokenPipeError) propagate to the caller.
    *autorestart* is accepted for compatibility and is not used.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.connect(address)
        sendall(sock, data)
        return recvall(sock)
    finally:
        # Close the socket on failure as well as on success.
        sock.close()
def _request(*message):
    """Send ``message`` to the server as one JSON array and decode the reply."""
    return loadsjson(receive(dumpsjson(message)))


def rawtranslate(text, mode, withcount=False):
    """Ask the server for the ``<mode>.raw`` variant of a translation.

    ``withcount`` is accepted but not transmitted; only the mode name and the
    text are sent.
    """
    return _request(mode + '.raw', text)


def modelname():
    """Return the server's answer to the ``modelname`` query."""
    return _request('modelname')


def cut(*args, **kwargs):
    """Forward a ``cut`` call, with its arguments, to the server."""
    return _request('cut', args, kwargs)


def cut_for_search(*args, **kwargs):
    """Forward a ``cut_for_search`` call, with its arguments, to the server."""
    return _request('cut_for_search', args, kwargs)


def tokenize(*args, **kwargs):
    """Forward a ``tokenize`` call, with its arguments, to the server."""
    return _request('tokenize', args, kwargs)


class jiebazhc:
    """Namespace mirroring the module-level calls under the ``jiebazhc.`` prefix."""

    @staticmethod
    def cut(*args, **kwargs):
        """Forward a ``jiebazhc.cut`` call to the server."""
        return _request('jiebazhc.cut', args, kwargs)

    @staticmethod
    def cut_for_search(*args, **kwargs):
        """Forward a ``jiebazhc.cut_for_search`` call to the server."""
        return _request('jiebazhc.cut_for_search', args, kwargs)

    @staticmethod
    def tokenize(*args, **kwargs):
        """Forward a ``jiebazhc.tokenize`` call to the server."""
        return _request('jiebazhc.tokenize', args, kwargs)


def add_word(*args, **kwargs):
    """Send an ``add_word`` command; the reply is read but discarded."""
    receive(dumpsjson(('add_word', args, kwargs)))


def load_userdict(*args):
    """Send a ``load_userdict`` command; the reply is read but discarded."""
    receive(dumpsjson(('load_userdict', args)))


def set_dictionary(*args):
    """Send a ``set_dictionary`` command; the reply is read but discarded."""
    receive(dumpsjson(('set_dictionary', args)))


def stopserver():
    """Send the ``stopserver`` command."""
    receive(dumpsjson(('stopserver',)), False)


def ping(autorestart=False):
    """Return True when the server answers ``ping`` with ``pong``.

    Any exception raised while talking to the server is reported as False.
    """
    try:
        reply = receive(dumpsjson(('ping',)), autorestart)
    except Exception:
        return False
    return reply == b'pong'


def _main(argv):
    """Command-line entry point: ``stop``, ``ping``, translation modes, ``modelname``."""
    if len(argv) <= 1:
        # NOTE(review): indentation was lost in the dump this was recovered
        # from; the trailing ``else`` is assumed to pair with the argument-count
        # check (no argument behaves like ``ping``) -- confirm against upstream.
        if not ping():
            sys.exit(1)
        return

    command = argv[1]
    if command == 'stop':
        # Only try to stop a server that is actually answering.
        if ping():
            stopserver()
        return
    if command not in ('ping', 'c2m', 'm2c', 'c2m.raw', 'm2c.raw', 'modelname'):
        # Unknown commands are silently ignored.
        return

    # Every remaining command requires a live server.
    if not ping():
        sys.exit(1)
    if command in ('c2m', 'm2c'):
        sys.stdout.write(translate(sys.stdin.read(), command, 0, 0, 0) + '\n')
    elif command in ('c2m.raw', 'm2c.raw'):
        sys.stdout.write(translate(sys.stdin.read(), command) + '\n')
    elif command == 'modelname':
        sys.stdout.write((modelname() or '') + '\n')


if __name__ == '__main__':
    _main(sys.argv)

# --------------------------------------------------------------------------------
# /vendor/pangu.py:
# --------------------------------------------------------------------------------
# coding: utf-8

# Copyright (c) 2013 Vinta
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from __future__ import unicode_literals
import re
import sys


_py_version = sys.version_info
is_py2 = (_py_version[0] == 2)

__version__ = '2.5.6.3'
__all__ = ['spacing', 'text_spacing']

# Character-class body shared by every pattern below except QUOTE_CJK_RE, which
# uses a narrower set of ranges and is therefore spelled out in full.
_CJK = (r'\u2e80-\u2eff\u2f00-\u2fdf\u3040-\u309f\u30a0-\u30ff\u3100-\u312f'
        r'\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff')

CJK_QUOTE_RE = re.compile(r'([' + _CJK + r'])(["\'])')
QUOTE_CJK_RE = re.compile(r'(["\'])([\u3040-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff])')
FIX_QUOTE_RE = re.compile(r'(["\'\(\[\{<\u201c]+)(\s*)(.+?)(\s*)(["\'\)\]\}>\u201d]+)')
FIX_SINGLE_QUOTE_RE = re.compile(r'([' + _CJK + r'])( )(\')([A-Za-z])')

CJK_HASH_RE = re.compile(r'([' + _CJK + r'])(#(\S+))')
HASH_CJK_RE = re.compile(r'((\S+)#)([' + _CJK + r'])')

CJK_OPERATOR_ANS_RE = re.compile(r'([' + _CJK + r'])([\+\-\*\/=&\\|<>])([A-Za-z0-9])')
ANS_OPERATOR_CJK_RE = re.compile(r'([A-Za-z0-9])([\+\-\*\/=&\\|<>])([' + _CJK + r'])')

CJK_BRACKET_CJK_RE = re.compile(r'([' + _CJK + r'])([\(\[\{<\u201c]+(.*?)[\)\]\}>\u201d]+)([' + _CJK + r'])')
CJK_BRACKET_RE = re.compile(r'([' + _CJK + r'])([\(\[\{<\u201c>])')
BRACKET_CJK_RE = re.compile(r'([\)\]\}>\u201d<])([' + _CJK + r'])')
FIX_BRACKET_RE = re.compile(r'([\(\[\{<\u201c]+)(\s*)(.+?)(\s*)([\)\]\}>\u201d]+)')

FIX_SYMBOL_RE = re.compile(r'([' + _CJK + r'])([~!;:,\.\?\u2026])([A-Za-z0-9])')

CJK_ANS_RE = re.compile(r'([' + _CJK + r'])([A-Za-z0-9`\$%\^&\*\-=\+\\\|/@\u00a1-\u00ff\u2022\u2027\u2150-\u218f])')
ANS_CJK_RE = re.compile(r'([A-Za-z0-9`~\$%\^&\*\-=\+\\\|/!;:,\.\?\u00a1-\u00ff\u2022\u2026\u2027\u2150-\u218f])([' + _CJK + r'])')


def text_spacing(text):
    """
    Perform paranoid text spacing on text. Always return Unicode.

    The substitutions run in a fixed order: quotes, hashes, operators,
    brackets, symbols, and finally plain CJK/alphanumeric boundaries. Text
    shorter than two characters is returned unchanged.
    """

    if is_py2 and isinstance(text, str):
        text = text.decode('utf-8')

    if len(text) < 2:
        return text

    text = CJK_QUOTE_RE.sub(r'\1 \2', text)
    text = QUOTE_CJK_RE.sub(r'\1 \2', text)
    text = FIX_QUOTE_RE.sub(r'\1\3\5', text)
    text = FIX_SINGLE_QUOTE_RE.sub(r'\1\3\4', text)

    text = CJK_HASH_RE.sub(r'\1 \2', text)
    text = HASH_CJK_RE.sub(r'\1 \3', text)

    text = CJK_OPERATOR_ANS_RE.sub(r'\1 \2 \3', text)
    text = ANS_OPERATOR_CJK_RE.sub(r'\1 \2 \3', text)

    old_text = text
    new_text = CJK_BRACKET_CJK_RE.sub(r'\1 \2 \4', old_text)
    text = new_text
    # The one-sided bracket rules only run when the two-sided rule changed
    # nothing.
    if old_text == new_text:
        text = CJK_BRACKET_RE.sub(r'\1 \2', text)
        text = BRACKET_CJK_RE.sub(r'\1 \2', text)
    text = FIX_BRACKET_RE.sub(r'\1\3\5', text)

    text = FIX_SYMBOL_RE.sub(r'\1\2 \3', text)

    text = CJK_ANS_RE.sub(r'\1 \2', text)
    text = ANS_CJK_RE.sub(r'\1 \2', text)

    return text


spacing = text_spacing

# --------------------------------------------------------------------------------
# /vendor/repl.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys

import re
import math

# Evaluate the program read from stdin; errors are sent to stdout as well.
# The imports above are unused here and appear to exist so that the evaluated
# code can reference them.
#
# SECURITY NOTE(review): eval()/exec() run whatever arrives on stdin with this
# process's privileges. Nothing in this script restricts it; presumably the
# caller sandboxes the process -- confirm before exposing it to untrusted input.
sys.stderr = sys.stdout
with sys.stdin as r:
    prog = r.read()

try:
    # Compile first so that only a *parse* failure selects statement mode. A
    # SyntaxError raised while a valid expression runs must not re-execute the
    # whole program. eval() itself ignores leading blanks, so strip them here.
    _expr = compile(prog.lstrip(' \t'), '<string>', 'eval')
except SyntaxError:
    exec(prog)
else:
    ret = eval(_expr)
    if ret is not None:
        print(ret)

# --------------------------------------------------------------------------------
# /vendor/say.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import sys
import kenlm
import pangu
import pickle
import struct
import random
import itertools
import functools
import collections

srandom = random.SystemRandom()


RE_UCJK = re.compile(
    '([\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff'
    '\U0001F000-\U0001F8AD\U00020000-\U0002A6D6]+)')

RE_EN = re.compile('[a-zA-Z0-9_]')

punct = frozenset(
    '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~¢£¥·ˇˉ―‖‘’“”•′‵、。々'
    '〈〉《》「」『』【】〔〕〖〗〝〞︰︱︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄'
    '﹏﹐﹒﹔﹕﹖﹗﹙﹚﹛﹜﹝﹞!(),.:;?[{|}~、¢£¥')


def unpackvals(b):
    """Decode ``b`` as consecutive big-endian unsigned 16-bit integers."""
    return struct.unpack('>' + 'H' * (len(b) // 2), b)


def sel_best(weights):
    """Return ``(index, weight)`` of the largest weight."""
    return max(enumerate(weights), key=lambda x: x[1])


class LRUCache:
    """Small mapping that evicts the least recently used key when full."""

    def __init__(self, maxlen):
        self.capacity = maxlen
        self.cache = collections.OrderedDict()

    def __getitem__(self, key):
        # Re-inserting moves the key to the "most recent" end.
        value = self.cache.pop(key)
        self.cache[key] = value
        return value

    def get(self, key, default=None):
        """Return the value for ``key`` (marking it used) or ``default``."""
        try:
            value = self.cache.pop(key)
            self.cache[key] = value
            return value
        except KeyError:
            return default

    def __setitem__(self, key, value):
        try:
            self.cache.pop(key)
        except KeyError:
            # New key: make room by dropping the oldest entry if necessary.
            if len(self.cache) >= self.capacity:
                self.cache.popitem(last=False)
        self.cache[key] = value

    def __contains__(self, item):
        return item in self.cache


def weighted_choice_king(weights):
    """Pick an index at random, weighted by ``weights``, in a single pass.

    Returns ``(index, weight)`` of the winner; ``(0, 0)`` when ``weights`` is
    empty. Works on any iterable, including generators.
    """
    total = 0
    winner = 0
    winweight = 0
    for i, w in enumerate(weights):
        total += w
        # Item i replaces the current winner with probability w / total.
        if srandom.random() * total < w:
            winner = i
            winweight = w
    return winner, winweight


def _get_indexword(model):
    """Return a memoised ``word -> index in model.voc`` lookup (None if absent)."""
    @functools.lru_cache(maxsize=50)
    def indexword(word):
        try:
            return model.voc.index(word)
        except ValueError:
            return None
    return indexword


def joinword(words):
    """Yield ``words``, inserting a space before some ASCII-word tokens.

    A space is emitted before a token whose first character matches ``RE_EN``
    once any earlier token has ended with such a character.
    """
    # NOTE(review): ``last`` is never reset, so the condition is "some earlier
    # token ended in [a-zA-Z0-9_]", not "the previous token did". Left as is
    # because the output depends on it -- confirm the intent.
    last = False
    for w in words:
        if last and RE_EN.match(w[0]):
            yield ' '
        yield w
        if RE_EN.match(w[-1]):
            last = True


class SimpleModel:
    """Random sentence generator driven by a KenLM model and a word list."""

    def __init__(self, lm, dictfile, ctxmodel=None, dictinit=''):
        self.lm = kenlm.LanguageModel(lm)
        self.voc = []
        self._vocid = LRUCache(64)
        if ctxmodel:
            # The handle is closed deterministically instead of being leaked.
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only load context models from a trusted source.
            with open(ctxmodel, 'rb') as f:
                self.ctx = pickle.load(f)
        else:
            self.ctx = {}
        # Stop after 40 tokens, or when the last three tokens are identical.
        self.stopfn = lambda s: len(s) > 40 or len(s) > 3 and all(i == s[-1] for i in s[-3:])
        self.loaddict(dictfile, dictinit, True)

    def add_word(self, word):
        """Add ``word`` to the vocabulary if it is not already present.

        The word is appended, so indices of existing words, which ``ctx``
        entries refer to, stay valid.
        """
        if word not in self.voc:
            self.voc.append(word)
            # indexword() may have cached None for this word; refresh it.
            self._vocid[word] = len(self.voc) - 1

    def loaddict(self, fn, init='', withsp=False):
        """Replace the vocabulary with the sorted entries of file ``fn``.

        ``init`` seeds the set (a string contributes its characters). With
        ``withsp`` false only the first whitespace-separated field of each line
        is used; otherwise the whole stripped line is.
        """
        dic = set(init)
        with open(fn) as f:
            for ln in f:
                ln = ln.strip()
                if not ln:
                    continue
                dic.add(ln if withsp else ln.split()[0])
        self.voc = sorted(dic)

    def indexword(self, word):
        """Return the index of ``word`` in the vocabulary, or None (cached)."""
        if word not in self._vocid:
            try:
                self._vocid[word] = self.voc.index(word)
            except ValueError:
                self._vocid[word] = None
        return self._vocid[word]

    def _context_vocabulary(self, context):
        """Return the candidate words for ``context``.

        Each known context word is looked up in ``self.ctx``, whose values are
        packed vocabulary indices. When there is no context, or nothing is
        found, the whole vocabulary is returned.
        """
        if not context:
            return self.voc
        related = set()
        for word in frozenset(context):
            idx = self.indexword(word)
            # Index 0 is a valid word; only None means "not in the vocabulary".
            if idx is None:
                continue
            try:
                packed = self.ctx[idx]
            except KeyError:
                # No context entry for this word (e.g. no context model loaded).
                continue
            related.update(unpackvals(packed))
        voc = self.voc
        return list({voc[i] for i in related}) or self.voc

    def say(self, context=(), continuewords=()):
        """Generate one sentence.

        ``continuewords`` are used as leading history for the language model;
        ``context`` (defaulting to ``continuewords``) narrows the candidate
        vocabulary. The result is passed through ``pangu.spacing``.
        """
        context = context or continuewords
        ctxvoc = self._context_vocabulary(context)
        out = []
        stack = list(continuewords)
        if stack:
            history = ' '.join(stack) + ' '
            idx, w = weighted_choice_king(
                10**self.lm.score(history + c, 1, 0) for c in ctxvoc)
        else:
            idx, w = weighted_choice_king(
                10**self.lm.score(c, 1, 0) for c in ctxvoc)
        out.append(ctxvoc[idx])
        stack.append(ctxvoc[idx])
        while 1:
            bos = (len(stack) <= self.lm.order + 2)
            history = ' '.join(stack[-self.lm.order - 2:]) + ' '
            # Every word is scored twice: even k with a final flag of 0, odd k
            # with 1. Picking an odd k ends the sentence after that word.
            idx, w = weighted_choice_king(
                10**self.lm.score(history + ctxvoc[k // 2], bos, k % 2) for k in range(len(ctxvoc) * 2))
            c = ctxvoc[idx // 2]
            out.append(c)
            stack.append(c)
            if idx % 2 or self.stopfn(out):
                break
        return pangu.spacing(''.join(joinword(out)))



class POSModel:
    """Sentence generator that samples a part-of-speech sequence, then words."""

    allpos = (
        'a', 'ad', 'ag', 'an', 'b', 'c', 'd', 'df', 'dg', 'e', 'f', 'g', 'h', 'i',
        'j', 'k', 'l', 'm', 'mg', 'mq', 'n', 'ng', 'nr', 'ns', 'nt', 'nz', 'o',
        'p', 'q', 'r', 'rg', 'rr', 'rz', 's', 't', 'tg', 'u', 'ud', 'ug', 'uj',
        'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vi', 'vn', 'vq', 'x', 'y', 'z', 'zg',
        '“', '”', '、', '。', '!', ',', '.', ':', ';', '?'
    )

    def __init__(self, lm, poslm, dictfile):
        self.lm = kenlm.LanguageModel(lm)
        self.poslm = kenlm.LanguageModel(poslm)
        self.posvoc = {}
        self.end = frozenset('。!?”')
        self.loaddict(dictfile)

    def loaddict(self, fn):
        """Fill ``self.posvoc`` from a ``word frequency tag`` dictionary file.

        Words are grouped by the first two characters of their tag; only words
        starting with a character matched by ``RE_UCJK`` are kept. Lines that
        do not have exactly three fields are skipped.
        """
        with open(fn) as dictfile:
            for ln in dictfile:
                l = ln.strip()
                if not l:
                    continue
                try:
                    # The frequency field is not used.
                    w, _freq, p = l.split()
                except ValueError:
                    # Malformed line: ignore it, as before.
                    continue
                p = p[:2]
                if RE_UCJK.match(w):
                    self.posvoc.setdefault(p, []).append(w)

    def generate_pos(self):
        """Yield a random sequence of entries from ``allpos``.

        Generation stops after an entry drawn with the "final" flag set, or
        after one that is in ``self.end``.
        """
        out = []
        idx, w = weighted_choice_king(
            10**self.poslm.score(c, 1, 0) for c in self.allpos)
        out.append(self.allpos[idx])
        yield self.allpos[idx]
        while 1:
            bos = (len(out) <= self.poslm.order + 2)
            history = ' '.join(out[-self.poslm.order - 2:]) + ' '
            # Each tag is scored twice (final flag 0 and 1), as in SimpleModel.
            idx, w = weighted_choice_king(
                10**self.poslm.score(history + self.allpos[k // 2], bos, k % 2) for k in range(len(self.allpos) * 2))
            c = self.allpos[idx // 2]
            out.append(c)
            yield c
            if idx % 2 or c in self.end:
                break

    def say(self):
        """Generate one sentence and return it after ``pangu.spacing``."""
        orderlm = self.lm.order
        out = []
        for pos in self.generate_pos():
            if pos in punct:
                out.append(pos)
            elif pos in self.posvoc:
                bos = (len(out) <= orderlm + 2)
                history = ' '.join(out[-orderlm - 2:]) + ' '
                availvoc = self.posvoc[pos]
                idx, w = weighted_choice_king(
                    10**self.lm.score(history + c, bos, 0) for c in availvoc)
                c = availvoc[idx]
                out.append(c)
            else:
                # Tag with no known words: emit the tag itself.
                out.append(pos)
        return pangu.spacing(''.join(joinword(out)))


if __name__ == '__main__':
    model = SimpleModel(*sys.argv[1:])
    for ln in sys.stdin:
        ln = ln.strip()
        if ln:
            # The first character of each line is a mode flag. It is parsed
            # off but not otherwise used here.
            mode = ln[0]
            words = ln[1:].split()
        else:
            mode, words = '', ()
        print(model.say(words))
        sys.stdout.flush()

    #model = POSModel(*sys.argv[1:])
    #while 1:
        #print(model.say())
        #sys.stdout.flush()

# --------------------------------------------------------------------------------
# /vendor/seccomp.py:
# --------------------------------------------------------------------------------
# The MIT License (MIT)
#
# Copyright (c) 2015 David Wison
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# pip install python-prctl cffi

from __future__ import division

import os
import sys
import signal
import socket
import struct
import marshal
import resource

import cffi
import prctl

import re
import math
import cmath
import itertools

reload(sys)
sys.setdefaultencoding("utf-8")

_ffi = cffi.FFI()
_ffi.cdef('void _exit(int);')
_libc = _ffi.dlopen(None)

def _exit(n=1):
    """Invoke _exit(2) system call."""
    _libc._exit(n)

def read_exact(fp, n):
    """Read exactly ``n`` bytes from ``fp``; ``_exit(233)`` on end of stream."""
    buf = b''
    while len(buf) < n:
        # Ask only for what is still missing so the next message is not eaten.
        buf2 = os.read(fp.fileno(), n - len(buf))
        if not buf2:
            _exit(233)
        buf += buf2
    # Return everything collected, not just the last fragment.
    return buf

def write_exact(fp, s):
    """Write all of ``s`` to ``fp``; ``_exit(233)`` if nothing can be written."""
    done = 0
    while done < len(s):
        written = os.write(fp.fileno(), s[done:])
        if not written:
            _exit(233)
        done += written

class SecureEvalHost(object):
    """Evaluate expressions in a forked child restricted with seccomp.

    Host and child exchange marshal-encoded dicts over a socket pair, each
    prefixed with a 4-byte big-endian length.
    """

    def __init__(self):
        self.host, self.child = socket.socketpair()
        self.pid = None
        self.child_globals = {"__builtins__": __builtins__}

    def start_child(self):
        """Fork the evaluator; the parent keeps only its end of the socket."""
        assert not self.pid
        self.pid = os.fork()
        if not self.pid:
            self._child_main()
        self.child.close()

    def kill_child(self):
        """Terminate the child (if still running) and reap it."""
        assert self.pid
        pid, status = os.waitpid(self.pid, os.WNOHANG)
        if pid == 0:
            # Still running: kill it, then collect it so no zombie is left.
            # When waitpid already reaped the child, the pid must not be
            # signalled again.
            os.kill(self.pid, signal.SIGKILL)
            os.waitpid(self.pid, 0)

    def do_eval(self, msg):
        """Evaluate ``msg['body']``; return the result, or the exception's repr."""
        # SECURITY NOTE(review): this eval runs caller-supplied code by design;
        # the protection is the seccomp/rlimit setup done in _child_main.
        try:
            return {'result': str(eval(msg['body'], self.child_globals, {}))}
        except Exception as ex:
            return {'result': repr(ex)}

    def _child_main(self):
        """Child side: close other descriptors, lock down, serve requests."""
        self.host.close()
        for fd in map(int, os.listdir('/proc/self/fd')):
            if fd != self.child.fileno():
                try:
                    os.close(fd)
                except OSError:
                    pass

        resource.setrlimit(resource.RLIMIT_CPU, (1, 1))
        prctl.set_seccomp(True)
        while True:
            sz, = struct.unpack('>L', read_exact(self.child, 4))
            doc = marshal.loads(read_exact(self.child, sz))
            if doc['cmd'] == 'eval':
                resp = self.do_eval(doc)
            elif doc['cmd'] == 'exit':
                _exit(0)
            else:
                # Answer unknown commands instead of hitting an unbound (or
                # stale) ``resp``.
                resp = {'result': 'unknown command: %r' % (doc['cmd'],)}
            goobs = marshal.dumps(resp)
            write_exact(self.child, struct.pack('>L', len(goobs)))
            write_exact(self.child, goobs)

    def eval(self, s):
        """Send ``s`` to the child and return the result string it produced."""
        msg = marshal.dumps({'cmd': 'eval', 'body': s})
        write_exact(self.host, struct.pack('>L', len(msg)))
        write_exact(self.host, msg)
        sz, = struct.unpack('>L', read_exact(self.host, 4))
        goobs = marshal.loads(read_exact(self.host, sz))
        return goobs['result']


def go():
    """Evaluate the expression read from stdin and print the result."""
    sec = SecureEvalHost()
    sec.child_globals.update({'re': re, 'math': math, 'cmath': cmath, 'itertools': itertools})
    sec.start_child()
    try:
        sys.stdout.write(sec.eval(sys.stdin.read()) + '\n')
    finally:
        sec.kill_child()

if __name__ == '__main__':
    go()

# --------------------------------------------------------------------------------
# /vendor/simpcalc.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import math
import cmath
import random
import operator
import collections


class CalculatorError(Exception):
    pass


class MathError(CalculatorError):
    '''The Math Error type.'''

    def __init__(self, pos=0, length=1):
        # Passing ``self`` here made str(exc) recurse without end.
        super().__init__(pos, length)
        self.pos = pos
        self.length = length

    def __repr__(self):
        return 'MathError(%s)' % self.pos


class SyntaxError(CalculatorError):
    '''The Syntax Error type.'''

    def __init__(self, pos=0, length=1):
        super().__init__(pos, length)
        self.pos = pos
        self.length = length

    def __repr__(self):
        return 'SyntaxError(%s)' % self.pos


class KbdBreak(CalculatorError):
    '''The Keyboard Break Error type.'''

    def __init__(self, pos=0, length=1):
        super().__init__(pos, length)
        self.pos = pos
        self.length = length

    def __repr__(self):
        return 'KbdBreak(%s)' % self.pos


class Token:
    '''One lexical item: its text, position, kind, and operator/function data.'''

    def __init__(self, name, pos, type, priority=0, argnum=0, value=None):
        self.name = name
        self.pos = pos
        self.type = type
        self.priority = priority
        self.argnum = argnum
        self.value = value

    def __repr__(self):
        return 'Token(%s)' % ', '.join(map(
            repr, (self.name, self.pos, self.type, self.priority, self.argnum, self.value)))


def adapt_cmath(funcname):
    '''Return a one-argument function preferring ``math`` and falling back to ``cmath``.'''
    def wrapped(x):
        if isinstance(x, complex):
            return getattr(cmath, funcname)(x)
        else:
            try:
                return getattr(math, funcname)(x)
            except Exception:
                # sqrt etc.
                return getattr(cmath, funcname)(x)
    return wrapped


def gcd(*numbers):
    """Calculate the Greatest Common Divisor of the numbers."""
    if len(numbers) < 2:
        raise TypeError(
            'gcd expected at least 2 arguments, got ' + str(len(numbers)))
    val = numbers[0]
    for i in numbers[1:]:
        # Euclid's algorithm, folded over the arguments.
        while i:
            val, i = i, val % i
    return val


def lcm(*numbers):
    """Calculate the Lowest Common Multiple of the numbers."""
    if len(numbers) < 2:
        raise TypeError(
            'lcm expected at least 2 arguments, got ' + str(len(numbers)))
    val = numbers[0]
    for i in numbers[1:]:
        val = val * i // gcd(val, i)
    return val


def resplit(regex, string):
    '''Split ``string`` into regex matches and the text between them, in order.'''
    pos = 0
    for m in regex.finditer(string):
        if m.start(0) != pos:
            yield string[pos:m.start(0)]
        yield string[m.start(0):m.end(0)]
        pos = m.end(0)
    if pos < len(string):
        yield string[pos:]


class Calculator:
    '''Infix expression calculator: tokenise, convert to RPN, evaluate.'''

    # symbol -> (token type, priority, argument count); a lower priority number
    # binds tighter. Order matters: "**" must precede "*" in the regex built
    # from these keys.
    operators = collections.OrderedDict((
        (" ", ('ws', 1, 1)),
        ("\t", ('ws', 1, 1)),
        ("(", ('(', 1, 1)),
        (",", (',', 1, 2)),
        ("!", ('op_l', 2, 1)),
        ("^", ('op_r', 3, 2)),
        ("**", ('op_r', 3, 2)),
        # recognize on parsing
        # ("pos", ('op_r', 4, 1)),
        # ("neg", ('op_r', 4, 1)),
        ("*", ('op_l', 5, 2)),
        ("×", ('op_l', 5, 2)),
        ("/", ('op_l', 5, 2)),
        ("÷", ('op_l', 5, 2)),
        ("\\", ('op_l', 5, 2)),
        ("%", ('op_l', 5, 2)),
        ("+", ('op_l', 6, 2)),
        ("-", ('op_l', 6, 2)),
        (")", (')', 7, 1))
    ))

    const = {
        "i": 1j,
        "pi": math.pi,
        "π": math.pi,
        "e": math.e
    }

    # name -> (callable, number of arguments popped from the stack)
    functions = {
        "!": (math.factorial, 1),
        "^": (operator.pow, 2),
        "**": (operator.pow, 2),
        "*": (operator.mul, 2),
        "×": (operator.mul, 2),
        "/": (operator.truediv, 2),
        "÷": (operator.truediv, 2),
        "\\": (operator.floordiv, 2),
        "%": (operator.mod, 2),
        "+": (operator.add, 2),
        "-": (operator.sub, 2),
        "pos": (operator.pos, 1),
        "neg": (operator.neg, 1),
        "abs": (abs, 1),
        "bool": (bool, 1),
        "float": (float, 1),
        "int": (int, 1),
        "max": (max, 2),
        "min": (min, 2),
        "pow": (pow, 2),
        "round": (round, 1),
        "ceil": (math.ceil, 1),
        "copysign": (math.copysign, 2),
        "fabs": (math.fabs, 1),
        "factorial": (math.factorial, 1),
        "floor": (math.floor, 1),
        # math.fmod, math.ldexp and cmath.rect each take two arguments.
        "fmod": (math.fmod, 2),
        "gcd": (gcd, 2),
        "lcm": (lcm, 2),
        "ldexp": (math.ldexp, 2),
        "trunc": (math.trunc, 1),
        "real": (operator.attrgetter("real"), 1),
        "imag": (operator.attrgetter("imag"), 1),
        "exp": (adapt_cmath("exp"), 1),
        "log": (adapt_cmath("log"), 1),
        "ln": (adapt_cmath("log"), 1),
        "log10": (adapt_cmath("log10"), 1),
        "lg": (adapt_cmath("log10"), 1),
        "sqrt": (adapt_cmath("sqrt"), 1),
        "√": (adapt_cmath("sqrt"), 1),
        "acos": (adapt_cmath("acos"), 1),
        "asin": (adapt_cmath("asin"), 1),
        "atan": (adapt_cmath("atan"), 1),
        "cos": (adapt_cmath("cos"), 1),
        "sin": (adapt_cmath("sin"), 1),
        "tan": (adapt_cmath("tan"), 1),
        "atan2": (math.atan2, 2),
        "hypot": (math.hypot, 2),
        "degrees": (math.degrees, 1),
        "radians": (math.radians, 1),
        "acosh": (adapt_cmath("acosh"), 1),
        "asinh": (adapt_cmath("asinh"), 1),
        "atanh": (adapt_cmath("atanh"), 1),
        "cosh": (adapt_cmath("cosh"), 1),
        "sinh": (adapt_cmath("sinh"), 1),
        "tanh": (adapt_cmath("tanh"), 1),
        "erf": (math.erf, 1),
        "erfc": (math.erfc, 1),
        "gamma": (math.gamma, 1),
        "lgamma": (math.lgamma, 1),
        "phase": (cmath.phase, 1),
        "rect": (cmath.rect, 2),
        "inv": (operator.inv, 1),
        "and": (operator.and_, 2),
        "or": (operator.or_, 2),
        "xor": (operator.xor, 2),
        "rand": (random.random, 0),
        "randrng": (random.uniform, 2),
    }

    ansvar = '_'

    # NOTE(review): the number pattern also matches digits inside names, so
    # "log10" and "atan2" appear to be split into a name plus a number before
    # the function table is consulted -- confirm and adjust the tokenizer.
    # These use map() rather than a comprehension because a comprehension in a
    # class body cannot see the class-level name ``operators``.
    re_float = re.compile(r'([0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?i?)')
    re_delim = re.compile(
        '(%s)' % ('|'.join(map(re.escape, operators.keys()))))
    re_split = re.compile(
        r'([0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?i?|%s)' % ('|'.join(map(re.escape, operators.keys()))))

    def __init__(self, ansvar=None, autoclose=False):
        self.ansvar = ansvar or self.ansvar
        self.vars = {self.ansvar: 0}
        self.autoclose = autoclose

    def splitexpr(self, expr):
        '''Tokenise ``expr``; raise SyntaxError at the first unknown name.'''
        pos = 0
        for s in resplit(self.re_split, expr):
            s = s.lower()
            if not s.strip():
                pass
            elif self.re_float.match(s):
                i = 1
                if s[-1] == 'i':
                    # Imaginary literal such as "2i".
                    i = 1j
                    s = s[:-1]
                if '.' in s or 'e' in s:
                    yield Token(s, pos, 'num', value=float(s) * i)
                else:
                    yield Token(s, pos, 'num', value=int(s) * i)
            elif self.re_delim.match(s):
                val = self.functions[s][0] if s in self.functions else None
                yield Token(s, pos, *self.operators[s], value=val)
            elif s in self.const:
                yield Token(s, pos, 'const', value=self.const[s])
            elif s in self.vars:
                yield Token(s, pos, 'var')
            elif s in self.functions:
                fn = self.functions[s]
                yield Token(s, pos, 'fn', argnum=fn[1], value=fn[0])
            else:
                raise SyntaxError(pos, len(s))
            pos += len(s)

    def torpn(self, lstin):
        '''Convert infix tokens to Reverse Polish order (shunting-yard).'''
        opstack = []
        lastt = None
        for key, token in enumerate(lstin):
            if token.type == '(':
                opstack.append(token)
            elif token.type.startswith('op'):
                # A sign at the start, or after "(", "," or another operator,
                # is unary.
                if token.name in '+-' and (
                        lastt is None or lastt.type in ('(', 'op_l', 'op_r', ',')):
                    if token.name == '+':
                        token.name = 'pos'
                        token.value = operator.pos
                    else:
                        token.name = 'neg'
                        token.value = operator.neg
                    token.type = 'op_r'
                    token.priority = 0
                    token.argnum = 1
                if opstack:
                    tok2 = opstack[-1]
                    while (tok2.type.startswith('op') and
                           (token.type[-1] == 'l' and token.priority >= tok2.priority or
                            token.type[-1] == 'r' and token.priority > tok2.priority)):
                        yield opstack.pop()
                        if opstack:
                            tok2 = opstack[-1]
                        else:
                            break
                opstack.append(token)
            elif token.type == ',':
                try:
                    while opstack[-1].name != '(':
                        yield opstack.pop()
                except IndexError:
                    # Report the character position, like every other error;
                    # the token index used before misplaced the caret.
                    raise SyntaxError(token.pos, len(token.name))
            elif token.type == ')':
                try:
                    while opstack[-1].name != '(':
                        yield opstack.pop()
                except IndexError:
                    raise SyntaxError(token.pos, len(token.name))
                op = opstack.pop()
                if opstack and opstack[-1].type == 'fn':
                    yield opstack.pop()
            elif token.type in ('const', 'var'):
                yield token
            elif token.type == 'fn':
                opstack.append(token)
            else:
                yield token
            # check function brackets
            if lastt and token.type != '(' and lastt.type == 'fn' and lastt.argnum:
                raise SyntaxError(lastt.pos, len(lastt.name))
            lastt = token
        while opstack:
            op = opstack.pop()
            if op.type != '(':
                yield op
            # If self.autoclose then ignored right parenthesis is allowed.
            elif not self.autoclose:
                raise SyntaxError(op.pos, len(op.name))

    def evalrpn(self, lstin):
        '''Evaluates the Reverse Polish Expression.'''
        numstack = []
        for token in lstin:
            if token.type in ('num', 'const'):
                numstack.append(token.value)
            elif token.type == 'var':
                numstack.append(self.vars[token.name])
            elif token.type in ('op_l', 'op_r', 'fn'):
                try:
                    args = [numstack.pop() for i in range(token.argnum)]
                except IndexError:
                    raise SyntaxError(token.pos, len(token.name))
                try:
                    # Arguments were popped last-first; restore call order.
                    numstack.append(token.value(*reversed(args)))
                except KeyboardInterrupt:
                    raise KbdBreak(token.pos, len(token.name))
                except Exception:
                    raise MathError(token.pos, len(token.name))
            else:
                # Logic error in program
                raise AssertionError('token %r appears in RPN' % token)
        if len(numstack) > 1:
            raise SyntaxError(token.pos, len(token.name))
        elif numstack:
            return numstack.pop()
        else:
            return None

    def eval(self, expr):
        '''Evaluate ``expr`` and remember the result in the answer variable.'''
        ret = self.evalrpn(self.torpn(self.splitexpr(expr)))
        self.vars[self.ansvar] = ret
        return ret

    def format(self, ret):
        '''Render a result: '' for None, ``a+bi`` for complex, '0' for zero.'''
        if ret is None:
            return ''
        elif isinstance(ret, complex):
            s = str(ret.real) if ret.real else ''
            if ret.imag:
                sign = '+' if ret.imag > 0 and s else ''
                if ret.imag == 1:
                    imag = ''
                elif ret.imag == -1:
                    imag = '-'
                else:
                    imag = str(ret.imag)
                s += sign + imag + 'i'
            elif not ret:
                s = '0'
            return s
        elif ret:
            return str(ret)
        else:
            return '0'

    def pretty(self, expr):
        '''Evaluate ``expr``; return the result or a message marking the error.'''
        try:
            return self.format(self.eval(expr))
        except MathError as ex:
            return "Math Error:\n %s\n %s" % (
                expr, ' ' * ex.pos + '^' * ex.length)
        except SyntaxError as ex:
            return "Syntax Error:\n %s\n %s" % (
                expr, ' ' * ex.pos + '^' * ex.length)
        except KbdBreak as ex:
            return "Keyboard Break:\n %s\n %s" % (
                expr, ' ' * ex.pos + '^' * ex.length)


def main():
    '''Interactive prompt; ends on Ctrl-C or end of input.'''
    calc = Calculator()
    while 1:
        try:
            a = input("> ")
        except (KeyboardInterrupt, EOFError):
            break
        #ret = calc.eval(a)
        ret = calc.pretty(a)
        if ret:
            print(ret)
    print("\b\b", end='')
    return 0

if __name__ == '__main__':
    try:
        import readline
    except ImportError:
        pass
    main()

# --------------------------------------------------------------------------------
# /vendor/stopwords.txt:
# --------------------------------------------------------------------------------
# 1 | a 2 | about 3 | above 4 | after 5 | again 6 | against 7 | all 8 | am 9 | an 10 | and 11 | any 12 | are 13 | as 14 | at 15 | be 16 | because 17 | been 18 | before 19 | being 20 | below 21 | between 22 | both 23 | but 24 | by 25 | cannot 26 | com 27 | could 28 | did 29 | do 30 | does 31 | doing 32 | down 33 | during 34 | each 35 | few 36 | for 37 | from 38 | further 39 | had 40 | has 41 | have 42 | having 43 | he 44 | her 45 | here 46 | hers 47 | herself 48 | him 49 | himself 50 | his 51 | how 52 | http 53 | https 54 | i 55 | if 56 | in 57 | into 58 | is 59 | it 60 | its 61 | itself 62 | me 63 | more 64 | most 65 | my 66 | myself 67 | no 68 | nor 69 | not 70 | of 71 | off 72 | on 73 | once 74 | only 75 | or 76 | other 77 | ought 78 | our 79 | ours 80 | ourselves 81 | out 82 | over 83 | own 84 | same 85 | she 86 | should 87
| so 88 | some 89 | such 90 | than 91 | that 92 | the 93 | their 94 | theirs 95 | them 96 | themselves 97 | then 98 | there 99 | these 100 | they 101 | this 102 | those 103 | through 104 | to 105 | too 106 | under 107 | until 108 | up 109 | very 110 | was 111 | we 112 | were 113 | what 114 | when 115 | where 116 | which 117 | while 118 | who 119 | whom 120 | why 121 | with 122 | would 123 | www 124 | you 125 | your 126 | yours 127 | yourself 128 | yourselves 129 | 阿 130 | 啊 131 | 哎 132 | 哎呀 133 | 哎哟 134 | 唉 135 | 嗳 136 | 安全 137 | 俺 138 | 俺们 139 | 按 140 | 按照 141 | 吧 142 | 吧哒 143 | 把 144 | 罢了 145 | 呗 146 | 帮助 147 | 保持 148 | 被 149 | 本 150 | 本着 151 | 彼 152 | 彼此 153 | 比 154 | 比方 155 | 比较 156 | 比如 157 | 鄙人 158 | 必然 159 | 必须 160 | 必要 161 | 避免 162 | 边 163 | 变成 164 | 表明 165 | 表示 166 | 别 167 | 别的 168 | 别说 169 | 并 170 | 并不 171 | 并不是 172 | 并没有 173 | 并且 174 | 不比 175 | 不变 176 | 不成 177 | 不单 178 | 不但 179 | 不得 180 | 不独 181 | 不断 182 | 不敢 183 | 不够 184 | 不管 185 | 不光 186 | 不过 187 | 不会 188 | 不仅 189 | 不久 190 | 不拘 191 | 不可 192 | 不论 193 | 不能 194 | 不怕 195 | 不然 196 | 不如 197 | 不是 198 | 不特 199 | 不同 200 | 不惟 201 | 不问 202 | 不要 203 | 不一 204 | 不只 205 | 不足 206 | 部分 207 | 采取 208 | 曾经 209 | 产生 210 | 常常 211 | 彻底 212 | 趁 213 | 趁着 214 | 乘 215 | 成为 216 | 充分 217 | 冲 218 | 出来 219 | 出去 220 | 出现 221 | 除 222 | 除此之外 223 | 除非 224 | 除了 225 | 处理 226 | 此 227 | 此间 228 | 此时 229 | 此外 230 | 从 231 | 从而 232 | 从事 233 | 促进 234 | 啐 235 | 存在 236 | 达到 237 | 打 238 | 大大 239 | 大多数 240 | 大家 241 | 大力 242 | 大量 243 | 大批 244 | 大约 245 | 代替 246 | 待 247 | 但 248 | 但是 249 | 当 250 | 当前 251 | 当然 252 | 当时 253 | 当着 254 | 到 255 | 得 256 | 得出 257 | 得到 258 | 的 259 | 的话 260 | 等 261 | 等等 262 | 地 263 | 第 264 | 叮咚 265 | 咚 266 | 对 267 | 对应 268 | 对于 269 | 多 270 | 多次 271 | 多少 272 | 多数 273 | 呃 274 | 而 275 | 而况 276 | 而且 277 | 而是 278 | 而外 279 | 而言 280 | 而已 281 | 尔后 282 | 反过来 283 | 反过来说 284 | 反应 285 | 反映 286 | 反之 287 | 范围 288 | 方便 289 | 方面 290 | 防止 291 | 非常 292 | 非但 293 | 非徒 294 | 分别 295 | 丰富 296 | 否则 297 | 复杂 298 | 附近 299 | 嘎 300 | 嘎登 301 | 该 302 | 赶 303 
| 高兴 304 | 个 305 | 个别 306 | 个人 307 | 各 308 | 各地 309 | 各个 310 | 各级 311 | 各人 312 | 各位 313 | 各种 314 | 各自 315 | 给 316 | 根本 317 | 根据 318 | 跟 319 | 更加 320 | 巩固 321 | 共同 322 | 构成 323 | 固然 324 | 故 325 | 故此 326 | 关于 327 | 管 328 | 广大 329 | 广泛 330 | 归 331 | 规定 332 | 果然 333 | 果真 334 | 过 335 | 过来 336 | 过去 337 | 哈 338 | 哈哈 339 | 咳 340 | 还是 341 | 还有 342 | 行动 343 | 行为 344 | 毫不 345 | 好的 346 | 好象 347 | 呵 348 | 嗬 349 | 何 350 | 何处 351 | 何况 352 | 何时 353 | 合理 354 | 和 355 | 嘿 356 | 哼 357 | 哼唷 358 | 后来 359 | 后面 360 | 後来 361 | 後面 362 | 乎 363 | 呼哧 364 | 互相 365 | 哗 366 | 欢迎 367 | 换句话说 368 | 换言之 369 | 或 370 | 或是 371 | 或者 372 | 获得 373 | 基本 374 | 积极 375 | 即 376 | 即便 377 | 即或 378 | 即令 379 | 即若 380 | 即使 381 | 及 382 | 及其 383 | 及时 384 | 及至 385 | 极了 386 | 集中 387 | 几 388 | 几乎 389 | 几时 390 | 己 391 | 既 392 | 既然 393 | 既是 394 | 继而 395 | 继续 396 | 加强 397 | 加入 398 | 加以 399 | 加之 400 | 假如 401 | 假若 402 | 假使 403 | 坚持 404 | 坚决 405 | 鉴于 406 | 将 407 | 叫 408 | 叫做 409 | 较 410 | 较之 411 | 接着 412 | 接著 413 | 结果 414 | 结合 415 | 借 416 | 今后 417 | 今後 418 | 今年 419 | 今天 420 | 紧接着 421 | 尽 422 | 尽管 423 | 进步 424 | 进而 425 | 进行 426 | 进入 427 | 经 428 | 经常 429 | 经过 430 | 就 431 | 就是 432 | 就是说 433 | 举行 434 | 具体 435 | 具体地说 436 | 具体说来 437 | 具有 438 | 巨大 439 | 据 440 | 决定 441 | 绝对 442 | 觉得 443 | 开始 444 | 开外 445 | 开展 446 | 看出 447 | 看到 448 | 看见 449 | 看看 450 | 看来 451 | 考虑 452 | 靠 453 | 可 454 | 可见 455 | 可能 456 | 可是 457 | 可以 458 | 况且 459 | 扩大 460 | 啦 461 | 来 462 | 来着 463 | 了 464 | 了解 465 | 离 466 | 哩 467 | 里面 468 | 例如 469 | 立即 470 | 联系 471 | 连 472 | 连同 473 | 练习 474 | 良好 475 | 两者 476 | 临 477 | 另 478 | 另外 479 | 另一方面 480 | 论 481 | 吗 482 | 嘛 483 | 满足 484 | 慢说 485 | 漫说 486 | 冒 487 | 么 488 | 没有 489 | 每 490 | 每当 491 | 每个 492 | 每年 493 | 每天 494 | 们 495 | 密切 496 | 明确 497 | 明显 498 | 莫若 499 | 某 500 | 某个 501 | 某些 502 | 目前 503 | 拿 504 | 哪 505 | 哪边 506 | 哪儿 507 | 哪个 508 | 哪里 509 | 哪年 510 | 哪怕 511 | 哪天 512 | 哪些 513 | 哪样 514 | 那 515 | 那边 516 | 那儿 517 | 那个 518 | 那会儿 519 | 那里 520 | 那么 521 | 那么些 522 | 那么样 523 | 那时 524 | 那些 525 | 那样 526 | 乃 527 | 乃至 528 | 呢 529 | 能 
530 | 能否 531 | 能够 532 | 嗯 533 | 你 534 | 你的 535 | 你们 536 | 您 537 | 宁 538 | 宁可 539 | 宁肯 540 | 宁愿 541 | 喏 542 | 喔唷 543 | 哦 544 | 呕 545 | 啪达 546 | 旁人 547 | 呸 548 | 凭 549 | 凭借 550 | 普遍 551 | 普通 552 | 其 553 | 其次 554 | 其二 555 | 其实 556 | 其他 557 | 其它 558 | 其一 559 | 其余 560 | 其中 561 | 企图 562 | 岂但 563 | 起 564 | 起见 565 | 起来 566 | 恰恰相反 567 | 前后 568 | 前进 569 | 前面 570 | 前者 571 | 强调 572 | 强烈 573 | 且 574 | 清楚 575 | 取得 576 | 全部 577 | 全面 578 | 却不 579 | 确定 580 | 然而 581 | 然后 582 | 然後 583 | 然则 584 | 让 585 | 人家 586 | 人们 587 | 任 588 | 任何 589 | 任凭 590 | 任务 591 | 认识 592 | 认为 593 | 认真 594 | 仍然 595 | 容易 596 | 如 597 | 如此 598 | 如果 599 | 如何 600 | 如其 601 | 如若 602 | 如上所述 603 | 如下 604 | 若 605 | 若非 606 | 若是 607 | 啥 608 | 上来 609 | 上面 610 | 上去 611 | 上升 612 | 上述 613 | 上下 614 | 尚且 615 | 少数 616 | 设若 617 | 设使 618 | 深入 619 | 甚而 620 | 甚么 621 | 甚至 622 | 省得 623 | 失去 624 | 什么 625 | 什么样 626 | 十分 627 | 实际 628 | 实现 629 | 时候 630 | 使得 631 | 使用 632 | 是 633 | 是不是 634 | 是的 635 | 是否 636 | 适当 637 | 适应 638 | 适用 639 | 首先 640 | 受到 641 | 属于 642 | 双方 643 | 谁 644 | 谁知 645 | 顺 646 | 顺着 647 | 说明 648 | 说说 649 | 似的 650 | 似乎 651 | 虽 652 | 虽然 653 | 虽说 654 | 虽则 655 | 随 656 | 随着 657 | 随著 658 | 所 659 | 所谓 660 | 所以 661 | 所有 662 | 他 663 | 他的 664 | 他们 665 | 他人 666 | 她 667 | 她的 668 | 她们 669 | 它 670 | 它的 671 | 它们 672 | 它们的 673 | 倘 674 | 倘或 675 | 倘然 676 | 倘若 677 | 倘使 678 | 特别是 679 | 特点 680 | 特殊 681 | 腾 682 | 替 683 | 通常 684 | 通过 685 | 同 686 | 同时 687 | 同样 688 | 同一 689 | 突出 690 | 突然 691 | 哇 692 | 完成 693 | 完全 694 | 万一 695 | 往 696 | 往往 697 | 望 698 | 为 699 | 为何 700 | 为了 701 | 为什么 702 | 为着 703 | 为主 704 | 维持 705 | 伟大 706 | 喂 707 | 问题 708 | 嗡嗡 709 | 我 710 | 我的 711 | 我们 712 | 乌乎 713 | 呜 714 | 呜呼 715 | 无法 716 | 无论 717 | 无宁 718 | 毋宁 719 | 兮 720 | 嘻 721 | 下来 722 | 下列 723 | 下面 724 | 下去 725 | 吓 726 | 先后 727 | 先後 728 | 先生 729 | 显然 730 | 显著 731 | 现代 732 | 现在 733 | 限制 734 | 相当 735 | 相等 736 | 相对 737 | 相对而言 738 | 相反 739 | 相似 740 | 相同 741 | 相信 742 | 相应 743 | 像 744 | 向 745 | 向着 746 | 心里 747 | 形成 748 | 嘘 749 | 需要 750 | 许多 751 | 宣布 752 | 迅速 753 | 呀 754 | 焉 755 | 严格 
756 | 严重 757 | 沿 758 | 沿着 759 | 要 760 | 要不 761 | 要不然 762 | 要不是 763 | 要么 764 | 要求 765 | 要是 766 | 也 767 | 也罢 768 | 也好 769 | 也是 770 | 一 771 | 一般 772 | 一边 773 | 一次 774 | 一旦 775 | 一定 776 | 一方面 777 | 一来 778 | 一面 779 | 一片 780 | 一起 781 | 一切 782 | 一时 783 | 一天 784 | 一下 785 | 一些 786 | 一样 787 | 一则 788 | 一直 789 | 一致 790 | 依 791 | 依靠 792 | 依照 793 | 咦 794 | 移动 795 | 以 796 | 以便 797 | 以后 798 | 以後 799 | 以及 800 | 以来 801 | 以免 802 | 以前 803 | 以上 804 | 以外 805 | 以为 806 | 以下 807 | 以至 808 | 以至于 809 | 以致 810 | 已经 811 | 矣 812 | 意思 813 | 抑或 814 | 因 815 | 因此 816 | 因而 817 | 因为 818 | 引起 819 | 应当 820 | 应该 821 | 应用 822 | 哟 823 | 用 824 | 尤其 825 | 由 826 | 由此可见 827 | 由于 828 | 有 829 | 有的 830 | 有点 831 | 有关 832 | 有利 833 | 有力 834 | 有时 835 | 有所 836 | 有效 837 | 有些 838 | 有着 839 | 有著 840 | 又 841 | 于 842 | 于是 843 | 于是乎 844 | 与 845 | 与此同时 846 | 与否 847 | 与其 848 | 遇到 849 | 原来 850 | 愿意 851 | 越是 852 | 云云 853 | 允许 854 | 运用 855 | 咋 856 | 哉 857 | 再说 858 | 再者 859 | 在 860 | 在下 861 | 咱 862 | 咱们 863 | 遭到 864 | 造成 865 | 则 866 | 怎 867 | 怎么 868 | 怎么办 869 | 怎么样 870 | 怎样 871 | 战斗 872 | 掌握 873 | 朝 874 | 朝着 875 | 召开 876 | 照 877 | 照着 878 | 者 879 | 这 880 | 这边 881 | 这点 882 | 这儿 883 | 这个 884 | 这会儿 885 | 这就是说 886 | 这里 887 | 这么 888 | 这么点儿 889 | 这么些 890 | 这么样 891 | 这时 892 | 这些 893 | 这样 894 | 这种 895 | 着 896 | 着呢 897 | 真是 898 | 真正 899 | 争取 900 | 整个 901 | 正常 902 | 正如 903 | 正在 904 | 之 905 | 之后 906 | 之後 907 | 之类 908 | 之前 909 | 之所以 910 | 之一 911 | 吱 912 | 知道 913 | 直到 914 | 直接 915 | 只是 916 | 只限 917 | 只要 918 | 只有 919 | 至 920 | 至于 921 | 中间 922 | 中小 923 | 重大 924 | 重新 925 | 重要 926 | 周围 927 | 诸位 928 | 逐步 929 | 逐渐 930 | 主要 931 | 主张 932 | 注意 933 | 专门 934 | 转变 935 | 转动 936 | 转贴 937 | 准备 938 | 自 939 | 自从 940 | 自个儿 941 | 自各儿 942 | 自己 943 | 自家 944 | 自身 945 | 综上所述 946 | 总的来看 947 | 总的来说 948 | 总的说来 949 | 总而言之 950 | 总结 951 | 总是 952 | 总之 953 | 纵 954 | 纵令 955 | 纵然 956 | 纵使 957 | 组成 958 | 最大 959 | 最高 960 | 最好 961 | 最后 962 | 最後 963 | 最近 964 | 遵照 965 | 左右 966 | 作为 967 | 做到 968 | -------------------------------------------------------------------------------- 
/vendor/umsgpack.py: -------------------------------------------------------------------------------- 1 | # u-msgpack-python v2.0 - vsergeev at gmail 2 | # https://github.com/vsergeev/u-msgpack-python 3 | # 4 | # u-msgpack-python is a lightweight MessagePack serializer and deserializer 5 | # module, compatible with both Python 2 and 3, as well as CPython and PyPy 6 | # implementations of Python. u-msgpack-python is fully compliant with the 7 | # latest MessagePack specification (https://github.com/msgpack/msgpack/blob/master/spec.md). In 8 | # particular, it supports the new binary, UTF-8 string, and application ext 9 | # types. 10 | # 11 | # MIT License 12 | # 13 | # Copyright (c) 2013-2014 Ivan A. Sergeev 14 | # 15 | # Permission is hereby granted, free of charge, to any person obtaining a copy 16 | # of this software and associated documentation files (the "Software"), to deal 17 | # in the Software without restriction, including without limitation the rights 18 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19 | # copies of the Software, and to permit persons to whom the Software is 20 | # furnished to do so, subject to the following conditions: 21 | # 22 | # The above copyright notice and this permission notice shall be included in 23 | # all copies or substantial portions of the Software. 24 | # 25 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 31 | # THE SOFTWARE. 
#
"""
u-msgpack-python v2.0 - vsergeev at gmail
https://github.com/vsergeev/u-msgpack-python

u-msgpack-python is a lightweight MessagePack serializer and deserializer
module, compatible with both Python 2 and 3, as well as CPython and PyPy
implementations of Python. u-msgpack-python is fully compliant with the
latest MessagePack specification
(https://github.com/msgpack/msgpack/blob/master/spec.md). In
particular, it supports the new binary, UTF-8 string, and application ext
types.

License: MIT
"""

version = (2,0)
"Module version tuple"

import struct
import collections
import sys
import io

################################################################################
### Ext Class
################################################################################

# Extension type for application-defined types and data
class Ext(object):
    """
    The Ext class facilitates creating a serializable extension object to store
    an application-defined type and data byte array.
    """

    def __init__(self, type, data):
        r"""
        Construct a new Ext object.

        Args:
            type: application-defined type integer from 0 to 127
            data: application-defined data byte array

        Raises:
            TypeError:
                Specified ext type is outside of 0 to 127 range, or data is
                not a byte string ('bytes' on Python 3, 'str' on Python 2).

        Example:
        >>> foo = umsgpack.Ext(0x05, b"\x01\x02\x03")
        >>> umsgpack.packb({u"special stuff": foo, u"awesome": True})
        '\x82\xa7awesome\xc3\xadspecial stuff\xc7\x03\x05\x01\x02\x03'
        >>> bar = umsgpack.unpackb(_)
        >>> print(bar["special stuff"])
        Ext Object (Type: 0x05, Data: 01 02 03)
        >>>
        """
        # Application ext type should be 0 <= type <= 127
        if not isinstance(type, int) or not (type >= 0 and type <= 127):
            raise TypeError("ext type out of range")
        # Check data is type bytes
        elif sys.version_info[0] == 3 and not isinstance(data, bytes):
            raise TypeError("ext data is not type \'bytes\'")
        elif sys.version_info[0] == 2 and not isinstance(data, str):
            raise TypeError("ext data is not type \'str\'")
        self.type = type
        self.data = data

    def __eq__(self, other):
        """
        Compare this Ext object with another for equality.
        """
        return (isinstance(other, self.__class__) and
                self.type == other.type and
                self.data == other.data)

    def __ne__(self, other):
        """
        Compare this Ext object with another for inequality.
        """
        return not self.__eq__(other)

    def __hash__(self):
        """
        Hash this Ext object on the same fields __eq__ compares, so that equal
        objects hash equally on both Python 2 and Python 3.
        """
        return hash((self.type, self.data))

    def __str__(self):
        """
        String representation of this Ext object.

        Only the first 8 data bytes are shown; longer data is abbreviated
        with a trailing " ...".
        """
        # bytearray() yields integers on both Python 2 and Python 3
        preview = " ".join("%02x" % b for b in bytearray(self.data[:8]))
        s = "Ext Object (Type: 0x%02x, Data: %s" % (self.type, preview)
        if len(self.data) > 8:
            s += " ..."
        s += ")"
        return s

################################################################################
### Exceptions
################################################################################

# Base Exception classes
class PackException(Exception):
    "Base class for exceptions encountered during packing."
    pass
class UnpackException(Exception):
    "Base class for exceptions encountered during unpacking."
    pass

# Packing error
class UnsupportedTypeException(PackException):
    "Object type not supported for packing."
    pass

# Unpacking error
class InsufficientDataException(UnpackException):
    "Insufficient data to unpack the encoded object."
    pass
class InvalidStringException(UnpackException):
    "Invalid UTF-8 string encountered during unpacking."
    pass
class ReservedCodeException(UnpackException):
    "Reserved code encountered during unpacking."
    pass
class UnhashableKeyException(UnpackException):
    """
    Unhashable key encountered during map unpacking.
    The serialized map cannot be deserialized into a Python dictionary.
    """
    pass
class DuplicateKeyException(UnpackException):
    "Duplicate key encountered during map unpacking."
    pass

# Backwards compatibility
KeyNotPrimitiveException = UnhashableKeyException
KeyDuplicateException = DuplicateKeyException

################################################################################
### Exported Functions and Globals
################################################################################

# Exported functions and variables, set up in __init()
pack = None
packb = None
unpack = None
unpackb = None
dump = None
dumps = None
load = None
loads = None

compatibility = False
r"""
Compatibility mode boolean.

When compatibility mode is enabled, u-msgpack-python will serialize both
unicode strings and bytes into the old "raw" msgpack type, and deserialize the
"raw" msgpack type into bytes. This provides backwards compatibility with the
old MessagePack specification.

Example:
>>> umsgpack.compatibility = True
>>>
>>> umsgpack.packb([u"some string", b"some bytes"])
b'\x92\xabsome string\xaasome bytes'
>>> umsgpack.unpackb(_)
[b'some string', b'some bytes']
>>>
"""

################################################################################
### Packing
################################################################################

# You may notice struct.pack("B", obj) instead of the simpler chr(obj) in the
# code below. This is to allow for seamless Python 2 and 3 compatibility, as
# chr(obj) has a str return type instead of bytes in Python 3, and
# struct.pack(...) has the right return type in both versions.

def _pack_integer(obj, fp):
    # Write obj using the smallest signed/unsigned integer encoding that fits
    if obj < 0:
        if obj >= -32:
            fp.write(struct.pack("b", obj))
        elif obj >= -2**(8-1):
            fp.write(b"\xd0" + struct.pack("b", obj))
        elif obj >= -2**(16-1):
            fp.write(b"\xd1" + struct.pack(">h", obj))
        elif obj >= -2**(32-1):
            fp.write(b"\xd2" + struct.pack(">i", obj))
        elif obj >= -2**(64-1):
            fp.write(b"\xd3" + struct.pack(">q", obj))
        else:
            raise UnsupportedTypeException("huge signed int")
    else:
        if obj <= 127:
            fp.write(struct.pack("B", obj))
        elif obj <= 2**8-1:
            fp.write(b"\xcc" + struct.pack("B", obj))
        elif obj <= 2**16-1:
            fp.write(b"\xcd" + struct.pack(">H", obj))
        elif obj <= 2**32-1:
            fp.write(b"\xce" + struct.pack(">I", obj))
        elif obj <= 2**64-1:
            fp.write(b"\xcf" + struct.pack(">Q", obj))
        else:
            raise UnsupportedTypeException("huge unsigned int")

def _pack_nil(obj, fp):
    fp.write(b"\xc0")

def _pack_boolean(obj, fp):
    fp.write(b"\xc3" if obj else b"\xc2")

def _pack_float(obj, fp):
    # _float_size is a module global assigned by __init()
    if _float_size == 64:
        fp.write(b"\xcb" + struct.pack(">d", obj))
    else:
        fp.write(b"\xca" + struct.pack(">f", obj))

def _pack_string(obj, fp):
    # Lengths below are byte lengths of the UTF-8 encoding, not characters
    obj = obj.encode('utf-8')
    if len(obj) <= 31:
        fp.write(struct.pack("B", 0xa0 | len(obj)) + obj)
    elif len(obj) <= 2**8-1:
        fp.write(b"\xd9" + struct.pack("B", len(obj)) + obj)
    elif len(obj) <= 2**16-1:
        fp.write(b"\xda" + struct.pack(">H", len(obj)) + obj)
    elif len(obj) <= 2**32-1:
        fp.write(b"\xdb" + struct.pack(">I", len(obj)) + obj)
    else:
        raise UnsupportedTypeException("huge string")

def _pack_binary(obj, fp):
    if len(obj) <= 2**8-1:
        fp.write(b"\xc4" + struct.pack("B", len(obj)) + obj)
    elif len(obj) <= 2**16-1:
        fp.write(b"\xc5" + struct.pack(">H", len(obj)) + obj)
    elif len(obj) <= 2**32-1:
        fp.write(b"\xc6" + struct.pack(">I", len(obj)) + obj)
    else:
        raise UnsupportedTypeException("huge binary string")

def _pack_oldspec_raw(obj, fp):
    # Used in compatibility mode: no 8-bit length form (0xd9) is emitted here
    if len(obj) <= 31:
        fp.write(struct.pack("B", 0xa0 | len(obj)) + obj)
    elif len(obj) <= 2**16-1:
        fp.write(b"\xda" + struct.pack(">H", len(obj)) + obj)
    elif len(obj) <= 2**32-1:
        fp.write(b"\xdb" + struct.pack(">I", len(obj)) + obj)
    else:
        raise UnsupportedTypeException("huge raw string")

def _pack_ext(obj, fp):
    # Data lengths 1, 2, 4, 8 and 16 have dedicated fixed-size codes
    if len(obj.data) == 1:
        fp.write(b"\xd4" + struct.pack("B", obj.type & 0xff) + obj.data)
    elif len(obj.data) == 2:
        fp.write(b"\xd5" + struct.pack("B", obj.type & 0xff) + obj.data)
    elif len(obj.data) == 4:
        fp.write(b"\xd6" + struct.pack("B", obj.type & 0xff) + obj.data)
    elif len(obj.data) == 8:
        fp.write(b"\xd7" + struct.pack("B", obj.type & 0xff) + obj.data)
    elif len(obj.data) == 16:
        fp.write(b"\xd8" + struct.pack("B", obj.type & 0xff) + obj.data)
    elif len(obj.data) <= 2**8-1:
        fp.write(b"\xc7" + struct.pack("BB", len(obj.data), obj.type & 0xff) + obj.data)
    elif len(obj.data) <= 2**16-1:
        fp.write(b"\xc8" + struct.pack(">HB", len(obj.data), obj.type & 0xff) + obj.data)
    elif len(obj.data) <= 2**32-1:
        fp.write(b"\xc9" + struct.pack(">IB", len(obj.data), obj.type & 0xff) + obj.data)
    else:
        raise UnsupportedTypeException("huge ext data")

def _pack_array(obj, fp):
    if len(obj) <= 15:
        fp.write(struct.pack("B", 0x90 | len(obj)))
    elif len(obj) <= 2**16-1:
        fp.write(b"\xdc" + struct.pack(">H", len(obj)))
    elif len(obj) <= 2**32-1:
        fp.write(b"\xdd" + struct.pack(">I", len(obj)))
    else:
        raise UnsupportedTypeException("huge array")

    # pack is the module-level entry point assigned by __init()
    for e in obj:
        pack(e, fp)

def _pack_map(obj, fp):
    if len(obj) <= 15:
        fp.write(struct.pack("B", 0x80 | len(obj)))
    elif len(obj) <= 2**16-1:
        fp.write(b"\xde" + struct.pack(">H", len(obj)))
    elif len(obj) <= 2**32-1:
        fp.write(b"\xdf" + struct.pack(">I", len(obj)))
    else:
        raise UnsupportedTypeException("huge map")

    # pack is the module-level entry point assigned by __init()
    for k,v in obj.items():
        pack(k, fp)
        pack(v, fp)

########################################

# Pack for Python 2, with 'unicode' type, 'str' type, and 'long' type
def _pack2(obj, fp):
    """
    Serialize a Python object into MessagePack bytes.

    Args:
        obj: a Python object
        fp: a .write()-supporting file-like object

    Returns:
        None.

    Raises:
        UnsupportedTypeException(PackException):
            Object type not supported for packing.

    Example:
    >>> f = open('test.bin', 'wb')
    >>> umsgpack.pack({u"compact": True, u"schema": 0}, f)
    >>>
    """
    # bool must be tested before int: bool is a subclass of int
    if obj is None:
        _pack_nil(obj, fp)
    elif isinstance(obj, bool):
        _pack_boolean(obj, fp)
    elif isinstance(obj, int) or isinstance(obj, long):
        _pack_integer(obj, fp)
    elif isinstance(obj, float):
        _pack_float(obj, fp)
    elif compatibility and isinstance(obj, unicode):
        # Encode explicitly: bytes(obj) would use the ASCII codec on Python 2
        # and fail for non-ASCII text
        _pack_oldspec_raw(obj.encode('utf-8'), fp)
    elif compatibility and isinstance(obj, bytes):
        _pack_oldspec_raw(obj, fp)
    elif isinstance(obj, unicode):
        _pack_string(obj, fp)
    elif isinstance(obj, str):
        _pack_binary(obj, fp)
    elif isinstance(obj, list) or isinstance(obj, tuple):
        _pack_array(obj, fp)
    elif isinstance(obj, dict):
        _pack_map(obj, fp)
    elif isinstance(obj, Ext):
        _pack_ext(obj, fp)
    else:
        raise UnsupportedTypeException("unsupported type: %s" % str(type(obj)))

# Pack for Python 3, with unicode 'str' type, 'bytes' type, and no 'long' type
def _pack3(obj, fp):
    """
    Serialize a Python object into MessagePack bytes.

    Args:
        obj: a Python object
        fp: a .write()-supporting file-like object

    Returns:
        None.

    Raises:
        UnsupportedTypeException(PackException):
            Object type not supported for packing.

    Example:
    >>> f = open('test.bin', 'wb')
    >>> umsgpack.pack({u"compact": True, u"schema": 0}, f)
    >>>
    """
    # bool must be tested before int: bool is a subclass of int
    if obj is None:
        _pack_nil(obj, fp)
    elif isinstance(obj, bool):
        _pack_boolean(obj, fp)
    elif isinstance(obj, int):
        _pack_integer(obj, fp)
    elif isinstance(obj, float):
        _pack_float(obj, fp)
    elif compatibility and isinstance(obj, str):
        _pack_oldspec_raw(obj.encode('utf-8'), fp)
    elif compatibility and isinstance(obj, bytes):
        _pack_oldspec_raw(obj, fp)
    elif isinstance(obj, str):
        _pack_string(obj, fp)
    elif isinstance(obj, bytes):
        _pack_binary(obj, fp)
    elif isinstance(obj, list) or isinstance(obj, tuple):
        _pack_array(obj, fp)
    elif isinstance(obj, dict):
        _pack_map(obj, fp)
    elif isinstance(obj, Ext):
        _pack_ext(obj, fp)
    else:
        raise UnsupportedTypeException("unsupported type: %s" % str(type(obj)))

def _packb2(obj):
    r"""
    Serialize a Python object into MessagePack bytes.

    Args:
        obj: a Python object

    Returns:
        A 'str' containing serialized MessagePack bytes.

    Raises:
        UnsupportedTypeException(PackException):
            Object type not supported for packing.

    Example:
    >>> umsgpack.packb({u"compact": True, u"schema": 0})
    '\x82\xa7compact\xc3\xa6schema\x00'
    >>>
    """
    fp = io.BytesIO()
    _pack2(obj, fp)
    return fp.getvalue()

def _packb3(obj):
    r"""
    Serialize a Python object into MessagePack bytes.

    Args:
        obj: a Python object

    Returns:
        A 'bytes' containing serialized MessagePack bytes.

    Raises:
        UnsupportedTypeException(PackException):
            Object type not supported for packing.

    Example:
    >>> umsgpack.packb({u"compact": True, u"schema": 0})
    b'\x82\xa7compact\xc3\xa6schema\x00'
    >>>
    """
    fp = io.BytesIO()
    _pack3(obj, fp)
    return fp.getvalue()

################################################################################
### Unpacking
################################################################################

def _read_except(fp, n):
    # Read exactly n bytes from fp, or raise InsufficientDataException
    data = fp.read(n)
    if len(data) < n:
        raise InsufficientDataException()
    return data

def _unpack_integer(code, fp):
    if (ord(code) & 0xe0) == 0xe0:
        return struct.unpack("b", code)[0]
    elif code == b'\xd0':
        return struct.unpack("b", _read_except(fp, 1))[0]
    elif code == b'\xd1':
        return struct.unpack(">h", _read_except(fp, 2))[0]
    elif code == b'\xd2':
        return struct.unpack(">i", _read_except(fp, 4))[0]
    elif code == b'\xd3':
        return struct.unpack(">q", _read_except(fp, 8))[0]
    elif (ord(code) & 0x80) == 0x00:
        return struct.unpack("B", code)[0]
    elif code == b'\xcc':
        return struct.unpack("B", _read_except(fp, 1))[0]
    elif code == b'\xcd':
        return struct.unpack(">H", _read_except(fp, 2))[0]
    elif code == b'\xce':
        return struct.unpack(">I", _read_except(fp, 4))[0]
    elif code == b'\xcf':
        return struct.unpack(">Q", _read_except(fp, 8))[0]
    raise Exception("logic error, not int: 0x%02x" % ord(code))

def _unpack_reserved(code, fp):
    if code == b'\xc1':
        raise ReservedCodeException("encountered reserved code: 0x%02x" % ord(code))
    raise Exception("logic error, not reserved code: 0x%02x" % ord(code))

def _unpack_nil(code, fp):
    if code == b'\xc0':
        return None
    raise Exception("logic error, not nil: 0x%02x" % ord(code))

def _unpack_boolean(code, fp):
    if code == b'\xc2':
        return False
    elif code == b'\xc3':
        return True
    raise Exception("logic error, not boolean: 0x%02x" % ord(code))

def _unpack_float(code, fp):
    if code == b'\xca':
        return struct.unpack(">f", _read_except(fp, 4))[0]
    elif code == b'\xcb':
        return struct.unpack(">d", _read_except(fp, 8))[0]
    raise Exception("logic error, not float: 0x%02x" % ord(code))

def _unpack_string(code, fp):
    if (ord(code) & 0xe0) == 0xa0:
        length = ord(code) & ~0xe0
    elif code == b'\xd9':
        length = struct.unpack("B", _read_except(fp, 1))[0]
    elif code == b'\xda':
        length = struct.unpack(">H", _read_except(fp, 2))[0]
    elif code == b'\xdb':
        length = struct.unpack(">I", _read_except(fp, 4))[0]
    else:
        raise Exception("logic error, not string: 0x%02x" % ord(code))

    # Always return raw bytes in compatibility mode
    if compatibility:
        return _read_except(fp, length)

    try:
        return bytes.decode(_read_except(fp, length), 'utf-8')
    except UnicodeDecodeError:
        raise InvalidStringException("unpacked string is not utf-8")

def _unpack_binary(code, fp):
    if code == b'\xc4':
        length = struct.unpack("B", _read_except(fp, 1))[0]
    elif code == b'\xc5':
        length = struct.unpack(">H", _read_except(fp, 2))[0]
    elif code == b'\xc6':
        length = struct.unpack(">I", _read_except(fp, 4))[0]
    else:
        raise Exception("logic error, not binary: 0x%02x" % ord(code))

    return _read_except(fp, length)

def _unpack_ext(code, fp):
    if code == b'\xd4':
        length = 1
    elif code == b'\xd5':
        length = 2
    elif code == b'\xd6':
        length = 4
    elif code == b'\xd7':
        length = 8
    elif code == b'\xd8':
        length = 16
    elif code == b'\xc7':
        length = struct.unpack("B", _read_except(fp, 1))[0]
    elif code == b'\xc8':
        length = struct.unpack(">H", _read_except(fp, 2))[0]
    elif code == b'\xc9':
        length = struct.unpack(">I", _read_except(fp, 4))[0]
    else:
        raise Exception("logic error, not ext: 0x%02x" % ord(code))

    # The type byte precedes the data on the wire; arguments are evaluated
    # left to right, so the reads happen in that order
    return Ext(ord(_read_except(fp, 1)), _read_except(fp, length))

def _unpack_array(code, fp):
    if (ord(code) & 0xf0) == 0x90:
        length = (ord(code) & ~0xf0)
    elif code == b'\xdc':
        length = struct.unpack(">H", _read_except(fp, 2))[0]
    elif code == b'\xdd':
        length = struct.unpack(">I", _read_except(fp, 4))[0]
    else:
        raise Exception("logic error, not array: 0x%02x" % ord(code))

    return [_unpack(fp) for i in range(length)]

def _deep_list_to_tuple(obj):
    # Recursively convert (nested) lists into tuples; other objects pass through
    if isinstance(obj, list):
        return tuple([_deep_list_to_tuple(e) for e in obj])
    return obj

def _unpack_map(code, fp):
    if (ord(code) & 0xf0) == 0x80:
        length = (ord(code) & ~0xf0)
    elif code == b'\xde':
        length = struct.unpack(">H", _read_except(fp, 2))[0]
    elif code == b'\xdf':
        length = struct.unpack(">I", _read_except(fp, 4))[0]
    else:
        raise Exception("logic error, not map: 0x%02x" % ord(code))

    d = {}
    for i in range(length):
        # Unpack key
        k = _unpack(fp)

        if isinstance(k, list):
            # Attempt to convert list into a hashable tuple
            k = _deep_list_to_tuple(k)

        # Probe hashability directly instead of isinstance(k, Hashable):
        # collections.Hashable no longer exists on Python 3.10+, and a tuple
        # that contains an unhashable element would pass the isinstance test
        try:
            hash(k)
        except TypeError:
            raise UnhashableKeyException("encountered unhashable key: %s, %s" % (str(k), str(type(k))))

        # Checked for every key, including tuples converted from lists
        if k in d:
            raise DuplicateKeyException("encountered duplicate key: %s, %s" % (str(k), str(type(k))))

        # Unpack value
        d[k] = _unpack(fp)
    return d

def _unpack(fp):
    # _unpack_dispatch_table is a module global assigned by __init()
    code = _read_except(fp, 1)
    return _unpack_dispatch_table[code](code, fp)

########################################

def _unpack2(fp):
    """
    Deserialize MessagePack bytes into a Python object.

    Python 2 entry point for stream unpacking; it delegates to _unpack().

    Args:
        fp: a .read()-supporting file-like object

    Returns:
        A Python object.

    Raises:
        InsufficientDataException(UnpackException):
            Insufficient data to unpack the encoded object.
        InvalidStringException(UnpackException):
            Invalid UTF-8 string encountered during unpacking.
        ReservedCodeException(UnpackException):
            Reserved code encountered during unpacking.
        UnhashableKeyException(UnpackException):
            Unhashable key encountered during map unpacking.
            The serialized map cannot be deserialized into a Python dictionary.
        DuplicateKeyException(UnpackException):
            Duplicate key encountered during map unpacking.

    Example:
    >>> f = open("test.bin", "rb")
    >>> umsgpack.unpack(f)
        {u'compact': True, u'schema': 0}
    >>>
    """
    return _unpack(fp)


def _unpack3(fp):
    """
    Deserialize MessagePack bytes into a Python object.

    Python 3 entry point for stream unpacking; it delegates to _unpack().

    Args:
        fp: a .read()-supporting file-like object

    Returns:
        A Python object.

    Raises:
        InsufficientDataException(UnpackException):
            Insufficient data to unpack the encoded object.
        InvalidStringException(UnpackException):
            Invalid UTF-8 string encountered during unpacking.
        ReservedCodeException(UnpackException):
            Reserved code encountered during unpacking.
        UnhashableKeyException(UnpackException):
            Unhashable key encountered during map unpacking.
            The serialized map cannot be deserialized into a Python dictionary.
        DuplicateKeyException(UnpackException):
            Duplicate key encountered during map unpacking.

    Example:
    >>> f = open("test.bin", "rb")
    >>> umsgpack.unpack(f)
        {'compact': True, 'schema': 0}
    >>>
    """
    return _unpack(fp)


# For Python 2, expects a str object
def _unpackb2(s):
    r"""
    Deserialize MessagePack bytes into a Python object.

    Args:
        s: a 'str' containing serialized MessagePack bytes

    Returns:
        A Python object.

    Raises:
        TypeError:
            Packed data is not type 'str'.
        InsufficientDataException(UnpackException):
            Insufficient data to unpack the encoded object.
        InvalidStringException(UnpackException):
            Invalid UTF-8 string encountered during unpacking.
        ReservedCodeException(UnpackException):
            Reserved code encountered during unpacking.
        UnhashableKeyException(UnpackException):
            Unhashable key encountered during map unpacking.
            The serialized map cannot be deserialized into a Python dictionary.
        DuplicateKeyException(UnpackException):
            Duplicate key encountered during map unpacking.

    Example:
    >>> umsgpack.unpackb(b'\x82\xa7compact\xc3\xa6schema\x00')
        {u'compact': True, u'schema': 0}
    >>>
    """
    if not isinstance(s, str):
        raise TypeError("packed data is not type 'str'")
    # Wrap the buffer so the stream unpacker can consume it with .read().
    return _unpack(io.BytesIO(s))


# For Python 3, expects a bytes object
def _unpackb3(s):
    r"""
    Deserialize MessagePack bytes into a Python object.

    Args:
        s: a 'bytes' containing serialized MessagePack bytes

    Returns:
        A Python object.

    Raises:
        TypeError:
            Packed data is not type 'bytes'.
        InsufficientDataException(UnpackException):
            Insufficient data to unpack the encoded object.
        InvalidStringException(UnpackException):
            Invalid UTF-8 string encountered during unpacking.
        ReservedCodeException(UnpackException):
            Reserved code encountered during unpacking.
        UnhashableKeyException(UnpackException):
            Unhashable key encountered during map unpacking.
            The serialized map cannot be deserialized into a Python dictionary.
        DuplicateKeyException(UnpackException):
            Duplicate key encountered during map unpacking.

    Example:
    >>> umsgpack.unpackb(b'\x82\xa7compact\xc3\xa6schema\x00')
        {'compact': True, 'schema': 0}
    >>>
    """
    if not isinstance(s, bytes):
        raise TypeError("packed data is not type 'bytes'")
    # Wrap the buffer so the stream unpacker can consume it with .read().
    return _unpack(io.BytesIO(s))

################################################################################
### Module Initialization
################################################################################

def __init():
    """
    Initialize the module-level state.

    Sets the `compatibility` flag, detects the float size, binds the public
    pack/unpack names (and their dump/load aliases) to the implementation
    matching the running major Python version, and builds the table that
    maps every one-byte type code to its unpacking function.
    """
    global pack
    global packb
    global unpack
    global unpackb
    global dump
    global dumps
    global load
    global loads
    global compatibility
    global _float_size
    global _unpack_dispatch_table

    # Compatibility mode for handling strings/bytes with the old specification
    compatibility = False

    # Auto-detect system float precision
    _float_size = 64 if sys.float_info.mant_dig == 53 else 32

    # Map packb and unpackb to the appropriate version
    # (dump/dumps/load/loads are aliases of pack/packb/unpack/unpackb).
    if sys.version_info[0] == 3:
        pack = dump = _pack3
        packb = dumps = _packb3
        unpack = load = _unpack3
        unpackb = loads = _unpackb3
    else:
        pack = dump = _pack2
        packb = dumps = _packb2
        unpack = load = _unpack2
        unpackb = loads = _unpackb2

    # Build a dispatch table for fast lookup of unpacking function.
    # Each row is (first code, last code, handler), both ends inclusive;
    # together the rows cover all 256 possible leading bytes.
    code_ranges = (
        (0x00, 0x7f, _unpack_integer),   # Fix uint
        (0x80, 0x8f, _unpack_map),       # Fix map
        (0x90, 0x9f, _unpack_array),     # Fix array
        (0xa0, 0xbf, _unpack_string),    # Fix str
        (0xc0, 0xc0, _unpack_nil),       # Nil
        (0xc1, 0xc1, _unpack_reserved),  # Reserved
        (0xc2, 0xc3, _unpack_boolean),   # Boolean
        (0xc4, 0xc6, _unpack_binary),    # Bin
        (0xc7, 0xc9, _unpack_ext),       # Ext
        (0xca, 0xcb, _unpack_float),     # Float
        (0xcc, 0xcf, _unpack_integer),   # Uint
        (0xd0, 0xd3, _unpack_integer),   # Int
        (0xd4, 0xd8, _unpack_ext),       # Fixext
        (0xd9, 0xdb, _unpack_string),    # String
        (0xdc, 0xdd, _unpack_array),     # Array
        (0xde, 0xdf, _unpack_map),       # Map
        (0xe0, 0xff, _unpack_integer),   # Negative fixint
    )
    _unpack_dispatch_table = {}
    for first, last, handler in code_ranges:
        for code in range(first, last + 1):
            # Keys are one-byte strings, the same form as b'\xc0'.
            _unpack_dispatch_table[struct.pack("B", code)] = handler

__init()

# --------------------------------------------------------------------------------
# /vendor/updatelm.sh:
# --------------------------------------------------------------------------------
# (vendor/updatelm.sh, a bash script, kept verbatim below as comments)
#
# #!/bin/bash
#
# #### Edit paths before using
#
# sqlite3 ../chatlog.db <<< 'select text from messages where text is not null and text != "" and text not like "/%" and src != 120400693;' | tee chatlog.txt | python3 ../truecaser.py -t truecase.txt
# pv chatlog.txt | python3 ../truecaser.py truecase.txt | perl -p -e 's|^[^\n ]+] ||' | python3 logcutfilter.py | opencc -c t2s.json | awk '!seen[$0]++' | tee chatlogf.txt | sed 's/“//g;s/”//g;s/ / /g;s/ /\n/g' | awk '{seen[$0]++} END {for (i in seen) {if (seen[i] > 5) print i}}' > chatdict.txt
# rm chatlog.txt
#
# ~/software/moses/bin/lmplz -o 6 --prune 0 0 0 0 0 1 -S 50% --text chatlogf.txt --arpa chat.lm
# ~/software/moses/bin/build_binary trie chat.lm chat.binlm
#
# rm chat.lm
# pv chatlogf.txt | pypy3 learnctx.py chatdict.txt
#
# --------------------------------------------------------------------------------
# /vendor/zhutil.py:
# --------------------------------------------------------------------------------
import os
import re
import itertools

# Characters that hw2fw() shifts by 0xFEE0 into the fullwidth block.
halfwidth = frozenset('!(),:;?')
# Code points that fw2hw() shifts back by 0xFEE0.
fullwidth = frozenset(itertools.chain(
    range(0xFF02, 0xFF07 + 1),
    (0xFF0A, 0xFF0B, 0xFF0E, 0xFF0F, 0xFF1C, 0xFF1D,
     0xFF1E, 0xFF3C, 0xFF3E, 0xFF3F, 0xFF40),
    range(0xFF10, 0xFF19 + 1),
    range(0xFF20, 0xFF3A + 1),
    range(0xFF41, 0xFF5A + 1)))
# Sentence terminator, optionally followed by up to two closing quotes, or
# a colon that introduces a quotation / ends the text.
resentencesp = re.compile('([﹒﹔﹖﹗.;。!?]["’”」』]{0,2}|:(?=["‘“「『]{1,2}|$))')
refixmissing = re.compile(
    '(^[^"‘“「『’”」』,;。!?]+["’”」』]|^["‘“「『]?[^"‘“「『’”」』]+[,;。!?][^"‘“「『‘“「『]*["’”」』])(?!["‘“「『’”」』,;。!?])')

punctstr = (
    '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~¢£¥·ˇˉ―‖‘’“”•′‵、。々'
    '〈〉《》「」『』【】〔〕〖〗〝〞︰︱︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄'
    '﹏﹐﹒﹔﹕﹖﹗﹙﹚﹛﹜﹝﹞!(),.:;?[{|}~、¢£¥')

punct = frozenset(punctstr)

whitespace = ' \t\n\r\x0b\x0c\u3000'

resplitpunct = re.compile('([%s])' % re.escape(punctstr))

# Characters stripped from the START of a sentence by filterlist().
tailpunct = ('''!),-.:;?]}¢·ˇˉ―‖’”•′■□△○●'''
             '''、。々〉》」』】〕〗〞︰︱︳︴︶︸︺︼︾﹀﹂﹄﹏'''
             '''﹐﹒﹔﹕﹖﹗﹚﹜﹞!),.:;?|]}~、¢''') + whitespace
# Characters stripped from the END of a sentence by filterlist().
headpunct = ('''([`{£¥‘“〈《「『【〔〖〝'''
             '''︵︷︹︻︽︿﹁﹃﹙﹛﹝([{£¥''') + whitespace

# Parallel strings: clozbrckt[i] is the closing partner of openbrckt[i].
openbrckt = ('([{([{⦅〚⦃“‘‹«「〈《【〔⦗『〖〘「⟦⟨⟪⟮⟬⌈⌊⦇⦉❛❝❨❪❴❬❮❰❲'
             '⏜⎴⏞〝︵⏠﹁﹃︹︻︗︿︽﹇︷〈⦑⧼﹙﹛﹝⁽₍⦋⦍⦏⁅⸢⸤⟅⦓⦕⸦⸨⦅⧘⧚⸜⸌⸂⸄⸉᚛༺༼')
clozbrckt = (')]})]}⦆〛⦄”’›»」〉》】〕⦘』〗〙」⟧⟩⟫⟯⟭⌉⌋⦈⦊❜❞❩❫❵❭❯❱❳'
             '⏝⎵⏟〞︶⏡﹂﹄︺︼︘﹀︾﹈︸〉⦒⧽﹚﹜﹞⁾₎⦌⦎⦐⁆⸣⸥⟆⦔⦖⸧⸩⦆⧙⧛⸝⸍⸃⸅⸊᚜༻༽')

ucjk = frozenset(itertools.chain(
    range(0x1100, 0x11FF + 1),
    range(0x2E80, 0xA4CF + 1),
    range(0xA840, 0xA87F + 1),
    range(0xAC00, 0xD7AF + 1),
    range(0xF900, 0xFAFF + 1),
    range(0xFE30, 0xFE4F + 1),
    range(0xFF65, 0xFFDC + 1),
    range(0xFF01, 0xFF0F + 1),
    range(0xFF1A, 0xFF20 + 1),
    range(0xFF3B, 0xFF40 + 1),
    range(0xFF5B, 0xFF60 + 1),
    range(0x20000, 0x2FFFF + 1)
))

# Lazily loaded by calctxtstat().
zhcmodel = None
zhmmodel = None
_curpath = os.path.normpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

RE_WS_IN_FW = re.compile(
    r'([‘’“”…─\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\ufe30-\ufe57\uff00-\uffef\U00020000-\U0002A6D6])\s+(?=[‘’“”…\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\ufe30-\ufe57\uff00-\uffef\U00020000-\U0002A6D6])')

RE_FW = re.compile(
    '([\u2018\u2019\u201c\u201d\u2026\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\ufe30-\ufe57\uff00-\uffef\U00020000-\U0002A6D6]+)')

RE_UCJK = re.compile(
    '([\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\U00020000-\U0002A6D6]+)')

# Markup tokens emitted by addwalls() / addwallzone() for Moses.
_WALL = '<wall />'
_ZONE_OPEN = '<zone>'
_ZONE_CLOSE = '</zone>'


def detokenize(s):
    '''Detokenization function for Chinese.

    Removes whitespace that sits between two characters matched by
    RE_WS_IN_FW and strips the ends of the result.
    '''
    return RE_WS_IN_FW.sub(r'\1', s).strip()


def splitsentence(sentence):
    '''Split a piece of Chinese into sentences.

    Terminators matched by `resentencesp` stay attached to the sentence
    they end. Returns a list of non-empty strings.
    '''
    slist = []
    for part in resentencesp.split(sentence):
        if resentencesp.match(part) and slist:
            # A separator: glue it onto the sentence it terminates.
            slist[-1] += part
        elif part:
            slist.append(part)
    return slist


def splithard(sentence, maxchar=None):
    '''Forcibly split a piece of Chinese into sentences no longer than `maxchar`.

    Splits into sentences first, then splits over-long sentences at any
    punctuation, and finally cuts whatever is still too long into fixed
    `maxchar`-sized slices. With `maxchar=None` only the sentence split is
    performed.
    '''
    slist = splitsentence(sentence)
    if maxchar is None:
        return slist
    # Pass 1: break long sentences at punctuation.
    bypunct = []
    for sent in slist:
        if len(sent) > maxchar:
            for part in resplitpunct.split(sent):
                if resplitpunct.match(part) and bypunct:
                    bypunct[-1] += part
                elif part:
                    bypunct.append(part)
        else:
            bypunct.append(sent)
    # Pass 2: hard-cut anything that is still too long.
    result = []
    for sent in bypunct:
        if len(sent) > maxchar:
            result.extend(sent[i:i + maxchar]
                          for i in range(0, len(sent), maxchar))
        else:
            result.append(sent)
    return result


def fixmissing(slist):
    '''Fix missing quotes.

    Re-splits every item of `slist` with `refixmissing` and returns the
    non-empty pieces as one flat list.
    '''
    newlist = []
    for sent in slist:
        newlist.extend(filter(None, refixmissing.split(sent)))
    return newlist


def filterlist(slist):
    '''Get meaningful sentences.

    Yields each item with `tailpunct` characters removed from its start and
    `headpunct` characters removed from its end, skipping results of one
    character or fewer.
    '''
    for sent in slist:
        s = sent.lstrip(tailpunct).rstrip(headpunct)
        if len(s) > 1:
            yield s


def addwalls(tokiter):
    '''Add walls between punctuations for Moses.

    Yields the tokens of `tokiter` with a `<wall />` token on both sides of
    every punctuation token; adjacent punctuation tokens share one wall.
    '''
    lastwall = False
    for tok in tokiter:
        if tok in punct:
            if not lastwall:
                yield _WALL
            yield tok
            yield _WALL
            lastwall = True
        else:
            yield tok
            lastwall = False


def addwallzone(tokiter):
    '''Add walls and zones between punctuations for Moses.

    Like addwalls(), and additionally wraps a bracket pair in
    `<zone>` ... `</zone>`: when the closing partner of the most recently
    seen opening bracket arrives, the wall placed before that opening
    bracket is turned into the zone start. Only the most recent opening
    bracket is tracked, so nested pairs produce a zone for the inner pair
    only. Returns a list with no leading or trailing wall.
    '''
    W = _WALL
    out = []
    expect = zidx = None
    for tok in tokiter:
        if tok in punct:
            if not (out and out[-1] == W):
                out.append(W)
            if tok == expect:
                # Close the zone: its start replaces the wall that was put
                # in front of the matching opening bracket.
                out[zidx] = _ZONE_OPEN
                out.append(tok)
                out.append(_ZONE_CLOSE)
                expect = zidx = None
            else:
                bid = openbrckt.find(tok)
                if bid > -1:
                    expect = clozbrckt[bid]
                    # out[-1] is guaranteed to be a wall at this point.
                    zidx = len(out) - 1
                out.append(tok)
            out.append(W)
        else:
            out.append(tok)
    if out and out[0] == W:
        out.pop(0)
    if out and out[-1] == W:
        out.pop()
    return out


def _loadmodel(filename):
    '''Load a JSON model file located next to this module.'''
    import json
    with open(os.path.join(_curpath, filename), 'r', encoding='utf-8') as f:
        return json.load(f)


def calctxtstat(s):
    '''Detect whether a string is modern or classical Chinese.

    Returns `(cscore, mscore)`: the sums of the per-character values from
    modelzhc.json and modelzhm.json for every character of `s` in the range
    U+4E00..U+9FCC. The models are loaded on first use and indexed by
    `ord(ch) - 0x4E00`.
    '''
    global zhcmodel, zhmmodel
    if zhcmodel is None or zhmmodel is None:
        # Load both before publishing either, so a failed load cannot leave
        # the module with only one model set.
        cmodel = _loadmodel('modelzhc.json')
        mmodel = _loadmodel('modelzhm.json')
        zhcmodel, zhmmodel = cmodel, mmodel
    cscore = 0
    mscore = 0
    for ch in s:
        ordch = ord(ch)
        if 0x4E00 <= ordch < 0x9FCD:
            cscore += zhcmodel[ordch - 0x4E00]
            mscore += zhmmodel[ordch - 0x4E00]
    return (cscore, mscore)


def checktxttype(cscore, mscore):
    '''Compare the two scores returned by calctxtstat().

    Returns 'c' when `cscore` is larger, 'm' when `mscore` is larger and
    None on a tie ('c'/'m' presumably stand for classical/modern, matching
    the model file names).
    '''
    if cscore > mscore:
        return 'c'
    elif cscore < mscore:
        return 'm'
    else:
        return None


def num2chinese(num, big=False, simp=True, o=False, twoalt=False):
    """
    Converts numbers to Chinese representations.

    `num`   : an int, a float or a decimal string, optionally signed.
    `big`   : use financial characters.
    `simp`  : use simplified characters instead of traditional characters.
    `o`     : use 〇 for zero.
    `twoalt`: use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` are ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Raises ValueError when the magnitude is 1e48 or more, when the textual
    form uses scientific notation, or when it is not a plain decimal number.

    >>> num2chinese(3014)
    '三千零十四'
    >>> num2chinese(100000002)
    '一亿零二'
    >>> num2chinese(100020000, twoalt=True)
    '一亿零两万'
    """
    # check num first
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError('number out of range')
    elif 'e' in nd.lower():
        raise ValueError('scientific notation is not supported')
    c_symbol = '正负点' if simp else '正負點'
    if o:  # formal
        twoalt = False
    if big:
        c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
        c_unit1 = '拾佰仟'
        c_twoalt = '贰' if simp else '貳'
    else:
        c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
        c_unit1 = '十百千'
        if twoalt:
            c_twoalt = '两' if simp else '兩'
        else:
            c_twoalt = '二'
    c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'
    zero = c_basic[0]
    result = []
    if nd[0] == '+':
        result.append(c_symbol[0])
    elif nd[0] == '-':
        result.append(c_symbol[1])
    if '.' in nd:
        integer, remainder = nd.lstrip('+-').split('.')
    else:
        integer, remainder = nd.lstrip('+-'), None
    if int(integer):
        # Groups of four digits, least significant group first.
        groups = [integer[max(i - 4, 0):i]
                  for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(groups):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(zero)
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                # The leading zero stands for the skipped higher digits; it
                # is stripped below when this is the most significant group.
                intresult.append(zero + c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == '0':
                    if ulist:  # ???0
                        ulist.append(zero)
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == '1' and unit[1] == '0':
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == '2':
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = ''.join(reversed(ulist))
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        joined = ''.join(reversed(intresult))
        # Collapse every run of zeros to a single one, including runs that
        # span two groups (e.g. an all-zero group followed by '0002').
        joined = re.sub('%s+' % re.escape(zero), zero, joined)
        result.append(joined.strip(zero))
    else:
        result.append(zero)
    if remainder:
        result.append(c_symbol[2])
        result.append(''.join(c_basic[int(ch)] for ch in remainder))
    return ''.join(result)


def stripquotes(s):
    '''Strip opening quotes from the start and closing quotes from the end of `s`.'''
    return s.lstrip('"‘“「『').rstrip('"’”」』')


def fw2hw(s):
    '''Shift every character whose code point is in `fullwidth` down by 0xFEE0.'''
    return ''.join(
        (chr(ord(ch) - 0xFEE0) if ord(ch) in fullwidth else ch) for ch in s)


def hw2fw(s):
    '''Shift every character that is in `halfwidth` up by 0xFEE0.'''
    return ''.join(
        (chr(ord(ch) + 0xFEE0) if ch in halfwidth else ch) for ch in s)


def _test_fixsplit():
    '''Print the result of splitsentence() + fixmissing() on sample lines.'''
    test = """从高祖父到曾孙称为“九族”。这“九族”代表着长幼尊卑秩序和家族血统的承续关系。
《诗》、《书》、《易》、《礼》、《春秋》,再加上《乐》称“六经”,这是中国古代儒家的重要经典,应当仔细阅读。
这就是:宇宙间万事万物循环变化的道理的书籍。
《连山》、《归藏》、《周易》,是我国古代的三部书,这三部书合称“三易”,“三易”是用“卦”的形式来说明宇宙间万事万物循环变化的道理的书籍。
登楼而望,慨然而叹曰:“容容其山,旅旅其石,与地终也!吁嗟人乎!病之蚀气也,如水浸火。
吾闻老聃多寿,尝读其书曰:‘吾惟无身,是以无患。’盖欲窃之而未能也”齐宣王见孟子于雪宫。
“昔者齐景公问于晏子曰:‘吾欲观于转附、朝舞,遵海而南,放于琅邪。吾何修而可以比于先王观也?’
高祖说:“该怎样对付呢?”陈平说:“古代天子有巡察天下,召集诸侯。南方有云梦这个地方,陛下只管假装外出巡游云梦,在陈地召集诸侯。陈地在楚国的西边边境上,韩信听说天子因为爱好外出巡游,看形势必然没有什么大事,就会到国境外来拜见陛下。拜见,陛下趁机抓住他,这只是一个力士的事情而已。”“不知道。”高祖认为有道理。
。他们就是这样的。
""".strip().split('\n')
    for s in test:
        print(fixmissing(splitsentence(s)))

if __name__ == '__main__':
    _test_fixsplit()
    print(' '.join(addwallzone('《连山》、《归藏》、《周易》,是我国古代的三部书,这三部书合称“三易”,“三易”是用“卦”的形式来说明(宇宙间万事万物循环变化的道理的书籍。')))

# --------------------------------------------------------------------------------