├── tg-server.pub ├── templates ├── history.txt └── simple.html ├── getlog ├── .gitignore ├── tglive.py ├── avatar.py ├── README.md ├── dbconvert.py ├── LICENSE ├── tgcli.py ├── export.py └── logfmt.py /tg-server.pub: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PUBLIC KEY----- 2 | MIIBCgKCAQEAwVACPi9w23mF3tBkdZz+zwrzKOaaQdr01vAbU4E1pvkfj4sqDsm6 3 | lyDONS789sVoD/xCS9Y0hkkC3gtL1tSfTlgCMOOul9lcixlEKzwKENj1Yz/s7daS 4 | an9tqw3bfUV/nqgbhGX81v/+7RFAEd+RwFnK7a+XYl9sluzHRyVVaTTveB2GazTw 5 | Efzk2DWgkBluml8OREmvfraX3bkHZJTKX4EQSjBbbdJ2ZXIsRrYOXfaA+xayEGB+ 6 | 8hdlLmAjbCVfaigxX0CDqWeR1yFL9kwd9P0NsZRPsmoqVwMbMu7mStFai6aIhc3n 7 | Slv8kg9qv1m6XHVQY3PnEw+QQtqSIXklHwIDAQAB 8 | -----END RSA PUBLIC KEY----- 9 | 10 | -------------------------------------------------------------------------------- /templates/history.txt: -------------------------------------------------------------------------------- 1 | {{ peer.print }} 2 | {% if count -%} 3 | From {{ start|strftime('%Y-%m-%d %H:%M:%S') }} to {{ end|strftime('%Y-%m-%d %H:%M:%S') }}, total {{ count }} 4 | {%- endif %} 5 | {% for msg in msgs %}[{{ msg.date|strftime('%Y-%m-%d %H:%M:%S') }}] {{ msg.src.print }}{% if msg.msgtype == 'fwd' %} [Fwd: {{ msg.extra.fwd_src.print }}] 6 | {%- elif msg.msgtype == 're' %} [Re: {{ msg.extra.reply.mid }}] 7 | {%- endif %} >>>{% if msg.text %} {{ msg.text }}{% endif %}{% if msg.media %} [{{ msg.media.type|d('IRC') }}]{% endif %}{% if msg.service %} [{{ msg.action.type }}]{% endif %} 8 | {% endfor %} 9 | -------------------------------------------------------------------------------- /getlog: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BOT_LOG_FILE=chatlog.db 4 | PEER_ID=-12345678 5 | PEER_TITLE='Some Interesting Group' 6 | CACHE_PATH=img 7 | URL_PREFIX=/img/ 8 | HARD_LIMIT=100000 9 | 10 | if [ "$HTTP_IF_MODIFIED_SINCE" ]; then 11 | if [ "$(date -u -d "$HTTP_IF_MODIFIED_SINCE" +%s)" -ge "$(stat -c %Y $BOT_LOG_FILE)" ]; then 12 | echo 'Status: 304 Not Modified' 13 | echo 14 | exit 0 15 | fi 16 | fi 17 | 18 | echo 'Status: 200 OK' 19 | echo 'Content-Type: text/html; charset=utf-8' 20 | echo "Last-Modified: $(date -R -u -d @$(stat -c %Y $BOT_LOG_FILE))" 21 | echo 22 | 23 | if [ "$QUERY_STRING" ]; then 24 | limit="$(echo "$QUERY_STRING" | tr -Cd [[:digit:],])" 25 | else 26 | limit=500 27 | fi 28 | 29 | python3 logfmt.py -b $BOT_LOG_FILE -d '' -t html -D=$PEER_ID -o=- -P="$PEER_TITLE" -l $limit -L $HARD_LIMIT -c $CACHE_PATH -r $URL_PREFIX $PEER_ID 30 | 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | *.db 60 | *.db-journal 61 | -------------------------------------------------------------------------------- /tglive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | Proof-of-concept telegram message live broadcasting with 6 | [live-danmaku-hime](https://github.com/m13253/live-danmaku-hime) 7 | ''' 8 | 9 | import sys 10 | import time 11 | import tgcli 12 | import jinja2 13 | import logging 14 | import textwrap 15 | 16 | logging.basicConfig(stream=sys.stderr, 17 | format='%(asctime)s [%(levelname)s] %(message)s', level=logging.DEBUG) 18 | 19 | txt_template = '''[{{ msg.date|strftime('%H:%M') }} {{ msg.to.print_name[:8] }}] {{ msg.from.print_name }}{% if 'fwd_from' in msg %} [Fwd: {{ msg.fwd_from.print_name }}] 20 | {%- elif 'reply_id' in msg %} [Re] 21 | {%- endif %} >{% if msg.text %} {{ msg.text }}{% endif %}{% if msg.media %} [{{ msg.media.type }}]{% endif %}{% if msg.service %} [{{ msg.action.type }}]{% endif %}''' 22 | 23 | jinjaenv = jinja2.Environment(loader=jinja2.DictLoader({'txt': txt_template})) 24 | jinjaenv.filters['strftime'] = lambda date, fmt='%Y-%m-%d %H:%M:%S': time.strftime(fmt, time.localtime(date)) 25 | 26 | template = jinjaenv.get_template('txt') 27 | 28 | WIDTH = 35 29 | 30 | def print_msg(msg): 31 | logging.debug(msg) 32 | try: 33 | if msg.get('event') in ('message', 'service'): 34 | s = template.render(msg=msg).strip() 35 | s = '\n'.join(textwrap.wrap(s, WIDTH)) + '\n' 36 | sys.stdout.write(s) 37 | sys.stdout.flush() 38 | except Exception: 39 | logging.exception('Failed to process a message.') 40 | 41 | 42 | with tgcli.TelegramCliInterface(sys.argv[1]) as c: 43 | c.on_json = print_msg 44 | for ln in sys.stdin: 45 | l = ln.strip() 46 | if l == 'q': 47 | break 48 | elif l.isdigit(): 49 | WIDTH = int(l) 50 | -------------------------------------------------------------------------------- /avatar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import logging 7 | import argparse 8 | 9 | import tgcli 10 | 11 | logging.basicConfig(stream=sys.stdout, format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO) 12 | 13 | def export_avatar_peer(tc, peertype, pid, filename): 14 | peername = '%s#id%d' % (peertype, pid) 15 | if os.path.isfile(filename): 16 | logging.info('Avatar exists: ' + peername) 17 | return 18 | res = getattr(tc, 'cmd_load_%s_photo' % peertype)(peername) 19 | if 'result' in res and res['result'] != 'FAIL': 20 | os.rename(res['result'], filename) 21 | logging.info('Exported avatar for %s' % peername) 22 | else: 23 | logging.warning('Failed to export avatar for %s: %s' % (peername, res)) 24 | 25 | def export_avatar_group(tc, grouptype, pid, path): 26 | peername = '%s#id%d' % (grouptype, pid) 27 | members = {} 28 | logging.info('Fetching info for %s' % peername) 29 | if grouptype == 'channel': 30 | items = tc.cmd_channel_get_members(peername, 100) 31 | for item in items: 32 | members[item['peer_id']] = item 33 | dcount = 100 34 | while items: 35 | items = tc.cmd_channel_get_members(peername, 100, dcount) 36 | for item in items: 37 | members[item['peer_id']] = item 38 | dcount += 100 39 | else: 40 | obj = tc.cmd_chat_info(peername) 41 | for item in obj['members']: 42 | members[item['peer_id']] = item 43 | for key in members: 44 | export_avatar_peer(tc, 'user', key, os.path.join(path, '%d.jpg' % key)) 45 | 46 | def main(argv): 47 | parser = argparse.ArgumentParser(description="Export Telegram messages.") 48 | parser.add_argument("-o", "--output", help="output path", default="export") 49 | parser.add_argument("-g", "--group", help="export every user's avatar in a group or channel", action='store_true') 50 | parser.add_argument("-t", "--type", help="peer type, can be 'user', 'chat', 'channel'", default="user") 51 | parser.add_argument("-i", "--id", help="peer id", type=int) 52 | parser.add_argument("-e", "--tgbin", help="Telegram-cli binary path", default="bin/telegram-cli") 53 | args = parser.parse_args(argv) 54 | 55 | with tgcli.TelegramCliInterface(args.tgbin, run=False) as tc: 56 | tc.cmd_dialog_list() 57 | if not os.path.isdir(args.output): 58 | os.mkdir(args.output) 59 | if args.group: 60 | export_avatar_group(tc, args.type, args.id, args.output) 61 | else: 62 | export_avatar_peer(tc, args.type, args.id, os.path.join(args.output, '%s%d.jpg' % (args.type, args.id))) 63 | 64 | if __name__ == '__main__': 65 | main(sys.argv[1:]) 66 | -------------------------------------------------------------------------------- /templates/simple.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ peer.print|escape }} 6 | 17 | 18 | 19 | 20 | 21 | 22 | {% for msg in msgs -%} 23 | 24 | 25 | 26 | 70 | 71 | {%- endfor %} 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tg-export 2 | 3 | **Deprecation notice**: Since tg-cli is not maintained and becomes unusable, this project is deprecated. Please use alternatives like [telegram-export](https://github.com/expectocode/telegram-export) instead. 4 | 5 | Export Telegram messages, using [telegram-cli](https://github.com/vysheng/tg). [Patched version](https://github.com/gumblex/tg) recommended. 6 | 7 | This version (v3) is compatible with `vysheng/tg/master` AND `vysheng/tg/test` 8 | branches. 9 | 10 | **Note**: The database format of this version (v3) is not compatible with the old ones. 11 | To convert old databases (v1 or v2), run `python3 dbconvert.py [old.db [new.db]]` 12 | 13 | ## export.py 14 | 15 | ``` 16 | $ python3 export.py -h 17 | usage: export.py [-h] [-o OUTPUT] [-d DB] [-f] [-p PEER] [-B] [-t TIMEOUT] 18 | [-l] [-L] [-e TGBIN] [-v] 19 | 20 | Export Telegram messages. 21 | 22 | optional arguments: 23 | -h, --help show this help message and exit 24 | -o OUTPUT, --output OUTPUT 25 | output path 26 | -d DB, --db DB database path 27 | -f, --force force download all messages 28 | -p PEER, --peer PEER only download messages for this peer (format: 29 | channel#id1001234567, or use partial name/title as 30 | shown in tgcli) 31 | -B, --batch-only fetch messages in batch only, don't try to get more 32 | missing messages 33 | -t TIMEOUT, --timeout TIMEOUT 34 | tg-cli command timeout 35 | -l, --logging logging mode (keep running) 36 | -L, --keep-logging first export, then keep logging 37 | -e TGBIN, --tgbin TGBIN 38 | telegram-cli binary path 39 | -v, --verbose print debug messages 40 | ``` 41 | 42 | **Lots** of workaround about the unreliability of tg-cli is included (in this script and `tgcli.py`), so the script itself may be unreliable as well. 43 | 44 | Common problems with tg-cli are: 45 | * Dies arbitrarily. 46 | * No response in the socket interface. 47 | * Slow response in the socket interface. 48 | * Half response in the socket interface, while the another half appears after the timeout. 49 | * Returns an empty array when actually there are remaining messages. 50 | 51 | **Note**: When it's trying to get the remaining messages, the telegram-cli will crash like crazy. That's due to non-existent messages. For a quick fix, use [this fork](https://github.com/gumblex/tg) of tg-cli. 52 | 53 | Which is called NO WARRANTY™. 54 | 55 | ## logfmt.py 56 | 57 | This script can process database written by `export.py` or [tg-chatdig](https://github.com/gumblex/tg-chatdig), and write out a human-readable format (txt, html, etc.) according to a jinja2 template. 58 | 59 | ``` 60 | usage: logfmt.py [-h] [-o OUTPUT] [-d DB] [-b BOTDB] [-D BOTDB_DEST] [-u] 61 | [-t TEMPLATE] [-P PEER_PRINT] [-l LIMIT] [-L HARDLIMIT] 62 | [-c CACHEDIR] [-r URLPREFIX] 63 | peer 64 | 65 | Format exported database file into human-readable format. 66 | 67 | positional arguments: 68 | peer export certain peer id or tg-cli-style peer print name 69 | 70 | optional arguments: 71 | -h, --help show this help message and exit 72 | -o OUTPUT, --output OUTPUT 73 | output path 74 | -d DB, --db DB tg-export database path 75 | -b BOTDB, --botdb BOTDB 76 | tg-chatdig bot database path 77 | -D BOTDB_DEST, --botdb-dest BOTDB_DEST 78 | tg-chatdig bot logged chat id or tg-cli-style peer 79 | name 80 | -u, --botdb-user use user information in tg-chatdig database first 81 | -t TEMPLATE, --template TEMPLATE 82 | export template, can be 'txt'(default), 'html', 83 | 'json', or template file name 84 | -P PEER_PRINT, --peer-print PEER_PRINT 85 | set print name for the peer 86 | -l LIMIT, --limit LIMIT 87 | limit the number of fetched messages and set the 88 | offset 89 | -L HARDLIMIT, --hardlimit HARDLIMIT 90 | set a hard limit of the number of messages, must be 91 | used with -l 92 | -c CACHEDIR, --cachedir CACHEDIR 93 | the path of media files 94 | -r URLPREFIX, --urlprefix URLPREFIX 95 | the url prefix of media files 96 | ``` 97 | 98 | ## tgcli.py 99 | Simple wrapper for telegram-cli interface. 100 | 101 | Example: 102 | ```python 103 | tgcli = TelegramCliInterface('../tg/bin/telegram-cli') 104 | dialogs = tgcli.cmd_dialog_list() 105 | ``` 106 | 107 | ### TelegramCliInterface(cmd, extra_args=(), run=True) 108 | 109 | * `run()` starts the subprocess, needed when object created with `run=False`. 110 | * `send_command(cmd, timeout=180, resync=True)` sends a command to tg-cli. use `resync` for consuming text since last timeout. 111 | * `cmd_*(*args, **kwargs)` is the convenience method to send a command and get response. `args` are for the command, `kwargs` are arguments for `TelegramCliInterface.send_command`. 112 | * `on_info(text)`(callback) is called when a line of text is printed on stdout. 113 | * `on_json(obj)`(callback) is called with the interpreted object when a line of json is printed on stdout. 114 | * `on_text(text)`(callback) is called when a line of anything is printed on stdout. 115 | * `on_start()`(callback) is called after telegram-cli starts. 116 | * `on_exit()`(callback) is called after telegram-cli dies. 117 | * `close()` properly ends the subprocess. 118 | 119 | `do_nothing()` function does nothing. (for callbacks) 120 | 121 | `TelegramCliExited` exception is raised if telegram-cli dies when reading an answer. 122 | 123 | ## License 124 | 125 | Now it's LGPLv3+. 126 | -------------------------------------------------------------------------------- /dbconvert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import struct 7 | import sqlite3 8 | import binascii 9 | import collections 10 | 11 | class tgl_peer_id_t(collections.namedtuple('tgl_peer_id_t', 'peer_type peer_id access_hash')): 12 | ''' 13 | typedef struct { 14 | int peer_type; 15 | int peer_id; 16 | long long access_hash; 17 | } tgl_peer_id_t; 18 | ''' 19 | TGL_PEER_USER = 1 20 | TGL_PEER_CHAT = 2 21 | TGL_PEER_GEO_CHAT = 3 22 | TGL_PEER_ENCR_CHAT = 4 23 | TGL_PEER_CHANNEL = 5 24 | TGL_PEER_TEMP_ID = 100 25 | TGL_PEER_RANDOM_ID = 101 26 | TGL_PEER_UNKNOWN = 0 27 | 28 | @classmethod 29 | def loads(cls, s): 30 | return cls._make(struct.unpack(' 1: 141 | FILENAME_IN = sys.argv[1] 142 | if len(sys.argv) > 2: 143 | FILENAME_OUT = sys.argv[2] 144 | 145 | if not os.path.isfile(FILENAME_IN): 146 | print('Database file not found.') 147 | sys.exit(1) 148 | 149 | DB_IN = sqlite3.connect(FILENAME_IN) 150 | CUR_IN = DB_IN.cursor() 151 | 152 | for n in CUR_IN.execute("SELECT name FROM sqlite_master WHERE type='table'"): 153 | if n[0] == 'exportinfo': 154 | VER = 1 155 | break 156 | elif n[0] == 'peerinfo': 157 | VER = 2 158 | break 159 | else: 160 | print('Database not recognized.') 161 | sys.exit(1) 162 | 163 | print('Converting database:') 164 | 165 | DB = sqlite3.connect(FILENAME_OUT) 166 | CUR = DB.cursor() 167 | init_db(CUR) 168 | 169 | if VER == 1: 170 | print('* messages') 171 | for mid, src, dest, text, media, date, fwd_src, fwd_date, reply_id, out, unread, service, action, flags in CUR_IN.execute('SELECT * FROM messages ORDER BY id ASC'): 172 | CUR.execute('REPLACE INTO messages VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', (mid, convert_peerid1(src), convert_peerid1(dest), text, media, date, convert_peerid1(fwd_src), fwd_date, reply_id, out, unread, service, action, flags)) 173 | print('* users') 174 | for pid, phone, username, first_name, last_name, flags in CUR_IN.execute('SELECT * FROM users'): 175 | CUR.execute('REPLACE INTO users VALUES (?,?,?,?,?,?,?)', (pid, 0, phone, username, first_name, last_name, flags)) 176 | print('* chats') 177 | for pid, title, members_num, flags in CUR_IN.execute('SELECT * FROM chats'): 178 | CUR.execute('REPLACE INTO chats VALUES (?,?,?,?,?)', (pid, 0, title, members_num, flags)) 179 | print('* peerinfo') 180 | for pid, print_name, finished in CUR_IN.execute('SELECT * FROM exportinfo'): 181 | CUR.execute('REPLACE INTO peerinfo VALUES (?,?,?,?)', (convert_peerid1(pid), 'chat' if pid < 0 else 'user', print_name, finished)) 182 | else: 183 | print('* messages') 184 | for mid, src, dest, text, media, date, fwd_src, fwd_date, reply_id, out, unread, service, action, flags in CUR_IN.execute('SELECT * FROM messages ORDER BY date, id ASC'): 185 | CUR.execute('REPLACE INTO messages VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', (tgl_message_id_t.loads(mid).id, convert_peerid2(src), convert_peerid2(dest), text, media, date, convert_peerid2(fwd_src), fwd_date, convert_msgid2(reply_id), out, unread, service, action, flags)) 186 | print('* users') 187 | for pid, permanent_id, phone, username, first_name, last_name, flags in CUR_IN.execute('SELECT * FROM users'): 188 | CUR.execute('REPLACE INTO users VALUES (?,?,?,?,?,?,?)', (pid, tgl_peer_id_t.loads(permanent_id).access_hash, phone, username, first_name, last_name, flags)) 189 | print('* chats') 190 | for pid, permanent_id, title, members_num, flags in CUR_IN.execute('SELECT * FROM chats'): 191 | CUR.execute('REPLACE INTO chats VALUES (?,?,?,?,?)', (pid, tgl_peer_id_t.loads(permanent_id).access_hash, title, members_num, flags)) 192 | print('* channels') 193 | for pid, permanent_id, title, participants_count, admins_count, kicked_count, flags in CUR_IN.execute('SELECT * FROM channels'): 194 | CUR.execute('REPLACE INTO channels VALUES (?,?,?,?,?,?,?)', (pid, tgl_peer_id_t.loads(permanent_id).access_hash, title, participants_count, admins_count, kicked_count, flags)) 195 | print('* peerinfo') 196 | for pid, ptype, print_name, finished in CUR_IN.execute('SELECT * FROM peerinfo'): 197 | CUR.execute('REPLACE INTO peerinfo VALUES (?,?,?,?)', (convert_peerid2(pid), ptype, print_name, finished)) 198 | 199 | DB.commit() 200 | print('Done.') 201 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /tgcli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import time 6 | import json 7 | import socket 8 | import shutil 9 | import signal 10 | import logging 11 | import tempfile 12 | import threading 13 | import subprocess 14 | 15 | ''' 16 | tgcli.py - Library to interact with telegram-cli. 17 | Copyright (C) 2015-2016 Dingyuan Wang 18 | 19 | This program is free software: you can redistribute it and/or modify 20 | it under the terms of the GNU Lesser General Public License as 21 | published by the Free Software Foundation, either version 3 of the 22 | License, or (at your option) any later version. 23 | 24 | This program is distributed in the hope that it will be useful, 25 | but WITHOUT ANY WARRANTY; without even the implied warranty of 26 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 27 | GNU Lesser General Public License for more details. 28 | 29 | You should have received a copy of the GNU Lesser General Public 30 | License along with this program. If not, see 31 | . 32 | ''' 33 | 34 | tg_server_pub = '''-----BEGIN RSA PUBLIC KEY----- 35 | MIIBCgKCAQEAwVACPi9w23mF3tBkdZz+zwrzKOaaQdr01vAbU4E1pvkfj4sqDsm6 36 | lyDONS789sVoD/xCS9Y0hkkC3gtL1tSfTlgCMOOul9lcixlEKzwKENj1Yz/s7daS 37 | an9tqw3bfUV/nqgbhGX81v/+7RFAEd+RwFnK7a+XYl9sluzHRyVVaTTveB2GazTw 38 | Efzk2DWgkBluml8OREmvfraX3bkHZJTKX4EQSjBbbdJ2ZXIsRrYOXfaA+xayEGB+ 39 | 8hdlLmAjbCVfaigxX0CDqWeR1yFL9kwd9P0NsZRPsmoqVwMbMu7mStFai6aIhc3n 40 | Slv8kg9qv1m6XHVQY3PnEw+QQtqSIXklHwIDAQAB 41 | -----END RSA PUBLIC KEY----- 42 | ''' 43 | 44 | logger = logging.getLogger('tgcli') 45 | logger.setLevel(logging.INFO) 46 | do_nothing = lambda *args, **kwargs: None 47 | 48 | def preexec_ignore_sigint(): 49 | ''' 50 | Ignore the SIGINT signal by setting the handler to the standard 51 | signal handler SIG_IGN. 52 | ''' 53 | signal.signal(signal.SIGINT, signal.SIG_IGN) 54 | 55 | class TelegramCliExited(RuntimeError): 56 | pass 57 | 58 | class TelegramCliInterface: 59 | def __init__(self, cmd, extra_args=(), run=True, timeout=60, ignore_sigint=True): 60 | self.cmd = cmd 61 | self.extra_args = tuple(extra_args) 62 | self.proc = None 63 | self.sock = None 64 | self.buffer = b'' 65 | self.ready = threading.Event() 66 | self.closed = False 67 | self.thread = None 68 | self.tmpdir = tempfile.mkdtemp() 69 | self.timeout = timeout 70 | self.ignore_sigint = ignore_sigint 71 | # Event callbacks 72 | # `on_info`, `on_json` and `on_text` are for stdout 73 | self.on_info = logger.info 74 | self.on_json = logger.debug 75 | self.on_text = do_nothing 76 | self.on_start = lambda: logger.info('Telegram-cli started.') 77 | self.on_exit = lambda: logger.warning('Telegram-cli died.') 78 | if run: 79 | self.run() 80 | 81 | def _get_pubkey(self): 82 | tgdir = os.path.abspath(os.path.join(os.path.dirname( 83 | os.path.realpath(self.cmd)), '..')) 84 | paths = [ 85 | os.path.join(tgdir, 'tg-server.pub'), 86 | os.path.join(tgdir, 'server.pub'), 87 | '/etc/telegram-cli/server.pub', 88 | '/usr/local/etc/telegram-cli/server.pub', 89 | os.path.join(self.tmpdir, 'tg-server.pub') 90 | ] 91 | for path in paths: 92 | if os.path.isfile(path): 93 | return path 94 | else: 95 | with open(path, 'w') as f: 96 | f.write(tg_server_pub) 97 | return path 98 | 99 | def checkproc(self): 100 | if self.closed or self.proc and self.proc.poll() is None: 101 | return self.proc 102 | sockfile = os.path.join(self.tmpdir, 'tgcli.sock') 103 | if os.path.exists(sockfile): 104 | os.unlink(sockfile) 105 | self.proc = subprocess.Popen((self.cmd, '-k', self._get_pubkey(), 106 | '--json', '-R', '-C', '-S', sockfile) + self.extra_args, 107 | stdin=subprocess.PIPE, stdout=subprocess.PIPE, 108 | stderr=subprocess.STDOUT, 109 | preexec_fn=preexec_ignore_sigint if self.ignore_sigint else None) 110 | while not os.path.exists(sockfile): 111 | time.sleep(0.5) 112 | self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) 113 | self.sock.connect(sockfile) 114 | return self.proc 115 | 116 | def _run_cli(self): 117 | while not self.closed: 118 | self.checkproc() 119 | try: 120 | while not self.closed: 121 | out = self.proc.stdout.readline().decode('utf-8') 122 | if not out: 123 | break 124 | elif not self.ready.is_set(): 125 | self.on_start() 126 | self.ready.set() 127 | self.on_text(out) 128 | if out[0] in '[{': 129 | try: 130 | self.on_json(json.loads(out.strip())) 131 | except ValueError: 132 | self.on_info(out.strip()) 133 | else: 134 | self.on_info(out.strip()) 135 | except BrokenPipeError: 136 | pass 137 | finally: 138 | try: 139 | self.sock.shutdown(socket.SHUT_RDWR) 140 | except Exception: 141 | pass 142 | if self.proc and self.proc.poll() is None: 143 | self.proc.terminate() 144 | self.proc.wait() 145 | self.ready.clear() 146 | self.on_exit() 147 | 148 | def run(self): 149 | self.thread = threading.Thread(target=self._run_cli) 150 | self.thread.daemon = True 151 | self.thread.start() 152 | self.ready.wait() 153 | 154 | def restart(self): 155 | self.close() 156 | self.closed = False 157 | self.tmpdir = tempfile.mkdtemp() 158 | self.run() 159 | 160 | def close(self): 161 | if self.closed: 162 | return 163 | self.closed = True 164 | self.ready.clear() 165 | try: 166 | self.proc.wait(2) 167 | except subprocess.TimeoutExpired: 168 | self.proc.kill() 169 | if self.thread: 170 | self.thread.join(1) 171 | if os.path.isdir(self.tmpdir): 172 | shutil.rmtree(self.tmpdir, True) 173 | self.tmpdir = None 174 | 175 | def __enter__(self): 176 | if not self.thread: 177 | self.run() 178 | self.ready.wait() 179 | return self 180 | 181 | def __exit__(self, exc_type, exc_value, traceback): 182 | self.close() 183 | 184 | def __del__(self): 185 | self.close() 186 | 187 | def _readline(self): 188 | while self.ready.is_set(): 189 | lines = self.buffer.split(b'\n', 1) 190 | if len(lines) > 1: 191 | self.buffer = lines[1] 192 | return lines[0] + b'\n' 193 | else: 194 | self.buffer += self.sock.recv(1024) 195 | # usually there is an assertion error 196 | raise TelegramCliExited('telegram-cli unexpectedly exited.') 197 | 198 | def send_command(self, cmd, timeout=None, resync=True): 199 | ''' 200 | Send a command to tg-cli. 201 | use `resync` for consuming text since last timeout. 202 | ''' 203 | logger.debug(cmd) 204 | self.ready.wait() 205 | self.sock.settimeout(timeout or self.timeout) 206 | self.sock.sendall(cmd.encode('utf-8') + b'\n') 207 | line = self._readline() 208 | while resync and not line.startswith(b'ANSWER '): 209 | line = self._readline() 210 | size = int(line[7:].decode('ascii')) 211 | reply = b'' 212 | while len(reply) < size: 213 | reply += self._readline() 214 | ret = reply.decode('utf-8') 215 | try: 216 | return json.loads(ret) 217 | except ValueError: 218 | return ret 219 | 220 | def __getattr__(self, name): 221 | ''' 222 | Convenience command calling: cmd_*(*args, **kwargs) 223 | `args` are for the tg-cli command 224 | `kwargs` are for `send_command` 225 | ''' 226 | if name.startswith('cmd_'): 227 | fn = lambda *args, **kwargs: self.send_command( 228 | ' '.join(map(str, (name[4:],) + args)), **kwargs) 229 | return fn 230 | else: 231 | raise AttributeError('TelegramCliInterface has no attribute %r' % name) 232 | 233 | if __name__ == "__main__": 234 | import sys 235 | logging.basicConfig(stream=sys.stderr, format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO) 236 | with TelegramCliInterface(sys.argv[1]) as tgcli: 237 | for ln in sys.stdin: 238 | try: 239 | cmd = ln.strip() 240 | print(tgcli.send_command(cmd)) 241 | except Exception: 242 | logging.exception('Failed to execute: ' + cmd) 243 | -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import sys 6 | import json 7 | import time 8 | import queue 9 | import random 10 | import socket 11 | import struct 12 | import sqlite3 13 | import logging 14 | import argparse 15 | import binascii 16 | import functools 17 | import collections 18 | 19 | import tgcli 20 | 21 | __version__ = '3.0' 22 | 23 | re_msglist = re.compile(r'^\[.*\]$') 24 | re_onemsg = re.compile(r'^\{.+\}$') 25 | re_getmsg = re.compile(r'^\*\*\* [0-9.]+ id=\d+$') 26 | 27 | logging.basicConfig(stream=sys.stdout, format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO) 28 | 29 | class LRUCache: 30 | 31 | def __init__(self, maxlen): 32 | self.capacity = maxlen 33 | self.cache = collections.OrderedDict() 34 | 35 | def __getitem__(self, key): 36 | value = self.cache.pop(key) 37 | self.cache[key] = value 38 | return value 39 | 40 | def get(self, key): 41 | try: 42 | value = self.cache.pop(key) 43 | self.cache[key] = value 44 | return value 45 | except KeyError: 46 | return None 47 | 48 | def __setitem__(self, key, value): 49 | try: 50 | self.cache.pop(key) 51 | except KeyError: 52 | if len(self.cache) >= self.capacity: 53 | self.cache.popitem(last=False) 54 | self.cache[key] = value 55 | 56 | def retry_or_log(attempts=2): 57 | def decorator(func): 58 | @functools.wraps(func) 59 | def wrapped(*args, **kwargs): 60 | for att in range(attempts): 61 | try: 62 | return func(*args, **kwargs) 63 | except Exception as ex: 64 | if att == attempts-1: 65 | logging.exception('Wrapped function failed.') 66 | return wrapped 67 | return decorator 68 | 69 | def uniq(seq, key=None): # Dave Kirby 70 | # Order preserving 71 | seen = set() 72 | if key: 73 | return [x for x in seq if key(x) not in seen and not seen.add(key(x))] 74 | else: 75 | return [x for x in seq if x not in seen and not seen.add(x)] 76 | 77 | class tgl_peer_id_t(collections.namedtuple('tgl_peer_id_t', 'peer_type peer_id access_hash')): 78 | ''' 79 | typedef struct { 80 | int peer_type; 81 | int peer_id; 82 | long long access_hash; 83 | } tgl_peer_id_t; 84 | ''' 85 | TGL_PEER_USER = 1 86 | TGL_PEER_CHAT = 2 87 | TGL_PEER_GEO_CHAT = 3 88 | TGL_PEER_ENCR_CHAT = 4 89 | TGL_PEER_CHANNEL = 5 90 | TGL_PEER_TEMP_ID = 100 91 | TGL_PEER_RANDOM_ID = 101 92 | TGL_PEER_UNKNOWN = 0 93 | 94 | @classmethod 95 | def loads(cls, s): 96 | return cls._make(struct.unpack('flags & TGLMF_CREATED)) { return res; } 293 | if ret or 'flags' in msg: 294 | CONN.execute('REPLACE INTO messages VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', (getmsgid(msg, 'id'), getpeerid(msg, 'from'), getpeerid(msg, 'to'), msg.get('text'), json.dumps(msg['media']) if 'media' in msg else None, msg.get('date'), getpeerid(msg, 'fwd_from'), msg.get('fwd_date'), getmsgid(msg, 'reply_id'), msg.get('out'), msg.get('unread'), msg.get('service'), json.dumps(msg['action']) if 'action' in msg else None, msg.get('flags'))) 295 | return ret 296 | 297 | def process(obj): 298 | if isinstance(obj, list): 299 | if not obj: 300 | return (False, 0) 301 | hit = 0 302 | for msg in obj: 303 | hit += log_msg(msg) 304 | return (True, hit) 305 | elif isinstance(obj, dict): 306 | msg = obj 307 | if msg.get('event') in ('message', 'service', 'read'): 308 | return (True, log_msg(msg)) 309 | elif msg.get('event') == 'online-status': 310 | update_peer(msg['user']) 311 | elif 'peer' in msg: 312 | update_peer(msg['peer']) 313 | elif msg.get('result') == 'FAIL': 314 | if 'can not parse' in msg.get('error', ''): 315 | TGCLI.cmd_dialog_list() 316 | #raise ValueError(msg.get('error')) 317 | return (False, 0) 318 | return (None, None) 319 | # ignore non-json lines 320 | elif obj == '': 321 | raise ValueError('empty line received') 322 | return (None, None) 323 | 324 | def purge_queue(): 325 | while 1: 326 | try: 327 | process(MSG_Q.get_nowait()) 328 | except queue.Empty: 329 | break 330 | 331 | def on_start(): 332 | logging.info('Telegram-cli started.') 333 | time.sleep(2) 334 | TGCLI.cmd_dialog_list() 335 | logging.info('Telegram-cli is ready.') 336 | 337 | def logging_fmt(msg): 338 | if msg.get('event') == 'message': 339 | dst = msg['to']['print_name'] if 'to' in msg else '' 340 | src = msg['from']['print_name'] if 'from' in msg else '' 341 | return ' '.join(filter(None, (dst, src, '>>>', msg.get('text', ''), str(msg.get('media', ''))))) 342 | elif msg.get('event') == 'service': 343 | dst = msg['to']['print_name'] if 'to' in msg else '' 344 | src = msg['from']['print_name'] if 'from' in msg else '' 345 | return ' '.join(filter(None, (dst, src, '>>>', str(msg.get('action', ''))))) 346 | else: 347 | return repr(msg)[:100] 348 | 349 | def logging_status(pos, end=False, seg=1000, length=None): 350 | if pos % seg: 351 | sys.stdout.write('.') 352 | elif pos: 353 | if length: 354 | sys.stdout.write('%.2f%%' % (pos * 100 / length)) 355 | else: 356 | sys.stdout.write(str(pos)) 357 | if end: 358 | if pos and pos % seg: 359 | if length: 360 | sys.stdout.write('%.2f%%\n' % (pos * 100 / length)) 361 | else: 362 | sys.stdout.write('%d\n' % pos) 363 | else: 364 | sys.stdout.write('\n') 365 | sys.stdout.flush() 366 | 367 | def export_for(item, pos=0, force=False): 368 | logging.info('Exporting messages for %s from %d' % (item['print_name'], pos)) 369 | try: 370 | # Get the first 100 371 | if not pos: 372 | update_peer(item) 373 | msglist = TGCLI.cmd_history(print_id(item), 100) 374 | res = process(msglist) 375 | logging_status(pos) 376 | pos = 100 377 | else: 378 | res = (True, 0) 379 | # Get the recently updated messages until overlapped 380 | while res[0] is True and not res[1]: 381 | msglist = TGCLI.cmd_history(print_id(item), 100, pos) 382 | res = process(msglist) 383 | logging_status(pos) 384 | pos += 100 385 | # If force, then continue 386 | if not force: 387 | pos = max(pos, is_finished(item)) 388 | # Else, get messages from the offset of last time 389 | # Until no message is returned (may be not true) 390 | while res[0] is True: 391 | msglist = TGCLI.cmd_history(print_id(item), 100, pos) 392 | res = process(msglist) 393 | logging_status(pos) 394 | pos += 100 395 | except Exception: 396 | logging_status(pos, True) 397 | if pos > is_finished(item): 398 | set_finished(item, pos) 399 | return pos 400 | logging_status(pos, True) 401 | set_finished(item, pos) 402 | 403 | def find_holes(minv, maxv, s): 404 | for n in range(minv, maxv + 1): 405 | if n not in s: 406 | yield n 407 | 408 | def export_holes(): 409 | ''' 410 | Try to get remaining messages by using message id. 411 | ''' 412 | # First we get messages that belong to ourselves, 413 | # i.e. not channel messages or encr-chat 414 | # 17179869184 = TGL_PEER_ENCR_CHAT 4<<32 415 | got = set(i[0] for i in CONN.execute('SELECT id FROM messages WHERE dest < 17179869184') if isinstance(i[0], int)) 416 | # it doesn't verify peer_type, peer_id, access_hash 417 | if got: 418 | holes = [tgl_message_id_t(1, 0, n, 0) for n in find_holes(1, max(got), got)] 419 | # Then we get channel (supergroup) messages. 420 | if TG_TEST: 421 | channels = [tgl_peer_id_t(tgl_peer_id_t.TGL_PEER_CHANNEL, *i) for i in 422 | CONN.execute('SELECT id, access_hash FROM channels')] 423 | for channel in channels: 424 | got = set(i[0] for i in CONN.execute('SELECT id FROM messages WHERE dest = ?', (channel.to_id(),)) if isinstance(i[0], int)) 425 | if got: 426 | holes.extend(tgl_message_id_t(channel.peer_type, channel.peer_id, n, channel.access_hash) for n in find_holes(1, max(got), got)) 427 | length = len(holes) 428 | logging.info('Getting the remaining %d messages...' % length) 429 | # we need some uncertainty to work around the uncertainty of telegram-cli 430 | random.shuffle(holes) 431 | # list of mids (may be str or int, depending on TG_TEST) 432 | failed = [] 433 | for k, msg in enumerate(holes, 1): 434 | if TG_TEST: 435 | mid = msg.dumps() 436 | else: 437 | mid = msg.id 438 | try: 439 | res = process(TGCLI.send_command('get_message %s' % mid)) 440 | if not res[0]: 441 | logging.warning('%r may not exist [%.2f%%]', msg[:3], (k * 100 / length)) 442 | elif k % 10 == 0: 443 | logging_status(k, False, 100, length) 444 | except tgcli.TelegramCliExited: 445 | # interface.c:4295: print_message: Assertion `M' failed. 446 | logging.warning('%r may not exist [%.2f%%]', msg[:3], (k * 100 / length)) 447 | except Exception: 448 | failed.append(mid) 449 | logging.exception('Failed to get message ID %s' % mid) 450 | logging_status(k, True, 100, length) 451 | purge_queue() 452 | while failed: 453 | length = len(failed) 454 | logging.info('Retrying the remaining %d messages...' % length) 455 | newlist = [] 456 | # see above 457 | random.shuffle(failed) 458 | for k, mid in enumerate(failed, 1): 459 | try: 460 | res = process(TGCLI.send_command('get_message %s' % mid)) 461 | except Exception: 462 | # such an old bug (`newlist` here was `failed`) 463 | newlist.append(mid) 464 | if k % 10 == 0: 465 | logging_status(k, False, 100, length) 466 | logging_status(k, True, 100, length) 467 | failed = newlist 468 | purge_queue() 469 | 470 | def export_text(peer=None, force=False): 471 | #if force: 472 | #reset_finished() 473 | logging.info('Getting contacts...') 474 | update_peer(TGCLI.cmd_get_self()) 475 | items = TGCLI.cmd_contact_list() 476 | peer_obj = None 477 | if peer: 478 | peer_match = re.match('^(\w+)#id(\d+)$', peer) 479 | for item in items: 480 | update_peer(item) 481 | purge_queue() 482 | logging.info('Getting dialogs...') 483 | dlist = items = lastitems = TGCLI.cmd_dialog_list(100) 484 | dcount = 100 485 | while items: 486 | items = TGCLI.cmd_dialog_list(100, dcount) 487 | if frozenset(d['id'] for d in items) == frozenset(d['id'] for d in lastitems): 488 | break 489 | dlist.extend(items) 490 | dcount += 100 491 | for item in dlist: 492 | update_peer(item) 493 | if peer and peer_obj is None: 494 | if not peer_match: 495 | if peer in item.get('print_name', ''): 496 | peer_obj = item 497 | elif ((item.get('peer_type') or item.get('type')) == peer_match.group(1) 498 | and str(item.get('peer_id') or item.get('id')) == peer_match.group(2)): 499 | peer_obj = item 500 | if peer_obj: 501 | logging.info('Peer: %r' % peer_obj) 502 | dlist = [peer_obj] 503 | elif peer: 504 | logging.info('Peer not found: %s' % peer) 505 | return 506 | logging.info('Exporting messages...') 507 | failed = [] 508 | # we need some uncertainty to work around the uncertainty of telegram-cli 509 | random.shuffle(dlist) 510 | for item in dlist: 511 | res = export_for(item, 0, force) 512 | if res is not None: 513 | failed.append((item, res)) 514 | logging.warning('Failed to get messages for %s from %d' % (item['print_name'], res)) 515 | purge_queue() 516 | DB.commit() 517 | while failed: 518 | newlist = [] 519 | for item, pos in failed: 520 | res = export_for(item, pos, force) 521 | if res is not None: 522 | newlist.append((item, res)) 523 | logging.warning('Failed to get messages for %s from %d' % (item['print_name'], res)) 524 | purge_queue() 525 | failed = newlist 526 | DB.commit() 527 | logging.info('Export to database completed.') 528 | 529 | DB = None 530 | CONN = None 531 | PEER_CACHE = LRUCache(10) 532 | MSG_Q = queue.Queue() 533 | TGCLI = None 534 | DLDIR = '.' 535 | TG_TEST = True 536 | 537 | def main(argv): 538 | global TGCLI, DLDIR, TG_TEST 539 | parser = argparse.ArgumentParser(description="Export Telegram messages.") 540 | parser.add_argument("-o", "--output", help="output path", default="export") 541 | parser.add_argument("-d", "--db", help="database path", default="tg-export3.db") 542 | parser.add_argument("-f", "--force", help="force download all messages", action='store_true') 543 | parser.add_argument("-p", "--peer", help="only download messages for this peer (format: channel#id1001234567, or use partial name/title as shown in tgcli)") 544 | parser.add_argument("-B", "--batch-only", help="fetch messages in batch only, don't try to get more missing messages", action='store_true') 545 | parser.add_argument("-t", "--timeout", help="tg-cli command timeout", type=int, default=30) 546 | parser.add_argument("-l", "--logging", help="logging mode (keep running)", action='store_true') 547 | parser.add_argument("-L", "--keep-logging", help="first export, then keep logging", action='store_true') 548 | parser.add_argument("-e", "--tgbin", help="telegram-cli binary path", default="bin/telegram-cli") 549 | parser.add_argument("-v", "--verbose", help="print debug messages", action='store_true') 550 | args = parser.parse_args(argv) 551 | 552 | if args.verbose: 553 | logging.getLogger().setLevel(logging.DEBUG) 554 | tgcli.logger.setLevel(logging.DEBUG) 555 | 556 | DLDIR = args.output 557 | init_db(args.db) 558 | 559 | TGCLI = tgcli.TelegramCliInterface(args.tgbin, extra_args=('-W', '-E'), run=False, timeout=args.timeout) 560 | TGCLI.on_json = MSG_Q.put 561 | TGCLI.on_info = lambda s: tgcli.logger.info(s) if not re_getmsg.match(s) else None 562 | #TGCLI.on_text = MSG_Q.put 563 | #TGCLI.on_start = on_start 564 | TGCLI.run() 565 | TGCLI.ready.wait() 566 | time.sleep(1) 567 | 568 | # the 'test' branch of tg has channel support 569 | TG_TEST = 'channel' in TGCLI.cmd_help() 570 | 571 | try: 572 | if not args.logging: 573 | export_text(args.peer, args.force) 574 | if not args.batch_only: 575 | export_holes() 576 | if args.logging or args.keep_logging: 577 | while TGCLI.ready.is_set(): 578 | d = MSG_Q.get() 579 | logging.info(logging_fmt(d)) 580 | process(d) 581 | finally: 582 | TGCLI.close() 583 | purge_queue() 584 | DB.commit() 585 | 586 | if __name__ == '__main__': 587 | sys.exit(main(sys.argv[1:])) 588 | -------------------------------------------------------------------------------- /logfmt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import re 6 | import sys 7 | import time 8 | import json 9 | import struct 10 | import sqlite3 11 | import operator 12 | import argparse 13 | import binascii 14 | import collections 15 | 16 | import jinja2 17 | 18 | re_url = re.compile(r'''\b 19 | ( 20 | # URL (gruber v2) 21 | (?: 22 | [a-z][\w-]+:(?:/{1,3}|[a-z0-9%]) 23 | | 24 | www\d{0,3}[.] 25 | | 26 | [a-z0-9.\-]+[.][a-z]{2,4}/ 27 | | 28 | magnet:\? 29 | ) 30 | (?: 31 | [^\s()<>]+ 32 | | 33 | \(([^\s()<>]+|(\([^\s()<>]+\)))*\) 34 | )+ 35 | (?: 36 | \(([^\s()<>]+|(\([^\s()<>]+\)))*\) 37 | | 38 | [^\s`!()\[\]{};:\'".,<>?«»“”‘’] 39 | ) 40 | | 41 | # BT Hash 42 | (?: 43 | [a-f0-9]{40} 44 | ) 45 | )''', re.I | re.X) 46 | re_bthash = re.compile(r'[0-9a-f]{40}|[a-z2-7]{32}', re.I) 47 | re_limit = re.compile(r'^([0-9]+)(,[0-9]+)?$') 48 | imgfmt = frozenset(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')) 49 | 50 | printname = lambda first, last='': (first + ' ' + last if last else first) or '' 51 | 52 | strftime = lambda date, fmt='%Y-%m-%d %H:%M:%S': time.strftime(fmt, time.localtime(date)) 53 | 54 | unkuser = lambda user: { 55 | 'peer_id': user['id'], 56 | 'id': tgl_peer_id_t(tgl_peer_id_t.TGL_PEER_USER, user['id'], 0).dumps(), 57 | 'first_name': user['first_name'], 58 | 'last_name': user.get('last_name'), 59 | 'username': user.get('username'), 60 | 'type': 'user', 61 | 'flags': 256, 62 | 'print': printname(user['first_name'], user.get('last_name')) 63 | } 64 | 65 | unkmsg = lambda mid: { 66 | 'mid': mid, 67 | 'src': {'id': 0, 'print': ''}, 68 | 'dest': {'id': 0, 'print': ''}, 69 | 'text': '', 70 | 'media': {}, 71 | 'date': 0, 72 | 'msgtype': '', 73 | 'extra': None, 74 | 'out': 0, 75 | 'unread': 0, 76 | 'service': 0, 77 | 'action': {}, 78 | 'flags': 0 79 | } 80 | 81 | def convert_msgid2(msgid): 82 | if msgid is None: 83 | return None 84 | elif isinstance(msgid, int): 85 | return msgid 86 | elif len(msgid) == 48: 87 | return tgl_message_id_t.loads(msgid).id 88 | else: 89 | return int(msgid) 90 | 91 | class tgl_peer_id_t(collections.namedtuple('tgl_peer_id_t', 'peer_type peer_id access_hash')): 92 | ''' 93 | typedef struct { 94 | int peer_type; 95 | int peer_id; 96 | long long access_hash; 97 | } tgl_peer_id_t; 98 | ''' 99 | TGL_PEER_USER = 1 100 | TGL_PEER_CHAT = 2 101 | TGL_PEER_GEO_CHAT = 3 102 | TGL_PEER_ENCR_CHAT = 4 103 | TGL_PEER_CHANNEL = 5 104 | TGL_PEER_TEMP_ID = 100 105 | TGL_PEER_RANDOM_ID = 101 106 | TGL_PEER_UNKNOWN = 0 107 | 108 | @classmethod 109 | def loads(cls, s): 110 | return cls._make(struct.unpack('= self.capacity: 175 | self.cache.popitem(last=False) 176 | self.cache[key] = value 177 | 178 | class StreamArray(list): 179 | def __init__(self, iterable): 180 | self.iterable = iterable 181 | 182 | def __iter__(self): 183 | return self.iterable 184 | 185 | # according to the comment below 186 | def __len__(self): 187 | return 1 188 | 189 | class PeerStore(collections.UserDict): 190 | 191 | def __init__(self, *args, **kwds): 192 | super().__init__(*args, **kwds) 193 | self.name = {} 194 | 195 | def __setitem__(self, key, value): 196 | self.data[self._convert(key)] = value 197 | 198 | def setname(self, key, value): 199 | self.name[value] = self._convert(key) 200 | 201 | def __getitem__(self, key): 202 | peerid, peertype = self._convert(key) 203 | try: 204 | return self.data[(peerid, peertype)] 205 | except KeyError: 206 | d = self.data[(peerid, peertype)] = {'id': peerid, 'type': peertype, 'print': ''} 207 | return d 208 | 209 | def find(self, key): 210 | try: 211 | return self.__getitem__(key) 212 | except Exception: 213 | if key in self.name: 214 | return self.data[self.name[key]] 215 | else: 216 | for k, v in self.name.items(): 217 | if key in k and v[1] != 'encr_chat': 218 | return self.data[v] 219 | return {'id': None, 'type': 'user', 'print': key} 220 | 221 | @staticmethod 222 | def _convert(key=None): 223 | peertype = None 224 | if isinstance(key, tuple): 225 | peerid, peertype = key 226 | else: 227 | peerid = key 228 | peer_id = None 229 | peer_type = tgl_peer_id_t.TGL_PEER_USER 230 | try: 231 | peerid = int(peerid) 232 | except ValueError: 233 | pass 234 | if isinstance(peerid, str): 235 | sp = peerid.split('#id', 1) 236 | if len(sp) == 2: 237 | peer_id = int(sp[1]) 238 | peertype = sp[0] 239 | else: 240 | peer = tgl_peer_id_t.loads(peerid) 241 | peer_id = peer.peer_id 242 | peer_type = peer.peer_type 243 | elif peerid is None: 244 | pass 245 | elif peerid > 4294967296: 246 | # 1 << 32 247 | peer_id = peerid & 4294967295 248 | peer_type = peerid >> 32 249 | else: 250 | peer_id = abs(peerid) 251 | peer_type = tgl_peer_id_t.TGL_PEER_CHAT if peerid < 0 else tgl_peer_id_t.TGL_PEER_USER 252 | if peertype: 253 | return (peer_id, peertype) 254 | elif peer_type == tgl_peer_id_t.TGL_PEER_USER: 255 | return (peer_id, 'user') 256 | elif peer_type == tgl_peer_id_t.TGL_PEER_CHAT: 257 | return (peer_id, 'chat') 258 | elif peer_type == tgl_peer_id_t.TGL_PEER_ENCR_CHAT: 259 | return (peer_id, 'encr_chat') 260 | elif peer_type == tgl_peer_id_t.TGL_PEER_CHANNEL: 261 | return (peer_id, 'channel') 262 | 263 | class Messages: 264 | 265 | def __init__(self, stream=False, template='history.txt'): 266 | self.peers = PeerStore() 267 | if stream: 268 | self.msgs = LRUCache(100) 269 | else: 270 | self.msgs = collections.OrderedDict() 271 | 272 | self.db_cli = None 273 | self.conn_cli = None 274 | self.db_cli_ver = None 275 | self.db_bot = None 276 | self.conn_bot = None 277 | 278 | self.limit = None 279 | self.hardlimit = None 280 | self.botdest = None 281 | 282 | self.template = template 283 | self.stream = stream 284 | # can be 'bot', 'cli' or None (no conversion) 285 | self.media_format = 'cli' 286 | self.cachedir = None 287 | self.urlprefix = None 288 | self.jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader('templates')) 289 | self.jinjaenv.filters['strftime'] = strftime 290 | self.jinjaenv.filters['autolink'] = autolink 291 | self.jinjaenv.filters['isimg'] = lambda url: os.path.splitext(url)[1] in imgfmt 292 | self.jinjaenv.filters['smartname'] = smartname 293 | 294 | def init_db(self, filename, dbtype='cli', botuserdb=False, botdest=None): 295 | if os.path.isfile(filename): 296 | if dbtype == 'cli': 297 | self.db_cli = sqlite3.connect(filename) 298 | self.conn_cli = self.db_cli.cursor() 299 | for name, sql in self.conn_cli.execute("SELECT name, sql FROM sqlite_master WHERE type='table'"): 300 | if name == 'exportinfo': 301 | self.db_cli_ver = 1 302 | break 303 | elif name == 'peerinfo': 304 | if 'permanent_id' in sql: 305 | self.db_cli_ver = 2 306 | else: 307 | self.db_cli_ver = 3 308 | break 309 | self.userfromdb('cli') 310 | elif dbtype == 'bot': 311 | self.db_bot = sqlite3.connect(filename) 312 | self.conn_bot = self.db_bot.cursor() 313 | self.botdest = self.peers.find(botdest) 314 | if self.botdest['id'] is None: 315 | raise KeyError('peer not found: %s' % botdest) 316 | if self.botdest['type'] == 'user': 317 | self.botdest['type'] = 'chat' 318 | # self.botdest = tgl_peer_id_t.from_peer(self.botdest).to_id() 319 | self.botdest = (self.botdest['id'], self.botdest['type']) 320 | if botuserdb or not self.db_cli: 321 | self.userfromdb('bot') 322 | else: 323 | raise FileNotFoundError('Database not found: ' + filename) 324 | 325 | def msgfromdb(self, dbtype='cli', peer=None): 326 | if self.limit: 327 | match = re_limit.match(self.limit) 328 | if match: 329 | if match.group(2): 330 | limit = 'LIMIT %d OFFSET %s' % (min(int(match.group(1)), self.hardlimit), match.group(2)[1:]) 331 | else: 332 | limit = 'LIMIT %d' % min(int(match.group(1)), self.hardlimit) 333 | else: 334 | limit = 'LIMIT %d' % self.hardlimit 335 | else: 336 | limit = '' 337 | if dbtype == 'cli': 338 | if peer: 339 | if self.db_cli_ver == 1: 340 | if peer['type'] == 'user': 341 | pid = peer['id'] 342 | else: 343 | pid = -peer['id'] 344 | elif self.db_cli_ver == 2: 345 | pid = tgl_peer_id_t.from_peer(peer).dumps() 346 | else: 347 | pid = tgl_peer_id_t.from_peer(peer).to_id() 348 | c = self.conn_cli.execute('SELECT * FROM (SELECT id, src, dest, text, media, date, fwd_src, fwd_date, reply_id, out, unread, service, action, flags FROM messages WHERE src=? or dest=? ORDER BY date DESC, id DESC %s) ORDER BY date ASC, id ASC' % limit, (pid, pid)) 349 | else: 350 | c = self.conn_cli.execute('SELECT * FROM (SELECT id, src, dest, text, media, date, fwd_src, fwd_date, reply_id, out, unread, service, action, flags FROM messages ORDER BY date DESC, id DESC %s) ORDER BY date ASC, id ASC' % limit) 351 | for mid, src, dest, text, media, date, fwd_src, fwd_date, reply_id, out, unread, service, action, flags in c: 352 | if self.media_format == 'bot': 353 | media, caption = self.media_cli2bot(media, action) 354 | text = text or caption 355 | yield convert_msgid2(mid), src, dest, text, media, date, fwd_src, fwd_date, convert_msgid2(reply_id), out, unread, service, action, flags 356 | elif dbtype == 'bot' and self.botdest: 357 | for mid, src, text, media, date, fwd_src, fwd_date, reply_id in self.conn_bot.execute('SELECT * FROM (SELECT id, src, text, media, date, fwd_src, fwd_date, reply_id FROM messages ORDER BY date DESC, id DESC %s) ORDER BY date ASC, id ASC' % limit): 358 | if self.media_format == 'cli': 359 | media, action = self.media_bot2cli(text, media) 360 | else: 361 | action = None 362 | yield mid, src, self.botdest, text, media, date, fwd_src, fwd_date, reply_id, 0, 0, bool(action), action, 256 363 | else: 364 | raise ValueError('dbtype or self.botdest is invalid') 365 | 366 | def userfromdb(self, dbtype='cli'): 367 | if dbtype == 'cli': 368 | for pid, phone, username, first_name, last_name, flags in self.conn_cli.execute('SELECT id, phone, username, first_name, last_name, flags FROM users'): 369 | self.peers[(pid, 'user')] = { 370 | 'id': pid, 371 | 'type': 'user', 372 | 'phone': phone, 373 | 'username': username, 374 | 'first_name': first_name, 375 | 'last_name': last_name, 376 | 'print': printname(first_name, last_name), 377 | 'flags': flags 378 | } 379 | for pid, title, members_num, flags in self.conn_cli.execute('SELECT id, title, members_num, flags FROM chats'): 380 | self.peers[(pid, 'chat')] = { 381 | 'id': pid, 382 | 'type': 'chat', 383 | 'title': title, 384 | 'members_num': members_num, 385 | 'print': printname(title), 386 | 'flags': flags 387 | } 388 | if self.db_cli_ver > 1: 389 | for pid, title, members_num, admins_count, kicked_count, flags in self.conn_cli.execute('SELECT id, title, participants_count, admins_count, kicked_count, flags FROM channels'): 390 | self.peers[(pid, 'channel')] = { 391 | 'id': pid, 392 | 'type': 'channel', 393 | 'title': title, 394 | # keep compatible with chats 395 | 'members_num': members_num, 396 | 'admins_count': admins_count, 397 | 'kicked_count': kicked_count, 398 | 'print': printname(title), 399 | 'flags': flags 400 | } 401 | if self.db_cli_ver == 1: 402 | sql = 'SELECT id, print_name FROM exportinfo' 403 | elif self.db_cli_ver == 2: 404 | sql = 'SELECT permanent_id, print_name FROM peerinfo' 405 | else: 406 | sql = 'SELECT id, print_name FROM peerinfo' 407 | for pid, print_name in self.conn_cli.execute(sql): 408 | self.peers.setname(pid, print_name) 409 | elif dbtype == 'bot': 410 | for pid, username, first_name, last_name in self.conn_bot.execute('SELECT id, username, first_name, last_name FROM users'): 411 | self.peers[(pid, 'user')].update({ 412 | 'id': pid, 413 | 'username': username, 414 | 'first_name': first_name, 415 | 'last_name': last_name, 416 | 'print': printname(first_name, last_name) 417 | }) 418 | 419 | def media_bot2cli(self, text, media=None, strict=False): 420 | if not media: 421 | return None, None 422 | media = json.loads(media) 423 | dm = {} 424 | da = {} 425 | 426 | mt = None 427 | if self.cachedir: 428 | mt = media.keys() & frozenset(('audio', 'document', 'sticker', 'video', 'voice')) 429 | file_id = None 430 | if mt: 431 | mt = mt.pop() 432 | file_id = media[mt]['file_id'] 433 | elif 'photo' in media: 434 | file_id = max(media['photo'], key=lambda x: x['width'])['file_id'] 435 | if file_id: 436 | for fn in os.listdir(self.cachedir): 437 | if fn.startswith(file_id): 438 | dm['url'] = self.urlprefix + fn 439 | break 440 | 441 | if '_ircuser' in media: 442 | dm['_ircuser'] = media['_ircuser'] 443 | if mt and not strict: 444 | dm.update(media[mt]) 445 | 446 | if ('audio' in media or 'document' in media 447 | or 'sticker' in media or 'video' in media 448 | or 'voice' in media): 449 | if strict: 450 | dm['type'] = 'document' 451 | else: 452 | dm['type'] = mt or 'document' 453 | elif 'photo' in media: 454 | dm['type'] = 'photo' 455 | dm['caption'] = text or '' 456 | elif 'contact' in media: 457 | dm['type'] = 'contact' 458 | dm['phone'] = media['contact']['phone_number'] 459 | dm['first_name'] = media['contact']['first_name'] 460 | dm['last_name'] = media['contact'].get('last_name') 461 | dm['user_id'] = media['contact'].get('user_id') 462 | elif 'location' in media: 463 | dm['type'] = 'geo' 464 | dm['longitude'] = media['location']['longitude'] 465 | dm['latitude'] = media['location']['latitude'] 466 | elif 'venue' in media: 467 | dm['type'] = 'venue' 468 | dm['longitude'] = media['venue']['location']['longitude'] 469 | dm['latitude'] = media['venue']['location']['latitude'] 470 | if media['venue']['title']: 471 | dm['type'] = media['venue']['title'] 472 | dm['address'] = media['venue']['address'] 473 | if 'foursquare_id' in media['venue']: 474 | dm['provider'] = 'foursquare' 475 | dm['venue_id'] = media['venue']['foursquare_id'] 476 | elif 'new_chat_participant' in media: 477 | user = media['new_chat_participant'] 478 | da['type'] = 'chat_add_user' 479 | da['user'] = self.peers.get(user['id']) or unkuser(user) 480 | elif 'left_chat_participant' in media: 481 | user = media['left_chat_participant'] 482 | da['type'] = 'chat_del_user' 483 | da['user'] = self.peers.get(user['id']) or unkuser(user) 484 | elif 'new_chat_title' in media: 485 | da['type'] = 'chat_rename' 486 | da['title'] = media['new_chat_title'] 487 | elif 'new_chat_photo' in media: 488 | da['type'] = 'chat_change_photo' 489 | elif 'delete_chat_photo' in media: 490 | da['type'] = 'chat_delete_photo' 491 | elif 'group_chat_created' in media: 492 | da['type'] = 'chat_created' 493 | da['title'] = '' 494 | return json.dumps(dm) if dm else None, json.dumps(da) if da else None 495 | 496 | def media_cli2bot(media=None, action=None): 497 | type_map = { 498 | # media 499 | 'photo': 'photo', 500 | 'document': 'document', 501 | 'unsupported': 'document', 502 | 'geo': 'location', 503 | 'venue': 'location', 504 | 'contact': 'contact', 505 | # action 506 | 'chat_add_user': 'new_chat_participant', 507 | 'chat_add_user_link': 'new_chat_participant', 508 | 'chat_del_user': 'left_chat_participant', 509 | 'chat_rename': 'new_chat_title', 510 | 'chat_change_photo': 'new_chat_photo', 511 | 'chat_delete_photo': 'delete_chat_photo', 512 | 'chat_created': 'group_chat_created' 513 | } 514 | d = {} 515 | caption = None 516 | if media: 517 | media = json.loads(media) 518 | if action: 519 | action = json.loads(action) 520 | if media and 'type' in media: 521 | media = media.copy() 522 | if media['type'] == 'photo': 523 | caption = media['caption'] 524 | d['photo'] = [] 525 | elif media['type'] in ('document', 'unsupported'): 526 | d['document'] = {} 527 | elif 'longitude' in media: 528 | # 'type' may be the name of the place 529 | loc = { 530 | 'longitude': media['longitude'], 531 | 'latitude': media['latitude'] 532 | } 533 | if media['type'] == 'geo': 534 | d['location'] = loc 535 | else: 536 | d['venue'] = { 537 | 'location': loc, 538 | 'title': media['type'] if media['type'] != 'venue' else '', 539 | 'address': media['address'] 540 | } 541 | if media.get('provider') == 'foursquare' and 'venue_id' in media: 542 | d['venue']['foursquare_id'] = media['venue_id'] 543 | elif media['type'] == 'contact': 544 | del media['type'] 545 | media['phone_number'] = media.pop('phone') 546 | d['contact'] = media 547 | # ignore other undefined types to Bot API 548 | if action and 'type' in action: 549 | newname = type_map.get(action['type']) 550 | if newname.endswith('chat_participant'): 551 | d[newname] = { 552 | 'id': action['user']['id'], 553 | 'first_name': action['user'].get('first_name', ''), 554 | 'last_name': action['user'].get('last_name', ''), 555 | 'username': action['user'].get('username', '') 556 | } 557 | elif newname == 'new_chat_title': 558 | d[newname] = action['title'] 559 | elif newname == 'new_chat_photo': 560 | d[newname] = [] 561 | elif newname in ('delete_chat_photo', 'group_chat_created'): 562 | d[newname] = True 563 | # ignore other undefined types to Bot API 564 | return json.dumps(d) if d else None, caption 565 | 566 | def getmsgs(self, peer=None): 567 | db = 'cli' if self.db_cli else 'bot' 568 | for mid, src, dest, text, media, date, fwd_src, fwd_date, reply_id, out, unread, service, action, flags in self.msgfromdb(db, peer): 569 | src = self.peers[src] 570 | dest = self.peers[dest] 571 | if not (db == 'bot' or 572 | dest['id'] == peer['id'] or 573 | peer['type'] == 'user' and 574 | src['id'] == peer['id'] and dest['type'] == 'user'): 575 | continue 576 | if fwd_src: 577 | msgtype = 'fwd' 578 | extra = {'fwd_src': self.peers[fwd_src], 'fwd_date': fwd_date} 579 | elif reply_id: 580 | msgtype = 're' 581 | remsg = self.msgs.get(reply_id, unkmsg(reply_id)) 582 | if remsg['msgtype'] == 're': 583 | remsg = remsg.copy() 584 | remsg['extra'] = None 585 | extra = {'reply': remsg} 586 | else: 587 | msgtype, extra = '', None 588 | media = json.loads(media or '{}') 589 | if db == 'bot' and '_ircuser' in media: 590 | src['first_name'] = src['print'] = media['_ircuser'] 591 | msg = { 592 | 'mid': mid, 593 | 'src': src, 594 | 'dest': dest, 595 | 'text': text or media.get('caption'), 596 | 'media': media, 597 | 'date': date, 598 | 'msgtype': msgtype, 599 | 'extra': extra, 600 | 'out': out, 601 | 'unread': unread, 602 | 'service': service, 603 | 'action': json.loads(action or '{}'), 604 | 'flags': flags 605 | } 606 | self.msgs[mid] = msg 607 | yield mid, msg 608 | 609 | def render_peer(self, peer, name=None): 610 | peer = peer.copy() 611 | if name: 612 | peer['print'] = name 613 | kvars = { 614 | 'peer': peer, 615 | 'gentime': time.time() 616 | } 617 | if self.stream: 618 | kvars['msgs'] = (m for k, m in self.getmsgs(peer)) 619 | else: 620 | msgs = tuple(m for k, m in self.getmsgs(peer)) 621 | kvars['msgs'] = msgs 622 | if msgs: 623 | kvars['start'] = min(msgs, key=operator.itemgetter('date'))['date'] 624 | kvars['end'] = max(msgs, key=operator.itemgetter('date'))['date'] 625 | else: 626 | kvars['start'] = kvars['end'] = 0 627 | kvars['count'] = len(msgs) 628 | template = self.jinjaenv.get_template(self.template) 629 | yield from template.stream(**kvars) 630 | 631 | def render_peer_json(self, peer, name=None): 632 | je = json.JSONEncoder(indent=0) 633 | peer = peer.copy() 634 | if name: 635 | peer['print'] = name 636 | kvars = { 637 | 'peer': peer, 638 | 'gentime': time.time() 639 | } 640 | kvars['msgs'] = StreamArray(m for k, m in self.getmsgs(peer)) 641 | yield from je.iterencode(kvars) 642 | 643 | def autolink(text, img=True): 644 | ret = [] 645 | lastpos = 0 646 | for match in re_url.finditer(text): 647 | start, end = match.span() 648 | url = text[start:end] 649 | if re_bthash.match(url): 650 | ret.append('%s%s' % (text[lastpos:start], url, url)) 651 | elif img and os.path.splitext(url)[1] in imgfmt: 652 | ret.append('%s' % (text[lastpos:start], url, url)) 653 | else: 654 | ret.append('%s%s' % (text[lastpos:start], url, url)) 655 | lastpos = end 656 | ret.append(text[lastpos:]) 657 | return ''.join(ret) 658 | 659 | def smartname(user, limit=20): 660 | if 'first_name' not in user: 661 | return '<%s>' % 'Unknown'[:limit-2] 662 | first, last = user['first_name'], user.get('last_name', '') 663 | pn = printname(first, last) 664 | if len(pn) > limit: 665 | if len(first) > limit: 666 | return first.split(None, 1)[0][:limit] 667 | else: 668 | return first[:limit] 669 | else: 670 | return pn 671 | 672 | def main(argv): 673 | parser = argparse.ArgumentParser(description="Format exported database file into human-readable format.") 674 | parser.add_argument("-o", "--output", help="output path") 675 | parser.add_argument("-d", "--db", help="tg-export database path", default="tg-export3.db") 676 | parser.add_argument("-b", "--botdb", help="tg-chatdig bot database path", default="") 677 | parser.add_argument("-D", "--botdb-dest", help="tg-chatdig bot logged chat id or tg-cli-style peer name") 678 | parser.add_argument("-u", "--botdb-user", action="store_true", help="use user information in tg-chatdig database first") 679 | parser.add_argument("-t", "--template", help="export template, can be 'txt'(default), 'html', 'json', or template file name", default="txt") 680 | parser.add_argument("-P", "--peer-print", help="set print name for the peer") 681 | parser.add_argument("-l", "--limit", help="limit the number of fetched messages and set the offset") 682 | parser.add_argument("-L", "--hardlimit", help="set a hard limit of the number of messages, must be used with -l", type=int, default=100000) 683 | parser.add_argument("-c", "--cachedir", help="the path of media files") 684 | parser.add_argument("-r", "--urlprefix", help="the url prefix of media files") 685 | parser.add_argument("peer", help="export certain peer id or tg-cli-style peer print name") 686 | args = parser.parse_args(argv) 687 | 688 | msg = Messages(stream=args.template.endswith('html')) 689 | msg.limit = args.limit 690 | msg.hardlimit = args.hardlimit 691 | msg.cachedir = args.cachedir 692 | msg.urlprefix = args.urlprefix 693 | render_func = msg.render_peer 694 | if args.template == 'html': 695 | msg.template = 'simple.html' 696 | elif args.template == 'txt': 697 | msg.template = 'history.txt' 698 | elif args.template == 'json': 699 | render_func = msg.render_peer_json 700 | else: 701 | msg.template = args.template 702 | if args.db: 703 | msg.init_db(args.db, 'cli') 704 | if args.botdb: 705 | msg.init_db(args.botdb, 'bot', args.botdb_user or not args.db, args.botdb_dest) 706 | peer = msg.peers.find(args.peer) 707 | if peer['id'] is None: 708 | raise KeyError('peer not found: %s' % args.peer) 709 | if args.output == '-': 710 | for ln in render_func(peer, args.peer_print): 711 | sys.stdout.write(ln) 712 | else: 713 | fn = args.output 714 | if args.output is None: 715 | fn = '%s#id%d' % (peer['type'], peer['id']) 716 | if args.template == 'json': 717 | fn += '.json' 718 | elif '.' in args.template: 719 | fn += os.path.splitext(args.template)[1] 720 | else: 721 | fn += '.' + args.template 722 | with open(fn, 'w') as f: 723 | for ln in render_func(peer, args.peer_print): 724 | f.write(ln) 725 | 726 | if __name__ == '__main__': 727 | sys.exit(main(sys.argv[1:])) 728 | --------------------------------------------------------------------------------
TimeFromMessage
{{ msg.date|strftime('%Y-%m-%d %H:%M:%S') }}{{ msg.src|smartname|escape }}:{% if msg.msgtype == 'fwd' -%} 27 | Fwd {{ msg.extra.fwd_src|smartname|escape }}: 28 | {%- elif msg.msgtype == 're' -%} 29 | {{ msg.extra.reply.src|smartname|escape }}: 30 | {%- endif %} 31 | {% if msg.text -%} 32 | {{ msg.text|escape|autolink|replace("\n", "
") }} 33 | {%- endif %} 34 | {% if msg.media -%} 35 | {% if msg.media.url -%} 36 | {% if msg.media.type == 'photo' -%} 37 | {% if msg.text or msg.msgtype %}
38 | {%- endif %} 39 | {%- elif msg.media.type == 'sticker' -%} 40 | {% if msg.msgtype %}
41 | {%- endif %} 42 | {%- elif msg.media.type in ('audio', 'voice') -%} 43 | {% if msg.msgtype %}
44 | {%- endif %} 45 | {%- elif msg.media.type == 'video' -%} 46 | {% if msg.msgtype %}
47 | {%- endif %} 48 | {%- elif msg.media.type == 'document' -%} 49 | [{% if msg.media.file_name %}{{ msg.media.file_name|escape }}, {% endif %}{% if msg.media.file_size %}{{ msg.media.file_size|d(0)|filesizeformat }}, {% endif %}{{ msg.media.mime_type|d('application/octet-stream') }}] {% if msg.media.url|isimg %}
{% else %}{{ msg.media.file_name|d('file')|escape }}{% endif %} 50 | {%- else -%} 51 | [{{ msg.media.type }}] 52 | {%- endif %} 53 | {%- elif msg.media.type == 'contact' -%} 54 | [{{ msg.media|string|escape }}] 55 | {%- elif msg.media.type in ('geo', 'location', 'venue') -%} 56 | [location {{ msg.media.latitude }}, {{ msg.media.longitude }}] 57 | {%- elif '_ircuser' in msg.media -%} 58 | [IRC] 59 | {%- else %} 60 | [{{ msg.media.type }}] 61 | {%- endif %} 62 | {%- endif %}{% if msg.service -%} 63 | [{{ msg.action.type }}] 64 | {% if msg.action.type in ('chat_created', 'chat_rename') -%} 65 | {{ msg.action.title|escape|autolink }} 66 | {%- elif msg.action.type in ('chat_add_user', 'chat_del_user') -%} 67 | {{ msg.action.user.print }} 68 | {%- endif %} 69 | {%- endif %}