├── tg-server.pub
├── templates
│   ├── history.txt
│   └── simple.html
├── getlog
├── .gitignore
├── tglive.py
├── avatar.py
├── README.md
├── dbconvert.py
├── LICENSE
├── tgcli.py
├── export.py
└── logfmt.py
/tg-server.pub:
--------------------------------------------------------------------------------
1 | -----BEGIN RSA PUBLIC KEY-----
2 | MIIBCgKCAQEAwVACPi9w23mF3tBkdZz+zwrzKOaaQdr01vAbU4E1pvkfj4sqDsm6
3 | lyDONS789sVoD/xCS9Y0hkkC3gtL1tSfTlgCMOOul9lcixlEKzwKENj1Yz/s7daS
4 | an9tqw3bfUV/nqgbhGX81v/+7RFAEd+RwFnK7a+XYl9sluzHRyVVaTTveB2GazTw
5 | Efzk2DWgkBluml8OREmvfraX3bkHZJTKX4EQSjBbbdJ2ZXIsRrYOXfaA+xayEGB+
6 | 8hdlLmAjbCVfaigxX0CDqWeR1yFL9kwd9P0NsZRPsmoqVwMbMu7mStFai6aIhc3n
7 | Slv8kg9qv1m6XHVQY3PnEw+QQtqSIXklHwIDAQAB
8 | -----END RSA PUBLIC KEY-----
9 |
10 |
--------------------------------------------------------------------------------
/templates/history.txt:
--------------------------------------------------------------------------------
1 | {{ peer.print }}
2 | {% if count -%}
3 | From {{ start|strftime('%Y-%m-%d %H:%M:%S') }} to {{ end|strftime('%Y-%m-%d %H:%M:%S') }}, total {{ count }}
4 | {%- endif %}
5 | {% for msg in msgs %}[{{ msg.date|strftime('%Y-%m-%d %H:%M:%S') }}] {{ msg.src.print }}{% if msg.msgtype == 'fwd' %} [Fwd: {{ msg.extra.fwd_src.print }}]
6 | {%- elif msg.msgtype == 're' %} [Re: {{ msg.extra.reply.mid }}]
7 | {%- endif %} >>>{% if msg.text %} {{ msg.text }}{% endif %}{% if msg.media %} [{{ msg.media.type|d('IRC') }}]{% endif %}{% if msg.service %} [{{ msg.action.type }}]{% endif %}
8 | {% endfor %}
9 |
--------------------------------------------------------------------------------
/getlog:
--------------------------------------------------------------------------------
#!/bin/bash
# CGI script: render the most recent messages of one chat as HTML by calling
# logfmt.py on the bot log database. Supports conditional GET through
# If-Modified-Since, using the database file's mtime as Last-Modified.
# The query string, reduced to digits and commas, is passed to logfmt.py -l.

BOT_LOG_FILE=chatlog.db
PEER_ID=-12345678
PEER_TITLE='Some Interesting Group'
CACHE_PATH=img
URL_PREFIX=/img/
HARD_LIMIT=100000
DEFAULT_LIMIT=500

# Read the mtime once so the 304 check and the Last-Modified header agree.
db_mtime="$(stat -c %Y "$BOT_LOG_FILE")"

if [ "$HTTP_IF_MODIFIED_SINCE" ]; then
    # An unparsable header yields an empty string; treat that as "modified"
    # instead of feeding an empty operand to the numeric comparison.
    since="$(date -u -d "$HTTP_IF_MODIFIED_SINCE" +%s 2>/dev/null)"
    if [ -n "$since" ] && [ "$since" -ge "$db_mtime" ]; then
        echo 'Status: 304 Not Modified'
        echo
        exit 0
    fi
fi

echo 'Status: 200 OK'
echo 'Content-Type: text/html; charset=utf-8'
echo "Last-Modified: $(date -R -u -d "@$db_mtime")"
echo

limit=
if [ "$QUERY_STRING" ]; then
    # Keep only digits and commas. The set is quoted so the shell cannot
    # glob-expand it, and printf is used because echo may interpret a
    # leading -n/-e in attacker-controlled input.
    limit="$(printf '%s' "$QUERY_STRING" | tr -Cd '[:digit:],')"
fi
# Fall back to the default when the query string held nothing usable;
# an empty value would make -l swallow the following option.
if [ -z "$limit" ]; then
    limit=$DEFAULT_LIMIT
fi

python3 logfmt.py -b "$BOT_LOG_FILE" -d '' -t html -D="$PEER_ID" -o=- -P="$PEER_TITLE" -l "$limit" -L "$HARD_LIMIT" -c "$CACHE_PATH" -r "$URL_PREFIX" "$PEER_ID"
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
59 | *.db
60 | *.db-journal
61 |
--------------------------------------------------------------------------------
/tglive.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | '''
5 | Proof-of-concept telegram message live broadcasting with
6 | [live-danmaku-hime](https://github.com/m13253/live-danmaku-hime)
7 | '''
8 |
9 | import sys
10 | import time
11 | import tgcli
12 | import jinja2
13 | import logging
14 | import textwrap
15 |
logging.basicConfig(
    stream=sys.stderr,
    level=logging.DEBUG,
    format='%(asctime)s [%(levelname)s] %(message)s',
)

# Single-message layout: "[HH:MM dest] sender [Fwd/Re marker] > text [media] [action]".
txt_template = '''[{{ msg.date|strftime('%H:%M') }} {{ msg.to.print_name[:8] }}] {{ msg.from.print_name }}{% if 'fwd_from' in msg %} [Fwd: {{ msg.fwd_from.print_name }}]
{%- elif 'reply_id' in msg %} [Re]
{%- endif %} >{% if msg.text %} {{ msg.text }}{% endif %}{% if msg.media %} [{{ msg.media.type }}]{% endif %}{% if msg.service %} [{{ msg.action.type }}]{% endif %}'''


def _strftime_filter(date, fmt='%Y-%m-%d %H:%M:%S'):
    '''Jinja filter: format the Unix timestamp `date` in local time using `fmt`.'''
    return time.strftime(fmt, time.localtime(date))


jinjaenv = jinja2.Environment(loader=jinja2.DictLoader({'txt': txt_template}))
jinjaenv.filters['strftime'] = _strftime_filter

template = jinjaenv.get_template('txt')

# Wrap width in characters; the stdin loop at the bottom of the file can change it.
WIDTH = 35
29 |
def print_msg(msg):
    '''
    Render one telegram-cli JSON object to stdout.

    Every object is logged at debug level. Only objects whose 'event' is
    'message' or 'service' are rendered through the module-level `template`,
    wrapped to `WIDTH` columns and written with a trailing newline. Any
    exception raised while doing so is logged and swallowed.
    '''
    logging.debug(msg)
    try:
        if msg.get('event') not in ('message', 'service'):
            return
        rendered = template.render(msg=msg).strip()
        wrapped_lines = textwrap.wrap(rendered, WIDTH)
        sys.stdout.write('\n'.join(wrapped_lines) + '\n')
        sys.stdout.flush()
    except Exception:
        logging.exception('Failed to process a message.')
40 |
41 |
# The telegram-cli binary path is the only command-line argument.
if len(sys.argv) < 2:
    sys.exit('usage: %s TELEGRAM_CLI_PATH' % sys.argv[0])

with tgcli.TelegramCliInterface(sys.argv[1]) as c:
    c.on_json = print_msg
    # Control channel on stdin: 'q' quits, a positive integer sets the
    # wrap width used by print_msg.
    for ln in sys.stdin:
        command = ln.strip()
        if command == 'q':
            break
        # isdecimal() instead of isdigit(): isdigit() also accepts
        # characters such as superscript digits that int() rejects, which
        # would raise ValueError and end the program. A width of 0 is
        # refused because textwrap.wrap() raises on it for every message.
        elif command.isdecimal() and int(command) > 0:
            WIDTH = int(command)
50 |
--------------------------------------------------------------------------------
/avatar.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import sys
6 | import logging
7 | import argparse
8 |
9 | import tgcli
10 |
11 | logging.basicConfig(stream=sys.stdout, format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
12 |
def export_avatar_peer(tc, peertype, pid, filename):
    '''
    Download the avatar of one peer and store it as `filename`.

    tc:       object exposing `cmd_load_<peertype>_photo(peername)`.
    peertype: peer type used in both the tg-cli peer name and the command
              name (e.g. 'user').
    pid:      numeric peer id.
    filename: destination path; if it already exists nothing is fetched.

    The command reply is expected to be a dict whose 'result' is the path
    of the downloaded file, or 'FAIL'. Any other reply is logged as a
    failure.
    '''
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import shutil

    peername = '%s#id%d' % (peertype, pid)
    if os.path.isfile(filename):
        logging.info('Avatar exists: ' + peername)
        return
    res = getattr(tc, 'cmd_load_%s_photo' % peertype)(peername)
    # A non-dict reply (e.g. None) is treated as a failure instead of
    # raising TypeError on the membership test.
    if isinstance(res, dict) and 'result' in res and res['result'] != 'FAIL':
        # shutil.move instead of os.rename: the downloaded file and the
        # output directory may be on different filesystems, where rename
        # fails with "Invalid cross-device link".
        shutil.move(res['result'], filename)
        logging.info('Exported avatar for %s', peername)
    else:
        logging.warning('Failed to export avatar for %s: %s', peername, res)
24 |
def export_avatar_group(tc, grouptype, pid, path):
    '''
    Export the avatar of every member of a chat or channel into `path`.

    For grouptype 'channel' the member list is fetched in pages of 100 via
    `cmd_channel_get_members` until an empty page comes back; for any other
    type it is taken from the 'members' entry of `cmd_chat_info`. Members
    are de-duplicated by 'peer_id' and each avatar is saved as
    `<peer_id>.jpg`.
    '''
    peername = '%s#id%d' % (grouptype, pid)
    members = {}
    logging.info('Fetching info for %s' % peername)
    if grouptype == 'channel':
        # The first page is requested without an offset; later pages pass one.
        page = tc.cmd_channel_get_members(peername, 100)
        offset = 100
        while True:
            for member in page:
                members[member['peer_id']] = member
            if not page:
                break
            page = tc.cmd_channel_get_members(peername, 100, offset)
            offset += 100
    else:
        info = tc.cmd_chat_info(peername)
        for member in info['members']:
            members[member['peer_id']] = member
    for peer_id in members:
        target = os.path.join(path, '%d.jpg' % peer_id)
        export_avatar_peer(tc, 'user', peer_id, target)
45 |
def main(argv):
    '''
    Command-line entry point: export one peer's avatar, or with -g the
    avatars of every member of a chat or channel.

    argv: argument list without the program name.

    Invalid or missing arguments make argparse exit with status 2 before
    telegram-cli is started.
    '''
    # The description used to read "Export Telegram messages.", copied from
    # the message exporter.
    parser = argparse.ArgumentParser(description="Export Telegram avatars.")
    parser.add_argument("-o", "--output", help="output path", default="export")
    parser.add_argument("-g", "--group", help="export every user's avatar in a group or channel", action='store_true')
    # The type is interpolated into a method name, so restrict it to the
    # documented values.
    parser.add_argument("-t", "--type", help="peer type, can be 'user', 'chat', 'channel'", default="user",
                        choices=('user', 'chat', 'channel'))
    # The id is formatted with %d below; requiring it gives a usage error
    # instead of a TypeError after telegram-cli has been started.
    parser.add_argument("-i", "--id", help="peer id", type=int, required=True)
    parser.add_argument("-e", "--tgbin", help="Telegram-cli binary path", default="bin/telegram-cli")
    args = parser.parse_args(argv)

    with tgcli.TelegramCliInterface(args.tgbin, run=False) as tc:
        tc.cmd_dialog_list()
        # makedirs also creates missing parents and tolerates an existing
        # directory; os.mkdir failed on nested output paths.
        os.makedirs(args.output, exist_ok=True)
        if args.group:
            export_avatar_group(tc, args.type, args.id, args.output)
        else:
            export_avatar_peer(tc, args.type, args.id, os.path.join(args.output, '%s%d.jpg' % (args.type, args.id)))
63 |
64 | if __name__ == '__main__':
65 | main(sys.argv[1:])
66 |
--------------------------------------------------------------------------------
/templates/simple.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ peer.print|escape }}
6 |
17 |
18 |
19 |
20 | | Time | From | Message |
21 |
22 | {% for msg in msgs -%}
23 |
24 | | {{ msg.date|strftime('%Y-%m-%d %H:%M:%S') }} |
25 | {{ msg.src|smartname|escape }}: |
26 | {% if msg.msgtype == 'fwd' -%}
27 | Fwd {{ msg.extra.fwd_src|smartname|escape }}:
28 | {%- elif msg.msgtype == 're' -%}
29 | {{ msg.extra.reply.src|smartname|escape }}:
30 | {%- endif %}
31 | {% if msg.text -%}
32 | {{ msg.text|escape|autolink|replace("\n", " ") }}
33 | {%- endif %}
34 | {% if msg.media -%}
35 | {% if msg.media.url -%}
36 | {% if msg.media.type == 'photo' -%}
37 | {% if msg.text or msg.msgtype %}
38 | {%- endif %}
39 | {%- elif msg.media.type == 'sticker' -%}
40 | {% if msg.msgtype %}
41 | {%- endif %}
42 | {%- elif msg.media.type in ('audio', 'voice') -%}
43 | {% if msg.msgtype %}
44 | {%- endif %}
45 | {%- elif msg.media.type == 'video' -%}
46 | {% if msg.msgtype %}
47 | {%- endif %}
48 | {%- elif msg.media.type == 'document' -%}
49 | [{% if msg.media.file_name %}{{ msg.media.file_name|escape }}, {% endif %}{% if msg.media.file_size %}{{ msg.media.file_size|d(0)|filesizeformat }}, {% endif %}{{ msg.media.mime_type|d('application/octet-stream') }}] {% if msg.media.url|isimg %}
{% else %}{{ msg.media.file_name|d('file')|escape }}{% endif %}
50 | {%- else -%}
51 | [{{ msg.media.type }}]
52 | {%- endif %}
53 | {%- elif msg.media.type == 'contact' -%}
54 | [{{ msg.media|string|escape }}]
55 | {%- elif msg.media.type in ('geo', 'location', 'venue') -%}
56 | [location {{ msg.media.latitude }}, {{ msg.media.longitude }}]
57 | {%- elif '_ircuser' in msg.media -%}
58 | [IRC]
59 | {%- else %}
60 | [{{ msg.media.type }}]
61 | {%- endif %}
62 | {%- endif %}{% if msg.service -%}
63 | [{{ msg.action.type }}]
64 | {% if msg.action.type in ('chat_created', 'chat_rename') -%}
65 | {{ msg.action.title|escape|autolink }}
66 | {%- elif msg.action.type in ('chat_add_user', 'chat_del_user') -%}
67 | {{ msg.action.user.print }}
68 | {%- endif %}
69 | {%- endif %} |
70 |
71 | {%- endfor %}
72 |
73 |
74 |
75 |
76 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tg-export
2 |
3 | **Deprecation notice**: Since tg-cli is no longer maintained and has become unusable, this project is deprecated. Please use alternatives like [telegram-export](https://github.com/expectocode/telegram-export) instead.
4 |
5 | Export Telegram messages, using [telegram-cli](https://github.com/vysheng/tg). [Patched version](https://github.com/gumblex/tg) recommended.
6 |
7 | This version (v3) is compatible with `vysheng/tg/master` AND `vysheng/tg/test`
8 | branches.
9 |
10 | **Note**: The database format of this version (v3) is not compatible with the old ones.
11 | To convert old databases (v1 or v2), run `python3 dbconvert.py [old.db [new.db]]`
12 |
13 | ## export.py
14 |
15 | ```
16 | $ python3 export.py -h
17 | usage: export.py [-h] [-o OUTPUT] [-d DB] [-f] [-p PEER] [-B] [-t TIMEOUT]
18 | [-l] [-L] [-e TGBIN] [-v]
19 |
20 | Export Telegram messages.
21 |
22 | optional arguments:
23 | -h, --help show this help message and exit
24 | -o OUTPUT, --output OUTPUT
25 | output path
26 | -d DB, --db DB database path
27 | -f, --force force download all messages
28 | -p PEER, --peer PEER only download messages for this peer (format:
29 | channel#id1001234567, or use partial name/title as
30 | shown in tgcli)
31 | -B, --batch-only fetch messages in batch only, don't try to get more
32 | missing messages
33 | -t TIMEOUT, --timeout TIMEOUT
34 | tg-cli command timeout
35 | -l, --logging logging mode (keep running)
36 | -L, --keep-logging first export, then keep logging
37 | -e TGBIN, --tgbin TGBIN
38 | telegram-cli binary path
39 | -v, --verbose print debug messages
40 | ```
41 |
42 | **Lots** of workarounds for the unreliability of tg-cli are included (in this script and `tgcli.py`), so the script itself may be unreliable as well.
43 |
44 | Common problems with tg-cli are:
45 | * Dies arbitrarily.
46 | * No response in the socket interface.
47 | * Slow response in the socket interface.
48 | * Half response in the socket interface, while the other half appears after the timeout.
49 | * Returns an empty array when actually there are remaining messages.
50 |
51 | **Note**: When it's trying to get the remaining messages, the telegram-cli will crash like crazy. That's due to non-existent messages. For a quick fix, use [this fork](https://github.com/gumblex/tg) of tg-cli.
52 |
53 | Which is called NO WARRANTY™.
54 |
55 | ## logfmt.py
56 |
57 | This script can process a database written by `export.py` or [tg-chatdig](https://github.com/gumblex/tg-chatdig), and write it out in a human-readable format (txt, html, etc.) according to a jinja2 template.
58 |
59 | ```
60 | usage: logfmt.py [-h] [-o OUTPUT] [-d DB] [-b BOTDB] [-D BOTDB_DEST] [-u]
61 | [-t TEMPLATE] [-P PEER_PRINT] [-l LIMIT] [-L HARDLIMIT]
62 | [-c CACHEDIR] [-r URLPREFIX]
63 | peer
64 |
65 | Format exported database file into human-readable format.
66 |
67 | positional arguments:
68 | peer export certain peer id or tg-cli-style peer print name
69 |
70 | optional arguments:
71 | -h, --help show this help message and exit
72 | -o OUTPUT, --output OUTPUT
73 | output path
74 | -d DB, --db DB tg-export database path
75 | -b BOTDB, --botdb BOTDB
76 | tg-chatdig bot database path
77 | -D BOTDB_DEST, --botdb-dest BOTDB_DEST
78 | tg-chatdig bot logged chat id or tg-cli-style peer
79 | name
80 | -u, --botdb-user use user information in tg-chatdig database first
81 | -t TEMPLATE, --template TEMPLATE
82 | export template, can be 'txt'(default), 'html',
83 | 'json', or template file name
84 | -P PEER_PRINT, --peer-print PEER_PRINT
85 | set print name for the peer
86 | -l LIMIT, --limit LIMIT
87 | limit the number of fetched messages and set the
88 | offset
89 | -L HARDLIMIT, --hardlimit HARDLIMIT
90 | set a hard limit of the number of messages, must be
91 | used with -l
92 | -c CACHEDIR, --cachedir CACHEDIR
93 | the path of media files
94 | -r URLPREFIX, --urlprefix URLPREFIX
95 | the url prefix of media files
96 | ```
97 |
98 | ## tgcli.py
99 | Simple wrapper for telegram-cli interface.
100 |
101 | Example:
102 | ```python
103 | tgcli = TelegramCliInterface('../tg/bin/telegram-cli')
104 | dialogs = tgcli.cmd_dialog_list()
105 | ```
106 |
107 | ### TelegramCliInterface(cmd, extra_args=(), run=True, timeout=60, ignore_sigint=True)
108 |
109 | * `run()` starts the subprocess, needed when object created with `run=False`.
110 | * `send_command(cmd, timeout=180, resync=True)` sends a command to tg-cli. Use `resync` to consume any text left over since the last timeout.
111 | * `cmd_*(*args, **kwargs)` is the convenience method to send a command and get response. `args` are for the command, `kwargs` are arguments for `TelegramCliInterface.send_command`.
112 | * `on_info(text)`(callback) is called when a line of text is printed on stdout.
113 | * `on_json(obj)`(callback) is called with the interpreted object when a line of json is printed on stdout.
114 | * `on_text(text)`(callback) is called when a line of anything is printed on stdout.
115 | * `on_start()`(callback) is called after telegram-cli starts.
116 | * `on_exit()`(callback) is called after telegram-cli dies.
117 | * `close()` properly ends the subprocess.
118 |
119 | `do_nothing()` function does nothing. (for callbacks)
120 |
121 | `TelegramCliExited` exception is raised if telegram-cli dies when reading an answer.
122 |
123 | ## License
124 |
125 | Now it's LGPLv3+.
126 |
--------------------------------------------------------------------------------
/dbconvert.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import sys
6 | import struct
7 | import sqlite3
8 | import binascii
9 | import collections
10 |
11 | class tgl_peer_id_t(collections.namedtuple('tgl_peer_id_t', 'peer_type peer_id access_hash')):
12 | '''
13 | typedef struct {
14 | int peer_type;
15 | int peer_id;
16 | long long access_hash;
17 | } tgl_peer_id_t;
18 | '''
19 | TGL_PEER_USER = 1
20 | TGL_PEER_CHAT = 2
21 | TGL_PEER_GEO_CHAT = 3
22 | TGL_PEER_ENCR_CHAT = 4
23 | TGL_PEER_CHANNEL = 5
24 | TGL_PEER_TEMP_ID = 100
25 | TGL_PEER_RANDOM_ID = 101
26 | TGL_PEER_UNKNOWN = 0
27 |
28 | @classmethod
29 | def loads(cls, s):
30 | return cls._make(struct.unpack(' 1:
141 | FILENAME_IN = sys.argv[1]
142 | if len(sys.argv) > 2:
143 | FILENAME_OUT = sys.argv[2]
144 |
# Nothing to convert without an input database.
if not os.path.isfile(FILENAME_IN):
    print('Database file not found.')
    sys.exit(1)

DB_IN = sqlite3.connect(FILENAME_IN)
CUR_IN = DB_IN.cursor()

# The old schema version is recognised by the first marker table found:
# 'exportinfo' means v1, 'peerinfo' means v2.
_MARKER_TABLES = {'exportinfo': 1, 'peerinfo': 2}
for (table_name,) in CUR_IN.execute("SELECT name FROM sqlite_master WHERE type='table'"):
    if table_name in _MARKER_TABLES:
        VER = _MARKER_TABLES[table_name]
        break
else:
    print('Database not recognized.')
    sys.exit(1)

print('Converting database:')

DB = sqlite3.connect(FILENAME_OUT)
CUR = DB.cursor()
init_db(CUR)

if VER == 1:
    # v1 rows: peer ids are rewritten with convert_peerid1; the extra
    # second column of users/chats in the new schema is filled with 0.
    print('* messages')
    CUR.executemany(
        'REPLACE INTO messages VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)',
        (
            (mid, convert_peerid1(src), convert_peerid1(dest), text, media,
             date, convert_peerid1(fwd_src), fwd_date, reply_id, out, unread,
             service, action, flags)
            for (mid, src, dest, text, media, date, fwd_src, fwd_date,
                 reply_id, out, unread, service, action, flags)
            in CUR_IN.execute('SELECT * FROM messages ORDER BY id ASC')
        ),
    )
    print('* users')
    CUR.executemany(
        'REPLACE INTO users VALUES (?,?,?,?,?,?,?)',
        (
            (pid, 0, phone, username, first_name, last_name, flags)
            for pid, phone, username, first_name, last_name, flags
            in CUR_IN.execute('SELECT * FROM users')
        ),
    )
    print('* chats')
    CUR.executemany(
        'REPLACE INTO chats VALUES (?,?,?,?,?)',
        (
            (pid, 0, title, members_num, flags)
            for pid, title, members_num, flags
            in CUR_IN.execute('SELECT * FROM chats')
        ),
    )
    print('* peerinfo')
    CUR.executemany(
        'REPLACE INTO peerinfo VALUES (?,?,?,?)',
        (
            (convert_peerid1(pid), 'chat' if pid < 0 else 'user', print_name, finished)
            for pid, print_name, finished
            in CUR_IN.execute('SELECT * FROM exportinfo')
        ),
    )
else:
    # v2 rows: ids are stored as packed structs; message ids and access
    # hashes are unpacked, peer ids go through convert_peerid2.
    print('* messages')
    CUR.executemany(
        'REPLACE INTO messages VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)',
        (
            (tgl_message_id_t.loads(mid).id, convert_peerid2(src),
             convert_peerid2(dest), text, media, date,
             convert_peerid2(fwd_src), fwd_date, convert_msgid2(reply_id),
             out, unread, service, action, flags)
            for (mid, src, dest, text, media, date, fwd_src, fwd_date,
                 reply_id, out, unread, service, action, flags)
            in CUR_IN.execute('SELECT * FROM messages ORDER BY date, id ASC')
        ),
    )
    print('* users')
    CUR.executemany(
        'REPLACE INTO users VALUES (?,?,?,?,?,?,?)',
        (
            (pid, tgl_peer_id_t.loads(permanent_id).access_hash, phone,
             username, first_name, last_name, flags)
            for pid, permanent_id, phone, username, first_name, last_name, flags
            in CUR_IN.execute('SELECT * FROM users')
        ),
    )
    print('* chats')
    CUR.executemany(
        'REPLACE INTO chats VALUES (?,?,?,?,?)',
        (
            (pid, tgl_peer_id_t.loads(permanent_id).access_hash, title,
             members_num, flags)
            for pid, permanent_id, title, members_num, flags
            in CUR_IN.execute('SELECT * FROM chats')
        ),
    )
    print('* channels')
    CUR.executemany(
        'REPLACE INTO channels VALUES (?,?,?,?,?,?,?)',
        (
            (pid, tgl_peer_id_t.loads(permanent_id).access_hash, title,
             participants_count, admins_count, kicked_count, flags)
            for (pid, permanent_id, title, participants_count, admins_count,
                 kicked_count, flags)
            in CUR_IN.execute('SELECT * FROM channels')
        ),
    )
    print('* peerinfo')
    CUR.executemany(
        'REPLACE INTO peerinfo VALUES (?,?,?,?)',
        (
            (convert_peerid2(pid), ptype, print_name, finished)
            for pid, ptype, print_name, finished
            in CUR_IN.execute('SELECT * FROM peerinfo')
        ),
    )

DB.commit()
print('Done.')
201 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/tgcli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import time
6 | import json
7 | import socket
8 | import shutil
9 | import signal
10 | import logging
11 | import tempfile
12 | import threading
13 | import subprocess
14 |
15 | '''
16 | tgcli.py - Library to interact with telegram-cli.
17 | Copyright (C) 2015-2016 Dingyuan Wang
18 |
19 | This program is free software: you can redistribute it and/or modify
20 | it under the terms of the GNU Lesser General Public License as
21 | published by the Free Software Foundation, either version 3 of the
22 | License, or (at your option) any later version.
23 |
24 | This program is distributed in the hope that it will be useful,
25 | but WITHOUT ANY WARRANTY; without even the implied warranty of
26 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 | GNU Lesser General Public License for more details.
28 |
29 | You should have received a copy of the GNU Lesser General Public
30 | License along with this program. If not, see
31 | <http://www.gnu.org/licenses/>.
32 | '''
33 |
34 | tg_server_pub = '''-----BEGIN RSA PUBLIC KEY-----
35 | MIIBCgKCAQEAwVACPi9w23mF3tBkdZz+zwrzKOaaQdr01vAbU4E1pvkfj4sqDsm6
36 | lyDONS789sVoD/xCS9Y0hkkC3gtL1tSfTlgCMOOul9lcixlEKzwKENj1Yz/s7daS
37 | an9tqw3bfUV/nqgbhGX81v/+7RFAEd+RwFnK7a+XYl9sluzHRyVVaTTveB2GazTw
38 | Efzk2DWgkBluml8OREmvfraX3bkHZJTKX4EQSjBbbdJ2ZXIsRrYOXfaA+xayEGB+
39 | 8hdlLmAjbCVfaigxX0CDqWeR1yFL9kwd9P0NsZRPsmoqVwMbMu7mStFai6aIhc3n
40 | Slv8kg9qv1m6XHVQY3PnEw+QQtqSIXklHwIDAQAB
41 | -----END RSA PUBLIC KEY-----
42 | '''
43 |
logger = logging.getLogger('tgcli')
logger.setLevel(logging.INFO)


def do_nothing(*args, **kwargs):
    '''Accept any arguments and return None (placeholder for unused callbacks).'''
    return None
47 |
def preexec_ignore_sigint():
    '''
    Install SIG_IGN as the SIGINT handler of the calling process, so that
    an interrupt is ignored. Passed as `preexec_fn` when the telegram-cli
    subprocess is spawned.
    '''
    ignored_signal = signal.SIGINT
    signal.signal(ignored_signal, signal.SIG_IGN)
54 |
class TelegramCliExited(RuntimeError):
    '''Raised if telegram-cli dies while an answer is being read.'''
57 |
58 | class TelegramCliInterface:
def __init__(self, cmd, extra_args=(), run=True, timeout=60, ignore_sigint=True):
    '''
    Set up an interface to a telegram-cli binary.

    cmd:           path of the telegram-cli executable.
    extra_args:    additional command-line arguments, stored as a tuple.
    run:           when true, call `run()` right away.
    timeout:       stored as `self.timeout`.
    ignore_sigint: stored as `self.ignore_sigint`; when true the subprocess
                   is spawned with SIGINT ignored.

    A fresh temporary directory is created for this instance.
    '''
    # Launch configuration
    self.cmd = cmd
    self.extra_args = tuple(extra_args)
    self.timeout = timeout
    self.ignore_sigint = ignore_sigint
    # Runtime state
    self.proc = None
    self.sock = None
    self.thread = None
    self.buffer = b''
    self.closed = False
    self.ready = threading.Event()
    self.tmpdir = tempfile.mkdtemp()

    def _log_started():
        logger.info('Telegram-cli started.')

    def _log_died():
        logger.warning('Telegram-cli died.')

    # Event callbacks
    # `on_info`, `on_json` and `on_text` are for stdout
    self.on_info = logger.info
    self.on_json = logger.debug
    self.on_text = do_nothing
    self.on_start = _log_started
    self.on_exit = _log_died
    if run:
        self.run()
80 |
81 | def _get_pubkey(self):
82 | tgdir = os.path.abspath(os.path.join(os.path.dirname(
83 | os.path.realpath(self.cmd)), '..'))
84 | paths = [
85 | os.path.join(tgdir, 'tg-server.pub'),
86 | os.path.join(tgdir, 'server.pub'),
87 | '/etc/telegram-cli/server.pub',
88 | '/usr/local/etc/telegram-cli/server.pub',
89 | os.path.join(self.tmpdir, 'tg-server.pub')
90 | ]
91 | for path in paths:
92 | if os.path.isfile(path):
93 | return path
94 | else:
95 | with open(path, 'w') as f:
96 | f.write(tg_server_pub)
97 | return path
98 |
99 | def checkproc(self):
100 | if self.closed or self.proc and self.proc.poll() is None:
101 | return self.proc
102 | sockfile = os.path.join(self.tmpdir, 'tgcli.sock')
103 | if os.path.exists(sockfile):
104 | os.unlink(sockfile)
105 | self.proc = subprocess.Popen((self.cmd, '-k', self._get_pubkey(),
106 | '--json', '-R', '-C', '-S', sockfile) + self.extra_args,
107 | stdin=subprocess.PIPE, stdout=subprocess.PIPE,
108 | stderr=subprocess.STDOUT,
109 | preexec_fn=preexec_ignore_sigint if self.ignore_sigint else None)
110 | while not os.path.exists(sockfile):
111 | time.sleep(0.5)
112 | self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
113 | self.sock.connect(sockfile)
114 | return self.proc
115 |
116 | def _run_cli(self):
117 | while not self.closed:
118 | self.checkproc()
119 | try:
120 | while not self.closed:
121 | out = self.proc.stdout.readline().decode('utf-8')
122 | if not out:
123 | break
124 | elif not self.ready.is_set():
125 | self.on_start()
126 | self.ready.set()
127 | self.on_text(out)
128 | if out[0] in '[{':
129 | try:
130 | self.on_json(json.loads(out.strip()))
131 | except ValueError:
132 | self.on_info(out.strip())
133 | else:
134 | self.on_info(out.strip())
135 | except BrokenPipeError:
136 | pass
137 | finally:
138 | try:
139 | self.sock.shutdown(socket.SHUT_RDWR)
140 | except Exception:
141 | pass
142 | if self.proc and self.proc.poll() is None:
143 | self.proc.terminate()
144 | self.proc.wait()
145 | self.ready.clear()
146 | self.on_exit()
147 |
148 | def run(self):
149 | self.thread = threading.Thread(target=self._run_cli)
150 | self.thread.daemon = True
151 | self.thread.start()
152 | self.ready.wait()
153 |
154 | def restart(self):
155 | self.close()
156 | self.closed = False
157 | self.tmpdir = tempfile.mkdtemp()
158 | self.run()
159 |
160 | def close(self):
161 | if self.closed:
162 | return
163 | self.closed = True
164 | self.ready.clear()
165 | try:
166 | self.proc.wait(2)
167 | except subprocess.TimeoutExpired:
168 | self.proc.kill()
169 | if self.thread:
170 | self.thread.join(1)
171 | if os.path.isdir(self.tmpdir):
172 | shutil.rmtree(self.tmpdir, True)
173 | self.tmpdir = None
174 |
175 | def __enter__(self):
176 | if not self.thread:
177 | self.run()
178 | self.ready.wait()
179 | return self
180 |
181 | def __exit__(self, exc_type, exc_value, traceback):
182 | self.close()
183 |
184 | def __del__(self):
185 | self.close()
186 |
187 | def _readline(self):
188 | while self.ready.is_set():
189 | lines = self.buffer.split(b'\n', 1)
190 | if len(lines) > 1:
191 | self.buffer = lines[1]
192 | return lines[0] + b'\n'
193 | else:
194 | self.buffer += self.sock.recv(1024)
195 | # usually there is an assertion error
196 | raise TelegramCliExited('telegram-cli unexpectedly exited.')
197 |
198 | def send_command(self, cmd, timeout=None, resync=True):
199 | '''
200 | Send a command to tg-cli.
201 | use `resync` for consuming text since last timeout.
202 | '''
203 | logger.debug(cmd)
204 | self.ready.wait()
205 | self.sock.settimeout(timeout or self.timeout)
206 | self.sock.sendall(cmd.encode('utf-8') + b'\n')
207 | line = self._readline()
208 | while resync and not line.startswith(b'ANSWER '):
209 | line = self._readline()
210 | size = int(line[7:].decode('ascii'))
211 | reply = b''
212 | while len(reply) < size:
213 | reply += self._readline()
214 | ret = reply.decode('utf-8')
215 | try:
216 | return json.loads(ret)
217 | except ValueError:
218 | return ret
219 |
220 | def __getattr__(self, name):
221 | '''
222 | Convenience command calling: cmd_*(*args, **kwargs)
223 | `args` are for the tg-cli command
224 | `kwargs` are for `send_command`
225 | '''
226 | if name.startswith('cmd_'):
227 | fn = lambda *args, **kwargs: self.send_command(
228 | ' '.join(map(str, (name[4:],) + args)), **kwargs)
229 | return fn
230 | else:
231 | raise AttributeError('TelegramCliInterface has no attribute %r' % name)
232 |
if __name__ == "__main__":
    # Tiny shell: every stdin line is sent to telegram-cli (binary path in
    # argv[1]) and the reply is printed; failures are logged with traceback.
    import sys
    logging.basicConfig(stream=sys.stderr, format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
    with TelegramCliInterface(sys.argv[1]) as tgcli:
        for ln in sys.stdin:
            cmd = ln.strip()
            try:
                reply = tgcli.send_command(cmd)
                print(reply)
            except Exception:
                logging.exception('Failed to execute: ' + cmd)
243 |
--------------------------------------------------------------------------------
/export.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import sys
6 | import json
7 | import time
8 | import queue
9 | import random
10 | import socket
11 | import struct
12 | import sqlite3
13 | import logging
14 | import argparse
15 | import binascii
16 | import functools
17 | import collections
18 |
19 | import tgcli
20 |
__version__ = '3.0'

# A whole line wrapped in square brackets.
re_msglist = re.compile(r'^\[.*\]$')
# A whole line wrapped in curly braces.
re_onemsg = re.compile(r'^\{.+\}$')
# Lines of the form "*** <number> id=<digits>"; `main` uses this pattern
# to keep such lines out of the info log.
re_getmsg = re.compile(r'^\*\*\* [0-9.]+ id=\d+$')

# Log to stdout with timestamp and level.
logging.basicConfig(stream=sys.stdout, format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
28 |
class LRUCache:
    '''
    Mapping with a fixed capacity that drops the least recently used entry.

    Entries are kept in an OrderedDict, oldest first; reading or writing a
    key moves it to the newest end.
    '''

    def __init__(self, maxlen):
        self.capacity = maxlen
        self.cache = collections.OrderedDict()

    def __getitem__(self, key):
        '''Return the value for `key` and mark it as most recently used.'''
        # move_to_end raises KeyError for a missing key.
        self.cache.move_to_end(key)
        return self.cache[key]

    def get(self, key):
        '''Like `self[key]`, but return None when the key is missing.'''
        try:
            return self[key]
        except KeyError:
            return None

    def __setitem__(self, key, value):
        '''Store `value` as the newest entry, evicting the oldest if full.'''
        if key in self.cache:
            del self.cache[key]
        elif len(self.cache) >= self.capacity:
            self.cache.popitem(last=False)
        self.cache[key] = value
55 |
def retry_or_log(attempts=2):
    '''
    Decorator factory: call the wrapped function up to `attempts` times.

    The first successful result is returned.  If every attempt raises, the
    last exception is logged and the wrapper returns None.
    '''
    def decorator(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            last = attempts - 1
            for attempt in range(attempts):
                try:
                    result = func(*args, **kwargs)
                except Exception:
                    if attempt == last:
                        logging.exception('Wrapped function failed.')
                else:
                    return result
            return None
        return wrapped
    return decorator
68 |
def uniq(seq, key=None): # Dave Kirby
    '''
    Return the items of `seq` without duplicates, keeping first occurrences
    in their original order.

    key: optional function; two items are duplicates when their keys are
        equal.  It is called exactly once per item.
    '''
    # Order preserving
    seen = set()
    result = []
    for item in seq:
        marker = key(item) if key else item
        if marker not in seen:
            seen.add(marker)
            result.append(item)
    return result
76 |
77 | class tgl_peer_id_t(collections.namedtuple('tgl_peer_id_t', 'peer_type peer_id access_hash')):
78 | '''
79 | typedef struct {
80 | int peer_type;
81 | int peer_id;
82 | long long access_hash;
83 | } tgl_peer_id_t;
84 | '''
85 | TGL_PEER_USER = 1
86 | TGL_PEER_CHAT = 2
87 | TGL_PEER_GEO_CHAT = 3
88 | TGL_PEER_ENCR_CHAT = 4
89 | TGL_PEER_CHANNEL = 5
90 | TGL_PEER_TEMP_ID = 100
91 | TGL_PEER_RANDOM_ID = 101
92 | TGL_PEER_UNKNOWN = 0
93 |
94 | @classmethod
95 | def loads(cls, s):
96 | return cls._make(struct.unpack('flags & TGLMF_CREATED)) { return res; }
293 | if ret or 'flags' in msg:
294 | CONN.execute('REPLACE INTO messages VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', (getmsgid(msg, 'id'), getpeerid(msg, 'from'), getpeerid(msg, 'to'), msg.get('text'), json.dumps(msg['media']) if 'media' in msg else None, msg.get('date'), getpeerid(msg, 'fwd_from'), msg.get('fwd_date'), getmsgid(msg, 'reply_id'), msg.get('out'), msg.get('unread'), msg.get('service'), json.dumps(msg['action']) if 'action' in msg else None, msg.get('flags')))
295 | return ret
296 |
def process(obj):
    '''
    Handle one object received from telegram-cli.

    Returns a pair:
    - a non-empty list: every element is passed to `log_msg`; returns
      (True, sum of the `log_msg` results).
    - an empty list, or a dict whose 'result' is 'FAIL': (False, 0).
    - a dict with event 'message', 'service' or 'read':
      (True, `log_msg` result).
    - anything else: (None, None); dicts carrying a user or peer are
      forwarded to `update_peer` first.

    Raises ValueError for an empty string.
    '''
    if isinstance(obj, list):
        if not obj:
            return (False, 0)
        return (True, sum(log_msg(entry) for entry in obj))
    if isinstance(obj, dict):
        event = obj.get('event')
        if event in ('message', 'service', 'read'):
            return (True, log_msg(obj))
        if event == 'online-status':
            update_peer(obj['user'])
        elif 'peer' in obj:
            update_peer(obj['peer'])
        elif obj.get('result') == 'FAIL':
            if 'can not parse' in obj.get('error', ''):
                TGCLI.cmd_dialog_list()
                #raise ValueError(msg.get('error'))
            return (False, 0)
        return (None, None)
    # ignore non-json lines
    if obj == '':
        raise ValueError('empty line received')
    return (None, None)
323 |
def purge_queue():
    '''Process everything currently waiting in MSG_Q, without blocking.'''
    try:
        while True:
            process(MSG_Q.get_nowait())
    except queue.Empty:
        pass
330 |
def on_start():
    '''
    Start-up callback: log the start, wait two seconds, request the dialog
    list from telegram-cli and log that it is ready.
    '''
    log = logging.info
    log('Telegram-cli started.')
    time.sleep(2)
    TGCLI.cmd_dialog_list()
    log('Telegram-cli is ready.')
336 |
def logging_fmt(msg):
    '''
    Format a telegram-cli event dict as one log line.

    'message' and 'service' events become "<to> <from> >>> <content>" with
    empty parts left out; any other dict is shown as the first 100
    characters of its repr.
    '''
    event = msg.get('event')
    if event not in ('message', 'service'):
        return repr(msg)[:100]
    dst = msg['to']['print_name'] if 'to' in msg else ''
    src = msg['from']['print_name'] if 'from' in msg else ''
    if event == 'message':
        parts = (dst, src, '>>>', msg.get('text', ''), str(msg.get('media', '')))
    else:
        parts = (dst, src, '>>>', str(msg.get('action', '')))
    return ' '.join(part for part in parts if part)
348 |
def logging_status(pos, end=False, seg=1000, length=None):
    '''
    Write a compact progress indicator to stdout.

    A dot is written when `pos` is not a multiple of `seg`; at non-zero
    multiples the position itself is written, as a percentage of `length`
    if given.  With `end` true the line is finished: the final position is
    written first when it was not just shown.
    '''
    out = sys.stdout
    partial = pos % seg
    if partial:
        out.write('.')
    elif pos:
        out.write('%.2f%%' % (pos * 100 / length) if length else str(pos))
    if end:
        if pos and partial:
            if length:
                out.write('%.2f%%\n' % (pos * 100 / length))
            else:
                out.write('%d\n' % pos)
        else:
            out.write('\n')
    out.flush()
366 |
def export_for(item, pos=0, force=False):
    '''
    Export the message history of one peer in batches of 100.

    item: peer dict as returned by telegram-cli (needs 'print_name').
    pos: history offset to resume from; 0 starts from the newest messages.
    force: if true, do not skip ahead to the offset stored by
        `set_finished` on an earlier run.

    Returns None on success.  If any step raises, the offset reached so far
    is returned so the caller can retry from there.
    '''
    logging.info('Exporting messages for %s from %d' % (item['print_name'], pos))
    try:
        # Get the first 100
        if not pos:
            update_peer(item)
            msglist = TGCLI.cmd_history(print_id(item), 100)
            res = process(msglist)
            logging_status(pos)
            pos = 100
        else:
            # Resuming: pretend the previous batch was non-empty with no hits
            # so both loops below may run.
            res = (True, 0)
        # Get the recently updated messages until overlapped
        # (res[1] is the hit count reported by `process`)
        while res[0] is True and not res[1]:
            msglist = TGCLI.cmd_history(print_id(item), 100, pos)
            res = process(msglist)
            logging_status(pos)
            pos += 100
        # If force, then continue
        if not force:
            pos = max(pos, is_finished(item))
        # Else, get messages from the offset of last time
        # Until no message is returned (may be not true)
        while res[0] is True:
            msglist = TGCLI.cmd_history(print_id(item), 100, pos)
            res = process(msglist)
            logging_status(pos)
            pos += 100
    except Exception:
        logging_status(pos, True)
        # Only move the stored offset forward, never back.
        if pos > is_finished(item):
            set_finished(item, pos)
        return pos
    logging_status(pos, True)
    set_finished(item, pos)
402 |
def find_holes(minv, maxv, s):
    '''Yield every integer in [minv, maxv] that is not contained in `s`.'''
    yield from filter(lambda n: n not in s, range(minv, maxv + 1))
407 |
def export_holes():
    '''
    Try to get remaining messages by using message id.

    Every id between 1 and the highest stored id that is missing from the
    database is requested individually with `get_message`.  Requests that
    raise an unexpected exception are collected and retried until none is
    left.
    '''
    # First we get messages that belong to ourselves,
    # i.e. not channel messages or encr-chat
    # 17179869184 = TGL_PEER_ENCR_CHAT 4<<32
    got = set(i[0] for i in CONN.execute('SELECT id FROM messages WHERE dest < 17179869184') if isinstance(i[0], int))
    # Start from an empty list so an empty database does not leave `holes`
    # undefined for the code below.
    holes = []
    # it doesn't verify peer_type, peer_id, access_hash
    if got:
        holes = [tgl_message_id_t(1, 0, n, 0) for n in find_holes(1, max(got), got)]
    # Then we get channel (supergroup) messages.
    if TG_TEST:
        channels = [tgl_peer_id_t(tgl_peer_id_t.TGL_PEER_CHANNEL, *i) for i in
                    CONN.execute('SELECT id, access_hash FROM channels')]
        for channel in channels:
            got = set(i[0] for i in CONN.execute('SELECT id FROM messages WHERE dest = ?', (channel.to_id(),)) if isinstance(i[0], int))
            if got:
                holes.extend(tgl_message_id_t(channel.peer_type, channel.peer_id, n, channel.access_hash) for n in find_holes(1, max(got), got))
    length = len(holes)
    logging.info('Getting the remaining %d messages...' % length)
    # we need some uncertainty to work around the uncertainty of telegram-cli
    random.shuffle(holes)
    # list of mids (may be str or int, depending on TG_TEST)
    failed = []
    # Keep `k` defined for the final status line when there are no holes.
    k = 0
    for k, msg in enumerate(holes, 1):
        if TG_TEST:
            mid = msg.dumps()
        else:
            mid = msg.id
        try:
            res = process(TGCLI.send_command('get_message %s' % mid))
            if not res[0]:
                logging.warning('%r may not exist [%.2f%%]', msg[:3], (k * 100 / length))
            elif k % 10 == 0:
                logging_status(k, False, 100, length)
        except tgcli.TelegramCliExited:
            # interface.c:4295: print_message: Assertion `M' failed.
            logging.warning('%r may not exist [%.2f%%]', msg[:3], (k * 100 / length))
        except Exception:
            failed.append(mid)
            logging.exception('Failed to get message ID %s' % mid)
    logging_status(k, True, 100, length)
    purge_queue()
    # NOTE(review): a message that fails on every attempt keeps this loop
    # running forever - confirm that is acceptable.
    while failed:
        length = len(failed)
        logging.info('Retrying the remaining %d messages...' % length)
        newlist = []
        # see above
        random.shuffle(failed)
        for k, mid in enumerate(failed, 1):
            try:
                process(TGCLI.send_command('get_message %s' % mid))
            except Exception:
                # such an old bug (`newlist` here was `failed`)
                newlist.append(mid)
            if k % 10 == 0:
                logging_status(k, False, 100, length)
        logging_status(k, True, 100, length)
        failed = newlist
        purge_queue()
469 |
def export_text(peer=None, force=False):
    '''
    Export message history for all dialogs, or for a single peer.

    peer: optional selector, either "<type>#id<number>" or a substring of
        the peer's print name.  When given and no dialog matches, nothing
        is exported.
    force: passed on to `export_for`.

    Dialogs whose export fails are retried from the returned offset until
    every one has completed.
    '''
    #if force:
        #reset_finished()
    logging.info('Getting contacts...')
    update_peer(TGCLI.cmd_get_self())
    items = TGCLI.cmd_contact_list()
    peer_obj = None
    if peer:
        # Raw string: `\w` and `\d` are regex escapes, not string escapes.
        peer_match = re.match(r'^(\w+)#id(\d+)$', peer)
    for item in items:
        update_peer(item)
    purge_queue()
    logging.info('Getting dialogs...')
    # NOTE(review): `dlist` and `lastitems` are the same list object, so the
    # comparison below is against every dialog collected so far, not only
    # the previous page - confirm this is intended.
    dlist = items = lastitems = TGCLI.cmd_dialog_list(100)
    dcount = 100
    while items:
        items = TGCLI.cmd_dialog_list(100, dcount)
        if frozenset(d['id'] for d in items) == frozenset(d['id'] for d in lastitems):
            break
        dlist.extend(items)
        dcount += 100
    for item in dlist:
        update_peer(item)
        if peer and peer_obj is None:
            if not peer_match:
                if peer in item.get('print_name', ''):
                    peer_obj = item
            elif ((item.get('peer_type') or item.get('type')) == peer_match.group(1)
                  and str(item.get('peer_id') or item.get('id')) == peer_match.group(2)):
                peer_obj = item
    if peer_obj:
        logging.info('Peer: %r' % peer_obj)
        dlist = [peer_obj]
    elif peer:
        logging.info('Peer not found: %s' % peer)
        return
    logging.info('Exporting messages...')
    failed = []
    # we need some uncertainty to work around the uncertainty of telegram-cli
    random.shuffle(dlist)
    for item in dlist:
        res = export_for(item, 0, force)
        if res is not None:
            failed.append((item, res))
            logging.warning('Failed to get messages for %s from %d' % (item['print_name'], res))
        purge_queue()
    DB.commit()
    while failed:
        newlist = []
        for item, pos in failed:
            res = export_for(item, pos, force)
            if res is not None:
                newlist.append((item, res))
                logging.warning('Failed to get messages for %s from %d' % (item['print_name'], res))
            purge_queue()
        failed = newlist
        # Persist the progress of every retry round.
        DB.commit()
    logging.info('Export to database completed.')
528 |
# Database handle; `DB.commit()` is called after export steps.
# Presumably assigned by `init_db` - TODO confirm.
DB = None
# Object used to execute SQL statements (`CONN.execute(...)`).
CONN = None
PEER_CACHE = LRUCache(10)
# JSON objects received from telegram-cli are queued here (`main` sets
# `TGCLI.on_json = MSG_Q.put`) and consumed by `purge_queue` / `main`.
MSG_Q = queue.Queue()
# The tgcli.TelegramCliInterface instance, created in `main`.
TGCLI = None
# Output path taken from the --output option in `main`.
DLDIR = '.'
# True when telegram-cli's help text mentions 'channel' (set in `main`).
TG_TEST = True
536 |
def main(argv):
    '''
    Command line entry point.

    argv: list of command line arguments (without the program name).

    Parses the options, opens the database, starts telegram-cli, then runs
    the export and/or the live logging loop.  telegram-cli is always closed
    and the database committed on the way out.
    '''
    global TGCLI, DLDIR, TG_TEST
    parser = argparse.ArgumentParser(description="Export Telegram messages.")
    parser.add_argument("-o", "--output", help="output path", default="export")
    parser.add_argument("-d", "--db", help="database path", default="tg-export3.db")
    parser.add_argument("-f", "--force", help="force download all messages", action='store_true')
    parser.add_argument("-p", "--peer", help="only download messages for this peer (format: channel#id1001234567, or use partial name/title as shown in tgcli)")
    parser.add_argument("-B", "--batch-only", help="fetch messages in batch only, don't try to get more missing messages", action='store_true')
    parser.add_argument("-t", "--timeout", help="tg-cli command timeout", type=int, default=30)
    parser.add_argument("-l", "--logging", help="logging mode (keep running)", action='store_true')
    parser.add_argument("-L", "--keep-logging", help="first export, then keep logging", action='store_true')
    parser.add_argument("-e", "--tgbin", help="telegram-cli binary path", default="bin/telegram-cli")
    parser.add_argument("-v", "--verbose", help="print debug messages", action='store_true')
    args = parser.parse_args(argv)

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        tgcli.logger.setLevel(logging.DEBUG)

    DLDIR = args.output
    init_db(args.db)

    # Created with run=False so the callbacks are in place before the
    # reader thread starts delivering output.
    TGCLI = tgcli.TelegramCliInterface(args.tgbin, extra_args=('-W', '-E'), run=False, timeout=args.timeout)
    TGCLI.on_json = MSG_Q.put
    # Drop lines matching re_getmsg from the info log.
    TGCLI.on_info = lambda s: tgcli.logger.info(s) if not re_getmsg.match(s) else None
    #TGCLI.on_text = MSG_Q.put
    #TGCLI.on_start = on_start
    TGCLI.run()
    TGCLI.ready.wait()
    time.sleep(1)

    # the 'test' branch of tg has channel support
    TG_TEST = 'channel' in TGCLI.cmd_help()

    try:
        if not args.logging:
            export_text(args.peer, args.force)
            if not args.batch_only:
                export_holes()
        if args.logging or args.keep_logging:
            # Live mode: handle queued objects for as long as telegram-cli
            # is ready.
            while TGCLI.ready.is_set():
                d = MSG_Q.get()
                logging.info(logging_fmt(d))
                process(d)
    finally:
        TGCLI.close()
        purge_queue()
        DB.commit()
585 |
# Run as a script: the return value of main() becomes the exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
588 |
--------------------------------------------------------------------------------
/logfmt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import re
6 | import sys
7 | import time
8 | import json
9 | import struct
10 | import sqlite3
11 | import operator
12 | import argparse
13 | import binascii
14 | import collections
15 |
16 | import jinja2
17 |
# Verbose, case-insensitive pattern matching either a URL (scheme, "www.",
# bare domain followed by "/", or magnet link) or a 40-character hex hash.
re_url = re.compile(r'''\b
(
# URL (gruber v2)
(?:
[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])
|
www\d{0,3}[.]
|
[a-z0-9.\-]+[.][a-z]{2,4}/
|
magnet:\?
)
(?:
[^\s()<>]+
|
\(([^\s()<>]+|(\([^\s()<>]+\)))*\)
)+
(?:
\(([^\s()<>]+|(\([^\s()<>]+\)))*\)
|
[^\s`!()\[\]{};:\'".,<>?«»“”‘’]
)
|
# BT Hash
(?:
[a-f0-9]{40}
)
)''', re.I | re.X)
# 40 hex characters, or 32 characters from the set a-z and 2-7.
re_bthash = re.compile(r'[0-9a-f]{40}|[a-z2-7]{32}', re.I)
# "N" or "N,M": `msgfromdb` uses N as the LIMIT and M as the OFFSET.
re_limit = re.compile(r'^([0-9]+)(,[0-9]+)?$')
# File extensions the 'isimg' template filter treats as images.
imgfmt = frozenset(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'))
49 |
def printname(first, last=''):
    '''Join first and last name with a space; return '' when the result is empty.'''
    if last:
        full = first + ' ' + last
    else:
        full = first
    return full or ''
51 |
def strftime(date, fmt='%Y-%m-%d %H:%M:%S'):
    '''Format the Unix timestamp `date` in local time using `fmt`.'''
    local = time.localtime(date)
    return time.strftime(fmt, local)
53 |
def unkuser(user):
    '''
    Build a peer dict for a user that is only known from a bot-style user
    object (`id`, `first_name`, optional `last_name` and `username`).
    '''
    last_name = user.get('last_name')
    return {
        'peer_id': user['id'],
        'id': tgl_peer_id_t(tgl_peer_id_t.TGL_PEER_USER, user['id'], 0).dumps(),
        'first_name': user['first_name'],
        'last_name': last_name,
        'username': user.get('username'),
        'type': 'user',
        'flags': 256,
        'print': printname(user['first_name'], last_name)
    }
64 |
def unkmsg(mid):
    '''Return an empty placeholder message dict carrying only the id `mid`.'''
    placeholder = {'mid': mid}
    placeholder['src'] = {'id': 0, 'print': ''}
    placeholder['dest'] = {'id': 0, 'print': ''}
    placeholder.update({
        'text': '',
        'media': {},
        'date': 0,
        'msgtype': '',
        'extra': None,
        'out': 0,
        'unread': 0,
        'service': 0,
        'action': {},
        'flags': 0
    })
    return placeholder
80 |
def convert_msgid2(msgid):
    '''
    Normalise a stored message id to an integer.

    None and integers are returned unchanged; a value of length 48 is
    decoded with `tgl_message_id_t.loads` and its `id` field returned;
    anything else is converted with `int()`.
    '''
    if msgid is None or isinstance(msgid, int):
        return msgid
    if len(msgid) == 48:
        return tgl_message_id_t.loads(msgid).id
    return int(msgid)
90 |
91 | class tgl_peer_id_t(collections.namedtuple('tgl_peer_id_t', 'peer_type peer_id access_hash')):
92 | '''
93 | typedef struct {
94 | int peer_type;
95 | int peer_id;
96 | long long access_hash;
97 | } tgl_peer_id_t;
98 | '''
99 | TGL_PEER_USER = 1
100 | TGL_PEER_CHAT = 2
101 | TGL_PEER_GEO_CHAT = 3
102 | TGL_PEER_ENCR_CHAT = 4
103 | TGL_PEER_CHANNEL = 5
104 | TGL_PEER_TEMP_ID = 100
105 | TGL_PEER_RANDOM_ID = 101
106 | TGL_PEER_UNKNOWN = 0
107 |
108 | @classmethod
109 | def loads(cls, s):
110 | return cls._make(struct.unpack('= self.capacity:
175 | self.cache.popitem(last=False)
176 | self.cache[key] = value
177 |
class StreamArray(list):
    '''
    List subclass that iterates over a wrapped iterable instead of its own
    (empty) contents, so the items never have to be held in memory at once.
    '''

    def __init__(self, iterable):
        self.iterable = iterable

    def __iter__(self):
        # iter() returns an iterator unchanged and turns any other iterable
        # (list, tuple, ...) into one, as __iter__ is required to do.
        return iter(self.iterable)

    # according to the comment below
    def __len__(self):
        # Always reports a single element, whatever the iterable holds.
        return 1
188 |
class PeerStore(collections.UserDict):
    '''
    Peer dictionary keyed by `(peer_id, peer_type_name)`.

    Keys may be given in several forms (see `_convert`); they are
    normalised before every lookup or store.  `name` maps print names to
    normalised keys.
    '''

    def __init__(self, *args, **kwds):
        super().__init__(*args, **kwds)
        self.name = {}

    def __setitem__(self, key, value):
        self.data[self._convert(key)] = value

    def setname(self, key, value):
        '''Register print name `value` for the peer identified by `key`.'''
        self.name[value] = self._convert(key)

    def __getitem__(self, key):
        '''Return the peer for `key`, creating an empty placeholder if unknown.'''
        peerid, peertype = self._convert(key)
        try:
            return self.data[(peerid, peertype)]
        except KeyError:
            d = self.data[(peerid, peertype)] = {'id': peerid, 'type': peertype, 'print': ''}
            return d

    def find(self, key):
        '''
        Look a peer up by key, falling back to print names.

        If `key` cannot be converted to a peer key, an exact print name
        match is tried, then the first registered name containing `key`
        (encrypted chats excluded).  When nothing matches, a stub user dict
        with `id` None is returned.
        '''
        try:
            return self.__getitem__(key)
        except Exception:
            if key in self.name:
                return self.data[self.name[key]]
            else:
                for k, v in self.name.items():
                    if key in k and v[1] != 'encr_chat':
                        return self.data[v]
                return {'id': None, 'type': 'user', 'print': key}

    @staticmethod
    def _convert(key=None):
        '''
        Normalise `key` to a `(peer_id, type_name)` tuple.

        Accepted forms: `(id, type_name)` tuples, integers (a negative
        number is a chat, a value above 1 << 32 carries the peer type in
        its upper bits), numeric strings, "<type>#id<number>" strings,
        strings understood by `tgl_peer_id_t.loads`, and None (unknown id).
        '''
        peertype = None
        if isinstance(key, tuple):
            peerid, peertype = key
        else:
            peerid = key
        peer_id = None
        peer_type = tgl_peer_id_t.TGL_PEER_USER
        try:
            peerid = int(peerid)
        except (TypeError, ValueError):
            # int(None) raises TypeError, not ValueError; without catching
            # it the `peerid is None` branch below could never be reached.
            pass
        if isinstance(peerid, str):
            sp = peerid.split('#id', 1)
            if len(sp) == 2:
                peer_id = int(sp[1])
                peertype = sp[0]
            else:
                peer = tgl_peer_id_t.loads(peerid)
                peer_id = peer.peer_id
                peer_type = peer.peer_type
        elif peerid is None:
            pass
        elif peerid > 4294967296:
            # 1 << 32
            peer_id = peerid & 4294967295
            peer_type = peerid >> 32
        else:
            peer_id = abs(peerid)
            peer_type = tgl_peer_id_t.TGL_PEER_CHAT if peerid < 0 else tgl_peer_id_t.TGL_PEER_USER
        if peertype:
            return (peer_id, peertype)
        elif peer_type == tgl_peer_id_t.TGL_PEER_USER:
            return (peer_id, 'user')
        elif peer_type == tgl_peer_id_t.TGL_PEER_CHAT:
            return (peer_id, 'chat')
        elif peer_type == tgl_peer_id_t.TGL_PEER_ENCR_CHAT:
            return (peer_id, 'encr_chat')
        elif peer_type == tgl_peer_id_t.TGL_PEER_CHANNEL:
            return (peer_id, 'channel')
        # NOTE(review): any other peer type falls through and returns None.
262 |
263 | class Messages:
264 |
def __init__(self, stream=False, template='history.txt'):
    '''
    stream: if true, keep only the 100 most recently used messages
        (LRUCache) instead of all of them.
    template: name of the template stored on the instance.
    '''
    self.template = template
    self.stream = stream

    self.peers = PeerStore()
    self.msgs = LRUCache(100) if stream else collections.OrderedDict()

    # telegram-cli export database and bot log database
    self.db_cli = self.conn_cli = self.db_cli_ver = None
    self.db_bot = self.conn_bot = None

    self.limit = None
    self.hardlimit = None
    self.botdest = None

    # can be 'bot', 'cli' or None (no conversion)
    self.media_format = 'cli'
    self.cachedir = None
    self.urlprefix = None

    env = jinja2.Environment(loader=jinja2.FileSystemLoader('templates'))
    env.filters['strftime'] = strftime
    env.filters['autolink'] = autolink
    env.filters['isimg'] = lambda url: os.path.splitext(url)[1] in imgfmt
    env.filters['smartname'] = smartname
    self.jinjaenv = env
293 |
def init_db(self, filename, dbtype='cli', botuserdb=False, botdest=None):
    '''
    Open a database file and load its peers.

    filename: path of an existing SQLite database.
    dbtype: 'cli' for a telegram-cli export, 'bot' for a bot log.
    botuserdb: for 'bot', load users from the bot database even when a cli
        database is already open.
    botdest: for 'bot', the peer all bot messages are addressed to.

    Raises FileNotFoundError if `filename` is not a file, and KeyError if
    `botdest` cannot be resolved.
    '''
    if not os.path.isfile(filename):
        raise FileNotFoundError('Database not found: ' + filename)
    if dbtype == 'cli':
        self.db_cli = sqlite3.connect(filename)
        self.conn_cli = self.db_cli.cursor()
        # The schema version is recognised from the tables present.
        tables = self.conn_cli.execute("SELECT name, sql FROM sqlite_master WHERE type='table'")
        for name, sql in tables:
            if name == 'exportinfo':
                self.db_cli_ver = 1
                break
            if name == 'peerinfo':
                self.db_cli_ver = 2 if 'permanent_id' in sql else 3
                break
        self.userfromdb('cli')
    elif dbtype == 'bot':
        self.db_bot = sqlite3.connect(filename)
        self.conn_bot = self.db_bot.cursor()
        dest = self.botdest = self.peers.find(botdest)
        if dest['id'] is None:
            raise KeyError('peer not found: %s' % botdest)
        if dest['type'] == 'user':
            dest['type'] = 'chat'
        # self.botdest = tgl_peer_id_t.from_peer(self.botdest).to_id()
        self.botdest = (dest['id'], dest['type'])
        if botuserdb or not self.db_cli:
            self.userfromdb('bot')
324 |
def msgfromdb(self, dbtype='cli', peer=None):
    '''
    Yield messages from a database, oldest first.

    dbtype: 'cli' reads the telegram-cli export, 'bot' reads the bot log
        (requires `self.botdest`).
    peer: for 'cli', only yield messages sent from or to this peer.

    `self.limit` ("N" or "N,OFFSET") selects the newest N messages and is
    capped by `self.hardlimit` when that is set.  Each item is a 14-tuple
    (id, src, dest, text, media, date, fwd_src, fwd_date, reply_id, out,
    unread, service, action, flags).

    Raises ValueError (on first iteration) for an unusable `dbtype`.
    '''
    limit = ''
    if self.limit:
        match = re_limit.match(self.limit)
        count = int(match.group(1)) if match else None
        # `hardlimit` defaults to None; only clamp when it is set.
        if self.hardlimit is not None:
            count = self.hardlimit if count is None else min(count, self.hardlimit)
        if count is not None:
            limit = 'LIMIT %d' % count
            if match and match.group(2):
                limit += ' OFFSET %s' % match.group(2)[1:]
    if dbtype == 'cli':
        if peer:
            # The peer id representation depends on the schema version.
            if self.db_cli_ver == 1:
                if peer['type'] == 'user':
                    pid = peer['id']
                else:
                    pid = -peer['id']
            elif self.db_cli_ver == 2:
                pid = tgl_peer_id_t.from_peer(peer).dumps()
            else:
                pid = tgl_peer_id_t.from_peer(peer).to_id()
            c = self.conn_cli.execute('SELECT * FROM (SELECT id, src, dest, text, media, date, fwd_src, fwd_date, reply_id, out, unread, service, action, flags FROM messages WHERE src=? or dest=? ORDER BY date DESC, id DESC %s) ORDER BY date ASC, id ASC' % limit, (pid, pid))
        else:
            c = self.conn_cli.execute('SELECT * FROM (SELECT id, src, dest, text, media, date, fwd_src, fwd_date, reply_id, out, unread, service, action, flags FROM messages ORDER BY date DESC, id DESC %s) ORDER BY date ASC, id ASC' % limit)
        for mid, src, dest, text, media, date, fwd_src, fwd_date, reply_id, out, unread, service, action, flags in c:
            if self.media_format == 'bot':
                # NOTE(review): `media_cli2bot` is declared without `self`
                # in this class - confirm this bound call works.
                media, caption = self.media_cli2bot(media, action)
                text = text or caption
            yield convert_msgid2(mid), src, dest, text, media, date, fwd_src, fwd_date, convert_msgid2(reply_id), out, unread, service, action, flags
    elif dbtype == 'bot' and self.botdest:
        for mid, src, text, media, date, fwd_src, fwd_date, reply_id in self.conn_bot.execute('SELECT * FROM (SELECT id, src, text, media, date, fwd_src, fwd_date, reply_id FROM messages ORDER BY date DESC, id DESC %s) ORDER BY date ASC, id ASC' % limit):
            if self.media_format == 'cli':
                media, action = self.media_bot2cli(text, media)
            else:
                action = None
            yield mid, src, self.botdest, text, media, date, fwd_src, fwd_date, reply_id, 0, 0, bool(action), action, 256
    else:
        raise ValueError('dbtype or self.botdest is invalid')
365 |
def userfromdb(self, dbtype='cli'):
    '''
    Load peers from a database into `self.peers`.

    'cli': users, chats, channels (schema version above 1) and print names
    are read from the telegram-cli export.
    'bot': user names from the bot database are merged into the existing
    (or newly created) user entries.
    '''
    if dbtype == 'bot':
        rows = self.conn_bot.execute('SELECT id, username, first_name, last_name FROM users')
        for pid, username, first_name, last_name in rows:
            self.peers[(pid, 'user')].update({
                'id': pid,
                'username': username,
                'first_name': first_name,
                'last_name': last_name,
                'print': printname(first_name, last_name)
            })
        return
    if dbtype != 'cli':
        return

    cur = self.conn_cli
    for pid, phone, username, first_name, last_name, flags in cur.execute('SELECT id, phone, username, first_name, last_name, flags FROM users'):
        self.peers[(pid, 'user')] = {
            'id': pid,
            'type': 'user',
            'phone': phone,
            'username': username,
            'first_name': first_name,
            'last_name': last_name,
            'print': printname(first_name, last_name),
            'flags': flags
        }
    for pid, title, members_num, flags in cur.execute('SELECT id, title, members_num, flags FROM chats'):
        self.peers[(pid, 'chat')] = {
            'id': pid,
            'type': 'chat',
            'title': title,
            'members_num': members_num,
            'print': printname(title),
            'flags': flags
        }
    version = self.db_cli_ver
    if version > 1:
        for pid, title, members_num, admins_count, kicked_count, flags in cur.execute('SELECT id, title, participants_count, admins_count, kicked_count, flags FROM channels'):
            self.peers[(pid, 'channel')] = {
                'id': pid,
                'type': 'channel',
                'title': title,
                # keep compatible with chats
                'members_num': members_num,
                'admins_count': admins_count,
                'kicked_count': kicked_count,
                'print': printname(title),
                'flags': flags
            }
    # Table and id column holding the print names differ per version.
    name_queries = {
        1: 'SELECT id, print_name FROM exportinfo',
        2: 'SELECT permanent_id, print_name FROM peerinfo',
    }
    sql = name_queries.get(version, 'SELECT id, print_name FROM peerinfo')
    for pid, print_name in cur.execute(sql):
        self.peers.setname(pid, print_name)
418 |
def media_bot2cli(self, text, media=None, strict=False):
    '''
    Convert a bot-style media JSON string to the telegram-cli layout.

    text: message text; used as the caption of photos.
    media: JSON string of the bot media object, or a false value.
    strict: if true, file-like media is always typed 'document' and the
        original media fields are not copied over.

    Returns a pair `(media_json, action_json)`; each element is a JSON
    string, or None when there is nothing to report.
    '''
    if not media:
        return None, None
    media = json.loads(media)
    dm = {}  # converted media
    da = {}  # converted service action

    mt = None
    # NOTE(review): the concrete media type `mt` is only detected when a
    # cache directory is configured - confirm that is intended.
    if self.cachedir:
        mt = media.keys() & frozenset(('audio', 'document', 'sticker', 'video', 'voice'))
        file_id = None
        if mt:
            mt = mt.pop()
            file_id = media[mt]['file_id']
        elif 'photo' in media:
            # use the widest photo size
            file_id = max(media['photo'], key=lambda x: x['width'])['file_id']
        if file_id:
            # the first cached file whose name starts with the file id
            for fn in os.listdir(self.cachedir):
                if fn.startswith(file_id):
                    dm['url'] = self.urlprefix + fn
                    break

    if '_ircuser' in media:
        dm['_ircuser'] = media['_ircuser']
    if mt and not strict:
        dm.update(media[mt])

    if ('audio' in media or 'document' in media
        or 'sticker' in media or 'video' in media
        or 'voice' in media):
        if strict:
            dm['type'] = 'document'
        else:
            dm['type'] = mt or 'document'
    elif 'photo' in media:
        dm['type'] = 'photo'
        dm['caption'] = text or ''
    elif 'contact' in media:
        dm['type'] = 'contact'
        dm['phone'] = media['contact']['phone_number']
        dm['first_name'] = media['contact']['first_name']
        dm['last_name'] = media['contact'].get('last_name')
        dm['user_id'] = media['contact'].get('user_id')
    elif 'location' in media:
        dm['type'] = 'geo'
        dm['longitude'] = media['location']['longitude']
        dm['latitude'] = media['location']['latitude']
    elif 'venue' in media:
        dm['type'] = 'venue'
        dm['longitude'] = media['venue']['location']['longitude']
        dm['latitude'] = media['venue']['location']['latitude']
        # a non-empty venue title replaces the generic type
        if media['venue']['title']:
            dm['type'] = media['venue']['title']
        dm['address'] = media['venue']['address']
        if 'foursquare_id' in media['venue']:
            dm['provider'] = 'foursquare'
            dm['venue_id'] = media['venue']['foursquare_id']
    elif 'new_chat_participant' in media:
        user = media['new_chat_participant']
        da['type'] = 'chat_add_user'
        da['user'] = self.peers.get(user['id']) or unkuser(user)
    elif 'left_chat_participant' in media:
        user = media['left_chat_participant']
        da['type'] = 'chat_del_user'
        da['user'] = self.peers.get(user['id']) or unkuser(user)
    elif 'new_chat_title' in media:
        da['type'] = 'chat_rename'
        da['title'] = media['new_chat_title']
    elif 'new_chat_photo' in media:
        da['type'] = 'chat_change_photo'
    elif 'delete_chat_photo' in media:
        da['type'] = 'chat_delete_photo'
    elif 'group_chat_created' in media:
        da['type'] = 'chat_created'
        da['title'] = ''
    return json.dumps(dm) if dm else None, json.dumps(da) if da else None
495 |
def media_cli2bot(media=None, action=None):
    """Convert tg-cli style media/action JSON to Bot API style JSON.

    Args:
        media: JSON string of a tg-cli media object, or a falsy value.
        action: JSON string of a tg-cli service action, or a falsy value.

    Returns:
        A ``(media_json, caption)`` tuple.  ``media_json`` is a JSON string
        of Bot API style fields or ``None`` when nothing could be mapped;
        ``caption`` is the photo caption or ``None``.

    Media and action types with no Bot API counterpart are ignored.
    """
    action_map = {
        'chat_add_user': 'new_chat_participant',
        'chat_add_user_link': 'new_chat_participant',
        'chat_del_user': 'left_chat_participant',
        'chat_rename': 'new_chat_title',
        'chat_change_photo': 'new_chat_photo',
        'chat_delete_photo': 'delete_chat_photo',
        'chat_created': 'group_chat_created',
    }
    d = {}
    caption = None
    media = json.loads(media) if media else None
    action = json.loads(action) if action else None

    if media and 'type' in media:
        mtype = media['type']
        if mtype == 'photo':
            # A photo record may lack a caption; report None in that case.
            caption = media.get('caption')
            d['photo'] = []
        elif mtype in ('document', 'unsupported'):
            d['document'] = {}
        elif 'longitude' in media:
            # 'type' may be the name of the place
            loc = {
                'longitude': media['longitude'],
                'latitude': media['latitude'],
            }
            if mtype == 'geo':
                d['location'] = loc
            else:
                d['venue'] = {
                    'location': loc,
                    'title': mtype if mtype != 'venue' else '',
                    'address': media.get('address', ''),
                }
                if media.get('provider') == 'foursquare' and 'venue_id' in media:
                    d['venue']['foursquare_id'] = media['venue_id']
        elif mtype == 'contact':
            # Keep every field except 'type'; 'phone' is renamed.
            contact = {k: v for k, v in media.items()
                       if k not in ('type', 'phone')}
            contact['phone_number'] = media['phone']
            d['contact'] = contact
        # ignore other undefined types to Bot API

    if action and 'type' in action:
        # Unmapped action types resolve to '' and match no branch below.
        newname = action_map.get(action['type'], '')
        if newname.endswith('chat_participant'):
            user = action['user']
            d[newname] = {
                'id': user['id'],
                'first_name': user.get('first_name', ''),
                'last_name': user.get('last_name', ''),
                'username': user.get('username', ''),
            }
        elif newname == 'new_chat_title':
            d[newname] = action['title']
        elif newname == 'new_chat_photo':
            d[newname] = []
        elif newname in ('delete_chat_photo', 'group_chat_created'):
            d[newname] = True
        # ignore other undefined types to Bot API
    return json.dumps(d) if d else None, caption
565 |
def getmsgs(self, peer=None):
    """Yield ``(mid, msg)`` pairs for the stored messages of *peer*.

    Rows come from ``self.msgfromdb``; the tg-cli database is used when
    ``self.db_cli`` is truthy, otherwise the bot database.  Each yielded
    message dict is also stored in ``self.msgs`` under its id so that later
    replies can reference it.

    Args:
        peer: Peer dict with at least ``id`` and ``type``; it is consulted
            for filtering only when reading the tg-cli database.

    Yields:
        ``(mid, msg)`` where ``msg`` is a dict with the keys ``mid``,
        ``src``, ``dest``, ``text``, ``media``, ``date``, ``msgtype``,
        ``extra``, ``out``, ``unread``, ``service``, ``action``, ``flags``.
    """
    db = 'cli' if self.db_cli else 'bot'
    for (mid, src, dest, text, media, date, fwd_src, fwd_date, reply_id,
         out, unread, service, action, flags) in self.msgfromdb(db, peer):
        src = self.peers[src]
        dest = self.peers[dest]
        # For the tg-cli database keep only messages sent to the peer, or
        # private messages the peer (a user) sent to another user.
        if db != 'bot' and not (
                dest['id'] == peer['id'] or
                peer['type'] == 'user' and
                src['id'] == peer['id'] and dest['type'] == 'user'):
            continue
        if fwd_src:
            msgtype = 'fwd'
            extra = {'fwd_src': self.peers[fwd_src], 'fwd_date': fwd_date}
        elif reply_id:
            msgtype = 're'
            remsg = self.msgs.get(reply_id)
            if remsg is None:
                # Build the placeholder only when the target is not cached.
                remsg = unkmsg(reply_id)
            if remsg['msgtype'] == 're':
                # Drop the nested reply so reply chains do not recurse.
                remsg = remsg.copy()
                remsg['extra'] = None
            extra = {'reply': remsg}
        else:
            msgtype, extra = '', None
        media = json.loads(media or '{}')
        if db == 'bot' and '_ircuser' in media:
            # Override the name on a copy: the peer dict may be shared with
            # other messages, which must keep the sender's own name.
            src = src.copy()
            src['first_name'] = src['print'] = media['_ircuser']
        msg = {
            'mid': mid,
            'src': src,
            'dest': dest,
            'text': text or media.get('caption'),
            'media': media,
            'date': date,
            'msgtype': msgtype,
            'extra': extra,
            'out': out,
            'unread': unread,
            'service': service,
            'action': json.loads(action or '{}'),
            'flags': flags,
        }
        self.msgs[mid] = msg
        yield mid, msg
608 |
def render_peer(self, peer, name=None):
    """Render the history of *peer* with the configured template.

    A copy of *peer* (optionally renamed to *name*) and the current time
    are handed to the template.  In stream mode the messages are passed as
    a lazy generator; otherwise they are loaded up front and the first
    date, last date and message count are supplied as well.

    Yields:
        The chunks produced by the template's ``stream`` method.
    """
    shown = peer.copy()
    if name:
        shown['print'] = name
    context = {'peer': shown, 'gentime': time.time()}
    history = (entry[1] for entry in self.getmsgs(shown))
    if self.stream:
        context['msgs'] = history
    else:
        loaded = tuple(history)
        dates = [item['date'] for item in loaded]
        context['msgs'] = loaded
        # An empty history reports 0 for both ends of the date range.
        context['start'] = min(dates) if dates else 0
        context['end'] = max(dates) if dates else 0
        context['count'] = len(loaded)
    tmpl = self.jinjaenv.get_template(self.template)
    yield from tmpl.stream(**context)
630 |
def render_peer_json(self, peer, name=None):
    """Render the history of *peer* as JSON text, chunk by chunk.

    The encoded object has the keys ``peer`` (a copy of *peer*, renamed to
    *name* when given), ``gentime`` (current time) and ``msgs`` (the
    messages from ``getmsgs`` wrapped in ``StreamArray``).

    Yields:
        String chunks from ``json.JSONEncoder(indent=0).iterencode``.
    """
    shown = peer.copy()
    if name:
        shown['print'] = name
    encoder = json.JSONEncoder(indent=0)
    payload = {
        'peer': shown,
        'gentime': time.time(),
        'msgs': StreamArray(entry[1] for entry in self.getmsgs(shown)),
    }
    yield from encoder.iterencode(payload)
642 |
def autolink(text, img=True):
    """Wrap every URL-like match in *text* in HTML link markup.

    Each match of ``re_url`` is replaced as follows:

    * a match that also satisfies ``re_bthash`` becomes a magnet link;
    * when *img* is true and the match's file extension is listed in
      ``imgfmt``, it becomes a linked inline image;
    * anything else becomes a plain hyperlink.

    Text between matches is copied through unchanged.

    Args:
        text: Input string to scan.
        img: Whether image URLs are rendered as ``<img>`` elements.

    Returns:
        The string with link markup inserted.
    """
    ret = []
    lastpos = 0
    for match in re_url.finditer(text):
        start, end = match.span()
        url = text[start:end]
        # Every template takes exactly three values: the text preceding the
        # match, then the URL twice (link target and link body).
        if re_bthash.match(url):
            template = '%s<a href="magnet:?xt=urn:btih:%s">%s</a>'
        elif img and os.path.splitext(url)[1] in imgfmt:
            template = '%s<a href="%s"><img src="%s"></a>'
        else:
            template = '%s<a href="%s">%s</a>'
        ret.append(template % (text[lastpos:start], url, url))
        lastpos = end
    ret.append(text[lastpos:])
    return ''.join(ret)
658 |
def smartname(user, limit=20):
    """Return a display name for *user* no longer than *limit* characters.

    Users without a ``first_name`` get ``<Unknown>`` (the word is cut to
    ``limit - 2`` characters).  Otherwise the full name from ``printname``
    is used when it fits; if not, the first name is used, reduced to its
    first whitespace-separated word when it alone exceeds *limit*, and
    truncated to *limit* characters.
    """
    if 'first_name' not in user:
        placeholder = 'Unknown'[:limit - 2]
        return '<%s>' % placeholder
    given = user['first_name']
    family = user.get('last_name', '')
    full = printname(given, family)
    if len(full) <= limit:
        return full
    if len(given) <= limit:
        return given[:limit]
    # The first name alone is too long: keep only its first word.
    return given.split(None, 1)[0][:limit]
671 |
def main(argv):
    """Command-line entry point: export one peer's history.

    Parses *argv*, configures a ``Messages`` instance, loads the requested
    databases and renders the selected peer either to standard output
    (``-o -``) or to a file.  When no output path is given, the file name is
    derived from the peer type/id and the template.

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        KeyError: If the requested peer cannot be found.
    """
    parser = argparse.ArgumentParser(description="Format exported database file into human-readable format.")
    parser.add_argument("-o", "--output", help="output path")
    parser.add_argument("-d", "--db", help="tg-export database path", default="tg-export3.db")
    parser.add_argument("-b", "--botdb", help="tg-chatdig bot database path", default="")
    parser.add_argument("-D", "--botdb-dest", help="tg-chatdig bot logged chat id or tg-cli-style peer name")
    parser.add_argument("-u", "--botdb-user", action="store_true", help="use user information in tg-chatdig database first")
    parser.add_argument("-t", "--template", help="export template, can be 'txt'(default), 'html', 'json', or template file name", default="txt")
    parser.add_argument("-P", "--peer-print", help="set print name for the peer")
    parser.add_argument("-l", "--limit", help="limit the number of fetched messages and set the offset")
    parser.add_argument("-L", "--hardlimit", help="set a hard limit of the number of messages, must be used with -l", type=int, default=100000)
    parser.add_argument("-c", "--cachedir", help="the path of media files")
    parser.add_argument("-r", "--urlprefix", help="the url prefix of media files")
    parser.add_argument("peer", help="export certain peer id or tg-cli-style peer print name")
    args = parser.parse_args(argv)

    # Streaming mode is enabled for any template whose name ends in 'html'.
    msg = Messages(stream=args.template.endswith('html'))
    msg.limit = args.limit
    msg.hardlimit = args.hardlimit
    msg.cachedir = args.cachedir
    msg.urlprefix = args.urlprefix

    builtin_templates = {'html': 'simple.html', 'txt': 'history.txt'}
    render_func = msg.render_peer
    if args.template == 'json':
        render_func = msg.render_peer_json
    else:
        # Anything that is not a built-in short name is a template file name.
        msg.template = builtin_templates.get(args.template, args.template)

    if args.db:
        msg.init_db(args.db, 'cli')
    if args.botdb:
        msg.init_db(args.botdb, 'bot', args.botdb_user or not args.db, args.botdb_dest)
    peer = msg.peers.find(args.peer)
    if peer['id'] is None:
        raise KeyError('peer not found: %s' % args.peer)

    if args.output == '-':
        for ln in render_func(peer, args.peer_print):
            sys.stdout.write(ln)
        return

    fn = args.output
    if fn is None:
        fn = '%s#id%d' % (peer['type'], peer['id'])
        if args.template == 'json':
            fn += '.json'
        elif '.' in args.template:
            fn += os.path.splitext(args.template)[1]
        else:
            fn += '.' + args.template
    # Write UTF-8 explicitly so non-ASCII chat text does not depend on the
    # platform's default encoding.
    with open(fn, 'w', encoding='utf-8') as f:
        f.writelines(render_func(peer, args.peer_print))
725 |
# Script entry point: run main() with the command-line arguments (program
# name excluded) and pass its return value to sys.exit().
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
728 |
--------------------------------------------------------------------------------