├── .gitignore ├── EXPLANATIONS.md ├── LICENSE ├── README.rst ├── bin └── telegram-export ├── config.ini.example ├── requirements.txt ├── schema.png ├── setup.py └── telegram_export ├── __init__.py ├── __main__.py ├── downloader.py ├── dumper.py ├── exporter.py ├── formatters ├── __init__.py ├── baseformatter.py ├── htmlformatter.py ├── nlpformatter.py └── textformatter.py ├── tests.py ├── tests ├── __init__.py └── test_utils.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### Telethon ### 2 | *.session 3 | *.ini 4 | *.bin 5 | *.tl 6 | *.db 7 | usermedia 8 | 9 | ### Python ### 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | .static_storage/ 66 | .media/ 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # Environments 95 | .env 96 | .venv 97 | env/ 98 | venv/ 99 | ENV/ 100 | env.bak/ 101 | venv.bak/ 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ -------------------------------------------------------------------------------- /EXPLANATIONS.md: -------------------------------------------------------------------------------- 1 | Context ID 2 | ========== 3 | 4 | The ID of the place a message was sent. For Private Messages this is the 5 | ID of the User; similarly, the ID of the Chat (normal group) or Channel 6 | (which in turn includes supergroups) is used for those. 7 | 8 | Although it has never been the case in practice, IDs could collide between 9 | Users, Chats and Channels, so they are "marked" to differentiate them: 10 | User IDs are left as they are, Chat IDs are negated, and Channel IDs are 11 | prefixed with -100 to ensure that such a collision never occurs. 
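The marking described above can be sketched in a few lines of Python. This is an illustrative helper, not telegram-export's actual code (Telethon ships the real ID utilities):

```python
# Illustrative sketch of the ID-marking scheme described above; this is
# not telegram-export's actual code (Telethon provides the real utilities).

def mark_context_id(entity_id: int, kind: str) -> int:
    """Map a raw Telegram ID into one collision-free "context ID" space."""
    if kind == "user":
        return entity_id                         # User IDs are left as they are
    if kind == "chat":
        return -entity_id                        # Chat IDs are negated
    if kind == "channel":
        return -(1_000_000_000_000 + entity_id)  # i.e. a "-100" prefix
    raise ValueError("unknown entity kind: " + kind)
```

For example, channel 1132836449 becomes -1001132836449, the form that appears in whitelists and config files.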
12 | 13 | 14 | DateUpdated 15 | =========== 16 | 17 | Stored as seconds since epoch, this is used to address the issue of the 18 | exporter only "seeing" a snapshot of the state, and to store historical 19 | data. When, for example, a User changes their name, the old User will be 20 | kept, and the new User with updated details will have a newer DateUpdated. 21 | 22 | For this reason, DateUpdated and UserID together form the primary key of 23 | the User table. 24 | 25 | 26 | Invalidation time 27 | ================= 28 | 29 | Related to DateUpdated, this can be thought of as a cache invalidation time. 30 | When the dumper is run, it checks whether an entity (say, a User) has changed 31 | since the last export. If there has been a change, the new User is always 32 | saved. However, if the User is the same as in the last export, there is a 33 | trade-off. If the exporter saved only on updates, there would be an 34 | information gap between User updates: during analysis we could not tell 35 | whether a User was actually checked (and found unchanged) at times between 36 | the saved records. If, on the other hand, the exporter always saved the 37 | new User regardless of changes, there would be many redundant records that 38 | serve only to say that a User has not changed. The invalidation time is used 39 | to solve this: if the User has not changed, and the time since the last saved 40 | record is less than this time, the new User will not be saved. 41 | 42 | 43 | Various schema decisions 44 | ======================== 45 | 46 | * Message text can be null, since media with no caption have no text. 47 | * Message FromID can be null, since Channels provide no FromID. 48 | * Supergroups are artificially separated from Channels so as not to confuse 49 | people with technical details (to Telegram, they are the same thing). 
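The invalidation-time rule above amounts to a small predicate. The sketch below uses hypothetical names for illustration; it is not the dumper's actual API:

```python
# Hypothetical helper illustrating the invalidation-time rule described
# above; the names here are assumptions, not telegram-export's real API.

def should_save_entity(new_row, last_row, last_date_updated,
                       now, invalidation_secs):
    """Decide whether to write a new row for an entity (e.g. a User).

    All times are seconds since epoch, matching DateUpdated.
    """
    if last_row is None or new_row != last_row:
        return True  # real changes (and first sightings) are always saved
    # Unchanged entity: save anyway once the last record is older than the
    # invalidation time, so analysis can tell it was re-checked meanwhile.
    return now - last_date_updated >= invalidation_secs
```

With a one-hour invalidation time, an unchanged User dumped ten minutes after its last record is skipped, while the same unchanged User dumped two hours later is written again.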
50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. 
"Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. 
Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. 
Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. 
Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. 
Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. 
However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. 
Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. 
This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. 
Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
374 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | telegram-export 2 | =============== 3 | 4 | .. figure:: https://user-images.githubusercontent.com/15344581/43033282-3eff18fc-8ce5-11e8-9994-fd1de40268e1.png 5 | :alt: Logo 6 | 7 | A tool to download Telegram data (users, chats, messages, and media) 8 | into a database (and display the saved data). 9 | 10 | This project is currently archived. 11 | =================================== 12 | 13 | **The main owners no longer contribute to this repository, because Telegram updates and schema changes make it very tedious to maintain, and we have lost interest in the project.** 14 | 15 | **As an alternative, some clients like Telegram Desktop now have a built-in "Export chat history" feature (in the three-dots menu of every chat) which you may use as a replacement.** 16 | 17 | 18 | **Database schema:** 19 | 20 | .. figure:: https://user-images.githubusercontent.com/15344581/37377008-44c93d20-271f-11e8-8170-5d6071a21b8f.png 21 | :alt: Schema image 22 | 23 | Installation 24 | ============ 25 | 26 | The simplest way is to run ``sudo pip3 install --upgrade telegram_export``, 27 | after which telegram-export should simply be available as a command: ``telegram-export`` 28 | in the terminal. That's it! 29 | 30 | If you don't like using ``sudo pip``, you can use ``pip3 install --user telegram_export``, 31 | but you'll have to add something like ``~/.local/bin/`` to your $PATH to make 32 | the command available. If you don't want to add to PATH, you can also use 33 | ``python3 -m telegram_export`` anywhere instead of ``telegram-export``. You'll 34 | have a similar issue if you're using a virtualenv, but if you're using those 35 | you probably know what you're doing anyway :) 36 | 37 | Slow downloads? 
38 | --------------- 39 | 40 | You may also want to install ``cryptg`` with the same method for a speed 41 | boost when downloading media. Telegram requires a lot of encryption and 42 | decryption and this can make downloading files especially slow unless 43 | using a nice fast library like cryptg. One user reported a `speed 44 | increase of 45 | 1100% `__. 46 | 47 | Usage 48 | ===== 49 | 50 | First, copy config.ini.example (from GitHub) to ``~/.config/telegram-export/config.ini`` 51 | and edit some values. You'll probably need to create this folder. To write your 52 | config whitelist, you may want to refer to the output of 53 | ``telegram-export --list-dialogs`` to get dialog IDs or 54 | ``telegram-export --search `` to filter the results. 55 | 56 | Then run ``telegram-export`` and allow it to dump data. 57 | 58 | Full option listing: 59 | 60 | .. code:: 61 | 62 | usage: __main__.py [-h] [--list-dialogs] [--search-dialogs SEARCH_STRING] 63 | [--config-file CONFIG_FILE] [--contexts CONTEXTS] 64 | [--format {text,html}] [--download-past-media] 65 | 66 | Download Telegram data (users, chats, messages, and media) into a database 67 | (and display the saved data) 68 | 69 | optional arguments: 70 | -h, --help show this help message and exit 71 | --list-dialogs list dialogs and exit 72 | --search-dialogs SEARCH_STRING 73 | like --list-dialogs but searches for a dialog by 74 | name/username/phone 75 | --config-file CONFIG_FILE 76 | specify a config file. Default config.ini 77 | --contexts CONTEXTS list of contexts to act on eg --contexts=12345, 78 | @username (see example config whitelist for full 79 | rules). Overrides whitelist/blacklist. 80 | --format {text,html} formats the dumped messages with the specified 81 | formatter and exits. 82 | --download-past-media 83 | download past media instead of dumping new data (files 84 | that were seen before but not downloaded). 
85 | 86 | 87 | telegram-export vs `telegram-history-dump `__ 88 | ================================================================================================= 89 | 90 | *(For brevity we'll just refer to them as "export" and "dump")* 91 | 92 | - SQLite instead of jsonlines allows for far more powerful queries and 93 | better efficiency, but loses compatibility with text-manipulating UNIX 94 | tools (and even more powerful tools like `jq `__), as the data is 95 | not stored as text. 96 | 97 | - export's stored data is less complicated than dump's json dumps 98 | 99 | - Support for saving the history of a person or other dialog, so you 100 | can see e.g. what their name was over time. 101 | 102 | - Using `telethon `__ 103 | instead of `tg-cli `__ allows 104 | support for newer Telegram features like pinned messages, admin logs, 105 | user bios, first-class support for supergroups, and avoids the 106 | ``tg-cli`` bug which made dumping channels impossible, as well as 107 | several other ``tg-cli`` annoyances (such as being somewhat harder to 108 | install). 109 | 110 | - Newer and less mature than dump 111 | 112 | - No dedicated analysis program yet (dump has telegram-analysis and 113 | pisg) 114 | 115 | - Implemented features which dump does not support (incomplete list): 116 | 117 | - Admin logs 118 | - Dumping Users/Channels/Chats as their own entities, not just as 119 | message metadata. This allows things like user bios, channel 120 | descriptions and profile pictures. 121 | - Pinned messages (dump kind of supports this, but only by saving a 122 | message replying to the pinned message with text 'pinned the 123 | message') 124 | - Participant lists 125 | 126 | - Closer interaction with the Telegram API theoretically allows big 127 | speed improvements (practical comparison of times soon™) 128 | 129 | - export's database file is bound to a user (like dump), and the 130 | program will exit if you log in as another person to avoid mixing 131 | things up. 
If you do use export with multiple users, you should 132 | specify a different database for each user. You can easily select 133 | different config files through ``--config-file``. 134 | 135 | Limitations 136 | =========== 137 | 138 | - Still being worked on. It dumps things, but the schema may change and we 139 | won't support old schema transitions. 140 | 141 | - Relies on `Telethon `, which is still pre-1.0. 142 | 143 | - Certain information is not dumped for simplicity's sake. For example, 144 | edited messages won't be re-downloaded and there is currently no 145 | support for multiple versions of a message in the db. However, this 146 | shouldn't be much of an issue, since most edits or deletions are 147 | legit and often to fix typos. 148 | 149 | What does it do? Is it a bot? 150 | ============================= 151 | 152 | It uses the Telegram API (what Telegram apps use), so it has access to 153 | everything a Telegram app can do. This is why you need an API ID and API 154 | hash to use it, and why one from Telegram Desktop will work. Since 155 | normal clients need to download messages, media, users etc to display 156 | them in-app, telegram-export can do the same, and save them into a nice 157 | database. 158 | 159 | So no, it's not really a bot, but it does use the same technology as 160 | **userbots** in order to work. As far as we know, it won't get you banned from 161 | using Telegram or anything like that. 162 | 163 | Installation from source 164 | ======================== 165 | 166 | ``git clone`` this repository, then ``python3 setup.py install``. You should 167 | also read through the `Installation`_ section for related notes. 
168 | -------------------------------------------------------------------------------- /bin/telegram-export: -------------------------------------------------------------------------------- 1 | import runpy 2 | runpy.run_module("telegram_export", run_name="__main__", alter_sys=True) 3 | -------------------------------------------------------------------------------- /config.ini.example: -------------------------------------------------------------------------------- 1 | # Configuration file for telegram-export. Default values are commented. 2 | # This file should be copied to config.ini and edited. 3 | # Use ; or # for comment lines 4 | 5 | [TelegramAPI] 6 | 7 | ################ You MUST edit these values to use the exporter ################# 8 | # You can either get your own from my.telegram.org, or use a published one (easier): 9 | # https://github.com/telegramdesktop/tdesktop/blob/dev/Telegram/SourceFiles/config.h#L222 10 | # https://git.io/vADys (permanent link) 11 | ApiId = 12345e 12 | ApiHash = 0123456789abcdef0123456789abcdef 13 | 14 | # Your phone number. You must supply this. Should start with +country code eg +44. 15 | PhoneNumber = xxxxxxxxxx 16 | 17 | # You can store your 2FA Password here if you don't want to enter it when logging in. 18 | # It's more secure to not store your password in plain text 19 | # SecondFactorPassword = xxxxxxxxxx 20 | 21 | # This can be anything 22 | ; SessionName = exporter 23 | SessionName = exporter 24 | 25 | [Dumper] 26 | 27 | # Output folder, where the database, media, and cache files will be stored. 28 | # You can leave this as . to save to your current directory (when you call the 29 | # program), or put a path like ~/Downloads or /home/username/tg-export/. It is 30 | # usually a better idea to set a specific directory here. 31 | OutputDirectory = . 32 | 33 | # Either Whitelist or Blacklist should be present, not both. If both are 34 | # present, only Whitelist will be used. 
35 | # These are lists of "entities", which can be usernames, phone numbers, or 36 | # Telegram IDs. 37 | # - Whitelist will back up only the entities listed. 38 | # - Blacklist will back up everything except the ones listed. 39 | # It's usually a good idea to set a whitelist, as otherwise you will have 40 | # to wait for lots of dialogs you don't care about to be downloaded. You can 41 | # get a list of dialogs and their associated IDs by running 42 | # `telegram-export --list-dialogs` 43 | # The list must be a comma-separated list of entities. You may add an 44 | # optional comment on an entry by using the : symbol. For example, 45 | # 46 | # Blacklist = @username : this is a comment, -1001132836449 :another comment, 47 | # +12345678, +232525252 : the previous phone number had no comment. 48 | # Phones must start with '+' as shown. Usernames can be with or without @. 49 | Whitelist = @example, username, -1001132836449 50 | # Don't forget your commas! If you miss one after a comment, everything until 51 | # the next comma will be treated as a comment. 52 | 53 | 54 | ############################# 'Advanced Options' ############################# 55 | 56 | # The file types to download, comma-separated. Options are: 57 | # "photo", "document", "video", "audio", "sticker", "voice", "chatphoto". 58 | # An empty list (default if omitted) means all are allowed. 59 | # Note that "chatphoto" includes profile pictures as well. 60 | ; MediaWhitelist = chatphoto, photo, sticker 61 | MediaWhitelist = chatphoto, photo, sticker 62 | 63 | # The maximum allowed file size for a document before skipping it. 64 | # For instance, "800KB" will only download files smaller than or equal to 800KB. 65 | # Allowed units are "B", "KB", "MB" and "GB" (decimal point allowed). 66 | # 67 | # No unit defaults to "MB". Setting this to "0" will not download any media. 68 | # Note that this only applies to documents (everything but normal photos).
69 | ; MaxSize = 1MB 70 | MaxSize = 1MB 71 | 72 | # Sets the log level used across the entire dumper. Supported values 73 | # are the same as those available in the "logging" module. Defaults to INFO. 74 | # Available levels (less to more verbose): ERROR, WARNING, INFO, DEBUG, NOTSET 75 | ; LogLevel = INFO 76 | LogLevel = INFO 77 | 78 | # Database filename (without '.db') 79 | ; DBFileName = export 80 | DBFileName = export 81 | 82 | # The format string to be used when downloading media. You can use any literal 83 | # string you wish in the name, relative names (including directories, separated 84 | # by the '/' character) or absolute paths. Anything inside {} will be replaced 85 | # with a proper value, and possible placeholders are: 86 | # {sender_id} - Sender ID 87 | # {context_id} - Context ID 88 | # {name} - Sanitized name of the context (chat) 89 | # (unknown) - Sanitized name of the file 90 | # {sender_name} - Sanitized sender name 91 | # {type} - The media type (e.g. photo, document, video...) 92 | # 93 | # For instance, you could do: 94 | # MediaFilenameFmt = "usermedia/{name}/{type}/(unknown)" 95 | # 96 | # That would save files under "usermedia/Chat Name/media type/media file". 97 | # The extension will always be added automatically as a pair of ".ID.EXT". This 98 | # allows the program to ensure that duplicate files won't be downloaded even 99 | # if you change the format string at a later point. You shouldn't change this. 100 | # 101 | # To format the date of the message, you can use the format specifiers 102 | # described under the following link anywhere you wish: 103 | # https://docs.python.org/3.5/library/datetime.html#strftime-and-strptime-behavior 104 | # e.g. for the year you would put %Y in the format string, and for a literal % 105 | # you would put %%, though that would be a bit weird.
106 | ; MediaFilenameFmt = usermedia/{name}-{context_id}/{type}-(unknown) 107 | MediaFilenameFmt = usermedia/{name}-{context_id}/{type}-(unknown) 108 | 109 | # Time after which an unchanged user should be dumped anyway, to avoid a long 110 | # information gap (see EXPLANATIONS.md). In minutes. 111 | ; InvalidationTime = 7200 112 | InvalidationTime = 7200 113 | 114 | # Chunk size in which to retrieve messages. 100 (default, max) if not present. 115 | ; ChunkSize = 100 116 | 117 | # Maximum chunks to retrieve from a chat (if too many). 0 (default) means all. 118 | ; MaxChunks = 0 119 | 120 | # Sets the log level used across libraries (excluding the dumper). 121 | # Accepts the same values as LogLevel 122 | ; LibraryLogLevel = WARNING 123 | LibraryLogLevel = WARNING 124 | 125 | # Sets proxy support 126 | # Proxy = socks5://user:password@127.0.0.1:1080 127 | # Proxy = http://127.0.0.1:8080 128 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs 2 | telethon~=1.4.3 3 | tqdm 4 | PySocks 5 | -------------------------------------------------------------------------------- /schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/expectocode/telegram-export/ed1cbc6ac364ada137b3fc8f9ffb8170084a65a4/schema.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup for telegram-export""" 2 | 3 | from setuptools import setup, find_packages 4 | from codecs import open 5 | from os import path 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | with open("README.rst", "r") as readme: 10 | desc = readme.read() 11 | 12 | setup( 13 | name='telegram-export', 14 | license="MPL 2.0", 15 | version='1.8.2', 16 | description='A tool to download
Telegram data (users, chats, messages, ' 17 | 'and media) into a database (and display the saved data).', 18 | long_description=desc, 19 | url='https://github.com/expectocode/telegram-export', 20 | author='expectocode and Lonami', 21 | author_email='expectocode@gmail.com', 22 | classifiers=[ 23 | 'Development Status :: 4 - Beta', 24 | 'Intended Audience :: Developers', 25 | 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', 26 | 'Programming Language :: Python :: 3' 27 | ], 28 | keywords='Telegram messaging database', 29 | packages=find_packages(), 30 | install_requires=[ 31 | 'tqdm', 'telethon~=1.4.3', 'appdirs', 32 | 'async_generator' # Python 3.5 async gen support 33 | ], 34 | scripts=['bin/telegram-export'], 35 | test_suite='telegram_export.tests', 36 | project_urls={ 37 | 'Bug Reports': 'https://github.com/expectocode/telegram-export/issues', 38 | 'Source': 'https://github.com/expectocode/telegram-export' 39 | } 40 | ) 41 | -------------------------------------------------------------------------------- /telegram_export/__init__.py: -------------------------------------------------------------------------------- 1 | """Components for telegram-export""" 2 | from . import formatters, dumper, downloader, exporter 3 | -------------------------------------------------------------------------------- /telegram_export/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """The main telegram-export script. 3 | Handles arguments and config, then calls the Exporter. 
4 | """ 5 | import argparse 6 | import asyncio 7 | import configparser 8 | import difflib 9 | import logging 10 | import os 11 | import re 12 | from contextlib import suppress 13 | 14 | import tqdm 15 | import appdirs 16 | from telethon import TelegramClient, utils 17 | from telegram_export.utils import parse_proxy_str 18 | 19 | from telegram_export.dumper import Dumper 20 | from telegram_export.exporter import Exporter 21 | from telegram_export.formatters import NAME_TO_FORMATTER 22 | 23 | logger = logging.getLogger('') # Root logger 24 | 25 | 26 | NO_USERNAME = '' 27 | 28 | 29 | class TqdmLoggingHandler(logging.Handler): 30 | """Redirect all logging messages through tqdm.write()""" 31 | def emit(self, record): 32 | try: 33 | msg = self.format(record) 34 | tqdm.tqdm.write(msg) 35 | self.flush() 36 | except (KeyboardInterrupt, SystemExit): 37 | raise 38 | except: 39 | self.handleError(record) 40 | 41 | 42 | def load_config(filename): 43 | """Load config from the specified file and return the parsed config""" 44 | # Get a path to the file. If it was specified, it should be fine. 45 | # If it was not specified, assume it's config.ini in the script's dir. 46 | config_dir = appdirs.user_config_dir("telegram-export") 47 | 48 | if not filename: 49 | filename = os.path.join(config_dir, 'config.ini') 50 | 51 | if not os.path.isfile(filename): 52 | logger.warning("No config file! Make one in {} and find an example " 53 | "config at https://github.com/expectocode/" 54 | "telegram-export/blob/master/config.ini.example." 
" Alternatively, use --config-file FILE".format(filename)) 56 | exit(1) 57 | 58 | defaults = { 59 | 'SessionName': 'exporter', 60 | 'OutputDirectory': '.', 61 | 'MediaWhitelist': 'chatphoto, photo, sticker', 62 | 'MaxSize': '1MB', 63 | 'LogLevel': 'INFO', 64 | 'DBFileName': 'export', 65 | 'InvalidationTime': '7200', 66 | 'ChunkSize': '100', 67 | 'MaxChunks': '0', 68 | 'LibraryLogLevel': 'WARNING', 69 | 'MediaFilenameFmt': 'usermedia/{name}-{context_id}/{type}-(unknown)' 70 | } 71 | 72 | # Load from file 73 | config = configparser.ConfigParser(defaults) 74 | config.read(filename) 75 | 76 | # Check logging level (let it raise on invalid) 77 | level = config['Dumper'].get('LogLevel').upper() 78 | handler = TqdmLoggingHandler(level) 79 | handler.setFormatter(logging.Formatter( 80 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 81 | )) 82 | handler.setLevel(getattr(logging, level)) 83 | logger.addHandler(handler) 84 | logger.setLevel(getattr(logging, level)) 85 | # Library loggers 86 | level = config['Dumper'].get('LibraryLogLevel').upper() 87 | telethon_logger = logging.getLogger('telethon') 88 | telethon_logger.setLevel(getattr(logging, level)) 89 | telethon_logger.addHandler(handler) 90 | 91 | # Convert relative paths and paths with ~ 92 | config['Dumper']['OutputDirectory'] = os.path.abspath(os.path.expanduser( 93 | config['Dumper']['OutputDirectory'])) 94 | os.makedirs(config['Dumper']['OutputDirectory'], exist_ok=True) 95 | 96 | # Convert minutes to seconds 97 | config['Dumper']['InvalidationTime'] = str( 98 | config['Dumper'].getint('InvalidationTime', 7200) * 60) 99 | 100 | # Convert size to bytes 101 | max_size = config['Dumper'].get('MaxSize') 102 | m = re.match(r'(\d+(?:\.\d*)?)\s*([kmg]?b)?', max_size, re.IGNORECASE) 103 | if not m: 104 | raise ValueError('Invalid file size given for MaxSize') 105 | 106 | max_size = int(float(m.group(1)) * { 107 | 'B': 1024**0, 108 | 'KB': 1024**1, 109 | 'MB': 1024**2, 110 | 'GB': 1024**3, 111 |
}.get((m.group(2) or 'MB').upper())) 112 | config['Dumper']['MaxSize'] = str(max_size) 113 | return config 114 | 115 | 116 | def parse_args(): 117 | """Parse command-line arguments to the script""" 118 | parser = argparse.ArgumentParser(description="Download Telegram data (users, chats, messages, and media) into a database (and display the saved data)") 119 | parser.add_argument('--list-dialogs', action='store_true', 120 | help='list dialogs and exit') 121 | 122 | parser.add_argument('--search-dialogs', type=str, dest='search_string', 123 | help='like --list-dialogs but searches for a dialog ' 124 | 'by name/username/phone') 125 | 126 | parser.add_argument('--config-file', default=None, 127 | help='specify a config file. Default config.ini') 128 | # This None is handled in read_config. 129 | 130 | parser.add_argument('--contexts', type=str, 131 | help='list of contexts to act on eg --contexts=12345, ' 132 | '@username (see example config whitelist for ' 133 | 'full rules). Overrides whitelist/blacklist. ' 134 | 'The = is required when providing multiple values.') 135 | 136 | parser.add_argument('--format-contexts', type=int, nargs='+', 137 | help='list of contexts to format eg --format-contexts=' 138 | '12345 -1006789. Only ContextIDs are accepted, ' 139 | 'not usernames or phone numbers.') 140 | 141 | parser.add_argument('--format', type=str, 142 | help='formats the dumped messages with the specified ' 143 | 'formatter and exits. You probably want to use ' 144 | 'this in conjunction with --format-contexts.', 145 | choices=NAME_TO_FORMATTER) 146 | 147 | parser.add_argument('--download-past-media', action='store_true', 148 | help='download past media instead of dumping ' 149 | 'new data (files that were seen before ' 150 | 'but not downloaded).') 151 | 152 | parser.add_argument('--proxy', type=str, dest='proxy_string', 153 | help='set proxy string. ' 154 | 'Examples: socks5://user:password@127.0.0.1:1080. 
' 155 | 'http://localhost:8080') 156 | return parser.parse_args() 157 | 158 | 159 | def fmt_dialog(dialog, id_pad=0, username_pad=0): 160 | """ 161 | Space-fill a row with given padding values 162 | to ensure alignment when printing dialogs. 163 | """ 164 | username = getattr(dialog.entity, 'username', None) 165 | username = '@' + username if username else NO_USERNAME 166 | return '{:<{id_pad}} | {:<{username_pad}} | {}'.format( 167 | utils.get_peer_id(dialog.entity), username, dialog.name, 168 | id_pad=id_pad, username_pad=username_pad 169 | ) 170 | 171 | 172 | def find_fmt_dialog_padding(dialogs): 173 | """ 174 | Find the correct amount of space padding 175 | to give dialogs when printing them. 176 | """ 177 | no_username = NO_USERNAME[:-1] # Account for the added '@' if username 178 | return ( 179 | max(len(str(utils.get_peer_id(dialog.entity))) for dialog in dialogs), 180 | max(len(getattr(dialog.entity, 'username', no_username) or no_username) 181 | for dialog in dialogs) + 1 182 | ) 183 | 184 | 185 | def find_dialog(dialogs, query, top=25, threshold=0.7): 186 | """ 187 | Iterate through dialogs and return, sorted, 188 | the best matches for a given query. 189 | """ 190 | seq = difflib.SequenceMatcher(b=query, autojunk=False) 191 | scores = [] 192 | for index, dialog in enumerate(dialogs): 193 | seq.set_seq1(dialog.name) 194 | name_score = seq.ratio() 195 | if query.lower() in dialog.name.lower(): 196 | # If query is a substring of the name, make it a good match. 197 | # Slightly boost dialogs which were recently active, so not 198 | # all substring-matched dialogs have exactly the same score. 
199 | boost = (index/len(dialogs))/25 200 | name_score = max(name_score, 0.75 + boost) 201 | if getattr(dialog.entity, 'username', None): 202 | seq.set_seq1(dialog.entity.username) 203 | username_score = seq.ratio() 204 | else: 205 | username_score = 0 206 | if getattr(dialog.entity, 'phone', None): 207 | seq.set_seq1(dialog.entity.phone) 208 | phone_score = seq.ratio() 209 | else: 210 | phone_score = 0 211 | 212 | scores.append((dialog, max(name_score, username_score, phone_score))) 213 | scores.sort(key=lambda t: t[1], reverse=True) 214 | matches = tuple(score[0] for score in scores if score[1] > threshold) 215 | num_not_shown = 0 if len(matches) <= top else len(matches) - top 216 | return matches[:top], num_not_shown 217 | 218 | 219 | async def list_or_search_dialogs(args, client): 220 | """List the user's dialogs and/or search them for a query""" 221 | dialogs = (await client.get_dialogs(limit=None))[::-1] # Oldest to newest 222 | if args.list_dialogs: 223 | id_pad, username_pad = find_fmt_dialog_padding(dialogs) 224 | for dialog in dialogs: 225 | print(fmt_dialog(dialog, id_pad, username_pad)) 226 | 227 | if args.search_string: 228 | print('Searching for "{}"...'.format(args.search_string)) 229 | found, num_not_shown = find_dialog(dialogs, args.search_string) 230 | if not found: 231 | print('Found no good results with "{}".'.format(args.search_string)) 232 | elif len(found) == 1: 233 | print('Top match:', fmt_dialog(found[0]), sep='\n') 234 | else: 235 | if num_not_shown > 0: 236 | print('Showing top {} matches of {}:'.format( 237 | len(found), len(found) + num_not_shown)) 238 | else: 239 | print('Showing top {} matches:'.format(len(found))) 240 | id_pad, username_pad = find_fmt_dialog_padding(found) 241 | for dialog in found: 242 | print(fmt_dialog(dialog, id_pad, username_pad)) 243 | 244 | await client.disconnect() 245 | 246 | 247 | async def main(loop): 248 | """ 249 | The main telegram-export program. 
Goes through the 250 | configured dialogs and dumps them into the database. 251 | """ 252 | args = parse_args() 253 | config = load_config(args.config_file) 254 | dumper = Dumper(config['Dumper']) 255 | 256 | if args.contexts: 257 | dumper.config['Whitelist'] = args.contexts 258 | 259 | if args.format: 260 | formatter = NAME_TO_FORMATTER[args.format](dumper.conn) 261 | fmt_contexts = args.format_contexts or formatter.iter_context_ids() 262 | for cid in fmt_contexts: 263 | formatter.format(cid, config['Dumper']['OutputDirectory']) 264 | return 265 | 266 | proxy = args.proxy_string or dumper.config.get('Proxy') 267 | if proxy: 268 | proxy = parse_proxy_str(proxy) 269 | 270 | absolute_session_name = os.path.join( 271 | config['Dumper']['OutputDirectory'], 272 | config['TelegramAPI']['SessionName'] 273 | ) 274 | if config.has_option('TelegramAPI', 'SecondFactorPassword'): 275 | client = await (TelegramClient( 276 | absolute_session_name, 277 | config['TelegramAPI']['ApiId'], 278 | config['TelegramAPI']['ApiHash'], 279 | loop=loop, 280 | proxy=proxy 281 | ).start(config['TelegramAPI']['PhoneNumber'], password=config['TelegramAPI']['SecondFactorPassword'])) 282 | else: 283 | client = await (TelegramClient( 284 | absolute_session_name, 285 | config['TelegramAPI']['ApiId'], 286 | config['TelegramAPI']['ApiHash'], 287 | loop=loop, 288 | proxy=proxy 289 | ).start(config['TelegramAPI']['PhoneNumber'])) 290 | 291 | if args.list_dialogs or args.search_string: 292 | return await list_or_search_dialogs(args, client) 293 | 294 | exporter = Exporter(client, config, dumper, loop) 295 | 296 | try: 297 | if args.download_past_media: 298 | await exporter.download_past_media() 299 | else: 300 | await exporter.start() 301 | except asyncio.CancelledError: 302 | # This should be triggered on KeyboardInterrupt's to prevent ugly 303 | # traceback from reaching the user. 
Important code that always 304 | # must run (such as the Downloader saving resume info) should go 305 | # in their respective `finally:` blocks to ensure it gets called. 306 | pass 307 | finally: 308 | await exporter.close() 309 | 310 | exporter.logger.info("Finished!") 311 | 312 | 313 | if __name__ == '__main__': 314 | loop = asyncio.get_event_loop() 315 | try: 316 | ret = loop.run_until_complete(main(loop)) or 0 317 | except KeyboardInterrupt: 318 | ret = 1 319 | for task in asyncio.Task.all_tasks(): 320 | task.cancel() 321 | # Now we should await the task to run its cancellation. 322 | # A cancelled task raises asyncio.CancelledError, which we can suppress: 323 | if hasattr(task._coro, '__name__') and task._coro.__name__ == 'main': 324 | continue 325 | with suppress(asyncio.CancelledError): 326 | loop.run_until_complete(task) 327 | loop.stop() 328 | loop.close() 329 | exit(ret) 330 | -------------------------------------------------------------------------------- /telegram_export/downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import asyncio 3 | import datetime 4 | import itertools 5 | import logging 6 | import os 7 | import time 8 | from collections import defaultdict 9 | 10 | import tqdm 11 | from telethon import utils 12 | from telethon.errors import ChatAdminRequiredError 13 | from telethon.tl import types, functions 14 | 15 | from . import utils as export_utils 16 | 17 | __log__ = logging.getLogger(__name__) 18 | 19 | 20 | VALID_TYPES = { 21 | 'photo', 'document', 'video', 'audio', 'sticker', 'voice', 'chatphoto' 22 | } 23 | BAR_FORMAT = "{l_bar}{bar}| {n_fmt}/{total_fmt} " \ 24 | "[{elapsed}<{remaining}, {rate_noinv_fmt}{postfix}]" 25 | 26 | 27 | QUEUE_TIMEOUT = 5 28 | DOWNLOAD_PART_SIZE = 256 * 1024 29 | 30 | # How long should we sleep between these requests? These numbers 31 | # should be tuned to adjust (n requests/time spent + flood time).
32 | USER_FULL_DELAY = 1.5 33 | CHAT_FULL_DELAY = 1.5 34 | MEDIA_DELAY = 3.0 35 | HISTORY_DELAY = 1.0 36 | 37 | 38 | class Downloader: 39 | """ 40 | Download dialogs and their associated data, and dump them. 41 | Make Telegram API requests and sleep for the appropriate time. 42 | """ 43 | def __init__(self, client, config, dumper, loop): 44 | self.client = client 45 | self.loop = loop or asyncio.get_event_loop() 46 | self.max_size = config.getint('MaxSize') 47 | self.types = {x.strip().lower() 48 | for x in (config.get('MediaWhitelist') or '').split(',') 49 | if x.strip()} 50 | self.media_fmt = os.path.join(config['OutputDirectory'], 51 | config['MediaFilenameFmt']) 52 | assert all(x in VALID_TYPES for x in self.types) 53 | if self.types: 54 | self.types.add('unknown') # Always allow "unknown" media types 55 | 56 | self.dumper = dumper 57 | self._checked_entity_ids = set() 58 | self._media_bar = None 59 | 60 | # To get around the fact we always rely on the database to download 61 | # media (which simplifies certain operations and ensures that the 62 | # resulting filename are always the same) but this (the db) might not 63 | # have some entities dumped yet, we save the only needed information 64 | # in memory for every dump, that is, {peer_id: display}. 65 | self._displays = {} 66 | 67 | # This field keeps track of the download in progress if any, so that 68 | # partially downloaded files can be deleted. Only one file can be 69 | # downloaded at any given time, so using a set here makes no sense. 70 | self._incomplete_download = None 71 | 72 | # We're gonna need a few queues if we want to do things concurrently. 73 | # None values should be inserted to notify that the dump has finished. 74 | self._media_queue = asyncio.Queue() 75 | self._user_queue = asyncio.Queue() 76 | self._chat_queue = asyncio.Queue() 77 | self._running = False 78 | 79 | def _check_media(self, media): 80 | """ 81 | Checks whether the given MessageMedia should be downloaded or not. 
82 | """ 83 | if not media or not self.max_size: 84 | return False 85 | if not self.types: 86 | return True 87 | return export_utils.get_media_type(media) in self.types 88 | 89 | def _dump_full_entity(self, entity): 90 | """ 91 | Dumps the full entity into the Dumper, also enqueuing their profile 92 | photo if any so it can be downloaded later by a different coroutine. 93 | Supply None as the photo_id if self.types is empty or 'chatphoto' is 94 | not in self.types 95 | """ 96 | if isinstance(entity, types.UserFull): 97 | if not self.types or 'chatphoto' in self.types: 98 | photo_id = self.dumper.dump_media(entity.profile_photo) 99 | else: 100 | photo_id = None 101 | self.enqueue_photo(entity.profile_photo, photo_id, entity.user) 102 | self.dumper.dump_user(entity, photo_id=photo_id) 103 | 104 | elif isinstance(entity, types.Chat): 105 | if not self.types or 'chatphoto' in self.types: 106 | photo_id = self.dumper.dump_media(entity.photo) 107 | else: 108 | photo_id = None 109 | self.enqueue_photo(entity.photo, photo_id, entity) 110 | self.dumper.dump_chat(entity, photo_id=photo_id) 111 | 112 | elif isinstance(entity, types.messages.ChatFull): 113 | if not self.types or 'chatphoto' in self.types: 114 | photo_id = self.dumper.dump_media(entity.full_chat.chat_photo) 115 | else: 116 | photo_id = None 117 | chat = next( 118 | x for x in entity.chats if x.id == entity.full_chat.id 119 | ) 120 | self.enqueue_photo(entity.full_chat.chat_photo, photo_id, chat) 121 | if chat.megagroup: 122 | self.dumper.dump_supergroup(entity.full_chat, chat, 123 | photo_id) 124 | else: 125 | self.dumper.dump_channel(entity.full_chat, chat, photo_id) 126 | 127 | def _dump_messages(self, messages, target): 128 | """ 129 | Helper method to iterate the messages from a GetMessageHistoryRequest 130 | and dump them into the Dumper, mostly to avoid excessive nesting. 131 | 132 | Also enqueues any media to be downloaded later by a different coroutine. 
133 | """ 134 | for m in messages: 135 | if isinstance(m, types.Message): 136 | media_id = self.dumper.dump_media(m.media) 137 | if media_id and self._check_media(m.media): 138 | self.enqueue_media( 139 | media_id, utils.get_peer_id(target), m.from_id, m.date 140 | ) 141 | 142 | self.dumper.dump_message( 143 | message=m, 144 | context_id=utils.get_peer_id(target), 145 | forward_id=self.dumper.dump_forward(m.fwd_from), 146 | media_id=media_id 147 | ) 148 | elif isinstance(m, types.MessageService): 149 | if isinstance(m.action, types.MessageActionChatEditPhoto): 150 | media_id = self.dumper.dump_media(m.action.photo) 151 | self.enqueue_photo(m.action.photo, media_id, target, 152 | peer_id=m.from_id, date=m.date) 153 | else: 154 | media_id = None 155 | self.dumper.dump_message_service( 156 | message=m, 157 | context_id=utils.get_peer_id(target), 158 | media_id=media_id 159 | ) 160 | 161 | def _dump_admin_log(self, events, target): 162 | """ 163 | Helper method to iterate the events from a GetAdminLogRequest 164 | and dump them into the Dumper, mostly to avoid excessive nesting. 165 | 166 | Also enqueues any media to be downloaded later by a different coroutine. 
167 | """ 168 | for event in events: 169 | assert isinstance(event, types.ChannelAdminLogEvent) 170 | if isinstance(event.action, 171 | types.ChannelAdminLogEventActionChangePhoto): 172 | media_id1 = self.dumper.dump_media(event.action.new_photo) 173 | media_id2 = self.dumper.dump_media(event.action.prev_photo) 174 | self.enqueue_photo(event.action.new_photo, media_id1, target, 175 | peer_id=event.user_id, date=event.date) 176 | self.enqueue_photo(event.action.prev_photo, media_id2, target, 177 | peer_id=event.user_id, date=event.date) 178 | else: 179 | media_id1 = None 180 | media_id2 = None 181 | self.dumper.dump_admin_log_event( 182 | event, utils.get_peer_id(target), media_id1, media_id2 183 | ) 184 | return min(e.id for e in events) 185 | 186 | def _get_name(self, peer_id): 187 | if peer_id is None: 188 | return '' 189 | 190 | name = self._displays.get(peer_id) 191 | if name: 192 | return name 193 | 194 | c = self.dumper.conn.cursor() 195 | _, kind = utils.resolve_id(peer_id) 196 | if kind == types.PeerUser: 197 | row = c.execute('SELECT FirstName, LastName FROM User ' 198 | 'WHERE ID = ?', (peer_id,)).fetchone() 199 | if row: 200 | return '{} {}'.format(row[0] or '', 201 | row[1] or '').strip() 202 | elif kind == types.PeerChat: 203 | row = c.execute('SELECT Title FROM Chat ' 204 | 'WHERE ID = ?', (peer_id,)).fetchone() 205 | if row: 206 | return row[0] 207 | elif kind == types.PeerChannel: 208 | row = c.execute('SELECT Title FROM Channel ' 209 | 'WHERE ID = ?', (peer_id,)).fetchone() 210 | if row: 211 | return row[0] 212 | row = c.execute('SELECT Title FROM Supergroup ' 213 | 'WHERE ID = ?', (peer_id,)).fetchone() 214 | if row: 215 | return row[0] 216 | return '' 217 | 218 | async def _download_media(self, media_id, context_id, sender_id, date, 219 | bar): 220 | media_row = self.dumper.conn.execute( 221 | 'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name, Size ' 222 | 'FROM Media WHERE ID = ?', (media_id,) 223 | ).fetchone() 224 | # Documents have 
attributes and they're saved under the "document" 225 | # namespace so we need to split it before actually comparing. 226 | media_type = media_row[3].split('.') 227 | media_type, media_subtype = media_type[0], media_type[-1] 228 | if media_type not in ('photo', 'document'): 229 | return # Only photos or documents are actually downloadable 230 | 231 | formatter = defaultdict( 232 | str, 233 | context_id=context_id, 234 | sender_id=sender_id, 235 | type=media_subtype or 'unknown', 236 | name=self._get_name(context_id) or 'unknown', 237 | sender_name=self._get_name(sender_id) or 'unknown' 238 | ) 239 | 240 | # Documents might have a filename, which may have an extension. Use 241 | # the extension from the filename if any (more accurate than mime). 242 | ext = None 243 | filename = media_row[5] 244 | if filename: 245 | filename, ext = os.path.splitext(filename) 246 | else: 247 | # No filename at all, set a sensible default filename 248 | filename = date.strftime( 249 | '{}_%Y-%m-%d_%H-%M-%S'.format(formatter['type']) 250 | ) 251 | 252 | # The saved media didn't have a filename and we set our own. 253 | # Detect a sensible extension from the known mimetype. 
254 | if not ext: 255 | ext = export_utils.get_extension(media_row[4]) 256 | 257 | # Apply the date to the user format string and then replace the map 258 | formatter['filename'] = filename 259 | filename = date.strftime(self.media_fmt).format_map(formatter) 260 | filename += '.{}{}'.format(media_id, ext) 261 | if os.path.isfile(filename): 262 | __log__.debug('Skipping already-existing file %s', filename) 263 | return 264 | 265 | __log__.debug('Downloading to %s', filename) 266 | os.makedirs(os.path.dirname(filename), exist_ok=True) 267 | if media_type == 'document': 268 | location = types.InputDocumentFileLocation( 269 | id=media_row[0], 270 | version=media_row[1], 271 | access_hash=media_row[2] 272 | ) 273 | else: 274 | location = types.InputFileLocation( 275 | local_id=media_row[0], 276 | volume_id=media_row[1], 277 | secret=media_row[2] 278 | ) 279 | 280 | def progress(saved, total): 281 | """Increment the tqdm progress bar""" 282 | if total is None: 283 | # No size was found so the bar total wasn't incremented before 284 | bar.total += saved 285 | bar.update(saved) 286 | elif saved == total: 287 | # Downloaded the last bit (which is probably <> part size) 288 | mod = (saved % DOWNLOAD_PART_SIZE) or DOWNLOAD_PART_SIZE 289 | bar.update(mod) 290 | else: 291 | # All chunks are of the same size and this isn't the last one 292 | bar.update(DOWNLOAD_PART_SIZE) 293 | 294 | if media_row[6] is not None: 295 | bar.total += media_row[6] 296 | 297 | self._incomplete_download = filename 298 | await self.client.download_file( 299 | location, file=filename, file_size=media_row[6], 300 | part_size_kb=DOWNLOAD_PART_SIZE // 1024, 301 | progress_callback=progress 302 | ) 303 | self._incomplete_download = None 304 | 305 | async def _media_consumer(self, queue, bar): 306 | while self._running: 307 | start = time.time() 308 | media_id, context_id, sender_id, date = await queue.get() 309 | await self._download_media(media_id, context_id, sender_id, 310 | 
datetime.datetime.utcfromtimestamp(date), 311 | bar) 312 | queue.task_done() 313 | await asyncio.sleep(max(MEDIA_DELAY - (time.time() - start), 0), 314 | loop=self.loop) 315 | 316 | async def _user_consumer(self, queue, bar): 317 | while self._running: 318 | start = time.time() 319 | self._dump_full_entity(await self.client( 320 | functions.users.GetFullUserRequest(await queue.get()) 321 | )) 322 | queue.task_done() 323 | bar.update(1) 324 | await asyncio.sleep(max(USER_FULL_DELAY - (time.time() - start), 0), 325 | loop=self.loop) 326 | 327 | async def _chat_consumer(self, queue, bar): 328 | while self._running: 329 | start = time.time() 330 | chat = await queue.get() 331 | if isinstance(chat, (types.Chat, types.PeerChat)): 332 | self._dump_full_entity(chat) 333 | else: # isinstance(chat, (types.Channel, types.PeerChannel)): 334 | self._dump_full_entity(await self.client( 335 | functions.channels.GetFullChannelRequest(chat) 336 | )) 337 | queue.task_done() 338 | bar.update(1) 339 | await asyncio.sleep(max(CHAT_FULL_DELAY - (time.time() - start), 0), 340 | loop=self.loop) 341 | 342 | def enqueue_entities(self, entities): 343 | """ 344 | Enqueues the given iterable of entities to be dumped later by a 345 | different coroutine. These in turn might enqueue profile photos. 
346 | """ 347 | for entity in entities: 348 | eid = utils.get_peer_id(entity) 349 | self._displays[eid] = utils.get_display_name(entity) 350 | if isinstance(entity, types.User): 351 | if entity.deleted or entity.min: 352 | continue # Empty name would cause IntegrityError 353 | elif isinstance(entity, types.Channel): 354 | if entity.left: 355 | continue # Getting full info triggers ChannelPrivateError 356 | elif not isinstance(entity, (types.Chat, 357 | types.InputPeerUser, 358 | types.InputPeerChat, 359 | types.InputPeerChannel)): 360 | # Drop UserEmpty, ChatEmpty, ChatForbidden and ChannelForbidden 361 | continue 362 | 363 | if eid in self._checked_entity_ids: 364 | continue 365 | else: 366 | self._checked_entity_ids.add(eid) 367 | if isinstance(entity, (types.User, types.InputPeerUser)): 368 | self._user_queue.put_nowait(entity) 369 | else: 370 | self._chat_queue.put_nowait(entity) 371 | 372 | def enqueue_media(self, media_id, context_id, sender_id, date): 373 | """ 374 | Enqueues the given message or media from the given context entity 375 | to be downloaded later. If the ID of the message is known it should 376 | be set in known_id. The media won't be enqueued unless its download 377 | is desired. 378 | """ 379 | if not date: 380 | date = int(time.time()) 381 | elif not isinstance(date, int): 382 | date = int(date.timestamp()) 383 | self._media_queue.put_nowait((media_id, context_id, sender_id, date)) 384 | 385 | def enqueue_photo(self, photo, photo_id, context, 386 | peer_id=None, date=None): 387 | if not photo_id: 388 | return 389 | if not isinstance(context, int): 390 | context = utils.get_peer_id(context) 391 | if peer_id is None: 392 | peer_id = context 393 | if date is None: 394 | date = getattr(photo, 'date', None) or datetime.datetime.now() 395 | self.enqueue_media(photo_id, context, peer_id, date) 396 | 397 | async def start(self, target_id): 398 | """ 399 | Starts the dump with the given target ID. 
400 | """ 401 | self._running = True 402 | self._incomplete_download = None 403 | target_in = await self.client.get_input_entity(target_id) 404 | target = await self.client.get_entity(target_in) 405 | target_id = utils.get_peer_id(target) 406 | 407 | found = self.dumper.get_message_count(target_id) 408 | chat_name = utils.get_display_name(target) 409 | msg_bar = tqdm.tqdm(unit=' messages', desc=chat_name, 410 | initial=found, bar_format=BAR_FORMAT) 411 | ent_bar = tqdm.tqdm(unit=' entities', desc='entities', 412 | bar_format=BAR_FORMAT, postfix={'chat': chat_name}) 413 | med_bar = tqdm.tqdm(unit='B', desc='media', unit_divisor=1000, 414 | unit_scale=True, bar_format=BAR_FORMAT, 415 | total=0, postfix={'chat': chat_name}) 416 | # Divisor is 1000 not 1024 since tqdm puts a K not a Ki 417 | 418 | asyncio.ensure_future(self._user_consumer(self._user_queue, ent_bar), 419 | loop=self.loop) 420 | asyncio.ensure_future(self._chat_consumer(self._chat_queue, ent_bar), 421 | loop=self.loop) 422 | asyncio.ensure_future(self._media_consumer(self._media_queue, med_bar), 423 | loop=self.loop) 424 | 425 | self.enqueue_entities(self.dumper.iter_resume_entities(target_id)) 426 | for mid, sender_id, date in self.dumper.iter_resume_media(target_id): 427 | self.enqueue_media(mid, target_id, sender_id, date) 428 | 429 | try: 430 | self.enqueue_entities((target,)) 431 | ent_bar.total = len(self._checked_entity_ids) 432 | req = functions.messages.GetHistoryRequest( 433 | peer=target_in, 434 | offset_id=0, 435 | offset_date=None, 436 | add_offset=0, 437 | limit=self.dumper.chunk_size, 438 | max_id=0, 439 | min_id=0, 440 | hash=0 441 | ) 442 | 443 | can_get_participants = ( 444 | isinstance(target_in, types.InputPeerChat) 445 | or (isinstance(target, types.Channel) 446 | and (target.megagroup or target.admin_rights is not None)) 447 | ) 448 | if can_get_participants: 449 | try: 450 | __log__.info('Getting participants...') 451 | participants = await self.client.get_participants(target_in) 
452 | added, removed = self.dumper.dump_participants_delta( 453 | target_id, ids=[x.id for x in participants] 454 | ) 455 | __log__.info('Saved %d new members, %d left the chat.', 456 | len(added), len(removed)) 457 | except ChatAdminRequiredError: 458 | __log__.info('Getting participants aborted (admin ' 459 | 'rights revoked while getting them).') 460 | 461 | req.offset_id, req.offset_date, stop_at = self.dumper.get_resume( 462 | target_id 463 | ) 464 | if req.offset_id: 465 | __log__.info('Resuming at %s (%s)', 466 | req.offset_date, req.offset_id) 467 | 468 | # Check if we have access to the admin log 469 | # TODO Resume admin log? 470 | # Rather silly considering logs only last up to two days and 471 | # there isn't much information in them (due to their short life). 472 | if isinstance(target_in, types.InputPeerChannel): 473 | log_req = functions.channels.GetAdminLogRequest( 474 | target_in, q='', min_id=0, max_id=0, limit=1 475 | ) 476 | try: 477 | await self.client(log_req) 478 | log_req.limit = 100 479 | except ChatAdminRequiredError: 480 | log_req = None 481 | else: 482 | log_req = None 483 | 484 | chunks_left = self.dumper.max_chunks 485 | # This loop is for get history, although the admin log 486 | # is interlaced as well to dump both at the same time. 
487 | while self._running: 488 | start = time.time() 489 | history = await self.client(req) 490 | # Queue found entities so they can be dumped later 491 | self.enqueue_entities(itertools.chain( 492 | history.users, history.chats 493 | )) 494 | ent_bar.total = len(self._checked_entity_ids) 495 | 496 | # Dump the messages from this batch 497 | self._dump_messages(history.messages, target) 498 | 499 | # Determine whether to continue dumping or we're done 500 | count = len(history.messages) 501 | msg_bar.total = getattr(history, 'count', count) 502 | msg_bar.update(count) 503 | if history.messages: 504 | # We may reinsert some we already have (so found > total) 505 | found = min(found + len(history.messages), msg_bar.total) 506 | req.offset_id = min(m.id for m in history.messages) 507 | req.offset_date = min(m.date for m in history.messages) 508 | 509 | # Receiving fewer messages than the limit means we have 510 | # reached the end, so we need to exit. Next time we'll 511 | # start from offset 0 again so we can check for new messages. 512 | # 513 | # We dump forward (message ID going towards 0), so as soon 514 | # as the minimum message ID (now in offset ID) is less than 515 | # the highest ID ("closest" bound we need to reach), stop. 516 | if count < req.limit or req.offset_id <= stop_at: 517 | __log__.debug('Received fewer messages than the limit, done.') 518 | max_id = self.dumper.get_max_message_id(target_id) or 0 # can't have NULL 519 | self.dumper.save_resume(target_id, stop_at=max_id) 520 | break 521 | 522 | # Keep track of the last target ID (smallest one), 523 | # so we can resume from here in case of interruption. 524 | self.dumper.save_resume( 525 | target_id, msg=req.offset_id, msg_date=req.offset_date, 526 | stop_at=stop_at # We DO want to preserve stop_at.
527 | ) 528 | self.dumper.commit() 529 | 530 | chunks_left -= 1 # 0 means infinite, will reach -1 and never 0 531 | if chunks_left == 0: 532 | __log__.debug('Reached maximum amount of chunks, done.') 533 | break 534 | 535 | # Interlace with the admin log request if any 536 | if log_req: 537 | result = await self.client(log_req) 538 | self.enqueue_entities(itertools.chain( 539 | result.users, result.chats 540 | )) 541 | if result.events: 542 | log_req.max_id = self._dump_admin_log(result.events, 543 | target) 544 | else: 545 | log_req = None 546 | 547 | # We need to sleep for HISTORY_DELAY but we have already spent 548 | # some of it invoking (so subtract said delta from the delay). 549 | await asyncio.sleep( 550 | max(HISTORY_DELAY - (time.time() - start), 0), 551 | loop=self.loop 552 | ) 553 | 554 | # Message loop complete, wait for the queues to empty 555 | msg_bar.n = msg_bar.total 556 | msg_bar.close() 557 | self.dumper.commit() 558 | 559 | # This loop is specific to the admin log (to finish up) 560 | while log_req and self._running: 561 | start = time.time() 562 | result = await self.client(log_req) 563 | self.enqueue_entities(itertools.chain( 564 | result.users, result.chats 565 | )) 566 | if result.events: 567 | log_req.max_id = self._dump_admin_log(result.events, 568 | target) 569 | await asyncio.sleep(max( 570 | HISTORY_DELAY - (time.time() - start), 0), 571 | loop=self.loop 572 | ) 573 | else: 574 | log_req = None 575 | 576 | __log__.info( 577 | 'Done. 
Retrieving full information about %d missing entities.', 578 | self._user_queue.qsize() + self._chat_queue.qsize() 579 | ) 580 | await self._user_queue.join() 581 | await self._chat_queue.join() 582 | await self._media_queue.join() 583 | finally: 584 | self._running = False 585 | ent_bar.n = ent_bar.total 586 | ent_bar.close() 587 | med_bar.n = med_bar.total 588 | med_bar.close() 589 | # If the download was interrupted and there are users left in the 590 | # queue we want to save them into the database for the next run. 591 | entities = [] 592 | while not self._user_queue.empty(): 593 | entities.append(self._user_queue.get_nowait()) 594 | while not self._chat_queue.empty(): 595 | entities.append(self._chat_queue.get_nowait()) 596 | if entities: 597 | self.dumper.save_resume_entities(target_id, entities) 598 | 599 | # Do the same with the media queue 600 | media = [] 601 | while not self._media_queue.empty(): 602 | media.append(self._media_queue.get_nowait()) 603 | self.dumper.save_resume_media(media) 604 | 605 | if entities or media: 606 | self.dumper.commit() 607 | 608 | # Delete partially-downloaded files 609 | if (self._incomplete_download is not None 610 | and os.path.isfile(self._incomplete_download)): 611 | os.remove(self._incomplete_download) 612 | 613 | async def download_past_media(self, dumper, target_id): 614 | """ 615 | Downloads the past media that has already been dumped into the 616 | database but has not been downloaded for the given target ID yet. 617 | 618 | Media whose formatted filename points to an already-existing file 619 | will be *skipped* and not re-downloaded. 620 | """ 621 | # TODO Should this respect and download only allowed media? Or all?
622 | target_in = await self.client.get_input_entity(target_id) 623 | target = await self.client.get_entity(target_in) 624 | target_id = utils.get_peer_id(target) 625 | bar = tqdm.tqdm(unit='B', desc='media', unit_divisor=1000, 626 | unit_scale=True, bar_format=BAR_FORMAT, total=0, 627 | postfix={'chat': utils.get_display_name(target)}) 628 | 629 | msg_cursor = dumper.conn.cursor() 630 | msg_cursor.execute('SELECT ID, Date, FromID, MediaID FROM Message ' 631 | 'WHERE ContextID = ? AND MediaID IS NOT NULL', 632 | (target_id,)) 633 | 634 | msg_row = msg_cursor.fetchone() 635 | while msg_row: 636 | await self._download_media( 637 | media_id=msg_row[3], 638 | context_id=target_id, 639 | sender_id=msg_row[2], 640 | date=datetime.datetime.utcfromtimestamp(msg_row[1]), 641 | bar=bar 642 | ) 643 | msg_row = msg_cursor.fetchone() 644 | -------------------------------------------------------------------------------- /telegram_export/dumper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """A module for dumping export data into the database""" 3 | import json 4 | import logging 5 | import sqlite3 6 | import sys 7 | import time 8 | from base64 import b64encode 9 | from datetime import datetime 10 | from enum import Enum 11 | import os.path 12 | 13 | from telethon.tl import types 14 | from telethon.utils import get_peer_id, resolve_id, get_input_peer 15 | 16 | from . import utils 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | DB_VERSION = 1 # database version 21 | 22 | 23 | class InputFileType(Enum): 24 | """An enum to specify the type of an InputFile""" 25 | NORMAL = 0 26 | DOCUMENT = 1 27 | 28 | 29 | def sanitize_dict(dictionary): 30 | """ 31 | Sanitizes a dictionary, encoding all bytes as 32 | Base64 so that it can be serialized as JSON. 33 | 34 | Assumes that there are no containers with bytes inside, 35 | and that the dictionary doesn't contain self-references. 
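A runnable standalone sketch of the same transformation (`sanitize` here is a hypothetical stand-in mirroring this function, not part of the module):

```python
from base64 import b64encode
from datetime import datetime, timezone

def sanitize(d):
    # bytes -> Base64 str, datetime -> POSIX timestamp,
    # recursing into nested dicts and dicts inside lists.
    for k, v in d.items():
        if isinstance(v, bytes):
            d[k] = str(b64encode(v), encoding='ascii')
        elif isinstance(v, datetime):
            d[k] = v.timestamp()
        elif isinstance(v, dict):
            sanitize(v)
        elif isinstance(v, list):
            for item in v:
                if isinstance(item, dict):
                    sanitize(item)

data = {'hash': b'\x00\x01',
        'nested': {'when': datetime(1970, 1, 2, tzinfo=timezone.utc)}}
sanitize(data)
# data is now JSON-serializable
```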
36 | """ 37 | for k, v in dictionary.items(): 38 | if isinstance(v, bytes): 39 | dictionary[k] = str(b64encode(v), encoding='ascii') 40 | elif isinstance(v, datetime): 41 | dictionary[k] = v.timestamp() 42 | elif isinstance(v, dict): 43 | sanitize_dict(v) 44 | elif isinstance(v, list): 45 | for d in v: 46 | if isinstance(d, dict): 47 | sanitize_dict(d) 48 | 49 | 50 | class Dumper: 51 | """Class to interface with the database for exports""" 52 | 53 | def __init__(self, config): 54 | """ 55 | Initialise the dumper. `config` should be a dict-like 56 | object from the config file's "Dumper" section. 57 | """ 58 | self.config = config 59 | if 'DBFileName' in self.config: 60 | where = self.config["DBFileName"] 61 | if where != ':memory:': 62 | where = '{}.db'.format(os.path.join( 63 | self.config['OutputDirectory'], self.config['DBFileName'] 64 | )) 65 | self.conn = sqlite3.connect(where, check_same_thread=False) 66 | else: 67 | logger.error("A database filename is required!") 68 | exit() 69 | c = self.conn.cursor() 70 | 71 | self.chunk_size = max(int(config.get('ChunkSize', 100)), 1) 72 | self.max_chunks = max(int(config.get('MaxChunks', 0)), 0) 73 | self.invalidation_time = max(config.getint('InvalidationTime', 0), -1) 74 | 75 | self.dump_methods = ('message', 'user', 'message_service', 'channel', 76 | 'supergroup', 'chat', 'adminlog_event', 'media', 77 | 'participants_delta', 'forward') 78 | 79 | self._dump_callbacks = {method: set() for method in self.dump_methods} 80 | 81 | c.execute("SELECT name FROM sqlite_master " 82 | "WHERE type='table' AND name='Version'") 83 | 84 | exists = bool(c.fetchone()) 85 | if exists: 86 | # Tables already exist, check for the version 87 | c.execute("SELECT Version FROM Version") 88 | version = c.fetchone() 89 | if not version: 90 | # Sometimes there may be a table without values (see #55) 91 | c.execute("DROP TABLE IF EXISTS Version") 92 | exists = False 93 | elif version[0] != DB_VERSION: 94 | 
self._upgrade_database(old=version[0]) 95 | self.conn.commit() 96 | if not exists: 97 | # Tables don't exist, create new ones 98 | c.execute("CREATE TABLE Version (Version INTEGER)") 99 | c.execute("CREATE TABLE SelfInformation (UserID INTEGER)") 100 | c.execute("INSERT INTO Version VALUES (?)", (DB_VERSION,)) 101 | 102 | c.execute("CREATE TABLE Forward(" 103 | "ID INTEGER PRIMARY KEY AUTOINCREMENT," 104 | "OriginalDate INT NOT NULL," 105 | "FromID INT," # User or Channel ID 106 | "ChannelPost INT," 107 | "PostAuthor TEXT)") 108 | 109 | # For InputFileLocation: 110 | # local_id -> LocalID 111 | # volume_id -> VolumeID 112 | # secret -> Secret 113 | # 114 | # For InputDocumentFileLocation: 115 | # id -> LocalID 116 | # access_hash -> Secret 117 | # version -> VolumeID 118 | c.execute("CREATE TABLE Media(" 119 | "ID INTEGER PRIMARY KEY AUTOINCREMENT," 120 | # Basic useful information, if available 121 | "Name TEXT," 122 | "MimeType TEXT," 123 | "Size INT," 124 | "ThumbnailID INT," 125 | "Type TEXT," 126 | # Fields required to download the file 127 | "LocalID INT," 128 | "VolumeID INT," 129 | "Secret INT," 130 | # Whatever else as JSON here 131 | "Extra TEXT," 132 | "FOREIGN KEY (ThumbnailID) REFERENCES Media(ID))") 133 | 134 | c.execute("CREATE TABLE User(" 135 | "ID INT NOT NULL," 136 | "DateUpdated INT NOT NULL," 137 | "FirstName TEXT NOT NULL," 138 | "LastName TEXT," 139 | "Username TEXT," 140 | "Phone TEXT," 141 | "Bio TEXT," 142 | "Bot INTEGER," 143 | "CommonChatsCount INT NOT NULL," 144 | "PictureID INT," 145 | "FOREIGN KEY (PictureID) REFERENCES Media(ID)," 146 | "PRIMARY KEY (ID, DateUpdated))") 147 | 148 | c.execute("CREATE TABLE Channel(" 149 | "ID INT NOT NULL," 150 | "DateUpdated INT NOT NULL," 151 | "About TEXT," 152 | "Title TEXT NOT NULL," 153 | "Username TEXT," 154 | "PictureID INT," 155 | "PinMessageID INT," 156 | "FOREIGN KEY (PictureID) REFERENCES Media(ID)," 157 | "PRIMARY KEY (ID, DateUpdated))") 158 | 159 | c.execute("CREATE TABLE Supergroup(" 
160 | "ID INT NOT NULL," 161 | "DateUpdated INT NOT NULL," 162 | "About TEXT," 163 | "Title TEXT NOT NULL," 164 | "Username TEXT," 165 | "PictureID INT," 166 | "PinMessageID INT," 167 | "FOREIGN KEY (PictureID) REFERENCES Media(ID)," 168 | "PRIMARY KEY (ID, DateUpdated))") 169 | 170 | c.execute("CREATE TABLE Chat(" 171 | "ID INT NOT NULL," 172 | "DateUpdated INT NOT NULL," 173 | "Title TEXT NOT NULL," 174 | "MigratedToID INT," 175 | "PictureID INT," 176 | "FOREIGN KEY (PictureID) REFERENCES Media(ID)," 177 | "PRIMARY KEY (ID, DateUpdated))") 178 | 179 | c.execute("CREATE TABLE ChatParticipants(" 180 | "ContextID INT NOT NULL," 181 | "DateUpdated INT NOT NULL," 182 | "Added TEXT NOT NULL," 183 | "Removed TEXT NOT NULL," 184 | "PRIMARY KEY (ContextID, DateUpdated))") 185 | 186 | c.execute("CREATE TABLE Message(" 187 | "ID INT NOT NULL," 188 | "ContextID INT NOT NULL," 189 | "Date INT NOT NULL," 190 | "FromID INT," 191 | "Message TEXT," 192 | "ReplyMessageID INT," 193 | "ForwardID INT," 194 | "PostAuthor TEXT," 195 | "ViewCount INT," 196 | "MediaID INT," 197 | "Formatting TEXT," # e.g. bold, italic, etc. 198 | "ServiceAction TEXT," # friendly name of action if it is 199 | # a MessageService 200 | "FOREIGN KEY (ForwardID) REFERENCES Forward(ID)," 201 | "FOREIGN KEY (MediaID) REFERENCES Media(ID)," 202 | "PRIMARY KEY (ID, ContextID))") 203 | 204 | c.execute("CREATE TABLE AdminLog(" 205 | "ID INT NOT NULL," 206 | "ContextID INT NOT NULL," 207 | "Date INT NOT NULL," 208 | "UserID INT," 209 | "MediaID1 INT," # e.g. new photo 210 | "MediaID2 INT," # e.g. 
old photo 211 | "Action TEXT," # Friendly name for the action 212 | "Data TEXT," # JSON data of the entire action 213 | "FOREIGN KEY (MediaID1) REFERENCES Media(ID)," 214 | "FOREIGN KEY (MediaID2) REFERENCES Media(ID)," 215 | "PRIMARY KEY (ID, ContextID))") 216 | 217 | c.execute("CREATE TABLE Resume(" 218 | "ContextID INT NOT NULL," 219 | "ID INT NOT NULL," 220 | "Date INT NOT NULL," 221 | "StopAt INT NOT NULL," 222 | "PRIMARY KEY (ContextID))") 223 | 224 | c.execute("CREATE TABLE ResumeEntity(" 225 | "ContextID INT NOT NULL," 226 | "ID INT NOT NULL," 227 | "AccessHash INT," 228 | "PRIMARY KEY (ContextID, ID))") 229 | 230 | c.execute("CREATE TABLE ResumeMedia(" 231 | "MediaID INT NOT NULL," 232 | "ContextID INT NOT NULL," 233 | "SenderID INT," 234 | "Date INT," 235 | "PRIMARY KEY (MediaID))") 236 | self.conn.commit() 237 | 238 | def _upgrade_database(self, old): 239 | """ 240 | This method knows how to migrate from old -> DB_VERSION. 241 | 242 | Currently it performs no operation, because this is the 243 | first version of the tables; in the future it should alter 244 | tables or otherwise migrate the data as the schema changes. 245 | """ 246 | 247 | # TODO make these callback functions less repetitive. 248 | # For the most friendly API, we should have different methods for each 249 | # kind of callback, but there could be a way to make this cleaner. 250 | # Perhaps a dictionary mapping 'message' to the message callback set. 251 | 252 | def add_callback(self, dump_method, callback): 253 | """ 254 | Add the callback function to the set of callbacks for the given 255 | dump method. dump_method should be a string, and callback should be a 256 | function which takes one argument - a tuple which will be dumped into 257 | the database. The list of valid dump methods is dumper.dump_methods. 258 | If the dumper does not dump a row due to the invalidation_time, the 259 | callback will still be called.
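A runnable sketch of the registry logic using a hypothetical stand-in (no real Dumper or database involved; 'message' is one of the real dump_methods, everything else is invented):

```python
# Minimal stand-in mirroring the callback registry described above.
dump_methods = ('message', 'user')
callbacks = {method: set() for method in dump_methods}

def add_callback(dump_method, callback):
    if dump_method not in dump_methods:
        raise ValueError('Cannot attach callback to method '
                         '{}'.format(dump_method))
    callbacks[dump_method].add(callback)

seen = []
add_callback('message', seen.append)
row = (1, 1234, 0.0, None, 'hello')  # shape loosely follows a Message row
for cb in callbacks['message']:
    cb(row)  # seen now holds the dumped row
```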
260 | """ 261 | if dump_method not in self.dump_methods: 262 | raise ValueError("Cannot attach callback to method {}. Available " 263 | "methods are {}".format(dump_method, self.dump_methods)) 264 | 265 | self._dump_callbacks[dump_method].add(callback) 266 | 267 | def remove_callback(self, dump_method, callback): 268 | """ 269 | Remove the callback function from the set of callbacks for the given 270 | dump method. Will raise KeyError if the callback is not in the set of 271 | callbacks for that method 272 | """ 273 | if dump_method not in self.dump_methods: 274 | raise ValueError("Cannot remove callback from method {}. Available " 275 | "methods are {}".format(dump_method, self.dump_methods)) 276 | 277 | self._dump_callbacks[dump_method].remove(callback) 278 | 279 | def check_self_user(self, self_id): 280 | """ 281 | Checks the self ID. If there is a stored ID and it doesn't match the 282 | given one, an error message is printed and the application exits. 283 | """ 284 | cur = self.conn.cursor() 285 | cur.execute("SELECT UserID FROM SelfInformation") 286 | result = cur.fetchone() 287 | if result: 288 | if result[0] != self_id: 289 | print('This export database belongs to another user!', 290 | file=sys.stderr) 291 | exit(1) 292 | else: 293 | cur.execute("INSERT INTO SelfInformation VALUES (?)", (self_id,)) 294 | self.commit() 295 | 296 | def dump_message(self, message, context_id, forward_id, media_id): 297 | """ 298 | Dump a Message into the Message table. 299 | 300 | Params: 301 | Message to dump, 302 | ID of the chat dumping, 303 | ID of Forward in the DB (or None), 304 | ID of message Media in the DB (or None) 305 | 306 | Returns: 307 | Inserted row ID. 
308 | """ 309 | if not message.message and message.media: 310 | message.message = getattr(message.media, 'caption', '') 311 | 312 | row = (message.id, 313 | context_id, 314 | message.date.timestamp(), 315 | message.from_id, 316 | message.message, 317 | message.reply_to_msg_id, 318 | forward_id, 319 | message.post_author, 320 | message.views, 321 | media_id, 322 | utils.encode_msg_entities(message.entities), 323 | None) # No MessageAction 324 | 325 | for callback in self._dump_callbacks['message']: 326 | callback(row) 327 | 328 | return self._insert('Message', row) 329 | 330 | def dump_message_service(self, message, context_id, media_id): 331 | """Similar to self.dump_message, but for MessageAction's.""" 332 | name = utils.action_to_name(message.action) 333 | if not name: 334 | return 335 | 336 | extra = message.action.to_dict() 337 | del extra['_'] # We don't need to store the type, already have name 338 | sanitize_dict(extra) 339 | extra = json.dumps(extra) 340 | 341 | row = (message.id, 342 | context_id, 343 | message.date.timestamp(), 344 | message.from_id, 345 | extra, # Message field contains the information 346 | message.reply_to_msg_id, 347 | None, # No forward 348 | None, # No author 349 | None, # No views 350 | media_id, # Might have e.g. 
a new chat Photo 351 | None, # No entities 352 | name) 353 | 354 | for callback in self._dump_callbacks['message_service']: 355 | callback(row) 356 | 357 | return self._insert('Message', row) 358 | 359 | def dump_admin_log_event(self, event, context_id, media_id1, media_id2): 360 | """Similar to self.dump_message_service but for channel actions.""" 361 | name = utils.action_to_name(event.action) 362 | if not name: 363 | return 364 | 365 | extra = event.action.to_dict() 366 | del extra['_'] # We don't need to store the type, already have name 367 | sanitize_dict(extra) 368 | extra = json.dumps(extra) 369 | 370 | row = (event.id, 371 | context_id, 372 | event.date.timestamp(), 373 | event.user_id, 374 | media_id1, 375 | media_id2, 376 | name, 377 | extra) 378 | 379 | for callback in self._dump_callbacks['adminlog_event']: 380 | callback(row) 381 | 382 | return self._insert('AdminLog', row) 383 | 384 | def dump_user(self, user_full, photo_id, timestamp=None): 385 | """Dump a UserFull into the User table. 386 | Params: UserFull to dump, MediaID of the profile photo in the DB. 387 | Returns: the inserted row ID, or False if not added.""" 388 | # Rationale for UserFull rather than User is to get bio 389 | values = (user_full.user.id, 390 | timestamp or round(time.time()), 391 | user_full.user.first_name, 392 | user_full.user.last_name, 393 | user_full.user.username, 394 | user_full.user.phone, 395 | user_full.about, 396 | user_full.user.bot, 397 | user_full.common_chats_count, 398 | photo_id) 399 | 400 | for callback in self._dump_callbacks['user']: 401 | callback(values) 402 | 403 | return self._insert_if_valid_date('User', values, date_column=1, 404 | where=('ID', user_full.user.id)) 405 | 406 | def dump_channel(self, channel_full, channel, photo_id, timestamp=None): 407 | """Dump a Channel into the Channel table.
408 | Params: ChannelFull, Channel to dump, MediaID 409 | of the profile photo in the DB 410 | Returns -""" 411 | # Need to get the full object too for 'about' info 412 | values = (get_peer_id(channel), 413 | timestamp or round(time.time()), 414 | channel_full.about, 415 | channel.title, 416 | channel.username, 417 | photo_id, 418 | channel_full.pinned_msg_id) 419 | 420 | for callback in self._dump_callbacks['channel']: 421 | callback(values) 422 | 423 | return self._insert_if_valid_date('Channel', values, date_column=1, 424 | where=('ID', get_peer_id(channel))) 425 | 426 | def dump_supergroup(self, supergroup_full, supergroup, photo_id, 427 | timestamp=None): 428 | """Dump a Supergroup into the Supergroup table 429 | Params: ChannelFull, Channel to dump, MediaID 430 | of the profile photo in the DB. 431 | Returns -""" 432 | # Need to get the full object too for 'about' info 433 | values = (get_peer_id(supergroup), 434 | timestamp or round(time.time()), 435 | getattr(supergroup_full, 'about', None) or '', 436 | supergroup.title, 437 | supergroup.username, 438 | photo_id, 439 | supergroup_full.pinned_msg_id) 440 | 441 | for callback in self._dump_callbacks['supergroup']: 442 | callback(values) 443 | 444 | return self._insert_if_valid_date('Supergroup', values, date_column=1, 445 | where=('ID', get_peer_id(supergroup))) 446 | 447 | def dump_chat(self, chat, photo_id, timestamp=None): 448 | """Dump a Chat into the Chat table 449 | Params: Chat to dump, MediaID of the profile photo in the DB 450 | Returns -""" 451 | if isinstance(chat.migrated_to, types.InputChannel): 452 | migrated_to_id = chat.migrated_to.channel_id 453 | else: 454 | migrated_to_id = None 455 | 456 | values = (get_peer_id(chat), 457 | timestamp or round(time.time()), 458 | chat.title, 459 | migrated_to_id, 460 | photo_id) 461 | 462 | for callback in self._dump_callbacks['chat']: 463 | callback(values) 464 | 465 | return self._insert_if_valid_date('Chat', values, date_column=1, 466 | where=('ID', 
get_peer_id(chat))) 467 | 468 | def dump_participants_delta(self, context_id, ids): 469 | """ 470 | Dumps the delta between the last dump of IDs for the given context ID 471 | and the current input user IDs. 472 | """ 473 | ids = set(ids) 474 | c = self.conn.cursor() 475 | c.execute('SELECT Added, Removed FROM ChatParticipants ' 476 | 'WHERE ContextID = ? ORDER BY DateUpdated ASC', 477 | (context_id,)) 478 | 479 | row = c.fetchone() 480 | if not row: 481 | added = ids 482 | removed = set() 483 | else: 484 | # Build the last known list of participants from the saved deltas 485 | last_ids = set(int(x) for x in row[0].split(',')) 486 | row = c.fetchone() 487 | while row: 488 | added = set(int(x) for x in row[0].split(',') if x != '') 489 | removed = set(int(x) for x in row[1].split(',') if x != '') 490 | last_ids = (last_ids | added) - removed 491 | row = c.fetchone() 492 | added = ids - last_ids 493 | removed = last_ids - ids 494 | 495 | row = (context_id, 496 | round(time.time()), 497 | ','.join(str(x) for x in added), 498 | ','.join(str(x) for x in removed)) 499 | 500 | for callback in self._dump_callbacks['participants_delta']: 501 | callback(row) 502 | 503 | c.execute("INSERT INTO ChatParticipants VALUES (?, ?, ?, ?)", row) 504 | return added, removed 505 | 506 | def dump_media(self, media, media_type=None): 507 | """Dump a MessageMedia into the Media table 508 | Params: media Telethon object 509 | Returns: ID of inserted row""" 510 | if not media: 511 | return 512 | 513 | row = {x: None for x in ( 514 | 'name', 'mime_type', 'size', 'thumbnail_id', 515 | 'local_id', 'volume_id', 'secret' 516 | )} 517 | row['type'] = media_type 518 | row['extra'] = media.to_dict() 519 | sanitize_dict(row['extra']) 520 | row['extra'] = json.dumps(row['extra']) 521 | 522 | if isinstance(media, types.MessageMediaContact): 523 | row['type'] = 'contact' 524 | row['name'] = '{} {}'.format(media.first_name, media.last_name) 525 | row['local_id'] = media.user_id 526 | try: 527 | 
row['secret'] = int(media.phone_number or '0') 528 | except ValueError: 529 | row['secret'] = 0 530 | 531 | elif isinstance(media, types.MessageMediaDocument): 532 | row['type'] = utils.get_media_type(media) 533 | doc = media.document 534 | if isinstance(doc, types.Document): 535 | row['mime_type'] = doc.mime_type 536 | row['size'] = doc.size 537 | row['thumbnail_id'] = self.dump_media(doc.thumb) 538 | row['local_id'] = doc.id 539 | row['volume_id'] = doc.version 540 | row['secret'] = doc.access_hash 541 | for attr in doc.attributes: 542 | if isinstance(attr, types.DocumentAttributeFilename): 543 | row['name'] = attr.file_name 544 | 545 | elif isinstance(media, types.MessageMediaEmpty): 546 | row['type'] = 'empty' 547 | return 548 | 549 | elif isinstance(media, types.MessageMediaGame): 550 | row['type'] = 'game' 551 | game = media.game 552 | if isinstance(game, types.Game): 553 | row['name'] = game.short_name 554 | row['thumbnail_id'] = self.dump_media(game.photo) 555 | row['local_id'] = game.id 556 | row['secret'] = game.access_hash 557 | 558 | elif isinstance(media, types.MessageMediaGeo): 559 | row['type'] = 'geo' 560 | geo = media.geo 561 | if isinstance(geo, types.GeoPoint): 562 | row['name'] = '({}, {})'.format(repr(geo.lat), repr(geo.long)) 563 | 564 | elif isinstance(media, types.MessageMediaGeoLive): 565 | row['type'] = 'geolive' 566 | geo = media.geo 567 | if isinstance(geo, types.GeoPoint): 568 | row['name'] = '({}, {})'.format(repr(geo.lat), repr(geo.long)) 569 | 570 | elif isinstance(media, types.MessageMediaInvoice): 571 | row['type'] = 'invoice' 572 | row['name'] = media.title 573 | row['thumbnail_id'] = self.dump_media(media.photo) 574 | 575 | elif isinstance(media, types.MessageMediaPhoto): 576 | row['type'] = 'photo' 577 | row['mime_type'] = 'image/jpeg' 578 | media = media.photo 579 | 580 | elif isinstance(media, types.MessageMediaUnsupported): 581 | row['type'] = 'unsupported' 582 | return 583 | 584 | elif isinstance(media, 
types.MessageMediaVenue): 585 | row['type'] = 'venue' 586 | row['name'] = '{} - {} ({}, {} {})'.format( 587 | media.title, media.address, 588 | media.provider, media.venue_id, media.venue_type 589 | ) 590 | geo = media.geo 591 | if isinstance(geo, types.GeoPoint): 592 | row['name'] += ' at ({}, {})'.format( 593 | repr(geo.lat), repr(geo.long) 594 | ) 595 | 596 | elif isinstance(media, types.MessageMediaWebPage): 597 | row['type'] = 'webpage' 598 | web = media.webpage 599 | if isinstance(web, types.WebPage): 600 | row['name'] = web.title 601 | row['thumbnail_id'] = self.dump_media(web.photo, 'thumbnail') 602 | row['local_id'] = web.id 603 | row['secret'] = web.hash 604 | 605 | if isinstance(media, types.Photo): 606 | # Extra fallback cases for common parts 607 | row['type'] = 'photo' 608 | row['mime_type'] = 'image/jpeg' 609 | row['name'] = str(media.date) 610 | sizes = [x for x in media.sizes 611 | if isinstance(x, (types.PhotoSize, types.PhotoCachedSize))] 612 | if sizes: 613 | small = min(sizes, key=lambda s: s.w * s.h) 614 | large = max(sizes, key=lambda s: s.w * s.h) 615 | media = large 616 | if small != large: 617 | row['thumbnail_id'] = self.dump_media(small, 'thumbnail') 618 | 619 | if isinstance(media, (types.PhotoSize, 620 | types.PhotoCachedSize, 621 | types.PhotoSizeEmpty)): 622 | row['type'] = 'photo' 623 | row['mime_type'] = 'image/jpeg' 624 | if isinstance(media, types.PhotoSizeEmpty): 625 | row['size'] = 0 626 | else: 627 | if isinstance(media, types.PhotoSize): 628 | row['size'] = media.size 629 | elif isinstance(media, types.PhotoCachedSize): 630 | row['size'] = len(media.bytes) 631 | if isinstance(media.location, types.FileLocation): 632 | media = media.location 633 | 634 | if isinstance(media, (types.UserProfilePhoto, types.ChatPhoto)): 635 | row['type'] = 'photo' 636 | row['mime_type'] = 'image/jpeg' 637 | row['thumbnail_id'] = self.dump_media( 638 | media.photo_small, 'thumbnail' 639 | ) 640 | media = media.photo_big 641 | 642 | if 
isinstance(media, types.FileLocation): 643 | row['local_id'] = media.local_id 644 | row['volume_id'] = media.volume_id 645 | row['secret'] = media.secret 646 | 647 | if row['type']: 648 | # We'll say two files are the same if they point to the same 649 | # downloadable content (through local_id/volume_id/secret). 650 | 651 | for callback in self._dump_callbacks['media']: 652 | callback(row) 653 | 654 | c = self.conn.cursor() 655 | c.execute('SELECT ID FROM Media WHERE LocalID = ? ' 656 | 'AND VolumeID = ? AND Secret = ?', 657 | (row['local_id'], row['volume_id'], row['secret'])) 658 | existing_row = c.fetchone() 659 | if existing_row: 660 | return existing_row[0] 661 | 662 | return self._insert('Media', ( 663 | None, 664 | row['name'], row['mime_type'], row['size'], 665 | row['thumbnail_id'], row['type'], 666 | row['local_id'], row['volume_id'], row['secret'], 667 | row['extra'] 668 | )) 669 | 670 | def dump_forward(self, forward): 671 | """ 672 | Dump a message forward relationship into the Forward table. 673 | 674 | Params: MessageFwdHeader Telethon object 675 | Returns: ID of inserted row""" 676 | if not forward: 677 | return None 678 | 679 | row = (None, # Database will handle this 680 | forward.date.timestamp(), 681 | forward.from_id, 682 | forward.channel_post, 683 | forward.post_author) 684 | 685 | for callback in self._dump_callbacks['forward']: 686 | callback(row) 687 | 688 | return self._insert('Forward', row) 689 | 690 | def get_max_message_id(self, context_id): 691 | """ 692 | Returns the largest saved message ID for the given 693 | context_id, or 0 if no messages have been saved. 
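An equivalent query against a minimal in-memory stand-in table (only the two columns the query touches; a sketch, not the real schema):

```python
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE Message (ID INT, ContextID INT)')
conn.executemany('INSERT INTO Message VALUES (?, ?)',
                 [(10, 1), (42, 1), (7, 2)])
row = conn.execute('SELECT MAX(ID) FROM Message WHERE ContextID = ?',
                   (1,)).fetchone()
# An aggregate always yields exactly one row, so with no matching
# messages the result is (None,) rather than no row at all.
empty = conn.execute('SELECT MAX(ID) FROM Message WHERE ContextID = ?',
                     (99,)).fetchone()
```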
694 |         """
695 |         row = self.conn.execute("SELECT MAX(ID) FROM Message WHERE "
696 |                                 "ContextID = ?", (context_id,)).fetchone()
697 |         return row[0] if row and row[0] is not None else 0
698 | 
699 |     def get_message_count(self, context_id):
700 |         """Gets the message count for the given context"""
701 |         tuple_ = self.conn.execute(
702 |             "SELECT COUNT(*) FROM Message WHERE ContextID = ?", (context_id,)
703 |         ).fetchone()
704 |         return tuple_[0] if tuple_ else 0
705 | 
706 |     def get_resume(self, context_id):
707 |         """
708 |         For the given context ID, return a tuple consisting of the offset
709 |         ID and offset date from which to continue, as well as at which ID
710 |         to stop.
711 |         """
712 |         c = self.conn.execute("SELECT ID, Date, StopAt FROM Resume WHERE "
713 |                               "ContextID = ?", (context_id,))
714 |         return c.fetchone() or (0, 0, 0)
715 | 
716 |     def save_resume(self, context_id, msg=0, msg_date=0, stop_at=0):
717 |         """
718 |         Saves the information required to resume a download later.
719 |         """
720 |         if isinstance(msg_date, datetime):
721 |             msg_date = int(msg_date.timestamp())
722 | 
723 |         return self._insert('Resume', (context_id, msg, msg_date, stop_at))
724 | 
725 |     def iter_resume_entities(self, context_id):
726 |         """
727 |         Returns an iterator over the entities that need resuming for the
728 |         given context_id. Note that the entities are *removed* once the
729 |         iterator is consumed completely.
730 | """ 731 | c = self.conn.execute("SELECT ID, AccessHash FROM ResumeEntity " 732 | "WHERE ContextID = ?", (context_id,)) 733 | row = c.fetchone() 734 | while row: 735 | kind = resolve_id(row[0])[1] 736 | if kind == types.PeerUser: 737 | yield types.InputPeerUser(row[0], row[1]) 738 | elif kind == types.PeerChat: 739 | yield types.InputPeerChat(row[0]) 740 | elif kind == types.PeerChannel: 741 | yield types.InputPeerChannel(row[0], row[1]) 742 | row = c.fetchone() 743 | 744 | c.execute("DELETE FROM ResumeEntity WHERE ContextID = ?", 745 | (context_id,)) 746 | 747 | def save_resume_entities(self, context_id, entities): 748 | """ 749 | Saves the given entities for resuming at a later point. 750 | """ 751 | rows = [] 752 | for ent in entities: 753 | ent = get_input_peer(ent) 754 | if isinstance(ent, types.InputPeerUser): 755 | rows.append((context_id, ent.user_id, ent.access_hash)) 756 | elif isinstance(ent, types.InputPeerChat): 757 | rows.append((context_id, ent.chat_id, None)) 758 | elif isinstance(ent, types.InputPeerChannel): 759 | rows.append((context_id, ent.channel_id, ent.access_hash)) 760 | c = self.conn.cursor() 761 | c.executemany("INSERT OR REPLACE INTO ResumeEntity " 762 | "VALUES (?,?,?)", rows) 763 | 764 | def iter_resume_media(self, context_id): 765 | """ 766 | Returns an iterator over the media tuples that need resuming for 767 | the given context_id. Note that the media rows are *removed* once 768 | the iterator is consumed completely. 
769 | """ 770 | c = self.conn.execute( 771 | "SELECT MediaID, SenderID, Date " 772 | "FROM ResumeMedia WHERE ContextID = ?", (context_id,) 773 | ) 774 | row = c.fetchone() 775 | while row: 776 | media_id, sender_id, date = row 777 | yield media_id, sender_id, datetime.utcfromtimestamp(date) 778 | row = c.fetchone() 779 | 780 | c.execute("DELETE FROM ResumeMedia WHERE ContextID = ?", 781 | (context_id,)) 782 | 783 | def save_resume_media(self, media_tuples): 784 | """ 785 | Saves the given media tuples for resuming at a later point. 786 | 787 | The tuples should consist of four elements, these being 788 | ``(media_id, context_id, sender_id, date)``. 789 | """ 790 | self.conn.executemany("INSERT OR REPLACE INTO ResumeMedia " 791 | "VALUES (?,?,?,?)", media_tuples) 792 | 793 | def _insert_if_valid_date(self, into, values, date_column, where): 794 | """ 795 | Helper method to self._insert(into, values) after checking that the 796 | given values are different than the latest dump or that the delta 797 | between the current date and the existing column date_column is 798 | bigger than the invalidation time. `where` is used to get the last 799 | dumped item to check for invalidation time. 800 | 801 | As an example, ("ID", 4) -> WHERE ID = ?, 4 802 | """ 803 | last = self.conn.execute( 804 | 'SELECT * FROM {} WHERE {} = ? ORDER BY DateUpdated DESC' 805 | .format(into, where[0]), (where[1],) 806 | ).fetchone() 807 | 808 | if last: 809 | delta = values[date_column] - last[date_column] 810 | 811 | # Note sqlite stores True as 1 and False 812 | # as 0 but this is probably ok. 
813 |             if len(values) != len(last):
814 |                 raise TypeError(
815 |                     "values has a different number of columns than the table"
816 |                 )
817 |             rows_same = True
818 |             for i, val in enumerate(values):
819 |                 if i != date_column and val != last[i]:
820 |                     rows_same = False
821 | 
822 |             if delta < self.invalidation_time and rows_same:
823 |                 return False
824 |         return self._insert(into, values)
825 | 
826 |     def _insert(self, into, values):
827 |         """
828 |         Helper method to insert or replace the
829 |         given tuple of values into the given table.
830 |         """
831 |         try:
832 |             fmt = ','.join('?' * len(values))
833 |             c = self.conn.execute("INSERT OR REPLACE INTO {} VALUES ({})"
834 |                                   .format(into, fmt), values)
835 |             return c.lastrowid
836 |         except sqlite3.IntegrityError as error:
837 |             self.conn.rollback()
838 |             logger.error("Integrity error: %s", str(error))
839 |             raise
840 | 
841 |     def commit(self):
842 |         """
843 |         Commits the changes made to the database to persist on disk.
844 |         """
845 |         self.conn.commit()
846 | 
--------------------------------------------------------------------------------
/telegram_export/exporter.py:
--------------------------------------------------------------------------------
1 | """A class to iterate through dialogs and dump them, or save past media"""
2 | 
3 | import logging
4 | import re
5 | 
6 | from async_generator import yield_, async_generator
7 | from telethon import utils
8 | 
9 | from .downloader import Downloader
10 | 
11 | 
12 | @async_generator
13 | async def entities_from_str(method, string):
14 |     """Helper function to load entities from the config file"""
15 |     for who in string.split(','):
16 |         if not who.strip():
17 |             continue
18 |         who = who.split(':', 1)[0].strip()  # Ignore anything after ':'
19 |         if re.match(r'-?\d+$', who):  # A numeric (possibly marked) ID
20 |             who = int(who)
21 |         await yield_(await method(who))
22 | 
23 | 
24 | @async_generator
25 | async def get_entities_iter(mode, in_list, client):
26 |     """
27 |     Get a generator of entities to act on given a mode ('blacklist',
'whitelist') and an input from that mode. If whitelist, generator 29 | will be asynchronous. 30 | """ 31 | # TODO change None to empty blacklist? 32 | mode = mode.lower() 33 | if mode == 'whitelist': 34 | assert client is not None 35 | async for ent in entities_from_str(client.get_input_entity, in_list): 36 | await yield_(ent) 37 | elif mode == 'blacklist': 38 | assert client is not None 39 | avoid = set() 40 | async for eid in entities_from_str(client.get_peer_id, in_list): 41 | avoid.add(eid) 42 | 43 | # TODO Should this get_dialogs call be cached? How? 44 | async for dialog in client.iter_dialogs(): 45 | if dialog.id not in avoid: 46 | await yield_(dialog.input_entity) 47 | 48 | 49 | class Exporter: 50 | """A class to iterate through dialogs and dump them, or save past media""" 51 | def __init__(self, client, config, dumper, loop): 52 | self.client = client 53 | self.dumper = dumper 54 | self.downloader = Downloader(client, config['Dumper'], dumper, loop) 55 | self.logger = logging.getLogger("exporter") 56 | 57 | async def close(self): 58 | """Gracefully close the exporter""" 59 | # Downloader handles its own graceful exit 60 | self.logger.info("Closing exporter") 61 | await self.client.disconnect() 62 | self.dumper.conn.close() 63 | 64 | async def start(self): 65 | """Perform a dump of the dialogs we've been told to act on""" 66 | self.logger.info("Saving to %s", self.dumper.config['OutputDirectory']) 67 | self.dumper.check_self_user((await self.client.get_me(input_peer=True)).user_id) 68 | if 'Whitelist' in self.dumper.config: 69 | # Only whitelist, don't even get the dialogs 70 | async for entity in get_entities_iter('whitelist', 71 | self.dumper.config['Whitelist'], 72 | self.client): 73 | await self.downloader.start(entity) 74 | elif 'Blacklist' in self.dumper.config: 75 | # May be blacklist, so save the IDs on who to avoid 76 | async for entity in get_entities_iter('blacklist', 77 | self.dumper.config['Blacklist'], 78 | self.client): 79 | await 
self.downloader.start(entity)
80 |         else:
81 |             # Neither blacklist nor whitelist - get all
82 |             for dialog in await self.client.get_dialogs(limit=None):
83 |                 await self.downloader.start(dialog.entity)
84 | 
85 |     async def download_past_media(self):
86 |         """
87 |         Download past media (media we saw but didn't download before) of the
88 |         dialogs we've been told to act on
89 |         """
90 |         self.logger.info("Saving to %s", self.dumper.config['OutputDirectory'])
91 |         self.dumper.check_self_user((await self.client.get_me(input_peer=True)).user_id)
92 | 
93 |         if 'Whitelist' in self.dumper.config:
94 |             # Only whitelist, don't even get the dialogs
95 |             async for entity in get_entities_iter('whitelist',
96 |                                                   self.dumper.config['Whitelist'],
97 |                                                   self.client):
98 |                 await self.downloader.download_past_media(self.dumper, entity)
99 |         elif 'Blacklist' in self.dumper.config:
100 |             # May be blacklist, so save the IDs on who to avoid
101 |             async for entity in get_entities_iter('blacklist',
102 |                                                   self.dumper.config['Blacklist'],
103 |                                                   self.client):
104 |                 await self.downloader.download_past_media(self.dumper, entity)
105 |         else:
106 |             # Neither blacklist nor whitelist - get all
107 |             for dialog in await self.client.get_dialogs(limit=None):
108 |                 await self.downloader.download_past_media(self.dumper, dialog.entity)
109 | 
--------------------------------------------------------------------------------
/telegram_export/formatters/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Formatters to take exported database data and display it in a variety of formats.
3 | """
4 | from .baseformatter import BaseFormatter
5 | from .textformatter import TextFormatter
6 | from .htmlformatter import HtmlFormatter
7 | from .nlpformatter import NlpFormatter
8 | 
9 | 
10 | # Map the name of each available formatter to its class
11 | NAME_TO_FORMATTER = {}
12 | 
13 | for cls in list(locals().values()):
14 |     if (isinstance(cls, type)
15 |             and issubclass(cls, BaseFormatter) and cls != BaseFormatter):
16 |         NAME_TO_FORMATTER[cls.name()] = cls
17 | 
--------------------------------------------------------------------------------
/telegram_export/formatters/baseformatter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Utility to extract data from a telegram-export database"""
3 | import datetime
4 | import math
5 | import sqlite3
6 | import sys
7 | from pathlib import Path
8 | from collections import namedtuple
9 | from abc import abstractmethod
10 | from io import TextIOWrapper
11 | 
12 | import os
13 | from telethon import utils
14 | from telethon.tl import types
15 | 
16 | Message = namedtuple('Message', (
17 |     'id', 'context_id', 'date', 'from_id', 'text', 'reply_message_id',
18 |     'forward_id', 'post_author', 'view_count', 'media_id', 'formatting', 'out',
19 |     'service_action', 'reply_message',  # An attribute that may be None if
20 |     # there was no reply, a Message namedtuple if there was a reply, or () if
21 |     # there was a reply but we don't have it in the database.
22 |     'context',  # A User, Channel, Supergroup, or Chat
23 |     'from_user',  # A User or None if a channel message
24 | ))
25 | 
26 | User = namedtuple('User', (
27 |     'id', 'date_updated', 'first_name', 'last_name', 'username', 'phone',
28 |     'bio', 'bot', 'common_chats_count', 'picture_id'
29 | ))
30 | 
31 | Channel = namedtuple('Channel', (
32 |     'id', 'date_updated', 'about', 'title', 'username', 'picture_id',
33 |     'pin_message_id'
34 | ))
35 | 
36 | Supergroup = namedtuple('Supergroup', (
37 |     'id', 'date_updated', 'about', 'title', 'username', 'picture_id',
38 |     'pin_message_id'
39 | ))
40 | 
41 | Chat = namedtuple('Chat', (
42 |     'id', 'date_updated', 'title', 'migrated_to_id', 'picture_id'
43 | ))
44 | 
45 | Media = namedtuple('Media', (
46 |     'id', 'name', 'mime_type', 'size', 'thumbnail_id', 'type', 'local_id',
47 |     'volume_id', 'secret', 'extra'
48 | ))
49 | 
50 | 
51 | class BaseFormatter:
52 |     """
53 |     A class to extract data from a given telegram-export database in the form
54 |     of named tuples.
55 |     """
56 |     def __init__(self, db):
57 |         if isinstance(db, str):
58 |             self.dbconn = sqlite3.connect('file:{}?mode=ro'.format(db), uri=True)
59 |         elif isinstance(db, sqlite3.Connection):
60 |             self.dbconn = db
61 |         else:
62 |             raise TypeError('Invalid database object given: {}'.format(type(db)))
63 | 
64 |         self.our_userid = self.dbconn.execute(
65 |             "SELECT UserID FROM SelfInformation").fetchone()[0]
66 | 
67 |     @staticmethod
68 |     @abstractmethod
69 |     def name():
70 |         """
71 |         An abstractmethod that subclasses should implement to return their
72 |         user-friendly name
73 |         """
74 |         pass
75 | 
76 |     @staticmethod
77 |     def ensure_id_marked(eid, etype):
78 |         """
79 |         Given an entity ID and type (PeerUser, PeerChat, PeerChannel), return
80 |         the marked ID regardless of whether the ID is already marked.
81 | """ 82 | if etype == types.PeerUser: 83 | return eid 84 | if etype == types.PeerChat: 85 | if eid < 0: 86 | return eid 87 | return -eid 88 | if etype == types.PeerChannel: 89 | if str(eid).startswith('-100'): 90 | return eid 91 | # Append -100 at start. See telethon/utils.py get_peer_id. 92 | return -(eid + pow(10, math.floor(math.log10(eid) + 3))) 93 | 94 | def get_display_name(self, entity): 95 | """ 96 | Get the display name of a Chat, Channel, Supergroup, or User namedtuple, 97 | or a Bot API marked Context ID. Modeled on telethon/utils.py 98 | get_display_name. Return '' if there is no name, and raise ValueError 99 | if not passed one of the above types. 100 | """ 101 | if not entity: 102 | return '' 103 | 104 | if isinstance(entity, User): 105 | if entity.first_name and entity.last_name: 106 | return '{} {}'.format(entity.first_name, entity.last_name) 107 | elif entity.first_name: 108 | return entity.first_name 109 | elif entity.last_name: 110 | return entity.last_name 111 | return '' 112 | 113 | if isinstance(entity, (Supergroup, Channel, Chat)): 114 | if entity.title: 115 | return entity.title 116 | return '' 117 | 118 | if isinstance(entity, int): 119 | return self.get_display_name(self.get_entity(entity)) 120 | 121 | raise ValueError("Cannot get display name of a {} object".format(type(entity))) 122 | 123 | @staticmethod 124 | def get_timestamp(date): 125 | """Get a unix timestamp from an int, datetime, or date""" 126 | if date is None or isinstance(date, int): 127 | return date 128 | if isinstance(date, datetime.datetime): 129 | return date.timestamp() 130 | if isinstance(date, datetime.date): 131 | # Midnight at the start of that day 132 | return datetime.datetime.combine(date, datetime.time()).timestamp() 133 | 134 | @staticmethod 135 | def _build_query(*args): 136 | """ 137 | Helper method to build SQLite WHERE queries, automatically ignoring 138 | ``None`` values. 
The arguments should be tuples with two values: the
139 |         first being the name (e.g. "Date < ?") and the second the value.
140 | 
141 |         Returns a tuple consisting of (where_clause, params).
142 |         """
143 |         query = []
144 |         param = []
145 |         for arg in args:
146 |             if arg[1] is not None:
147 |                 query.append(arg[0])
148 |                 param.append(arg[1])
149 |         if query:
150 |             return ' WHERE ' + ' AND '.join(query), tuple(param)
151 |         return ' ', ()
152 | 
153 |     @classmethod
154 |     def _fetch_at_date(cls, cur, query, eid, at_date):
155 |         """
156 |         Helper method around the common operation to fetch a type by its ID
157 |         and a "DateUpdated" parameter.
158 |         """
159 |         where, query_params = cls._build_query(
160 |             ('ID = ?', eid),
161 |             ('DateUpdated <= ?', at_date)
162 |         )
163 |         # Find the newest dump before the specified date
164 |         cur.execute('{} {} ORDER BY DateUpdated DESC'
165 |                     .format(query, where), query_params)
166 |         row = cur.fetchone()
167 |         if row:
168 |             return row
169 |         # If it wasn't found in a dump from before the specified date, find the
170 |         # first time it was dumped after that date.
171 |         where, query_params = cls._build_query(
172 |             ('ID = ?', eid),
173 |             ('DateUpdated > ?', at_date)
174 |         )
175 |         cur.execute('{} {} ORDER BY DateUpdated ASC'
176 |                     .format(query, where), query_params)
177 |         return cur.fetchone()
178 | 
179 |     def format(self, target, file=None, *args, **kwargs):
180 |         """
181 |         The public method to format target contexts and output them to 'file'.
182 |         Target should be an individual Context ID. File can be a filename or
183 |         file-like object. If it is falsey, it will be interpreted as stdout.
184 |         """
185 |         if not file:
186 |             file = sys.stdout
187 |         elif isinstance(file, (str, Path)):
188 |             if os.path.isdir(file):
189 |                 file = os.path.join(file, str(target))
190 |             file = open(file, 'w')
191 |         elif not isinstance(file, TextIOWrapper):  # Is there a better way?
192 |             raise TypeError(
193 |                 "Supplied file {} could not be interpreted as a file"
194 |                 .format(file)
195 |             )
196 |         try:
197 |             if isinstance(target, int):
198 |                 return self._format(target, file, *args, **kwargs)
199 |             if isinstance(target, (User, Chat, Channel, Supergroup)):
200 |                 return self._format(target.id, file, *args, **kwargs)
201 |             raise TypeError("target should be a context ID or context namedtuple")
202 |         finally:
203 |             if file is not sys.stdout: file.close()  # 'with file:' would close stdout
204 | 
205 |     @abstractmethod
206 |     def _format(self, context_id, file, *args, **kwargs):
207 |         """
208 |         An abstract method that should be implemented by formatters.
209 |         Context ID will always be a Bot API style ID. File will always be
210 |         something like a file object or sys.stdout, suitable for usage with
211 |         print(file=file).
212 |         """
213 |         # TODO provide a way to format many targets into one directory with one
214 |         # method, and a format syntax to specify the name scheme of the output files.
215 |         pass
216 | 
217 |     def get_messages_from_context(self, context_id, start_date=None, end_date=None,
218 |                                   from_user_id=None, order='DESC',
219 |                                   include_service=True):
220 |         """
221 |         Yield Messages from a context. Start and end dates should be UTC timestamps
222 |         or datetime objects. Note that Channels will never yield any messages if
223 |         from_user_id is set, as there is no FromID for Channel messages. Order
224 |         should be ASC or DESC. Note that unlike the other methods, context_id
225 |         *must* be in the Bot API format where Channel/Supergroup IDs start with
226 |         -100 and old-style Chat IDs start with -.
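The Bot API style marking mentioned here (and in EXPLANATIONS.md) can be sketched standalone; the helper names below are illustrative and not part of the module, which uses `ensure_id_marked` and telethon's `get_peer_id` instead:

```python
# Standalone sketch of the Bot API ID marking scheme: user IDs are kept
# as-is, old-style chat IDs are negated, and channel/supergroup IDs are
# prefixed with -100. Function names here are hypothetical.

def mark_user(user_id):
    # User IDs are stored unchanged.
    return user_id

def mark_chat(chat_id):
    # Old-style Chat IDs are negated.
    return -abs(chat_id)

def mark_channel(channel_id):
    # Channel/Supergroup IDs get a "-100" prefix: 1234567 -> -1001234567.
    return int('-100' + str(abs(channel_id)))
```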
227 | """ 228 | start_date, end_date = self.get_timestamp(start_date), self.get_timestamp(end_date) 229 | where, params = self._build_query( 230 | ('ContextID = ?', context_id), 231 | ('Date > ?', start_date), 232 | ('Date < ?', end_date), 233 | ('FromID = ?', from_user_id) 234 | ) 235 | 236 | cur = self.dbconn.cursor() 237 | exclude_service = '' if include_service else ' AND ServiceAction is null' 238 | cur.execute( 239 | "SELECT ID, ContextID, Date, FromID, Message, ReplyMessageID, " 240 | "ForwardID, PostAuthor, ViewCount, MediaID, Formatting, ServiceAction" 241 | " FROM Message {}{} ORDER BY Date {}".format(where, exclude_service, 242 | order.upper()), 243 | params 244 | ) 245 | row = cur.fetchone() 246 | while row: 247 | yield self._message_from_row(row) 248 | row = cur.fetchone() 249 | if not row: 250 | return 251 | 252 | def _message_from_row(self, row): 253 | """ 254 | Take a row (ID, ContextID, Date, FromID, Text, ReplyMessageID, 255 | ForwardID, PostAuthor, ViewCount, MediaID, Formatting, ServiceAction) 256 | and add the values for out, reply_message, context, and from_user. Also 257 | replace date UTC timestamp with date UTC datetime. Return a Message. 258 | Something slightly worrying: if there is a chain of many replies, this 259 | is quite inefficient. If the chain is > 1000, possible recursion error. 
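The recursion concern noted in the docstring above (long reply chains resolved reply-by-reply) can be avoided by walking the chain iteratively. A sketch over a plain dict standing in for the Message table; the dict-based API is hypothetical, not part of this module:

```python
# Iterative reply-chain walk, sidestepping the recursion-depth concern
# raised in _message_from_row's docstring. `replies` maps a message ID
# to the ID it replies to (or None); purely illustrative.

def reply_chain(replies, msg_id):
    chain = [msg_id]
    seen = {msg_id}
    while replies.get(chain[-1]) is not None:
        nxt = replies[chain[-1]]
        if nxt in seen:  # Guard against cycles in dumped data
            break
        chain.append(nxt)
        seen.add(nxt)
    return chain
```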
260 | """ 261 | # TODO forwards, media 262 | out = row[3] == self.our_userid 263 | if row[5]: # ReplyMessageID 264 | reply = self.get_message_by_id(row[1], row[5]) 265 | else: 266 | reply = None 267 | context = self.get_entity(row[1]) 268 | if row[3]: # FromID 269 | from_user = self.get_user(row[3]) 270 | else: 271 | from_user = None 272 | date = datetime.datetime.fromtimestamp(row[2]) 273 | 274 | return Message(row[0], # ID 275 | row[1], # ContextID 276 | date, 277 | row[3], # FromID 278 | row[4], # Text 279 | row[5], # ReplyMessageID 280 | row[6], # ForwardID 281 | row[7], # PostAuthor 282 | row[8], # ViewCount 283 | row[9], # MediaID 284 | row[10], # Formatting 285 | out, 286 | row[11], # ServiceAction 287 | reply, 288 | context, 289 | from_user) 290 | 291 | 292 | def get_message_by_id(self, context_id, msg_id): 293 | """ 294 | Returns the unique message with the given context and message ID. 295 | Returns ``None`` if the message has not been dumped. 296 | """ 297 | where, params = self._build_query( 298 | ('ContextID = ?', context_id), 299 | ('ID = ?', msg_id) 300 | ) 301 | cur = self.dbconn.cursor() 302 | cur.execute( 303 | "SELECT ID, ContextID, Date, FromID, Message, ReplyMessageID, " 304 | "ForwardID, PostAuthor, ViewCount, MediaID, Formatting, " 305 | "ServiceAction FROM Message {}".format(where), params 306 | ) 307 | row = cur.fetchone() 308 | if row: 309 | return self._message_from_row(row) 310 | 311 | def iter_context_ids(self): 312 | """ 313 | Iterates over all the context IDs available. This method should 314 | be useful if one desires to format all the available conversations. 
315 | """ 316 | cur = self.dbconn.cursor() 317 | cur.execute('SELECT DISTINCT ContextID FROM Message') 318 | row = cur.fetchone() 319 | while row: 320 | yield row[0] 321 | row = cur.fetchone() 322 | 323 | def get_entity(self, context_id, at_date=None): 324 | """ 325 | Return the entity (user, chat or channel) corresponding to this context 326 | ID, at the given date (like all the specific methods). Context ID must 327 | be marked in the Bot API style, as with get_messages_from_context. 328 | """ 329 | peer_type = utils.resolve_id(context_id)[1] 330 | if peer_type == types.PeerUser: 331 | return self.get_user(context_id, at_date=at_date) 332 | elif peer_type == types.PeerChat: 333 | return self.get_chat(context_id, at_date=at_date) 334 | elif peer_type == types.PeerChannel: 335 | supergroup = self.get_supergroup(context_id, at_date=at_date) 336 | if not supergroup: 337 | return self.get_channel(context_id, at_date=at_date) 338 | return supergroup 339 | else: 340 | raise ValueError('Invalid ID {} given'.format(context_id)) 341 | 342 | def get_user(self, uid, at_date=None): 343 | """ 344 | Return the user with given ID or return None. If at_date is set, get 345 | the user as they were at the given date (to the best of our knowledge). 346 | If it is not set, get the user as we last saw them. at_date should be a UTC 347 | timestamp or datetime object. 348 | """ 349 | at_date = self.get_timestamp(at_date) 350 | uid = self.ensure_id_marked(uid, types.PeerUser) 351 | cur = self.dbconn.cursor() 352 | query = ( 353 | "SELECT ID, DateUpdated, FirstName, LastName, Username, " 354 | "Phone, Bio, Bot, CommonChatsCount, PictureID FROM User" 355 | ) 356 | row = self._fetch_at_date(cur, query, uid, at_date) 357 | if not row: 358 | return None 359 | user = User(*row) 360 | return user._replace(date_updated=datetime.datetime.fromtimestamp(user.date_updated)) 361 | 362 | def get_channel(self, cid, at_date=None): 363 | """ 364 | Return the channel with given ID or return None. 
If at_date is set, get 365 | the channel as it was at the given date (to the best of our knowledge). 366 | at_date should be a UTC timestamp or datetime object. 367 | """ 368 | at_date = self.get_timestamp(at_date) 369 | cid = self.ensure_id_marked(cid, types.PeerChannel) 370 | cur = self.dbconn.cursor() 371 | query = ( 372 | "SELECT ID, DateUpdated, About, Title, Username, " 373 | "PictureID, PinMessageID FROM Channel" 374 | ) 375 | row = self._fetch_at_date(cur, query, cid, at_date) 376 | if not row: 377 | return None 378 | channel = Channel(*row) 379 | return channel._replace(date_updated=datetime.datetime.fromtimestamp(channel.date_updated)) 380 | 381 | def get_supergroup(self, sid, at_date=None): 382 | """ 383 | Return the supergroup with given ID or return None. If at_date is set, 384 | get the supergroup as it was at the given date (to the best of our 385 | knowledge). at_date should be a UTC timestamp or datetime object. 386 | """ 387 | at_date = self.get_timestamp(at_date) 388 | sid = self.ensure_id_marked(sid, types.PeerChannel) 389 | cur = self.dbconn.cursor() 390 | query = ( 391 | "SELECT ID, DateUpdated, About, Title, Username, " 392 | "PictureID, PinMessageID FROM Supergroup" 393 | ) 394 | row = self._fetch_at_date(cur, query, sid, at_date) 395 | if not row: 396 | return None 397 | supergroup = Supergroup(*row) 398 | return supergroup._replace(date_updated=datetime.datetime.fromtimestamp( 399 | supergroup.date_updated)) 400 | 401 | def get_chat(self, cid, at_date=None): 402 | """ 403 | Return the chat with given ID or return None. If at_date is set, get 404 | the chat as it was at the given date (to the best of our knowledge). 405 | at_date should be a UTC timestamp or datetime object. 
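The "as of a date" lookup these getters share (via `_fetch_at_date`) reduces to: take the newest dump at or before `at_date`, otherwise fall back to the earliest dump after it. A pure-Python sketch over `(value, date_updated)` tuples, assuming `at_date` is always given; illustrative only:

```python
# Pure-Python sketch of the _fetch_at_date selection rule used by
# get_user/get_channel/get_supergroup/get_chat: prefer the newest dump
# at or before at_date, else the earliest one after it, else None.

def fetch_at_date(rows, at_date):
    before = [r for r in rows if r[1] <= at_date]
    if before:
        return max(before, key=lambda r: r[1])  # newest dump before the date
    after = [r for r in rows if r[1] > at_date]
    return min(after, key=lambda r: r[1]) if after else None
```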
406 | """ 407 | at_date = self.get_timestamp(at_date) 408 | cid = self.ensure_id_marked(cid, types.PeerChat) 409 | 410 | cur = self.dbconn.cursor() 411 | query = ( 412 | "SELECT ID, DateUpdated, Title, MigratedToID, PictureID FROM Chat" 413 | ) 414 | row = self._fetch_at_date(cur, query, cid, at_date) 415 | if not row: 416 | return None 417 | chat = Chat(*row) 418 | return chat._replace(date_updated=datetime.datetime.fromtimestamp(chat.date_updated)) 419 | 420 | def get_media(self, mid): 421 | """Return the Media with given ID or return None.""" 422 | cur = self.dbconn.cursor() 423 | cur.execute("SELECT ID, Name, MimeType, Size, ThumbnailID, Type, LocalID, " 424 | "VolumeID, Secret, Extra FROM Media WHERE ID = ?", (mid,)) 425 | row = cur.fetchone() 426 | if not row: 427 | return None 428 | return Media(*row) 429 | 430 | # if __name__ == '__main__': 431 | # main() 432 | -------------------------------------------------------------------------------- /telegram_export/formatters/htmlformatter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Formatter to display paginated(?) HTML of a context. 3 | Very much unfinished and needs a web designer to work on it. 4 | """ 5 | from . import BaseFormatter 6 | 7 | 8 | class HtmlFormatter(BaseFormatter): 9 | """A Formatter class to generate HTML""" 10 | @staticmethod 11 | def name(): 12 | return 'html' 13 | 14 | def output_header(self, file, context): 15 | """Output the header of the page. Context should be a namedtuple""" 16 | # TODO HTML 17 | print(self.get_display_name(context), file=file) 18 | 19 | def generate_message_html(self, message): 20 | """ 21 | Return HTML for a message, showing reply message, forward headers, 22 | view count, post author, and media (if applicable). 
23 |         """
24 |         # TODO HTML
25 |         from_name = self.get_display_name(message.from_id) or "(???)"
26 |         return "{}: {}".format(from_name, message.text)
27 | 
28 |     def _format(self, context_id, file, *args, **kwargs):
29 |         """Format the given context as HTML and output to 'file'"""
30 |         entity = self.get_entity(context_id)
31 | 
32 |         self.output_header(file, entity)
33 |         for message in self.get_messages_from_context(context_id,
34 |                                                       order='ASC'):
35 |             print(self.generate_message_html(message), file=file)
36 | 
--------------------------------------------------------------------------------
/telegram_export/formatters/nlpformatter.py:
--------------------------------------------------------------------------------
1 | """A Formatter class to output only message text, for NLP use"""
2 | from . import BaseFormatter
3 | 
4 | 
5 | class NlpFormatter(BaseFormatter):
6 |     """A Formatter class to output only the text of messages,
7 |     intended for natural language processing"""
8 |     @staticmethod
9 |     def name():
10 |         return 'nlp'
11 | 
12 |     def _format(self, context_id, file, *args, **kwargs):
13 |         """Format the given context as text and output to 'file'"""
14 |         entity = self.get_entity(context_id)
15 | 
16 |         for message in self.get_messages_from_context(context_id,
17 |                                                       order='ASC'):
18 |             if not message.text or message.service_action is not None:
19 |                 continue
20 |             print(message.text, file=file)
21 | 
--------------------------------------------------------------------------------
/telegram_export/formatters/textformatter.py:
--------------------------------------------------------------------------------
1 | """A Formatter class to output pure text"""
2 | from .
import BaseFormatter
3 | 
4 | UNKNOWN_USER_TEXT = '(???)'
5 | 
6 | class TextFormatter(BaseFormatter):
7 |     """A Formatter class to output pure text"""
8 |     @staticmethod
9 |     def name():
10 |         return 'text'
11 | 
12 |     def generate_message(self, message):
13 |         """Generate the text for a given Message namedtuple"""
14 |         who = self.get_display_name(
15 |             self.get_user(message.from_id)) or UNKNOWN_USER_TEXT
16 | 
17 |         if message.service_action:
18 |             return "Service action {}".format(message.service_action)
19 | 
20 |         if message.reply_message is not None:
21 |             if message.reply_message == ():  # Unlikely, message not dumped
22 |                 reply = ' (in reply to an undumped message)'
23 |             else:
24 |                 reply_sender = self.get_display_name(
25 |                     message.reply_message.from_user) or UNKNOWN_USER_TEXT
26 |                 replytext = message.reply_message.text or ''
27 |                 reply = ' (in reply to {}\'s: "{}")'.format(
28 |                     reply_sender, replytext)
29 |         else:
30 |             reply = ''
31 | 
32 |         when = message.date.strftime('[%d.%m.%y %H.%M.%S]')
33 |         return '{}, {}:{} {}'.format(who, when, reply, message.text)
34 | 
35 |     def _format(self, context_id, file, *args, **kwargs):
36 |         """Format the given context as text and output to 'file'"""
37 |         entity = self.get_entity(context_id)
38 |         name = self.get_display_name(entity) or 'unnamed'
39 | 
40 |         print('== Conversation with "{}" =='.format(name), file=file)
41 |         for message in self.get_messages_from_context(context_id,
42 |                                                       order='ASC'):
43 |             print(self.generate_message(message), file=file)
44 | 
--------------------------------------------------------------------------------
/telegram_export/tests.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import configparser
3 | import random
4 | import shutil
5 | import string
6 | import time
7 | import unittest
8 | from datetime import datetime, timedelta
9 | from pathlib import Path
10 | 
11 | import utils
12 | from downloader import Downloader
13 | from dumper import Dumper
14 | from telethon
import TelegramClient, utils as tl_utils 15 | from telethon.errors import ( 16 | PhoneNumberOccupiedError, SessionPasswordNeededError 17 | ) 18 | from telethon.extensions import markdown 19 | from telethon.tl import functions, types 20 | 21 | from formatters import BaseFormatter 22 | 23 | # Configuration as to which tests to run 24 | ALLOW_NETWORK = False 25 | 26 | 27 | def gen_username(length): 28 | """Generates a random username of max length "length" (minimum 4)""" 29 | letters = string.ascii_letters + string.digits 30 | return 'exp_' + ''.join(random.choice(letters) for _ in range(length - 4)) 31 | 32 | 33 | def login_client(client, username): 34 | """ 35 | Logs-in the given client and sets the desired username. 36 | 37 | This method will sign up, sign in, or delete existing 2FA-protected 38 | accounts as required. 39 | """ 40 | client.session.set_dc(0, '149.154.167.40', 80) 41 | assert client.connect() 42 | phone = '+999662' + str(random.randint(0, 9999)).zfill(4) 43 | client.send_code_request(phone) 44 | while True: 45 | try: 46 | print('Signing up as', phone) 47 | client.sign_up('22222', username, 'User') 48 | break 49 | except PhoneNumberOccupiedError: 50 | try: 51 | print('Signing in as', phone) 52 | client.sign_in(phone, '22222') 53 | break 54 | except SessionPasswordNeededError: 55 | print('Occupied', phone, 'had password! 
Deleting!') 56 | client(functions.account.DeleteAccountRequest('')) 57 | 58 | print('Changing', phone, 'username to', username) 59 | client(functions.account.UpdateUsernameRequest(username)) 60 | 61 | 62 | class TestDumpAll(unittest.TestCase): 63 | @classmethod 64 | def setUpClass(cls): 65 | cls.dumper_config = {'DBFileName': 'test_db', 'OutputDirectory': 'test_work_dir', 66 | 'MaxSize': 0} 67 | # TODO test with different configurations 68 | 69 | assert not Path(cls.dumper_config['OutputDirectory']).exists() 70 | 71 | Path(cls.dumper_config['OutputDirectory']).mkdir() 72 | 73 | config = configparser.ConfigParser() 74 | config.read('config.ini') 75 | config = config['TelegramAPI'] 76 | 77 | cls.client = TelegramClient(None, config['ApiId'], config['ApiHash']) 78 | login_client(cls.client, gen_username(10)) 79 | 80 | dumper = Dumper(cls.dumper_config) 81 | dumper.check_self_user(cls.client.get_me().id) 82 | 83 | @classmethod 84 | def tearDownClass(cls): 85 | shutil.rmtree(cls.dumper_config['OutputDirectory']) 86 | 87 | def test_interrupted_dump(self): 88 | """ 89 | This test ensures that all messages are retrieved even 90 | when dumping is interrupted and resumed.
91 | """ 92 | if not ALLOW_NETWORK: 93 | raise unittest.SkipTest('Network tests are disabled') 94 | 95 | dumper = Dumper(self.dumper_config) 96 | dumper.chunk_size = 1 97 | SEND, DUMP = True, False 98 | actions = ( 99 | (3, SEND), 100 | (2, DUMP), 101 | (2, SEND), 102 | (2, DUMP), # Actually one will be dumped then back to start 103 | (1, SEND), 104 | (2, DUMP), 105 | (1, SEND), 106 | (2, DUMP), # Actually one will be saved and the other updated 107 | (2, SEND), 108 | (3, DUMP), 109 | (1, SEND), 110 | (1, DUMP), 111 | (1, DUMP), 112 | ) 113 | 114 | self.client(functions.messages.DeleteHistoryRequest('me', 0)) 115 | downloader = Downloader(self.client, self.dumper_config, dumper, 116 | loop=asyncio.get_event_loop()) 117 | 118 | which = 1 119 | for amount, what in actions: 120 | if what is SEND: 121 | print('Sending', amount, 'messages...') 122 | for _ in range(amount): 123 | self.client.send_message('me', str(which)) 124 | which += 1 125 | time.sleep(1) 126 | else: 127 | print('Dumping', amount, 'messages...') 128 | chunks = (amount + dumper.chunk_size - 1) // dumper.chunk_size 129 | dumper.max_chunks = chunks 130 | downloader.start('me') 131 | 132 | messages = self.client.get_message_history('me', limit=None) 133 | print('Full history') 134 | for msg in reversed(messages): 135 | print('ID:', msg.id, '; Message:', msg.message) 136 | 137 | print('Dumped history') 138 | fmt = BaseFormatter(dumper.conn) 139 | my_id = self.client.get_me().id 140 | dumped = list(fmt.get_messages_from_context(my_id, order='DESC')) 141 | for msg in dumped: 142 | print('ID:', msg.id, '; Message:', msg.text) 143 | 144 | print('Asserting dumped history matches...') 145 | assert len(messages) == len(dumped), 'Not all messages were dumped' 146 | assert all(a.id == b.id and a.message == b.text 147 | for a, b in zip(messages, dumped)),\ 148 | 'Dumped messages do not match' 149 | 150 | print('All good! 
Test passed!') 151 | self.client.disconnect() 152 | 153 | def test_dump_methods(self): 154 | """Test dumper.dump_* works""" 155 | dumper = Dumper(self.dumper_config) 156 | message = types.Message( 157 | id=777, 158 | to_id=types.PeerUser(123), 159 | date=datetime.now(), 160 | message='Hello', 161 | out=True, 162 | via_bot_id=1000, 163 | fwd_from=types.MessageFwdHeader( 164 | date=datetime.now() - timedelta(days=1), 165 | from_id=321 166 | ) 167 | ) 168 | fwd_id = dumper.dump_forward(message.fwd_from) 169 | dumper.dump_message(message, 123, forward_id=fwd_id, media_id=None) 170 | 171 | message = types.Message( 172 | id=778, 173 | to_id=types.PeerUser(321), 174 | date=datetime.now(), 175 | message='Hello', 176 | out=False, 177 | via_bot_id=1000, 178 | media=types.MessageMediaPhoto( 179 | caption='Hi', 180 | ttl_seconds=40, 181 | photo=types.Photo( 182 | id=2357, 183 | access_hash=-123456789, 184 | date=datetime.now(), 185 | sizes=[ 186 | types.PhotoSize( 187 | type='X', 188 | w=100, 189 | h=100, 190 | size=100 * 100, 191 | location=types.FileLocation( 192 | dc_id=2, 193 | volume_id=5, 194 | local_id=7532, 195 | secret=987654321 196 | ) 197 | ) 198 | ] 199 | ) 200 | ) 201 | ) 202 | loc = dumper.dump_media(message.media) 203 | dumper.dump_message(message, 123, forward_id=None, media_id=loc) 204 | dumper.dump_message_service(context_id=123, media_id=loc, message=types.MessageService( 205 | id=779, 206 | to_id=123, 207 | date=datetime.now(), 208 | action=types.MessageActionScreenshotTaken() 209 | )) 210 | 211 | me = types.User( 212 | id=123, 213 | is_self=True, 214 | access_hash=13515, 215 | first_name='Me', 216 | username='justme', 217 | phone='1234567' 218 | ) 219 | dumper.dump_user(photo_id=None, user_full=types.UserFull( 220 | user=me, 221 | link=types.contacts.Link( 222 | my_link=types.ContactLinkContact(), 223 | foreign_link=types.ContactLinkContact(), 224 | user=me 225 | ), 226 | notify_settings=types.PeerNotifySettings(0, 'beep'), 227 | common_chats_count=3 228 | 
)) 229 | dumper.dump_chat(photo_id=None, chat=types.Chat( 230 | id=7264, 231 | title='Chat', 232 | photo=types.ChatPhotoEmpty(), 233 | participants_count=5, 234 | date=datetime.now() - timedelta(days=10), 235 | version=1 236 | )) 237 | 238 | channel = types.Channel( 239 | id=8247, 240 | title='Channel', 241 | photo=types.ChatPhotoEmpty(), 242 | username='justchannel', 243 | participants_count=17, 244 | date=datetime.now() - timedelta(days=5), 245 | version=7 246 | ) 247 | channel_full = types.ChannelFull( 248 | id=8247, 249 | about='Just a Channel', 250 | read_inbox_max_id=1051, 251 | read_outbox_max_id=8744, 252 | unread_count=1568, 253 | chat_photo=types.PhotoEmpty(id=176489), 254 | notify_settings=types.PeerNotifySettingsEmpty(), 255 | exported_invite=types.ChatInviteEmpty(), 256 | bot_info=[] 257 | ) 258 | dumper.dump_supergroup(channel_full, channel, photo_id=None) 259 | dumper.dump_channel(channel_full, channel, photo_id=None) 260 | 261 | def test_dump_msg_entities(self): 262 | """Show that entities are correctly parsed and stored""" 263 | message = types.Message( 264 | id=1, 265 | to_id=types.PeerUser(321), 266 | date=datetime.now(), 267 | message='No entities' 268 | ) 269 | dumper = Dumper(self.dumper_config) 270 | fmt = BaseFormatter(dumper.conn) 271 | 272 | # Test with no entities 273 | dumper.dump_message(message, 123, None, None) 274 | dumper.commit() 275 | assert not next(fmt.get_messages_from_context(123, order='DESC')).formatting 276 | 277 | # Test with many entities 278 | text, entities = markdown.parse( 279 | 'Testing message with __italic__, **bold**, inline ' 280 | '[links](https://example.com) and [mentions](@hi), ' 281 | 'as well as `code` and ``pre`` blocks.' 
282 | ) 283 | entities[3] = types.MessageEntityMentionName( 284 | entities[3].offset, entities[3].length, 123 285 | ) 286 | message.id = 2 287 | message.date -= timedelta(days=1) 288 | message.message = text 289 | message.entities = entities 290 | dumper.dump_message(message, 123, None, None) 291 | dumper.commit() 292 | msg = next(fmt.get_messages_from_context(123, order='ASC')) 293 | assert utils.decode_msg_entities(msg.formatting) == message.entities 294 | 295 | def test_formatter_get_chat(self): 296 | """ 297 | Ensures that the BaseFormatter is able to fetch the expected 298 | entities when using a date parameter. 299 | """ 300 | chat = types.Chat( 301 | id=123, 302 | title='Some title', 303 | photo=types.ChatPhotoEmpty(), 304 | participants_count=7, 305 | date=datetime.now(), 306 | version=1 307 | ) 308 | dumper = Dumper(self.dumper_config) 309 | 310 | fmt = BaseFormatter(dumper.conn) 311 | for month in range(1, 13): 312 | dumper.dump_chat(chat, None, timestamp=int(datetime( 313 | year=2010, month=month, day=1 314 | ).timestamp())) 315 | dumper.commit() 316 | cid = tl_utils.get_peer_id(chat) 317 | # Default should get the most recent version 318 | date = fmt.get_chat(cid).date_updated 319 | assert date == datetime(year=2010, month=12, day=1) 320 | 321 | # Expected behaviour is to get the previous available date 322 | target = datetime(year=2010, month=6, day=29) 323 | date = fmt.get_chat(cid, target).date_updated 324 | assert date == datetime(year=2010, month=6, day=1) 325 | 326 | # Expected behaviour is to get the next date if previous unavailable 327 | target = datetime(year=2009, month=12, day=1) 328 | date = fmt.get_chat(cid, target).date_updated 329 | assert date == datetime(year=2010, month=1, day=1) 330 | 331 | def test_formatter_get_messages(self): 332 | """ 333 | Ensures that the BaseFormatter is able to correctly yield messages. 
334 | """ 335 | dumper = Dumper(self.dumper_config) 336 | msg = types.Message( 337 | id=1, 338 | to_id=123, 339 | date=datetime(year=2010, month=1, day=1), 340 | message='hi' 341 | ) 342 | for _ in range(365): 343 | dumper.dump_message(msg, 123, forward_id=None, media_id=None) 344 | msg.id += 1 345 | msg.date += timedelta(days=1) 346 | msg.to_id = 300 - msg.to_id # Flip between two IDs 347 | dumper.commit() 348 | fmt = BaseFormatter(dumper.conn) 349 | 350 | # Assert all messages are returned 351 | assert len(list(fmt.get_messages_from_context(123))) == 365 352 | 353 | # Assert only messages after a date are returned 354 | min_date = datetime(year=2010, month=4, day=1) 355 | assert all(m.date >= min_date for m in fmt.get_messages_from_context( 356 | 123, start_date=min_date 357 | )) 358 | 359 | # Assert only messages before a date are returned 360 | max_date = datetime(year=2010, month=4, day=1) 361 | assert all(m.date <= max_date for m in fmt.get_messages_from_context( 362 | 123, end_date=max_date 363 | )) 364 | 365 | # Assert messages are returned in a range 366 | assert all(min_date <= m.date <= max_date for m in 367 | fmt.get_messages_from_context( 368 | 123, start_date=min_date, end_date=max_date 369 | )) 370 | 371 | # Assert messages are returned in the correct order 372 | desc = list(fmt.get_messages_from_context(123, order='DESC')) 373 | assert all(desc[i - 1] > desc[i] for i in range(1, len(desc))) 374 | 375 | asc = list(fmt.get_messages_from_context(123, order='ASC')) 376 | assert all(asc[i - 1] < asc[i] for i in range(1, len(asc))) 377 | 378 | 379 | if __name__ == '__main__': 380 | unittest.main() 381 | -------------------------------------------------------------------------------- /telegram_export/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/expectocode/telegram-export/ed1cbc6ac364ada137b3fc8f9ffb8170084a65a4/telegram_export/tests/__init__.py 
-------------------------------------------------------------------------------- /telegram_export/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import socks 3 | from telegram_export.utils import parse_proxy_str 4 | 5 | 6 | class TestUtils(unittest.TestCase): 7 | 8 | def setUp(self): 9 | pass 10 | 11 | def tearDown(self): 12 | pass 13 | 14 | def test_parse_proxy_str(self): 15 | host = "127.0.0.1" 16 | port = 1080 17 | 18 | proxy = (socks.SOCKS5, host, port) 19 | proxy_str = "socks5://127.0.0.1:1080" 20 | self.assertEqual(parse_proxy_str(proxy_str), proxy) 21 | 22 | proxy_str = "http://127.0.0.1:1080" 23 | proxy = (socks.HTTP, host, port) 24 | self.assertEqual(parse_proxy_str(proxy_str), proxy) 25 | 26 | proxy_str = "socks4://login:password@127.0.0.1:1080" 27 | proxy = (socks.SOCKS4, host, port, True, "login", "password") 28 | self.assertEqual(parse_proxy_str(proxy_str), proxy) 29 | 30 | proxy_str = "bad_type://login:password@127.0.0.1:1080" 31 | with self.assertRaises(ValueError): 32 | parse_proxy_str(proxy_str) 33 | 34 | proxy_str = "bad_type://127.0.0.1" 35 | with self.assertRaises(ValueError): 36 | parse_proxy_str(proxy_str) 37 | 38 | proxy_str = "bad_type:127.0.0.1" 39 | with self.assertRaises(ValueError): 40 | parse_proxy_str(proxy_str) 41 | 42 | proxy_str = "127.0.0.1:1080" 43 | with self.assertRaises(ValueError): 44 | parse_proxy_str(proxy_str) 45 | -------------------------------------------------------------------------------- /telegram_export/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for telegram-export which aren't specific to one purpose""" 2 | import mimetypes 3 | 4 | from telethon.tl import types 5 | from urllib.parse import urlparse 6 | try: 7 | import socks 8 | except ImportError: 9 | socks = None 10 | 11 | ENTITY_TO_TEXT = { 12 | types.MessageEntityPre: 'pre', 13 | types.MessageEntityCode: 'code', 
14 | types.MessageEntityBold: 'bold', 15 | types.MessageEntityItalic: 'italic', 16 | types.MessageEntityTextUrl: 'texturl', 17 | types.MessageEntityUrl: 'url', 18 | types.MessageEntityMentionName: 'mentionname' 19 | } 20 | 21 | TEXT_TO_ENTITY = {v: k for k, v in ENTITY_TO_TEXT.items()} 22 | 23 | # The mimetypes module has many extensions for the same mimetype and it will 24 | # return the one that happens to be first (e.g. ".bat" for "text/plain"). 25 | # This map contains a few common mimetypes and their most common extension. 26 | # 27 | # The following code can be used to find out which mimetypes have several extensions: 28 | ''' 29 | import mimetypes 30 | from collections import defaultdict 31 | d = defaultdict(list) 32 | for k, v in mimetypes.types_map.items(): 33 | d[v].append(k) 34 | 35 | d = {k: v for k, v in d.items() if len(v) > 1} 36 | ''' 37 | COMMON_MIME_TO_EXTENSION = { 38 | 'text/plain': '.txt', # To avoid ".bat" 39 | 'image/jpeg': '.jpg', # To avoid ".jpe" 40 | 'image/bmp': '.bmp', # To avoid ".dib" 41 | 'video/mp4': '.mp4', # To avoid ".m4v" 42 | } 43 | 44 | 45 | def encode_msg_entities(entities): 46 | """ 47 | Encodes a list of MessageEntity into a string so it 48 | can easily be dumped into e.g. Dumper's database. 49 | """ 50 | if not entities: 51 | return None 52 | parsed = [] 53 | for entity in entities: 54 | if type(entity) in ENTITY_TO_TEXT: 55 | if isinstance(entity, types.MessageEntityTextUrl): 56 | extra = ',{}'.format( 57 | entity.url.replace(',', '%2c').replace(';', '%3b') 58 | ) 59 | elif isinstance(entity, types.MessageEntityMentionName): 60 | extra = ',{}'.format(entity.user_id) 61 | else: 62 | extra = '' 63 | parsed.append('{},{},{}{}'.format( 64 | ENTITY_TO_TEXT[type(entity)], 65 | entity.offset, entity.length, extra 66 | )) 67 | return ';'.join(parsed) 68 | 69 | 70 | def decode_msg_entities(string): 71 | """ 72 | Reverses the transformation made by ``utils.encode_msg_entities``.
73 | """ 74 | if not string: 75 | return None 76 | parsed = [] 77 | for part in string.split(';'): 78 | split = part.split(',') 79 | kind, offset, length = split[0], int(split[1]), int(split[2]) 80 | if kind in TEXT_TO_ENTITY: 81 | if kind == 'texturl': 82 | parsed.append(types.MessageEntityTextUrl( 83 | offset, length, split[-1] 84 | )) 85 | elif kind == 'mentionname': 86 | parsed.append(types.MessageEntityMentionName( 87 | offset, length, int(split[-1]) 88 | )) 89 | else: 90 | parsed.append(TEXT_TO_ENTITY[kind](offset, length)) 91 | return parsed 92 | 93 | 94 | def get_media_type(media): 95 | """ 96 | Returns a friendly type for the given media. 97 | """ 98 | if not media: 99 | return '' 100 | 101 | if isinstance(media, types.MessageMediaPhoto): 102 | return 'photo' 103 | 104 | elif isinstance(media, types.MessageMediaDocument): 105 | if isinstance(media.document, types.Document): 106 | for attr in media.document.attributes: 107 | if isinstance(attr, types.DocumentAttributeSticker): 108 | return 'document.sticker' 109 | elif isinstance(attr, types.DocumentAttributeVideo): 110 | return 'document.video' 111 | elif isinstance(attr, types.DocumentAttributeAnimated): 112 | return 'document.animated' 113 | elif isinstance(attr, types.DocumentAttributeAudio): 114 | if attr.voice: 115 | return 'document.voice' 116 | return 'document.audio' 117 | return 'document' 118 | 119 | if isinstance(media, (types.Photo, 120 | types.UserProfilePhoto, types.ChatPhoto)): 121 | return 'chatphoto' 122 | 123 | return 'unknown' 124 | 125 | 126 | def get_extension(mime): 127 | """ 128 | Returns the most common extension for the given mimetype, or '.bin' if 129 | none can be found to indicate that it contains arbitrary binary data.
130 | """ 131 | if not mime: 132 | mime = '' 133 | 134 | return ( 135 | COMMON_MIME_TO_EXTENSION.get(mime) 136 | or mimetypes.guess_extension(mime) 137 | or '.bin' 138 | ) 139 | 140 | 141 | def get_file_location(media): 142 | """ 143 | Helper method to turn arbitrary media into (InputFileLocation, size/None). 144 | """ 145 | location = file_size = None 146 | if isinstance(media, types.MessageMediaPhoto): 147 | media = media.photo 148 | 149 | if isinstance(media, types.Photo): 150 | for size in reversed(media.sizes): 151 | if isinstance(size, types.PhotoSize): 152 | if isinstance(size.location, types.FileLocation): 153 | file_size = size.size 154 | location = size.location 155 | break 156 | elif isinstance(media, types.MessageMediaDocument): 157 | if isinstance(media.document, types.Document): 158 | file_size = media.document.size 159 | location = types.InputDocumentFileLocation( 160 | id=media.document.id, 161 | access_hash=media.document.access_hash, 162 | version=media.document.version 163 | ) 164 | elif isinstance(media, (types.UserProfilePhoto, types.ChatPhoto)): 165 | if isinstance(media.photo_big, types.FileLocation): 166 | location = media.photo_big 167 | elif isinstance(media.photo_small, types.FileLocation): 168 | location = media.photo_small 169 | 170 | if isinstance(location, types.FileLocation): 171 | location = types.InputFileLocation( 172 | volume_id=location.volume_id, 173 | local_id=location.local_id, 174 | secret=location.secret 175 | ) 176 | 177 | return location, file_size 178 | 179 | 180 | def action_to_name(action): 181 | """ 182 | Returns a namespace'd "friendly" name for the given 183 | ``MessageAction`` or ``ChannelAdminLogEventAction``. 
184 | """ 185 | return { 186 | types.MessageActionChannelCreate: 'channel.create', 187 | types.MessageActionChannelMigrateFrom: 'channel.migratefrom', 188 | types.MessageActionChatAddUser: 'chat.adduser', 189 | types.MessageActionChatCreate: 'chat.create', 190 | types.MessageActionChatDeletePhoto: 'chat.deletephoto', 191 | types.MessageActionChatDeleteUser: 'chat.deleteuser', 192 | types.MessageActionChatEditPhoto: 'chat.editphoto', 193 | types.MessageActionChatEditTitle: 'chat.edittitle', 194 | types.MessageActionChatJoinedByLink: 'chat.joinedbylink', 195 | types.MessageActionChatMigrateTo: 'chat.migrateto', 196 | types.MessageActionCustomAction: 'custom', 197 | types.MessageActionEmpty: 'empty', 198 | types.MessageActionGameScore: 'game.score', 199 | types.MessageActionHistoryClear: 'history.clear', 200 | types.MessageActionPaymentSent: 'payment.sent', 201 | types.MessageActionPaymentSentMe: 'payment.sentme', 202 | types.MessageActionPhoneCall: 'phone.call', 203 | types.MessageActionPinMessage: 'pin.message', 204 | types.MessageActionScreenshotTaken: 'screenshottaken', 205 | 206 | types.ChannelAdminLogEventActionChangeAbout: 'change.about', 207 | types.ChannelAdminLogEventActionChangePhoto: 'change.photo', 208 | types.ChannelAdminLogEventActionChangeStickerSet: 'change.stickerset', 209 | types.ChannelAdminLogEventActionChangeTitle: 'change.title', 210 | types.ChannelAdminLogEventActionChangeUsername: 'change.username', 211 | types.ChannelAdminLogEventActionDeleteMessage: 'delete.message', 212 | types.ChannelAdminLogEventActionEditMessage: 'edit.message', 213 | types.ChannelAdminLogEventActionParticipantInvite: 'participant.invite', 214 | types.ChannelAdminLogEventActionParticipantJoin: 'participant.join', 215 | types.ChannelAdminLogEventActionParticipantLeave: 'participant.leave', 216 | types.ChannelAdminLogEventActionParticipantToggleAdmin: 'participant.toggleadmin', 217 | types.ChannelAdminLogEventActionParticipantToggleBan: 'participant.toggleban', 218 | 
types.ChannelAdminLogEventActionToggleInvites: 'toggle.invites', 219 | types.ChannelAdminLogEventActionTogglePreHistoryHidden: 'toggle.prehistoryhidden', 220 | types.ChannelAdminLogEventActionToggleSignatures: 'toggle.signatures', 221 | types.ChannelAdminLogEventActionUpdatePinned: 'update.pinned', 222 | }.get(type(action), None) 223 | 224 | 225 | def parse_proxy_str(proxy_str): 226 | """ 227 | Returns a proxy tuple parsed from the given string 228 | """ 229 | if socks is None: 230 | raise Exception('Please install PySocks if you want to use a proxy') 231 | url_parser = urlparse(proxy_str) 232 | proxy_type = None 233 | proxy_type_str = url_parser.scheme 234 | 235 | if proxy_type_str.lower() == "socks5": 236 | proxy_type = socks.SOCKS5 237 | elif proxy_type_str.lower() == "socks4": 238 | proxy_type = socks.SOCKS4 239 | elif proxy_type_str.lower() == "https": 240 | proxy_type = socks.HTTP 241 | elif proxy_type_str.lower() == "http": 242 | proxy_type = socks.HTTP 243 | else: 244 | raise ValueError("Proxy type %s is not supported" % proxy_type_str) 245 | 246 | host = url_parser.hostname 247 | port = url_parser.port 248 | 249 | if host is None: 250 | raise ValueError("Host parsing error") 251 | if port is None: 252 | raise ValueError("Port parsing error") 253 | 254 | user = url_parser.username 255 | password = url_parser.password 256 | 257 | if user is not None and password is not None: 258 | proxy = (proxy_type, host, port, True, user, password) 259 | else: 260 | proxy = (proxy_type, host, port) 261 | return proxy 262 | --------------------------------------------------------------------------------
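The proxy-string format accepted by `utils.parse_proxy_str` above can be exercised in isolation. The sketch below mirrors that parsing logic using only the standard library; `parse_proxy_sketch` and its integer constants are illustrative stand-ins (not part of the project) that take the place of the `socks` module's SOCKS4/SOCKS5/HTTP values, so it runs without PySocks installed:

```python
from urllib.parse import urlparse

# Placeholder constants; in parse_proxy_str these come from the socks module.
SOCKS4, SOCKS5, HTTP = 1, 2, 3
SCHEMES = {'socks4': SOCKS4, 'socks5': SOCKS5, 'http': HTTP, 'https': HTTP}


def parse_proxy_sketch(proxy_str):
    """Parse 'scheme://[user:password@]host:port' into a proxy tuple."""
    parsed = urlparse(proxy_str)
    proxy_type = SCHEMES.get(parsed.scheme.lower())
    if proxy_type is None:
        raise ValueError('Proxy type %s is not supported' % parsed.scheme)
    if parsed.hostname is None:
        raise ValueError('Host parsing error')
    if parsed.port is None:
        raise ValueError('Port parsing error')
    if parsed.username is not None and parsed.password is not None:
        # Authenticated proxy: (type, host, port, rdns, user, password)
        return (proxy_type, parsed.hostname, parsed.port,
                True, parsed.username, parsed.password)
    return (proxy_type, parsed.hostname, parsed.port)


print(parse_proxy_sketch('socks4://login:password@127.0.0.1:1080'))
# -> (1, '127.0.0.1', 1080, True, 'login', 'password')
```

Note that a bare `host:port` string has no scheme, so it is rejected with `ValueError`, matching the cases covered in `tests/test_utils.py`.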