├── .github
    └── workflows
    │   └── pythonapp.yml
├── README.org
├── .gitignore
└── convert_to_plaintext.py


/.github/workflows/pythonapp.yml:
--------------------------------------------------------------------------------
 1 | name: Python application
 2 | 
 3 | on: [push]
 4 | 
 5 | jobs:
 6 |   build:
 7 | 
 8 |     runs-on: ubuntu-latest
 9 | 
10 |     steps:
11 |     - uses: actions/checkout@v1
12 |     - name: Set up Python 3.7
13 |       uses: actions/setup-python@v1
14 |       with:
15 |         python-version: 3.7
16 |     - name: Install dependencies
17 |       run: |
18 |         python -m pip install --upgrade pip
19 |     - name: Lint with mypy
20 |       run: |
21 |         pip install mypy
22 |         mypy *.py
23 |     - name: Lint with pylint
24 |       run: |
25 |         pip install pylint
26 |         pylint -E *.py
27 | 


--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
 1 | #+begin_src python :exports results :results output drawer
 2 |   import convert_to_plaintext
 3 |   print(convert_to_plaintext.__doc__)
 4 | #+end_src
 5 | 
 6 | #+RESULTS:
 7 | :results:
 8 | 
 9 | Script to convert HTML output from [[https://github.com/fabianonline/telegram_backup][telegram_backup]] tool to plaintext with proper filenames.
10 | 
 11 | I'm using it for quick searches (e.g. with =grep=) through messages without having to open the web interface or the mobile app. You can read more about it [[https://beepb00p.xyz/pkm-search.html#other][here]].
12 | 
13 | * Usage
14 | 
 15 | 1. Read the [[https://github.com/fabianonline/telegram_backup#usage][usage for telegram_backup]] to back up your account. Don't forget to run it with =--export html=!
 16 | 2. Run this script with the same =--target= and =--account= arguments as for the backup script, and pass an =--output= directory.
17 | 
18 | Personally I've got all this set up as a daily Cron job.
19 | 
20 | ** Dependencies
21 | 
22 | ~apt install sqlite3 html2text~
23 | 
24 | :end:
25 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | # Created by https://www.gitignore.io/api/python,emacs
  3 | # Edit at https://www.gitignore.io/?templates=python,emacs
  4 | 
  5 | ### Emacs ###
  6 | # -*- mode: gitignore; -*-
  7 | *~
  8 | \#*\#
  9 | /.emacs.desktop
 10 | /.emacs.desktop.lock
 11 | *.elc
 12 | auto-save-list
 13 | tramp
 14 | .\#*
 15 | 
 16 | # Org-mode
 17 | .org-id-locations
 18 | *_archive
 19 | 
 20 | # flymake-mode
 21 | *_flymake.*
 22 | 
 23 | # eshell files
 24 | /eshell/history
 25 | /eshell/lastdir
 26 | 
 27 | # elpa packages
 28 | /elpa/
 29 | 
 30 | # reftex files
 31 | *.rel
 32 | 
 33 | # AUCTeX auto folder
 34 | /auto/
 35 | 
 36 | # cask packages
 37 | .cask/
 38 | dist/
 39 | 
 40 | # Flycheck
 41 | flycheck_*.el
 42 | 
 43 | # server auth directory
 44 | /server/
 45 | 
 46 | # projectiles files
 47 | .projectile
 48 | 
 49 | # directory configuration
 50 | .dir-locals.el
 51 | 
 52 | # network security
 53 | /network-security.data
 54 | 
 55 | 
 56 | ### Python ###
 57 | # Byte-compiled / optimized / DLL files
 58 | __pycache__/
 59 | *.py[cod]
 60 | *$py.class
 61 | 
 62 | # C extensions
 63 | *.so
 64 | 
 65 | # Distribution / packaging
 66 | .Python
 67 | build/
 68 | develop-eggs/
 69 | downloads/
 70 | eggs/
 71 | .eggs/
 72 | lib/
 73 | lib64/
 74 | parts/
 75 | sdist/
 76 | var/
 77 | wheels/
 78 | pip-wheel-metadata/
 79 | share/python-wheels/
 80 | *.egg-info/
 81 | .installed.cfg
 82 | *.egg
 83 | MANIFEST
 84 | 
 85 | # PyInstaller
 86 | #  Usually these files are written by a python script from a template
 87 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 88 | *.manifest
 89 | *.spec
 90 | 
 91 | # Installer logs
 92 | pip-log.txt
 93 | pip-delete-this-directory.txt
 94 | 
 95 | # Unit test / coverage reports
 96 | htmlcov/
 97 | .tox/
 98 | .nox/
 99 | .coverage
100 | .coverage.*
101 | .cache
102 | nosetests.xml
103 | coverage.xml
104 | *.cover
105 | .hypothesis/
106 | .pytest_cache/
107 | 
108 | # Translations
109 | *.mo
110 | *.pot
111 | 
112 | # Scrapy stuff:
113 | .scrapy
114 | 
115 | # Sphinx documentation
116 | docs/_build/
117 | 
118 | # PyBuilder
119 | target/
120 | 
121 | # pyenv
122 | .python-version
123 | 
124 | # pipenv
125 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
126 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
127 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
128 | #   install all needed dependencies.
129 | #Pipfile.lock
130 | 
131 | # celery beat schedule file
132 | celerybeat-schedule
133 | 
134 | # SageMath parsed files
135 | *.sage.py
136 | 
137 | # Spyder project settings
138 | .spyderproject
139 | .spyproject
140 | 
141 | # Rope project settings
142 | .ropeproject
143 | 
144 | # Mr Developer
145 | .mr.developer.cfg
146 | .project
147 | .pydevproject
148 | 
149 | # mkdocs documentation
150 | /site
151 | 
152 | # mypy
153 | .mypy_cache/
154 | .dmypy.json
155 | dmypy.json
156 | 
157 | # Pyre type checker
158 | .pyre/
159 | 
160 | # End of https://www.gitignore.io/api/python,emacs
161 | 


--------------------------------------------------------------------------------
/convert_to_plaintext.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Script to convert HTML output from [[https://github.com/fabianonline/telegram_backup][telegram_backup]] tool to plaintext with proper filenames.
  4 | 
  5 | I'm using it for quick searches (e.g. with =grep=) through messages without having to open the web interface or the mobile app. You can read more about it [[https://beepb00p.xyz/pkm-search.html#other][here]].
  6 | 
  7 | * Usage
  8 | 
  9 | 1. Read the [[https://github.com/fabianonline/telegram_backup#usage][usage for telegram_backup]] to back up your account. Don't forget to run it with =--export html=!
 10 | 2. Run this script with the same =--target= and =--account= arguments as for the backup script, and pass an =--output= directory.
 11 | 
 12 | Personally I've got all this set up as a daily Cron job.
 13 | 
 14 | ** Dependencies
 15 | 
 16 | ~apt install sqlite3 html2text~
 17 | 
 18 | """
 19 | 
 20 | from pathlib import Path
 21 | from subprocess import check_call, check_output
 22 | from fnmatch import fnmatch
 23 | from typing import Optional
 24 | 
 25 | import re
 26 | import string
 27 | import sys
 28 | 
 29 | 
# Patterns for exported dialog file names: `user_<id>` / `chat_<id>`, optionally
# followed by a `_p<N>` page suffix (captured, including the underscore, as `page`).
# They are applied with `.match()`, i.e. anchored at the start of the name only,
# so anything after the matched prefix (such as the `.html` extension) is ignored.
USER_RE = re.compile(r'user_(?P<id>\d+)(?P<page>_p\d+)?')
CHAT_RE = re.compile(r'chat_(?P<id>\d+)(?P<page>_p\d+)?')
 32 | 
 33 | 
 34 | def query(db: Path, what: str, from_: str, where: str):
 35 |     res = check_output([
 36 |         'sqlite3',
 37 |         str(db),
 38 |         f'SELECT {what} FROM {from_} WHERE id={where};',
 39 |     ]).decode('utf8').strip()
 40 | 
 41 |     # TODO maybe strip off emoji too just in case...
 42 |     table = str.maketrans({key: None for key in string.punctuation})
 43 |     res = res.translate(table)
 44 | 
 45 |     if len(res) == 0:
 46 |         res = where # TODO ugh
 47 |     return res
 48 | 
 49 | 
 50 | def get_output_name(*, db: Path, path: Path) -> str:
 51 |     print(f'processing: {path}', file=sys.stderr)
 52 |     um = USER_RE.match(path.name)
 53 |     cm = CHAT_RE.match(path.name)
 54 |     name: str
 55 |     if um is not None:
 56 |         id_ = um.group('id')
 57 |         page = um.group('page')
 58 |         if page is None:
 59 |             page = ""
 60 |         name = query(db, 'username', 'users', id_) + page
 61 |     elif cm is not None:
 62 |         id_ = cm.group('id')
 63 |         page = cm.group('page')
 64 |         if page is None:
 65 |             page = ""
 66 |         name = query(db, 'name', 'chats', id_) + page
 67 |     else:
 68 |         raise RuntimeError(f'Unexpected file name: {path}')
 69 |     return name
 70 | 
 71 | 
 72 | def run(*, export_dir: Path, output: Path, ignore: Optional[str]=None) -> None:
 73 |     output.mkdir(parents=True, exist_ok=True)
 74 | 
 75 |     htmls  = export_dir / 'files' / 'dialogs'
 76 |     db     = export_dir / 'database.sqlite'
 77 |     html_files = sorted(htmls.glob('*.html'))
 78 |     assert len(html_files) > 0, htmls
 79 |     for path in html_files:
 80 |         name = get_output_name(db=db, path=path)
 81 |         if ignore is not None and fnmatch(name, ignore):
 82 |             print(f'Ignoring {path} due to name: {name}', file=sys.stderr)
 83 |             continue
 84 | 
 85 |         out = output / (name + ".txt")
 86 |         check_call([
 87 |             'html2text',
 88 |             '-utf8',
 89 |             '-width', '500', # hopefully, enough..
 90 |             '-o', str(out),
 91 |             str(path),
 92 |         ])
 93 | 
 94 | 
 95 | def main():
 96 |     import argparse
 97 |     p = argparse.ArgumentParser()
 98 |     p.add_argument('--target' , type=Path, help='same option as for telegram_backup tool', required=True)
 99 |     p.add_argument('--account', type=str , help='same option as for telegram_backup tool', required=True)
100 |     p.add_argument('--output' , type=Path, help='path for txt outputs'                   , required=True)
101 |     p.add_argument('--ignore' , type=str , help='glob for ignoring certain names'        , required=False)
102 |     # TODO FIXME ignore targets?
103 |     args = p.parse_args()
104 | 
105 |     export_dir = args.target / args.account
106 |     run(export_dir=export_dir, output=args.output, ignore=args.ignore)
107 | 
108 | 
# Run the CLI only when executed as a script, so that importing the module
# (e.g. to read its docstring) does not start a conversion.
if __name__ == '__main__':
    main()
111 | 


--------------------------------------------------------------------------------