├── .github └── workflows │ └── pythonapp.yml ├── README.org ├── .gitignore └── convert_to_plaintext.py /.github/workflows/pythonapp.yml: -------------------------------------------------------------------------------- 1 | name: Python application 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Set up Python 3.7 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: 3.7 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | - name: Lint with mypy 20 | run: | 21 | pip install mypy 22 | mypy *.py 23 | - name: Lint with pylint 24 | run: | 25 | pip install pylint 26 | pylint -E *.py 27 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | #+begin_src python :exports results :results output drawer 2 | import convert_to_plaintext 3 | print(convert_to_plaintext.__doc__) 4 | #+end_src 5 | 6 | #+RESULTS: 7 | :results: 8 | 9 | Script to convert HTML output from [[https://github.com/fabianonline/telegram_backup][telegram_backup]] tool to plaintext with proper filenames. 10 | 11 | I'm using it for quick search (e.g. =grep=) in messages without having to go to web interface/mobile app. You can read more about it [[https://beepb00p.xyz/pkm-search.html#other][here]]. 12 | 13 | * Usage 14 | 15 | 1. Read [[https://github.com/fabianonline/telegram_backup#usage][usage for telegram_backup]] to backup your account. Don't forget to run =--export html=! 16 | 2. Run this script with the same =--target= and =--account= arguments as for backup script, and pass =--output= directory. 17 | 18 | Personally I've got all this set up as a daily Cron job. 
19 | 20 | ** Dependencies 21 | 22 | ~apt install sqlite3 html2text~ 23 | 24 | :end: 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python,emacs 3 | # Edit at https://www.gitignore.io/?templates=python,emacs 4 | 5 | ### Emacs ### 6 | # -*- mode: gitignore; -*- 7 | *~ 8 | \#*\# 9 | /.emacs.desktop 10 | /.emacs.desktop.lock 11 | *.elc 12 | auto-save-list 13 | tramp 14 | .\#* 15 | 16 | # Org-mode 17 | .org-id-locations 18 | *_archive 19 | 20 | # flymake-mode 21 | *_flymake.* 22 | 23 | # eshell files 24 | /eshell/history 25 | /eshell/lastdir 26 | 27 | # elpa packages 28 | /elpa/ 29 | 30 | # reftex files 31 | *.rel 32 | 33 | # AUCTeX auto folder 34 | /auto/ 35 | 36 | # cask packages 37 | .cask/ 38 | dist/ 39 | 40 | # Flycheck 41 | flycheck_*.el 42 | 43 | # server auth directory 44 | /server/ 45 | 46 | # projectiles files 47 | .projectile 48 | 49 | # directory configuration 50 | .dir-locals.el 51 | 52 | # network security 53 | /network-security.data 54 | 55 | 56 | ### Python ### 57 | # Byte-compiled / optimized / DLL files 58 | __pycache__/ 59 | *.py[cod] 60 | *$py.class 61 | 62 | # C extensions 63 | *.so 64 | 65 | # Distribution / packaging 66 | .Python 67 | build/ 68 | develop-eggs/ 69 | downloads/ 70 | eggs/ 71 | .eggs/ 72 | lib/ 73 | lib64/ 74 | parts/ 75 | sdist/ 76 | var/ 77 | wheels/ 78 | pip-wheel-metadata/ 79 | share/python-wheels/ 80 | *.egg-info/ 81 | .installed.cfg 82 | *.egg 83 | MANIFEST 84 | 85 | # PyInstaller 86 | # Usually these files are written by a python script from a template 87 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
88 | *.manifest 89 | *.spec 90 | 91 | # Installer logs 92 | pip-log.txt 93 | pip-delete-this-directory.txt 94 | 95 | # Unit test / coverage reports 96 | htmlcov/ 97 | .tox/ 98 | .nox/ 99 | .coverage 100 | .coverage.* 101 | .cache 102 | nosetests.xml 103 | coverage.xml 104 | *.cover 105 | .hypothesis/ 106 | .pytest_cache/ 107 | 108 | # Translations 109 | *.mo 110 | *.pot 111 | 112 | # Scrapy stuff: 113 | .scrapy 114 | 115 | # Sphinx documentation 116 | docs/_build/ 117 | 118 | # PyBuilder 119 | target/ 120 | 121 | # pyenv 122 | .python-version 123 | 124 | # pipenv 125 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 126 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 127 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 128 | # install all needed dependencies. 129 | #Pipfile.lock 130 | 131 | # celery beat schedule file 132 | celerybeat-schedule 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # Mr Developer 145 | .mr.developer.cfg 146 | .project 147 | .pydevproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # End of https://www.gitignore.io/api/python,emacs 161 | -------------------------------------------------------------------------------- /convert_to_plaintext.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to convert HTML output from [[https://github.com/fabianonline/telegram_backup][telegram_backup]] tool to plaintext with proper filenames. 4 | 5 | I'm using it for quick search (e.g. =grep=) in messages without having to go to web interface/mobile app. 
"""
Script to convert HTML output from [[https://github.com/fabianonline/telegram_backup][telegram_backup]] tool to plaintext with proper filenames.

I'm using it for quick search (e.g. =grep=) in messages without having to go to web interface/mobile app. You can read more about it [[https://beepb00p.xyz/pkm-search.html#other][here]].

* Usage

1. Read [[https://github.com/fabianonline/telegram_backup#usage][usage for telegram_backup]] to backup your account. Don't forget to run =--export html=!
2. Run this script with the same =--target= and =--account= arguments as for backup script, and pass =--output= directory.

Personally I've got all this set up as a daily Cron job.

** Dependencies

~apt install sqlite3 html2text~

"""

from pathlib import Path
from subprocess import check_call, check_output
from fnmatch import fnmatch
from typing import Optional

import re
import string
import sys


# Dialog file names look like ``user_<id>.html`` / ``chat_<id>_p<N>.html``.
# ``id`` is the numeric row id; ``page`` is the optional ``_p<N>`` suffix.
USER_RE = re.compile(r'user_(?P<id>\d+)(?P<page>_p\d+)?')
CHAT_RE = re.compile(r'chat_(?P<id>\d+)(?P<page>_p\d+)?')

# Deletes every ASCII punctuation character; built once instead of per query.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# (file name pattern, column to read, table to read it from).
# Order matters: the user pattern is tried before the chat pattern.
_NAME_SOURCES = (
    (USER_RE, 'username', 'users'),
    (CHAT_RE, 'name', 'chats'),
)


def query(db: Path, what: str, from_: str, where: str) -> str:
    """Return column ``what`` of the row in ``from_`` whose id is ``where``.

    The lookup is done by running the ``sqlite3`` command line tool against
    ``db``. ASCII punctuation is removed from the result so it can be used as
    a file name. If nothing is left after that (no row, empty value, or a
    value made only of punctuation), ``where`` itself is returned.

    All three of ``what``, ``from_`` and ``where`` are interpolated into the
    SQL text verbatim, so callers must only pass trusted values (the callers
    in this module pass literals and regex-matched digits).

    Raises:
        subprocess.CalledProcessError: if ``sqlite3`` exits with an error.
    """
    raw = check_output([
        'sqlite3',
        str(db),
        f'SELECT {what} FROM {from_} WHERE id={where};',
    ]).decode('utf8').strip()

    # TODO maybe strip off emoji too just in case...
    cleaned = raw.translate(_PUNCTUATION_TABLE)

    # TODO ugh: fall back to the bare id when there is no usable name
    return cleaned or where


def get_output_name(*, db: Path, path: Path) -> str:
    """Return the output base name (without extension) for a dialog HTML file.

    The name is the user's ``username`` or the chat's ``name`` looked up in
    ``db``, followed by the ``_p<N>`` page suffix when the file has one.

    Raises:
        RuntimeError: if the file name matches neither the user nor the chat
            pattern.
    """
    print(f'processing: {path}', file=sys.stderr)
    for pattern, column, table in _NAME_SOURCES:
        match = pattern.match(path.name)
        if match is None:
            continue
        page = match.group('page') or ''
        return query(db, column, table, match.group('id')) + page
    raise RuntimeError(f'Unexpected file name: {path}')


def run(*, export_dir: Path, output: Path, ignore: Optional[str] = None) -> None:
    """Convert every dialog under ``export_dir`` to a text file in ``output``.

    Reads ``export_dir/files/dialogs/*.html`` and ``export_dir/database.sqlite``
    and writes ``<name>.txt`` files via the ``html2text`` command line tool.
    Dialogs whose resolved name matches the ``ignore`` glob are skipped.

    Raises:
        RuntimeError: if no HTML files are found, or a file name is not
            recognised.
        subprocess.CalledProcessError: if ``sqlite3`` or ``html2text`` fails.
    """
    output.mkdir(parents=True, exist_ok=True)

    htmls = export_dir / 'files' / 'dialogs'
    db = export_dir / 'database.sqlite'
    html_files = sorted(htmls.glob('*.html'))
    # Explicit raise rather than ``assert``: asserts vanish under ``python -O``
    # and the script would then silently produce nothing.
    if not html_files:
        raise RuntimeError(f'No HTML files found in {htmls}')

    for path in html_files:
        name = get_output_name(db=db, path=path)
        if ignore is not None and fnmatch(name, ignore):
            print(f'Ignoring {path} due to name: {name}', file=sys.stderr)
            continue

        out = output / (name + ".txt")
        check_call([
            'html2text',
            '-utf8',
            '-width', '500',  # hopefully, enough..
            '-o', str(out),
            str(path),
        ])


def main() -> None:
    """Parse command line arguments and run the conversion."""
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument('--target', type=Path, required=True, help='same option as for telegram_backup tool')
    p.add_argument('--account', type=str, required=True, help='same option as for telegram_backup tool')
    p.add_argument('--output', type=Path, required=True, help='path for txt outputs')
    p.add_argument('--ignore', type=str, required=False, help='glob for ignoring certain names')
    # TODO FIXME ignore targets?
    args = p.parse_args()

    export_dir = args.target / args.account
    run(export_dir=export_dir, output=args.output, ignore=args.ignore)


if __name__ == '__main__':
    main()