├── .github
│   ├── FUNDING.yml
│   └── workflows
│       ├── make_tracked_links_list.yml
│       └── make_files_tree.yml
├── .gitignore
├── requirements.txt
├── LICENSE
├── tracked_tr_links.txt
├── README.md
├── make_and_send_alert.py
├── unwebpack_sourcemap.py
├── make_tracked_links_list.py
├── ccl_bplist.py
└── make_files_tree.py

--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: MarshalX

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.iml
3 | .env
4 | venv
5 | 
6 | tracked_links.txt
7 | *.pyc

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.12.15
2 | aiodns==3.5.0
3 | aiofiles==24.1.0
4 | beautifulsoup4==4.13.4
5 | cssutils==2.11.1
6 | httpx==0.28.1
7 | requests==2.32.4
8 | uvloop==0.21.0
9 | git+https://github.com/MarshalX/pyrogram
10 | TgCrypto==1.2.5

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2021 Il'ya (Marshal)
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.

--------------------------------------------------------------------------------
/.github/workflows/make_tracked_links_list.yml:
--------------------------------------------------------------------------------
1 | name: Generate or update list of tracked links
2 | 
3 | on:
4 |   workflow_dispatch:
5 |   schedule:
6 |     - cron: '* * * * *'
7 |   push:
8 |     # trigger on updated link crawler rules
9 |     branches:
10 |       - main
11 | 
12 | jobs:
13 |   make_tracked_links_file:
14 |     name: Make tracked links file
15 |     runs-on: ubuntu-24.04
16 |     timeout-minutes: 15
17 | 
18 |     steps:
19 | 
20 |       - name: Clone.
21 |         uses: actions/checkout@v5
22 |         with:
23 |           token: ${{ secrets.PAT }}
24 | 
25 |       - name: Setup Python.
26 |         uses: actions/setup-python@v5
27 |         with:
28 |           python-version: 3.13
29 | 
30 |       - name: Install dependencies.
31 |         run: |
32 |           pip install -r requirements.txt
33 | 
34 |       - name: Generate/update file with links.
35 | env: 36 | OUTPUT_FILENAME: "tracked_links_ci.txt" 37 | OUTPUT_RESOURCES_FILENAME: "tracked_res_links_ci.txt" 38 | OUTPUT_TRANSLATIONS_FILENAME: "tracked_tr_links_ci.txt" 39 | run: | 40 | python make_tracked_links_list.py 41 | 42 | - name: Commit and push changes. 43 | run: | 44 | git pull 45 | 46 | mv tracked_links_ci.txt tracked_links.txt 47 | mv tracked_res_links_ci.txt tracked_res_links.txt 48 | mv tracked_tr_links_ci.txt tracked_tr_links.txt 49 | 50 | git config --global user.name "github-actions[bot]" 51 | git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" 52 | 53 | git add . 54 | git commit -m "Update tracked links" 55 | git push 56 | -------------------------------------------------------------------------------- /.github/workflows/make_files_tree.yml: -------------------------------------------------------------------------------- 1 | name: Fetch new content of tracked links and files 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '* * * * *' 7 | push: 8 | # trigger on updated linkbase 9 | branches: 10 | - main 11 | 12 | jobs: 13 | fetch_new_content: 14 | name: Make files tree 15 | runs-on: ${{matrix.os}} 16 | continue-on-error: true 17 | timeout-minutes: 10 18 | 19 | strategy: 20 | fail-fast: false 21 | matrix: 22 | include: 23 | - mode: web 24 | os: macos-13 25 | 26 | - mode: web_res 27 | os: ubuntu-22.04 28 | 29 | - mode: web_tr 30 | os: ubuntu-22.04 31 | 32 | - mode: server 33 | os: ubuntu-22.04 34 | 35 | - mode: client 36 | os: macos-13 37 | 38 | - mode: mini_app 39 | os: ubuntu-22.04 40 | 41 | steps: 42 | 43 | - name: Clone. 44 | uses: actions/checkout@v5 45 | with: 46 | token: ${{ secrets.PAT }} 47 | 48 | - name: Setup Python. 49 | uses: actions/setup-python@v5 50 | with: 51 | python-version: 3.13 52 | 53 | - name: Install dependencies. 54 | run: | 55 | pip install -r requirements.txt 56 | 57 | - name: Generate files tree. 58 | env: 59 | OUTPUT_FOLDER: "data_ci/" 60 | TELEGRAM_SESSION: ${{ secrets.TELEGRAM_SESSION }} 61 | TELEGRAM_SESSION_TEST: ${{ secrets.TELEGRAM_SESSION_TEST }} 62 | TELEGRAM_API_ID: ${{ secrets.TELEGRAM_API_ID }} 63 | TELEGRAM_API_HASH: ${{ secrets.TELEGRAM_API_HASH }} 64 | MODE: ${{ matrix.mode }} 65 | run: | 66 | git pull 67 | python make_files_tree.py 68 | rm -rf __pycache__ 69 | 70 | - name: Prepare data. 71 | if: matrix.mode == 'web' 72 | run: | 73 | git checkout data 74 | git pull 75 | 76 | mv data/web_res data_ci/web_res 77 | mv data/web_tr data_ci/web_tr 78 | mv data/client data_ci/client 79 | mv data/server data_ci/server 80 | mv data/mini_app data_ci/mini_app 81 | 82 | rm -rf data 83 | mv data_ci data 84 | 85 | - name: Prepare data. 86 | if: matrix.mode == 'web_res' 87 | run: | 88 | git checkout data 89 | git pull 90 | 91 | mv data/web data_ci/web 92 | mv data/web_tr data_ci/web_tr 93 | mv data/client data_ci/client 94 | mv data/server data_ci/server 95 | mv data/mini_app data_ci/mini_app 96 | 97 | rm -rf data 98 | mv data_ci data 99 | 100 | - name: Prepare data. 101 | if: matrix.mode == 'web_tr' 102 | run: | 103 | git checkout data 104 | git pull 105 | 106 | mv data/web data_ci/web 107 | mv data/web_res data_ci/web_res 108 | mv data/server data_ci/server 109 | mv data/client data_ci/client 110 | mv data/mini_app data_ci/mini_app 111 | 112 | rm -rf data 113 | mv data_ci data 114 | 115 | - name: Prepare data. 
116 | if: matrix.mode == 'server' 117 | run: | 118 | git checkout data 119 | git pull 120 | 121 | mv data/web data_ci/web 122 | mv data/web_res data_ci/web_res 123 | mv data/web_tr data_ci/web_tr 124 | mv data/client data_ci/client 125 | mv data/mini_app data_ci/mini_app 126 | 127 | rm -rf data 128 | mv data_ci data 129 | 130 | - name: Prepare data. 131 | if: matrix.mode == 'client' 132 | run: | 133 | git checkout data 134 | git pull 135 | 136 | mv data/web data_ci/web 137 | mv data/web_res data_ci/web_res 138 | mv data/web_tr data_ci/web_tr 139 | mv data/server data_ci/server 140 | mv data/mini_app data_ci/mini_app 141 | 142 | rm -rf data 143 | mv data_ci data 144 | 145 | - name: Prepare data. 146 | if: matrix.mode == 'mini_app' 147 | run: | 148 | git checkout data 149 | git pull 150 | 151 | mv data/web data_ci/web 152 | mv data/web_res data_ci/web_res 153 | mv data/web_tr data_ci/web_tr 154 | mv data/server data_ci/server 155 | mv data/client data_ci/client 156 | 157 | rm -rf data 158 | mv data_ci data 159 | 160 | - name: Commit and push changes. 161 | run: | 162 | git config --global user.name "github-actions[bot]" 163 | git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" 164 | 165 | git add . 166 | git commit -m "Update content of files" 167 | git push 168 | -------------------------------------------------------------------------------- /tracked_tr_links.txt: -------------------------------------------------------------------------------- 1 | translations.telegram.org 2 | translations.telegram.org/auth 3 | translations.telegram.org/css/billboard.css 4 | translations.telegram.org/css/contest-zoo.css 5 | translations.telegram.org/css/health.css 6 | translations.telegram.org/css/jquery-ui.min.css 7 | translations.telegram.org/css/tchart.min.css 8 | translations.telegram.org/css/telegram.css 9 | translations.telegram.org/css/translations.css 10 | translations.telegram.org/en 11 | translations.telegram.org/en/android 12 | translations.telegram.org/en/android/bots_and_payments 13 | translations.telegram.org/en/android/camera_and_media 14 | translations.telegram.org/en/android/chat_list 15 | translations.telegram.org/en/android/general 16 | translations.telegram.org/en/android/groups_and_channels 17 | translations.telegram.org/en/android/login 18 | translations.telegram.org/en/android/passport 19 | translations.telegram.org/en/android/private_chats 20 | translations.telegram.org/en/android/profile 21 | translations.telegram.org/en/android/settings 22 | translations.telegram.org/en/android/stories 23 | translations.telegram.org/en/android/unsorted 24 | translations.telegram.org/en/android_x 25 | translations.telegram.org/en/android_x/bots_and_payments 26 | translations.telegram.org/en/android_x/camera_and_media 27 | translations.telegram.org/en/android_x/chat_list 28 | translations.telegram.org/en/android_x/general 29 | translations.telegram.org/en/android_x/groups_and_channels 30 | translations.telegram.org/en/android_x/login 31 | translations.telegram.org/en/android_x/passport 32 | translations.telegram.org/en/android_x/private_chats 33 | translations.telegram.org/en/android_x/profile 34 | translations.telegram.org/en/android_x/settings 35 | translations.telegram.org/en/android_x/stories 36 | translations.telegram.org/en/android_x/unsorted 37 | translations.telegram.org/en/emoji 38 | translations.telegram.org/en/ios 39 | translations.telegram.org/en/ios/bots_and_payments 40 | translations.telegram.org/en/ios/camera_and_media 41 | 
translations.telegram.org/en/ios/chat_list 42 | translations.telegram.org/en/ios/general 43 | translations.telegram.org/en/ios/groups_and_channels 44 | translations.telegram.org/en/ios/login 45 | translations.telegram.org/en/ios/passport 46 | translations.telegram.org/en/ios/private_chats 47 | translations.telegram.org/en/ios/profile 48 | translations.telegram.org/en/ios/settings 49 | translations.telegram.org/en/ios/stories 50 | translations.telegram.org/en/ios/unsorted 51 | translations.telegram.org/en/macos 52 | translations.telegram.org/en/macos/bots_and_payments 53 | translations.telegram.org/en/macos/camera_and_media 54 | translations.telegram.org/en/macos/chat_list 55 | translations.telegram.org/en/macos/general 56 | translations.telegram.org/en/macos/groups_and_channels 57 | translations.telegram.org/en/macos/login 58 | translations.telegram.org/en/macos/passport 59 | translations.telegram.org/en/macos/private_chats 60 | translations.telegram.org/en/macos/profile 61 | translations.telegram.org/en/macos/settings 62 | translations.telegram.org/en/macos/stories 63 | translations.telegram.org/en/macos/unsorted 64 | translations.telegram.org/en/tdesktop 65 | translations.telegram.org/en/tdesktop/bots_and_payments 66 | translations.telegram.org/en/tdesktop/camera_and_media 67 | translations.telegram.org/en/tdesktop/chat_list 68 | translations.telegram.org/en/tdesktop/general 69 | translations.telegram.org/en/tdesktop/groups_and_channels 70 | translations.telegram.org/en/tdesktop/login 71 | translations.telegram.org/en/tdesktop/passport 72 | translations.telegram.org/en/tdesktop/private_chats 73 | translations.telegram.org/en/tdesktop/profile 74 | translations.telegram.org/en/tdesktop/settings 75 | translations.telegram.org/en/tdesktop/stories 76 | translations.telegram.org/en/tdesktop/unsorted 77 | translations.telegram.org/en/unigram 78 | translations.telegram.org/en/unigram/bots_and_payments 79 | translations.telegram.org/en/unigram/camera_and_media 80 | translations.telegram.org/en/unigram/chat_list 81 | translations.telegram.org/en/unigram/general 82 | translations.telegram.org/en/unigram/groups_and_channels 83 | translations.telegram.org/en/unigram/login 84 | translations.telegram.org/en/unigram/passport 85 | translations.telegram.org/en/unigram/private_chats 86 | translations.telegram.org/en/unigram/profile 87 | translations.telegram.org/en/unigram/settings 88 | translations.telegram.org/en/unigram/stories 89 | translations.telegram.org/en/unigram/unsorted 90 | translations.telegram.org/en/weba 91 | translations.telegram.org/en/weba/bots_and_payments 92 | translations.telegram.org/en/weba/camera_and_media 93 | translations.telegram.org/en/weba/chat_list 94 | translations.telegram.org/en/weba/general 95 | translations.telegram.org/en/weba/groups_and_channels 96 | translations.telegram.org/en/weba/login 97 | translations.telegram.org/en/weba/passport 98 | translations.telegram.org/en/weba/private_chats 99 | translations.telegram.org/en/weba/profile 100 | translations.telegram.org/en/weba/settings 101 | translations.telegram.org/en/weba/stories 102 | translations.telegram.org/en/weba/unsorted 103 | translations.telegram.org/en/webk 104 | translations.telegram.org/en/webk/bots_and_payments 105 | translations.telegram.org/en/webk/camera_and_media 106 | translations.telegram.org/en/webk/chat_list 107 | translations.telegram.org/en/webk/general 108 | translations.telegram.org/en/webk/groups_and_channels 109 | translations.telegram.org/en/webk/login 110 | 
translations.telegram.org/en/webk/passport
111 | translations.telegram.org/en/webk/private_chats
112 | translations.telegram.org/en/webk/profile
113 | translations.telegram.org/en/webk/settings
114 | translations.telegram.org/en/webk/stories
115 | translations.telegram.org/en/webk/unsorted

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## 🕷 Telegram Crawler
2 | 
3 | This project automatically detects changes made
4 | to the official Telegram sites and beta clients. This is necessary for
5 | anticipating future updates and other news
6 | (new vacancies, API updates, etc.).
7 | 
8 | | Name | Commits | Status |
9 | |----------------------| -------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------|
10 | | Data tracker | [Commits](https://github.com/MarshalX/telegram-crawler/commits/data) | ![Fetch new content of tracked links and files](https://github.com/MarshalX/telegram-crawler/actions/workflows/make_files_tree.yml/badge.svg?branch=main) |
11 | | Site links collector | [Commits](https://github.com/MarshalX/telegram-crawler/commits/main/tracked_links.txt) | ![Generate or update list of tracked links](https://github.com/MarshalX/telegram-crawler/actions/workflows/make_tracked_links_list.yml/badge.svg?branch=main) |
12 | 
13 | * ✅ passing – new changes
14 | * ❌ failing – no changes
15 | 
16 | You should subscribe to the **[channel with alerts](https://t.me/tgcrawl)** to stay updated.
17 | Copies of the Telegram websites and clients' resources are stored **[here](https://github.com/MarshalX/telegram-crawler/tree/data/data)**.
18 | 
19 | ![GitHub pretty diff example](https://i.imgur.com/BK8UAju.png)
20 | 
21 | ### How it works
22 | 
23 | 1. [Link crawling](make_tracked_links_list.py) runs **as often as possible**.
24 | It starts crawling from the home page of the site,
25 | detects relative and absolute sub-links and recursively repeats the operation,
26 | and writes a list of unique links for future content comparison.
27 | Additionally, links can be added by hand to help the script
28 | find more hidden links (links to which nothing refers). To manage exceptions,
29 | there is a [system of rules](#example-of-link-crawler-rules-configuration)
30 | for the link crawler.
31 | 
32 | 2. [Content crawling](make_files_tree.py) is launched **as often as
33 | possible** and uses the list of links collected in step 1.
34 | Going through that list, it fetches the contents and builds a tree of subfolders
35 | and files, removing all dynamic content from them. It also downloads the beta version
36 | of the Android client, decompiles it and tracks its resources. Resources of
37 | Telegram for macOS are tracked too.
38 | 
39 | 3. Everything runs on [GitHub Actions](.github/workflows/), without any servers of your own.
40 | You can just fork this repository and run your own tracker system.
41 | Workflows launch the scripts and commit the changes. All file changes are tracked
42 | by Git and beautifully displayed on GitHub. A workflow build succeeds
43 | only if there are changes on the Telegram websites; otherwise, the
44 | workflow fails. When a build is successful, notifications can be sent to the
45 | Telegram channel and so on.
46 | 
47 | ### FAQ
48 | 
49 | **Q:** How often is "**as often as possible**"?
50 | 
51 | **A:** TL;DR: the content update action runs every ~10 minutes. More info:
52 | - [Scheduled actions cannot be run more than once every 5 minutes.](https://github.blog/changelog/2019-11-01-github-actions-scheduled-jobs-maximum-frequency-is-changing/)
53 | - [GitHub Actions workflow not triggering at scheduled time](https://upptime.js.org/blog/2021/01/22/github-actions-schedule-not-working/).
54 | 
55 | **Q:** Why are there two separate crawl scripts instead of one?
56 | 
57 | **A:** Because the original idea was to update the tracked links once an hour,
58 | and it was convenient to use separate scripts and workflows for that.
59 | After the Telegram 7.7 update, I realised that finding new blog posts that slowly was a bad idea.
60 | 
61 | **Q:** Why does the alert-sending script have a while loop?
62 | 
63 | **A:** Because the GitHub API doesn't return information about a commit immediately
64 | after a push to the repository. Therefore, the script waits for the information to appear...
65 | 
66 | **Q:** Why are you using a GitHub Personal Access Token in the actions/checkout workflow step?
67 | 
68 | **A:** To be able to trigger other workflows from the push event. More info:
69 | - [Action does not trigger another on push tag action](https://github.community/t/action-does-not-trigger-another-on-push-tag-action/17148)
70 | 
71 | **Q:** Why are you using a GitHub PAT in [make_and_send_alert.py](make_and_send_alert.py)?
72 | 
73 | **A:** To increase the GitHub API rate limits.
74 | 
75 | **Q:** Why are you decompiling the .apk file on each run?
76 | 
77 | **A:** Because it doesn't take much time. I am decompiling only
78 | the resources (the -s flag of apktool disables disassembly of dex files).
79 | Writing a check for whether decompilation is needed, based on the hash of the apk file,
80 | would take more time.
81 | 
82 | ### Example of link crawler rules configuration
83 | 
84 | ```python
85 | CRAWL_RULES = {
86 |     # every rule is regex
87 |     # empty string means match any url
88 |     # allow rules have higher priority than deny rules
89 |     'translations.telegram.org': {
90 |         'allow': {
91 |             r'^[^/]*$',  # root
92 |             r'org/[^/]*/$',  # 1 lvl sub
93 |             r'/en/[a-z_]+/$'  # 1 lvl after /en/
94 |         },
95 |         'deny': {
96 |             '',  # all
97 |         }
98 |     },
99 |     'bugs.telegram.org': {
100 |         'deny': {
101 |             '',  # deny all sub domain
102 |         },
103 |     },
104 | }
105 | ```
106 | 
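These rules are applied per domain when the crawler decides whether to follow a link. Below is a simplified sketch of that matching logic; the real implementation is `should_exclude` in [make_tracked_links_list.py](make_tracked_links_list.py), which additionally merges the per-domain rules with a set of global rules and caches the results:

```python
import re

def should_exclude(url: str, rules: dict) -> bool:
    """Return True if the crawler must skip this URL."""
    deny = rules.get('deny', set())
    allow = rules.get('allow', set())

    # a URL is excluded if any deny regex matches it...
    excluded = any(re.search(regex, url) for regex in deny)
    # ...unless an allow regex also matches (allow wins over deny)
    if excluded and any(re.search(regex, url) for regex in allow):
        excluded = False
    return excluded

rules = {
    'allow': {r'^[^/]*$'},  # only the root page of the domain
    'deny': {''},           # empty regex matches any url
}
assert should_exclude('translations.telegram.org', rules) is False
assert should_exclude('translations.telegram.org/some/page', rules) is True
```

Note that an empty deny pattern matches every URL, so such a domain is crawled only where an allow rule explicitly re-enables a path.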
107 | ### Current hidden urls list
108 | 
109 | ```python
110 | HIDDEN_URLS = {
111 |     # 'corefork.telegram.org',  # disabled
112 | 
113 |     'telegram.org/privacy/gmailbot',
114 |     'telegram.org/tos',
115 |     'telegram.org/tour',
116 |     'telegram.org/evolution',
117 | 
118 |     'desktop.telegram.org/changelog',
119 | }
120 | ```
121 | 
122 | ### License
123 | 
124 | Licensed under the [MIT License](LICENSE).
125 | -------------------------------------------------------------------------------- /make_and_send_alert.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import re 5 | from typing import Tuple 6 | 7 | import aiohttp 8 | 9 | COMMIT_SHA = os.environ['COMMIT_SHA'] 10 | 11 | # commits for test alert builder 12 | # COMMIT_SHA = '4015bd9c48b45910727569fff5e770000d85d207' # all clients + server and test server + web 13 | # COMMIT_SHA = '9cc3f0fb7c390c8cb8b789e9377f10ed5e80a089' # web and web res together 14 | # COMMIT_SHA = '4efaf918af43054ba3ff76068e83d135a9a2535d' # web 15 | # COMMIT_SHA = 'e2d725c2b3813d7c170f50b0ab21424a71466f6d' # web res 16 | 17 | TELEGRAM_BOT_TOKEN = os.environ['TELEGRAM_BOT_TOKEN'] 18 | DISCORD_BOT_TOKEN = os.environ['DISCORD_BOT_TOKEN'] 19 | GITHUB_PAT = os.environ['GITHUB_PAT'] 20 | 21 | REPOSITORY = os.environ.get('REPOSITORY', 'MarshalX/telegram-crawler') 22 | ROOT_TREE_DIR = os.environ.get('ROOT_TREE_DIR', 'data') 23 | 24 | CHAT_ID = os.environ.get('CHAT_ID', '@tgcrawl') 25 | DISCORD_CHANNEL_ID = os.environ.get('DISCORD_CHANNEL_ID', '1116390634249523283') 26 | 27 | BASE_GITHUB_API = 'https://api.github.com/' 28 | GITHUB_LAST_COMMITS = 'repos/{repo}/commits/{sha}' 29 | 30 | BASE_TELEGRAM_API = 'https://api.telegram.org/bot{token}/' 31 | TELEGRAM_SEND_MESSAGE = 'sendMessage' 32 | 33 | logger = logging.getLogger(__name__) 34 | logging.basicConfig(level=logging.INFO) 35 | 36 | STATUS_TO_EMOJI = { 37 | 'added': '✅', 38 | 'modified': '📝', 39 | 'removed': '❌', 40 | 'renamed': '🔄', 41 | 'copied': '📋', 42 | 'changed': '📝', 43 | 'unchanged': '📝', 44 | } 45 | 46 | AVAILABLE_HASHTAGS = { 47 | 'web_tr', 'web_res', 'web', 'server', 'test_server', 'client', 48 | 'ios', 'macos', 'android', 'android_dl', 'mini_app', 'wallet' 49 | } 50 | HASHTAGS_PATTERNS = { 51 | # regex will be more flexible. for example, in issue with double hashtag '#web #web_res' when data/res not changed 52 | 'web_tr': os.path.join(ROOT_TREE_DIR, 'web_tr'), 53 | 'web_res': os.path.join(ROOT_TREE_DIR, 'web_res'), 54 | 'web': os.path.join(ROOT_TREE_DIR, 'web'), 55 | 'server': os.path.join(ROOT_TREE_DIR, 'server'), 56 | 'test_server': os.path.join(ROOT_TREE_DIR, 'server', 'test'), 57 | 'client': os.path.join(ROOT_TREE_DIR, 'client'), 58 | 'ios': os.path.join(ROOT_TREE_DIR, 'client', 'ios-beta'), 59 | 'macos': os.path.join(ROOT_TREE_DIR, 'client', 'macos-beta'), 60 | 'android': os.path.join(ROOT_TREE_DIR, 'client', 'android-beta'), 61 | 'android_dl': os.path.join(ROOT_TREE_DIR, 'client', 'android-stable-dl'), 62 | 'mini_app': os.path.join(ROOT_TREE_DIR, 'mini_app'), 63 | 'wallet': os.path.join(ROOT_TREE_DIR, 'mini_app', 'wallet'), 64 | } 65 | # order is important! 
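# (the 'web' path is a prefix of the 'web_tr' and 'web_res' paths, so the
# more specific entries must come first: the prefix-stripping loop in main()
# breaks after the first startswith() match)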
66 | PATHS_TO_REMOVE_FROM_ALERT = [ 67 | os.path.join(ROOT_TREE_DIR, 'web_tr'), 68 | os.path.join(ROOT_TREE_DIR, 'web_res'), 69 | os.path.join(ROOT_TREE_DIR, 'web'), 70 | os.path.join(ROOT_TREE_DIR, 'server'), 71 | os.path.join(ROOT_TREE_DIR, 'client'), 72 | os.path.join(ROOT_TREE_DIR, 'mini_app'), 73 | ] 74 | 75 | FORUM_CHAT_ID = '@tfcrawl' 76 | HASHTAG_TO_TOPIC = { 77 | 'web': '2200', 78 | 'web_tr': '2202', 79 | 'web_res': '2206', 80 | 'server': '2317', 81 | 'ios': '2194', 82 | 'macos': '2187', 83 | 'android': '2190', 84 | 'android_dl': '12235', 85 | 'wallet': '5685', 86 | } 87 | 88 | GITHUB_API_LIMIT_PER_HOUR = 5_000 89 | COUNT_OF_RUNNING_WORKFLOW_AT_SAME_TIME = 5 # just random number ;d 90 | 91 | ROW_PER_STATUS = 5 92 | 93 | LAST_PAGE_NUMBER_REGEX = r'page=(\d+)>; rel="last"' 94 | 95 | 96 | async def send_req_until_success(session: aiohttp.ClientSession, **kwargs) -> Tuple[dict, int]: 97 | delay = 5 # in sec 98 | count_of_retries = int(GITHUB_API_LIMIT_PER_HOUR / COUNT_OF_RUNNING_WORKFLOW_AT_SAME_TIME / delay) 99 | 100 | last_page_number = 1 101 | retry_number = 1 102 | while retry_number <= count_of_retries: 103 | retry_number += 1 104 | 105 | res = await session.get(**kwargs) 106 | if res.status != 200: 107 | await asyncio.sleep(delay) 108 | continue 109 | 110 | json = await res.json() 111 | 112 | pagination_data = res.headers.get('Link', '') 113 | matches = re.findall(LAST_PAGE_NUMBER_REGEX, pagination_data) 114 | if matches: 115 | last_page_number = int(matches[0]) 116 | 117 | return json, last_page_number 118 | 119 | raise RuntimeError('Surprise. Time is over') 120 | 121 | 122 | async def send_telegram_alert(session: aiohttp.ClientSession, text: str, thread_id=None) -> aiohttp.ClientResponse: 123 | params = { 124 | 'chat_id': CHAT_ID, 125 | 'parse_mode': 'HTML', 126 | 'text': text, 127 | 'disable_web_page_preview': 1, 128 | } 129 | if thread_id: 130 | params['chat_id'] = FORUM_CHAT_ID 131 | params['message_thread_id'] = thread_id 132 | 133 | return await session.get( 134 | url=f'{BASE_TELEGRAM_API}{TELEGRAM_SEND_MESSAGE}'.format(token=TELEGRAM_BOT_TOKEN), params=params 135 | ) 136 | 137 | 138 | async def send_discord_alert( 139 | session: aiohttp.ClientSession, commit_hash: str, commit_url: str, fields: list, hashtags: str 140 | ) -> aiohttp.ClientResponse: 141 | url = f'https://discord.com/api/channels/{DISCORD_CHANNEL_ID}/messages' 142 | 143 | headers = { 144 | 'Authorization': f'Bot {DISCORD_BOT_TOKEN}', 145 | } 146 | 147 | embed_data = { 148 | 'title': f'New changes in Telegram ({commit_hash})', 149 | 'color': 0xe685cc, 150 | 'url': commit_url, 151 | 'fields': fields, 152 | 'author': { 153 | 'name': 'Marshal', 154 | 'url': 'https://github.com/MarshalX', 155 | 'icon_url': 'https://avatars.githubusercontent.com/u/15520314?v=4', 156 | }, 157 | 'footer': { 158 | 'text': hashtags, 159 | } 160 | } 161 | 162 | payload = { 163 | 'embed': embed_data 164 | } 165 | 166 | return await session.post(url=url, headers=headers, json=payload) 167 | 168 | 169 | async def main() -> None: 170 | async with aiohttp.ClientSession() as session: 171 | commit_data, last_page = await send_req_until_success( 172 | session=session, 173 | url=f'{BASE_GITHUB_API}{GITHUB_LAST_COMMITS}'.format(repo=REPOSITORY, sha=COMMIT_SHA), 174 | headers={ 175 | 'Authorization': f'token {GITHUB_PAT}' 176 | } 177 | ) 178 | commit_files = commit_data['files'] 179 | 180 | coroutine_list = list() 181 | for current_page in range(2, last_page + 1): 182 | coroutine_list.append(send_req_until_success( 183 | session=session, 184 | 
url=f'{BASE_GITHUB_API}{GITHUB_LAST_COMMITS}?page={current_page}'.format( 185 | repo=REPOSITORY, sha=COMMIT_SHA 186 | ), 187 | headers={ 188 | 'Authorization': f'token {GITHUB_PAT}' 189 | } 190 | )) 191 | 192 | paginated_responses = await asyncio.gather(*coroutine_list) 193 | for json_response, _ in paginated_responses: 194 | commit_files.extend(json_response['files']) 195 | 196 | commit_files = [file for file in commit_files if 'translations.telegram.org/' not in file['filename']] 197 | if not commit_files: 198 | return 199 | 200 | commit_hash = commit_data['sha'][:7] 201 | html_url = commit_data['html_url'] 202 | 203 | alert_text = f'New changes of Telegram\n\n' 204 | alert_hashtags = set() 205 | 206 | global AVAILABLE_HASHTAGS 207 | available_hashtags = AVAILABLE_HASHTAGS.copy() 208 | 209 | changes = {k: [] for k in STATUS_TO_EMOJI.keys()} 210 | changes_md = {k: [] for k in STATUS_TO_EMOJI.keys()} 211 | for file in commit_files: 212 | for available_hashtag in available_hashtags: 213 | pattern = HASHTAGS_PATTERNS[available_hashtag] 214 | if pattern in file['filename']: 215 | alert_hashtags.add(available_hashtag) 216 | 217 | # optimize substring search 218 | available_hashtags -= alert_hashtags 219 | 220 | changed_url = file['filename'].replace('.html', '') 221 | for path_to_remove in PATHS_TO_REMOVE_FROM_ALERT: 222 | if changed_url.startswith(path_to_remove): 223 | changed_url = changed_url[len(path_to_remove) + 1:] 224 | break # can't occur more than one time 225 | 226 | status = STATUS_TO_EMOJI[file['status']] 227 | changes[file['status']].append(f'{status} {changed_url}') 228 | changes_md[file['status']].append(f'- {changed_url}') 229 | 230 | discord_embed_fields = [] 231 | for i, [status, text_list] in enumerate(changes.items()): 232 | if not text_list: 233 | continue 234 | 235 | alert_text += '\n'.join(text_list[:ROW_PER_STATUS]) + '\n' 236 | discord_field_value = '\n'.join(changes_md[status][:ROW_PER_STATUS]) + '\n' 237 | 238 | if len(text_list) > ROW_PER_STATUS: 239 | count = len(text_list) - ROW_PER_STATUS 240 | alert_text += f'And {count} {status} actions more..\n' 241 | discord_field_value += f'And **{count}** {status} actions more..\n' 242 | 243 | discord_embed_fields.append({ 244 | 'name': f'{STATUS_TO_EMOJI[status]} {status.capitalize()}', 245 | 'value': discord_field_value, 246 | 'inline': False 247 | }) 248 | 249 | alert_text += '\n' 250 | 251 | link_text = f'GitHub · MarshalX/telegram-crawler@{commit_hash}' 252 | alert_text += f'{link_text}' 253 | logger.info(alert_text) 254 | 255 | if 'web_tr' in alert_hashtags or 'web_res' in alert_hashtags: 256 | alert_hashtags.remove('web') 257 | 258 | for hashtag, topic_thread_id in HASHTAG_TO_TOPIC.items(): 259 | if hashtag in alert_hashtags: 260 | logger.info(f'Sending alert to the forum. 
Topic: {topic_thread_id}') 261 | telegram_response = await send_telegram_alert(session, alert_text, topic_thread_id) 262 | logger.debug(await telegram_response.read()) 263 | 264 | hashtags = ' '.join([f'#{hashtag}' for hashtag in sorted(alert_hashtags)]) 265 | if alert_hashtags: 266 | alert_text += '\n\n' + hashtags 267 | 268 | telegram_response = await send_telegram_alert(session, alert_text) 269 | logger.debug(await telegram_response.read()) 270 | 271 | discord_response = await send_discord_alert(session, commit_hash, html_url, discord_embed_fields, hashtags) 272 | logger.debug(await discord_response.read()) 273 | 274 | 275 | if __name__ == '__main__': 276 | asyncio.get_event_loop().run_until_complete(main()) 277 | -------------------------------------------------------------------------------- /unwebpack_sourcemap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | unwebpack_sourcemap.py 4 | by rarecoil (github.com/rarecoil/unwebpack-sourcemap) 5 | 6 | Reads Webpack source maps and extracts the disclosed 7 | uncompiled/commented source code for review. Can detect and 8 | attempt to read sourcemaps from Webpack bundles with the `-d` 9 | flag. Puts source into a directory structure similar to dev. 10 | 11 | MIT License 12 | 13 | Copyright (c) 2019 rarecoil. 14 | 15 | Permission is hereby granted, free of charge, to any person obtaining a copy 16 | of this software and associated documentation files (the "Software"), to deal 17 | in the Software without restriction, including without limitation the rights 18 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19 | copies of the Software, and to permit persons to whom the Software is 20 | furnished to do so, subject to the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included in all 23 | copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 | SOFTWARE. 32 | """ 33 | 34 | import argparse 35 | import json 36 | import os 37 | import re 38 | import string 39 | import sys 40 | from urllib.parse import urlparse 41 | from unicodedata import normalize 42 | 43 | import requests 44 | from bs4 import BeautifulSoup, SoupStrainer 45 | 46 | 47 | class SourceMapExtractor(object): 48 | """Primary SourceMapExtractor class. Feed this arguments.""" 49 | 50 | _target = None 51 | _is_local = False 52 | _attempt_sourcemap_detection = False 53 | _output_directory = "" 54 | _target_extracted_sourcemaps = [] 55 | 56 | _path_sanitiser = None 57 | 58 | def __init__(self, options): 59 | """Initialize the class.""" 60 | if 'output_directory' not in options: 61 | raise SourceMapExtractorError("output_directory must be set in options.") 62 | else: 63 | self._output_directory = os.path.abspath(options['output_directory']) 64 | if not os.path.isdir(self._output_directory): 65 | if options['make_directory'] is True: 66 | os.mkdir(self._output_directory) 67 | else: 68 | raise SourceMapExtractorError( 69 | "output_directory does not exist. 
Pass --make-directory to auto-make it.") 70 | 71 | self._path_sanitiser = PathSanitiser(self._output_directory) 72 | 73 | if options['disable_ssl_verification'] == True: 74 | self.disable_verify_ssl = True 75 | else: 76 | self.disable_verify_ssl = False 77 | 78 | if options['local'] == True: 79 | self._is_local = True 80 | 81 | if options['detect'] == True: 82 | self._attempt_sourcemap_detection = True 83 | 84 | self._validate_target(options['uri_or_file']) 85 | 86 | def run(self): 87 | """Run extraction process.""" 88 | if self._is_local == False: 89 | if self._attempt_sourcemap_detection: 90 | detected_sourcemaps = self._detect_js_sourcemaps(self._target) 91 | for sourcemap in detected_sourcemaps: 92 | self._parse_remote_sourcemap(sourcemap) 93 | else: 94 | self._parse_remote_sourcemap(self._target) 95 | 96 | else: 97 | self._parse_sourcemap(self._target) 98 | 99 | def _validate_target(self, target): 100 | """Do some basic validation on the target.""" 101 | parsed = urlparse(target) 102 | if self._is_local is True: 103 | self._target = os.path.abspath(target) 104 | if not os.path.isfile(self._target): 105 | raise SourceMapExtractorError( 106 | "uri_or_file is set to be a file, but doesn't seem to exist. check your path.") 107 | else: 108 | if parsed.scheme == "": 109 | raise SourceMapExtractorError("uri_or_file isn't a URI, and --local was not set. set --local?") 110 | file, ext = os.path.splitext(parsed.path) 111 | self._target = target 112 | if ext != '.map' and self._attempt_sourcemap_detection is False: 113 | print("WARNING: URI does not have .map extension, and --detect is not flagged.") 114 | 115 | def _parse_remote_sourcemap(self, uri): 116 | """GET a remote sourcemap and parse it.""" 117 | data, final_uri = self._get_remote_data(uri) 118 | if data is not None: 119 | self._parse_sourcemap(data, True) 120 | else: 121 | print("WARNING: Could not retrieve sourcemap from URI %s" % final_uri) 122 | 123 | def _detect_js_sourcemaps(self, uri): 124 | """Pull HTML and attempt to find JS files, then read the JS files and look for sourceMappingURL.""" 125 | remote_sourcemaps = [] 126 | data, final_uri = self._get_remote_data(uri) 127 | 128 | if final_uri.endswith('.js'): 129 | print("Detecting sourcemaps in JS at %s" % final_uri) 130 | # trick to not send the same request twice 131 | self._enrich_with_remote_sourcemaps('tgcrawl', remote_sourcemaps, js_data=data, last_target_uri=final_uri) 132 | return remote_sourcemaps 133 | 134 | # TODO: scan to see if this is a sourcemap instead of assuming HTML 135 | print("Detecting sourcemaps in HTML at %s" % final_uri) 136 | script_strainer = SoupStrainer("script", src=True) 137 | try: 138 | soup = BeautifulSoup(data, "html.parser", parse_only=script_strainer) 139 | except: 140 | raise SourceMapExtractorError("Could not parse HTML at URI %s" % final_uri) 141 | 142 | for script in soup: 143 | source = script['src'] 144 | parsed_uri = urlparse(source) 145 | if parsed_uri.scheme != '': 146 | next_target_uri = source 147 | else: 148 | current_uri = urlparse(final_uri) 149 | built_uri = current_uri.scheme + "://" + current_uri.netloc + source 150 | next_target_uri = built_uri 151 | self._enrich_with_remote_sourcemaps(next_target_uri, remote_sourcemaps) 152 | 153 | return remote_sourcemaps 154 | 155 | def _enrich_with_remote_sourcemaps(self, next_target_uri, remote_sourcemaps, js_data=None, last_target_uri=None): 156 | if last_target_uri is None or js_data is None: 157 | js_data, last_target_uri = self._get_remote_data(next_target_uri) 158 | 159 | # get last 
line of file 160 | last_line = js_data.rstrip().split("\n")[-1] 161 | regex = "\\/\\/#\s*sourceMappingURL=(.*)$" 162 | matches = re.search(regex, last_line) 163 | if matches: 164 | asset = matches.groups(0)[0].strip() 165 | asset_target = urlparse(asset) 166 | if asset_target.scheme != '': 167 | print("Detected sourcemap at remote location %s" % asset) 168 | remote_sourcemaps.append(asset) 169 | else: 170 | current_uri = urlparse(last_target_uri) 171 | asset_uri = current_uri.scheme + '://' + \ 172 | current_uri.netloc + \ 173 | os.path.dirname(current_uri.path) + \ 174 | '/' + asset 175 | print("Detected sourcemap at remote location %s" % asset_uri) 176 | remote_sourcemaps.append(asset_uri) 177 | 178 | def _parse_sourcemap(self, target, is_str=False): 179 | map_data = "" 180 | if is_str is False: 181 | if os.path.isfile(target): 182 | with open(target, 'r', encoding='utf-8', errors='ignore') as f: 183 | map_data = f.read() 184 | else: 185 | map_data = target 186 | 187 | # with the sourcemap data, pull directory structures 188 | try: 189 | map_object = json.loads(map_data) 190 | except json.JSONDecodeError: 191 | print("ERROR: Failed to parse sourcemap %s. Are you sure this is a sourcemap?" % target) 192 | return False 193 | 194 | # we need `sourcesContent` and `sources`. 195 | # do a basic validation check to make sure these exist and agree. 196 | if 'sources' not in map_object or 'sourcesContent' not in map_object: 197 | print("ERROR: Sourcemap does not contain sources and/or sourcesContent, cannot extract.") 198 | return False 199 | 200 | if len(map_object['sources']) != len(map_object['sourcesContent']): 201 | print("WARNING: sources != sourcesContent, filenames may not match content") 202 | 203 | for source, content in zip(map_object['sources'], map_object['sourcesContent']): 204 | # remove webpack:// from paths 205 | # and do some checks on it 206 | write_path = self._get_sanitised_file_path(source) 207 | if write_path is None: 208 | print("ERROR: Could not sanitize path %s" % source) 209 | continue 210 | 211 | os.makedirs(os.path.dirname(write_path), mode=0o755, exist_ok=True) 212 | with open(write_path, 'w', encoding='utf-8', errors='ignore', newline='') as f: 213 | print("Writing %s..." % os.path.basename(write_path)) 214 | f.write(content) 215 | 216 | def _get_sanitised_file_path(self, sourcePath): 217 | """Sanitise webpack paths for separators/relative paths""" 218 | sourcePath = sourcePath.replace("webpack:///", "") 219 | exts = sourcePath.split(" ") 220 | 221 | if exts[0] == "external": 222 | print("WARNING: Found external sourcemap %s, not currently supported. 
Skipping" % exts[1]) 223 | return None 224 | 225 | path, filename = os.path.split(sourcePath) 226 | if path[:2] == './': 227 | path = path[2:] 228 | if path[:3] == '../': 229 | path = 'parent_dir/' + path[3:] 230 | if path[:1] == '.': 231 | path = "" 232 | 233 | filepath = self._path_sanitiser.make_valid_file_path(path, filename) 234 | return filepath 235 | 236 | def _get_remote_data(self, uri): 237 | """Get remote data via http.""" 238 | 239 | if self.disable_verify_ssl == True: 240 | result = requests.get(uri, verify=False) 241 | else: 242 | result = requests.get(uri) 243 | 244 | # Redirect 245 | if not uri == result.url: 246 | return self._get_remote_data(result.url) 247 | 248 | if result.status_code == 200: 249 | result.encoding = 'utf-8' 250 | return result.text, result.url 251 | else: 252 | print("WARNING: Got status code %d for URI %s" % (result.status_code, result.url)) 253 | return None, result.url 254 | 255 | 256 | class PathSanitiser(object): 257 | """https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python""" 258 | 259 | EMPTY_NAME = "empty" 260 | 261 | empty_idx = 0 262 | root_path = "" 263 | 264 | def __init__(self, root_path): 265 | self.root_path = root_path 266 | 267 | def ensure_directory_exists(self, path_directory): 268 | if not os.path.exists(path_directory): 269 | os.makedirs(path_directory) 270 | 271 | def os_path_separators(self): 272 | seps = [] 273 | for sep in os.path.sep, os.path.altsep: 274 | if sep: 275 | seps.append(sep) 276 | return seps 277 | 278 | def sanitise_filesystem_name(self, potential_file_path_name): 279 | # Sort out unicode characters 280 | valid_filename = normalize('NFKD', potential_file_path_name).encode('ascii', 'ignore').decode('ascii') 281 | # Replace path separators with underscores 282 | for sep in self.os_path_separators(): 283 | valid_filename = valid_filename.replace(sep, '_') 284 | # Ensure only valid characters 285 | valid_chars = "-_.() {0}{1}".format(string.ascii_letters, string.digits) 286 | valid_filename = "".join(ch for ch in valid_filename if ch in valid_chars) 287 | # Ensure at least one letter or number to ignore names such as '..' 288 | valid_chars = "{0}{1}".format(string.ascii_letters, string.digits) 289 | test_filename = "".join(ch for ch in potential_file_path_name if ch in valid_chars) 290 | if len(test_filename) == 0: 291 | # Replace empty file name or file path part with the following 292 | valid_filename = self.EMPTY_NAME + '_' + str(self.empty_idx) 293 | 294 | # MODIFIED BY MARSHALX 295 | # self.empty_idx += 1 296 | 297 | return valid_filename 298 | 299 | def get_root_path(self): 300 | # Replace with your own root file path, e.g. 
'/place/to/save/files/'
301 |         filepath = self.root_path
302 |         filepath = os.path.abspath(filepath)
303 |         # ensure trailing path separator (/)
304 |         if not any(filepath[-1] == sep for sep in self.os_path_separators()):
305 |             filepath = '{0}{1}'.format(filepath, os.path.sep)
306 |         self.ensure_directory_exists(filepath)
307 |         return filepath
308 | 
309 |     def path_split_into_list(self, path):
310 |         # Gets all parts of the path as a list, excluding path separators
311 |         parts = []
312 |         while True:
313 |             newpath, tail = os.path.split(path)
314 |             if newpath == path:
315 |                 assert not tail
316 |                 if path and path not in self.os_path_separators():
317 |                     parts.append(path)
318 |                 break
319 |             if tail and tail not in self.os_path_separators():
320 |                 parts.append(tail)
321 |             path = newpath
322 |         parts.reverse()
323 |         return parts
324 | 
325 |     def sanitise_filesystem_path(self, potential_file_path):
326 |         # Splits up a path and sanitises the name of each part separately
327 |         path_parts_list = self.path_split_into_list(potential_file_path)
328 |         sanitised_path = ''
329 |         for path_component in path_parts_list:
330 |             sanitised_path = '{0}{1}{2}'.format(sanitised_path,
331 |                                                 self.sanitise_filesystem_name(path_component),
332 |                                                 os.path.sep)
333 |         return sanitised_path
334 | 
335 |     def check_if_path_is_under(self, parent_path, child_path):
336 |         # Using the function to split paths into lists of component parts, check that one path is underneath another
337 |         child_parts = self.path_split_into_list(child_path)
338 |         parent_parts = self.path_split_into_list(parent_path)
339 |         if len(parent_parts) > len(child_parts):
340 |             return False
341 |         return all(part1 == part2 for part1, part2 in zip(child_parts, parent_parts))
342 | 
343 |     def make_valid_file_path(self, path=None, filename=None):
344 |         root_path = self.get_root_path()
345 |         if path:
346 |             sanitised_path = self.sanitise_filesystem_path(path)
347 |             if filename:
348 |                 sanitised_filename = self.sanitise_filesystem_name(filename)
349 |                 complete_path = os.path.join(root_path, sanitised_path, sanitised_filename)
350 |             else:
351 |                 complete_path = os.path.join(root_path, sanitised_path)
352 |         else:
353 |             if filename:
354 |                 sanitised_filename = self.sanitise_filesystem_name(filename)
355 |                 complete_path = os.path.join(root_path, sanitised_filename)
356 |             else:
357 |                 complete_path = root_path  # neither path nor filename given; fall back to the root
358 |         complete_path = os.path.abspath(complete_path)
359 |         if self.check_if_path_is_under(root_path, complete_path):
360 |             return complete_path
361 |         else:
362 |             return None
363 | 
364 | 
365 | class SourceMapExtractorError(Exception):
366 |     pass
367 | 
368 | 
369 | if __name__ == "__main__":
370 |     parser = argparse.ArgumentParser(
371 |         description="A tool to extract code from Webpack sourcemaps. Turns black boxes into gray ones.")
372 |     parser.add_argument("-l", "--local", action="store_true", default=False)
373 |     parser.add_argument("-d", "--detect", action="store_true", default=False,
374 |                         help="Attempt to detect sourcemaps from JS assets in retrieved HTML.")
375 |     parser.add_argument("--make-directory", action="store_true", default=False,
376 |                         help="Make the output directory if it doesn't exist.")
377 |     parser.add_argument("--dangerously-write-paths", action="store_true", default=False,
378 |                         help="Write full paths.
WARNING: Be careful here, you are pulling directories from an untrusted source.") 379 | parser.add_argument("--disable-ssl-verification", action="store_true", default=False, 380 | help="The script will not verify the site's SSL certificate.") 381 | 382 | parser.add_argument("uri_or_file", help="The target URI or file.") 383 | parser.add_argument("output_directory", help="Directory to output from sourcemap to.") 384 | 385 | if (len(sys.argv) < 3): 386 | parser.print_usage() 387 | sys.exit(1) 388 | 389 | args = parser.parse_args() 390 | extractor = SourceMapExtractor(vars(args)) 391 | extractor.run() 392 | -------------------------------------------------------------------------------- /make_tracked_links_list.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import codecs 3 | import logging 4 | import os 5 | import re 6 | from functools import cache 7 | from html import unescape 8 | from time import time 9 | from typing import Set, List, Union 10 | from urllib.parse import unquote 11 | 12 | import httpx 13 | import uvloop 14 | 15 | 16 | PROTOCOL = 'https://' 17 | BASE_URL = 'telegram.org' 18 | # it's necessary to help crawler to find more links 19 | HIDDEN_URLS = { 20 | 'blogfork.telegram.org', 21 | 22 | 'corefork.telegram.org', 23 | 'corefork.telegram.org/getProxyConfig', 24 | 25 | 'telegram.org/privacy/gmailbot', 26 | 'telegram.org/tos/mini-apps', 27 | 'telegram.org/tos/p2pl', 28 | 'telegram.org/tour', 29 | 'telegram.org/evolution', 30 | 'telegram.org/tos/bots', 31 | 'telegram.org/tos/business', 32 | 33 | 'desktop.telegram.org/changelog', 34 | 'td.telegram.org/current', 35 | 'td.telegram.org/current2', 36 | 'td.telegram.org/current4', 37 | 'td.telegram.org/current5', # tdx 38 | 39 | 'osx.telegram.org/updates/versions.xml', # stable 40 | 'mac-updates.telegram.org/beta/versions.xml', 41 | 42 | 'telegram.org/dl/android/apk-public-beta.json', 43 | 44 | 'instantview.telegram.org/rules', 45 | 46 | 'core.telegram.org/resources/cidr.txt', 47 | 'core.telegram.org/apple_privacy', 48 | 'core.telegram.org/getProxyConfig', 49 | 50 | 'core.telegram.org/video_stickers', 51 | 'core.telegram.org/stickers', 52 | 53 | 'promote.telegram.org', 54 | 'contest.com', 55 | 56 | # web apps beta 57 | 'comments.app/test_webview', # old 58 | 'webappcontent.telegram.org/demo', # new 59 | 'webappcontent.telegram.org/cafe', # demo 2 60 | 'webappinternal.telegram.org/botfather', 61 | 'webappinternal.telegram.org/stickers', 62 | # 'a-webappcontent.stel.com/demo', 63 | # 'a-webappcontent.stel.com/cafe', 64 | 65 | # 'fragment.com/about', 66 | # 'fragment.com/privacy', 67 | # 'fragment.com/terms', 68 | # 'fragment.com/css/auction.css', # a lot of CDN issues which TG can't fix 69 | # 'fragment.com/js/auction.js', # a lot of CDN issues which TG can't fix 70 | } 71 | ADDITIONAL_URLS = { 72 | 'raw.githubusercontent.com/telegramdesktop/tdesktop/dev/Telegram/SourceFiles/mtproto/scheme/mtproto.tl', 73 | 'raw.githubusercontent.com/telegramdesktop/tdesktop/dev/Telegram/SourceFiles/mtproto/scheme/api.tl', 74 | 'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/telegram_api.tl', 75 | 'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/secret_api.tl', 76 | 'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/td_api.tl', 77 | } 78 | BASE_URL_REGEX = r'telegram.org' 79 | 80 | CRAWL_GLOBAL_RULES = { 81 | 'allow': set(), 82 | 'deny': { 83 | r'.org/auth$', 84 | }, 85 | } 86 | # disable crawling sub links for specific domains and url patterns 87 | 
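# NOTE: these per-domain rules are merged with CRAWL_GLOBAL_RULES when a URL
# is matched (see should_exclude() below); an allow rule overrides a deny rule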
CRAWL_RULES = { 88 | # every rule is regex 89 | # empty string means match any url 90 | # allow rules with higher priority than deny 91 | 'translations.telegram.org': { 92 | 'allow': { 93 | r'^[^/]*$', # root 94 | r'org/[^/]*$', # 1 lvl sub 95 | r'/css/[a-z-_.]+$', # css files 96 | r'/en/[a-z_]+$', # 1 lvl after /en/ 97 | r'/en/(?!recent)[a-z_]+/[a-z_]+$', # 2 lvl after /en/. for example, /en/ios/unsorted except /en/recent 98 | }, 99 | 'deny': { 100 | '', # all 101 | } 102 | }, 103 | 'osx.telegram.org': { 104 | 'deny': { 105 | 'updates/Telegram' 106 | } 107 | }, 108 | 'bugs.telegram.org': { # crawl first page of cards sorted by rating 109 | 'deny': { 110 | # r'/c/[0-9]+/[0-9]+', # disable comments 111 | '', 112 | }, 113 | }, 114 | 'instantview.telegram.org': { 115 | 'deny': { 116 | r'templates/.+', 117 | 'samples/', 118 | 'contest', 119 | }, 120 | }, 121 | 'core.telegram.org': { 122 | 'deny': { 123 | 'bots/payments', 124 | 'tdlib/docs/classtd', 125 | 'validatedRequestedInfo', 126 | 'constructor/Updates', 127 | }, 128 | }, 129 | 'corefork.telegram.org': { 130 | 'deny': { 131 | 'bots/payments', 132 | 'tdlib/docs/classtd', 133 | 'validatedRequestedInfo', 134 | 'constructor/Updates', 135 | }, 136 | }, 137 | 'blogfork.telegram.org': { 138 | 'deny': { 139 | 'bots/payments', 140 | 'tdlib/docs/classtd', 141 | 'validatedRequestedInfo', 142 | 'constructor/Updates', 143 | }, 144 | }, 145 | 'telegram.org': { 146 | 'deny': { 147 | r'apps$', 148 | r'img/emoji/.+', 149 | r'img/StickerExample.psd$', 150 | r'/privacy$', # geolocation depended 151 | r'/tos$', # geolocation depended 152 | r'/moderation$', # dynamic graphs 153 | r'/dsa-report$', # EU only 154 | r'/tos/eu-dsa/transparency-2025$', # EU only 155 | r'/tos/eu/transparency-tco$', # EU only 156 | }, 157 | }, 158 | 'webz.telegram.org': { 159 | 'deny': { 160 | '', 161 | }, 162 | }, 163 | 'webk.telegram.org': { 164 | 'deny': { 165 | '', 166 | }, 167 | }, 168 | } 169 | CRAWL_STATUS_CODE_EXCLUSIONS = { 170 | 'webappinternal.telegram.org/botfather', 171 | 'webappinternal.telegram.org/stickers', 172 | } 173 | 174 | DIRECT_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,249}' + BASE_URL_REGEX + r')' 175 | ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,248}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&//=]*)' 176 | RELATIVE_LINK_REGEX = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})' 177 | RELATIVE_JS_SCRIPTS_REGEX = r'["\'](.*\.js)["\'\?]' 178 | 179 | DOM_ATTRS = ['href', 'src'] 180 | 181 | OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt') 182 | OUTPUT_RESOURCES_FILENAME = os.environ.get('OUTPUT_RESOURCES_FILENAME', 'tracked_res_links.txt') 183 | OUTPUT_TRANSLATIONS_FILENAME = os.environ.get('OUTPUT_TRANSLATIONS_FILENAME', 'tracked_tr_links.txt') 184 | 185 | STEL_DEV_LAYER = 290 186 | 187 | TIMEOUT_CONFIGS = [ 188 | # Fast timeout for most requests 189 | {'total': 30, 'connect': 30, 'sock_connect': 30, 'sock_read': 30}, 190 | # Medium timeout for slower responses 191 | {'total': 60, 'connect': 60, 'sock_connect': 30, 'sock_read': 60}, 192 | # High timeout for problematic URLs 193 | {'total': 120, 'connect': 90, 'sock_connect': 30, 'sock_read': 90} 194 | ] 195 | 196 | logging.basicConfig(format='%(asctime)s %(levelname)s - %(message)s', level=logging.INFO) 197 | logging.getLogger('httpx').setLevel(logging.WARNING) 198 | logger = logging.getLogger(__name__) 199 | 200 | VISITED_LINKS = set() 201 | LINKS_TO_TRACK = set() 202 | LINKS_TO_TRANSLATIONS = set() 203 | LINKS_TO_TRACKABLE_RESOURCES = set() 204 | 205 | URL_RETRY_COUNT = {} 206 | RETRY_LOCK 
= asyncio.Lock() 207 | 208 | # Track base URLs that have had their trailing slash state flipped for retry logic 209 | SLASH_RETRY_ATTEMPTED = set() 210 | SLASH_RETRY_LOCK = asyncio.Lock() 211 | 212 | VISITED_LINKS_LOCK = asyncio.Lock() 213 | TRACKING_SETS_LOCK = asyncio.Lock() 214 | 215 | WORKERS_COUNT = 50 216 | WORKERS_TASK_QUEUE = asyncio.Queue() 217 | WORKERS_NEW_TASK_TIMEOUT = 1.0 # seconds 218 | 219 | TEXT_DECODER = codecs.getincrementaldecoder('UTF-8')(errors='strict') 220 | 221 | 222 | @cache 223 | def should_exclude(url: str) -> bool: 224 | direct_link = re.findall(DIRECT_LINK_REGEX, url)[0] 225 | domain_rules = CRAWL_RULES.get(direct_link) 226 | if not domain_rules: 227 | domain_rules = CRAWL_GLOBAL_RULES 228 | 229 | allow_rules = domain_rules.get('allow', set()) | CRAWL_GLOBAL_RULES.get('allow', set()) 230 | deny_rules = domain_rules.get('deny', set()) | CRAWL_GLOBAL_RULES.get('deny', set()) 231 | 232 | exclude = False 233 | 234 | for regex in deny_rules: 235 | if re.search(regex, url): 236 | exclude = True 237 | break 238 | 239 | for regex in allow_rules: 240 | if re.search(regex, url): 241 | exclude = False 242 | break 243 | 244 | if exclude: 245 | logger.debug('Exclude %s by rules', url) 246 | 247 | return exclude 248 | 249 | 250 | def find_absolute_links(html: str) -> Set[str]: 251 | absolute_links = set(re.findall(ABSOLUTE_LINK_REGEX, html)) 252 | 253 | return {link for link in cleanup_links(absolute_links) if not should_exclude(link)} 254 | 255 | 256 | def find_relative_links(html: str, cur_link: str) -> Set[str]: 257 | matches = re.findall(DIRECT_LINK_REGEX, cur_link) 258 | if not matches: 259 | return set() 260 | 261 | direct_cur_link = re.findall(DIRECT_LINK_REGEX, cur_link)[0] 262 | # optimization. when we want to exclude domain 263 | if should_exclude(cur_link): 264 | return set() 265 | 266 | relative_links = set() 267 | for attr in DOM_ATTRS: 268 | regex = f'{attr}="{RELATIVE_LINK_REGEX}' 269 | links = re.findall(regex, html) 270 | 271 | for link in cleanup_links(links): 272 | url = f'{direct_cur_link}/{link}' 273 | if not should_exclude(url): 274 | relative_links.add(url) 275 | 276 | return relative_links 277 | 278 | 279 | def find_relative_scripts(code: str, cur_link: str) -> Set[str]: 280 | matches = re.findall(DIRECT_LINK_REGEX, cur_link) 281 | if not matches: 282 | return set() 283 | 284 | direct_cur_link = re.findall(DIRECT_LINK_REGEX, cur_link)[0] 285 | 286 | relative_links = set() 287 | links = re.findall(RELATIVE_JS_SCRIPTS_REGEX, code) 288 | 289 | def join_paths(part1: str, part2: str) -> str: 290 | part1 = part1.rstrip('/') 291 | part2 = part2.lstrip('/') 292 | return f'{part1}/{part2}' 293 | 294 | for link in cleanup_links(links): 295 | # dirty magic for specific cases 296 | if '/' in link: # path to file from the root 297 | url = join_paths(direct_cur_link, link) 298 | else: # it is a relative link from the current folder. 
not from the root 299 | current_folder_link, *_ = cur_link.rsplit('/', 1) 300 | url = join_paths(current_folder_link, link) 301 | 302 | if not should_exclude(url): 303 | relative_links.add(url) 304 | 305 | return relative_links 306 | 307 | 308 | def cleanup_links(links: Union[List[str], Set[str]]) -> Set[str]: 309 | cleaned_links = set() 310 | for tmp_link in links: 311 | # normalize link 312 | link = unquote(tmp_link) 313 | link = unescape(link) 314 | link = link.replace('www.', '') 315 | link = link.replace('http://', '').replace('https://', '') 316 | link = link.replace('//', '/') # not a universal solution 317 | link = link.replace('"', '') # regex fix hack 318 | 319 | # skip anchor links 320 | if '#' in link: 321 | continue 322 | 323 | # remove get params from link 324 | if '?' in link: 325 | link = ''.join(link.split('?')[:-1]) 326 | 327 | # remove get params from link 328 | if '&' in link: 329 | link = ''.join(link.split('&')[:-1]) 330 | 331 | # skip mailto: 332 | link_parts = link.split('.') 333 | if '@' in link_parts[0]: 334 | continue 335 | 336 | # fix wildcard 337 | if link.startswith('.'): 338 | link = link[1:] 339 | 340 | if link.endswith('/'): 341 | link = link[:-1] 342 | 343 | cleaned_links.add(link) 344 | 345 | return cleaned_links 346 | 347 | 348 | def _is_x_content_type(content_types_set: Set[str], content_type) -> bool: 349 | for match_content_type in content_types_set: 350 | if match_content_type in content_type: 351 | return True 352 | 353 | return False 354 | 355 | 356 | def is_translation_url(url: str) -> bool: 357 | return 'translations.telegram.org' in url 358 | 359 | 360 | def is_textable_content_type(content_type: str) -> bool: 361 | textable_content_type = { 362 | 'plain', 363 | 'css', 364 | 'json', 365 | 'text', 366 | 'javascript', 367 | } 368 | 369 | return _is_x_content_type(textable_content_type, content_type) 370 | 371 | 372 | def is_trackable_content_type(content_type) -> bool: 373 | trackable_content_types = { 374 | 'svg', 375 | 'png', 376 | 'jpeg', 377 | 'x-icon', 378 | 'gif', 379 | 'mp4', 380 | 'webm', 381 | 'application/octet-stream', # td updates 382 | 'application/zip', 383 | } 384 | 385 | return _is_x_content_type(trackable_content_types, content_type) 386 | 387 | 388 | class ServerSideError(Exception): 389 | pass 390 | 391 | 392 | async def crawl_worker(client: httpx.AsyncClient): 393 | while True: 394 | try: 395 | url = await asyncio.wait_for(WORKERS_TASK_QUEUE.get(), timeout=WORKERS_NEW_TASK_TIMEOUT) 396 | except asyncio.TimeoutError: 397 | logger.debug(f'Worker exiting - no tasks for {WORKERS_NEW_TASK_TIMEOUT} seconds') 398 | break 399 | 400 | try: 401 | async with RETRY_LOCK: 402 | retry_count = URL_RETRY_COUNT.get(url, 0) 403 | 404 | timeout_index = min(retry_count, len(TIMEOUT_CONFIGS) - 1) 405 | timeout_config = TIMEOUT_CONFIGS[timeout_index] 406 | 407 | await _crawl(url, client, timeout_config) 408 | 409 | async with RETRY_LOCK: 410 | if url in URL_RETRY_COUNT: 411 | del URL_RETRY_COUNT[url] 412 | 413 | WORKERS_TASK_QUEUE.task_done() 414 | except (ServerSideError, httpx.ProtocolError, httpx.TimeoutException, httpx.NetworkError) as e: 415 | exc_name = e.__class__.__name__ 416 | exc_msg = str(e) if str(e) else 'No message' 417 | 418 | async with RETRY_LOCK: 419 | retry_count = URL_RETRY_COUNT.get(url, 0) 420 | URL_RETRY_COUNT[url] = retry_count + 1 421 | 422 | next_timeout_index = min(retry_count + 1, len(TIMEOUT_CONFIGS) - 1) 423 | next_timeout_config = TIMEOUT_CONFIGS[next_timeout_index] 424 | 425 | logger.warning(f'Crawl error {exc_name}: 
{exc_msg}. Retrying {url} with {next_timeout_config["total"]}s total timeout') 426 | 427 | await WORKERS_TASK_QUEUE.put(url) 428 | 429 | async with VISITED_LINKS_LOCK: 430 | if url in VISITED_LINKS: 431 | VISITED_LINKS.remove(url) 432 | 433 | WORKERS_TASK_QUEUE.task_done() 434 | 435 | 436 | async def _crawl(url: str, client: httpx.AsyncClient, timeout_config: dict = None): 437 | truncated_url = (url[:100] + '...') if len(url) > 100 else url 438 | 439 | async with VISITED_LINKS_LOCK: 440 | if url in VISITED_LINKS: 441 | return 442 | VISITED_LINKS.add(url) 443 | 444 | if timeout_config is None: 445 | timeout_config = TIMEOUT_CONFIGS[0] # Use default (fast) timeout 446 | 447 | timeout = httpx.Timeout( 448 | timeout= timeout_config['total'], 449 | connect=timeout_config['connect'], 450 | read=timeout_config['sock_read'], 451 | write=None, 452 | pool=None 453 | ) 454 | logger.debug('[%s] Process %s (total timeout: %ds)', len(VISITED_LINKS), truncated_url, timeout_config['total']) 455 | response = await client.get(f'{PROTOCOL}{url}', timeout=timeout) 456 | code = response.status_code 457 | 458 | if 499 < code < 600: 459 | async with VISITED_LINKS_LOCK: 460 | VISITED_LINKS.remove(url) 461 | logger.warning(f'Error 5XX. Retrying {url}') 462 | raise ServerSideError() 463 | 464 | if code not in {200, 304} and url not in CRAWL_STATUS_CODE_EXCLUSIONS: 465 | # Handle redirect and not found errors with retry logic: flip trailing slash state 466 | if code in {301, 302, 404}: 467 | async with SLASH_RETRY_LOCK: 468 | base_url = url.rstrip('/') 469 | if base_url not in SLASH_RETRY_ATTEMPTED: 470 | if url.endswith('/'): 471 | flipped_url = base_url 472 | logger.warning(f'{code} slash removal retry for {truncated_url}') 473 | else: 474 | flipped_url = f'{url}/' 475 | logger.warning(f'{code} slash addition retry for {truncated_url}') 476 | 477 | SLASH_RETRY_ATTEMPTED.add(base_url) 478 | await WORKERS_TASK_QUEUE.put(flipped_url) 479 | return 480 | else: 481 | logger.warning(f'Skip [{code}] {truncated_url}: already tried flipping slash state for {base_url}') 482 | return 483 | 484 | clean_content = response.text.replace('\n', ' ').replace('\r', ' ') 485 | truncated_content = (clean_content[:200] + '...') if len(clean_content) > 200 else clean_content 486 | logger.warning(f'Skip [{code}] {truncated_url}: {truncated_content}') 487 | 488 | return 489 | 490 | content_type = response.headers.get('content-type') 491 | if is_textable_content_type(content_type): 492 | raw_content = response.content 493 | 494 | try: 495 | content = TEXT_DECODER.decode(raw_content) 496 | except UnicodeDecodeError: 497 | if raw_content.startswith(b'GIF'): 498 | async with TRACKING_SETS_LOCK: 499 | LINKS_TO_TRACKABLE_RESOURCES.add(url) 500 | logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES (raw GIF content)', url) 501 | return 502 | else: 503 | logger.warning(f'Codec can\'t decode bytes. 
So it was a tgs file or response with broken content type {url}') 504 | return 505 | 506 | async with TRACKING_SETS_LOCK: 507 | if is_translation_url(url): 508 | LINKS_TO_TRANSLATIONS.add(url) 509 | logger.debug('Add %s to LINKS_TO_TRANSLATIONS', url) 510 | else: 511 | LINKS_TO_TRACK.add(url) 512 | logger.debug('Add %s to LINKS_TO_TRACK', url) 513 | 514 | absolute_links = find_absolute_links(content) 515 | 516 | relative_links_finder = find_relative_links 517 | if 'javascript' in content_type: 518 | relative_links_finder = find_relative_scripts 519 | 520 | relative_links = relative_links_finder(content, url) 521 | 522 | sub_links = absolute_links | relative_links 523 | for sub_url in sub_links: 524 | async with VISITED_LINKS_LOCK: 525 | if sub_url not in VISITED_LINKS: 526 | await WORKERS_TASK_QUEUE.put(sub_url) 527 | elif is_trackable_content_type(content_type): 528 | async with TRACKING_SETS_LOCK: 529 | LINKS_TO_TRACKABLE_RESOURCES.add(url) 530 | logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES', url) 531 | else: 532 | # for example, zip with update of macOS client 533 | logger.warning(f'Unhandled type: {content_type} from {url}') 534 | 535 | 536 | async def start(url_list: Set[str]): 537 | for url in url_list: 538 | await WORKERS_TASK_QUEUE.put(url) 539 | 540 | transport = httpx.AsyncHTTPTransport(verify=False, retries=3) 541 | async with httpx.AsyncClient(transport=transport) as client: 542 | workers = [crawl_worker(client) for _ in range(WORKERS_COUNT)] 543 | await asyncio.gather(*workers) 544 | 545 | await WORKERS_TASK_QUEUE.join() 546 | 547 | 548 | def unified_links(links_set: Set[str]) -> Set[str]: 549 | return {link.rstrip('/') for link in links_set} 550 | 551 | 552 | if __name__ == '__main__': 553 | HIDDEN_URLS.add(BASE_URL) 554 | LINKS_TO_TRACK = LINKS_TO_TRACK | ADDITIONAL_URLS 555 | 556 | logger.info('Start crawling links...') 557 | start_time = time() 558 | uvloop.run(start(HIDDEN_URLS)) 559 | logger.info(f'Stop crawling links. {time() - start_time} sec.') 560 | 561 | LINKS_TO_TRACK = unified_links(LINKS_TO_TRACK) 562 | LINKS_TO_TRACKABLE_RESOURCES = unified_links(LINKS_TO_TRACKABLE_RESOURCES) 563 | LINKS_TO_TRANSLATIONS = unified_links(LINKS_TO_TRANSLATIONS) 564 | 565 | try: 566 | OLD_URL_LIST = set() 567 | for filename in (OUTPUT_FILENAME, OUTPUT_RESOURCES_FILENAME, OUTPUT_TRANSLATIONS_FILENAME): 568 | with open(filename, 'r') as f: 569 | OLD_URL_LIST |= set([l.replace('\n', '') for l in f.readlines()]) 570 | 571 | CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES | LINKS_TO_TRANSLATIONS 572 | 573 | logger.info(f'Is equal: {OLD_URL_LIST == CURRENT_URL_LIST}') 574 | logger.info(f'Deleted ({len(OLD_URL_LIST - CURRENT_URL_LIST)}): {OLD_URL_LIST - CURRENT_URL_LIST}') 575 | logger.info(f'Added ({len(CURRENT_URL_LIST - OLD_URL_LIST)}): {CURRENT_URL_LIST - OLD_URL_LIST}') 576 | except IOError: 577 | pass 578 | 579 | with open(OUTPUT_FILENAME, 'w') as f: 580 | f.write('\n'.join(sorted(unified_links(LINKS_TO_TRACK)))) 581 | 582 | with open(OUTPUT_RESOURCES_FILENAME, 'w') as f: 583 | f.write('\n'.join(sorted(unified_links(LINKS_TO_TRACKABLE_RESOURCES)))) 584 | 585 | with open(OUTPUT_TRANSLATIONS_FILENAME, 'w') as f: 586 | f.write('\n'.join(sorted(unified_links(LINKS_TO_TRANSLATIONS)))) 587 | -------------------------------------------------------------------------------- /ccl_bplist.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2012-2016, CCL Forensics 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of the CCL Forensics nor the 13 | names of its contributors may be used to endorse or promote products 14 | derived from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL CCL FORENSICS BE LIABLE FOR ANY 20 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | """ 27 | 28 | import sys 29 | import os 30 | import struct 31 | import datetime 32 | 33 | __version__ = "0.21" 34 | __description__ = "Converts Apple binary PList files into a native Python data structure" 35 | __contact__ = "Alex Caithness" 36 | 37 | _object_converter = None 38 | def set_object_converter(function): 39 | """Sets the object converter function to be used when retrieving objects from the bplist. 40 | default is None (which will return objects in their raw form). 41 | A built in converter (ccl_bplist.NSKeyedArchiver_common_objects_convertor) which is geared 42 | toward dealling with common types in NSKeyedArchiver is available which can simplify code greatly 43 | when dealling with these types of files.""" 44 | if not hasattr(function, "__call__"): 45 | raise TypeError("function is not a function") 46 | global _object_converter 47 | _object_converter = function 48 | 49 | class BplistError(Exception): 50 | pass 51 | 52 | class BplistUID: 53 | def __init__(self, value): 54 | self.value = value 55 | 56 | def __repr__(self): 57 | return "UID: {0}".format(self.value) 58 | 59 | def __str__(self): 60 | return self.__repr__() 61 | 62 | def __decode_multibyte_int(b, signed=True): 63 | if len(b) == 1: 64 | fmt = ">B" # Always unsigned? 
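        # (Per Apple's CoreFoundation notes for format "00": 1-, 2- and 4-byte
        # integers are stored unsigned, and negative values are promoted to the
        # 8- or 16-byte forms; that is presumably why ">B" is unconditional here.)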
65 | elif len(b) == 2: 66 | fmt = ">h" 67 | elif len(b) == 3: 68 | if signed: 69 | return ((b[0] << 16) | struct.unpack(">H", b[1:])[0]) - ((b[0] >> 7) * 2 * 0x800000) 70 | else: 71 | return (b[0] << 16) | struct.unpack(">H", b[1:])[0] 72 | elif len(b) == 4: 73 | fmt = ">i" 74 | elif len(b) == 8: 75 | fmt = ">q" 76 | elif len(b) == 16: 77 | # special case for BigIntegers 78 | high, low = struct.unpack(">QQ", b) 79 | result = (high << 64) | low 80 | if high & 0x8000000000000000 and signed: 81 | result -= 0x100000000000000000000000000000000 82 | return result 83 | else: 84 | raise BplistError("Cannot decode multibyte int of length {0}".format(len(b))) 85 | 86 | if signed and len(b) > 1: 87 | return struct.unpack(fmt.lower(), b)[0] 88 | else: 89 | return struct.unpack(fmt.upper(), b)[0] 90 | 91 | def __decode_float(b, signed=True): 92 | if len(b) == 4: 93 | fmt = ">f" 94 | elif len(b) == 8: 95 | fmt = ">d" 96 | else: 97 | raise BplistError("Cannot decode float of length {0}".format(len(b))) 98 | 99 | if signed: 100 | return struct.unpack(fmt.lower(), b)[0] 101 | else: 102 | return struct.unpack(fmt.upper(), b)[0] 103 | 104 | def __decode_object(f, offset, collection_offset_size, offset_table): 105 | # Move to offset and read type 106 | #print("Decoding object at offset {0}".format(offset)) 107 | f.seek(offset) 108 | # A little hack to keep the script portable between py2.x and py3k 109 | if sys.version_info[0] < 3: 110 | type_byte = ord(f.read(1)[0]) 111 | else: 112 | type_byte = f.read(1)[0] 113 | #print("Type byte: {0}".format(hex(type_byte))) 114 | if type_byte == 0x00: # Null 0000 0000 115 | return None 116 | elif type_byte == 0x08: # False 0000 1000 117 | return False 118 | elif type_byte == 0x09: # True 0000 1001 119 | return True 120 | elif type_byte == 0x0F: # Fill 0000 1111 121 | raise BplistError("Fill type not currently supported at offset {0}".format(f.tell())) # Not sure what to return really... 
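    # A note for orientation in the long dispatch below: each serialised object
    # begins with a marker byte whose high nibble selects the type and whose low
    # nibble usually carries an inline size. For the variable-length types
    # (Data, strings, collections) a low nibble of 0xF means "the real length
    # follows as an Int object", which is why the "& 0x0F != 0x0F" check
    # repeats in every branch below.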
122 | elif type_byte & 0xF0 == 0x10: # Int 0001 xxxx 123 | int_length = 2 ** (type_byte & 0x0F) 124 | int_bytes = f.read(int_length) 125 | return __decode_multibyte_int(int_bytes) 126 | elif type_byte & 0xF0 == 0x20: # Float 0010 nnnn 127 | float_length = 2 ** (type_byte & 0x0F) 128 | float_bytes = f.read(float_length) 129 | return __decode_float(float_bytes) 130 | elif type_byte & 0xFF == 0x33: # Date 0011 0011 131 | date_bytes = f.read(8) 132 | date_value = __decode_float(date_bytes) 133 | try: 134 | result = datetime.datetime(2001,1,1) + datetime.timedelta(seconds = date_value) 135 | except OverflowError: 136 | result = datetime.datetime.min 137 | return result 138 | elif type_byte & 0xF0 == 0x40: # Data 0100 nnnn 139 | if type_byte & 0x0F != 0x0F: 140 | # length in 4 lsb 141 | data_length = type_byte & 0x0F 142 | else: 143 | # A little hack to keep the script portable between py2.x and py3k 144 | if sys.version_info[0] < 3: 145 | int_type_byte = ord(f.read(1)[0]) 146 | else: 147 | int_type_byte = f.read(1)[0] 148 | if int_type_byte & 0xF0 != 0x10: 149 | raise BplistError("Long Data field definition not followed by int type at offset {0}".format(f.tell())) 150 | int_length = 2 ** (int_type_byte & 0x0F) 151 | int_bytes = f.read(int_length) 152 | data_length = __decode_multibyte_int(int_bytes, False) 153 | return f.read(data_length) 154 | elif type_byte & 0xF0 == 0x50: # ASCII 0101 nnnn 155 | if type_byte & 0x0F != 0x0F: 156 | # length in 4 lsb 157 | ascii_length = type_byte & 0x0F 158 | else: 159 | # A little hack to keep the script portable between py2.x and py3k 160 | if sys.version_info[0] < 3: 161 | int_type_byte = ord(f.read(1)[0]) 162 | else: 163 | int_type_byte = f.read(1)[0] 164 | if int_type_byte & 0xF0 != 0x10: 165 | raise BplistError("Long ASCII field definition not followed by int type at offset {0}".format(f.tell())) 166 | int_length = 2 ** (int_type_byte & 0x0F) 167 | int_bytes = f.read(int_length) 168 | ascii_length = __decode_multibyte_int(int_bytes, False) 169 | return f.read(ascii_length).decode("ascii") 170 | elif type_byte & 0xF0 == 0x60: # UTF-16 0110 nnnn 171 | if type_byte & 0x0F != 0x0F: 172 | # length in 4 lsb 173 | utf16_length = (type_byte & 0x0F) * 2 # Length is characters - 16bit width 174 | else: 175 | # A little hack to keep the script portable between py2.x and py3k 176 | if sys.version_info[0] < 3: 177 | int_type_byte = ord(f.read(1)[0]) 178 | else: 179 | int_type_byte = f.read(1)[0] 180 | if int_type_byte & 0xF0 != 0x10: 181 | raise BplistError("Long UTF-16 field definition not followed by int type at offset {0}".format(f.tell())) 182 | int_length = 2 ** (int_type_byte & 0x0F) 183 | int_bytes = f.read(int_length) 184 | utf16_length = __decode_multibyte_int(int_bytes, False) * 2 185 | return f.read(utf16_length).decode("utf_16_be") 186 | elif type_byte & 0xF0 == 0x80: # UID 1000 nnnn 187 | uid_length = (type_byte & 0x0F) + 1 188 | uid_bytes = f.read(uid_length) 189 | return BplistUID(__decode_multibyte_int(uid_bytes, signed=False)) 190 | elif type_byte & 0xF0 == 0xA0: # Array 1010 nnnn 191 | if type_byte & 0x0F != 0x0F: 192 | # length in 4 lsb 193 | array_count = type_byte & 0x0F 194 | else: 195 | # A little hack to keep the script portable between py2.x and py3k 196 | if sys.version_info[0] < 3: 197 | int_type_byte = ord(f.read(1)[0]) 198 | else: 199 | int_type_byte = f.read(1)[0] 200 | if int_type_byte & 0xF0 != 0x10: 201 | raise BplistError("Long Array field definition not followed by int type at offset {0}".format(f.tell())) 202 | int_length = 2 ** 
(int_type_byte & 0x0F)
203 |             int_bytes = f.read(int_length)
204 |             array_count = __decode_multibyte_int(int_bytes, signed=False)
205 |         array_refs = []
206 |         for i in range(array_count):
207 |             array_refs.append(__decode_multibyte_int(f.read(collection_offset_size), False))
208 |         return [__decode_object(f, offset_table[obj_ref], collection_offset_size, offset_table) for obj_ref in array_refs]
209 |     elif type_byte & 0xF0 == 0xC0: # Set 1100 nnnn
210 |         if type_byte & 0x0F != 0x0F:
211 |             # length in 4 lsb
212 |             set_count = type_byte & 0x0F
213 |         else:
214 |             # A little hack to keep the script portable between py2.x and py3k
215 |             if sys.version_info[0] < 3:
216 |                 int_type_byte = ord(f.read(1)[0])
217 |             else:
218 |                 int_type_byte = f.read(1)[0]
219 |             if int_type_byte & 0xF0 != 0x10:
220 |                 raise BplistError("Long Set field definition not followed by int type at offset {0}".format(f.tell()))
221 |             int_length = 2 ** (int_type_byte & 0x0F)
222 |             int_bytes = f.read(int_length)
223 |             set_count = __decode_multibyte_int(int_bytes, signed=False)
224 |         set_refs = []
225 |         for i in range(set_count):
226 |             set_refs.append(__decode_multibyte_int(f.read(collection_offset_size), False))
227 |         return [__decode_object(f, offset_table[obj_ref], collection_offset_size, offset_table) for obj_ref in set_refs]
228 |     elif type_byte & 0xF0 == 0xD0: # Dict 1101 nnnn
229 |         if type_byte & 0x0F != 0x0F:
230 |             # length in 4 lsb
231 |             dict_count = type_byte & 0x0F
232 |         else:
233 |             # A little hack to keep the script portable between py2.x and py3k
234 |             if sys.version_info[0] < 3:
235 |                 int_type_byte = ord(f.read(1)[0])
236 |             else:
237 |                 int_type_byte = f.read(1)[0]
238 |             #print("Dictionary length int byte: {0}".format(hex(int_type_byte)))
239 |             if int_type_byte & 0xF0 != 0x10:
240 |                 raise BplistError("Long Dict field definition not followed by int type at offset {0}".format(f.tell()))
241 |             int_length = 2 ** (int_type_byte & 0x0F)
242 |             int_bytes = f.read(int_length)
243 |             dict_count = __decode_multibyte_int(int_bytes, signed=False)
244 |         key_refs = []
245 |         #print("Dictionary count: {0}".format(dict_count))
246 |         for i in range(dict_count):
247 |             key_refs.append(__decode_multibyte_int(f.read(collection_offset_size), False))
248 |         value_refs = []
249 |         for i in range(dict_count):
250 |             value_refs.append(__decode_multibyte_int(f.read(collection_offset_size), False))
251 | 
252 |         dict_result = {}
253 |         for i in range(dict_count):
254 |             #print("Key ref: {0}\tVal ref: {1}".format(key_refs[i], value_refs[i]))
255 |             key = __decode_object(f, offset_table[key_refs[i]], collection_offset_size, offset_table)
256 |             val = __decode_object(f, offset_table[value_refs[i]], collection_offset_size, offset_table)
257 |             dict_result[key] = val
258 |         return dict_result
259 | 
260 | 
261 | def load(f):
262 |     """
263 |     Reads and converts a file-like object containing a binary property list.
264 | Takes a file-like object (must support reading and seeking) as an argument 265 | Returns a data structure representing the data in the property list 266 | """ 267 | # Check magic number 268 | if f.read(8) != b"bplist00": 269 | raise BplistError("Bad file header") 270 | 271 | # Read trailer 272 | f.seek(-32, os.SEEK_END) 273 | trailer = f.read(32) 274 | offset_int_size, collection_offset_size, object_count, top_level_object_index, offest_table_offset = struct.unpack(">6xbbQQQ", trailer) 275 | 276 | # Read offset table 277 | f.seek(offest_table_offset) 278 | offset_table = [] 279 | for i in range(object_count): 280 | offset_table.append(__decode_multibyte_int(f.read(offset_int_size), False)) 281 | 282 | return __decode_object(f, offset_table[top_level_object_index], collection_offset_size, offset_table) 283 | 284 | 285 | def NSKeyedArchiver_common_objects_convertor(o): 286 | """Built in converter function (suitable for submission to set_object_converter()) which automatically 287 | converts the following common data-types found in NSKeyedArchiver: 288 | NSDictionary/NSMutableDictionary; 289 | NSArray/NSMutableArray; 290 | NSSet/NSMutableSet 291 | NSString/NSMutableString 292 | NSDate 293 | $null strings""" 294 | # Conversion: NSDictionary 295 | if is_nsmutabledictionary(o): 296 | return convert_NSMutableDictionary(o) 297 | # Conversion: NSArray 298 | elif is_nsarray(o): 299 | return convert_NSArray(o) 300 | elif is_isnsset(o): 301 | return convert_NSSet(o) 302 | # Conversion: NSString 303 | elif is_nsstring(o): 304 | return convert_NSString(o) 305 | # Conversion: NSDate 306 | elif is_nsdate(o): 307 | return convert_NSDate(o) 308 | # Conversion: "$null" string 309 | elif isinstance(o, str) and o == "$null": 310 | return None 311 | # Fallback: 312 | else: 313 | return o 314 | 315 | def NSKeyedArchiver_convert(o, object_table): 316 | if isinstance(o, list): 317 | #return NsKeyedArchiverList(o, object_table) 318 | result = NsKeyedArchiverList(o, object_table) 319 | elif isinstance(o, dict): 320 | #return NsKeyedArchiverDictionary(o, object_table) 321 | result = NsKeyedArchiverDictionary(o, object_table) 322 | elif isinstance(o, BplistUID): 323 | #return NSKeyedArchiver_convert(object_table[o.value], object_table) 324 | result = NSKeyedArchiver_convert(object_table[o.value], object_table) 325 | else: 326 | #return o 327 | result = o 328 | 329 | if _object_converter: 330 | return _object_converter(result) 331 | else: 332 | return result 333 | 334 | 335 | class NsKeyedArchiverDictionary(dict): 336 | def __init__(self, original_dict, object_table): 337 | super(NsKeyedArchiverDictionary, self).__init__(original_dict) 338 | self.object_table = object_table 339 | 340 | def __getitem__(self, index): 341 | o = super(NsKeyedArchiverDictionary, self).__getitem__(index) 342 | return NSKeyedArchiver_convert(o, self.object_table) 343 | 344 | def get(self, key, default=None): 345 | return self[key] if key in self else default 346 | 347 | def values(self): 348 | for k in self: 349 | yield self[k] 350 | 351 | def items(self): 352 | for k in self: 353 | yield k, self[k] 354 | 355 | class NsKeyedArchiverList(list): 356 | def __init__(self, original_iterable, object_table): 357 | super(NsKeyedArchiverList, self).__init__(original_iterable) 358 | self.object_table = object_table 359 | 360 | def __getitem__(self, index): 361 | o = super(NsKeyedArchiverList, self).__getitem__(index) 362 | return NSKeyedArchiver_convert(o, self.object_table) 363 | 364 | def __iter__(self): 365 | for o in 
super(NsKeyedArchiverList, self).__iter__(): 366 | yield NSKeyedArchiver_convert(o, self.object_table) 367 | 368 | 369 | def deserialise_NsKeyedArchiver(obj, parse_whole_structure=False): 370 | """Deserialises an NSKeyedArchiver bplist rebuilding the structure. 371 | obj should usually be the top-level object returned by the load() 372 | function.""" 373 | 374 | # Check that this is an archiver and version we understand 375 | if not isinstance(obj, dict): 376 | raise TypeError("obj must be a dict") 377 | if "$archiver" not in obj or obj["$archiver"] not in ("NSKeyedArchiver", "NRKeyedArchiver"): 378 | raise ValueError("obj does not contain an '$archiver' key or the '$archiver' is unrecognised") 379 | if "$version" not in obj or obj["$version"] != 100000: 380 | raise ValueError("obj does not contain a '$version' key or the '$version' is unrecognised") 381 | 382 | object_table = obj["$objects"] 383 | if "root" in obj["$top"] and not parse_whole_structure: 384 | return NSKeyedArchiver_convert(obj["$top"]["root"], object_table) 385 | else: 386 | return NSKeyedArchiver_convert(obj["$top"], object_table) 387 | 388 | # NSMutableDictionary convenience functions 389 | def is_nsmutabledictionary(obj): 390 | if not isinstance(obj, dict): 391 | return False 392 | if "$class" not in obj.keys(): 393 | return False 394 | if obj["$class"].get("$classname") not in ("NSMutableDictionary", "NSDictionary"): 395 | return False 396 | if "NS.keys" not in obj.keys(): 397 | return False 398 | if "NS.objects" not in obj.keys(): 399 | return False 400 | 401 | return True 402 | 403 | def convert_NSMutableDictionary(obj): 404 | """Converts a NSKeyedArchiver serialised NSMutableDictionary into 405 | a straight dictionary (rather than two lists as it is serialised 406 | as)""" 407 | 408 | # The dictionary is serialised as two lists (one for keys and one 409 | # for values) which obviously removes all convenience afforded by 410 | # dictionaries. This function converts this structure to an 411 | # actual dictionary so that values can be accessed by key. 
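    # For illustration (hypothetical values): an archived dictionary reaching
    # this function as
    #     {"$class": {...}, "NS.keys": ["width", "height"], "NS.objects": [3, 4]}
    # is returned as {"width": 3, "height": 4}.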
412 | 413 | if not is_nsmutabledictionary(obj): 414 | raise ValueError("obj does not have the correct structure for a NSDictionary/NSMutableDictionary serialised to a NSKeyedArchiver") 415 | keys = obj["NS.keys"] 416 | vals = obj["NS.objects"] 417 | 418 | # sense check the keys and values: 419 | if not isinstance(keys, list): 420 | raise TypeError("The 'NS.keys' value is an unexpected type (expected list; actual: {0}".format(type(keys))) 421 | if not isinstance(vals, list): 422 | raise TypeError("The 'NS.objects' value is an unexpected type (expected list; actual: {0}".format(type(vals))) 423 | if len(keys) != len(vals): 424 | raise ValueError("The length of the 'NS.keys' list ({0}) is not equal to that of the 'NS.objects ({1})".format(len(keys), len(vals))) 425 | 426 | result = {} 427 | for i,k in enumerate(keys): 428 | if k in result: 429 | raise ValueError("The 'NS.keys' list contains duplicate entries") 430 | result[k] = vals[i] 431 | 432 | return result 433 | 434 | # NSArray convenience functions 435 | def is_nsarray(obj): 436 | if not isinstance(obj, dict): 437 | return False 438 | if "$class" not in obj.keys(): 439 | return False 440 | if obj["$class"].get("$classname") not in ("NSArray", "NSMutableArray"): 441 | return False 442 | if "NS.objects" not in obj.keys(): 443 | return False 444 | 445 | return True 446 | 447 | def convert_NSArray(obj): 448 | if not is_nsarray(obj): 449 | raise ValueError("obj does not have the correct structure for a NSArray/NSMutableArray serialised to a NSKeyedArchiver") 450 | 451 | return obj["NS.objects"] 452 | 453 | # NSSet convenience functions 454 | def is_isnsset(obj): 455 | if not isinstance(obj, dict): 456 | return False 457 | if "$class" not in obj.keys(): 458 | return False 459 | if obj["$class"].get("$classname") not in ("NSSet", "NSMutableSet"): 460 | return False 461 | if "NS.objects" not in obj.keys(): 462 | return False 463 | 464 | return True 465 | 466 | def convert_NSSet(obj): 467 | if not is_isnsset(obj): 468 | raise ValueError("obj does not have the correct structure for a NSSet/NSMutableSet serialised to a NSKeyedArchiver") 469 | 470 | return list(obj["NS.objects"]) 471 | 472 | # NSString convenience functions 473 | def is_nsstring(obj): 474 | if not isinstance(obj, dict): 475 | return False 476 | if "$class" not in obj.keys(): 477 | return False 478 | if obj["$class"].get("$classname") not in ("NSString", "NSMutableString"): 479 | return False 480 | if "NS.string" not in obj.keys(): 481 | return False 482 | return True 483 | 484 | def convert_NSString(obj): 485 | if not is_nsstring(obj): 486 | raise ValueError("obj does not have the correct structure for a NSString/NSMutableString serialised to a NSKeyedArchiver") 487 | 488 | return obj["NS.string"] 489 | 490 | # NSDate convenience functions 491 | def is_nsdate(obj): 492 | if not isinstance(obj, dict): 493 | return False 494 | if "$class" not in obj.keys(): 495 | return False 496 | if obj["$class"].get("$classname") not in ("NSDate"): 497 | return False 498 | if "NS.time" not in obj.keys(): 499 | return False 500 | 501 | return True 502 | 503 | def convert_NSDate(obj): 504 | if not is_nsdate(obj): 505 | raise ValueError("obj does not have the correct structure for a NSDate serialised to a NSKeyedArchiver") 506 | 507 | return datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=obj["NS.time"]) 508 | -------------------------------------------------------------------------------- /make_files_tree.py: -------------------------------------------------------------------------------- 
1 | import asyncio
2 | import hashlib
3 | import json
4 | import logging
5 | import mimetypes
6 | import os
7 | import platform
8 | import random
9 | import re
10 | import shutil
11 | import socket
12 | import uuid
13 | import zipfile
14 | from asyncio.exceptions import TimeoutError
15 | from string import punctuation, whitespace
16 | from time import time
17 | from typing import List, Optional
18 | from xml.etree import ElementTree
19 | 
20 | import aiofiles
21 | import aiohttp
22 | import uvloop
23 | from aiohttp import ClientConnectorError, ServerDisconnectedError
24 | 
25 | import ccl_bplist
26 | 
27 | PROTOCOL = 'https://'
28 | ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
29 | 
30 | CRAWL_STATUS_CODE_EXCLUSIONS = {
31 |     'webappinternal.telegram.org/botfather',
32 |     'webappinternal.telegram.org/stickers',
33 | }
34 | 
35 | DYNAMIC_PART_MOCK = 'telegram-crawler'
36 | 
37 | INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
38 | INPUT_RES_FILENAME = os.environ.get('INPUT_RES_FILENAME', 'tracked_res_links.txt')
39 | INPUT_TR_FILENAME = os.environ.get('INPUT_TR_FILENAME', 'tracked_tr_links.txt')
40 | OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
41 | OUTPUT_MTPROTO_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_MTPROTO_FOLDER', 'server/'))
42 | OUTPUT_SITES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_SITES_FOLDER', 'web/'))
43 | OUTPUT_CLIENTS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_CLIENTS_FOLDER', 'client/'))
44 | OUTPUT_RESOURCES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_RESOURCES_FOLDER', 'web_res/'))
45 | OUTPUT_TRANSLATIONS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_TRANSLATIONS_FOLDER', 'web_tr/'))
46 | OUTPUT_MINI_APPS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_MINI_APPS_FOLDER', 'mini_app/'))
47 | 
48 | TRANSLATIONS_EN_CATEGORY_URL_REGEX = r'/en/[a-z_]+/[a-z_]+$'
49 | 
50 | PAGE_GENERATION_TIME_REGEX = r'<!-- page generated in .+? -->'
51 | PAGE_API_HASH_REGEX = r'\?hash=[a-z0-9]+'
52 | PAGE_API_HASH_TEMPLATE = f'?hash={DYNAMIC_PART_MOCK}'
53 | TON_RATE_REGEX = r'"tonRate":"[.0-9]+"'
54 | TON_RATE_TEMPLATE = f'"tonRate":"{DYNAMIC_PART_MOCK}"'
55 | APK_BETA_TOKEN_REGEX = r'apk\?token=.*?"'
56 | APK_BETA_TOKEN_TEMPLATE = f'apk?token={DYNAMIC_PART_MOCK}"'
57 | PASSPORT_SSID_REGEX = r'passport_ssid=[a-z0-9]+_[a-z0-9]+_[a-z0-9]+'
58 | PASSPORT_SSID_TEMPLATE = f'passport_ssid={DYNAMIC_PART_MOCK}'
59 | NONCE_REGEX = r'"nonce":"[a-z0-9]+_[a-z0-9]+_[a-z0-9]+'
60 | NONCE_TEMPLATE = f'"nonce":"{DYNAMIC_PART_MOCK}'
61 | PROXY_CONFIG_SUB_NET_REGEX = r'\d+\.\d+:8888;'
62 | PROXY_CONFIG_SUB_NET_TEMPLATE = 'X.X:8888;'
63 | TRANSLATE_SUGGESTION_REGEX = r'
<div class="tr-value-suggestion">(.?)+</div>
' 64 | SPARKLE_SIG_REGEX = r';sig=(.*?);' 65 | SPARKLE_SE_REGEX = r';se=(.*?);' 66 | SPARKLE_SIG_TEMPLATE = f';sig={DYNAMIC_PART_MOCK};' 67 | SPARKLE_SE_TEMPLATE = f';se={DYNAMIC_PART_MOCK};' 68 | 69 | STEL_DEV_LAYER = 190 70 | 71 | TIMEOUT = aiohttp.ClientTimeout( # mediumly sized from link collector 72 | total=60, 73 | connect=60, 74 | sock_connect=30, 75 | sock_read=60, 76 | ) 77 | HEADERS = { 78 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:99.0) Gecko/20100101 Firefox/99.0', 79 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 80 | 'Accept-Language': 'en-US,en;q=0.5', 81 | 'Accept-Encoding': 'gzip, deflate, br', 82 | 'DNT': '1', 83 | 'Connection': 'keep-alive', 84 | 'Cookie': f'stel_ln=en; stel_dev_layer={STEL_DEV_LAYER}', 85 | 'Upgrade-Insecure-Requests': '1', 86 | 'Sec-Fetch-Dest': 'document', 87 | 'Sec-Fetch-Mode': 'navigate', 88 | 'Sec-Fetch-Site': 'none', 89 | 'Sec-Fetch-User': '?1', 90 | 'Cache-Control': 'max-age=0', 91 | 'TE': 'trailers', 92 | } 93 | 94 | logging.basicConfig(format='%(message)s', level=logging.INFO) 95 | logger = logging.getLogger(__name__) 96 | 97 | 98 | def get_hash(data: bytes) -> str: 99 | return hashlib.sha256(data).hexdigest() 100 | 101 | 102 | async def download_file(url: str, path: str, session: aiohttp.ClientSession): 103 | params = {'tgcrawlNoCache': uuid.uuid4().hex} 104 | async with session.get(url, params=params) as response: 105 | if response.status != 200: 106 | return 107 | 108 | content = await response.read() 109 | 110 | async with aiofiles.open(path, mode='wb') as f: 111 | await f.write(content) 112 | 113 | 114 | async def track_additional_files( 115 | files_to_track: List[str], input_dir_name: str, output_dir_name: str, encoding='utf-8', save_hash_only=False 116 | ): 117 | kwargs = {'mode': 'r', 'encoding': encoding} 118 | if save_hash_only: 119 | kwargs['mode'] = 'rb' 120 | del kwargs['encoding'] 121 | 122 | for file in files_to_track: 123 | async with aiofiles.open(os.path.join(input_dir_name, file), **kwargs) as r_file: 124 | content = await r_file.read() 125 | 126 | if save_hash_only: 127 | content = get_hash(content) 128 | else: 129 | content = re.sub(r'id=".*"', 'id="tgcrawl"', content) 130 | 131 | filename = os.path.join(output_dir_name, file) 132 | os.makedirs(os.path.dirname(filename), exist_ok=True) 133 | async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file: 134 | await w_file.write(content) 135 | 136 | 137 | async def get_download_link_of_latest_macos_release(remote_updates_manifest_url: str, session: aiohttp.ClientSession) -> Optional[str]: 138 | async with session.get(remote_updates_manifest_url) as response: 139 | if response.status != 200: 140 | logger.error(f'Error {response.status} while fetching {remote_updates_manifest_url}') 141 | return None 142 | 143 | try: 144 | response = await response.text() # we do expect XML here 145 | except Exception as e: 146 | logger.error(f'Error processing response: {e}') 147 | return None 148 | 149 | if not isinstance(response, str) and not response.lstrip().startswith(' 5: 260 | continue 261 | 262 | valid_strings.append(string.strip()) 263 | 264 | valid_strings = sorted(list(set(valid_strings))) 265 | with open(os.path.join(crawled_data_folder, 'strings.txt'), 'w', encoding='utf-8') as f: 266 | f.write('\n'.join(valid_strings)) 267 | 268 | cleanup2() 269 | 270 | 271 | async def download_telegram_ios_beta_and_extract_resources(session: aiohttp.ClientSession): 272 | # TODO fetch version automatically 273 | 
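    # (One possible sketch, left commented out so behaviour is unchanged; the
    # endpoint is the documented "get the latest release" API, see the ref just
    # below, and it is an assumption that the release tag_name matches the
    # version string used in the asset name.)
    #
    # async def _get_latest_ios_version(session: aiohttp.ClientSession) -> str:
    #     api_url = 'https://api.github.com/repos/MarshalX/decrypted-telegram-ios/releases/latest'
    #     async with session.get(api_url) as response:
    #         return (await response.json())['tag_name']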
# ref: https://docs.github.com/en/rest/releases/releases#get-the-latest-release 274 | version = '9.0.24102' 275 | 276 | download_url = f'https://github.com/MarshalX/decrypted-telegram-ios/releases/download/{version}/Telegram-{version}.ipa' 277 | tool_download_url = 'https://github.com/MarshalX/acextract/releases/download/3.0/acextract' 278 | 279 | ipa_filename = f'Telegram-{version}.ipa' 280 | assets_extractor = 'acextract_ios' 281 | assets_filename = 'Assets.car' 282 | assets_output_dir = 'ios_assets' 283 | client_folder_name = 'ios' 284 | crawled_data_folder = os.path.join(OUTPUT_CLIENTS_FOLDER, 'ios-beta') 285 | 286 | if 'darwin' not in platform.system().lower(): 287 | await download_file(download_url, ipa_filename, session) 288 | else: 289 | await asyncio.gather( 290 | download_file(download_url, ipa_filename, session), 291 | download_file(tool_download_url, assets_extractor, session), 292 | ) 293 | 294 | # synced 295 | with zipfile.ZipFile(ipa_filename, 'r') as f: 296 | f.extractall(client_folder_name) 297 | 298 | resources_path = 'Payload/Telegram.app' 299 | 300 | files_to_convert = [ 301 | f'{resources_path}/en.lproj/Localizable.strings', 302 | f'{resources_path}/en.lproj/InfoPlist.strings', 303 | f'{resources_path}/en.lproj/AppIntentVocabulary.plist', 304 | ] 305 | for filename in files_to_convert: 306 | path = os.path.join(client_folder_name, filename) 307 | 308 | # synced cuz ccl_bplist works with file objects and doesn't support asyncio 309 | with open(path, 'rb') as r_file: 310 | plist = ccl_bplist.load(r_file) 311 | 312 | async with aiofiles.open(path, 'w', encoding='utf-8') as w_file: 313 | await w_file.write(json.dumps(plist, indent=4)) 314 | 315 | files_to_track = files_to_convert + [ 316 | f'{resources_path}/_CodeSignature/CodeResources', 317 | f'{resources_path}/SC_Info/Manifest.plist', 318 | ] 319 | await track_additional_files(files_to_track, client_folder_name, crawled_data_folder) 320 | 321 | resources_folder = os.path.join(client_folder_name, resources_path) 322 | crawled_resources_folder = os.path.join(crawled_data_folder, resources_path) 323 | _, _, hash_of_files_to_track = next(os.walk(resources_folder)) 324 | await track_additional_files( 325 | hash_of_files_to_track, resources_folder, crawled_resources_folder, save_hash_only=True 326 | ) 327 | 328 | def cleanup1(): 329 | os.path.isdir(client_folder_name) and shutil.rmtree(client_folder_name) 330 | os.remove(ipa_filename) 331 | 332 | # sry for copy-paste from macos def ;d 333 | 334 | # .car crawler works only in macOS 335 | if 'darwin' not in platform.system().lower(): 336 | cleanup1() 337 | return 338 | 339 | path_to_car = os.path.join(resources_folder, assets_filename) 340 | await (await asyncio.create_subprocess_exec('chmod', '+x', assets_extractor)).communicate() 341 | process = await asyncio.create_subprocess_exec(f'./{assets_extractor}', '-i', path_to_car, '-o', assets_output_dir) 342 | await process.communicate() 343 | 344 | def cleanup2(): 345 | cleanup1() 346 | os.path.isdir(assets_output_dir) and shutil.rmtree(assets_output_dir) 347 | os.remove(assets_extractor) 348 | 349 | if process.returncode != 0: 350 | cleanup2() 351 | return 352 | 353 | for dir_path, _, hash_of_files_to_track in os.walk(assets_output_dir): 354 | await track_additional_files( 355 | # sry for this shit ;d 356 | [os.path.join(dir_path, file).replace(f'{assets_output_dir}/', '') for file in hash_of_files_to_track], 357 | assets_output_dir, 358 | os.path.join(crawled_data_folder, assets_filename), 359 | save_hash_only=True 360 | ) 361 
| 362 | cleanup2() 363 | 364 | 365 | async def download_telegram_android_and_extract_resources(session: aiohttp.ClientSession) -> None: 366 | await download_telegram_android_stable_dl_and_extract_resources(session) 367 | await download_telegram_android_beta_and_extract_resources(session) 368 | 369 | 370 | async def download_telegram_android_stable_dl_and_extract_resources(session: aiohttp.ClientSession): 371 | download_url = 'https://telegram.org/dl/android/apk' 372 | 373 | await _download_telegram_android_and_extract_resources(session, download_url, 'android-stable-dl') 374 | 375 | 376 | async def download_telegram_android_beta_and_extract_resources(session: aiohttp.ClientSession): 377 | download_url = 'https://telegram.org/dl/android/apk-public-beta' 378 | 379 | await _download_telegram_android_and_extract_resources(session, download_url, 'android-beta') 380 | 381 | 382 | async def _download_telegram_android_and_extract_resources( 383 | session: aiohttp.ClientSession, download_url: str, folder_name: str 384 | ): 385 | crawled_data_folder = os.path.join(OUTPUT_CLIENTS_FOLDER, folder_name) 386 | 387 | if not download_url: 388 | return 389 | 390 | await asyncio.gather( 391 | download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.9.0.jar', 'tool.apk', session), 392 | download_file(download_url, 'android.apk', session), 393 | ) 394 | 395 | def cleanup(): 396 | os.path.isdir('android') and shutil.rmtree('android') 397 | os.remove('tool.apk') 398 | os.remove('android.apk') 399 | 400 | process = await asyncio.create_subprocess_exec( 401 | 'java', '-jar', 'tool.apk', 'd', '-s', '-f', 'android.apk', 402 | stdout=asyncio.subprocess.PIPE, 403 | stderr=asyncio.subprocess.STDOUT 404 | ) 405 | await process.communicate() 406 | 407 | if process.returncode != 0: 408 | cleanup() 409 | return 410 | 411 | files_to_track = [ 412 | 'res/values/strings.xml', 413 | 'res/values/public.xml' 414 | ] 415 | await track_additional_files(files_to_track, 'android', crawled_data_folder) 416 | 417 | cleanup() 418 | 419 | 420 | def parse_string_with_possible_json(input_string) -> dict: 421 | # chat gtp powered code: 422 | try: 423 | # Attempt to parse the entire input string as JSON 424 | json_object = json.loads(input_string) 425 | except json.JSONDecodeError as e: 426 | # Regular expression to find JSON objects within the string 427 | json_regex = r'{[^{}]*}' 428 | matches = re.findall(json_regex, input_string) 429 | 430 | if matches: 431 | # Use the first match as the extracted JSON 432 | json_object = json.loads(matches[0]) 433 | else: 434 | raise ValueError('No JSON found within the input string.') 435 | 436 | return json_object 437 | 438 | 439 | async def crawl_mini_app_wallet(): 440 | crawled_data_folder = os.path.join(OUTPUT_MINI_APPS_FOLDER, 'wallet') 441 | 442 | def cleanup(): 443 | os.path.isdir('wallet') and shutil.rmtree('wallet') 444 | 445 | async def _run_unwebpack_sourcemap(url: str): 446 | process = await asyncio.create_subprocess_exec( 447 | 'python', 'unwebpack_sourcemap.py', '--make-directory', '--detect', url, 'wallet', 448 | ) 449 | await process.communicate() 450 | 451 | if process.returncode != 0: 452 | cleanup() 453 | raise RuntimeError('unwebpack_sourcemap failed') 454 | 455 | crawled_unpacked_folder = os.path.join('wallet', 'webpack', 'wallet-react-form') 456 | 457 | await _run_unwebpack_sourcemap('https://walletbot.me/') 458 | 459 | webpack_chunks_db_path = os.path.join(crawled_unpacked_folder, 'webpack', 'runtime', 'get javascript chunk filename') 460 | webpack_chunks_db 
= parse_string_with_possible_json(open(webpack_chunks_db_path, 'r').read()) 461 | for chunk_id, chunk_name in webpack_chunks_db.items(): 462 | await _run_unwebpack_sourcemap(f'https://walletbot.me/static/js/{chunk_id}.{chunk_name}.js') 463 | 464 | files_to_track = [] 465 | 466 | crawled_empty_0_folder = os.path.join(crawled_unpacked_folder, 'empty_0') 467 | crawled_src_folder = os.path.join(crawled_empty_0_folder, 'src') 468 | for root, folders, files in os.walk(crawled_src_folder): 469 | for file in files: 470 | files_to_track.append(os.path.join(root, file).replace(f'{crawled_empty_0_folder}/', '')) 471 | 472 | await track_additional_files(files_to_track, crawled_empty_0_folder, crawled_data_folder) 473 | 474 | cleanup() 475 | 476 | 477 | async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession) -> str: 478 | import cssutils 479 | from bs4 import BeautifulSoup 480 | 481 | css_parser = cssutils.CSSParser(loglevel=logging.FATAL, raiseExceptions=False) 482 | 483 | headers = {'X-Requested-With': 'XMLHttpRequest'} 484 | content = dict() 485 | 486 | async def _get_page(offset: int): 487 | logger.info(f'Url: {url}, offset: {offset}') 488 | data = {'offset': offset, 'more': 1} 489 | 490 | try: 491 | new_offset = None 492 | async with session.post( 493 | f'{PROTOCOL}{url}', data=data, headers=headers, allow_redirects=False, timeout=TIMEOUT 494 | ) as response: 495 | if (499 < response.status < 600) or (response.status != 200): 496 | logger.debug(f'Resend cuz {response.status}') 497 | new_offset = offset 498 | else: 499 | res_json = await response.json(encoding='UTF-8') 500 | if 'more_html' in res_json and res_json['more_html']: 501 | res_json['more_html'] = re.sub(TRANSLATE_SUGGESTION_REGEX, '', res_json['more_html']) 502 | 503 | soup = BeautifulSoup(res_json['more_html'], 'html.parser') 504 | tr_items = soup.find_all('div', {'class': 'tr-key-row-wrap'}) 505 | for tr_item in tr_items: 506 | tr_key = tr_item.find('div', {'class': 'tr-value-key'}).text 507 | 508 | tr_url = tr_item.find('div', {'class': 'tr-key-row'})['data-href'] 509 | tr_url = f'https://translations.telegram.org{tr_url}' 510 | 511 | tr_photo = tr_item.find('a', {'class': 'tr-value-photo'}) 512 | if tr_photo: 513 | tr_photo = css_parser.parseStyle(tr_photo['style']).backgroundImage[5:-2] 514 | 515 | tr_has_binding = tr_item.find('span', {'class': 'has-binding binding'}) 516 | tr_has_binding = tr_has_binding is not None 517 | 518 | tr_values = tr_item.find_all('span', {'class': 'value'}) 519 | tr_value_singular, *tr_value_plural = [tr_value.decode_contents() for tr_value in tr_values] 520 | tr_values = {'singular': tr_value_singular} 521 | if tr_value_plural: 522 | tr_values['plural'] = tr_value_plural[0] 523 | 524 | content[tr_key] = { 525 | 'url': tr_url, 526 | 'photo_url': tr_photo, 527 | 'has_binding': tr_has_binding, 528 | 'values': tr_values, 529 | } 530 | 531 | new_offset = offset + 200 532 | 533 | new_offset and await _get_page(new_offset) 534 | except (ServerDisconnectedError, TimeoutError, ClientConnectorError): 535 | logger.warning(f'Client or timeout error. 
Retrying {url}; offset {offset}') 536 | await _get_page(offset) 537 | 538 | await _get_page(0) 539 | 540 | content = dict(sorted(content.items())) 541 | return json.dumps(content, indent=4, ensure_ascii=False) 542 | 543 | 544 | async def track_mtproto_methods(): 545 | ##################### 546 | # PATH BROKEN PYROGRAM 547 | import pkgutil 548 | from pathlib import Path 549 | pyrogram_path = Path(pkgutil.get_loader('pyrogram').path).parent 550 | broken_class_path = os.path.join(pyrogram_path, 'raw', 'types', 'story_fwd_header.py') 551 | with open(broken_class_path, 'w', encoding='UTF-8') as f: 552 | # I rly don't want to fix bug in pyrogram about using reserved words as argument names 553 | f.write('class StoryFwdHeader: ...') 554 | ##################### 555 | 556 | from pyrogram import Client 557 | 558 | kw = { 559 | 'api_id': int(os.environ['TELEGRAM_API_ID']), 560 | 'api_hash': os.environ['TELEGRAM_API_HASH'], 561 | 'app_version': '@tgcrawl', 562 | 'in_memory': True 563 | } 564 | 565 | test_dc = 2 566 | test_phone_prefix = '99966' 567 | test_phone_suffix = os.environ.get('TELEGRAM_TEST_PHONE_SUFFIX', random.randint(1000, 9999)) 568 | test_phone_number = f'{test_phone_prefix}{test_dc}{test_phone_suffix}' 569 | test_phone_code = str(test_dc) * 5 570 | 571 | app_test = Client('crawler_test', phone_number=test_phone_number, phone_code=test_phone_code, test_mode=True, **kw) 572 | app = Client('crawler', session_string=os.environ['TELEGRAM_SESSION'], **kw) 573 | 574 | await asyncio.gather(app_test.start(), app.start()) 575 | await asyncio.gather(_fetch_and_track_mtproto(app, ''), _fetch_and_track_mtproto(app_test, 'test')) 576 | 577 | 578 | async def _fetch_and_track_mtproto(app, output_dir): 579 | from pyrogram.raw import functions 580 | from pyrogram.raw.types import InputStickerSetShortName 581 | 582 | configs = { 583 | 'GetConfig': await app.invoke(functions.help.GetConfig()), 584 | 'GetCdnConfig': await app.invoke(functions.help.GetCdnConfig()), 585 | # 'GetInviteText': await app.invoke(functions.help.GetInviteText()), 586 | # 'GetSupport': await app.invoke(functions.help.GetSupport()), 587 | # 'GetSupportName': await app.invoke(functions.help.GetSupportName()), 588 | # 'GetPassportConfig': await app.invoke(functions.help.GetPassportConfig(hash=0)), 589 | 'GetCountriesList': await app.invoke(functions.help.GetCountriesList(lang_code='en', hash=0)), 590 | 'GetAppConfig': await app.invoke(functions.help.GetAppConfig(hash=0)), 591 | # 'GetAppUpdate': await app.invoke(functions.help.GetAppUpdate(source='')), 592 | # 'AnimatedEmoji': await app.invoke( 593 | # functions.messages.GetStickerSet(stickerset=InputStickerSetAnimatedEmoji(), hash=0) 594 | # ), 595 | 'GetAvailableReactions': await app.invoke(functions.messages.GetAvailableReactions(hash=0)), 596 | 'GetPremiumPromo': await app.invoke(functions.help.GetPremiumPromo()), 597 | } 598 | 599 | sticker_set_short_names = { 600 | 'EmojiAnimations', 601 | 'EmojiAroundAnimations', 602 | 'EmojiShortAnimations', 603 | 'EmojiAppearAnimations', 604 | 'EmojiCenterAnimations', 605 | 'AnimatedEmojies', 606 | 'EmojiGenericAnimations', 607 | } 608 | 609 | if app.test_mode: 610 | sticker_set_short_names.add('PremiumGifts') 611 | sticker_set_short_names.add('StatusEmojiWhite') 612 | else: 613 | sticker_set_short_names.add('UtyaDuckFull') 614 | sticker_set_short_names.add('GiftsPremium') 615 | sticker_set_short_names.add('StatusPack') 616 | sticker_set_short_names.add('RestrictedEmoji') 617 | 618 | for short_name in sticker_set_short_names: 619 | sticker_set 
= await app.invoke(functions.messages.GetStickerSet( 620 | stickerset=InputStickerSetShortName(short_name=short_name), hash=0 621 | )) 622 | configs[f'sticker_set/{short_name}'] = sticker_set 623 | 624 | bots_usernames_to_track = {'BotFather', 'DurgerKingBot', 'asmico_attach_bot'} 625 | if app.test_mode: 626 | bots_usernames_to_track.add('izpremiumbot') 627 | else: 628 | bots_usernames_to_track.add('PremiumBot') 629 | 630 | bots_usernames_to_track.clear() 631 | for bot_username in bots_usernames_to_track: 632 | bot_peer = await app.resolve_peer(bot_username) 633 | bot_full = (await app.invoke(functions.users.GetFullUser(id=bot_peer))) 634 | configs[f'bot/{bot_username}'] = f'{{"full_user": {str(bot_full.full_user)}, "users": {str(bot_full.users)}}}' 635 | 636 | peers_to_track = set() 637 | if not app.test_mode: 638 | peers_to_track.add('invoice') 639 | peers_to_track.add('premium') 640 | 641 | peers_to_track.clear() 642 | for peer_id in peers_to_track: 643 | peer = await app.resolve_peer(peer_id) 644 | configs[f'peer/{peer_id}'] = peer 645 | 646 | configs['GetPremiumPromo'].users = [] 647 | configs['GetPremiumPromo'].status_text = 'crawler' 648 | configs['GetPremiumPromo'].status_entities = [] 649 | configs['GetPremiumPromo'].period_options = [] 650 | 651 | configs['GetAppConfig'].hash = 'crawler' 652 | 653 | keys_to_hide = {'access_hash', 'autologin_token', 'file_reference', 'file_reference_base64', 'pending_suggestions'} 654 | if app.test_mode: 655 | keys_to_hide.add('dialog_filters_tooltip') 656 | 657 | def rem_rec(config): 658 | if not isinstance(config, dict): 659 | return 660 | 661 | for key, value in config.items(): 662 | if isinstance(value, dict): 663 | rem_rec(value) 664 | elif isinstance(value, list): 665 | for item in value: 666 | rem_rec(item) 667 | elif key == 'key' and value in keys_to_hide: 668 | config['value']['value'] = 'crawler' 669 | elif key in keys_to_hide: 670 | config[key] = 'crawler' 671 | 672 | methods_to_filter = {'GetAppConfig', 'GetAvailableReactions', 'GetPremiumPromo'} 673 | sticker_sets_to_filter = {f'sticker_set/{name}' for name in sticker_set_short_names} 674 | bots_to_filter = {f'bot/{name}' for name in bots_usernames_to_track} 675 | peers_to_filter = {f'peer/{name}' for name in peers_to_track} 676 | 677 | combined_filter = methods_to_filter | sticker_sets_to_filter | bots_to_filter | peers_to_filter 678 | for config_name in combined_filter: 679 | configs[config_name] = json.loads(str(configs[config_name])) 680 | rem_rec(configs[config_name]) 681 | configs[config_name] = json.dumps(configs[config_name], ensure_ascii=False, indent=4) 682 | 683 | configs['GetConfig'].date = 0 684 | configs['GetConfig'].expires = 0 685 | configs['GetConfig'].autologin_token = 'crawler' 686 | configs['GetConfig'].dc_options = [] 687 | 688 | for file, content in configs.items(): 689 | filename = os.path.join(OUTPUT_MTPROTO_FOLDER, output_dir, f'{file}.json') 690 | os.makedirs(os.path.dirname(filename), exist_ok=True) 691 | async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file: 692 | await w_file.write(str(content)) 693 | 694 | await app.stop() 695 | 696 | 697 | def is_hashable_only_content_type(content_type) -> bool: 698 | hashable_only_content_types = ( 699 | 'png', 700 | 'jpeg', 701 | 'x-icon', 702 | 'gif', 703 | 'mp4', 704 | 'webm', 705 | 'zip', 706 | 'stream', 707 | ) 708 | 709 | for hashable_only_content_type in hashable_only_content_types: 710 | if hashable_only_content_type in content_type: 711 | return True 712 | 713 | return False 714 | 715 | 716 | 
class RetryError(Exception): 717 | def __init__(self, message: str, new_url: Optional[str] = None): 718 | super().__init__(message) 719 | self.new_url = new_url 720 | 721 | 722 | async def crawl(url: str, session: aiohttp.ClientSession, output_dir: str): 723 | while True: 724 | try: 725 | await _crawl(url, session, output_dir) 726 | except (RetryError, ServerDisconnectedError, TimeoutError, ClientConnectorError) as e: 727 | if isinstance(e, RetryError) and e.new_url is not None: 728 | url = e.new_url 729 | logger.warning(f'Client or timeout error ({e}). Retrying {url}') 730 | else: 731 | break 732 | 733 | 734 | SLASH_RETRY_ATTEMPTED = set() 735 | 736 | 737 | async def _crawl(url: str, session: aiohttp.ClientSession, output_dir: str): 738 | truncated_url = (url[:100] + '...') if len(url) > 100 else url 739 | 740 | logger.debug(f'Process {truncated_url}') 741 | async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT, headers=HEADERS) as response: 742 | code = response.status 743 | if 499 < code < 600: 744 | msg = f'Error 5XX. Retrying {truncated_url}' 745 | logger.warning(msg) 746 | raise RetryError(msg) 747 | 748 | if code not in {200, 304} and url not in CRAWL_STATUS_CODE_EXCLUSIONS: 749 | if code in {301, 302, 404}: 750 | base_url = url.rstrip('/') 751 | if base_url not in SLASH_RETRY_ATTEMPTED: 752 | if url.endswith('/'): 753 | flipped_url = base_url 754 | logger.warning(f'{code} slash removal retry for {truncated_url}') 755 | else: 756 | flipped_url = f'{url}/' 757 | logger.warning(f'{code} slash addition retry for {truncated_url}') 758 | 759 | SLASH_RETRY_ATTEMPTED.add(base_url) 760 | raise RetryError(f'{code} slash retry for {truncated_url}', new_url=flipped_url) 761 | 762 | content = await response.text() 763 | clean_content = content.replace('\n', ' ').replace('\r', ' ') 764 | truncated_content = (clean_content[:200] + '...') if len(clean_content) > 200 else clean_content 765 | logger.warning(f'Skip [{code}] {truncated_url}: {truncated_content}') 766 | 767 | return 768 | 769 | # bypass external slashes and so on 770 | url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS] 771 | 772 | content_type = response.content_type 773 | 774 | # handle pure domains and html pages without ext in url as html do enable syntax highlighting 775 | page_type, _ = mimetypes.guess_type(url) 776 | 777 | ext = '' 778 | if page_type: 779 | ext = mimetypes.guess_extension(page_type) or '' 780 | if ext != '' and url.endswith(ext): 781 | ext = '' 782 | 783 | if url.endswith('.tl'): 784 | page_type = 'text/plain' 785 | 786 | if page_type is None or len(url_parts) == 1: 787 | ext = '.html' 788 | content_type = 'text/html' 789 | 790 | if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url) or 'td.telegram.org/current' in url: 791 | ext = '.json' 792 | content_type = 'application/json' 793 | 794 | is_hashable_only = is_hashable_only_content_type(content_type) 795 | # amazing dirt for media files like 796 | # telegram.org/file/811140591/1/q7zZHjgES6s/9d121a89ffb0015837 797 | # with response content type HTML instead of image. 798 | # shame on you. 799 | # sometimes it returns a correct type. 800 | # noice load balancing 801 | is_sucking_file = '/file/' in url and 'text' in content_type 802 | 803 | # I don't add ext by content type for images, and so on cuz TG servers suck. 804 | # Some servers do not return a correct content type. 805 | # Some servers do... 
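    # (Presumably the point of hash-only storage: binary assets are written out
    # as a .sha256 digest instead of raw bytes, so a changed file is still
    # detected and committed without keeping the file itself in the repo.)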
806 | if is_hashable_only or is_sucking_file: 807 | ext = '.sha256' 808 | 809 | filename = os.path.join(output_dir, *url_parts) + ext 810 | os.makedirs(os.path.dirname(filename), exist_ok=True) 811 | 812 | if is_sucking_file or is_hashable_only: 813 | content = await response.read() 814 | async with aiofiles.open(filename, 'w', encoding='utf-8') as f: 815 | await f.write(get_hash(content)) 816 | return 817 | 818 | content = await response.text(encoding='UTF-8') 819 | if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url): 820 | content = await collect_translations_paginated_content(url, session) 821 | 822 | content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content) 823 | content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content) 824 | content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content) 825 | content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content) 826 | content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content) 827 | content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content) 828 | content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content) 829 | content = re.sub(TON_RATE_REGEX, TON_RATE_TEMPLATE, content) 830 | content = re.sub(APK_BETA_TOKEN_REGEX, APK_BETA_TOKEN_TEMPLATE, content) 831 | 832 | # there is a problem with the files with the same name (in the same path) but different case 833 | # the content is random because of the async 834 | # there is only one page with this problem, for now: 835 | # - corefork.telegram.org/constructor/Updates 836 | # - corefork.telegram.org/constructor/updates 837 | async with aiofiles.open(filename, 'w', encoding='utf-8') as f: 838 | logger.debug(f'Write to {filename}') 839 | await f.write(content) 840 | 841 | 842 | async def _crawl_web(session: aiohttp.ClientSession, input_filename: str, output_folder=None): 843 | with open(input_filename, 'r') as f: 844 | tracked_urls = set([l.replace('\n', '') for l in f.readlines()]) 845 | 846 | await asyncio.gather(*[crawl(url, session, output_folder) for url in tracked_urls]) 847 | 848 | 849 | async def crawl_web(session: aiohttp.ClientSession): 850 | await _crawl_web(session, INPUT_FILENAME, OUTPUT_SITES_FOLDER) 851 | 852 | 853 | async def crawl_web_res(session: aiohttp.ClientSession): 854 | await _crawl_web(session, INPUT_RES_FILENAME, OUTPUT_RESOURCES_FOLDER) 855 | 856 | 857 | async def _collect_and_track_all_translation_keys(): 858 | translations = dict() 859 | 860 | start_folder = 'en/' 861 | file_format = '.json' 862 | output_filename = 'translation_keys.json' 863 | 864 | for root, folder, files in os.walk(OUTPUT_TRANSLATIONS_FOLDER): 865 | for file in files: 866 | if not file.endswith(file_format) or file == output_filename: 867 | continue 868 | 869 | async with aiofiles.open(os.path.join(root, file), encoding='utf-8') as f: 870 | content = json.loads(await f.read()) 871 | 872 | client = root[root.index(start_folder) + len(start_folder):] 873 | if client not in translations: 874 | translations[client] = list() 875 | 876 | translations[client].extend(content.keys()) 877 | 878 | for client in translations.keys(): 879 | translations[client] = sorted(translations[client]) 880 | 881 | translations = dict(sorted(translations.items())) 882 | 883 | async with aiofiles.open(os.path.join(OUTPUT_TRANSLATIONS_FOLDER, output_filename), 'w', encoding='utf-8') as f: 884 | await f.write(json.dumps(translations, indent=4)) 885 | 886 | 887 | async def crawl_web_tr(session: aiohttp.ClientSession): 888 | await _crawl_web(session, INPUT_TR_FILENAME, 
OUTPUT_TRANSLATIONS_FOLDER) 889 | await _collect_and_track_all_translation_keys() 890 | 891 | 892 | async def start(mode: str): 893 | # Optimized TCP connector for web crawling 894 | tcp_connector = aiohttp.TCPConnector( 895 | ssl=False, # Disable SSL verification for crawling 896 | use_dns_cache=False, # Disable DNS caching 897 | force_close=True, # Force close connections after use 898 | family=socket.AF_INET, # Use IPv4 only to avoid potential IPv6 issues 899 | ) 900 | 901 | async with aiohttp.ClientSession(connector=tcp_connector, trust_env=True) as session: 902 | mode == 'all' and await asyncio.gather( 903 | crawl_web(session), 904 | crawl_web_res(session), 905 | crawl_web_tr(session), 906 | track_mtproto_methods(), 907 | download_telegram_android_beta_and_extract_resources(session), 908 | download_telegram_macos_beta_and_extract_resources(session), 909 | download_telegram_ios_beta_and_extract_resources(session), 910 | crawl_mini_app_wallet(), 911 | ) 912 | mode == 'web' and await asyncio.gather( 913 | crawl_web(session), 914 | ) 915 | mode == 'web_res' and await asyncio.gather( 916 | crawl_web_res(session), 917 | ) 918 | mode == 'web_tr' and await asyncio.gather( 919 | crawl_web_tr(session), 920 | ) 921 | mode == 'server' and await asyncio.gather( 922 | track_mtproto_methods(), 923 | ) 924 | mode == 'client' and await asyncio.gather( 925 | download_telegram_android_and_extract_resources(session), 926 | download_telegram_macos_beta_and_extract_resources(session), 927 | download_telegram_ios_beta_and_extract_resources(session), 928 | ) 929 | mode == 'mini_app' and await asyncio.gather( 930 | crawl_mini_app_wallet(), 931 | ) 932 | 933 | 934 | if __name__ == '__main__': 935 | run_mode = 'all' 936 | if 'MODE' in os.environ: 937 | run_mode = os.environ['MODE'] 938 | 939 | start_time = time() 940 | logger.info(f'Start crawling content of tracked urls...') 941 | uvloop.run(start(run_mode)) 942 | logger.info(f'Stop crawling content in mode {run_mode}. {time() - start_time} sec.') 943 | --------------------------------------------------------------------------------