├── .github
│   ├── FUNDING.yml
│   └── workflows
│       ├── make_tracked_links_list.yml
│       └── make_files_tree.yml
├── .gitignore
├── requirements.txt
├── LICENSE
├── tracked_tr_links.txt
├── README.md
├── make_and_send_alert.py
├── unwebpack_sourcemap.py
├── make_tracked_links_list.py
├── ccl_bplist.py
└── make_files_tree.py

--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: MarshalX

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.iml
3 | .env
4 | venv
5 | 
6 | tracked_links.txt
7 | *.pyc

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.12.15
2 | aiodns==3.5.0
3 | aiofiles==24.1.0
4 | beautifulsoup4==4.13.4
5 | cssutils==2.11.1
6 | httpx==0.28.1
7 | requests==2.32.4
8 | uvloop==0.21.0
9 | git+https://github.com/MarshalX/pyrogram
10 | TgCrypto==1.2.5

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2021 Il'ya (Marshal)
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.

--------------------------------------------------------------------------------
/.github/workflows/make_tracked_links_list.yml:
--------------------------------------------------------------------------------
1 | name: Generate or update list of tracked links
2 | 
3 | on:
4 |   workflow_dispatch:
5 |   schedule:
6 |     - cron: '* * * * *'
7 |   push:
8 |     # trigger on updated link crawler rules
9 |     branches:
10 |       - main
11 | 
12 | jobs:
13 |   make_tracked_links_file:
14 |     name: Make tracked links file
15 |     runs-on: ubuntu-24.04
16 |     timeout-minutes: 15
17 | 
18 |     steps:
19 | 
20 |       - name: Clone.
21 |         uses: actions/checkout@v5
22 |         with:
23 |           token: ${{ secrets.PAT }}
24 | 
25 |       - name: Setup Python.
26 |         uses: actions/setup-python@v5
27 |         with:
28 |           python-version: 3.13
29 | 
30 |       - name: Install dependencies.
31 |         run: |
32 |           pip install -r requirements.txt
33 | 
34 |       - name: Generate/update file with links.
35 | env: 36 | OUTPUT_FILENAME: "tracked_links_ci.txt" 37 | OUTPUT_RESOURCES_FILENAME: "tracked_res_links_ci.txt" 38 | OUTPUT_TRANSLATIONS_FILENAME: "tracked_tr_links_ci.txt" 39 | run: | 40 | python make_tracked_links_list.py 41 | 42 | - name: Commit and push changes. 43 | run: | 44 | git pull 45 | 46 | mv tracked_links_ci.txt tracked_links.txt 47 | mv tracked_res_links_ci.txt tracked_res_links.txt 48 | mv tracked_tr_links_ci.txt tracked_tr_links.txt 49 | 50 | git config --global user.name "github-actions[bot]" 51 | git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" 52 | 53 | git add . 54 | git commit -m "Update tracked links" 55 | git push 56 | -------------------------------------------------------------------------------- /.github/workflows/make_files_tree.yml: -------------------------------------------------------------------------------- 1 | name: Fetch new content of tracked links and files 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '* * * * *' 7 | push: 8 | # trigger on updated linkbase 9 | branches: 10 | - main 11 | 12 | jobs: 13 | fetch_new_content: 14 | name: Make files tree 15 | runs-on: ${{matrix.os}} 16 | continue-on-error: true 17 | timeout-minutes: 10 18 | 19 | strategy: 20 | fail-fast: false 21 | matrix: 22 | include: 23 | - mode: web 24 | os: macos-13 25 | 26 | - mode: web_res 27 | os: ubuntu-22.04 28 | 29 | - mode: web_tr 30 | os: ubuntu-22.04 31 | 32 | - mode: server 33 | os: ubuntu-22.04 34 | 35 | - mode: client 36 | os: macos-13 37 | 38 | - mode: mini_app 39 | os: ubuntu-22.04 40 | 41 | steps: 42 | 43 | - name: Clone. 44 | uses: actions/checkout@v5 45 | with: 46 | token: ${{ secrets.PAT }} 47 | 48 | - name: Setup Python. 49 | uses: actions/setup-python@v5 50 | with: 51 | python-version: 3.13 52 | 53 | - name: Install dependencies. 54 | run: | 55 | pip install -r requirements.txt 56 | 57 | - name: Generate files tree. 58 | env: 59 | OUTPUT_FOLDER: "data_ci/" 60 | TELEGRAM_SESSION: ${{ secrets.TELEGRAM_SESSION }} 61 | TELEGRAM_SESSION_TEST: ${{ secrets.TELEGRAM_SESSION_TEST }} 62 | TELEGRAM_API_ID: ${{ secrets.TELEGRAM_API_ID }} 63 | TELEGRAM_API_HASH: ${{ secrets.TELEGRAM_API_HASH }} 64 | MODE: ${{ matrix.mode }} 65 | run: | 66 | git pull 67 | python make_files_tree.py 68 | rm -rf __pycache__ 69 | 70 | - name: Prepare data. 71 | if: matrix.mode == 'web' 72 | run: | 73 | git checkout data 74 | git pull 75 | 76 | mv data/web_res data_ci/web_res 77 | mv data/web_tr data_ci/web_tr 78 | mv data/client data_ci/client 79 | mv data/server data_ci/server 80 | mv data/mini_app data_ci/mini_app 81 | 82 | rm -rf data 83 | mv data_ci data 84 | 85 | - name: Prepare data. 86 | if: matrix.mode == 'web_res' 87 | run: | 88 | git checkout data 89 | git pull 90 | 91 | mv data/web data_ci/web 92 | mv data/web_tr data_ci/web_tr 93 | mv data/client data_ci/client 94 | mv data/server data_ci/server 95 | mv data/mini_app data_ci/mini_app 96 | 97 | rm -rf data 98 | mv data_ci data 99 | 100 | - name: Prepare data. 101 | if: matrix.mode == 'web_tr' 102 | run: | 103 | git checkout data 104 | git pull 105 | 106 | mv data/web data_ci/web 107 | mv data/web_res data_ci/web_res 108 | mv data/server data_ci/server 109 | mv data/client data_ci/client 110 | mv data/mini_app data_ci/mini_app 111 | 112 | rm -rf data 113 | mv data_ci data 114 | 115 | - name: Prepare data. 
116 | if: matrix.mode == 'server' 117 | run: | 118 | git checkout data 119 | git pull 120 | 121 | mv data/web data_ci/web 122 | mv data/web_res data_ci/web_res 123 | mv data/web_tr data_ci/web_tr 124 | mv data/client data_ci/client 125 | mv data/mini_app data_ci/mini_app 126 | 127 | rm -rf data 128 | mv data_ci data 129 | 130 | - name: Prepare data. 131 | if: matrix.mode == 'client' 132 | run: | 133 | git checkout data 134 | git pull 135 | 136 | mv data/web data_ci/web 137 | mv data/web_res data_ci/web_res 138 | mv data/web_tr data_ci/web_tr 139 | mv data/server data_ci/server 140 | mv data/mini_app data_ci/mini_app 141 | 142 | rm -rf data 143 | mv data_ci data 144 | 145 | - name: Prepare data. 146 | if: matrix.mode == 'mini_app' 147 | run: | 148 | git checkout data 149 | git pull 150 | 151 | mv data/web data_ci/web 152 | mv data/web_res data_ci/web_res 153 | mv data/web_tr data_ci/web_tr 154 | mv data/server data_ci/server 155 | mv data/client data_ci/client 156 | 157 | rm -rf data 158 | mv data_ci data 159 | 160 | - name: Commit and push changes. 161 | run: | 162 | git config --global user.name "github-actions[bot]" 163 | git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" 164 | 165 | git add . 166 | git commit -m "Update content of files" 167 | git push 168 | -------------------------------------------------------------------------------- /tracked_tr_links.txt: -------------------------------------------------------------------------------- 1 | translations.telegram.org 2 | translations.telegram.org/auth 3 | translations.telegram.org/css/billboard.css 4 | translations.telegram.org/css/contest-zoo.css 5 | translations.telegram.org/css/health.css 6 | translations.telegram.org/css/jquery-ui.min.css 7 | translations.telegram.org/css/tchart.min.css 8 | translations.telegram.org/css/telegram.css 9 | translations.telegram.org/css/translations.css 10 | translations.telegram.org/en 11 | translations.telegram.org/en/android 12 | translations.telegram.org/en/android/bots_and_payments 13 | translations.telegram.org/en/android/camera_and_media 14 | translations.telegram.org/en/android/chat_list 15 | translations.telegram.org/en/android/general 16 | translations.telegram.org/en/android/groups_and_channels 17 | translations.telegram.org/en/android/login 18 | translations.telegram.org/en/android/passport 19 | translations.telegram.org/en/android/private_chats 20 | translations.telegram.org/en/android/profile 21 | translations.telegram.org/en/android/settings 22 | translations.telegram.org/en/android/stories 23 | translations.telegram.org/en/android/unsorted 24 | translations.telegram.org/en/android_x 25 | translations.telegram.org/en/android_x/bots_and_payments 26 | translations.telegram.org/en/android_x/camera_and_media 27 | translations.telegram.org/en/android_x/chat_list 28 | translations.telegram.org/en/android_x/general 29 | translations.telegram.org/en/android_x/groups_and_channels 30 | translations.telegram.org/en/android_x/login 31 | translations.telegram.org/en/android_x/passport 32 | translations.telegram.org/en/android_x/private_chats 33 | translations.telegram.org/en/android_x/profile 34 | translations.telegram.org/en/android_x/settings 35 | translations.telegram.org/en/android_x/stories 36 | translations.telegram.org/en/android_x/unsorted 37 | translations.telegram.org/en/emoji 38 | translations.telegram.org/en/ios 39 | translations.telegram.org/en/ios/bots_and_payments 40 | translations.telegram.org/en/ios/camera_and_media 41 | 
translations.telegram.org/en/ios/chat_list 42 | translations.telegram.org/en/ios/general 43 | translations.telegram.org/en/ios/groups_and_channels 44 | translations.telegram.org/en/ios/login 45 | translations.telegram.org/en/ios/passport 46 | translations.telegram.org/en/ios/private_chats 47 | translations.telegram.org/en/ios/profile 48 | translations.telegram.org/en/ios/settings 49 | translations.telegram.org/en/ios/stories 50 | translations.telegram.org/en/ios/unsorted 51 | translations.telegram.org/en/macos 52 | translations.telegram.org/en/macos/bots_and_payments 53 | translations.telegram.org/en/macos/camera_and_media 54 | translations.telegram.org/en/macos/chat_list 55 | translations.telegram.org/en/macos/general 56 | translations.telegram.org/en/macos/groups_and_channels 57 | translations.telegram.org/en/macos/login 58 | translations.telegram.org/en/macos/passport 59 | translations.telegram.org/en/macos/private_chats 60 | translations.telegram.org/en/macos/profile 61 | translations.telegram.org/en/macos/settings 62 | translations.telegram.org/en/macos/stories 63 | translations.telegram.org/en/macos/unsorted 64 | translations.telegram.org/en/tdesktop 65 | translations.telegram.org/en/tdesktop/bots_and_payments 66 | translations.telegram.org/en/tdesktop/camera_and_media 67 | translations.telegram.org/en/tdesktop/chat_list 68 | translations.telegram.org/en/tdesktop/general 69 | translations.telegram.org/en/tdesktop/groups_and_channels 70 | translations.telegram.org/en/tdesktop/login 71 | translations.telegram.org/en/tdesktop/passport 72 | translations.telegram.org/en/tdesktop/private_chats 73 | translations.telegram.org/en/tdesktop/profile 74 | translations.telegram.org/en/tdesktop/settings 75 | translations.telegram.org/en/tdesktop/stories 76 | translations.telegram.org/en/tdesktop/unsorted 77 | translations.telegram.org/en/unigram 78 | translations.telegram.org/en/unigram/bots_and_payments 79 | translations.telegram.org/en/unigram/camera_and_media 80 | translations.telegram.org/en/unigram/chat_list 81 | translations.telegram.org/en/unigram/general 82 | translations.telegram.org/en/unigram/groups_and_channels 83 | translations.telegram.org/en/unigram/login 84 | translations.telegram.org/en/unigram/passport 85 | translations.telegram.org/en/unigram/private_chats 86 | translations.telegram.org/en/unigram/profile 87 | translations.telegram.org/en/unigram/settings 88 | translations.telegram.org/en/unigram/stories 89 | translations.telegram.org/en/unigram/unsorted 90 | translations.telegram.org/en/weba 91 | translations.telegram.org/en/weba/bots_and_payments 92 | translations.telegram.org/en/weba/camera_and_media 93 | translations.telegram.org/en/weba/chat_list 94 | translations.telegram.org/en/weba/general 95 | translations.telegram.org/en/weba/groups_and_channels 96 | translations.telegram.org/en/weba/login 97 | translations.telegram.org/en/weba/passport 98 | translations.telegram.org/en/weba/private_chats 99 | translations.telegram.org/en/weba/profile 100 | translations.telegram.org/en/weba/settings 101 | translations.telegram.org/en/weba/stories 102 | translations.telegram.org/en/weba/unsorted 103 | translations.telegram.org/en/webk 104 | translations.telegram.org/en/webk/bots_and_payments 105 | translations.telegram.org/en/webk/camera_and_media 106 | translations.telegram.org/en/webk/chat_list 107 | translations.telegram.org/en/webk/general 108 | translations.telegram.org/en/webk/groups_and_channels 109 | translations.telegram.org/en/webk/login 110 | 
translations.telegram.org/en/webk/passport
111 | translations.telegram.org/en/webk/private_chats
112 | translations.telegram.org/en/webk/profile
113 | translations.telegram.org/en/webk/settings
114 | translations.telegram.org/en/webk/stories
115 | translations.telegram.org/en/webk/unsorted

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## 🕷 Telegram Crawler
2 | 
3 | This project automatically detects changes made
4 | to the official Telegram sites and beta clients. This is necessary for
5 | anticipating future updates and other news
6 | (new vacancies, API updates, etc.).
7 | 
8 | | Name | Commits | Status |
9 | |----------------------| -------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------|
10 | | Data tracker | [Commits](https://github.com/MarshalX/telegram-crawler/commits/data) | ![Fetch new content of tracked links and files](https://github.com/MarshalX/telegram-crawler/actions/workflows/make_files_tree.yml/badge.svg?branch=main) |
11 | | Site links collector | [Commits](https://github.com/MarshalX/telegram-crawler/commits/main/tracked_links.txt) | ![Generate or update list of tracked links](https://github.com/MarshalX/telegram-crawler/actions/workflows/make_tracked_links_list.yml/badge.svg?branch=main) |
12 | 
13 | * ✅ passing – new changes
14 | * ❌ failing – no changes
15 | 
16 | You should subscribe to the **[channel with alerts](https://t.me/tgcrawl)** to stay updated.
17 | Copies of the Telegram websites and clients' resources are stored **[here](https://github.com/MarshalX/telegram-crawler/tree/data/data)**.
18 | 
19 | ![GitHub pretty diff example](https://i.imgur.com/BK8UAju.png)
20 | 
21 | ### How it works
22 | 
23 | 1. [Link crawling](make_tracked_links_list.py) runs **as often as possible**.
24 | It starts crawling from the home page of the site,
25 | detects relative and absolute sub-links and recursively repeats the operation,
26 | and writes a list of unique links for future content comparison.
27 | Additionally, links can be added by hand to help the script
28 | find more hidden links (links to which nothing refers). To manage exceptions,
29 | there is a [system of rules](#example-of-link-crawler-rules-configuration)
30 | for the link crawler.
31 | 
32 | 2. [Content crawling](make_files_tree.py) is launched **as often as
33 | possible** and uses the list of links collected in step 1.
34 | Going through that list, it fetches the contents and builds a tree of subfolders
35 | and files, removing all dynamic content from them. It also downloads the beta version
36 | of the Android client, decompiles it and tracks its resources. Resources of
37 | Telegram for macOS are tracked too.
38 | 
39 | 3. Everything runs on [GitHub Actions](.github/workflows/), without any servers of your own.
40 | You can just fork this repository and run your own tracker system.
41 | Workflows launch the scripts and commit the changes. All file changes are tracked
42 | by Git and beautifully displayed on GitHub. A workflow build succeeds
43 | only if there are changes on the Telegram websites; otherwise, the
44 | workflow fails. When a build is successful, notifications can be sent to the
45 | Telegram channel and so on.
46 | 
47 | ### FAQ
48 | 
49 | **Q:** How often is "**as often as possible**"?
50 | 
51 | **A:** TL;DR: the content update action runs every ~10 minutes. More info:
52 | - [Scheduled actions cannot be run more than once every 5 minutes.](https://github.blog/changelog/2019-11-01-github-actions-scheduled-jobs-maximum-frequency-is-changing/)
53 | - [GitHub Actions workflow not triggering at scheduled time](https://upptime.js.org/blog/2021/01/22/github-actions-schedule-not-working/).
54 | 
55 | **Q:** Why are there two separate crawl scripts instead of one?
56 | 
57 | **A:** Because the original idea was to update the tracked links once an hour,
58 | and it was convenient to use separate scripts and workflows for that.
59 | After the Telegram 7.7 update, I realised that finding new blog posts that slowly was a bad idea.
60 | 
61 | **Q:** Why does the alert-sending script have a while loop?
62 | 
63 | **A:** Because the GitHub API doesn't return information about a commit immediately
64 | after a push to the repository. Therefore, the script waits for the information to appear...
65 | 
66 | **Q:** Why are you using a GitHub Personal Access Token in the actions/checkout workflow step?
67 | 
68 | **A:** To be able to trigger other workflows from the push event. More info:
69 | - [Action does not trigger another on push tag action](https://github.community/t/action-does-not-trigger-another-on-push-tag-action/17148)
70 | 
71 | **Q:** Why are you using a GitHub PAT in [make_and_send_alert.py](make_and_send_alert.py)?
72 | 
73 | **A:** To increase the GitHub API rate limits.
74 | 
75 | **Q:** Why are you decompiling the .apk file on each run?
76 | 
77 | **A:** Because it doesn't take much time. I am decompiling only
78 | the resources (the -s flag of apktool disables disassembly of dex files).
79 | Writing a check for whether decompilation is needed, based on the hash of the apk file,
80 | would take more time.
81 | 
82 | ### Example of link crawler rules configuration
83 | 
84 | ```python
85 | CRAWL_RULES = {
86 |     # every rule is regex
87 |     # empty string means match any url
88 |     # allow rules have higher priority than deny rules
89 |     'translations.telegram.org': {
90 |         'allow': {
91 |             r'^[^/]*$',  # root
92 |             r'org/[^/]*/$',  # 1 lvl sub
93 |             r'/en/[a-z_]+/$'  # 1 lvl after /en/
94 |         },
95 |         'deny': {
96 |             '',  # all
97 |         }
98 |     },
99 |     'bugs.telegram.org': {
100 |         'deny': {
101 |             '',  # deny all sub domain
102 |         },
103 |     },
104 | }
105 | ```
106 | 
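These rules are applied per domain when the crawler decides whether to follow a link. Below is a simplified sketch of that matching logic; the real implementation is `should_exclude` in [make_tracked_links_list.py](make_tracked_links_list.py), which additionally merges the per-domain rules with a set of global rules and caches the results:

```python
import re

def should_exclude(url: str, rules: dict) -> bool:
    """Return True if the crawler must skip this URL."""
    deny = rules.get('deny', set())
    allow = rules.get('allow', set())

    # a URL is excluded if any deny regex matches it...
    excluded = any(re.search(regex, url) for regex in deny)
    # ...unless an allow regex also matches (allow wins over deny)
    if excluded and any(re.search(regex, url) for regex in allow):
        excluded = False
    return excluded

rules = {
    'allow': {r'^[^/]*$'},  # only the root page of the domain
    'deny': {''},           # empty regex matches any url
}
assert should_exclude('translations.telegram.org', rules) is False
assert should_exclude('translations.telegram.org/some/page', rules) is True
```

Note that an empty deny pattern matches every URL, so such a domain is crawled only where an allow rule explicitly re-enables a path.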
107 | ### Current hidden urls list
108 | 
109 | ```python
110 | HIDDEN_URLS = {
111 |     # 'corefork.telegram.org',  # disabled
112 | 
113 |     'telegram.org/privacy/gmailbot',
114 |     'telegram.org/tos',
115 |     'telegram.org/tour',
116 |     'telegram.org/evolution',
117 | 
118 |     'desktop.telegram.org/changelog',
119 | }
120 | ```
121 | 
122 | ### License
123 | 
124 | Licensed under the [MIT License](LICENSE).
125 | -------------------------------------------------------------------------------- /make_and_send_alert.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import re 5 | from typing import Tuple 6 | 7 | import aiohttp 8 | 9 | COMMIT_SHA = os.environ['COMMIT_SHA'] 10 | 11 | # commits for test alert builder 12 | # COMMIT_SHA = '4015bd9c48b45910727569fff5e770000d85d207' # all clients + server and test server + web 13 | # COMMIT_SHA = '9cc3f0fb7c390c8cb8b789e9377f10ed5e80a089' # web and web res together 14 | # COMMIT_SHA = '4efaf918af43054ba3ff76068e83d135a9a2535d' # web 15 | # COMMIT_SHA = 'e2d725c2b3813d7c170f50b0ab21424a71466f6d' # web res 16 | 17 | TELEGRAM_BOT_TOKEN = os.environ['TELEGRAM_BOT_TOKEN'] 18 | DISCORD_BOT_TOKEN = os.environ['DISCORD_BOT_TOKEN'] 19 | GITHUB_PAT = os.environ['GITHUB_PAT'] 20 | 21 | REPOSITORY = os.environ.get('REPOSITORY', 'MarshalX/telegram-crawler') 22 | ROOT_TREE_DIR = os.environ.get('ROOT_TREE_DIR', 'data') 23 | 24 | CHAT_ID = os.environ.get('CHAT_ID', '@tgcrawl') 25 | DISCORD_CHANNEL_ID = os.environ.get('DISCORD_CHANNEL_ID', '1116390634249523283') 26 | 27 | BASE_GITHUB_API = 'https://api.github.com/' 28 | GITHUB_LAST_COMMITS = 'repos/{repo}/commits/{sha}' 29 | 30 | BASE_TELEGRAM_API = 'https://api.telegram.org/bot{token}/' 31 | TELEGRAM_SEND_MESSAGE = 'sendMessage' 32 | 33 | logger = logging.getLogger(__name__) 34 | logging.basicConfig(level=logging.INFO) 35 | 36 | STATUS_TO_EMOJI = { 37 | 'added': '✅', 38 | 'modified': '📝', 39 | 'removed': '❌', 40 | 'renamed': '🔄', 41 | 'copied': '📋', 42 | 'changed': '📝', 43 | 'unchanged': '📝', 44 | } 45 | 46 | AVAILABLE_HASHTAGS = { 47 | 'web_tr', 'web_res', 'web', 'server', 'test_server', 'client', 48 | 'ios', 'macos', 'android', 'android_dl', 'mini_app', 'wallet' 49 | } 50 | HASHTAGS_PATTERNS = { 51 | # regex will be more flexible. for example, in issue with double hashtag '#web #web_res' when data/res not changed 52 | 'web_tr': os.path.join(ROOT_TREE_DIR, 'web_tr'), 53 | 'web_res': os.path.join(ROOT_TREE_DIR, 'web_res'), 54 | 'web': os.path.join(ROOT_TREE_DIR, 'web'), 55 | 'server': os.path.join(ROOT_TREE_DIR, 'server'), 56 | 'test_server': os.path.join(ROOT_TREE_DIR, 'server', 'test'), 57 | 'client': os.path.join(ROOT_TREE_DIR, 'client'), 58 | 'ios': os.path.join(ROOT_TREE_DIR, 'client', 'ios-beta'), 59 | 'macos': os.path.join(ROOT_TREE_DIR, 'client', 'macos-beta'), 60 | 'android': os.path.join(ROOT_TREE_DIR, 'client', 'android-beta'), 61 | 'android_dl': os.path.join(ROOT_TREE_DIR, 'client', 'android-stable-dl'), 62 | 'mini_app': os.path.join(ROOT_TREE_DIR, 'mini_app'), 63 | 'wallet': os.path.join(ROOT_TREE_DIR, 'mini_app', 'wallet'), 64 | } 65 | # order is important! 
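# (the 'web' path is a prefix of the 'web_tr' and 'web_res' paths, so the
# more specific entries must come first: the prefix-stripping loop in main()
# breaks after the first startswith() match)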
66 | PATHS_TO_REMOVE_FROM_ALERT = [ 67 | os.path.join(ROOT_TREE_DIR, 'web_tr'), 68 | os.path.join(ROOT_TREE_DIR, 'web_res'), 69 | os.path.join(ROOT_TREE_DIR, 'web'), 70 | os.path.join(ROOT_TREE_DIR, 'server'), 71 | os.path.join(ROOT_TREE_DIR, 'client'), 72 | os.path.join(ROOT_TREE_DIR, 'mini_app'), 73 | ] 74 | 75 | FORUM_CHAT_ID = '@tfcrawl' 76 | HASHTAG_TO_TOPIC = { 77 | 'web': '2200', 78 | 'web_tr': '2202', 79 | 'web_res': '2206', 80 | 'server': '2317', 81 | 'ios': '2194', 82 | 'macos': '2187', 83 | 'android': '2190', 84 | 'android_dl': '12235', 85 | 'wallet': '5685', 86 | } 87 | 88 | GITHUB_API_LIMIT_PER_HOUR = 5_000 89 | COUNT_OF_RUNNING_WORKFLOW_AT_SAME_TIME = 5 # just random number ;d 90 | 91 | ROW_PER_STATUS = 5 92 | 93 | LAST_PAGE_NUMBER_REGEX = r'page=(\d+)>; rel="last"' 94 | 95 | 96 | async def send_req_until_success(session: aiohttp.ClientSession, **kwargs) -> Tuple[dict, int]: 97 | delay = 5 # in sec 98 | count_of_retries = int(GITHUB_API_LIMIT_PER_HOUR / COUNT_OF_RUNNING_WORKFLOW_AT_SAME_TIME / delay) 99 | 100 | last_page_number = 1 101 | retry_number = 1 102 | while retry_number <= count_of_retries: 103 | retry_number += 1 104 | 105 | res = await session.get(**kwargs) 106 | if res.status != 200: 107 | await asyncio.sleep(delay) 108 | continue 109 | 110 | json = await res.json() 111 | 112 | pagination_data = res.headers.get('Link', '') 113 | matches = re.findall(LAST_PAGE_NUMBER_REGEX, pagination_data) 114 | if matches: 115 | last_page_number = int(matches[0]) 116 | 117 | return json, last_page_number 118 | 119 | raise RuntimeError('Surprise. Time is over') 120 | 121 | 122 | async def send_telegram_alert(session: aiohttp.ClientSession, text: str, thread_id=None) -> aiohttp.ClientResponse: 123 | params = { 124 | 'chat_id': CHAT_ID, 125 | 'parse_mode': 'HTML', 126 | 'text': text, 127 | 'disable_web_page_preview': 1, 128 | } 129 | if thread_id: 130 | params['chat_id'] = FORUM_CHAT_ID 131 | params['message_thread_id'] = thread_id 132 | 133 | return await session.get( 134 | url=f'{BASE_TELEGRAM_API}{TELEGRAM_SEND_MESSAGE}'.format(token=TELEGRAM_BOT_TOKEN), params=params 135 | ) 136 | 137 | 138 | async def send_discord_alert( 139 | session: aiohttp.ClientSession, commit_hash: str, commit_url: str, fields: list, hashtags: str 140 | ) -> aiohttp.ClientResponse: 141 | url = f'https://discord.com/api/channels/{DISCORD_CHANNEL_ID}/messages' 142 | 143 | headers = { 144 | 'Authorization': f'Bot {DISCORD_BOT_TOKEN}', 145 | } 146 | 147 | embed_data = { 148 | 'title': f'New changes in Telegram ({commit_hash})', 149 | 'color': 0xe685cc, 150 | 'url': commit_url, 151 | 'fields': fields, 152 | 'author': { 153 | 'name': 'Marshal', 154 | 'url': 'https://github.com/MarshalX', 155 | 'icon_url': 'https://avatars.githubusercontent.com/u/15520314?v=4', 156 | }, 157 | 'footer': { 158 | 'text': hashtags, 159 | } 160 | } 161 | 162 | payload = { 163 | 'embed': embed_data 164 | } 165 | 166 | return await session.post(url=url, headers=headers, json=payload) 167 | 168 | 169 | async def main() -> None: 170 | async with aiohttp.ClientSession() as session: 171 | commit_data, last_page = await send_req_until_success( 172 | session=session, 173 | url=f'{BASE_GITHUB_API}{GITHUB_LAST_COMMITS}'.format(repo=REPOSITORY, sha=COMMIT_SHA), 174 | headers={ 175 | 'Authorization': f'token {GITHUB_PAT}' 176 | } 177 | ) 178 | commit_files = commit_data['files'] 179 | 180 | coroutine_list = list() 181 | for current_page in range(2, last_page + 1): 182 | coroutine_list.append(send_req_until_success( 183 | session=session, 184 | 
url=f'{BASE_GITHUB_API}{GITHUB_LAST_COMMITS}?page={current_page}'.format( 185 | repo=REPOSITORY, sha=COMMIT_SHA 186 | ), 187 | headers={ 188 | 'Authorization': f'token {GITHUB_PAT}' 189 | } 190 | )) 191 | 192 | paginated_responses = await asyncio.gather(*coroutine_list) 193 | for json_response, _ in paginated_responses: 194 | commit_files.extend(json_response['files']) 195 | 196 | commit_files = [file for file in commit_files if 'translations.telegram.org/' not in file['filename']] 197 | if not commit_files: 198 | return 199 | 200 | commit_hash = commit_data['sha'][:7] 201 | html_url = commit_data['html_url'] 202 | 203 | alert_text = f'New changes of Telegram\n\n' 204 | alert_hashtags = set() 205 | 206 | global AVAILABLE_HASHTAGS 207 | available_hashtags = AVAILABLE_HASHTAGS.copy() 208 | 209 | changes = {k: [] for k in STATUS_TO_EMOJI.keys()} 210 | changes_md = {k: [] for k in STATUS_TO_EMOJI.keys()} 211 | for file in commit_files: 212 | for available_hashtag in available_hashtags: 213 | pattern = HASHTAGS_PATTERNS[available_hashtag] 214 | if pattern in file['filename']: 215 | alert_hashtags.add(available_hashtag) 216 | 217 | # optimize substring search 218 | available_hashtags -= alert_hashtags 219 | 220 | changed_url = file['filename'].replace('.html', '') 221 | for path_to_remove in PATHS_TO_REMOVE_FROM_ALERT: 222 | if changed_url.startswith(path_to_remove): 223 | changed_url = changed_url[len(path_to_remove) + 1:] 224 | break # can't occur more than one time 225 | 226 | status = STATUS_TO_EMOJI[file['status']] 227 | changes[file['status']].append(f'{status} {changed_url}') 228 | changes_md[file['status']].append(f'- {changed_url}') 229 | 230 | discord_embed_fields = [] 231 | for i, [status, text_list] in enumerate(changes.items()): 232 | if not text_list: 233 | continue 234 | 235 | alert_text += '\n'.join(text_list[:ROW_PER_STATUS]) + '\n' 236 | discord_field_value = '\n'.join(changes_md[status][:ROW_PER_STATUS]) + '\n' 237 | 238 | if len(text_list) > ROW_PER_STATUS: 239 | count = len(text_list) - ROW_PER_STATUS 240 | alert_text += f'And {count} {status} actions more..\n' 241 | discord_field_value += f'And **{count}** {status} actions more..\n' 242 | 243 | discord_embed_fields.append({ 244 | 'name': f'{STATUS_TO_EMOJI[status]} {status.capitalize()}', 245 | 'value': discord_field_value, 246 | 'inline': False 247 | }) 248 | 249 | alert_text += '\n' 250 | 251 | link_text = f'GitHub · MarshalX/telegram-crawler@{commit_hash}' 252 | alert_text += f'{link_text}' 253 | logger.info(alert_text) 254 | 255 | if 'web_tr' in alert_hashtags or 'web_res' in alert_hashtags: 256 | alert_hashtags.remove('web') 257 | 258 | for hashtag, topic_thread_id in HASHTAG_TO_TOPIC.items(): 259 | if hashtag in alert_hashtags: 260 | logger.info(f'Sending alert to the forum. 
Topic: {topic_thread_id}') 261 | telegram_response = await send_telegram_alert(session, alert_text, topic_thread_id) 262 | logger.debug(await telegram_response.read()) 263 | 264 | hashtags = ' '.join([f'#{hashtag}' for hashtag in sorted(alert_hashtags)]) 265 | if alert_hashtags: 266 | alert_text += '\n\n' + hashtags 267 | 268 | telegram_response = await send_telegram_alert(session, alert_text) 269 | logger.debug(await telegram_response.read()) 270 | 271 | discord_response = await send_discord_alert(session, commit_hash, html_url, discord_embed_fields, hashtags) 272 | logger.debug(await discord_response.read()) 273 | 274 | 275 | if __name__ == '__main__': 276 | asyncio.get_event_loop().run_until_complete(main()) 277 | -------------------------------------------------------------------------------- /unwebpack_sourcemap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | unwebpack_sourcemap.py 4 | by rarecoil (github.com/rarecoil/unwebpack-sourcemap) 5 | 6 | Reads Webpack source maps and extracts the disclosed 7 | uncompiled/commented source code for review. Can detect and 8 | attempt to read sourcemaps from Webpack bundles with the `-d` 9 | flag. Puts source into a directory structure similar to dev. 10 | 11 | MIT License 12 | 13 | Copyright (c) 2019 rarecoil. 14 | 15 | Permission is hereby granted, free of charge, to any person obtaining a copy 16 | of this software and associated documentation files (the "Software"), to deal 17 | in the Software without restriction, including without limitation the rights 18 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19 | copies of the Software, and to permit persons to whom the Software is 20 | furnished to do so, subject to the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included in all 23 | copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 | SOFTWARE. 32 | """ 33 | 34 | import argparse 35 | import json 36 | import os 37 | import re 38 | import string 39 | import sys 40 | from urllib.parse import urlparse 41 | from unicodedata import normalize 42 | 43 | import requests 44 | from bs4 import BeautifulSoup, SoupStrainer 45 | 46 | 47 | class SourceMapExtractor(object): 48 | """Primary SourceMapExtractor class. Feed this arguments.""" 49 | 50 | _target = None 51 | _is_local = False 52 | _attempt_sourcemap_detection = False 53 | _output_directory = "" 54 | _target_extracted_sourcemaps = [] 55 | 56 | _path_sanitiser = None 57 | 58 | def __init__(self, options): 59 | """Initialize the class.""" 60 | if 'output_directory' not in options: 61 | raise SourceMapExtractorError("output_directory must be set in options.") 62 | else: 63 | self._output_directory = os.path.abspath(options['output_directory']) 64 | if not os.path.isdir(self._output_directory): 65 | if options['make_directory'] is True: 66 | os.mkdir(self._output_directory) 67 | else: 68 | raise SourceMapExtractorError( 69 | "output_directory does not exist. 
Pass --make-directory to auto-make it.") 70 | 71 | self._path_sanitiser = PathSanitiser(self._output_directory) 72 | 73 | if options['disable_ssl_verification'] == True: 74 | self.disable_verify_ssl = True 75 | else: 76 | self.disable_verify_ssl = False 77 | 78 | if options['local'] == True: 79 | self._is_local = True 80 | 81 | if options['detect'] == True: 82 | self._attempt_sourcemap_detection = True 83 | 84 | self._validate_target(options['uri_or_file']) 85 | 86 | def run(self): 87 | """Run extraction process.""" 88 | if self._is_local == False: 89 | if self._attempt_sourcemap_detection: 90 | detected_sourcemaps = self._detect_js_sourcemaps(self._target) 91 | for sourcemap in detected_sourcemaps: 92 | self._parse_remote_sourcemap(sourcemap) 93 | else: 94 | self._parse_remote_sourcemap(self._target) 95 | 96 | else: 97 | self._parse_sourcemap(self._target) 98 | 99 | def _validate_target(self, target): 100 | """Do some basic validation on the target.""" 101 | parsed = urlparse(target) 102 | if self._is_local is True: 103 | self._target = os.path.abspath(target) 104 | if not os.path.isfile(self._target): 105 | raise SourceMapExtractorError( 106 | "uri_or_file is set to be a file, but doesn't seem to exist. check your path.") 107 | else: 108 | if parsed.scheme == "": 109 | raise SourceMapExtractorError("uri_or_file isn't a URI, and --local was not set. set --local?") 110 | file, ext = os.path.splitext(parsed.path) 111 | self._target = target 112 | if ext != '.map' and self._attempt_sourcemap_detection is False: 113 | print("WARNING: URI does not have .map extension, and --detect is not flagged.") 114 | 115 | def _parse_remote_sourcemap(self, uri): 116 | """GET a remote sourcemap and parse it.""" 117 | data, final_uri = self._get_remote_data(uri) 118 | if data is not None: 119 | self._parse_sourcemap(data, True) 120 | else: 121 | print("WARNING: Could not retrieve sourcemap from URI %s" % final_uri) 122 | 123 | def _detect_js_sourcemaps(self, uri): 124 | """Pull HTML and attempt to find JS files, then read the JS files and look for sourceMappingURL.""" 125 | remote_sourcemaps = [] 126 | data, final_uri = self._get_remote_data(uri) 127 | 128 | if final_uri.endswith('.js'): 129 | print("Detecting sourcemaps in JS at %s" % final_uri) 130 | # trick to not send the same request twice 131 | self._enrich_with_remote_sourcemaps('tgcrawl', remote_sourcemaps, js_data=data, last_target_uri=final_uri) 132 | return remote_sourcemaps 133 | 134 | # TODO: scan to see if this is a sourcemap instead of assuming HTML 135 | print("Detecting sourcemaps in HTML at %s" % final_uri) 136 | script_strainer = SoupStrainer("script", src=True) 137 | try: 138 | soup = BeautifulSoup(data, "html.parser", parse_only=script_strainer) 139 | except: 140 | raise SourceMapExtractorError("Could not parse HTML at URI %s" % final_uri) 141 | 142 | for script in soup: 143 | source = script['src'] 144 | parsed_uri = urlparse(source) 145 | if parsed_uri.scheme != '': 146 | next_target_uri = source 147 | else: 148 | current_uri = urlparse(final_uri) 149 | built_uri = current_uri.scheme + "://" + current_uri.netloc + source 150 | next_target_uri = built_uri 151 | self._enrich_with_remote_sourcemaps(next_target_uri, remote_sourcemaps) 152 | 153 | return remote_sourcemaps 154 | 155 | def _enrich_with_remote_sourcemaps(self, next_target_uri, remote_sourcemaps, js_data=None, last_target_uri=None): 156 | if last_target_uri is None or js_data is None: 157 | js_data, last_target_uri = self._get_remote_data(next_target_uri) 158 | 159 | # get last 
line of file 160 | last_line = js_data.rstrip().split("\n")[-1] 161 | regex = "\\/\\/#\s*sourceMappingURL=(.*)$" 162 | matches = re.search(regex, last_line) 163 | if matches: 164 | asset = matches.groups(0)[0].strip() 165 | asset_target = urlparse(asset) 166 | if asset_target.scheme != '': 167 | print("Detected sourcemap at remote location %s" % asset) 168 | remote_sourcemaps.append(asset) 169 | else: 170 | current_uri = urlparse(last_target_uri) 171 | asset_uri = current_uri.scheme + '://' + \ 172 | current_uri.netloc + \ 173 | os.path.dirname(current_uri.path) + \ 174 | '/' + asset 175 | print("Detected sourcemap at remote location %s" % asset_uri) 176 | remote_sourcemaps.append(asset_uri) 177 | 178 | def _parse_sourcemap(self, target, is_str=False): 179 | map_data = "" 180 | if is_str is False: 181 | if os.path.isfile(target): 182 | with open(target, 'r', encoding='utf-8', errors='ignore') as f: 183 | map_data = f.read() 184 | else: 185 | map_data = target 186 | 187 | # with the sourcemap data, pull directory structures 188 | try: 189 | map_object = json.loads(map_data) 190 | except json.JSONDecodeError: 191 | print("ERROR: Failed to parse sourcemap %s. Are you sure this is a sourcemap?" % target) 192 | return False 193 | 194 | # we need `sourcesContent` and `sources`. 195 | # do a basic validation check to make sure these exist and agree. 196 | if 'sources' not in map_object or 'sourcesContent' not in map_object: 197 | print("ERROR: Sourcemap does not contain sources and/or sourcesContent, cannot extract.") 198 | return False 199 | 200 | if len(map_object['sources']) != len(map_object['sourcesContent']): 201 | print("WARNING: sources != sourcesContent, filenames may not match content") 202 | 203 | for source, content in zip(map_object['sources'], map_object['sourcesContent']): 204 | # remove webpack:// from paths 205 | # and do some checks on it 206 | write_path = self._get_sanitised_file_path(source) 207 | if write_path is None: 208 | print("ERROR: Could not sanitize path %s" % source) 209 | continue 210 | 211 | os.makedirs(os.path.dirname(write_path), mode=0o755, exist_ok=True) 212 | with open(write_path, 'w', encoding='utf-8', errors='ignore', newline='') as f: 213 | print("Writing %s..." % os.path.basename(write_path)) 214 | f.write(content) 215 | 216 | def _get_sanitised_file_path(self, sourcePath): 217 | """Sanitise webpack paths for separators/relative paths""" 218 | sourcePath = sourcePath.replace("webpack:///", "") 219 | exts = sourcePath.split(" ") 220 | 221 | if exts[0] == "external": 222 | print("WARNING: Found external sourcemap %s, not currently supported. 
Skipping" % exts[1]) 223 | return None 224 | 225 | path, filename = os.path.split(sourcePath) 226 | if path[:2] == './': 227 | path = path[2:] 228 | if path[:3] == '../': 229 | path = 'parent_dir/' + path[3:] 230 | if path[:1] == '.': 231 | path = "" 232 | 233 | filepath = self._path_sanitiser.make_valid_file_path(path, filename) 234 | return filepath 235 | 236 | def _get_remote_data(self, uri): 237 | """Get remote data via http.""" 238 | 239 | if self.disable_verify_ssl == True: 240 | result = requests.get(uri, verify=False) 241 | else: 242 | result = requests.get(uri) 243 | 244 | # Redirect 245 | if not uri == result.url: 246 | return self._get_remote_data(result.url) 247 | 248 | if result.status_code == 200: 249 | result.encoding = 'utf-8' 250 | return result.text, result.url 251 | else: 252 | print("WARNING: Got status code %d for URI %s" % (result.status_code, result.url)) 253 | return None, result.url 254 | 255 | 256 | class PathSanitiser(object): 257 | """https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python""" 258 | 259 | EMPTY_NAME = "empty" 260 | 261 | empty_idx = 0 262 | root_path = "" 263 | 264 | def __init__(self, root_path): 265 | self.root_path = root_path 266 | 267 | def ensure_directory_exists(self, path_directory): 268 | if not os.path.exists(path_directory): 269 | os.makedirs(path_directory) 270 | 271 | def os_path_separators(self): 272 | seps = [] 273 | for sep in os.path.sep, os.path.altsep: 274 | if sep: 275 | seps.append(sep) 276 | return seps 277 | 278 | def sanitise_filesystem_name(self, potential_file_path_name): 279 | # Sort out unicode characters 280 | valid_filename = normalize('NFKD', potential_file_path_name).encode('ascii', 'ignore').decode('ascii') 281 | # Replace path separators with underscores 282 | for sep in self.os_path_separators(): 283 | valid_filename = valid_filename.replace(sep, '_') 284 | # Ensure only valid characters 285 | valid_chars = "-_.() {0}{1}".format(string.ascii_letters, string.digits) 286 | valid_filename = "".join(ch for ch in valid_filename if ch in valid_chars) 287 | # Ensure at least one letter or number to ignore names such as '..' 288 | valid_chars = "{0}{1}".format(string.ascii_letters, string.digits) 289 | test_filename = "".join(ch for ch in potential_file_path_name if ch in valid_chars) 290 | if len(test_filename) == 0: 291 | # Replace empty file name or file path part with the following 292 | valid_filename = self.EMPTY_NAME + '_' + str(self.empty_idx) 293 | 294 | # MODIFIED BY MARSHALX 295 | # self.empty_idx += 1 296 | 297 | return valid_filename 298 | 299 | def get_root_path(self): 300 | # Replace with your own root file path, e.g. 
'/place/to/save/files/'
301 |         filepath = self.root_path
302 |         filepath = os.path.abspath(filepath)
303 |         # ensure trailing path separator (/)
304 |         if not any(filepath[-1] == sep for sep in self.os_path_separators()):
305 |             filepath = '{0}{1}'.format(filepath, os.path.sep)
306 |         self.ensure_directory_exists(filepath)
307 |         return filepath
308 | 
309 |     def path_split_into_list(self, path):
310 |         # Gets all parts of the path as a list, excluding path separators
311 |         parts = []
312 |         while True:
313 |             newpath, tail = os.path.split(path)
314 |             if newpath == path:
315 |                 assert not tail
316 |                 if path and path not in self.os_path_separators():
317 |                     parts.append(path)
318 |                 break
319 |             if tail and tail not in self.os_path_separators():
320 |                 parts.append(tail)
321 |             path = newpath
322 |         parts.reverse()
323 |         return parts
324 | 
325 |     def sanitise_filesystem_path(self, potential_file_path):
326 |         # Splits up a path and sanitises the name of each part separately
327 |         path_parts_list = self.path_split_into_list(potential_file_path)
328 |         sanitised_path = ''
329 |         for path_component in path_parts_list:
330 |             sanitised_path = '{0}{1}{2}'.format(sanitised_path,
331 |                                                 self.sanitise_filesystem_name(path_component),
332 |                                                 os.path.sep)
333 |         return sanitised_path
334 | 
335 |     def check_if_path_is_under(self, parent_path, child_path):
336 |         # Using the function to split paths into lists of component parts, check that one path is underneath another
337 |         child_parts = self.path_split_into_list(child_path)
338 |         parent_parts = self.path_split_into_list(parent_path)
339 |         if len(parent_parts) > len(child_parts):
340 |             return False
341 |         return all(part1 == part2 for part1, part2 in zip(child_parts, parent_parts))
342 | 
343 |     def make_valid_file_path(self, path=None, filename=None):
344 |         root_path = self.get_root_path()
345 |         if path:
346 |             sanitised_path = self.sanitise_filesystem_path(path)
347 |             if filename:
348 |                 sanitised_filename = self.sanitise_filesystem_name(filename)
349 |                 complete_path = os.path.join(root_path, sanitised_path, sanitised_filename)
350 |             else:
351 |                 complete_path = os.path.join(root_path, sanitised_path)
352 |         else:
353 |             if filename:
354 |                 sanitised_filename = self.sanitise_filesystem_name(filename)
355 |                 complete_path = os.path.join(root_path, sanitised_filename)
356 |             else:
357 |                 complete_path = root_path  # neither path nor filename given; fall back to the root
358 |         complete_path = os.path.abspath(complete_path)
359 |         if self.check_if_path_is_under(root_path, complete_path):
360 |             return complete_path
361 |         else:
362 |             return None
363 | 
364 | 
365 | class SourceMapExtractorError(Exception):
366 |     pass
367 | 
368 | 
369 | if __name__ == "__main__":
370 |     parser = argparse.ArgumentParser(
371 |         description="A tool to extract code from Webpack sourcemaps. Turns black boxes into gray ones.")
372 |     parser.add_argument("-l", "--local", action="store_true", default=False)
373 |     parser.add_argument("-d", "--detect", action="store_true", default=False,
374 |                         help="Attempt to detect sourcemaps from JS assets in retrieved HTML.")
375 |     parser.add_argument("--make-directory", action="store_true", default=False,
376 |                         help="Make the output directory if it doesn't exist.")
377 |     parser.add_argument("--dangerously-write-paths", action="store_true", default=False,
378 |                         help="Write full paths.
WARNING: Be careful here, you are pulling directories from an untrusted source.") 379 | parser.add_argument("--disable-ssl-verification", action="store_true", default=False, 380 | help="The script will not verify the site's SSL certificate.") 381 | 382 | parser.add_argument("uri_or_file", help="The target URI or file.") 383 | parser.add_argument("output_directory", help="Directory to output from sourcemap to.") 384 | 385 | if (len(sys.argv) < 3): 386 | parser.print_usage() 387 | sys.exit(1) 388 | 389 | args = parser.parse_args() 390 | extractor = SourceMapExtractor(vars(args)) 391 | extractor.run() 392 | -------------------------------------------------------------------------------- /make_tracked_links_list.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import codecs 3 | import logging 4 | import os 5 | import re 6 | from functools import cache 7 | from html import unescape 8 | from time import time 9 | from typing import Set, List, Union 10 | from urllib.parse import unquote 11 | 12 | import httpx 13 | import uvloop 14 | 15 | 16 | PROTOCOL = 'https://' 17 | BASE_URL = 'telegram.org' 18 | # it's necessary to help crawler to find more links 19 | HIDDEN_URLS = { 20 | 'blogfork.telegram.org', 21 | 22 | 'corefork.telegram.org', 23 | 'corefork.telegram.org/getProxyConfig', 24 | 25 | 'telegram.org/privacy/gmailbot', 26 | 'telegram.org/tos/mini-apps', 27 | 'telegram.org/tos/p2pl', 28 | 'telegram.org/tour', 29 | 'telegram.org/evolution', 30 | 'telegram.org/tos/bots', 31 | 'telegram.org/tos/business', 32 | 33 | 'desktop.telegram.org/changelog', 34 | 'td.telegram.org/current', 35 | 'td.telegram.org/current2', 36 | 'td.telegram.org/current4', 37 | 'td.telegram.org/current5', # tdx 38 | 39 | 'osx.telegram.org/updates/versions.xml', # stable 40 | 'mac-updates.telegram.org/beta/versions.xml', 41 | 42 | 'telegram.org/dl/android/apk-public-beta.json', 43 | 44 | 'instantview.telegram.org/rules', 45 | 46 | 'core.telegram.org/resources/cidr.txt', 47 | 'core.telegram.org/apple_privacy', 48 | 'core.telegram.org/getProxyConfig', 49 | 50 | 'core.telegram.org/video_stickers', 51 | 'core.telegram.org/stickers', 52 | 53 | 'promote.telegram.org', 54 | 'contest.com', 55 | 56 | # web apps beta 57 | 'comments.app/test_webview', # old 58 | 'webappcontent.telegram.org/demo', # new 59 | 'webappcontent.telegram.org/cafe', # demo 2 60 | 'webappinternal.telegram.org/botfather', 61 | 'webappinternal.telegram.org/stickers', 62 | # 'a-webappcontent.stel.com/demo', 63 | # 'a-webappcontent.stel.com/cafe', 64 | 65 | # 'fragment.com/about', 66 | # 'fragment.com/privacy', 67 | # 'fragment.com/terms', 68 | # 'fragment.com/css/auction.css', # a lot of CDN issues which TG can't fix 69 | # 'fragment.com/js/auction.js', # a lot of CDN issues which TG can't fix 70 | } 71 | ADDITIONAL_URLS = { 72 | 'raw.githubusercontent.com/telegramdesktop/tdesktop/dev/Telegram/SourceFiles/mtproto/scheme/mtproto.tl', 73 | 'raw.githubusercontent.com/telegramdesktop/tdesktop/dev/Telegram/SourceFiles/mtproto/scheme/api.tl', 74 | 'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/telegram_api.tl', 75 | 'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/secret_api.tl', 76 | 'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/td_api.tl', 77 | } 78 | BASE_URL_REGEX = r'telegram.org' 79 | 80 | CRAWL_GLOBAL_RULES = { 81 | 'allow': set(), 82 | 'deny': { 83 | r'.org/auth$', 84 | }, 85 | } 86 | # disable crawling sub links for specific domains and url patterns 87 | 
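# NOTE: these per-domain rules are merged with CRAWL_GLOBAL_RULES when a URL
# is matched (see should_exclude() below); an allow rule overrides a deny rule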
CRAWL_RULES = { 88 | # every rule is regex 89 | # empty string means match any url 90 | # allow rules with higher priority than deny 91 | 'translations.telegram.org': { 92 | 'allow': { 93 | r'^[^/]*$', # root 94 | r'org/[^/]*$', # 1 lvl sub 95 | r'/css/[a-z-_.]+$', # css files 96 | r'/en/[a-z_]+$', # 1 lvl after /en/ 97 | r'/en/(?!recent)[a-z_]+/[a-z_]+$', # 2 lvl after /en/. for example, /en/ios/unsorted except /en/recent 98 | }, 99 | 'deny': { 100 | '', # all 101 | } 102 | }, 103 | 'osx.telegram.org': { 104 | 'deny': { 105 | 'updates/Telegram' 106 | } 107 | }, 108 | 'bugs.telegram.org': { # crawl first page of cards sorted by rating 109 | 'deny': { 110 | # r'/c/[0-9]+/[0-9]+', # disable comments 111 | '', 112 | }, 113 | }, 114 | 'instantview.telegram.org': { 115 | 'deny': { 116 | r'templates/.+', 117 | 'samples/', 118 | 'contest', 119 | }, 120 | }, 121 | 'core.telegram.org': { 122 | 'deny': { 123 | 'bots/payments', 124 | 'tdlib/docs/classtd', 125 | 'validatedRequestedInfo', 126 | 'constructor/Updates', 127 | }, 128 | }, 129 | 'corefork.telegram.org': { 130 | 'deny': { 131 | 'bots/payments', 132 | 'tdlib/docs/classtd', 133 | 'validatedRequestedInfo', 134 | 'constructor/Updates', 135 | }, 136 | }, 137 | 'blogfork.telegram.org': { 138 | 'deny': { 139 | 'bots/payments', 140 | 'tdlib/docs/classtd', 141 | 'validatedRequestedInfo', 142 | 'constructor/Updates', 143 | }, 144 | }, 145 | 'telegram.org': { 146 | 'deny': { 147 | r'apps$', 148 | r'img/emoji/.+', 149 | r'img/StickerExample.psd$', 150 | r'/privacy$', # geolocation depended 151 | r'/tos$', # geolocation depended 152 | r'/moderation$', # dynamic graphs 153 | r'/dsa-report$', # EU only 154 | r'/tos/eu-dsa/transparency-2025$', # EU only 155 | r'/tos/eu/transparency-tco$', # EU only 156 | }, 157 | }, 158 | 'webz.telegram.org': { 159 | 'deny': { 160 | '', 161 | }, 162 | }, 163 | 'webk.telegram.org': { 164 | 'deny': { 165 | '', 166 | }, 167 | }, 168 | } 169 | CRAWL_STATUS_CODE_EXCLUSIONS = { 170 | 'webappinternal.telegram.org/botfather', 171 | 'webappinternal.telegram.org/stickers', 172 | } 173 | 174 | DIRECT_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,249}' + BASE_URL_REGEX + r')' 175 | ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,248}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&//=]*)' 176 | RELATIVE_LINK_REGEX = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})' 177 | RELATIVE_JS_SCRIPTS_REGEX = r'["\'](.*\.js)["\'\?]' 178 | 179 | DOM_ATTRS = ['href', 'src'] 180 | 181 | OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt') 182 | OUTPUT_RESOURCES_FILENAME = os.environ.get('OUTPUT_RESOURCES_FILENAME', 'tracked_res_links.txt') 183 | OUTPUT_TRANSLATIONS_FILENAME = os.environ.get('OUTPUT_TRANSLATIONS_FILENAME', 'tracked_tr_links.txt') 184 | 185 | STEL_DEV_LAYER = 290 186 | 187 | TIMEOUT_CONFIGS = [ 188 | # Fast timeout for most requests 189 | {'total': 30, 'connect': 30, 'sock_connect': 30, 'sock_read': 30}, 190 | # Medium timeout for slower responses 191 | {'total': 60, 'connect': 60, 'sock_connect': 30, 'sock_read': 60}, 192 | # High timeout for problematic URLs 193 | {'total': 120, 'connect': 90, 'sock_connect': 30, 'sock_read': 90} 194 | ] 195 | 196 | logging.basicConfig(format='%(asctime)s %(levelname)s - %(message)s', level=logging.INFO) 197 | logging.getLogger('httpx').setLevel(logging.WARNING) 198 | logger = logging.getLogger(__name__) 199 | 200 | VISITED_LINKS = set() 201 | LINKS_TO_TRACK = set() 202 | LINKS_TO_TRANSLATIONS = set() 203 | LINKS_TO_TRACKABLE_RESOURCES = set() 204 | 205 | URL_RETRY_COUNT = {} 206 | RETRY_LOCK 
= asyncio.Lock() 207 | 208 | # Track base URLs that have had their trailing slash state flipped for retry logic 209 | SLASH_RETRY_ATTEMPTED = set() 210 | SLASH_RETRY_LOCK = asyncio.Lock() 211 | 212 | VISITED_LINKS_LOCK = asyncio.Lock() 213 | TRACKING_SETS_LOCK = asyncio.Lock() 214 | 215 | WORKERS_COUNT = 50 216 | WORKERS_TASK_QUEUE = asyncio.Queue() 217 | WORKERS_NEW_TASK_TIMEOUT = 1.0 # seconds 218 | 219 | TEXT_DECODER = codecs.getincrementaldecoder('UTF-8')(errors='strict') 220 | 221 | 222 | @cache 223 | def should_exclude(url: str) -> bool: 224 | direct_link = re.findall(DIRECT_LINK_REGEX, url)[0] 225 | domain_rules = CRAWL_RULES.get(direct_link) 226 | if not domain_rules: 227 | domain_rules = CRAWL_GLOBAL_RULES 228 | 229 | allow_rules = domain_rules.get('allow', set()) | CRAWL_GLOBAL_RULES.get('allow', set()) 230 | deny_rules = domain_rules.get('deny', set()) | CRAWL_GLOBAL_RULES.get('deny', set()) 231 | 232 | exclude = False 233 | 234 | for regex in deny_rules: 235 | if re.search(regex, url): 236 | exclude = True 237 | break 238 | 239 | for regex in allow_rules: 240 | if re.search(regex, url): 241 | exclude = False 242 | break 243 | 244 | if exclude: 245 | logger.debug('Exclude %s by rules', url) 246 | 247 | return exclude 248 | 249 | 250 | def find_absolute_links(html: str) -> Set[str]: 251 | absolute_links = set(re.findall(ABSOLUTE_LINK_REGEX, html)) 252 | 253 | return {link for link in cleanup_links(absolute_links) if not should_exclude(link)} 254 | 255 | 256 | def find_relative_links(html: str, cur_link: str) -> Set[str]: 257 | matches = re.findall(DIRECT_LINK_REGEX, cur_link) 258 | if not matches: 259 | return set() 260 | 261 | direct_cur_link = re.findall(DIRECT_LINK_REGEX, cur_link)[0] 262 | # optimization. when we want to exclude domain 263 | if should_exclude(cur_link): 264 | return set() 265 | 266 | relative_links = set() 267 | for attr in DOM_ATTRS: 268 | regex = f'{attr}="{RELATIVE_LINK_REGEX}' 269 | links = re.findall(regex, html) 270 | 271 | for link in cleanup_links(links): 272 | url = f'{direct_cur_link}/{link}' 273 | if not should_exclude(url): 274 | relative_links.add(url) 275 | 276 | return relative_links 277 | 278 | 279 | def find_relative_scripts(code: str, cur_link: str) -> Set[str]: 280 | matches = re.findall(DIRECT_LINK_REGEX, cur_link) 281 | if not matches: 282 | return set() 283 | 284 | direct_cur_link = re.findall(DIRECT_LINK_REGEX, cur_link)[0] 285 | 286 | relative_links = set() 287 | links = re.findall(RELATIVE_JS_SCRIPTS_REGEX, code) 288 | 289 | def join_paths(part1: str, part2: str) -> str: 290 | part1 = part1.rstrip('/') 291 | part2 = part2.lstrip('/') 292 | return f'{part1}/{part2}' 293 | 294 | for link in cleanup_links(links): 295 | # dirty magic for specific cases 296 | if '/' in link: # path to file from the root 297 | url = join_paths(direct_cur_link, link) 298 | else: # it is a relative link from the current folder. 
not from the root 299 | current_folder_link, *_ = cur_link.rsplit('/', 1) 300 | url = join_paths(current_folder_link, link) 301 | 302 | if not should_exclude(url): 303 | relative_links.add(url) 304 | 305 | return relative_links 306 | 307 | 308 | def cleanup_links(links: Union[List[str], Set[str]]) -> Set[str]: 309 | cleaned_links = set() 310 | for tmp_link in links: 311 | # normalize link 312 | link = unquote(tmp_link) 313 | link = unescape(link) 314 | link = link.replace('www.', '') 315 | link = link.replace('http://', '').replace('https://', '') 316 | link = link.replace('//', '/') # not a universal solution 317 | link = link.replace('"', '') # regex fix hack 318 | 319 | # skip anchor links 320 | if '#' in link: 321 | continue 322 | 323 | # remove get params from link 324 | if '?' in link: 325 | link = ''.join(link.split('?')[:-1]) 326 | 327 | # remove get params from link 328 | if '&' in link: 329 | link = ''.join(link.split('&')[:-1]) 330 | 331 | # skip mailto: 332 | link_parts = link.split('.') 333 | if '@' in link_parts[0]: 334 | continue 335 | 336 | # fix wildcard 337 | if link.startswith('.'): 338 | link = link[1:] 339 | 340 | if link.endswith('/'): 341 | link = link[:-1] 342 | 343 | cleaned_links.add(link) 344 | 345 | return cleaned_links 346 | 347 | 348 | def _is_x_content_type(content_types_set: Set[str], content_type) -> bool: 349 | for match_content_type in content_types_set: 350 | if match_content_type in content_type: 351 | return True 352 | 353 | return False 354 | 355 | 356 | def is_translation_url(url: str) -> bool: 357 | return 'translations.telegram.org' in url 358 | 359 | 360 | def is_textable_content_type(content_type: str) -> bool: 361 | textable_content_type = { 362 | 'plain', 363 | 'css', 364 | 'json', 365 | 'text', 366 | 'javascript', 367 | } 368 | 369 | return _is_x_content_type(textable_content_type, content_type) 370 | 371 | 372 | def is_trackable_content_type(content_type) -> bool: 373 | trackable_content_types = { 374 | 'svg', 375 | 'png', 376 | 'jpeg', 377 | 'x-icon', 378 | 'gif', 379 | 'mp4', 380 | 'webm', 381 | 'application/octet-stream', # td updates 382 | 'application/zip', 383 | } 384 | 385 | return _is_x_content_type(trackable_content_types, content_type) 386 | 387 | 388 | class ServerSideError(Exception): 389 | pass 390 | 391 | 392 | async def crawl_worker(client: httpx.AsyncClient): 393 | while True: 394 | try: 395 | url = await asyncio.wait_for(WORKERS_TASK_QUEUE.get(), timeout=WORKERS_NEW_TASK_TIMEOUT) 396 | except asyncio.TimeoutError: 397 | logger.debug(f'Worker exiting - no tasks for {WORKERS_NEW_TASK_TIMEOUT} seconds') 398 | break 399 | 400 | try: 401 | async with RETRY_LOCK: 402 | retry_count = URL_RETRY_COUNT.get(url, 0) 403 | 404 | timeout_index = min(retry_count, len(TIMEOUT_CONFIGS) - 1) 405 | timeout_config = TIMEOUT_CONFIGS[timeout_index] 406 | 407 | await _crawl(url, client, timeout_config) 408 | 409 | async with RETRY_LOCK: 410 | if url in URL_RETRY_COUNT: 411 | del URL_RETRY_COUNT[url] 412 | 413 | WORKERS_TASK_QUEUE.task_done() 414 | except (ServerSideError, httpx.ProtocolError, httpx.TimeoutException, httpx.NetworkError) as e: 415 | exc_name = e.__class__.__name__ 416 | exc_msg = str(e) if str(e) else 'No message' 417 | 418 | async with RETRY_LOCK: 419 | retry_count = URL_RETRY_COUNT.get(url, 0) 420 | URL_RETRY_COUNT[url] = retry_count + 1 421 | 422 | next_timeout_index = min(retry_count + 1, len(TIMEOUT_CONFIGS) - 1) 423 | next_timeout_config = TIMEOUT_CONFIGS[next_timeout_index] 424 | 425 | logger.warning(f'Crawl error {exc_name}: 
{exc_msg}. Retrying {url} with {next_timeout_config["total"]}s total timeout') 426 | 427 | await WORKERS_TASK_QUEUE.put(url) 428 | 429 | async with VISITED_LINKS_LOCK: 430 | if url in VISITED_LINKS: 431 | VISITED_LINKS.remove(url) 432 | 433 | WORKERS_TASK_QUEUE.task_done() 434 | 435 | 436 | async def _crawl(url: str, client: httpx.AsyncClient, timeout_config: dict = None): 437 | truncated_url = (url[:100] + '...') if len(url) > 100 else url 438 | 439 | async with VISITED_LINKS_LOCK: 440 | if url in VISITED_LINKS: 441 | return 442 | VISITED_LINKS.add(url) 443 | 444 | if timeout_config is None: 445 | timeout_config = TIMEOUT_CONFIGS[0] # Use default (fast) timeout 446 | 447 | timeout = httpx.Timeout( 448 | timeout= timeout_config['total'], 449 | connect=timeout_config['connect'], 450 | read=timeout_config['sock_read'], 451 | write=None, 452 | pool=None 453 | ) 454 | logger.debug('[%s] Process %s (total timeout: %ds)', len(VISITED_LINKS), truncated_url, timeout_config['total']) 455 | response = await client.get(f'{PROTOCOL}{url}', timeout=timeout) 456 | code = response.status_code 457 | 458 | if 499 < code < 600: 459 | async with VISITED_LINKS_LOCK: 460 | VISITED_LINKS.remove(url) 461 | logger.warning(f'Error 5XX. Retrying {url}') 462 | raise ServerSideError() 463 | 464 | if code not in {200, 304} and url not in CRAWL_STATUS_CODE_EXCLUSIONS: 465 | # Handle redirect and not found errors with retry logic: flip trailing slash state 466 | if code in {301, 302, 404}: 467 | async with SLASH_RETRY_LOCK: 468 | base_url = url.rstrip('/') 469 | if base_url not in SLASH_RETRY_ATTEMPTED: 470 | if url.endswith('/'): 471 | flipped_url = base_url 472 | logger.warning(f'{code} slash removal retry for {truncated_url}') 473 | else: 474 | flipped_url = f'{url}/' 475 | logger.warning(f'{code} slash addition retry for {truncated_url}') 476 | 477 | SLASH_RETRY_ATTEMPTED.add(base_url) 478 | await WORKERS_TASK_QUEUE.put(flipped_url) 479 | return 480 | else: 481 | logger.warning(f'Skip [{code}] {truncated_url}: already tried flipping slash state for {base_url}') 482 | return 483 | 484 | clean_content = response.text.replace('\n', ' ').replace('\r', ' ') 485 | truncated_content = (clean_content[:200] + '...') if len(clean_content) > 200 else clean_content 486 | logger.warning(f'Skip [{code}] {truncated_url}: {truncated_content}') 487 | 488 | return 489 | 490 | content_type = response.headers.get('content-type') 491 | if is_textable_content_type(content_type): 492 | raw_content = response.content 493 | 494 | try: 495 | content = TEXT_DECODER.decode(raw_content) 496 | except UnicodeDecodeError: 497 | if raw_content.startswith(b'GIF'): 498 | async with TRACKING_SETS_LOCK: 499 | LINKS_TO_TRACKABLE_RESOURCES.add(url) 500 | logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES (raw GIF content)', url) 501 | return 502 | else: 503 | logger.warning(f'Codec can\'t decode bytes. 
So it was a tgs file or response with broken content type {url}') 504 | return 505 | 506 | async with TRACKING_SETS_LOCK: 507 | if is_translation_url(url): 508 | LINKS_TO_TRANSLATIONS.add(url) 509 | logger.debug('Add %s to LINKS_TO_TRANSLATIONS', url) 510 | else: 511 | LINKS_TO_TRACK.add(url) 512 | logger.debug('Add %s to LINKS_TO_TRACK', url) 513 | 514 | absolute_links = find_absolute_links(content) 515 | 516 | relative_links_finder = find_relative_links 517 | if 'javascript' in content_type: 518 | relative_links_finder = find_relative_scripts 519 | 520 | relative_links = relative_links_finder(content, url) 521 | 522 | sub_links = absolute_links | relative_links 523 | for sub_url in sub_links: 524 | async with VISITED_LINKS_LOCK: 525 | if sub_url not in VISITED_LINKS: 526 | await WORKERS_TASK_QUEUE.put(sub_url) 527 | elif is_trackable_content_type(content_type): 528 | async with TRACKING_SETS_LOCK: 529 | LINKS_TO_TRACKABLE_RESOURCES.add(url) 530 | logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES', url) 531 | else: 532 | # for example, zip with update of macOS client 533 | logger.warning(f'Unhandled type: {content_type} from {url}') 534 | 535 | 536 | async def start(url_list: Set[str]): 537 | for url in url_list: 538 | await WORKERS_TASK_QUEUE.put(url) 539 | 540 | transport = httpx.AsyncHTTPTransport(verify=False, retries=3) 541 | async with httpx.AsyncClient(transport=transport) as client: 542 | workers = [crawl_worker(client) for _ in range(WORKERS_COUNT)] 543 | await asyncio.gather(*workers) 544 | 545 | await WORKERS_TASK_QUEUE.join() 546 | 547 | 548 | def unified_links(links_set: Set[str]) -> Set[str]: 549 | return {link.rstrip('/') for link in links_set} 550 | 551 | 552 | if __name__ == '__main__': 553 | HIDDEN_URLS.add(BASE_URL) 554 | LINKS_TO_TRACK = LINKS_TO_TRACK | ADDITIONAL_URLS 555 | 556 | logger.info('Start crawling links...') 557 | start_time = time() 558 | uvloop.run(start(HIDDEN_URLS)) 559 | logger.info(f'Stop crawling links. {time() - start_time} sec.') 560 | 561 | LINKS_TO_TRACK = unified_links(LINKS_TO_TRACK) 562 | LINKS_TO_TRACKABLE_RESOURCES = unified_links(LINKS_TO_TRACKABLE_RESOURCES) 563 | LINKS_TO_TRANSLATIONS = unified_links(LINKS_TO_TRANSLATIONS) 564 | 565 | try: 566 | OLD_URL_LIST = set() 567 | for filename in (OUTPUT_FILENAME, OUTPUT_RESOURCES_FILENAME, OUTPUT_TRANSLATIONS_FILENAME): 568 | with open(filename, 'r') as f: 569 | OLD_URL_LIST |= set([l.replace('\n', '') for l in f.readlines()]) 570 | 571 | CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES | LINKS_TO_TRANSLATIONS 572 | 573 | logger.info(f'Is equal: {OLD_URL_LIST == CURRENT_URL_LIST}') 574 | logger.info(f'Deleted ({len(OLD_URL_LIST - CURRENT_URL_LIST)}): {OLD_URL_LIST - CURRENT_URL_LIST}') 575 | logger.info(f'Added ({len(CURRENT_URL_LIST - OLD_URL_LIST)}): {CURRENT_URL_LIST - OLD_URL_LIST}') 576 | except IOError: 577 | pass 578 | 579 | with open(OUTPUT_FILENAME, 'w') as f: 580 | f.write('\n'.join(sorted(unified_links(LINKS_TO_TRACK)))) 581 | 582 | with open(OUTPUT_RESOURCES_FILENAME, 'w') as f: 583 | f.write('\n'.join(sorted(unified_links(LINKS_TO_TRACKABLE_RESOURCES)))) 584 | 585 | with open(OUTPUT_TRANSLATIONS_FILENAME, 'w') as f: 586 | f.write('\n'.join(sorted(unified_links(LINKS_TO_TRANSLATIONS)))) 587 | -------------------------------------------------------------------------------- /ccl_bplist.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2012-2016, CCL Forensics 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of the CCL Forensics nor the 13 | names of its contributors may be used to endorse or promote products 14 | derived from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL CCL FORENSICS BE LIABLE FOR ANY 20 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | """ 27 | 28 | import sys 29 | import os 30 | import struct 31 | import datetime 32 | 33 | __version__ = "0.21" 34 | __description__ = "Converts Apple binary PList files into a native Python data structure" 35 | __contact__ = "Alex Caithness" 36 | 37 | _object_converter = None 38 | def set_object_converter(function): 39 | """Sets the object converter function to be used when retrieving objects from the bplist. 40 | default is None (which will return objects in their raw form). 41 | A built in converter (ccl_bplist.NSKeyedArchiver_common_objects_convertor) which is geared 42 | toward dealling with common types in NSKeyedArchiver is available which can simplify code greatly 43 | when dealling with these types of files.""" 44 | if not hasattr(function, "__call__"): 45 | raise TypeError("function is not a function") 46 | global _object_converter 47 | _object_converter = function 48 | 49 | class BplistError(Exception): 50 | pass 51 | 52 | class BplistUID: 53 | def __init__(self, value): 54 | self.value = value 55 | 56 | def __repr__(self): 57 | return "UID: {0}".format(self.value) 58 | 59 | def __str__(self): 60 | return self.__repr__() 61 | 62 | def __decode_multibyte_int(b, signed=True): 63 | if len(b) == 1: 64 | fmt = ">B" # Always unsigned? 
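        # (Per Apple's CoreFoundation notes for format "00": 1-, 2- and 4-byte
        # integers are stored unsigned, and negative values are promoted to the
        # 8- or 16-byte forms; that is presumably why ">B" is unconditional here.)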
65 | elif len(b) == 2: 66 | fmt = ">h" 67 | elif len(b) == 3: 68 | if signed: 69 | return ((b[0] << 16) | struct.unpack(">H", b[1:])[0]) - ((b[0] >> 7) * 2 * 0x800000) 70 | else: 71 | return (b[0] << 16) | struct.unpack(">H", b[1:])[0] 72 | elif len(b) == 4: 73 | fmt = ">i" 74 | elif len(b) == 8: 75 | fmt = ">q" 76 | elif len(b) == 16: 77 | # special case for BigIntegers 78 | high, low = struct.unpack(">QQ", b) 79 | result = (high << 64) | low 80 | if high & 0x8000000000000000 and signed: 81 | result -= 0x100000000000000000000000000000000 82 | return result 83 | else: 84 | raise BplistError("Cannot decode multibyte int of length {0}".format(len(b))) 85 | 86 | if signed and len(b) > 1: 87 | return struct.unpack(fmt.lower(), b)[0] 88 | else: 89 | return struct.unpack(fmt.upper(), b)[0] 90 | 91 | def __decode_float(b, signed=True): 92 | if len(b) == 4: 93 | fmt = ">f" 94 | elif len(b) == 8: 95 | fmt = ">d" 96 | else: 97 | raise BplistError("Cannot decode float of length {0}".format(len(b))) 98 | 99 | if signed: 100 | return struct.unpack(fmt.lower(), b)[0] 101 | else: 102 | return struct.unpack(fmt.upper(), b)[0] 103 | 104 | def __decode_object(f, offset, collection_offset_size, offset_table): 105 | # Move to offset and read type 106 | #print("Decoding object at offset {0}".format(offset)) 107 | f.seek(offset) 108 | # A little hack to keep the script portable between py2.x and py3k 109 | if sys.version_info[0] < 3: 110 | type_byte = ord(f.read(1)[0]) 111 | else: 112 | type_byte = f.read(1)[0] 113 | #print("Type byte: {0}".format(hex(type_byte))) 114 | if type_byte == 0x00: # Null 0000 0000 115 | return None 116 | elif type_byte == 0x08: # False 0000 1000 117 | return False 118 | elif type_byte == 0x09: # True 0000 1001 119 | return True 120 | elif type_byte == 0x0F: # Fill 0000 1111 121 | raise BplistError("Fill type not currently supported at offset {0}".format(f.tell())) # Not sure what to return really... 
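    # A note for orientation in the long dispatch below: each serialised object
    # begins with a marker byte whose high nibble selects the type and whose low
    # nibble usually carries an inline size. For the variable-length types
    # (Data, strings, collections) a low nibble of 0xF means "the real length
    # follows as an Int object", which is why the "& 0x0F != 0x0F" check
    # repeats in every branch below.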
122 | elif type_byte & 0xF0 == 0x10: # Int 0001 xxxx 123 | int_length = 2 ** (type_byte & 0x0F) 124 | int_bytes = f.read(int_length) 125 | return __decode_multibyte_int(int_bytes) 126 | elif type_byte & 0xF0 == 0x20: # Float 0010 nnnn 127 | float_length = 2 ** (type_byte & 0x0F) 128 | float_bytes = f.read(float_length) 129 | return __decode_float(float_bytes) 130 | elif type_byte & 0xFF == 0x33: # Date 0011 0011 131 | date_bytes = f.read(8) 132 | date_value = __decode_float(date_bytes) 133 | try: 134 | result = datetime.datetime(2001,1,1) + datetime.timedelta(seconds = date_value) 135 | except OverflowError: 136 | result = datetime.datetime.min 137 | return result 138 | elif type_byte & 0xF0 == 0x40: # Data 0100 nnnn 139 | if type_byte & 0x0F != 0x0F: 140 | # length in 4 lsb 141 | data_length = type_byte & 0x0F 142 | else: 143 | # A little hack to keep the script portable between py2.x and py3k 144 | if sys.version_info[0] < 3: 145 | int_type_byte = ord(f.read(1)[0]) 146 | else: 147 | int_type_byte = f.read(1)[0] 148 | if int_type_byte & 0xF0 != 0x10: 149 | raise BplistError("Long Data field definition not followed by int type at offset {0}".format(f.tell())) 150 | int_length = 2 ** (int_type_byte & 0x0F) 151 | int_bytes = f.read(int_length) 152 | data_length = __decode_multibyte_int(int_bytes, False) 153 | return f.read(data_length) 154 | elif type_byte & 0xF0 == 0x50: # ASCII 0101 nnnn 155 | if type_byte & 0x0F != 0x0F: 156 | # length in 4 lsb 157 | ascii_length = type_byte & 0x0F 158 | else: 159 | # A little hack to keep the script portable between py2.x and py3k 160 | if sys.version_info[0] < 3: 161 | int_type_byte = ord(f.read(1)[0]) 162 | else: 163 | int_type_byte = f.read(1)[0] 164 | if int_type_byte & 0xF0 != 0x10: 165 | raise BplistError("Long ASCII field definition not followed by int type at offset {0}".format(f.tell())) 166 | int_length = 2 ** (int_type_byte & 0x0F) 167 | int_bytes = f.read(int_length) 168 | ascii_length = __decode_multibyte_int(int_bytes, False) 169 | return f.read(ascii_length).decode("ascii") 170 | elif type_byte & 0xF0 == 0x60: # UTF-16 0110 nnnn 171 | if type_byte & 0x0F != 0x0F: 172 | # length in 4 lsb 173 | utf16_length = (type_byte & 0x0F) * 2 # Length is characters - 16bit width 174 | else: 175 | # A little hack to keep the script portable between py2.x and py3k 176 | if sys.version_info[0] < 3: 177 | int_type_byte = ord(f.read(1)[0]) 178 | else: 179 | int_type_byte = f.read(1)[0] 180 | if int_type_byte & 0xF0 != 0x10: 181 | raise BplistError("Long UTF-16 field definition not followed by int type at offset {0}".format(f.tell())) 182 | int_length = 2 ** (int_type_byte & 0x0F) 183 | int_bytes = f.read(int_length) 184 | utf16_length = __decode_multibyte_int(int_bytes, False) * 2 185 | return f.read(utf16_length).decode("utf_16_be") 186 | elif type_byte & 0xF0 == 0x80: # UID 1000 nnnn 187 | uid_length = (type_byte & 0x0F) + 1 188 | uid_bytes = f.read(uid_length) 189 | return BplistUID(__decode_multibyte_int(uid_bytes, signed=False)) 190 | elif type_byte & 0xF0 == 0xA0: # Array 1010 nnnn 191 | if type_byte & 0x0F != 0x0F: 192 | # length in 4 lsb 193 | array_count = type_byte & 0x0F 194 | else: 195 | # A little hack to keep the script portable between py2.x and py3k 196 | if sys.version_info[0] < 3: 197 | int_type_byte = ord(f.read(1)[0]) 198 | else: 199 | int_type_byte = f.read(1)[0] 200 | if int_type_byte & 0xF0 != 0x10: 201 | raise BplistError("Long Array field definition not followed by int type at offset {0}".format(f.tell())) 202 | int_length = 2 ** 
(int_type_byte & 0x0F)
203 |             int_bytes = f.read(int_length)
204 |             array_count = __decode_multibyte_int(int_bytes, signed=False)
205 |         array_refs = []
206 |         for i in range(array_count):
207 |             array_refs.append(__decode_multibyte_int(f.read(collection_offset_size), False))
208 |         return [__decode_object(f, offset_table[obj_ref], collection_offset_size, offset_table) for obj_ref in array_refs]
209 |     elif type_byte & 0xF0 == 0xC0: # Set 1100 nnnn
210 |         if type_byte & 0x0F != 0x0F:
211 |             # length in 4 lsb
212 |             set_count = type_byte & 0x0F
213 |         else:
214 |             # A little hack to keep the script portable between py2.x and py3k
215 |             if sys.version_info[0] < 3:
216 |                 int_type_byte = ord(f.read(1)[0])
217 |             else:
218 |                 int_type_byte = f.read(1)[0]
219 |             if int_type_byte & 0xF0 != 0x10:
220 |                 raise BplistError("Long Set field definition not followed by int type at offset {0}".format(f.tell()))
221 |             int_length = 2 ** (int_type_byte & 0x0F)
222 |             int_bytes = f.read(int_length)
223 |             set_count = __decode_multibyte_int(int_bytes, signed=False)
224 |         set_refs = []
225 |         for i in range(set_count):
226 |             set_refs.append(__decode_multibyte_int(f.read(collection_offset_size), False))
227 |         return [__decode_object(f, offset_table[obj_ref], collection_offset_size, offset_table) for obj_ref in set_refs]
228 |     elif type_byte & 0xF0 == 0xD0: # Dict 1101 nnnn
229 |         if type_byte & 0x0F != 0x0F:
230 |             # length in 4 lsb
231 |             dict_count = type_byte & 0x0F
232 |         else:
233 |             # A little hack to keep the script portable between py2.x and py3k
234 |             if sys.version_info[0] < 3:
235 |                 int_type_byte = ord(f.read(1)[0])
236 |             else:
237 |                 int_type_byte = f.read(1)[0]
238 |             #print("Dictionary length int byte: {0}".format(hex(int_type_byte)))
239 |             if int_type_byte & 0xF0 != 0x10:
240 |                 raise BplistError("Long Dict field definition not followed by int type at offset {0}".format(f.tell()))
241 |             int_length = 2 ** (int_type_byte & 0x0F)
242 |             int_bytes = f.read(int_length)
243 |             dict_count = __decode_multibyte_int(int_bytes, signed=False)
244 |         key_refs = []
245 |         #print("Dictionary count: {0}".format(dict_count))
246 |         for i in range(dict_count):
247 |             key_refs.append(__decode_multibyte_int(f.read(collection_offset_size), False))
248 |         value_refs = []
249 |         for i in range(dict_count):
250 |             value_refs.append(__decode_multibyte_int(f.read(collection_offset_size), False))
251 | 
252 |         dict_result = {}
253 |         for i in range(dict_count):
254 |             #print("Key ref: {0}\tVal ref: {1}".format(key_refs[i], value_refs[i]))
255 |             key = __decode_object(f, offset_table[key_refs[i]], collection_offset_size, offset_table)
256 |             val = __decode_object(f, offset_table[value_refs[i]], collection_offset_size, offset_table)
257 |             dict_result[key] = val
258 |         return dict_result
259 | 
260 | 
261 | def load(f):
262 |     """
263 |     Reads and converts a file-like object containing a binary property list.
264 | Takes a file-like object (must support reading and seeking) as an argument 265 | Returns a data structure representing the data in the property list 266 | """ 267 | # Check magic number 268 | if f.read(8) != b"bplist00": 269 | raise BplistError("Bad file header") 270 | 271 | # Read trailer 272 | f.seek(-32, os.SEEK_END) 273 | trailer = f.read(32) 274 | offset_int_size, collection_offset_size, object_count, top_level_object_index, offest_table_offset = struct.unpack(">6xbbQQQ", trailer) 275 | 276 | # Read offset table 277 | f.seek(offest_table_offset) 278 | offset_table = [] 279 | for i in range(object_count): 280 | offset_table.append(__decode_multibyte_int(f.read(offset_int_size), False)) 281 | 282 | return __decode_object(f, offset_table[top_level_object_index], collection_offset_size, offset_table) 283 | 284 | 285 | def NSKeyedArchiver_common_objects_convertor(o): 286 | """Built in converter function (suitable for submission to set_object_converter()) which automatically 287 | converts the following common data-types found in NSKeyedArchiver: 288 | NSDictionary/NSMutableDictionary; 289 | NSArray/NSMutableArray; 290 | NSSet/NSMutableSet 291 | NSString/NSMutableString 292 | NSDate 293 | $null strings""" 294 | # Conversion: NSDictionary 295 | if is_nsmutabledictionary(o): 296 | return convert_NSMutableDictionary(o) 297 | # Conversion: NSArray 298 | elif is_nsarray(o): 299 | return convert_NSArray(o) 300 | elif is_isnsset(o): 301 | return convert_NSSet(o) 302 | # Conversion: NSString 303 | elif is_nsstring(o): 304 | return convert_NSString(o) 305 | # Conversion: NSDate 306 | elif is_nsdate(o): 307 | return convert_NSDate(o) 308 | # Conversion: "$null" string 309 | elif isinstance(o, str) and o == "$null": 310 | return None 311 | # Fallback: 312 | else: 313 | return o 314 | 315 | def NSKeyedArchiver_convert(o, object_table): 316 | if isinstance(o, list): 317 | #return NsKeyedArchiverList(o, object_table) 318 | result = NsKeyedArchiverList(o, object_table) 319 | elif isinstance(o, dict): 320 | #return NsKeyedArchiverDictionary(o, object_table) 321 | result = NsKeyedArchiverDictionary(o, object_table) 322 | elif isinstance(o, BplistUID): 323 | #return NSKeyedArchiver_convert(object_table[o.value], object_table) 324 | result = NSKeyedArchiver_convert(object_table[o.value], object_table) 325 | else: 326 | #return o 327 | result = o 328 | 329 | if _object_converter: 330 | return _object_converter(result) 331 | else: 332 | return result 333 | 334 | 335 | class NsKeyedArchiverDictionary(dict): 336 | def __init__(self, original_dict, object_table): 337 | super(NsKeyedArchiverDictionary, self).__init__(original_dict) 338 | self.object_table = object_table 339 | 340 | def __getitem__(self, index): 341 | o = super(NsKeyedArchiverDictionary, self).__getitem__(index) 342 | return NSKeyedArchiver_convert(o, self.object_table) 343 | 344 | def get(self, key, default=None): 345 | return self[key] if key in self else default 346 | 347 | def values(self): 348 | for k in self: 349 | yield self[k] 350 | 351 | def items(self): 352 | for k in self: 353 | yield k, self[k] 354 | 355 | class NsKeyedArchiverList(list): 356 | def __init__(self, original_iterable, object_table): 357 | super(NsKeyedArchiverList, self).__init__(original_iterable) 358 | self.object_table = object_table 359 | 360 | def __getitem__(self, index): 361 | o = super(NsKeyedArchiverList, self).__getitem__(index) 362 | return NSKeyedArchiver_convert(o, self.object_table) 363 | 364 | def __iter__(self): 365 | for o in 
super(NsKeyedArchiverList, self).__iter__(): 366 | yield NSKeyedArchiver_convert(o, self.object_table) 367 | 368 | 369 | def deserialise_NsKeyedArchiver(obj, parse_whole_structure=False): 370 | """Deserialises an NSKeyedArchiver bplist rebuilding the structure. 371 | obj should usually be the top-level object returned by the load() 372 | function.""" 373 | 374 | # Check that this is an archiver and version we understand 375 | if not isinstance(obj, dict): 376 | raise TypeError("obj must be a dict") 377 | if "$archiver" not in obj or obj["$archiver"] not in ("NSKeyedArchiver", "NRKeyedArchiver"): 378 | raise ValueError("obj does not contain an '$archiver' key or the '$archiver' is unrecognised") 379 | if "$version" not in obj or obj["$version"] != 100000: 380 | raise ValueError("obj does not contain a '$version' key or the '$version' is unrecognised") 381 | 382 | object_table = obj["$objects"] 383 | if "root" in obj["$top"] and not parse_whole_structure: 384 | return NSKeyedArchiver_convert(obj["$top"]["root"], object_table) 385 | else: 386 | return NSKeyedArchiver_convert(obj["$top"], object_table) 387 | 388 | # NSMutableDictionary convenience functions 389 | def is_nsmutabledictionary(obj): 390 | if not isinstance(obj, dict): 391 | return False 392 | if "$class" not in obj.keys(): 393 | return False 394 | if obj["$class"].get("$classname") not in ("NSMutableDictionary", "NSDictionary"): 395 | return False 396 | if "NS.keys" not in obj.keys(): 397 | return False 398 | if "NS.objects" not in obj.keys(): 399 | return False 400 | 401 | return True 402 | 403 | def convert_NSMutableDictionary(obj): 404 | """Converts a NSKeyedArchiver serialised NSMutableDictionary into 405 | a straight dictionary (rather than two lists as it is serialised 406 | as)""" 407 | 408 | # The dictionary is serialised as two lists (one for keys and one 409 | # for values) which obviously removes all convenience afforded by 410 | # dictionaries. This function converts this structure to an 411 | # actual dictionary so that values can be accessed by key. 
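    # For illustration (hypothetical values): an archived dictionary reaching
    # this function as
    #     {"$class": {...}, "NS.keys": ["width", "height"], "NS.objects": [3, 4]}
    # is returned as {"width": 3, "height": 4}.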
412 | 413 | if not is_nsmutabledictionary(obj): 414 | raise ValueError("obj does not have the correct structure for a NSDictionary/NSMutableDictionary serialised to a NSKeyedArchiver") 415 | keys = obj["NS.keys"] 416 | vals = obj["NS.objects"] 417 | 418 | # sense check the keys and values: 419 | if not isinstance(keys, list): 420 | raise TypeError("The 'NS.keys' value is an unexpected type (expected list; actual: {0}".format(type(keys))) 421 | if not isinstance(vals, list): 422 | raise TypeError("The 'NS.objects' value is an unexpected type (expected list; actual: {0}".format(type(vals))) 423 | if len(keys) != len(vals): 424 | raise ValueError("The length of the 'NS.keys' list ({0}) is not equal to that of the 'NS.objects ({1})".format(len(keys), len(vals))) 425 | 426 | result = {} 427 | for i,k in enumerate(keys): 428 | if k in result: 429 | raise ValueError("The 'NS.keys' list contains duplicate entries") 430 | result[k] = vals[i] 431 | 432 | return result 433 | 434 | # NSArray convenience functions 435 | def is_nsarray(obj): 436 | if not isinstance(obj, dict): 437 | return False 438 | if "$class" not in obj.keys(): 439 | return False 440 | if obj["$class"].get("$classname") not in ("NSArray", "NSMutableArray"): 441 | return False 442 | if "NS.objects" not in obj.keys(): 443 | return False 444 | 445 | return True 446 | 447 | def convert_NSArray(obj): 448 | if not is_nsarray(obj): 449 | raise ValueError("obj does not have the correct structure for a NSArray/NSMutableArray serialised to a NSKeyedArchiver") 450 | 451 | return obj["NS.objects"] 452 | 453 | # NSSet convenience functions 454 | def is_isnsset(obj): 455 | if not isinstance(obj, dict): 456 | return False 457 | if "$class" not in obj.keys(): 458 | return False 459 | if obj["$class"].get("$classname") not in ("NSSet", "NSMutableSet"): 460 | return False 461 | if "NS.objects" not in obj.keys(): 462 | return False 463 | 464 | return True 465 | 466 | def convert_NSSet(obj): 467 | if not is_isnsset(obj): 468 | raise ValueError("obj does not have the correct structure for a NSSet/NSMutableSet serialised to a NSKeyedArchiver") 469 | 470 | return list(obj["NS.objects"]) 471 | 472 | # NSString convenience functions 473 | def is_nsstring(obj): 474 | if not isinstance(obj, dict): 475 | return False 476 | if "$class" not in obj.keys(): 477 | return False 478 | if obj["$class"].get("$classname") not in ("NSString", "NSMutableString"): 479 | return False 480 | if "NS.string" not in obj.keys(): 481 | return False 482 | return True 483 | 484 | def convert_NSString(obj): 485 | if not is_nsstring(obj): 486 | raise ValueError("obj does not have the correct structure for a NSString/NSMutableString serialised to a NSKeyedArchiver") 487 | 488 | return obj["NS.string"] 489 | 490 | # NSDate convenience functions 491 | def is_nsdate(obj): 492 | if not isinstance(obj, dict): 493 | return False 494 | if "$class" not in obj.keys(): 495 | return False 496 | if obj["$class"].get("$classname") not in ("NSDate"): 497 | return False 498 | if "NS.time" not in obj.keys(): 499 | return False 500 | 501 | return True 502 | 503 | def convert_NSDate(obj): 504 | if not is_nsdate(obj): 505 | raise ValueError("obj does not have the correct structure for a NSDate serialised to a NSKeyedArchiver") 506 | 507 | return datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=obj["NS.time"]) 508 | -------------------------------------------------------------------------------- /make_files_tree.py: -------------------------------------------------------------------------------- 
1 | import asyncio
2 | import hashlib
3 | import json
4 | import logging
5 | import mimetypes
6 | import os
7 | import platform
8 | import random
9 | import re
10 | import shutil
11 | import socket
12 | import uuid
13 | import zipfile
14 | from asyncio.exceptions import TimeoutError
15 | from string import punctuation, whitespace
16 | from time import time
17 | from typing import List, Optional
18 | from xml.etree import ElementTree
19 | 
20 | import aiofiles
21 | import aiohttp
22 | import uvloop
23 | from aiohttp import ClientConnectorError, ServerDisconnectedError
24 | 
25 | import ccl_bplist
26 | 
27 | PROTOCOL = 'https://'
28 | ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
29 | 
30 | CRAWL_STATUS_CODE_EXCLUSIONS = {
31 |     'webappinternal.telegram.org/botfather',
32 |     'webappinternal.telegram.org/stickers',
33 | }
34 | 
35 | DYNAMIC_PART_MOCK = 'telegram-crawler'
36 | 
37 | INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
38 | INPUT_RES_FILENAME = os.environ.get('INPUT_RES_FILENAME', 'tracked_res_links.txt')
39 | INPUT_TR_FILENAME = os.environ.get('INPUT_TR_FILENAME', 'tracked_tr_links.txt')
40 | OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
41 | OUTPUT_MTPROTO_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_MTPROTO_FOLDER', 'server/'))
42 | OUTPUT_SITES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_SITES_FOLDER', 'web/'))
43 | OUTPUT_CLIENTS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_CLIENTS_FOLDER', 'client/'))
44 | OUTPUT_RESOURCES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_RESOURCES_FOLDER', 'web_res/'))
45 | OUTPUT_TRANSLATIONS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_TRANSLATIONS_FOLDER', 'web_tr/'))
46 | OUTPUT_MINI_APPS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_MINI_APPS_FOLDER', 'mini_app/'))
47 | 
48 | TRANSLATIONS_EN_CATEGORY_URL_REGEX = r'/en/[a-z_]+/[a-z_]+$'
49 | 
50 | PAGE_GENERATION_TIME_REGEX = r'<!-- page generated in .+? -->'
51 | PAGE_API_HASH_REGEX = r'\?hash=[a-z0-9]+'
52 | PAGE_API_HASH_TEMPLATE = f'?hash={DYNAMIC_PART_MOCK}'
53 | TON_RATE_REGEX = r'"tonRate":"[.0-9]+"'
54 | TON_RATE_TEMPLATE = f'"tonRate":"{DYNAMIC_PART_MOCK}"'
55 | APK_BETA_TOKEN_REGEX = r'apk\?token=.*?"'
56 | APK_BETA_TOKEN_TEMPLATE = f'apk?token={DYNAMIC_PART_MOCK}"'
57 | PASSPORT_SSID_REGEX = r'passport_ssid=[a-z0-9]+_[a-z0-9]+_[a-z0-9]+'
58 | PASSPORT_SSID_TEMPLATE = f'passport_ssid={DYNAMIC_PART_MOCK}'
59 | NONCE_REGEX = r'"nonce":"[a-z0-9]+_[a-z0-9]+_[a-z0-9]+'
60 | NONCE_TEMPLATE = f'"nonce":"{DYNAMIC_PART_MOCK}'
61 | PROXY_CONFIG_SUB_NET_REGEX = r'\d+\.\d+:8888;'
62 | PROXY_CONFIG_SUB_NET_TEMPLATE = 'X.X:8888;'
63 | TRANSLATE_SUGGESTION_REGEX = r'
<div class="tr-value-suggestion">(.?)+</div>
' 64 | SPARKLE_SIG_REGEX = r';sig=(.*?);' 65 | SPARKLE_SE_REGEX = r';se=(.*?);' 66 | SPARKLE_SIG_TEMPLATE = f';sig={DYNAMIC_PART_MOCK};' 67 | SPARKLE_SE_TEMPLATE = f';se={DYNAMIC_PART_MOCK};' 68 | 69 | STEL_DEV_LAYER = 190 70 | 71 | TIMEOUT = aiohttp.ClientTimeout( # mediumly sized from link collector 72 | total=60, 73 | connect=60, 74 | sock_connect=30, 75 | sock_read=60, 76 | ) 77 | HEADERS = { 78 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:99.0) Gecko/20100101 Firefox/99.0', 79 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 80 | 'Accept-Language': 'en-US,en;q=0.5', 81 | 'Accept-Encoding': 'gzip, deflate, br', 82 | 'DNT': '1', 83 | 'Connection': 'keep-alive', 84 | 'Cookie': f'stel_ln=en; stel_dev_layer={STEL_DEV_LAYER}', 85 | 'Upgrade-Insecure-Requests': '1', 86 | 'Sec-Fetch-Dest': 'document', 87 | 'Sec-Fetch-Mode': 'navigate', 88 | 'Sec-Fetch-Site': 'none', 89 | 'Sec-Fetch-User': '?1', 90 | 'Cache-Control': 'max-age=0', 91 | 'TE': 'trailers', 92 | } 93 | 94 | logging.basicConfig(format='%(message)s', level=logging.INFO) 95 | logger = logging.getLogger(__name__) 96 | 97 | 98 | def get_hash(data: bytes) -> str: 99 | return hashlib.sha256(data).hexdigest() 100 | 101 | 102 | async def download_file(url: str, path: str, session: aiohttp.ClientSession): 103 | params = {'tgcrawlNoCache': uuid.uuid4().hex} 104 | async with session.get(url, params=params) as response: 105 | if response.status != 200: 106 | return 107 | 108 | content = await response.read() 109 | 110 | async with aiofiles.open(path, mode='wb') as f: 111 | await f.write(content) 112 | 113 | 114 | async def track_additional_files( 115 | files_to_track: List[str], input_dir_name: str, output_dir_name: str, encoding='utf-8', save_hash_only=False 116 | ): 117 | kwargs = {'mode': 'r', 'encoding': encoding} 118 | if save_hash_only: 119 | kwargs['mode'] = 'rb' 120 | del kwargs['encoding'] 121 | 122 | for file in files_to_track: 123 | async with aiofiles.open(os.path.join(input_dir_name, file), **kwargs) as r_file: 124 | content = await r_file.read() 125 | 126 | if save_hash_only: 127 | content = get_hash(content) 128 | else: 129 | content = re.sub(r'id=".*"', 'id="tgcrawl"', content) 130 | 131 | filename = os.path.join(output_dir_name, file) 132 | os.makedirs(os.path.dirname(filename), exist_ok=True) 133 | async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file: 134 | await w_file.write(content) 135 | 136 | 137 | async def get_download_link_of_latest_macos_release(remote_updates_manifest_url: str, session: aiohttp.ClientSession) -> Optional[str]: 138 | async with session.get(remote_updates_manifest_url) as response: 139 | if response.status != 200: 140 | logger.error(f'Error {response.status} while fetching {remote_updates_manifest_url}') 141 | return None 142 | 143 | try: 144 | response = await response.text() # we do expect XML here 145 | except Exception as e: 146 | logger.error(f'Error processing response: {e}') 147 | return None 148 | 149 | if not isinstance(response, str) and not response.lstrip().startswith(' 5: 260 | continue 261 | 262 | valid_strings.append(string.strip()) 263 | 264 | valid_strings = sorted(list(set(valid_strings))) 265 | with open(os.path.join(crawled_data_folder, 'strings.txt'), 'w', encoding='utf-8') as f: 266 | f.write('\n'.join(valid_strings)) 267 | 268 | cleanup2() 269 | 270 | 271 | async def download_telegram_ios_beta_and_extract_resources(session: aiohttp.ClientSession): 272 | # TODO fetch version automatically 273 | 
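    # (One possible sketch, left commented out so behaviour is unchanged; the
    # endpoint is the documented "get the latest release" API, see the ref just
    # below, and it is an assumption that the release tag_name matches the
    # version string used in the asset name.)
    #
    # async def _get_latest_ios_version(session: aiohttp.ClientSession) -> str:
    #     api_url = 'https://api.github.com/repos/MarshalX/decrypted-telegram-ios/releases/latest'
    #     async with session.get(api_url) as response:
    #         return (await response.json())['tag_name']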
# ref: https://docs.github.com/en/rest/releases/releases#get-the-latest-release 274 | version = '9.0.24102' 275 | 276 | download_url = f'https://github.com/MarshalX/decrypted-telegram-ios/releases/download/{version}/Telegram-{version}.ipa' 277 | tool_download_url = 'https://github.com/MarshalX/acextract/releases/download/3.0/acextract' 278 | 279 | ipa_filename = f'Telegram-{version}.ipa' 280 | assets_extractor = 'acextract_ios' 281 | assets_filename = 'Assets.car' 282 | assets_output_dir = 'ios_assets' 283 | client_folder_name = 'ios' 284 | crawled_data_folder = os.path.join(OUTPUT_CLIENTS_FOLDER, 'ios-beta') 285 | 286 | if 'darwin' not in platform.system().lower(): 287 | await download_file(download_url, ipa_filename, session) 288 | else: 289 | await asyncio.gather( 290 | download_file(download_url, ipa_filename, session), 291 | download_file(tool_download_url, assets_extractor, session), 292 | ) 293 | 294 | # synced 295 | with zipfile.ZipFile(ipa_filename, 'r') as f: 296 | f.extractall(client_folder_name) 297 | 298 | resources_path = 'Payload/Telegram.app' 299 | 300 | files_to_convert = [ 301 | f'{resources_path}/en.lproj/Localizable.strings', 302 | f'{resources_path}/en.lproj/InfoPlist.strings', 303 | f'{resources_path}/en.lproj/AppIntentVocabulary.plist', 304 | ] 305 | for filename in files_to_convert: 306 | path = os.path.join(client_folder_name, filename) 307 | 308 | # synced cuz ccl_bplist works with file objects and doesn't support asyncio 309 | with open(path, 'rb') as r_file: 310 | plist = ccl_bplist.load(r_file) 311 | 312 | async with aiofiles.open(path, 'w', encoding='utf-8') as w_file: 313 | await w_file.write(json.dumps(plist, indent=4)) 314 | 315 | files_to_track = files_to_convert + [ 316 | f'{resources_path}/_CodeSignature/CodeResources', 317 | f'{resources_path}/SC_Info/Manifest.plist', 318 | ] 319 | await track_additional_files(files_to_track, client_folder_name, crawled_data_folder) 320 | 321 | resources_folder = os.path.join(client_folder_name, resources_path) 322 | crawled_resources_folder = os.path.join(crawled_data_folder, resources_path) 323 | _, _, hash_of_files_to_track = next(os.walk(resources_folder)) 324 | await track_additional_files( 325 | hash_of_files_to_track, resources_folder, crawled_resources_folder, save_hash_only=True 326 | ) 327 | 328 | def cleanup1(): 329 | os.path.isdir(client_folder_name) and shutil.rmtree(client_folder_name) 330 | os.remove(ipa_filename) 331 | 332 | # sry for copy-paste from macos def ;d 333 | 334 | # .car crawler works only in macOS 335 | if 'darwin' not in platform.system().lower(): 336 | cleanup1() 337 | return 338 | 339 | path_to_car = os.path.join(resources_folder, assets_filename) 340 | await (await asyncio.create_subprocess_exec('chmod', '+x', assets_extractor)).communicate() 341 | process = await asyncio.create_subprocess_exec(f'./{assets_extractor}', '-i', path_to_car, '-o', assets_output_dir) 342 | await process.communicate() 343 | 344 | def cleanup2(): 345 | cleanup1() 346 | os.path.isdir(assets_output_dir) and shutil.rmtree(assets_output_dir) 347 | os.remove(assets_extractor) 348 | 349 | if process.returncode != 0: 350 | cleanup2() 351 | return 352 | 353 | for dir_path, _, hash_of_files_to_track in os.walk(assets_output_dir): 354 | await track_additional_files( 355 | # sry for this shit ;d 356 | [os.path.join(dir_path, file).replace(f'{assets_output_dir}/', '') for file in hash_of_files_to_track], 357 | assets_output_dir, 358 | os.path.join(crawled_data_folder, assets_filename), 359 | save_hash_only=True 360 | ) 361 
| 362 | cleanup2() 363 | 364 | 365 | async def download_telegram_android_and_extract_resources(session: aiohttp.ClientSession) -> None: 366 | await download_telegram_android_stable_dl_and_extract_resources(session) 367 | await download_telegram_android_beta_and_extract_resources(session) 368 | 369 | 370 | async def download_telegram_android_stable_dl_and_extract_resources(session: aiohttp.ClientSession): 371 | download_url = 'https://telegram.org/dl/android/apk' 372 | 373 | await _download_telegram_android_and_extract_resources(session, download_url, 'android-stable-dl') 374 | 375 | 376 | async def download_telegram_android_beta_and_extract_resources(session: aiohttp.ClientSession): 377 | download_url = 'https://telegram.org/dl/android/apk-public-beta' 378 | 379 | await _download_telegram_android_and_extract_resources(session, download_url, 'android-beta') 380 | 381 | 382 | async def _download_telegram_android_and_extract_resources( 383 | session: aiohttp.ClientSession, download_url: str, folder_name: str 384 | ): 385 | crawled_data_folder = os.path.join(OUTPUT_CLIENTS_FOLDER, folder_name) 386 | 387 | if not download_url: 388 | return 389 | 390 | await asyncio.gather( 391 | download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.9.0.jar', 'tool.apk', session), 392 | download_file(download_url, 'android.apk', session), 393 | ) 394 | 395 | def cleanup(): 396 | os.path.isdir('android') and shutil.rmtree('android') 397 | os.remove('tool.apk') 398 | os.remove('android.apk') 399 | 400 | process = await asyncio.create_subprocess_exec( 401 | 'java', '-jar', 'tool.apk', 'd', '-s', '-f', 'android.apk', 402 | stdout=asyncio.subprocess.PIPE, 403 | stderr=asyncio.subprocess.STDOUT 404 | ) 405 | await process.communicate() 406 | 407 | if process.returncode != 0: 408 | cleanup() 409 | return 410 | 411 | files_to_track = [ 412 | 'res/values/strings.xml', 413 | 'res/values/public.xml' 414 | ] 415 | await track_additional_files(files_to_track, 'android', crawled_data_folder) 416 | 417 | cleanup() 418 | 419 | 420 | def parse_string_with_possible_json(input_string) -> dict: 421 | # chat gtp powered code: 422 | try: 423 | # Attempt to parse the entire input string as JSON 424 | json_object = json.loads(input_string) 425 | except json.JSONDecodeError as e: 426 | # Regular expression to find JSON objects within the string 427 | json_regex = r'{[^{}]*}' 428 | matches = re.findall(json_regex, input_string) 429 | 430 | if matches: 431 | # Use the first match as the extracted JSON 432 | json_object = json.loads(matches[0]) 433 | else: 434 | raise ValueError('No JSON found within the input string.') 435 | 436 | return json_object 437 | 438 | 439 | async def crawl_mini_app_wallet(): 440 | crawled_data_folder = os.path.join(OUTPUT_MINI_APPS_FOLDER, 'wallet') 441 | 442 | def cleanup(): 443 | os.path.isdir('wallet') and shutil.rmtree('wallet') 444 | 445 | async def _run_unwebpack_sourcemap(url: str): 446 | process = await asyncio.create_subprocess_exec( 447 | 'python', 'unwebpack_sourcemap.py', '--make-directory', '--detect', url, 'wallet', 448 | ) 449 | await process.communicate() 450 | 451 | if process.returncode != 0: 452 | cleanup() 453 | raise RuntimeError('unwebpack_sourcemap failed') 454 | 455 | crawled_unpacked_folder = os.path.join('wallet', 'webpack', 'wallet-react-form') 456 | 457 | await _run_unwebpack_sourcemap('https://walletbot.me/') 458 | 459 | webpack_chunks_db_path = os.path.join(crawled_unpacked_folder, 'webpack', 'runtime', 'get javascript chunk filename') 460 | webpack_chunks_db 
= parse_string_with_possible_json(open(webpack_chunks_db_path, 'r').read()) 461 | for chunk_id, chunk_name in webpack_chunks_db.items(): 462 | await _run_unwebpack_sourcemap(f'https://walletbot.me/static/js/{chunk_id}.{chunk_name}.js') 463 | 464 | files_to_track = [] 465 | 466 | crawled_empty_0_folder = os.path.join(crawled_unpacked_folder, 'empty_0') 467 | crawled_src_folder = os.path.join(crawled_empty_0_folder, 'src') 468 | for root, folders, files in os.walk(crawled_src_folder): 469 | for file in files: 470 | files_to_track.append(os.path.join(root, file).replace(f'{crawled_empty_0_folder}/', '')) 471 | 472 | await track_additional_files(files_to_track, crawled_empty_0_folder, crawled_data_folder) 473 | 474 | cleanup() 475 | 476 | 477 | async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession) -> str: 478 | import cssutils 479 | from bs4 import BeautifulSoup 480 | 481 | css_parser = cssutils.CSSParser(loglevel=logging.FATAL, raiseExceptions=False) 482 | 483 | headers = {'X-Requested-With': 'XMLHttpRequest'} 484 | content = dict() 485 | 486 | async def _get_page(offset: int): 487 | logger.info(f'Url: {url}, offset: {offset}') 488 | data = {'offset': offset, 'more': 1} 489 | 490 | try: 491 | new_offset = None 492 | async with session.post( 493 | f'{PROTOCOL}{url}', data=data, headers=headers, allow_redirects=False, timeout=TIMEOUT 494 | ) as response: 495 | if (499 < response.status < 600) or (response.status != 200): 496 | logger.debug(f'Resend cuz {response.status}') 497 | new_offset = offset 498 | else: 499 | res_json = await response.json(encoding='UTF-8') 500 | if 'more_html' in res_json and res_json['more_html']: 501 | res_json['more_html'] = re.sub(TRANSLATE_SUGGESTION_REGEX, '', res_json['more_html']) 502 | 503 | soup = BeautifulSoup(res_json['more_html'], 'html.parser') 504 | tr_items = soup.find_all('div', {'class': 'tr-key-row-wrap'}) 505 | for tr_item in tr_items: 506 | tr_key = tr_item.find('div', {'class': 'tr-value-key'}).text 507 | 508 | tr_url = tr_item.find('div', {'class': 'tr-key-row'})['data-href'] 509 | tr_url = f'https://translations.telegram.org{tr_url}' 510 | 511 | tr_photo = tr_item.find('a', {'class': 'tr-value-photo'}) 512 | if tr_photo: 513 | tr_photo = css_parser.parseStyle(tr_photo['style']).backgroundImage[5:-2] 514 | 515 | tr_has_binding = tr_item.find('span', {'class': 'has-binding binding'}) 516 | tr_has_binding = tr_has_binding is not None 517 | 518 | tr_values = tr_item.find_all('span', {'class': 'value'}) 519 | tr_value_singular, *tr_value_plural = [tr_value.decode_contents() for tr_value in tr_values] 520 | tr_values = {'singular': tr_value_singular} 521 | if tr_value_plural: 522 | tr_values['plural'] = tr_value_plural[0] 523 | 524 | content[tr_key] = { 525 | 'url': tr_url, 526 | 'photo_url': tr_photo, 527 | 'has_binding': tr_has_binding, 528 | 'values': tr_values, 529 | } 530 | 531 | new_offset = offset + 200 532 | 533 | new_offset and await _get_page(new_offset) 534 | except (ServerDisconnectedError, TimeoutError, ClientConnectorError): 535 | logger.warning(f'Client or timeout error. 
Retrying {url}; offset {offset}') 536 | await _get_page(offset) 537 | 538 | await _get_page(0) 539 | 540 | content = dict(sorted(content.items())) 541 | return json.dumps(content, indent=4, ensure_ascii=False) 542 | 543 | 544 | async def track_mtproto_methods(): 545 | ##################### 546 | # PATH BROKEN PYROGRAM 547 | import pkgutil 548 | from pathlib import Path 549 | pyrogram_path = Path(pkgutil.get_loader('pyrogram').path).parent 550 | broken_class_path = os.path.join(pyrogram_path, 'raw', 'types', 'story_fwd_header.py') 551 | with open(broken_class_path, 'w', encoding='UTF-8') as f: 552 | # I rly don't want to fix bug in pyrogram about using reserved words as argument names 553 | f.write('class StoryFwdHeader: ...') 554 | ##################### 555 | 556 | from pyrogram import Client 557 | 558 | kw = { 559 | 'api_id': int(os.environ['TELEGRAM_API_ID']), 560 | 'api_hash': os.environ['TELEGRAM_API_HASH'], 561 | 'app_version': '@tgcrawl', 562 | 'in_memory': True 563 | } 564 | 565 | test_dc = 2 566 | test_phone_prefix = '99966' 567 | test_phone_suffix = os.environ.get('TELEGRAM_TEST_PHONE_SUFFIX', random.randint(1000, 9999)) 568 | test_phone_number = f'{test_phone_prefix}{test_dc}{test_phone_suffix}' 569 | test_phone_code = str(test_dc) * 5 570 | 571 | app_test = Client('crawler_test', phone_number=test_phone_number, phone_code=test_phone_code, test_mode=True, **kw) 572 | app = Client('crawler', session_string=os.environ['TELEGRAM_SESSION'], **kw) 573 | 574 | await asyncio.gather(app_test.start(), app.start()) 575 | await asyncio.gather(_fetch_and_track_mtproto(app, ''), _fetch_and_track_mtproto(app_test, 'test')) 576 | 577 | 578 | async def _fetch_and_track_mtproto(app, output_dir): 579 | from pyrogram.raw import functions 580 | from pyrogram.raw.types import InputStickerSetShortName 581 | 582 | configs = { 583 | 'GetConfig': await app.invoke(functions.help.GetConfig()), 584 | 'GetCdnConfig': await app.invoke(functions.help.GetCdnConfig()), 585 | # 'GetInviteText': await app.invoke(functions.help.GetInviteText()), 586 | # 'GetSupport': await app.invoke(functions.help.GetSupport()), 587 | # 'GetSupportName': await app.invoke(functions.help.GetSupportName()), 588 | # 'GetPassportConfig': await app.invoke(functions.help.GetPassportConfig(hash=0)), 589 | 'GetCountriesList': await app.invoke(functions.help.GetCountriesList(lang_code='en', hash=0)), 590 | 'GetAppConfig': await app.invoke(functions.help.GetAppConfig(hash=0)), 591 | # 'GetAppUpdate': await app.invoke(functions.help.GetAppUpdate(source='')), 592 | # 'AnimatedEmoji': await app.invoke( 593 | # functions.messages.GetStickerSet(stickerset=InputStickerSetAnimatedEmoji(), hash=0) 594 | # ), 595 | 'GetAvailableReactions': await app.invoke(functions.messages.GetAvailableReactions(hash=0)), 596 | 'GetPremiumPromo': await app.invoke(functions.help.GetPremiumPromo()), 597 | } 598 | 599 | sticker_set_short_names = { 600 | 'EmojiAnimations', 601 | 'EmojiAroundAnimations', 602 | 'EmojiShortAnimations', 603 | 'EmojiAppearAnimations', 604 | 'EmojiCenterAnimations', 605 | 'AnimatedEmojies', 606 | 'EmojiGenericAnimations', 607 | } 608 | 609 | if app.test_mode: 610 | sticker_set_short_names.add('PremiumGifts') 611 | sticker_set_short_names.add('StatusEmojiWhite') 612 | else: 613 | sticker_set_short_names.add('UtyaDuckFull') 614 | sticker_set_short_names.add('GiftsPremium') 615 | sticker_set_short_names.add('StatusPack') 616 | sticker_set_short_names.add('RestrictedEmoji') 617 | 618 | for short_name in sticker_set_short_names: 619 | sticker_set 
= await app.invoke(functions.messages.GetStickerSet( 620 | stickerset=InputStickerSetShortName(short_name=short_name), hash=0 621 | )) 622 | configs[f'sticker_set/{short_name}'] = sticker_set 623 | 624 | bots_usernames_to_track = {'BotFather', 'DurgerKingBot', 'asmico_attach_bot'} 625 | if app.test_mode: 626 | bots_usernames_to_track.add('izpremiumbot') 627 | else: 628 | bots_usernames_to_track.add('PremiumBot') 629 | 630 | bots_usernames_to_track.clear() 631 | for bot_username in bots_usernames_to_track: 632 | bot_peer = await app.resolve_peer(bot_username) 633 | bot_full = (await app.invoke(functions.users.GetFullUser(id=bot_peer))) 634 | configs[f'bot/{bot_username}'] = f'{{"full_user": {str(bot_full.full_user)}, "users": {str(bot_full.users)}}}' 635 | 636 | peers_to_track = set() 637 | if not app.test_mode: 638 | peers_to_track.add('invoice') 639 | peers_to_track.add('premium') 640 | 641 | peers_to_track.clear() 642 | for peer_id in peers_to_track: 643 | peer = await app.resolve_peer(peer_id) 644 | configs[f'peer/{peer_id}'] = peer 645 | 646 | configs['GetPremiumPromo'].users = [] 647 | configs['GetPremiumPromo'].status_text = 'crawler' 648 | configs['GetPremiumPromo'].status_entities = [] 649 | configs['GetPremiumPromo'].period_options = [] 650 | 651 | configs['GetAppConfig'].hash = 'crawler' 652 | 653 | keys_to_hide = {'access_hash', 'autologin_token', 'file_reference', 'file_reference_base64', 'pending_suggestions'} 654 | if app.test_mode: 655 | keys_to_hide.add('dialog_filters_tooltip') 656 | 657 | def rem_rec(config): 658 | if not isinstance(config, dict): 659 | return 660 | 661 | for key, value in config.items(): 662 | if isinstance(value, dict): 663 | rem_rec(value) 664 | elif isinstance(value, list): 665 | for item in value: 666 | rem_rec(item) 667 | elif key == 'key' and value in keys_to_hide: 668 | config['value']['value'] = 'crawler' 669 | elif key in keys_to_hide: 670 | config[key] = 'crawler' 671 | 672 | methods_to_filter = {'GetAppConfig', 'GetAvailableReactions', 'GetPremiumPromo'} 673 | sticker_sets_to_filter = {f'sticker_set/{name}' for name in sticker_set_short_names} 674 | bots_to_filter = {f'bot/{name}' for name in bots_usernames_to_track} 675 | peers_to_filter = {f'peer/{name}' for name in peers_to_track} 676 | 677 | combined_filter = methods_to_filter | sticker_sets_to_filter | bots_to_filter | peers_to_filter 678 | for config_name in combined_filter: 679 | configs[config_name] = json.loads(str(configs[config_name])) 680 | rem_rec(configs[config_name]) 681 | configs[config_name] = json.dumps(configs[config_name], ensure_ascii=False, indent=4) 682 | 683 | configs['GetConfig'].date = 0 684 | configs['GetConfig'].expires = 0 685 | configs['GetConfig'].autologin_token = 'crawler' 686 | configs['GetConfig'].dc_options = [] 687 | 688 | for file, content in configs.items(): 689 | filename = os.path.join(OUTPUT_MTPROTO_FOLDER, output_dir, f'{file}.json') 690 | os.makedirs(os.path.dirname(filename), exist_ok=True) 691 | async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file: 692 | await w_file.write(str(content)) 693 | 694 | await app.stop() 695 | 696 | 697 | def is_hashable_only_content_type(content_type) -> bool: 698 | hashable_only_content_types = ( 699 | 'png', 700 | 'jpeg', 701 | 'x-icon', 702 | 'gif', 703 | 'mp4', 704 | 'webm', 705 | 'zip', 706 | 'stream', 707 | ) 708 | 709 | for hashable_only_content_type in hashable_only_content_types: 710 | if hashable_only_content_type in content_type: 711 | return True 712 | 713 | return False 714 | 715 | 716 | 
class RetryError(Exception): 717 | def __init__(self, message: str, new_url: Optional[str] = None): 718 | super().__init__(message) 719 | self.new_url = new_url 720 | 721 | 722 | async def crawl(url: str, session: aiohttp.ClientSession, output_dir: str): 723 | while True: 724 | try: 725 | await _crawl(url, session, output_dir) 726 | except (RetryError, ServerDisconnectedError, TimeoutError, ClientConnectorError) as e: 727 | if isinstance(e, RetryError) and e.new_url is not None: 728 | url = e.new_url 729 | logger.warning(f'Client or timeout error ({e}). Retrying {url}') 730 | else: 731 | break 732 | 733 | 734 | SLASH_RETRY_ATTEMPTED = set() 735 | 736 | 737 | async def _crawl(url: str, session: aiohttp.ClientSession, output_dir: str): 738 | truncated_url = (url[:100] + '...') if len(url) > 100 else url 739 | 740 | logger.debug(f'Process {truncated_url}') 741 | async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT, headers=HEADERS) as response: 742 | code = response.status 743 | if 499 < code < 600: 744 | msg = f'Error 5XX. Retrying {truncated_url}' 745 | logger.warning(msg) 746 | raise RetryError(msg) 747 | 748 | if code not in {200, 304} and url not in CRAWL_STATUS_CODE_EXCLUSIONS: 749 | if code in {301, 302, 404}: 750 | base_url = url.rstrip('/') 751 | if base_url not in SLASH_RETRY_ATTEMPTED: 752 | if url.endswith('/'): 753 | flipped_url = base_url 754 | logger.warning(f'{code} slash removal retry for {truncated_url}') 755 | else: 756 | flipped_url = f'{url}/' 757 | logger.warning(f'{code} slash addition retry for {truncated_url}') 758 | 759 | SLASH_RETRY_ATTEMPTED.add(base_url) 760 | raise RetryError(f'{code} slash retry for {truncated_url}', new_url=flipped_url) 761 | 762 | content = await response.text() 763 | clean_content = content.replace('\n', ' ').replace('\r', ' ') 764 | truncated_content = (clean_content[:200] + '...') if len(clean_content) > 200 else clean_content 765 | logger.warning(f'Skip [{code}] {truncated_url}: {truncated_content}') 766 | 767 | return 768 | 769 | # bypass external slashes and so on 770 | url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS] 771 | 772 | content_type = response.content_type 773 | 774 | # handle pure domains and html pages without ext in url as html do enable syntax highlighting 775 | page_type, _ = mimetypes.guess_type(url) 776 | 777 | ext = '' 778 | if page_type: 779 | ext = mimetypes.guess_extension(page_type) or '' 780 | if ext != '' and url.endswith(ext): 781 | ext = '' 782 | 783 | if url.endswith('.tl'): 784 | page_type = 'text/plain' 785 | 786 | if page_type is None or len(url_parts) == 1: 787 | ext = '.html' 788 | content_type = 'text/html' 789 | 790 | if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url) or 'td.telegram.org/current' in url: 791 | ext = '.json' 792 | content_type = 'application/json' 793 | 794 | is_hashable_only = is_hashable_only_content_type(content_type) 795 | # amazing dirt for media files like 796 | # telegram.org/file/811140591/1/q7zZHjgES6s/9d121a89ffb0015837 797 | # with response content type HTML instead of image. 798 | # shame on you. 799 | # sometimes it returns a correct type. 800 | # noice load balancing 801 | is_sucking_file = '/file/' in url and 'text' in content_type 802 | 803 | # I don't add ext by content type for images, and so on cuz TG servers suck. 804 | # Some servers do not return a correct content type. 805 | # Some servers do... 
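    # (Presumably the point of hash-only storage: binary assets are written out
    # as a .sha256 digest instead of raw bytes, so a changed file is still
    # detected and committed without keeping the file itself in the repo.)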
806 | if is_hashable_only or is_sucking_file: 807 | ext = '.sha256' 808 | 809 | filename = os.path.join(output_dir, *url_parts) + ext 810 | os.makedirs(os.path.dirname(filename), exist_ok=True) 811 | 812 | if is_sucking_file or is_hashable_only: 813 | content = await response.read() 814 | async with aiofiles.open(filename, 'w', encoding='utf-8') as f: 815 | await f.write(get_hash(content)) 816 | return 817 | 818 | content = await response.text(encoding='UTF-8') 819 | if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url): 820 | content = await collect_translations_paginated_content(url, session) 821 | 822 | content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content) 823 | content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content) 824 | content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content) 825 | content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content) 826 | content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content) 827 | content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content) 828 | content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content) 829 | content = re.sub(TON_RATE_REGEX, TON_RATE_TEMPLATE, content) 830 | content = re.sub(APK_BETA_TOKEN_REGEX, APK_BETA_TOKEN_TEMPLATE, content) 831 | 832 | # there is a problem with the files with the same name (in the same path) but different case 833 | # the content is random because of the async 834 | # there is only one page with this problem, for now: 835 | # - corefork.telegram.org/constructor/Updates 836 | # - corefork.telegram.org/constructor/updates 837 | async with aiofiles.open(filename, 'w', encoding='utf-8') as f: 838 | logger.debug(f'Write to {filename}') 839 | await f.write(content) 840 | 841 | 842 | async def _crawl_web(session: aiohttp.ClientSession, input_filename: str, output_folder=None): 843 | with open(input_filename, 'r') as f: 844 | tracked_urls = set([l.replace('\n', '') for l in f.readlines()]) 845 | 846 | await asyncio.gather(*[crawl(url, session, output_folder) for url in tracked_urls]) 847 | 848 | 849 | async def crawl_web(session: aiohttp.ClientSession): 850 | await _crawl_web(session, INPUT_FILENAME, OUTPUT_SITES_FOLDER) 851 | 852 | 853 | async def crawl_web_res(session: aiohttp.ClientSession): 854 | await _crawl_web(session, INPUT_RES_FILENAME, OUTPUT_RESOURCES_FOLDER) 855 | 856 | 857 | async def _collect_and_track_all_translation_keys(): 858 | translations = dict() 859 | 860 | start_folder = 'en/' 861 | file_format = '.json' 862 | output_filename = 'translation_keys.json' 863 | 864 | for root, folder, files in os.walk(OUTPUT_TRANSLATIONS_FOLDER): 865 | for file in files: 866 | if not file.endswith(file_format) or file == output_filename: 867 | continue 868 | 869 | async with aiofiles.open(os.path.join(root, file), encoding='utf-8') as f: 870 | content = json.loads(await f.read()) 871 | 872 | client = root[root.index(start_folder) + len(start_folder):] 873 | if client not in translations: 874 | translations[client] = list() 875 | 876 | translations[client].extend(content.keys()) 877 | 878 | for client in translations.keys(): 879 | translations[client] = sorted(translations[client]) 880 | 881 | translations = dict(sorted(translations.items())) 882 | 883 | async with aiofiles.open(os.path.join(OUTPUT_TRANSLATIONS_FOLDER, output_filename), 'w', encoding='utf-8') as f: 884 | await f.write(json.dumps(translations, indent=4)) 885 | 886 | 887 | async def crawl_web_tr(session: aiohttp.ClientSession): 888 | await _crawl_web(session, INPUT_TR_FILENAME, 
OUTPUT_TRANSLATIONS_FOLDER) 889 | await _collect_and_track_all_translation_keys() 890 | 891 | 892 | async def start(mode: str): 893 | # Optimized TCP connector for web crawling 894 | tcp_connector = aiohttp.TCPConnector( 895 | ssl=False, # Disable SSL verification for crawling 896 | use_dns_cache=False, # Disable DNS caching 897 | force_close=True, # Force close connections after use 898 | family=socket.AF_INET, # Use IPv4 only to avoid potential IPv6 issues 899 | ) 900 | 901 | async with aiohttp.ClientSession(connector=tcp_connector, trust_env=True) as session: 902 | mode == 'all' and await asyncio.gather( 903 | crawl_web(session), 904 | crawl_web_res(session), 905 | crawl_web_tr(session), 906 | track_mtproto_methods(), 907 | download_telegram_android_beta_and_extract_resources(session), 908 | download_telegram_macos_beta_and_extract_resources(session), 909 | download_telegram_ios_beta_and_extract_resources(session), 910 | crawl_mini_app_wallet(), 911 | ) 912 | mode == 'web' and await asyncio.gather( 913 | crawl_web(session), 914 | ) 915 | mode == 'web_res' and await asyncio.gather( 916 | crawl_web_res(session), 917 | ) 918 | mode == 'web_tr' and await asyncio.gather( 919 | crawl_web_tr(session), 920 | ) 921 | mode == 'server' and await asyncio.gather( 922 | track_mtproto_methods(), 923 | ) 924 | mode == 'client' and await asyncio.gather( 925 | download_telegram_android_and_extract_resources(session), 926 | download_telegram_macos_beta_and_extract_resources(session), 927 | download_telegram_ios_beta_and_extract_resources(session), 928 | ) 929 | mode == 'mini_app' and await asyncio.gather( 930 | crawl_mini_app_wallet(), 931 | ) 932 | 933 | 934 | if __name__ == '__main__': 935 | run_mode = 'all' 936 | if 'MODE' in os.environ: 937 | run_mode = os.environ['MODE'] 938 | 939 | start_time = time() 940 | logger.info(f'Start crawling content of tracked urls...') 941 | uvloop.run(start(run_mode)) 942 | logger.info(f'Stop crawling content in mode {run_mode}. {time() - start_time} sec.') 943 | --------------------------------------------------------------------------------