├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── ignore-list ├── pipeline.py └── telegram.lua /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc 3 | wget-lua 4 | wget-at 5 | STOP 6 | BANNED 7 | data/ 8 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM atdr.meo.ws/archiveteam/grab-base 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <https://unlicense.org> 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # telegram-grab 2 | 3 | More information about the archiving project can be found on the ArchiveTeam wiki: [Telegram](https://wiki.archiveteam.org/index.php?title=Telegram) 4 | 5 | ## Setup instructions 6 | 7 | ### General instructions 8 | 9 | Data integrity is very important in Archive Team projects. Please note the following important rules: 10 | 11 | * [Do not use proxies or VPNs](https://wiki.archiveteam.org/index.php/ArchiveTeam_Warrior#Can_I_use_whatever_internet_access_for_the_Warrior?). 12 | * Run the project using either the Warrior or the project-specific Docker container as listed below. [Do not modify project code](https://wiki.archiveteam.org/index.php/ArchiveTeam_Warrior#I'd_like_to_help_write_code_or_I_want_to_tweak_the_scripts_to_run_to_my_liking._Where_can_I_find_more_info?_Where_is_the_source_code_and_repository?). Compiling the project dependencies yourself is no longer supported. 13 | * You can share your tracker nickname(s) across machine(s) you personally operate, but not with machines operated by other users. Nickname sharing makes it harder to inspect data if a problem arises. 14 | * [Use clean internet connections](https://wiki.archiveteam.org/index.php/ArchiveTeam_Warrior#Can_I_use_whatever_internet_access_for_the_Warrior?). 15 | * Only x64-based machines are supported. 
[ARM (used on Raspberry Pi and Apple Silicon Macs) is not currently supported](https://wiki.archiveteam.org/index.php/ArchiveTeam_Warrior#Can_I_run_the_Warrior_on_ARM_or_some_other_unusual_architecture?). 16 | * See the [Archive Team Wiki](https://wiki.archiveteam.org/index.php/ArchiveTeam_Warrior#Warrior_FAQ) for additional information. 17 | 18 | We strongly encourage you to join the IRC channel associated with this project in order to be informed about project updates and other important announcements, as well as to be reachable in the event of an issue. The Archive Team Wiki has [more information about IRC](https://wiki.archiveteam.org/index.php/Archiveteam:IRC). We can be found at hackint IRC [#telegrab](https://webirc.hackint.org/#irc://irc.hackint.org/#telegrab). 19 | 20 | **If you have any questions or issues during setup, please review the wiki pages or contact us on IRC for troubleshooting information.** 21 | 22 | ### Running the project 23 | 24 | #### Archive Team Warrior (recommended for most users) 25 | 26 | This and other archiving projects can easily be run using the [Archive Team Warrior](https://wiki.archiveteam.org/index.php/ArchiveTeam_Warrior) virtual machine. Follow the [instructions on the Archive Team wiki](https://wiki.archiveteam.org/index.php/ArchiveTeam_Warrior) for installing the Warrior, and from the web interface running at `http://localhost:8001/`, enter the nickname that you want to be shown as on the tracker. There is no registration, just pick a nickname you like. Then, select the `Telegram` project in the Warrior interface. 27 | 28 | #### Project-specific Docker container (for more advanced users) 29 | 30 | Alternatively, more advanced users can also run projects using Docker. While users of the Warrior can switch between projects using a web interface, Docker containers are specific to each project. However, while the Warrior supports a maximum of 6 concurrent items, a Docker container supports a maximum of 20 concurrent items. The instructions below are a short overview. For more information and detailed explanations of the commands, follow the [Docker instructions on the Archive Team wiki](https://wiki.archiveteam.org/index.php/Running_Archive_Team_Projects_with_Docker). 31 | 32 | It is advised to use [Watchtower](https://github.com/containrrr/watchtower) to automatically update the project container: 33 | 34 | docker run -d --name watchtower --restart=unless-stopped -v /var/run/docker.sock:/var/run/docker.sock containrrr/watchtower --label-enable --cleanup --interval 3600 --include-restarting 35 | 36 | after which the project container can be run: 37 | 38 | docker run -d --name archiveteam --label=com.centurylinklabs.watchtower.enable=true --log-driver json-file --log-opt max-size=50m --restart=unless-stopped atdr.meo.ws/archiveteam/telegram-grab --concurrent 1 YOURNICKHERE 39 | 40 | Be sure to replace `YOURNICKHERE` with the nickname that you want to be shown as on the tracker. There is no registration, just pick a nickname you like. 41 | 42 | ### Supporting Archive Team 43 | 44 | Behind the scenes, Archive Team has infrastructure to run the projects and process the data. If you would like to help out with the costs of our infrastructure, a donation on our [Open Collective](https://opencollective.com/archiveteam) would be very welcome. 45 | 46 | ### Issues in the code 47 | 48 | If you notice a bug and want to file a bug report, please use the GitHub issues tracker. 49 | 50 | Are you a developer? Help write code for us! 
Look at our [developer documentation](https://wiki.archiveteam.org/index.php?title=Dev) for details. 51 | 52 | ### Other problems 53 | 54 | Have an issue not listed here? Join us on IRC and ask! We can be found at hackint IRC [#telegrab](https://webirc.hackint.org/#irc://irc.hackint.org/#telegrab). 55 | 56 | 57 | -------------------------------------------------------------------------------- /ignore-list: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArchiveTeam/telegram-grab/61ab68d65aab9ced9cec890de549abe20550a0ae/ignore-list -------------------------------------------------------------------------------- /pipeline.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import datetime 3 | from distutils.version import StrictVersion 4 | import hashlib 5 | import os.path 6 | import random 7 | import re 8 | from seesaw.config import realize, NumberConfigValue 9 | from seesaw.externalprocess import ExternalProcess 10 | from seesaw.item import ItemInterpolation, ItemValue 11 | from seesaw.task import SimpleTask, LimitConcurrent 12 | from seesaw.tracker import GetItemFromTracker, PrepareStatsForTracker, \ 13 | UploadWithTracker, SendDoneToTracker 14 | import shutil 15 | import socket 16 | import subprocess 17 | import sys 18 | import time 19 | import string 20 | import urllib.parse 21 | 22 | import seesaw 23 | from seesaw.externalprocess import WgetDownload 24 | from seesaw.pipeline import Pipeline 25 | from seesaw.project import Project 26 | from seesaw.util import find_executable 27 | 28 | from tornado import httpclient 29 | 30 | import requests 31 | import zstandard 32 | 33 | if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'): 34 | raise Exception('This pipeline needs seesaw version 0.8.5 or higher.') 35 | 36 | 37 | ########################################################################### 38 | # Find a useful Wget+Lua executable. 39 | # 40 | # WGET_AT will be set to the first path that 41 | # 1. does not crash with --version, and 42 | # 2. prints the required version string 43 | 44 | class HigherVersion: 45 | def __init__(self, expression, min_version): 46 | self._expression = re.compile(expression) 47 | self._min_version = min_version 48 | 49 | def search(self, text): 50 | for result in self._expression.findall(text): 51 | if result >= self._min_version: 52 | print('Found version {}.'.format(result)) 53 | return True 54 | 55 | WGET_AT = find_executable( 56 | 'Wget+AT', 57 | HigherVersion( 58 | r'(GNU Wget 1\.[0-9]{2}\.[0-9]{1}-at\.[0-9]{8}\.[0-9]{2})[^0-9a-zA-Z\.-_]', 59 | 'GNU Wget 1.21.3-at.20241119.01' 60 | ), 61 | [ 62 | './wget-at', 63 | '/home/warrior/data/wget-at' 64 | ] 65 | ) 66 | 67 | if not WGET_AT: 68 | raise Exception('No usable Wget+At found.') 69 | 70 | 71 | ########################################################################### 72 | # The version number of this pipeline definition. 73 | # 74 | # Update this each time you make a non-cosmetic change. 75 | # It will be added to the WARC files and reported to the tracker. 76 | VERSION = '20250102.02' 77 | USER_AGENT = 'Archive Team' 78 | TRACKER_ID = 'telegram' 79 | TRACKER_HOST = 'legacy-api.arpa.li' 80 | MULTI_ITEM_SIZE = 100 81 | 82 | 83 | ########################################################################### 84 | # This section defines project-specific tasks. 
85 | # 86 | # Simple tasks (tasks that do not need any concurrency) are based on the 87 | # SimpleTask class and have a process(item) method that is called for 88 | # each item. 89 | class CheckIP(SimpleTask): 90 | def __init__(self): 91 | SimpleTask.__init__(self, 'CheckIP') 92 | self._counter = 0 93 | 94 | def process(self, item): 95 | # NEW for 2014! Check if we are behind firewall/proxy 96 | 97 | if self._counter <= 0: 98 | item.log_output('Checking IP address.') 99 | ip_set = set() 100 | 101 | ip_set.add(socket.gethostbyname('twitter.com')) 102 | #ip_set.add(socket.gethostbyname('facebook.com')) 103 | ip_set.add(socket.gethostbyname('youtube.com')) 104 | ip_set.add(socket.gethostbyname('microsoft.com')) 105 | ip_set.add(socket.gethostbyname('icanhas.cheezburger.com')) 106 | ip_set.add(socket.gethostbyname('archiveteam.org')) 107 | 108 | if len(ip_set) != 5: 109 | item.log_output('Got IP addresses: {0}'.format(ip_set)) 110 | item.log_output( 111 | 'Are you behind a firewall/proxy? That is a big no-no!') 112 | raise Exception( 113 | 'Are you behind a firewall/proxy? That is a big no-no!') 114 | 115 | # Check only occasionally 116 | if self._counter <= 0: 117 | self._counter = 10 118 | else: 119 | self._counter -= 1 120 | 121 | 122 | class PrepareDirectories(SimpleTask): 123 | def __init__(self, warc_prefix): 124 | SimpleTask.__init__(self, 'PrepareDirectories') 125 | self.warc_prefix = warc_prefix 126 | 127 | def process(self, item): 128 | item_name = item['item_name'] 129 | item_name_hash = hashlib.sha1(item_name.encode('utf8')).hexdigest() 130 | escaped_item_name = item_name_hash 131 | dirname = '/'.join((item['data_dir'], escaped_item_name)) 132 | 133 | if os.path.isdir(dirname): 134 | shutil.rmtree(dirname) 135 | 136 | os.makedirs(dirname) 137 | 138 | item['item_dir'] = dirname 139 | item['warc_file_base'] = '-'.join([ 140 | self.warc_prefix, 141 | item_name_hash, 142 | time.strftime('%Y%m%d-%H%M%S') 143 | ]) 144 | 145 | open('%(item_dir)s/%(warc_file_base)s.warc.zst' % item, 'w').close() 146 | open('%(item_dir)s/%(warc_file_base)s_data.txt' % item, 'w').close() 147 | 148 | class MoveFiles(SimpleTask): 149 | def __init__(self): 150 | SimpleTask.__init__(self, 'MoveFiles') 151 | 152 | def process(self, item): 153 | os.rename('%(item_dir)s/%(warc_file_base)s.warc.zst' % item, 154 | '%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst' % item) 155 | os.rename('%(item_dir)s/%(warc_file_base)s_data.txt' % item, 156 | '%(data_dir)s/%(warc_file_base)s_data.txt' % item) 157 | 158 | shutil.rmtree('%(item_dir)s' % item) 159 | 160 | 161 | class SetBadUrls(SimpleTask): 162 | def __init__(self): 163 | SimpleTask.__init__(self, 'SetBadUrls') 164 | 165 | def process(self, item): 166 | item['item_name_original'] = item['item_name'] 167 | items = item['item_name'].split('\0') 168 | items_lower = [s.lower().split('#', 1)[0] for s in items] 169 | items_lower = [urllib.parse.unquote(s) for s in items_lower] 170 | with open('%(item_dir)s/%(warc_file_base)s_bad-items.txt' % item, 'r') as f: 171 | for aborted_item in f: 172 | aborted_item = urllib.parse.unquote(aborted_item.strip().lower()) 173 | index = items_lower.index(aborted_item) 174 | item.log_output('Item {} is aborted.'.format(aborted_item)) 175 | items.pop(index) 176 | items_lower.pop(index) 177 | item['item_name'] = '\0'.join(items) 178 | 179 | 180 | class MaybeSendDoneToTracker(SendDoneToTracker): 181 | def enqueue(self, item): 182 | if len(item['item_name']) == 0: 183 | return self.complete_item(item) 184 | return 
super(MaybeSendDoneToTracker, self).enqueue(item) 185 | 186 | 187 | def get_hash(filename): 188 | with open(filename, 'rb') as in_file: 189 | return hashlib.sha1(in_file.read()).hexdigest() 190 | 191 | CWD = os.getcwd() 192 | PIPELINE_SHA1 = get_hash(os.path.join(CWD, 'pipeline.py')) 193 | LUA_SHA1 = get_hash(os.path.join(CWD, 'telegram.lua')) 194 | 195 | def stats_id_function(item): 196 | d = { 197 | 'pipeline_hash': PIPELINE_SHA1, 198 | 'lua_hash': LUA_SHA1, 199 | 'python_version': sys.version, 200 | } 201 | 202 | return d 203 | 204 | 205 | class ZstdDict(object): 206 | created = 0 207 | data = None 208 | 209 | @classmethod 210 | def get_dict(cls): 211 | if cls.data is not None and time.time() - cls.created < 1800: 212 | return cls.data 213 | response = requests.get( 214 | 'https://legacy-api.arpa.li/dictionary', 215 | params={ 216 | 'project': TRACKER_ID 217 | } 218 | ) 219 | response.raise_for_status() 220 | response = response.json() 221 | if cls.data is not None and response['id'] == cls.data['id']: 222 | cls.created = time.time() 223 | return cls.data 224 | print('Downloading latest dictionary.') 225 | response_dict = requests.get(response['url']) 226 | response_dict.raise_for_status() 227 | raw_data = response_dict.content 228 | if hashlib.sha256(raw_data).hexdigest() != response['sha256']: 229 | raise ValueError('Hash of downloaded dictionary does not match.') 230 | if raw_data[:4] == b'\x28\xB5\x2F\xFD': 231 | raw_data = zstandard.ZstdDecompressor().decompress(raw_data) 232 | cls.data = { 233 | 'id': response['id'], 234 | 'dict': raw_data 235 | } 236 | cls.created = time.time() 237 | return cls.data 238 | 239 | 240 | class WgetArgs(object): 241 | post_chars = string.digits + string.ascii_lowercase 242 | 243 | def int_to_str(self, i): 244 | d, m = divmod(i, 36) 245 | if d > 0: 246 | return self.int_to_str(d) + self.post_chars[m] 247 | return self.post_chars[m] 248 | 249 | def realize(self, item): 250 | wget_args = [ 251 | WGET_AT, 252 | '-U', USER_AGENT, 253 | '-nv', 254 | '--host-lookups', 'dns', 255 | '--hosts-file', '/dev/null', 256 | '--resolvconf-file', '/dev/null', 257 | '--dns-servers', '9.9.9.10,149.112.112.10,2620:fe::10,2620:fe::fe:10', 258 | '--reject-reserved-subnets', 259 | '--content-on-error', 260 | '--lua-script', 'telegram.lua', 261 | '-o', ItemInterpolation('%(item_dir)s/wget.log'), 262 | '--no-check-certificate', 263 | '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), 264 | '--truncate-output', 265 | '-e', 'robots=off', 266 | '--rotate-dns', 267 | '--recursive', '--level=inf', 268 | '--no-parent', 269 | '--page-requisites', 270 | '--timeout', '30', 271 | '--tries', 'inf', 272 | '--domains', 't.me,telegram.org', 273 | '--span-hosts', 274 | '--waitretry', '30', 275 | '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), 276 | '--warc-header', 'operator: Archive Team', 277 | '--warc-header', 'x-wget-at-project-version: ' + VERSION, 278 | '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID, 279 | '--warc-dedup-url-agnostic', 280 | '--warc-compression-use-zstd', 281 | '--warc-zstd-dict-no-include', 282 | '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8', 283 | '--secure-protocol', 'TLSv1_2' 284 | ] 285 | dict_data = ZstdDict.get_dict() 286 | with open(os.path.join(item['item_dir'], 'zstdict'), 'wb') as f: 287 | f.write(dict_data['dict']) 288 | item['dict_id'] = dict_data['id'] 289 | item['dict_project'] = TRACKER_ID 290 | wget_args.extend([ 291 | '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'), 292 | ]) 293 | 294 | 
if 'PREFER_IPV4' in os.environ: 295 | wget_args.extend(['--prefer-family', 'IPv4']) 296 | elif 'PREFER_IPV6' in os.environ: 297 | wget_args.extend(['--prefer-family', 'IPv6']) 298 | 299 | item['item_name'] = '\0'.join( 300 | s for s in item['item_name'].split('\0') 301 | if not s.startswith('user:') and not s.startswith('channel:+') 302 | ) 303 | 304 | for item_name in item['item_name'].split('\0'): 305 | wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name]) 306 | wget_args.append('item-name://'+item_name) 307 | item_type, item_value = item_name.split(':', 1) 308 | item_value = item_value.split('#', 1)[0] 309 | if item_type == 'post': 310 | group, post_id = item_value.split(':', 1) 311 | wget_args.extend(['--warc-header', 'telegram-post: {}/{}'.format(group, post_id)]) 312 | wget_args.append('https://t.me/{}/{}?embed=1'.format(group, post_id)) 313 | elif item_type == 'channel': 314 | wget_args.extend(['--warc-header', 'telegram-channel: '+item_value]) 315 | wget_args.append('https://t.me/s/'+item_value) 316 | elif item_type == 'comment': 317 | group, post_id, comment_id = item_value.split(':', 2) 318 | wget_args.extend(['--warc-header', 'telegram-comment: {}/{}?comment={}'.format(group, post_id, comment_id)]) 319 | wget_args.append('https://t.me/{}/{}?comment={}'.format(group, post_id, comment_id)) 320 | #elif item_type == 'url': 321 | # wget_args.extend(['--warc-header', 'telegram-resource: '+item_value]) 322 | # wget_args.append(item_value) 323 | else: 324 | raise Exception('Unknown item') 325 | 326 | if 'bind_address' in globals(): 327 | wget_args.extend(['--bind-address', globals()['bind_address']]) 328 | print('') 329 | print('*** Wget will bind address at {0} ***'.format( 330 | globals()['bind_address'])) 331 | print('') 332 | 333 | return realize(wget_args, item) 334 | 335 | ########################################################################### 336 | # Initialize the project. 337 | # 338 | # This will be shown in the warrior management panel. The logo should not 339 | # be too big. The deadline is optional. 340 | project = Project( 341 | title='Telegram', 342 | project_html=''' 343 | 344 |

<h2>telegram.org (<a href="https://telegram.org/">Website</a> &middot; <a href="https://tracker.archiveteam.org/telegram/">Leaderboard</a>)</h2> <!-- links reconstructed from stripped HTML; leaderboard URL assumed --> 345 | <p>Archiving public Telegram channels.</p>

346 | ''' 347 | ) 348 | 349 | pipeline = Pipeline( 350 | CheckIP(), 351 | GetItemFromTracker('http://{}/{}/multi={}/' 352 | .format(TRACKER_HOST, TRACKER_ID, MULTI_ITEM_SIZE), 353 | downloader, VERSION), 354 | PrepareDirectories(warc_prefix=TRACKER_ID), 355 | WgetDownload( 356 | WgetArgs(), 357 | max_tries=2, 358 | accept_on_exit_code=[0, 4, 8], 359 | env={ 360 | 'item_dir': ItemValue('item_dir'), 361 | 'warc_file_base': ItemValue('warc_file_base') 362 | } 363 | ), 364 | SetBadUrls(), 365 | PrepareStatsForTracker( 366 | defaults={'downloader': downloader, 'version': VERSION}, 367 | file_groups={ 368 | 'data': [ 369 | ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst') 370 | ] 371 | }, 372 | id_function=stats_id_function, 373 | ), 374 | MoveFiles(), 375 | LimitConcurrent(NumberConfigValue(min=1, max=20, default='20', 376 | name='shared:rsync_threads', title='Rsync threads', 377 | description='The maximum number of concurrent uploads.'), 378 | UploadWithTracker( 379 | 'http://%s/%s' % (TRACKER_HOST, TRACKER_ID), 380 | downloader=downloader, 381 | version=VERSION, 382 | files=[ 383 | ItemInterpolation('%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'), 384 | ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt') 385 | ], 386 | rsync_target_source_path=ItemInterpolation('%(data_dir)s/'), 387 | rsync_extra_args=[ 388 | '--recursive', 389 | '--min-size', '1', 390 | '--no-compress', 391 | '--compress-level', '0', 392 | # '--ipv6' 393 | ] 394 | ), 395 | ), 396 | MaybeSendDoneToTracker( 397 | tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID), 398 | stats=ItemValue('stats') 399 | ) 400 | ) 401 | -------------------------------------------------------------------------------- /telegram.lua: -------------------------------------------------------------------------------- 1 | local urlparse = require("socket.url") 2 | local http = require("socket.http") 3 | local cjson = require("cjson") 4 | local utf8 = require("utf8") 5 | 6 | local item_dir = os.getenv("item_dir") 7 | local warc_file_base = os.getenv("warc_file_base") 8 | local item_type = nil 9 | local item_name = nil 10 | local item_value = nil 11 | local item_channel = nil 12 | local item_post = nil 13 | local item_comment = nil 14 | local item_start_time = nil 15 | 16 | local selftext = nil 17 | 18 | if urlparse == nil or http == nil then 19 | io.stdout:write("socket not correctly installed.\n") 20 | io.stdout:flush() 21 | abortgrab = true 22 | end 23 | 24 | local url_count = 0 25 | local tries = 0 26 | local downloaded = {} 27 | local addedtolist = {} 28 | local abortgrab = false 29 | local killgrab = false 30 | local queue_resources = true 31 | 32 | local discovered_outlinks = {} 33 | local discovered_items = {} 34 | local discovered_channels = {} 35 | local discovered_group_items = {} 36 | local discovered_manycomments = {} 37 | local bad_items = {} 38 | local ids = {} 39 | local covered_posts = {} 40 | local to_queue = {} 41 | local allowed_resources = {} 42 | local is_sub_post = false 43 | local is_group_post = false 44 | local is_media_not_supported = false 45 | local is_only_in_app = false 46 | local api_url = nil 47 | local api_peer = nil 48 | local api_top_msg_id = nil 49 | local api_discussion_hash = nil 50 | local comments_max_pages = 100 51 | local comments_page_count = 0 52 | 53 | local disco_finished = false 54 | local disco_on = false 55 | local disco_post_id = 0 56 | local disco_first_id = nil 57 | local disco_scan_size = 1 58 | local disco_explored_top = false 59 | local disco_count = 
0 60 | local disco_checked = {} 61 | local disco_current_url = nil 62 | 63 | local retry_url = false 64 | 65 | local current_js = { 66 | ["widget-frame.js"] = "63", 67 | ["tgwallpaper.min.js"] = "3", 68 | ["tgsticker.js"] = "31", 69 | ["telegram-web.js"] = "14", 70 | ["telegram-widget.js"] = "22", 71 | ["discussion-widget.js"] = "10" 72 | } 73 | 74 | math.randomseed(os.time()) 75 | 76 | for ignore in io.open("ignore-list", "r"):lines() do 77 | downloaded[ignore] = true 78 | end 79 | 80 | abort_item = function(item) 81 | abortgrab = true 82 | if not item then 83 | item = item_name 84 | end 85 | if not bad_items[item] then 86 | io.stdout:write("Aborting item " .. item .. ".\n") 87 | io.stdout:flush() 88 | bad_items[item] = true 89 | end 90 | end 91 | 92 | kill_grab = function(item) 93 | io.stdout:write("Aborting crawling.\n") 94 | killgrab = true 95 | end 96 | 97 | read_file = function(file) 98 | if file then 99 | local f = assert(io.open(file)) 100 | local data = f:read("*all") 101 | f:close() 102 | return data 103 | else 104 | return "" 105 | end 106 | end 107 | 108 | processed = function(url) 109 | for _, v in pairs(discovered_outlinks) do 110 | if v[url] then 111 | return true 112 | end 113 | end 114 | if downloaded[url] or addedtolist[url] 115 | or (discovered_outlinks[""] and discovered_outlinks[""][url]) then 116 | return true 117 | end 118 | return false 119 | end 120 | 121 | encode_params = function(d) 122 | local result = "" 123 | for k, v in pairs(d) do 124 | if result ~= "" then 125 | result = result .. "&" 126 | end 127 | result = result .. k .. "=" .. urlparse.escape(v) 128 | end 129 | return result 130 | end 131 | 132 | discover_item = function(target, item) 133 | if not item then 134 | return nil 135 | end 136 | local shard = "" 137 | if string.match(item, "^https?://[^/]*telegram%.org/dl%?") 138 | or string.match(item, "^https?://[^/]*cdn%-telegram%.org/") 139 | or string.match(item, "^https?://[^/]*telesco%.pe/") then 140 | shard = "telegram" 141 | end 142 | if not target[shard] then 143 | target[shard] = {} 144 | end 145 | if not target[shard][item] then 146 | target[shard][item] = true 147 | end 148 | end 149 | 150 | find_item = function(url) 151 | if disco_current_url 152 | and string.match(url, "^([^#]+)") == string.match(disco_current_url, "^([^#]+)") then 153 | return nil 154 | end 155 | local value = nil 156 | local type_ = nil 157 | --[[if not string.match(url, "^https?://t%.me/") 158 | and not string.match(url, "^https?://www%.t%.me/") 159 | and not string.match(url, "^https?://telegram%.me/") 160 | and not string.match(url, "^https?://www%.telegram%.me/") then 161 | value = url 162 | type_ = 'url' 163 | end]] 164 | if not value then 165 | value = string.match(url, "^https?://t%.me/s/([^/%?&]+)$") 166 | type_ = 'channel' 167 | end 168 | if not value then 169 | value = string.match(url, "^https?://t%.me/([^/]+/[^/]+)%?embed=1$") 170 | type_ = 'post' 171 | end 172 | if not value then 173 | value = string.match(url, "^https?://t%.me/([^/]+/[0-9]+%?comment=[0-9]+)$") 174 | type_ = 'comment' 175 | end 176 | if value and not covered_posts[string.lower(value)] then 177 | item_type = type_ 178 | ids = {} 179 | disco_finished = false 180 | disco_on = false 181 | disco_post_id = 0 182 | disco_first_id = nil 183 | disco_scan_size = 1 184 | disco_explored_top = false 185 | disco_count = 0 186 | disco_checked = {} 187 | disco_current_url = nil 188 | if --[[type_ == "url" or]] type_ == "channel" then 189 | item_value = value 190 | if type_ == "channel" then 191 | item_channel = 
value 192 | ids[value] = true 193 | end 194 | elseif type_ == "post" then 195 | item_value = string.gsub(value, "/", ":") 196 | item_channel, item_post = string.match(value, "^([^/]+)/(.+)$") 197 | ids[item_post] = true 198 | elseif type_ == "comment" then 199 | item_channel, item_post, item_comment = string.match(value, "^([^/]+)/([0-9]+)%?comment=([0-9]+)$") 200 | item_value = item_channel .. ":" .. item_post .. ":" .. item_comment 201 | ids[item_comment] = true 202 | end 203 | item_name_new = item_type .. ":" .. item_value 204 | if item_name_new ~= item_name then 205 | abortgrab = false 206 | queue_resources = true 207 | item_start_time = os.time(os.date("!*t")) 208 | retry_url = false 209 | api_url = nil 210 | api_peer = nil 211 | api_top_msg_id = nil 212 | api_discussion_hash = nil 213 | comments_max_pages = 100 214 | comments_page_count = 0 215 | is_group_post = false 216 | is_media_not_supported = false 217 | is_only_in_app = false 218 | tries = 0 219 | item_name = item_name_new 220 | print("Archiving item " .. item_name) 221 | end 222 | end 223 | end 224 | 225 | allowed = function(url, parenturl) 226 | if url == api_url and not parenturl then 227 | return true 228 | end 229 | 230 | if item_type ~= "comment" then 231 | local a, b, c = string.match(url, "^https?://[^/]+/([^/]+)/([0-9]+)%?comment=([0-9]+)$") 232 | if a then 233 | discover_item(discovered_items, "comment:" .. a .. ":" .. b .. ":" .. c) 234 | return false 235 | end 236 | end 237 | 238 | if string.match(url, "%?q=") 239 | or string.match(url, "%?before=") 240 | or string.match(url, "%?after=") 241 | or string.match(url, "^https?://[^/]+/addstickers/") then 242 | return false 243 | end 244 | 245 | for _, pattern in pairs({ 246 | "^https?://[^/]+%.me/([^/%?&#]+)", 247 | "^https?://[^/]+%.me/s/([^/%?&#]+)", 248 | "^https?://([^%./]+)%.t%.me/" 249 | }) do 250 | local new_channel = string.match(url, pattern) 251 | if new_channel 252 | and new_channel ~= "s" 253 | and new_channel ~= "api" 254 | and new_channel ~= "css" then 255 | discover_item(discovered_channels, "channel:" .. new_channel) 256 | end 257 | end 258 | 259 | if string.match(url, "^https?://[^/]*telesco%.pe/") 260 | or string.match(url, "^https?://[^/]*cdn%-telegram%.org/") then 261 | if item_type == "url" then 262 | return true 263 | end 264 | if allowed_resources[url] then 265 | return true 266 | end 267 | if not queue_resources then 268 | return false 269 | end 270 | allowed_resources[url] = true 271 | return allowed(url, parenturl) 272 | end 273 | 274 | if not string.match(url, "^https?://t%.me/") 275 | and not string.match(url, "^https?://[^%.]+%.t%.me/") 276 | and not string.match(url, "^https?://telegram%.me/") 277 | and not string.match(url, "^https?://www%.telegram%.me/") then 278 | local temp = "" 279 | for c in string.gmatch(url, "(.)") do 280 | local b = string.byte(c) 281 | if b < 32 or b > 126 then 282 | c = string.format("%%%02X", b) 283 | end 284 | temp = temp .. 
c 285 | end 286 | discover_item(discovered_outlinks, string.match(temp, "^([^%s]+)")) 287 | return false 288 | end 289 | 290 | if not string.match(url, "^https?://t%.me/") 291 | and not string.match(url, "^https?://[^%.]+%.t%.me/") 292 | and not string.match(url, "^https?://[^/]*telegram%.me/") then 293 | return false 294 | end 295 | 296 | if item_type == "post" 297 | or item_type == "comment" then 298 | local has_post_id = false 299 | for s in string.gmatch(url, "([0-9a-zA-Z_]+)") do 300 | if ids[s] then 301 | has_post_id = true 302 | end 303 | end 304 | if has_post_id then 305 | for r in string.gmatch(url, "([^/%?&]+)") do 306 | if item_channel == r then 307 | return true 308 | end 309 | end 310 | end 311 | end 312 | 313 | if item_type == "channel" then 314 | if string.match(url, "^https?://[^/]+/[^/]+/[0-9]+") 315 | or string.match(url, "^https?://[^/]+/s/[^/]+/[0-9]+") 316 | or string.match(url, "%?before=") 317 | or string.match(url, "%?after=") 318 | or string.match(url, "%?q=") then 319 | return false 320 | end 321 | for _, pattern in pairs({ 322 | "([^/%?&]+)", 323 | "([^/%?&%.]+)" 324 | }) do 325 | for s in string.gmatch(url, pattern) do 326 | if ids[s] then 327 | return true 328 | end 329 | end 330 | end 331 | end 332 | 333 | return false 334 | end 335 | 336 | wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason) 337 | local url = urlpos["url"]["url"] 338 | local html = urlpos["link_expect_html"] 339 | 340 | --[[if item_type == "url" then 341 | return false 342 | end 343 | 344 | if not processed(url) and allowed(url, parent["url"]) 345 | and string.match(url, "^https?://[^/]+%.me/") then 346 | addedtolist[url] = true 347 | return true 348 | end]] 349 | 350 | return false 351 | end 352 | 353 | wget.callbacks.get_urls = function(file, url, is_css, iri) 354 | local urls = {} 355 | local html = nil 356 | 357 | downloaded[url] = true 358 | 359 | if abortgrab then 360 | return {} 361 | end 362 | 363 | if item_type == "url" then 364 | return urls 365 | end 366 | 367 | local function decode_codepoint(newurl) 368 | newurl = string.gsub( 369 | newurl, "\\[uU]([0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])", 370 | function (s) 371 | return utf8.char(tonumber(s, 16)) 372 | end 373 | ) 374 | return newurl 375 | end 376 | 377 | local function check(newurl) 378 | newurl = decode_codepoint(newurl) 379 | local origurl = url 380 | local url = string.match(newurl, "^([^#]+)") 381 | local url_ = string.match(url, "^(.-)[%.\\]*$") 382 | while string.find(url_, "&amp;") do 383 | url_ = string.gsub(url_, "&amp;", "&") 384 | end 385 | if not processed(url_) 386 | and string.match(url_, "^https?://[^/%.]+%..+") 387 | and allowed(url_, origurl) then 388 | if string.match(url_, "%?before=") or string.match(url_, "%?after=") then 389 | table.insert(urls, { 390 | url=url_, 391 | headers={ 392 | ["X-Requested-With"]="XMLHttpRequest", 393 | ["Accept"]="application/json, text/javascript, */*; q=0.01" 394 | } 395 | }) 396 | else 397 | table.insert(urls, { url=url_ }) 398 | end 399 | addedtolist[url_] = true 400 | addedtolist[url] = true 401 | end 402 | end 403 | 404 | local function checknewurl(newurl) 405 | newurl = decode_codepoint(newurl) 406 | if string.match(newurl, "['\"><]") then 407 | return nil 408 | end 409 | if string.match(newurl, "^https?:////") then 410 | check(string.gsub(newurl, ":////", "://")) 411 | elseif string.match(newurl, "^https?://") then 412 | check(newurl) 413 | elseif string.match(newurl, "^https?:\\/\\?/") then 414 | 
check(string.gsub(newurl, "\\", "")) 415 | elseif string.match(newurl, "^\\/\\/") then 416 | checknewurl(string.gsub(newurl, "\\", "")) 417 | elseif string.match(newurl, "^//") then 418 | check(urlparse.absolute(url, newurl)) 419 | elseif string.match(newurl, "^\\/") then 420 | checknewurl(string.gsub(newurl, "\\", "")) 421 | elseif string.match(newurl, "^/") then 422 | check(urlparse.absolute(url, newurl)) 423 | elseif string.match(newurl, "^%.%./") then 424 | if string.match(url, "^https?://[^/]+/[^/]+/") then 425 | check(urlparse.absolute(url, newurl)) 426 | else 427 | checknewurl(string.match(newurl, "^%.%.(/.+)$")) 428 | end 429 | elseif string.match(newurl, "^%./") then 430 | check(urlparse.absolute(url, newurl)) 431 | end 432 | end 433 | 434 | local function checknewshorturl(newurl) 435 | newurl = decode_codepoint(newurl) 436 | if string.match(newurl, "^%?") then 437 | check(urlparse.absolute(url, newurl)) 438 | elseif not ( 439 | string.match(newurl, "^https?:\\?/\\?//?/?") 440 | or string.match(newurl, "^[/\\]") 441 | or string.match(newurl, "^%./") 442 | or string.match(newurl, "^[jJ]ava[sS]cript:") 443 | or string.match(newurl, "^[mM]ail[tT]o:") 444 | or string.match(newurl, "^vine:") 445 | or string.match(newurl, "^android%-app:") 446 | or string.match(newurl, "^ios%-app:") 447 | or string.match(newurl, "^data:") 448 | or string.match(newurl, "^irc:") 449 | or string.match(newurl, "^%${") 450 | ) then 451 | check(urlparse.absolute(url, newurl)) 452 | end 453 | end 454 | 455 | local function queue_discussion(data_before) 456 | local encoded_params = encode_params({ 457 | peer=api_peer, 458 | top_msg_id=api_top_msg_id, 459 | discussion_hash=api_discussion_hash, 460 | before_id=data_before, 461 | method="loadComments" 462 | }) 463 | if addedtolist[encoded_params] then 464 | return nil 465 | end 466 | if comments_page_count >= comments_max_pages then 467 | io.stdout:write("Queued the maximum number of " .. tostring(comments_max_pages) .. " comment pages.\n") 468 | io.stdout:flush() 469 | return nil 470 | end 471 | comments_page_count = comments_page_count + 1 472 | io.stdout:write("Requesting discussion data before " .. data_before .. ".\n") 473 | io.stdout:flush() 474 | table.insert(urls, { 475 | url=api_url, 476 | post_data=encoded_params, 477 | headers={ 478 | ["X-Requested-With"]="XMLHttpRequest", 479 | ["Accept"]="application/json, text/javascript, */*; q=0.01" 480 | } 481 | }) 482 | addedtolist[encoded_params] = true 483 | end 484 | 485 | if disco_finished and not disco_on then 486 | io.stdout:write("Discovery process finished.\n") 487 | io.stdout:flush() 488 | disco_finished = false 489 | check("https://" .. item_channel .. ".t.me/") 490 | return urls 491 | end 492 | 493 | if disco_on 494 | and ( 495 | string.match(url, "^https?://t%.me/[^/%?]+$") 496 | or string.match(url, "^https?://t%.me/[^/%?]+/[0-9]+%?embed=1$") 497 | ) then 498 | local candidate_id = nil 499 | if disco_finished then 500 | disco_on = false 501 | if disco_first_id == 0 or not disco_first_id then 502 | disco_current_url = "https://t.me/telegram/3?embed=1#" 503 | table.insert(urls, { url=disco_current_url }) 504 | return urls 505 | end 506 | candidate_id = disco_first_id 507 | else 508 | candidate_id = disco_post_id + math.ceil(-math.log(1-math.random())*8^disco_scan_size) -- exponentially distributed forward jump; mean step size is 8^disco_scan_size 509 | while disco_checked[candidate_id] do 510 | candidate_id = candidate_id + 1 511 | end 512 | end 513 | disco_current_url = "https://t.me/" .. item_channel .. "/" .. tostring(candidate_id) .. 
"?embed=1" 514 | local newurl = disco_current_url .. "#" 515 | if disco_checked[candidate_id] then 516 | newurl = newurl .. "#" 517 | end 518 | disco_checked[candidate_id] = true 519 | table.insert(urls, { url=newurl }) 520 | os.execute("sleep 0.6") 521 | return urls 522 | end 523 | 524 | queue_resources = true 525 | 526 | local domain, path = string.match(url, "^https?://([^/]+)(/.*)$") 527 | if ( 528 | domain == "www.t.me" 529 | or domain == "t.me" 530 | or domain == "www.telegram.me" 531 | or domain == "telegram.me" 532 | ) 533 | and not string.match(url, "%?embed=1") 534 | and item_type ~= "comment" 535 | and not ( 536 | disco_on 537 | and string.match(url, "^https?://t%.me/[^/%?]+/[0-9]+$") 538 | ) then 539 | check("https://t.me" .. path) 540 | check("https://telegram.me" .. path) 541 | elseif string.match(domain, "telesco%.pe") then 542 | check(string.gsub(url, "telesco%.pe", "cdn%-telegram%.org")) 543 | elseif string.match(domain, "cdn%-telegram%.org") then 544 | check(string.gsub(url, "cdn%-telegram%.org", "telesco%.pe")) 545 | end 546 | 547 | for url, _ in pairs(to_queue) do 548 | io.stdout:write("Queuing extra URL " .. url .. ".\n") 549 | io.stdout:flush() 550 | check(url) 551 | end 552 | to_queue = {} 553 | 554 | if allowed(url) and status_code < 300 555 | and string.match(url, "^https?://[^/]+%.me/") then 556 | html = read_file(file) 557 | if string.match(url, "^https?://[^/]+/[^/]+/[0-9]+%?embed=1$") then 558 | --[[local html_new = string.gsub(html, '.-', "") 559 | if html == html_new then 560 | io.stdout:write("No profile image.\n") 561 | io.stdout:flush() 562 | abort_item() 563 | return {} 564 | end 565 | html = html_new]] 566 | if is_sub_post then 567 | io.stdout:write("Found sub post.\n") 568 | io.stdout:flush() 569 | is_sub_post = false 570 | return {} 571 | end 572 | discover_item(discovered_outlinks, string.match(html, ']+class="tgme_widget_message_user_photo[^"]+"[^>]+>%s*%s*')) 573 | local base = string.match(url, "^([^%?]+)") 574 | check(base .. "?embed=1&discussion=1") 575 | --check(base .. "?embed=1&discussion=1&comments_limit=5") 576 | check(base) 577 | check(base .. "?embed=1") 578 | check(base .. "?embed=1&mode=tme") 579 | if string.match(html, "%?single") then 580 | check(base .. "?single") 581 | --check(base .. "?single=1") 582 | check(base .. "?embed=1&single=1") 583 | check(base .. "?embed=1&mode=tme&single=1") 584 | end 585 | check(string.gsub(url, "^(https?://[^/]+/)([^%?]+)%?.*", "%1s/%2")) 586 | --check(string.gsub(url, "^(https?://[^/]+/)([^%?]-)/([0-9]+)%?.*", "%1s/%2?before=%3")) 587 | --check(string.gsub(url, "^(https?://[^/]+/)([^%?]-)/([0-9]+)%?.*", "%1s/%2?after=%3")) 588 | --check(string.gsub(url, "^(https?://[^/]+/)([^%?]-)/([0-9]+)%?.*", "%1share/url?url=%1%2/%3")) 589 | elseif --[[string.match(url, "^https?://[^/]+/[^/]+/[0-9]+") 590 | or]] string.match(url, "^https?://[^/]+/s/[^/]+/[0-9]+") then 591 | queue_resources = false 592 | end 593 | if string.match(url, "%?embed=1&discussion=1$") then 594 | check(url .. "&comments_limit=5") 595 | end 596 | if string.match(url, "^https?://[^/]+/s/[^/%?&]+$") then 597 | check(string.gsub(url, "^(https?://[^/]+/)s/([^/%?&]+)$", "%1%2")) 598 | check("https://" .. item_channel .. 
".t.me/") 599 | local highest_id = -1 600 | local actual_channel = nil 601 | for channel, id in string.gmatch(html, 'data%-post="([^/]+)/([0-9]+)"') do 602 | if string.lower(channel) == string.lower(item_channel) then 603 | actual_channel = channel 604 | id = tonumber(id) 605 | if id > highest_id then 606 | highest_id = id 607 | end 608 | end 609 | end 610 | if actual_channel then 611 | if highest_id > -1 then 612 | for i=0,highest_id do 613 | discover_item(discovered_items, "post:" .. actual_channel .. ":" .. tostring(i)) 614 | end 615 | end 616 | end 617 | local image_url = string.match(html, ']+>%s*')) 640 | end 641 | if item_type == "post" 642 | and ( 643 | string.match(url, "^https?://[^/]+/s/[^/%?&]+%?") 644 | or string.match(url, "^https?://[^/]+/s/[^/]+/[0-9]+") 645 | ) then 646 | queue_resources = false 647 | end 648 | if string.match(url, "[%?&]discussion=1") then 649 | queue_resources = false 650 | local data = cjson.decode(string.match(html, "TWidgetAuth%.init%(({.-})%);")) 651 | local discussion = cjson.decode(string.match(html, "TWidgetDiscussion%.init%(({.-})%);")) 652 | local comments_count = discussion["comments_cnt"] 653 | if not comments_count then 654 | comments_count = 0 655 | end 656 | if comments_count > 200 then 657 | comments_max_pages = 2 -- 50 comments per page 658 | discover_item(discovered_manycomments, item_name) 659 | end 660 | api_url = data['api_url'] 661 | local form_data = string.match(html, "(]+>.-)") 662 | local data_before = string.match(html, '') 663 | if data_before then 664 | api_peer = string.match(form_data, ']+name="peer"%s+value="([^"]+)"%s*/>') 665 | api_top_msg_id = string.match(form_data, ']+name="top_msg_id"%s+value="([^"]+)"%s*/>') 666 | api_discussion_hash = string.match(form_data, ']+name="discussion_hash"%s+value="([^"]+)"%s*/>') 667 | queue_discussion(data_before) 668 | end 669 | end 670 | if url == api_url then 671 | queue_resources = false 672 | local data_before = string.match(html, '') 673 | if data_before then 674 | queue_discussion(data_before) 675 | end 676 | end 677 | if string.match(url, "%?comment=[0-9]+$") then 678 | local data_telegram_post = string.match(html, 'data%-telegram%-post="([^"]+)"') 679 | if not data_telegram_post then 680 | data_telegram_post = string.match(html, 'data%-post="([^"]+)"') 681 | end 682 | if data_telegram_post then 683 | if string.match(data_telegram_post, "^[^/]+/[0-9]+$") then 684 | discover_item(discovered_items, "post:" .. string.gsub(data_telegram_post, "/", ":")) 685 | elseif string.match(data_telegram_post, "^[0-9]+$") then 686 | discover_item(discovered_items, "post:" .. item_channel .. ":" .. 
data_telegram_post) 687 | end 688 | end 689 | end 690 | html = string.gsub(html, "", "") 691 | for newurl in string.gmatch(string.gsub(html, "&quot;", '"'), '([^"]+)') do 692 | checknewurl(newurl) 693 | end 694 | for newurl in string.gmatch(string.gsub(html, "&#039;", "'"), "([^']+)") do 695 | checknewurl(newurl) 696 | end 697 | for newurl in string.gmatch(html, ">%s*([^<%s]+)") do 698 | checknewurl(newurl) 699 | end 700 | for newurl in string.gmatch(html, "[^%-]href='([^']+)'") do 701 | checknewshorturl(newurl) 702 | end 703 | for newurl in string.gmatch(html, '[^%-]href="([^"]+)"') do 704 | checknewshorturl(newurl) 705 | end 706 | for newurl in string.gmatch(html, ":%s*url%(([^%)]+)%)") do 707 | checknewurl(newurl) 708 | end 709 | end 710 | 711 | return urls 712 | end 713 | 714 | wget.callbacks.write_to_warc = function(url, http_stat) 715 | find_item(url["url"]) 716 | 717 | if item_type == "channel" 718 | and string.match(item_value, "%%") then 719 | local new_item = string.match(item_value, "^([^%%]+)") 720 | discover_item(discovered_items, "channel:" .. new_item) 721 | abort_item() 722 | end 723 | 724 | if item_type == "channel" 725 | and os.time(os.date("!*t")) - item_start_time > 3600 then 726 | io.stdout:write("Channel item has been running for more than an hour.\n") 727 | io.stdout:flush() 728 | abort_item() 729 | return false 730 | end 731 | 732 | if abortgrab then 733 | abort_item() 734 | return false 735 | end 736 | 737 | if string.match(url["url"], "^https?://[^/]*telesco%.pe/") 738 | or string.match(url["url"], "^https?://[^/]*cdn%-telegram%.org/") then 739 | if http_stat["statcode"] == 404 then 740 | return true 741 | elseif http_stat["statcode"] ~= 200 then 742 | abort_item() 743 | return false 744 | end 745 | end 746 | 747 | if disco_on 748 | and string.match(url["url"], "^https?://t%.me/[^/%?]+$") then 749 | local html = read_file(http_stat["local_file"]) 750 | if string.match(html, ']+>Send Message') then 751 | disco_on = false 752 | abort_item() 753 | return false 754 | end 755 | end 756 | 757 | if disco_on 758 | and string.match(url["url"], "^https?://t%.me/[^/%?]+/[0-9]+%?embed=1$") then 759 | local html = read_file(http_stat["local_file"]) 760 | local actual_channel = string.match(html, 'data%-post="([^/]+)/([0-9]+)"') 761 | if not actual_channel then 762 | disco_count = disco_count + 1 763 | if disco_count == 20 then 764 | if disco_scan_size >= 7 then 765 | disco_explored_top = true 766 | end 767 | if disco_scan_size == 1 and disco_explored_top then 768 | disco_finished = true 769 | return false 770 | end 771 | if not disco_explored_top then 772 | disco_scan_size = disco_scan_size + 1 773 | else 774 | disco_scan_size = disco_scan_size - 1 775 | end 776 | disco_count = 0 777 | io.stdout:write("Setting scan size to " .. tostring(disco_scan_size) .. ".\n") 778 | io.stdout:flush() 779 | end 780 | else 781 | disco_post_id = tonumber(string.match(url["url"], "([0-9]+)%?embed=1$")) 782 | io.stdout:write("Setting post ID " .. tostring(disco_post_id) .. ".\n") 783 | io.stdout:flush() 784 | for i=0,disco_post_id do 785 | discover_item(discovered_group_items, "post:" .. actual_channel .. ":" .. tostring(i)) 786 | end 787 | if not disco_first_id then 788 | disco_first_id = disco_post_id 789 | end 790 | disco_scan_size = disco_scan_size + 1 791 | disco_count = 0 792 | io.stdout:write("Setting scan size to " .. tostring(disco_scan_size) .. 
".\n") 793 | io.stdout:flush() 794 | end 795 | return false 796 | end 797 | 798 | if http_stat["statcode"] == 302 799 | and ( 800 | ( 801 | (is_group_post or is_only_in_app) 802 | and string.match(url["url"], "^https?://[^/]+/s/[^/]+/[0-9]+$") 803 | ) 804 | or ( 805 | string.match(url["url"], "^https?://[^%./]+%.t%.me/") 806 | and string.lower(http_stat["newloc"]) == "https://t.me/" .. string.lower(item_channel) 807 | ) 808 | ) then 809 | retry_url = false 810 | tries = 0 811 | return true 812 | end 813 | 814 | if http_stat["statcode"] ~= 200 and not string.match(url["url"], "%?single") then 815 | io.stdout:write("Status code not 200\n") 816 | io.stdout:flush() 817 | retry_url = true 818 | return false 819 | end 820 | 821 | if string.match(url["url"], "^https?://[^/]+%.me/") then 822 | local html = read_file(http_stat["local_file"]) 823 | if string.match(url["url"], "%?before=") 824 | or string.match(url["url"], "%?after=") then 825 | html = cjson.decode(html) 826 | end 827 | if url["url"] == api_url then 828 | local data = cjson.decode(html) 829 | if data["comments_cnt"] < 5 or not data["ok"] then 830 | io.stdout:write("Did not receive \"ok\" from API server.\n") 831 | io.stdout:flush() 832 | abort_item() 833 | return false 834 | end 835 | html = string.gsub(data["comments_html"], "\\", "") 836 | end 837 | if string.match(url["url"], "%?embed=1$") then 838 | if string.match(html, '%s*Deleted Account%s*') 840 | or string.match(html, '%s*') then 841 | io.stdout:write("This is a group post.\n") 842 | io.stdout:flush() 843 | is_group_post = true 844 | end 845 | if string.match(html, 'Please open Telegram to view this post') then 846 | io.stdout:write("Post only viewable in app.\n") 847 | io.stdout:flush() 848 | is_only_in_app = true 849 | end 850 | if string.match(html, '') 851 | and string.match(html, '') then 852 | io.stdout:write("This post has unsupported media.\n") 853 | io.stdout:flush() 854 | is_media_not_supported = true 855 | end 856 | end 857 | if string.match(url["url"], "%?embed=1$") and string.match(html, "%?single") then 858 | local found_ids = {} 859 | local current_id = tonumber(item_post) 860 | for channel, id in string.gmatch(html, "([^/]+)/([0-9]+)%?single[^a-zA-Z0-9]") do 861 | if string.lower(channel) == string.lower(item_channel) then 862 | found_ids[tonumber(id)] = true 863 | end 864 | end 865 | while found_ids[current_id] do 866 | current_id = current_id - 1 867 | end 868 | current_id = current_id + 1 869 | local min_id = current_id 870 | if min_id ~= tonumber(item_post) then 871 | is_sub_post = true 872 | return false 873 | end 874 | while found_ids[current_id] do 875 | current_id = current_id + 1 876 | end 877 | current_id = current_id - 1 878 | local max_id = current_id 879 | for id=min_id,max_id do 880 | id = tostring(id) 881 | ids[id] = true 882 | covered_posts[string.lower(item_channel) .. "/" .. id] = true 883 | to_queue["https://t.me/" .. item_channel .. "/" .. id .. "?embed=1"] = true 884 | end 885 | end 886 | for js_name, version in string.gmatch(html, "/([^/]+%.js)%?([0-9]+)") do 887 | if current_js[js_name] ~= version then 888 | io.stdout:write("Script " .. js_name .. " with version " .. version .. " is not known.\n") 889 | io.stdout:flush() 890 | os.execute("sleep 600") 891 | abort_item() 892 | return false 893 | end 894 | end 895 | --[[if string.match(url["url"], "%?embed=1&discussion=1") then 896 | if string.match(html, '"comments_cnt"') 897 | and not string.match(html, '') then 898 | io.stdout:write("Found discussions comments. 
Not currently supported.\n") 899 | io.stdout:flush() 900 | abort_item() 901 | return false 902 | end 903 | return true 904 | end]] 905 | if not string.match(html, "cdn%-telegram%.org") 906 | and not string.match(html, "telesco%.pe") then 907 | io.stdout:write("Could not find CDNs on " .. url["url"] .. ".\n") 908 | io.stdout:flush() 909 | if http_stat["statcode"] == 302 910 | and string.match(url["url"], "%?single") then 911 | io.stdout:write("Valid 302 ?single page.\n") 912 | io.stdout:flush() 913 | elseif not ( 914 | ( 915 | item_type == "post" 916 | or item_type == "comment" 917 | ) 918 | and ( 919 | string.match(html, '') 920 | and string.match(html, '') 921 | ) or ( 922 | is_media_not_supported 923 | and string.match(html, 'View Post') 924 | ) 925 | ) and not ( 926 | item_type == "post" 927 | and ( 928 | string.match(url["url"], "%?embed=1$") 929 | or string.match(url["url"], "%?embed=1&single=1$") 930 | or string.match(url["url"], "%?embed=1&mode=tme$") 931 | or string.match(url["url"], "%?embed=1&mode=tme&single=1$") 932 | ) 933 | and string.match(html, '') 939 | and string.match(html, '') 940 | and string.match(html, '') 941 | ) and not ( 942 | item_type == "channel" 943 | and string.match(url["url"], "^https?://[^/]+/[^/%?]+$") 944 | and ( 945 | string.match(html, 'href="/s/[^"/%?]+">Preview%s+channel<') 946 | or string.match(html, "tgme_page_title") 947 | ) 948 | ) and not ( 949 | string.match(url["url"], "/share/url%?url=") 950 | and string.match(html, '') 951 | and string.match(html, 'Share') 952 | ) and not ( 953 | string.match(url["url"], "%?embed=1&discussion=1") 954 | and ( 955 | not string.match(html, '') 956 | or string.match(html, 'Discussion%s+is%s+not%s+available%s+at the%s+moment%.') 957 | or string.match(html, 'Please%s+open%s+Telegram%s+to%s+view%s+this%s+discussion%s+from') 958 | or string.match(html, 'This%s+group%s+can’t%s+be%s+displayed%s+because%s+it%s+violated%s+local%s+laws%.') 959 | or string.match(html, '%s*Comments%s+on%s+this%s+post%s*') 960 | or string.match(html, 'Array%s+') 961 | ) 962 | ) and not ( 963 | url["url"] == api_url 964 | and string.match(html, '') 965 | and ( 966 | string.match(html, '') 967 | or string.match(html, '') 968 | ) 969 | and string.match(html, '') 970 | ) then 971 | retry_url = true 972 | return false 973 | else 974 | io.stdout:write("Still valid page.\n") 975 | io.stdout:flush() 976 | end 977 | end 978 | if not string.match(url["url"], "[%?&]discussion=1") 979 | and url["url"] ~= api_url then 980 | if string.match(url["url"], "[%?&]embed=1") then 981 | if string.match(html, "tgme_widget_message_error") 982 | or not string.match(html, "tgme_widget_message_author") then 983 | io.stdout:write("Post does not exist.\n") 984 | io.stdout:flush() 985 | retry_url = true 986 | return false 987 | end 988 | elseif http_stat["statcode"] == 200 then 989 | local image_domain = string.match(html, '= 300 and status_code <= 399 then 1030 | local newloc = urlparse.absolute(url["url"], http_stat["newloc"]) 1031 | if status_code == 302 and string.match(newloc, "^https?://telegram%.org/") then 1032 | abort_item() 1033 | end 1034 | if (processed(newloc) or not allowed(newloc, url["url"])) 1035 | and not retry_url then 1036 | tries = 0 1037 | return wget.actions.EXIT 1038 | end 1039 | end 1040 | 1041 | if status_code == 200 then 1042 | downloaded[url["url"]] = true 1043 | downloaded[string.gsub(url["url"], "https?://", "http://")] = true 1044 | end 1045 | 1046 | if disco_finished and retry_url then 1047 | abort_item() 1048 | end 1049 | 1050 | if 
abortgrab then 1051 | if disco_finished then 1052 | disco_finished = false 1053 | end 1054 | abort_item() 1055 | return wget.actions.EXIT 1056 | end 1057 | 1058 | if retry_url or status_code == 0 then 1059 | io.stdout:write("Server returned bad response. Sleeping.\n") 1060 | io.stdout:flush() 1061 | local maxtries = 11 1062 | if (item_type == "post" and string.match(url["url"], "%?embed=1$")) 1063 | or (item_type == "comment" and string.match(url["url"], "%?comment=[0-9]+$")) 1064 | or (item_type == "channel" and string.match(url["url"], "^https?://t%.me/([^/%?&]+)$")) then 1065 | io.stdout:write("Bad response on first URL.\n") 1066 | io.stdout:flush() 1067 | maxtries = 0 1068 | elseif item_type == "channel" 1069 | and string.match(url["url"], "^https?://t%.me/s/([^/%?&]+)$") then 1070 | disco_on = true 1071 | io.stdout:write("Maybe a channel with no public index.\n") 1072 | io.stdout:flush() 1073 | return wget.actions.NOTHING 1074 | end 1075 | tries = tries + 1 1076 | if tries > maxtries then 1077 | tries = 0 1078 | abort_item() 1079 | return wget.actions.EXIT 1080 | end 1081 | os.execute("sleep " .. math.random( 1082 | math.floor(math.pow(2, tries-0.5)), 1083 | math.floor(math.pow(2, tries)) 1084 | )) 1085 | return wget.actions.CONTINUE 1086 | end 1087 | 1088 | tries = 0 1089 | 1090 | return wget.actions.NOTHING 1091 | end 1092 | 1093 | wget.callbacks.finish = function(start_time, end_time, wall_time, numurls, total_downloaded_bytes, total_download_time) 1094 | local function submit_backfeed(items, key, shard) 1095 | local tries = 0 1096 | local maxtries = 10 1097 | local parameters = "" 1098 | if shard ~= "" then 1099 | parameters = "?shard=" .. shard 1100 | end 1101 | while tries < maxtries do 1102 | if killgrab then 1103 | return false 1104 | end 1105 | local body, code, headers, status = http.request( 1106 | "https://legacy-api.arpa.li/backfeed/legacy/" .. key .. parameters, 1107 | items .. "\0" 1108 | ) 1109 | if code == 200 and body ~= nil and cjson.decode(body)["status_code"] == 200 then 1110 | io.stdout:write(string.match(body, "^(.-)%s*$") .. "\n") 1111 | io.stdout:flush() 1112 | break 1113 | end 1114 | io.stdout:write("Failed to submit discovered URLs. " .. tostring(code) .. " " .. tostring(body) .. "\n") 1115 | io.stdout:flush() 1116 | os.execute("sleep " .. math.floor(math.pow(2, tries))) 1117 | tries = tries + 1 1118 | end 1119 | if tries == maxtries then 1120 | kill_grab() 1121 | end 1122 | end 1123 | 1124 | local file = io.open(item_dir .. "/" .. warc_file_base .. "_bad-items.txt", "w") 1125 | for url, _ in pairs(bad_items) do 1126 | file:write(url .. 
"\n") 1127 | end 1128 | file:close() 1129 | for key, data in pairs({ 1130 | ["telegram-x2kj4uadm0lrniv"] = discovered_items, 1131 | ["telegram-groups-temp-sqk1lhix8mnnk4p"] = discovered_group_items, 1132 | --["telegram-iy46ve7bql0k79p"] = discovered_channels, 1133 | ["telegram-channels-aqpadsraxi2b78y"] = discovered_channels, 1134 | ["telegram-manycomments-q65xztozh8liqi1g"] = discovered_manycomments, 1135 | ["urls-h051713fi1agegy"] = discovered_outlinks 1136 | }) do 1137 | for shard, urls_data in pairs(data) do 1138 | print('queuing for', string.match(key, "^(.+)%-"), "on shard", shard) 1139 | local items = nil 1140 | local count = 0 1141 | local progress_count = 0 1142 | local all_counted = 0 1143 | for _ in pairs(urls_data) do 1144 | all_counted = all_counted + 1 1145 | end 1146 | print("queuing", all_counted, " items") 1147 | for item, _ in pairs(urls_data) do 1148 | --print("found item", item) 1149 | if items == nil then 1150 | items = item 1151 | else 1152 | items = items .. "\0" .. item 1153 | end 1154 | count = count + 1 1155 | progress_count = progress_count + 1 1156 | if count == 400 then 1157 | io.stdout:write(tostring(progress_count) .. " of " .. tostring(all_counted) .. " ") 1158 | submit_backfeed(items, key, shard) 1159 | items = nil 1160 | count = 0 1161 | end 1162 | end 1163 | if items ~= nil then 1164 | submit_backfeed(items, key, shard) 1165 | end 1166 | end 1167 | end 1168 | end 1169 | 1170 | wget.callbacks.before_exit = function(exit_status, exit_status_string) 1171 | if killgrab then 1172 | return wget.exits.IO_FAIL 1173 | end 1174 | if abortgrab then 1175 | abort_item() 1176 | end 1177 | return exit_status 1178 | end 1179 | 1180 | --------------------------------------------------------------------------------