├── README.md ├── combined_lists.py ├── generate_daily_list.py ├── generate_domain_parts.py ├── global_config.py ├── job_handler.py ├── job_server.py ├── notify_email.py ├── requirements.txt └── shared.py /README.md: -------------------------------------------------------------------------------- 1 | # Tranco: A Research-Oriented Top Sites Ranking Hardened Against Manipulation 2 | 3 | *By Victor Le Pochat, Tom Van Goethem, Samaneh Tajalizadehkhoob, Maciej Korczyński and Wouter Joosen* 4 | 5 | This repository contains the source code driving the generation of the Tranco ranking provided at [https://tranco-list.eu/](https://tranco-list.eu/). This new top websites ranking was proposed in our paper [Tranco: A Research-Oriented Top Sites Ranking Hardened Against Manipulation](https://tranco-list.eu/assets/tranco-ndss19.pdf). 6 | 7 | * `combined_lists.py` contains the core code for generating new lists based on a configuration passed to `combined_lists.generate_combined_list`; see the illustrative usage sketch at the end of this listing. 8 | * `shared.py` and `global_config.py` contain several configuration variables; `shared.DEFAULT_TRANCO_CONFIG` gives the configuration of the default (daily updated) Tranco list. 9 | * `generate_daily_list.py` runs daily to generate the default Tranco list. 10 | * `job_handler.py` contains the code either for submitting jobs to a local `rq` queue for processing, or for relaying list generation requests to a remote host. 11 | * `job_server.py` accepts requests for list generation on a remote host. 12 | * `notify_email.py` contains code to notify users when their list has been generated. 13 | * `generate_domain_parts.py` preprocesses rankings to extract the different components of domains. 14 | -------------------------------------------------------------------------------- /combined_lists.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | import csv 3 | import datetime 4 | import glob 5 | import shutil 6 | import time 7 | import traceback 8 | import zipfile 9 | from itertools import islice 10 | import os 11 | import tempfile 12 | 13 | # Imports of configuration variables 14 | from global_config import * 15 | 16 | # Constants 17 | GLOBAL_MAX_RANK = 1000000 18 | LIST_FILENAME_FORMAT = "{}.csv" 19 | from shared import ZIP_FILENAME_FORMAT 20 | 21 | # When using AWS services, set up retrieval and storage of lists for S3 22 | if USE_S3: 23 | import boto3 24 | s3_resource = boto3.resource('s3', region_name="us-east-1") 25 | toplists_archive_bucket = s3_resource.Bucket(name=TOPLISTS_ARCHIVE_S3_BUCKET) 26 | from smart_open import smart_open 27 | 28 | # List ID generation 29 | from hashids import Hashids 30 | hsh = Hashids(salt="tsr", min_length=4, alphabet="BCDFGHJKLMNPQRSTVWXYZ23456789") 31 | 32 | # Mongo connection for storing configuration of generated lists 33 | from pymongo import MongoClient 34 | client = MongoClient(MONGO_URL) 35 | db = client["tranco"] 36 | 37 | def count_dict(dct, entry, value=1): 38 | """ Helper function for updating dictionaries """ 39 | if entry not in dct: 40 | dct[entry] = 0 41 | dct[entry] += value 42 | 43 | def date_list(start_date, end_date): 44 | """ Generate list of dates between start and end date """ 45 | start_date_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d") 46 | end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d") 47 | return [(start_date_dt + datetime.timedelta(days=x)) for x in range((end_date_dt - start_date_dt).days + 1)] 48 | 49 | def _db_id_to_list_id(db_id): 50 | """ List number to hash """ 51 | if db_id: 52 |
return hsh.encode(db_id) 53 | else: 54 | return None 55 | 56 | def _list_id_to_db_id(list_id): 57 | """ Hash to list number """ 58 | try: 59 | return hsh.decode(list_id)[0] 60 | except: 61 | return None 62 | 63 | def config_to_list_id(config, insert=True, skip_failed=False): 64 | """ List configuration to list hash (either insert new configuration into database, or retrieve ID for existing list with that configuration) 65 | :param config: list configuration 66 | :param insert: whether to create a new list ID if the given configuration does not exist yet 67 | :param skip_failed: skip failed lists 68 | :return: 69 | """ 70 | 71 | if skip_failed: 72 | query = {**config, "failed": {"$ne": True}} 73 | else: 74 | query = config 75 | out = db["lists"].find_one(query) 76 | if out: 77 | db_id = int(out["_id"]) 78 | else: 79 | if insert: 80 | db_id = get_next_db_key() 81 | insert_config_in_db(config, db_id) 82 | else: 83 | return None 84 | return _db_id_to_list_id(db_id) 85 | 86 | def list_id_to_config(list_id): 87 | """ Retrieve configuration of existing list based on hash """ 88 | db_id = _list_id_to_db_id(list_id) 89 | if db_id: 90 | return {**db["lists"].find_one({"_id": int(db_id)}), "list_id": list_id} 91 | 92 | def list_available(list_id): 93 | """ Check if list is available for download """ 94 | db_id = _list_id_to_db_id(list_id) 95 | if not db_id: 96 | return False 97 | doc = db["lists"].find_one({"_id": int(db_id)}) 98 | return doc is not None and doc.get("finished", False) and not doc.get("failed", True) 99 | 100 | def get_next_db_key(): 101 | """ Get next key from list configuration database (for a new list) """ 102 | counter_increase = db["counter"].find_one_and_update({"_id": "lists"}, {'$inc': {'count': 1}}) 103 | return int(counter_increase["count"]) 104 | 105 | def insert_config_in_db(config, db_id): 106 | """ Insert a new configuration into the database, with the given key """ 107 | db["lists"].insert_one({**config, "_id": db_id, "finished": False, 108 | "creationDate": datetime.datetime.now().strftime("%Y-%m-%d"), 109 | "creationTime": datetime.datetime.now().isoformat()}) 110 | 111 | def get_generated_list_fp(list_id): 112 | """ Get file location of existing list (file-based archive) """ 113 | return os.path.join(NETAPP_STORAGE_PATH, "generated_lists/{}".format(LIST_FILENAME_FORMAT.format(list_id))) 114 | 115 | def get_generated_zip_fp(list_id): 116 | """ Get file location of existing zip (file-based archive) """ 117 | return os.path.join(NETAPP_STORAGE_PATH, "generated_lists_zip/{}".format(ZIP_FILENAME_FORMAT.format(list_id))) 118 | 119 | def get_generated_list_s3(list_id): 120 | """ Get file location of existing list (AWS S3) """ 121 | return "s3://{}/{}".format(TOPLISTS_GENERATED_LIST_S3_BUCKET, LIST_FILENAME_FORMAT.format(list_id)) 122 | 123 | def get_generated_zip_s3(list_id): 124 | """ Get file location of existing zip (AWS S3) """ 125 | return "s3://{}/{}".format(TOPLISTS_DAILY_LIST_S3_BUCKET, ZIP_FILENAME_FORMAT.format(list_id)) 126 | 127 | def get_list_fp_for_day(provider, date, parts=False): 128 | """ Get file location for source list (of one of the providers) """ 129 | date = date.strftime("%Y%m%d") 130 | if parts: 131 | fp = next(glob.iglob(os.path.join(NETAPP_STORAGE_PATH, "archive/{}/parts/{}_{}_parts.csv".format(provider, provider, date)))) 132 | else: 133 | fp = next(glob.iglob(os.path.join(NETAPP_STORAGE_PATH, "archive/{}/{}_{}.csv".format(provider, provider, date)))) 134 | return fp 135 | 136 | def get_s3_key_for_day(provider, date, parts=False): 137 | """ Get 
S3 key for source list (of one of the providers) """ 138 | date = date.strftime("%Y%m%d") 139 | if parts: 140 | fp = "{}/parts/{}_{}_parts.csv".format(provider, provider, date) 141 | else: 142 | fp = "{}/{}_{}.csv".format(provider, provider, date) 143 | return fp 144 | 145 | def get_s3_url_for_day(provider, date, parts=False): 146 | """ Get S3 url for source list (of one of the providers) """ 147 | key = get_s3_key_for_day(provider, date, parts) 148 | return "s3://{}/{}".format(TOPLISTS_ARCHIVE_S3_BUCKET, key) 149 | 150 | def get_s3_url_for_fp(fp): 151 | """ Get S3 url for source list (of one of the providers) """ 152 | return "s3://{}/{}".format(TOPLISTS_ARCHIVE_S3_BUCKET, fp) 153 | 154 | def generate_prefix_items_file(fp, list_prefix): 155 | """ Create list of source list items (up to requested list length) """ 156 | with open(fp, encoding='utf8') as f: 157 | if list_prefix: 158 | return [r.split(",") for r in islice(f.read().splitlines(), list_prefix)] 159 | else: 160 | return [r.split(",") for r in f.read().splitlines()] 161 | 162 | def generate_prefix_items_s3(fp, list_prefix): 163 | """ Create list of source list items (up to requested list length) """ 164 | with smart_open(get_s3_url_for_fp(fp)) as f: 165 | if list_prefix: 166 | result = [r.decode("utf-8").split(",") for r in islice(f.read().splitlines(), list_prefix)] 167 | else: 168 | result = [r.decode("utf-8").split(",") for r in f.read().splitlines()] 169 | return result 170 | 171 | def rescale_rank(rank, max_rank_of_input, min_rank_of_output, max_rank_of_output): 172 | """ 173 | Rescale a given rank to the min/max range provided 174 | This makes sure that shorter lists are not given a higher importance. 175 | """ 176 | return min_rank_of_output + (rank - 1)*((max_rank_of_output-min_rank_of_output)/(max_rank_of_input - 1)) 177 | 178 | def borda_count_fp(fps, list_prefix): 179 | """ Generate aggregate scores for domains based on Borda count """ 180 | borda_scores = {} 181 | for fp in fps: 182 | if USE_S3: 183 | items = generate_prefix_items_s3(fp, list_prefix) 184 | else: 185 | items = generate_prefix_items_file(fp, list_prefix) 186 | max_rank_of_input = len(items) 187 | max_rank_of_output = min(GLOBAL_MAX_RANK, list_prefix if list_prefix else GLOBAL_MAX_RANK) 188 | for rank, elem in items: 189 | count_dict(borda_scores, elem, max_rank_of_output + 1 - rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists (i.e. Quantcast) 190 | return borda_scores 191 | 192 | def dowdall_count_fp(fps, list_prefix): 193 | """ Generate aggregate scores for domains based on Dowdall count """ 194 | dowdall_scores = {} 195 | for fp in fps: 196 | if USE_S3: 197 | items = generate_prefix_items_s3(fp, list_prefix) 198 | else: 199 | items = generate_prefix_items_file(fp, list_prefix) 200 | max_rank_of_input = len(items) 201 | max_rank_of_output = min(GLOBAL_MAX_RANK, list_prefix if list_prefix else GLOBAL_MAX_RANK) 202 | for rank, elem in items: 203 | count_dict(dowdall_scores, elem, 1 / rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists (i.e. 
Quantcast) 204 | return dowdall_scores 205 | 206 | def filtered_parts_list_file(fp, list_prefix, f_pld=None, f_tlds=None, f_organization=None, f_subdomains=None, maintain_rank=True): 207 | """ Get list of domains that conform to the set filters """ 208 | with open(fp) as f: 209 | if list_prefix: 210 | parts_input = islice(f, list_prefix) 211 | else: 212 | parts_input = f 213 | output = [] 214 | organizations_seen = set() 215 | new_rank = 1 216 | max_rank = 0 217 | for line in parts_input: 218 | max_rank += 1 219 | rank, fqdn, pld, sld, subd, ps, tld, is_pld = line.rstrip().split(",") 220 | if f_tlds and (tld not in f_tlds): 221 | continue 222 | if f_subdomains and (subd not in f_subdomains): 223 | continue 224 | if f_organization: 225 | if sld in organizations_seen: 226 | continue 227 | else: 228 | organizations_seen.add(sld) 229 | if f_pld: 230 | if is_pld != "True": 231 | continue 232 | if maintain_rank: 233 | output.append((rank, fqdn)) 234 | else: 235 | output.append((new_rank, fqdn)) 236 | new_rank += 1 237 | return (output, max_rank) 238 | 239 | def filtered_parts_list_s3(fp, list_prefix, f_pld=None, f_tlds=None, f_organization=None, f_subdomains=None, maintain_rank=True): 240 | """ Get list of domains that conform to the set filters """ 241 | with smart_open(get_s3_url_for_fp(fp)) as f: 242 | if list_prefix: 243 | parts_input = islice(f, list_prefix) 244 | else: 245 | parts_input = f 246 | output = [] 247 | organizations_seen = set() 248 | new_rank = 1 249 | max_rank = 0 250 | for line in parts_input: 251 | max_rank += 1 252 | rank, fqdn, pld, sld, subd, ps, tld, is_pld = line.decode("utf-8").rstrip().split(",") 253 | if f_tlds and (tld not in f_tlds): 254 | continue 255 | if f_subdomains and (subd not in f_subdomains): 256 | continue 257 | if f_organization: 258 | if sld in organizations_seen: 259 | continue 260 | else: 261 | organizations_seen.add(sld) 262 | if f_pld: 263 | if is_pld != "True": 264 | continue 265 | if maintain_rank: 266 | output.append((rank, fqdn)) 267 | else: 268 | output.append((new_rank, fqdn)) 269 | new_rank += 1 270 | return (output, max_rank) 271 | 272 | def get_filtered_parts_lists(fps, input_prefix, config, maintain_rank=True): 273 | """ Get domains in given source lists that conform to the filters in the configuration """ 274 | for fp in fps: 275 | if USE_S3: 276 | yield filtered_parts_list_s3(fp, input_prefix, 277 | config.get("filterPLD", None) == "on", 278 | config.get('filterTLDValue').split(",") if config.get("filterTLDValue", 279 | None) else None, 280 | config.get("filterOrganization", None) == "on", 281 | config.get('filterSubdomainValue').split(",") if config.get( 282 | "filterSubdomainValue", None) else None, 283 | maintain_rank=maintain_rank 284 | ) 285 | else: 286 | yield filtered_parts_list_file(fp, input_prefix, 287 | config.get("filterPLD", None) == "on", 288 | config.get('filterTLDValue').split(",") if config.get("filterTLDValue", 289 | None) else None, 290 | config.get("filterOrganization", None) == "on", 291 | config.get('filterSubdomainValue').split(",") if config.get( 292 | "filterSubdomainValue", None) else None, 293 | maintain_rank=maintain_rank 294 | ) 295 | 296 | def borda_count_list(fps, input_prefix, config, maintain_rank=True): 297 | """ Generate aggregate scores for list of filtered domains based on Borda count """ 298 | borda_scores = {} 299 | for (filtered_lst, max_rank) in get_filtered_parts_lists(fps, input_prefix, config): 300 | if maintain_rank: 301 | max_rank_of_input = max_rank 302 | else: 303 | max_rank_of_input = 
len(filtered_lst) 304 | max_rank_of_output = min(GLOBAL_MAX_RANK, input_prefix if input_prefix else GLOBAL_MAX_RANK) 305 | for rank, elem in filtered_lst: 306 | count_dict(borda_scores, elem, max_rank_of_output + 1 - rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists 307 | return borda_scores 308 | 309 | def dowdall_count_list(fps, input_prefix, config, maintain_rank=True): 310 | """ Generate aggregate scores for list of filtered domains based on Dowdall count """ 311 | dowdall_scores = {} 312 | for (filtered_lst, max_rank) in get_filtered_parts_lists(fps, input_prefix, config): 313 | if maintain_rank: 314 | max_rank_of_input = max_rank 315 | else: 316 | max_rank_of_input = len(filtered_lst) 317 | max_rank_of_output = min(GLOBAL_MAX_RANK, input_prefix if input_prefix else GLOBAL_MAX_RANK) 318 | for rank, elem in filtered_lst: 319 | count_dict(dowdall_scores, elem, 1 / rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists 320 | return dowdall_scores 321 | 322 | def sort_counts(scores): 323 | """ Sort domains based on aggregate scores """ 324 | return sorted(scores.keys(), key=lambda elem: (-scores[elem], elem)) 325 | 326 | def filter_list_1(lst, filter_set, list_size=None): 327 | """ Filter list of domains on given set of domains """ 328 | if list_size: 329 | result = [] 330 | for e in lst: 331 | if e in filter_set: 332 | result.append(e) 333 | if len(result) >= list_size: 334 | break 335 | return result 336 | else: 337 | return [e for e in lst if e in filter_set] 338 | 339 | def filter_list_multiple(lst, filter_sets): 340 | """ Filter list of domains on given sets of domains """ 341 | return [e for e in lst if all(e in filter_set for filter_set in filter_sets)] 342 | 343 | def count_presence_in_fps(fps, prefix): 344 | """ Count occurrences of each domain in the given files """ 345 | presence = {} 346 | for fp in fps: 347 | lst = generate_prefix_items_s3(fp, prefix) 348 | for i in lst: 349 | count_dict(presence, i[1], 1) # use the domain (second field of each (rank, domain) item) as the key 350 | return presence 351 | def count_presence_in_sets(sets): 352 | """ Counts of occurrences in given sets """ 353 | presence = {} 354 | for st in sets: 355 | for i in st: 356 | count_dict(presence, i, 1) 357 | return presence 358 | 359 | def items_in_any_list(fps, prefix): 360 | """ Find domains that appear in any of the given lists """ 361 | return set.union(*map(set, [[i[1] for i in generate_prefix_items_s3(fp, prefix)] for fp in fps])) 362 | 363 | def generate_filter_minimum_presence(fps, prefix, minimum): 364 | """ An item should appear in at least `minimum` of the given lists """ 365 | presence = count_presence_in_fps(fps, prefix) 366 | return {k for k, v in presence.items() if v >= minimum} 367 | 368 | def generate_filter_minimum_presence_any(groups_of_fps, prefix, minimum): 369 | """ An item should appear in `minimum` groups, where an item may appear in any list in that group """ 370 | items_per_group = [items_in_any_list(group, prefix) for group in groups_of_fps] 371 | presence = count_presence_in_sets(items_per_group) 372 | return {k for k, v in presence.items() if v >= minimum} 373 | 374 | def truncate_list(lst, list_size=None): 375 | """ Return only prefix of given list """ 376 | return lst[:list_size] if list_size else lst 377 | 378 | def write_sorted_counts(sorted_items, scores, fp): 379 | """ Write domains and aggregate scores to file """ 380 | with open(fp, 'w', encoding='utf8') as f: 381 | csvw = csv.writer(f) 382 | for idx, entry in enumerate(sorted_items): 383 | csvw.writerow([idx + 1, entry,
scores[entry]]) 384 | 385 | def write_list_to_file(lst, list_id): 386 | """ Write ranks and domains to file """ 387 | with open(get_generated_list_fp(list_id), 'w', encoding='utf8') as f: 388 | csvw = csv.writer(f) 389 | for idx, entry in enumerate(lst): 390 | csvw.writerow([idx + 1, entry]) 391 | 392 | 393 | def write_zip_to_file(lst, list_id): 394 | """ Write list of (top 1M) domains to zip file """ 395 | with tempfile.SpooledTemporaryFile(mode='w+b') as z: 396 | with tempfile.NamedTemporaryFile(mode='w+') as t: 397 | csvw = csv.writer(t) 398 | for idx, entry in enumerate(lst): 399 | csvw.writerow([idx + 1, entry]) 400 | 401 | t.seek(0) 402 | 403 | with zipfile.ZipFile(z, 'w') as a: 404 | a.write(t.name, arcname="top-1m.csv") 405 | 406 | z.seek(0) 407 | 408 | with open(get_generated_zip_fp(list_id), 'wb') as f: 409 | f.write(z.read()) 410 | 411 | 412 | def write_list_to_s3(lst, list_id): 413 | """ Write ranks and domains to file """ 414 | with smart_open(get_generated_list_s3(list_id), 'w', encoding='utf8') as f: 415 | csvw = csv.writer(f) 416 | for idx, entry in enumerate(lst): 417 | csvw.writerow([idx + 1, entry]) 418 | 419 | 420 | def write_zip_to_s3(lst, list_id): 421 | """ Write list of (top 1M) domains to zip file """ 422 | with tempfile.SpooledTemporaryFile(mode='w+b') as z: 423 | with tempfile.NamedTemporaryFile(mode='w+') as t: 424 | csvw = csv.writer(t) 425 | for idx, entry in enumerate(lst): 426 | csvw.writerow([idx + 1, entry]) 427 | 428 | t.seek(0) 429 | 430 | with zipfile.ZipFile(z, 'w') as a: 431 | a.write(t.name, arcname="top-1m.csv") 432 | 433 | z.seek(0) 434 | 435 | with smart_open(get_generated_zip_s3(list_id), 'wb') as f: 436 | f.write(z.read()) 437 | 438 | 439 | def copy_daily_list_s3(list_id): 440 | """ Copy the daily list on S3 to the fixed URL """ 441 | zip_key = ZIP_FILENAME_FORMAT.format(list_id) 442 | source = {'Bucket': TOPLISTS_DAILY_LIST_S3_BUCKET, 'Key': zip_key} 443 | target_bucket = s3_resource.Bucket(TOPLISTS_DAILY_LIST_S3_BUCKET) 444 | target_bucket.copy(source, 'top-1m.csv.zip') 445 | 446 | 447 | def copy_daily_list_file(list_id): 448 | """ Copy the daily list on file-based archive to the fixed URL """ 449 | zip_file = get_generated_zip_fp(list_id) 450 | target_file = os.path.join(NETAPP_STORAGE_PATH, "generated_lists_zip/{}".format("top-1m.csv.zip")) 451 | shutil.copy2(zip_file, target_file) 452 | 453 | def generate_combined_list(config, list_id, test=False): 454 | """ Generate combined list by calculating aggregate scores on (potentially filtered) source lists of ranked domains """ 455 | db_id = _list_id_to_db_id(list_id) 456 | try: 457 | ### INPUT ### 458 | 459 | # If a filter on parts is selected, the preprocessed parts files should be used. 
460 | parts_filter = config.get("filterPLD", False) or (config.get("filterTLD", "false") != "false") or config.get("filterOrganization", False) or config.get('filterSubdomain', False) 461 | dates = date_list(config.get("startDate"), config.get("endDate")) 462 | 463 | # Get source files to process 464 | fps = [] 465 | fps_on_date = {date: [] for date in dates} 466 | fps_on_provider = {provider: [] for provider in config['providers']} 467 | for provider in config['providers']: 468 | for date in dates: 469 | if USE_S3: 470 | list_fp = get_s3_key_for_day(provider, date, parts_filter) 471 | else: 472 | list_fp = get_list_fp_for_day(provider, date, parts_filter) 473 | fps.append(list_fp) 474 | fps_on_date[date].append(list_fp) 475 | fps_on_provider[provider].append(list_fp) 476 | 477 | # Get requested list prefix 478 | if "listPrefix" in config and config['listPrefix']: 479 | if config['listPrefix'] == "full": 480 | input_prefix = None 481 | elif config['listPrefix'] == "custom": 482 | input_prefix = int(config['listPrefixCustomValue']) 483 | else: 484 | input_prefix = int(config['listPrefix']) 485 | else: 486 | input_prefix = None 487 | 488 | # Generate (sorted) aggregate counts (on parts files if necessary) 489 | if parts_filter: 490 | if config['combinationMethod'] == 'borda': 491 | scores = borda_count_list(fps, input_prefix, config) 492 | elif config['combinationMethod'] == 'dowdall': 493 | scores = dowdall_count_list(fps, input_prefix, config) 494 | else: 495 | raise Exception("Unknown combination method") 496 | else: 497 | if config['combinationMethod'] == 'borda': 498 | scores = borda_count_fp(fps, input_prefix) 499 | elif config['combinationMethod'] == 'dowdall': 500 | scores = dowdall_count_fp(fps, input_prefix) 501 | else: 502 | raise Exception("Unknown combination method") 503 | sorted_domains = sort_counts(scores) 504 | domains = sorted_domains 505 | 506 | ### FILTERS ### 507 | 508 | filters_to_apply = [] 509 | if "inclusionDays" in config and config["inclusionDays"]: 510 | presence_filter = generate_filter_minimum_presence_any([fps_on_date[date] for date in dates], input_prefix, int(config["inclusionDaysValue"])) 511 | filters_to_apply.append(presence_filter) 512 | if "inclusionLists" in config and config["inclusionLists"]: 513 | presence_filter = generate_filter_minimum_presence_any([fps_on_provider[provider] for provider in config['providers']], input_prefix, int(config["inclusionListsValue"])) 514 | filters_to_apply.append(presence_filter) 515 | domains = filter_list_multiple(domains, filters_to_apply) 516 | 517 | ### OUTPUT ### 518 | 519 | if test: 520 | return domains 521 | else: 522 | # Write list to file 523 | if USE_S3: 524 | write_list_to_s3(domains, list_id) 525 | else: 526 | write_list_to_file(domains, list_id) 527 | 528 | # If the list is the daily default list, also generate a zip of the top 1M and copy to permanent URL 529 | try: 530 | if "isDailyList" in config and config["isDailyList"] is True: 531 | if USE_S3: 532 | write_zip_to_s3(domains[:1000000], list_id) 533 | copy_daily_list_s3(list_id) 534 | else: 535 | write_zip_to_file(domains[:1000000], list_id) 536 | copy_daily_list_file(list_id) 537 | except: 538 | print("Zip creation failed") 539 | traceback.print_exc() 540 | 541 | # Update generation success in database 542 | db["lists"].update_one({"_id": db_id}, {"$set": {"finished": True, "failed": False, "list_id": list_id}}) 543 | 544 | time.sleep(1) 545 | # Report success 546 | return True 547 | except: 548 | traceback.print_exc() 549 | # Update generation 
failure in database 550 | db["lists"].update_one({"_id": db_id}, {"$set": {"finished": True, "failed": True}}) 551 | # Report failure 552 | return False 553 | 554 | -------------------------------------------------------------------------------- /generate_daily_list.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import sys 3 | 4 | from redis import Redis 5 | from rq import Queue 6 | 7 | import combined_lists 8 | from shared import DATE_FORMAT_WITH_HYPHEN, DEFAULT_TRANCO_CONFIG 9 | 10 | 11 | 12 | def get_date_interval_bounds(start_date, end_date, nb_days, nb_days_from): 13 | if start_date: 14 | start_date_dt = datetime.datetime.strptime(start_date, DATE_FORMAT_WITH_HYPHEN) 15 | return ( 16 | start_date, (start_date_dt + datetime.timedelta(days=int(nb_days) - 1)).strftime(DATE_FORMAT_WITH_HYPHEN)) 17 | elif end_date: 18 | end_date_dt = datetime.datetime.strptime(end_date, DATE_FORMAT_WITH_HYPHEN) 19 | return ((end_date_dt - datetime.timedelta(days=int(nb_days) - 1)).strftime(DATE_FORMAT_WITH_HYPHEN), end_date) 20 | 21 | 22 | def generate_todays_lists(day): 23 | print("Generating lists for {}...".format(day)) 24 | config = DEFAULT_TRANCO_CONFIG.copy() 25 | 26 | if day == "yesterday": 27 | date = (datetime.datetime.utcnow() - datetime.timedelta(days=1)).strftime(DATE_FORMAT_WITH_HYPHEN) 28 | elif day == "today": 29 | date = datetime.datetime.utcnow().strftime(DATE_FORMAT_WITH_HYPHEN) 30 | else: 31 | raise ValueError 32 | config["startDate"], config["endDate"] = get_date_interval_bounds(None, date, 30, "end") 33 | config["isDailyList"] = True 34 | 35 | print("Generating list...") 36 | list_id = combined_lists.config_to_list_id(config) 37 | print("Generating list ID {}...".format(list_id)) 38 | if not combined_lists.list_available(list_id): 39 | conn = Redis('localhost', 6379) 40 | generate_queue = Queue('generate', connection=conn, default_timeout="1h") 41 | if list_id not in generate_queue.job_ids: 42 | generate_queue.enqueue(combined_lists.generate_combined_list, args=(config, list_id), job_id=str(list_id), timeout="1h") 43 | print("Submitted job for list ID {}".format(list_id)) 44 | 45 | 46 | if __name__ == '__main__': 47 | day = "yesterday" 48 | if len(sys.argv) > 1: 49 | day = sys.argv[1] 50 | generate_todays_lists(day) 51 | -------------------------------------------------------------------------------- /generate_domain_parts.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | 4 | import tldextract 5 | 6 | 7 | def generate_parts_list(input_fp, output_fp): 8 | print(input_fp) 9 | print(output_fp) 10 | with open(output_fp, 'w', encoding='UTF-8') as output_file: 11 | output = csv.writer(output_file) 12 | with open(input_fp, encoding='UTF-8') as input_file: 13 | for l in input_file: 14 | rank, fqdn = l.rstrip('\n').split(",") 15 | ext = tldextract.extract(fqdn) 16 | pld = ext.registered_domain 17 | is_pld = pld == fqdn 18 | ps = ext.suffix 19 | tld = fqdn[fqdn.rfind(".") + 1:] 20 | sld = ext.domain 21 | subd = ext.subdomain 22 | output.writerow([rank, fqdn, pld, sld, subd, ps, tld, is_pld]) 23 | 24 | if __name__ == '__main__': 25 | input_fp = sys.argv[1] 26 | output_fp = "/".join(input_fp.split("/")[:-1]) + "/parts/" + input_fp.split("/")[-1][:-4] + "_parts.csv" 27 | generate_parts_list(input_fp, output_fp) -------------------------------------------------------------------------------- /global_config.py: 
-------------------------------------------------------------------------------- 1 | NETAPP_STORAGE_PATH = None # File-based archive 2 | MAILGUN_API_KEY = None # API key for sending email notifications 3 | TOPLISTS_ARCHIVE_S3_BUCKET = None # S3 bucket with archived rankings 4 | TOPLISTS_GENERATED_LIST_S3_BUCKET = None # S3 bucket with generated lists 5 | TOPLISTS_DAILY_LIST_S3_BUCKET = None # S3 bucket with daily default lists 6 | MONGO_URL = None # Mongo instance for storing configurations of lists 7 | USE_S3 = None # Boolean indicating whether to use AWS services 8 | GENERATION_REMOTE = None # Boolean indicating whether list generation is handled remotely 9 | GENERATION_REMOTE_ENDPOINT = None # Endpoint accepting list generation jobs 10 | JOB_SERVER_PORT = None # Port of server accepting list generation jobs -------------------------------------------------------------------------------- /job_handler.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | from redis import Redis 4 | from rq import Queue 5 | from rq.registry import StartedJobRegistry 6 | 7 | import combined_lists 8 | import notify_email 9 | 10 | 11 | class JobHandler: 12 | """ 13 | Manage list generation run on this machine. 14 | """ 15 | def __init__(self, asyncio_loop): 16 | self.loop = asyncio_loop 17 | self.setup_job_queues() 18 | 19 | def setup_job_queues(self): 20 | """ Set up rq queues for submitting list generation and email notification jobs. """ 21 | self.conn = Redis('localhost', 6379) 22 | self.generate_queue = Queue('generate', connection=self.conn, default_timeout="1h") 23 | self.email_queue = Queue('notify_email', connection=self.conn) 24 | 25 | async def submit_generate_job(self, config, list_id): 26 | """ Submit a new job for generating a list (with the given config) """ 27 | if list_id not in await self.loop.run_in_executor(None, self.current_jobs): 28 | await self.loop.run_in_executor(None, functools.partial(self.generate_queue.enqueue, combined_lists.generate_combined_list, args=(config, list_id), job_id=str(list_id), timeout="1h")) 29 | return True 30 | else: 31 | return False 32 | 33 | async def submit_email_job(self, email_address, list_id, list_size): 34 | """ Submit a new job for sending an email once a list has been generated """ 35 | generate_job = await self.loop.run_in_executor(None, self.generate_queue.fetch_job, list_id) 36 | await self.loop.run_in_executor(None, functools.partial(self.email_queue.enqueue, notify_email.send_notification_mailgun_api, email_address, list_id, list_size, depends_on=generate_job)) 37 | return True 38 | 39 | def current_jobs(self): 40 | """ Track currently active and queued jobs """ 41 | registry = StartedJobRegistry(queue=self.generate_queue) 42 | jobs = registry.get_job_ids() + self.generate_queue.job_ids # running jobs first, followed by jobs still waiting in the queue 43 | 44 | return jobs 45 | 46 | def jobs_ahead_of_job(self, list_id): 47 | """ Count number of jobs ahead of current job """ 48 | jobs = self.current_jobs() 49 | if list_id in jobs: 50 | return jobs.index(list_id) 51 | else: 52 | return 0 53 | 54 | async def get_job_status(self, list_id): 55 | """ Get current status of a job """ 56 | job_success = await self.loop.run_in_executor(None, self.get_job_success, list_id) 57 | jobs_ahead = await self.loop.run_in_executor(None, self.jobs_ahead_of_job, list_id) 58 | return {"completed": job_success is not None, "jobs_ahead": jobs_ahead, "success": job_success} 59 | 60 | def get_job_success(self, list_id): 61 | """ Get current rq status of a job """ 62 | return
self.generate_queue.fetch_job(list_id).result 63 | 64 | 65 | class JobHandlerRemote: 66 | """ 67 | Manage relaying jobs to a remote machine that generates lists. 68 | """ 69 | def __init__(self, asyncio_loop, endpoint=None, session=None): 70 | """ 71 | 72 | :param asyncio_loop: 73 | :param endpoint: remote location that generates lists 74 | :param session: client session for aiohttp 75 | """ 76 | if not endpoint or not session: 77 | raise ValueError 78 | self.endpoint = endpoint 79 | self.session = session 80 | 81 | async def submit_generate_job(self, config, list_id): 82 | """ Submit a new job for generating a list (with the given config) """ 83 | async with self.session.post("{}/submit_generate".format(self.endpoint), json={"config": config, "list_id": list_id}) as response: 84 | jsn = await response.json() 85 | return jsn["success"] 86 | 87 | async def submit_email_job(self, email_address, list_id, list_size): 88 | """ Submit a new job for sending an email once a list has been generated """ 89 | async with self.session.post("{}/submit_email".format(self.endpoint), json={"email_address": email_address, "list_id": list_id, "list_size": list_size}) as response: 90 | jsn = await response.json() 91 | return jsn["success"] 92 | 93 | async def get_job_status(self, list_id): 94 | """ Get current status of a job """ 95 | async with self.session.get("{}/job_status".format(self.endpoint), params={"list_id": list_id}) as response: 96 | jsn = await response.json() 97 | return jsn 98 | 99 | async def retrieve_list(self, list_id, slice_size): 100 | """ Retrieve the contents of a remotely generated list """ 101 | async with self.session.get("{}/retrieve_list".format(self.endpoint), json={"list_id": list_id, "slice_size": slice_size}) as response: 102 | while True: 103 | chunk = await response.content.read(1024) 104 | if not chunk: 105 | break 106 | yield chunk 107 | -------------------------------------------------------------------------------- /job_server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aitertools 3 | from aiohttp import web 4 | 5 | import combined_lists 6 | import job_handler 7 | from global_config import JOB_SERVER_PORT 8 | 9 | 10 | class JobServer: 11 | """ Job server for accepting requests for generating a custom Tranco list (hosted on remote machine) """ 12 | 13 | def __init__(self, loop): 14 | self.web_app = None 15 | self.server = None 16 | self.runner = None 17 | self.routes = web.RouteTableDef() 18 | self.loop = loop 19 | self.job_handler: job_handler.JobHandler = None 20 | 21 | async def submit_generate_job(self, request): 22 | """ Submit a new job for generating a list (with the given config) """ 23 | post_data = await request.json() 24 | print("Generating ", post_data) 25 | result = await self.job_handler.submit_generate_job(post_data["config"], post_data["list_id"]) 26 | return web.json_response({"success": result}) 27 | 28 | async def submit_email_job(self, request): 29 | """ Submit a new job for sending an email once a list has been generated """ 30 | post_data = await request.json() 31 | result = await self.job_handler.submit_email_job(post_data["email_address"], post_data["list_id"], post_data["list_size"]) 32 | return web.json_response({"success": result}) 33 | 34 | async def get_job_status(self, request): 35 | """ Get current status of a job """ 36 | list_id = request.query['list_id'] 37 | print("Getting status for ", list_id) 38 | return web.json_response(await 
self.job_handler.get_job_status(list_id)) 39 | 40 | async def retrieve_list(self, request): 41 | """ Retrieve the contents of a remotely generated list """ 42 | post_data = await request.json() 43 | list_id = post_data["list_id"] 44 | slice_size = post_data["slice_size"] 45 | file_path = await self.loop.run_in_executor(None, combined_lists.get_generated_list_fp, list_id) 46 | 47 | async def generator(): 48 | with open(file_path) as csvf: 49 | async for line in aitertools.islice(csvf, slice_size): 50 | yield line.encode("utf-8") 51 | 52 | return web.Response(body=generator(), 53 | content_type="text/csv", 54 | charset="utf-8", 55 | ) 56 | 57 | async def initialize_routes(self): 58 | self.web_app.add_routes([ 59 | web.post('/submit_generate', self.submit_generate_job), 60 | web.post('/submit_email', self.submit_email_job), 61 | web.get('/job_status', self.get_job_status), 62 | web.get('/retrieve_list', self.retrieve_list) 63 | ]) 64 | 65 | async def run(self): 66 | self.job_handler = job_handler.JobHandler(self.loop) 67 | 68 | self.web_app = web.Application() 69 | 70 | await self.initialize_routes() 71 | self.runner = web.AppRunner(self.web_app) 72 | await self.runner.setup() 73 | self.server = web.TCPSite(self.runner, '0.0.0.0', JOB_SERVER_PORT) 74 | await self.server.start() 75 | 76 | 77 | if __name__ == '__main__': 78 | loop = asyncio.get_event_loop() 79 | server = JobServer(loop) 80 | loop.run_until_complete(server.run()) 81 | loop.run_forever() -------------------------------------------------------------------------------- /notify_email.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | from email.message import EmailMessage 3 | import email.utils 4 | 5 | import requests 6 | from rq import Queue, Connection, get_current_connection 7 | from global_config import MAILGUN_API_KEY 8 | 9 | def send_notification_mailgun_api(email_address, list_id, list_size): 10 | with Connection(get_current_connection()): 11 | q = Queue('generate') 12 | job = q.fetch_job(list_id) 13 | success = job.result 14 | 15 | if success: 16 | subject = 'The Tranco list: generation succeeded' 17 | body = "Hello,\n\nWe have successfully generated your requested Tranco list with ID {}. You may retrieve it at https://tranco-list.eu/list/{}/{}\n\nTranco\nhttps://tranco-list.eu/".format(list_id, list_id, list_size) 18 | else: 19 | subject = 'The Tranco list: generation failed' 20 | body = "Hello,\n\nUnfortunately, we were unable to generate your requested Tranco list with ID {}. Please try again later.\n\nTranco\nhttps://tranco-list.eu/".format(list_id) 21 | 22 | r = requests.post( 23 | "https://api.eu.mailgun.net/v3/mg.tranco-list.eu/messages", 24 | auth=("api", MAILGUN_API_KEY), 25 | data={"from": "Tranco ", 26 | "to": [email_address], 27 | "subject": subject, 28 | "text": body}) 29 | return int(r.status_code) == 200 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | smart_open 3 | hashids 4 | pymongo 5 | redis 6 | rq 7 | aiohttp 8 | aitertools 9 | requests 10 | tldextract -------------------------------------------------------------------------------- /shared.py: -------------------------------------------------------------------------------- 1 | DATE_FORMAT_WITH_HYPHEN = "%Y-%m-%d" 2 | DEFAULT_TRANCO_CONFIG = {"nbDays": "30", "nbDaysFrom": "end", 3 | "combinationMethod": "dowdall", # TODO make choice based on assessment on stability etc.
4 | "listPrefix": 'full', 5 | "includeDomains": 'all', # TODO make choice 6 | "filterPLD": "on", 7 | "providers": ["alexa", "umbrella", "majestic", "quantcast"] 8 | } 9 | ZIP_FILENAME_FORMAT = "tranco_{}-1m.csv.zip" --------------------------------------------------------------------------------
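Usage sketch (illustrative only, not part of the repository): the snippet below shows how the pieces above fit together when generating a custom list synchronously, bypassing the rq queue used by generate_daily_list.py and job_handler.py. It is a minimal sketch under some assumptions: the example dates are placeholders, global_config.py has been filled in, and archived source rankings for the configured providers and dates (including the preprocessed parts files, since the default configuration filters on pay-level domains) are available in the file-based archive or on S3.

import combined_lists
from shared import DEFAULT_TRANCO_CONFIG

# Start from the default Tranco configuration and pick a date range (placeholder values).
config = DEFAULT_TRANCO_CONFIG.copy()
config["startDate"] = "2019-01-01"
config["endDate"] = "2019-01-30"

# Register the configuration (or look up an existing one) and obtain its hashed list ID.
list_id = combined_lists.config_to_list_id(config)

# Generate the list if it has not been generated before; the result is written to the
# configured archive and the outcome is recorded in MongoDB.
if not combined_lists.list_available(list_id):
    combined_lists.generate_combined_list(config, list_id)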