├── README.md ├── combined_lists.py ├── generate_daily_list.py ├── generate_domain_parts.py ├── global_config.py ├── job_handler.py ├── job_server.py ├── notify_email.py ├── requirements.txt └── shared.py /README.md: -------------------------------------------------------------------------------- 1 | # Tranco: A Research-Oriented Top Sites Ranking Hardened Against Manipulation 2 | 3 | *By Victor Le Pochat, Tom Van Goethem, Samaneh Tajalizadehkhoob, Maciej Korczyński and Wouter Joosen* 4 | 5 | This repository contains the source code driving the generation of the Tranco ranking provided at [https://tranco-list.eu/](https://tranco-list.eu/). This new top websites ranking was proposed in our paper [Tranco: A Research-Oriented Top Sites Ranking Hardened Against Manipulation](https://tranco-list.eu/assets/tranco-ndss19.pdf). 6 | 7 | * `combined_lists.py` contains the core code for generating new lists based on a configuration passed to `combined_lists.generate_combined_list`; see the illustrative usage sketch at the end of this listing. 8 | * `shared.py` and `global_config.py` contain several configuration variables; `shared.DEFAULT_TRANCO_CONFIG` gives the configuration of the default (daily updated) Tranco list. 9 | * `generate_daily_list.py` runs daily to generate the default Tranco list. 10 | * `job_handler.py` contains the code either for submitting jobs to a local `rq` queue for processing, or for relaying list generation requests to a remote host. 11 | * `job_server.py` accepts requests for list generation on a remote host. 12 | * `notify_email.py` contains code to notify users when their list has been generated. 13 | * `generate_domain_parts.py` preprocesses rankings to extract the different components of domains. 14 | -------------------------------------------------------------------------------- /combined_lists.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | import csv 3 | import datetime 4 | import glob 5 | import shutil 6 | import time 7 | import traceback 8 | import zipfile 9 | from itertools import islice 10 | import os 11 | import tempfile 12 | 13 | # Imports of configuration variables 14 | from global_config import * 15 | 16 | # Constants 17 | GLOBAL_MAX_RANK = 1000000 18 | LIST_FILENAME_FORMAT = "{}.csv" 19 | from shared import ZIP_FILENAME_FORMAT 20 | 21 | # When using AWS services, set up retrieval and storage of lists for S3 22 | if USE_S3: 23 | import boto3 24 | s3_resource = boto3.resource('s3', region_name="us-east-1") 25 | toplists_archive_bucket = s3_resource.Bucket(name=TOPLISTS_ARCHIVE_S3_BUCKET) 26 | from smart_open import smart_open 27 | 28 | # List ID generation 29 | from hashids import Hashids 30 | hsh = Hashids(salt="tsr", min_length=4, alphabet="BCDFGHJKLMNPQRSTVWXYZ23456789") 31 | 32 | # Mongo connection for storing configuration of generated lists 33 | from pymongo import MongoClient 34 | client = MongoClient(MONGO_URL) 35 | db = client["tranco"] 36 | 37 | def count_dict(dct, entry, value=1): 38 | """ Helper function for updating dictionaries """ 39 | if entry not in dct: 40 | dct[entry] = 0 41 | dct[entry] += value 42 | 43 | def date_list(start_date, end_date): 44 | """ Generate list of dates between start and end date """ 45 | start_date_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d") 46 | end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d") 47 | return [(start_date_dt + datetime.timedelta(days=x)) for x in range((end_date_dt - start_date_dt).days + 1)] 48 | 49 | def _db_id_to_list_id(db_id): 50 | """ List number to hash """ 51 | if db_id: 52 |
return hsh.encode(db_id) 53 | else: 54 | return None 55 | 56 | def _list_id_to_db_id(list_id): 57 | """ Hash to list number """ 58 | try: 59 | return hsh.decode(list_id)[0] 60 | except: 61 | return None 62 | 63 | def config_to_list_id(config, insert=True, skip_failed=False): 64 | """ List configuration to list hash (either insert new configuration into database, or retrieve ID for existing list with that configuration) 65 | :param config: list configuration 66 | :param insert: whether to create a new list ID if the given configuration does not exist yet 67 | :param skip_failed: skip failed lists 68 | :return: 69 | """ 70 | 71 | if skip_failed: 72 | query = {**config, "failed": {"$ne": True}} 73 | else: 74 | query = config 75 | out = db["lists"].find_one(query) 76 | if out: 77 | db_id = int(out["_id"]) 78 | else: 79 | if insert: 80 | db_id = get_next_db_key() 81 | insert_config_in_db(config, db_id) 82 | else: 83 | return None 84 | return _db_id_to_list_id(db_id) 85 | 86 | def list_id_to_config(list_id): 87 | """ Retrieve configuration of existing list based on hash """ 88 | db_id = _list_id_to_db_id(list_id) 89 | if db_id: 90 | return {**db["lists"].find_one({"_id": int(db_id)}), "list_id": list_id} 91 | 92 | def list_available(list_id): 93 | """ Check if list is available for download """ 94 | db_id = _list_id_to_db_id(list_id) 95 | if not db_id: 96 | return False 97 | doc = db["lists"].find_one({"_id": int(db_id)}) 98 | return doc is not None and doc.get("finished", False) and not doc.get("failed", True) 99 | 100 | def get_next_db_key(): 101 | """ Get next key from list configuration database (for a new list) """ 102 | counter_increase = db["counter"].find_one_and_update({"_id": "lists"}, {'$inc': {'count': 1}}) 103 | return int(counter_increase["count"]) 104 | 105 | def insert_config_in_db(config, db_id): 106 | """ Insert a new configuration into the database, with the given key """ 107 | db["lists"].insert_one({**config, "_id": db_id, "finished": False, 108 | "creationDate": datetime.datetime.now().strftime("%Y-%m-%d"), 109 | "creationTime": datetime.datetime.now().isoformat()}) 110 | 111 | def get_generated_list_fp(list_id): 112 | """ Get file location of existing list (file-based archive) """ 113 | return os.path.join(NETAPP_STORAGE_PATH, "generated_lists/{}".format(LIST_FILENAME_FORMAT.format(list_id))) 114 | 115 | def get_generated_zip_fp(list_id): 116 | """ Get file location of existing zip (file-based archive) """ 117 | return os.path.join(NETAPP_STORAGE_PATH, "generated_lists_zip/{}".format(ZIP_FILENAME_FORMAT.format(list_id))) 118 | 119 | def get_generated_list_s3(list_id): 120 | """ Get file location of existing list (AWS S3) """ 121 | return "s3://{}/{}".format(TOPLISTS_GENERATED_LIST_S3_BUCKET, LIST_FILENAME_FORMAT.format(list_id)) 122 | 123 | def get_generated_zip_s3(list_id): 124 | """ Get file location of existing zip (AWS S3) """ 125 | return "s3://{}/{}".format(TOPLISTS_DAILY_LIST_S3_BUCKET, ZIP_FILENAME_FORMAT.format(list_id)) 126 | 127 | def get_list_fp_for_day(provider, date, parts=False): 128 | """ Get file location for source list (of one of the providers) """ 129 | date = date.strftime("%Y%m%d") 130 | if parts: 131 | fp = next(glob.iglob(os.path.join(NETAPP_STORAGE_PATH, "archive/{}/parts/{}_{}_parts.csv".format(provider, provider, date)))) 132 | else: 133 | fp = next(glob.iglob(os.path.join(NETAPP_STORAGE_PATH, "archive/{}/{}_{}.csv".format(provider, provider, date)))) 134 | return fp 135 | 136 | def get_s3_key_for_day(provider, date, parts=False): 137 | """ Get 
S3 key for source list (of one of the providers) """ 138 | date = date.strftime("%Y%m%d") 139 | if parts: 140 | fp = "{}/parts/{}_{}_parts.csv".format(provider, provider, date) 141 | else: 142 | fp = "{}/{}_{}.csv".format(provider, provider, date) 143 | return fp 144 | 145 | def get_s3_url_for_day(provider, date, parts=False): 146 | """ Get S3 url for source list (of one of the providers) """ 147 | key = get_s3_key_for_day(provider, date, parts) 148 | return "s3://{}/{}".format(TOPLISTS_ARCHIVE_S3_BUCKET, key) 149 | 150 | def get_s3_url_for_fp(fp): 151 | """ Get S3 url for source list (of one of the providers) """ 152 | return "s3://{}/{}".format(TOPLISTS_ARCHIVE_S3_BUCKET, fp) 153 | 154 | def generate_prefix_items_file(fp, list_prefix): 155 | """ Create list of source list items (up to requested list length) """ 156 | with open(fp, encoding='utf8') as f: 157 | if list_prefix: 158 | return [r.split(",") for r in islice(f.read().splitlines(), list_prefix)] 159 | else: 160 | return [r.split(",") for r in f.read().splitlines()] 161 | 162 | def generate_prefix_items_s3(fp, list_prefix): 163 | """ Create list of source list items (up to requested list length) """ 164 | with smart_open(get_s3_url_for_fp(fp)) as f: 165 | if list_prefix: 166 | result = [r.decode("utf-8").split(",") for r in islice(f.read().splitlines(), list_prefix)] 167 | else: 168 | result = [r.decode("utf-8").split(",") for r in f.read().splitlines()] 169 | return result 170 | 171 | def rescale_rank(rank, max_rank_of_input, min_rank_of_output, max_rank_of_output): 172 | """ 173 | Rescale a given rank to the min/max range provided 174 | This makes sure that shorter lists are not given a higher importance. 175 | """ 176 | return min_rank_of_output + (rank - 1)*((max_rank_of_output-min_rank_of_output)/(max_rank_of_input - 1)) 177 | 178 | def borda_count_fp(fps, list_prefix): 179 | """ Generate aggregate scores for domains based on Borda count """ 180 | borda_scores = {} 181 | for fp in fps: 182 | if USE_S3: 183 | items = generate_prefix_items_s3(fp, list_prefix) 184 | else: 185 | items = generate_prefix_items_file(fp, list_prefix) 186 | max_rank_of_input = len(items) 187 | max_rank_of_output = min(GLOBAL_MAX_RANK, list_prefix if list_prefix else GLOBAL_MAX_RANK) 188 | for rank, elem in items: 189 | count_dict(borda_scores, elem, max_rank_of_output + 1 - rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists (i.e. Quantcast) 190 | return borda_scores 191 | 192 | def dowdall_count_fp(fps, list_prefix): 193 | """ Generate aggregate scores for domains based on Dowdall count """ 194 | dowdall_scores = {} 195 | for fp in fps: 196 | if USE_S3: 197 | items = generate_prefix_items_s3(fp, list_prefix) 198 | else: 199 | items = generate_prefix_items_file(fp, list_prefix) 200 | max_rank_of_input = len(items) 201 | max_rank_of_output = min(GLOBAL_MAX_RANK, list_prefix if list_prefix else GLOBAL_MAX_RANK) 202 | for rank, elem in items: 203 | count_dict(dowdall_scores, elem, 1 / rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists (i.e. 
Quantcast) 204 | return dowdall_scores 205 | 206 | def filtered_parts_list_file(fp, list_prefix, f_pld=None, f_tlds=None, f_organization=None, f_subdomains=None, maintain_rank=True): 207 | """ Get list of domains that conform to the set filters """ 208 | with open(fp) as f: 209 | if list_prefix: 210 | parts_input = islice(f, list_prefix) 211 | else: 212 | parts_input = f 213 | output = [] 214 | organizations_seen = set() 215 | new_rank = 1 216 | max_rank = 0 217 | for line in parts_input: 218 | max_rank += 1 219 | rank, fqdn, pld, sld, subd, ps, tld, is_pld = line.rstrip().split(",") 220 | if f_tlds and (tld not in f_tlds): 221 | continue 222 | if f_subdomains and (subd not in f_subdomains): 223 | continue 224 | if f_organization: 225 | if sld in organizations_seen: 226 | continue 227 | else: 228 | organizations_seen.add(sld) 229 | if f_pld: 230 | if is_pld != "True": 231 | continue 232 | if maintain_rank: 233 | output.append((rank, fqdn)) 234 | else: 235 | output.append((new_rank, fqdn)) 236 | new_rank += 1 237 | return (output, max_rank) 238 | 239 | def filtered_parts_list_s3(fp, list_prefix, f_pld=None, f_tlds=None, f_organization=None, f_subdomains=None, maintain_rank=True): 240 | """ Get list of domains that conform to the set filters """ 241 | with smart_open(get_s3_url_for_fp(fp)) as f: 242 | if list_prefix: 243 | parts_input = islice(f, list_prefix) 244 | else: 245 | parts_input = f 246 | output = [] 247 | organizations_seen = set() 248 | new_rank = 1 249 | max_rank = 0 250 | for line in parts_input: 251 | max_rank += 1 252 | rank, fqdn, pld, sld, subd, ps, tld, is_pld = line.decode("utf-8").rstrip().split(",") 253 | if f_tlds and (tld not in f_tlds): 254 | continue 255 | if f_subdomains and (subd not in f_subdomains): 256 | continue 257 | if f_organization: 258 | if sld in organizations_seen: 259 | continue 260 | else: 261 | organizations_seen.add(sld) 262 | if f_pld: 263 | if is_pld != "True": 264 | continue 265 | if maintain_rank: 266 | output.append((rank, fqdn)) 267 | else: 268 | output.append((new_rank, fqdn)) 269 | new_rank += 1 270 | return (output, max_rank) 271 | 272 | def get_filtered_parts_lists(fps, input_prefix, config, maintain_rank=True): 273 | """ Get domains in given source lists that conform to the filters in the configuration """ 274 | for fp in fps: 275 | if USE_S3: 276 | yield filtered_parts_list_s3(fp, input_prefix, 277 | config.get("filterPLD", None) == "on", 278 | config.get('filterTLDValue').split(",") if config.get("filterTLDValue", 279 | None) else None, 280 | config.get("filterOrganization", None) == "on", 281 | config.get('filterSubdomainValue').split(",") if config.get( 282 | "filterSubdomainValue", None) else None, 283 | maintain_rank=maintain_rank 284 | ) 285 | else: 286 | yield filtered_parts_list_file(fp, input_prefix, 287 | config.get("filterPLD", None) == "on", 288 | config.get('filterTLDValue').split(",") if config.get("filterTLDValue", 289 | None) else None, 290 | config.get("filterOrganization", None) == "on", 291 | config.get('filterSubdomainValue').split(",") if config.get( 292 | "filterSubdomainValue", None) else None, 293 | maintain_rank=maintain_rank 294 | ) 295 | 296 | def borda_count_list(fps, input_prefix, config, maintain_rank=True): 297 | """ Generate aggregate scores for list of filtered domains based on Borda count """ 298 | borda_scores = {} 299 | for (filtered_lst, max_rank) in get_filtered_parts_lists(fps, input_prefix, config): 300 | if maintain_rank: 301 | max_rank_of_input = max_rank 302 | else: 303 | max_rank_of_input = 
len(filtered_lst) 304 | max_rank_of_output = min(GLOBAL_MAX_RANK, input_prefix if input_prefix else GLOBAL_MAX_RANK) 305 | for rank, elem in filtered_lst: 306 | count_dict(borda_scores, elem, max_rank_of_output + 1 - rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists 307 | return borda_scores 308 | 309 | def dowdall_count_list(fps, input_prefix, config, maintain_rank=True): 310 | """ Generate aggregate scores for list of filtered domains based on Dowdall count """ 311 | dowdall_scores = {} 312 | for (filtered_lst, max_rank) in get_filtered_parts_lists(fps, input_prefix, config): 313 | if maintain_rank: 314 | max_rank_of_input = max_rank 315 | else: 316 | max_rank_of_input = len(filtered_lst) 317 | max_rank_of_output = min(GLOBAL_MAX_RANK, input_prefix if input_prefix else GLOBAL_MAX_RANK) 318 | for rank, elem in filtered_lst: 319 | count_dict(dowdall_scores, elem, 1 / rescale_rank(int(rank), max_rank_of_input, 1, max_rank_of_output)) # necessary to rescale shorter lists 320 | return dowdall_scores 321 | 322 | def sort_counts(scores): 323 | """ Sort domains based on aggregate scores """ 324 | return sorted(scores.keys(), key=lambda elem: (-scores[elem], elem)) 325 | 326 | def filter_list_1(lst, filter_set, list_size=None): 327 | """ Filter list of domains on given set of domains """ 328 | if list_size: 329 | result = [] 330 | for e in lst: 331 | if e in filter_set: 332 | result.append(e) 333 | if len(result) >= list_size: 334 | break 335 | return result 336 | else: 337 | return [e for e in lst if e in filter_set] 338 | 339 | def filter_list_multiple(lst, filter_sets): 340 | """ Filter list of domains on given sets of domains """ 341 | return [e for e in lst if all(e in filter_set for filter_set in filter_sets)] 342 | 343 | def count_presence_in_fps(fps, prefix): 344 | """ Count occurrences of each domain in the given files """ 345 | presence = {} 346 | for fp in fps: 347 | lst = generate_prefix_items_s3(fp, prefix) 348 | for i in lst: 349 | count_dict(presence, i[1], 1) # use the domain (second field of each (rank, domain) item) as the key 350 | return presence 351 | def count_presence_in_sets(sets): 352 | """ Counts of occurrences in given sets """ 353 | presence = {} 354 | for st in sets: 355 | for i in st: 356 | count_dict(presence, i, 1) 357 | return presence 358 | 359 | def items_in_any_list(fps, prefix): 360 | """ Find domains that appear in any of the given lists """ 361 | return set.union(*map(set, [[i[1] for i in generate_prefix_items_s3(fp, prefix)] for fp in fps])) 362 | 363 | def generate_filter_minimum_presence(fps, prefix, minimum): 364 | """ An item should appear in at least `minimum` of the given lists """ 365 | presence = count_presence_in_fps(fps, prefix) 366 | return {k for k, v in presence.items() if v >= minimum} 367 | 368 | def generate_filter_minimum_presence_any(groups_of_fps, prefix, minimum): 369 | """ An item should appear in `minimum` groups, where an item may appear in any list in that group """ 370 | items_per_group = [items_in_any_list(group, prefix) for group in groups_of_fps] 371 | presence = count_presence_in_sets(items_per_group) 372 | return {k for k, v in presence.items() if v >= minimum} 373 | 374 | def truncate_list(lst, list_size=None): 375 | """ Return only prefix of given list """ 376 | return lst[:list_size] if list_size else lst 377 | 378 | def write_sorted_counts(sorted_items, scores, fp): 379 | """ Write domains and aggregate scores to file """ 380 | with open(fp, 'w', encoding='utf8') as f: 381 | csvw = csv.writer(f) 382 | for idx, entry in enumerate(sorted_items): 383 | csvw.writerow([idx + 1, entry,
scores[entry]]) 384 | 385 | def write_list_to_file(lst, list_id): 386 | """ Write ranks and domains to file """ 387 | with open(get_generated_list_fp(list_id), 'w', encoding='utf8') as f: 388 | csvw = csv.writer(f) 389 | for idx, entry in enumerate(lst): 390 | csvw.writerow([idx + 1, entry]) 391 | 392 | 393 | def write_zip_to_file(lst, list_id): 394 | """ Write list of (top 1M) domains to zip file """ 395 | with tempfile.SpooledTemporaryFile(mode='w+b') as z: 396 | with tempfile.NamedTemporaryFile(mode='w+') as t: 397 | csvw = csv.writer(t) 398 | for idx, entry in enumerate(lst): 399 | csvw.writerow([idx + 1, entry]) 400 | 401 | t.seek(0) 402 | 403 | with zipfile.ZipFile(z, 'w') as a: 404 | a.write(t.name, arcname="top-1m.csv") 405 | 406 | z.seek(0) 407 | 408 | with open(get_generated_zip_fp(list_id), 'wb') as f: 409 | f.write(z.read()) 410 | 411 | 412 | def write_list_to_s3(lst, list_id): 413 | """ Write ranks and domains to file """ 414 | with smart_open(get_generated_list_s3(list_id), 'w', encoding='utf8') as f: 415 | csvw = csv.writer(f) 416 | for idx, entry in enumerate(lst): 417 | csvw.writerow([idx + 1, entry]) 418 | 419 | 420 | def write_zip_to_s3(lst, list_id): 421 | """ Write list of (top 1M) domains to zip file """ 422 | with tempfile.SpooledTemporaryFile(mode='w+b') as z: 423 | with tempfile.NamedTemporaryFile(mode='w+') as t: 424 | csvw = csv.writer(t) 425 | for idx, entry in enumerate(lst): 426 | csvw.writerow([idx + 1, entry]) 427 | 428 | t.seek(0) 429 | 430 | with zipfile.ZipFile(z, 'w') as a: 431 | a.write(t.name, arcname="top-1m.csv") 432 | 433 | z.seek(0) 434 | 435 | with smart_open(get_generated_zip_s3(list_id), 'wb') as f: 436 | f.write(z.read()) 437 | 438 | 439 | def copy_daily_list_s3(list_id): 440 | """ Copy the daily list on S3 to the fixed URL """ 441 | zip_key = ZIP_FILENAME_FORMAT.format(list_id) 442 | source = {'Bucket': TOPLISTS_DAILY_LIST_S3_BUCKET, 'Key': zip_key} 443 | target_bucket = s3_resource.Bucket(TOPLISTS_DAILY_LIST_S3_BUCKET) 444 | target_bucket.copy(source, 'top-1m.csv.zip') 445 | 446 | 447 | def copy_daily_list_file(list_id): 448 | """ Copy the daily list on file-based archive to the fixed URL """ 449 | zip_file = get_generated_zip_fp(list_id) 450 | target_file = os.path.join(NETAPP_STORAGE_PATH, "generated_lists_zip/{}".format("top-1m.csv.zip")) 451 | shutil.copy2(zip_file, target_file) 452 | 453 | def generate_combined_list(config, list_id, test=False): 454 | """ Generate combined list by calculating aggregate scores on (potentially filtered) source lists of ranked domains """ 455 | db_id = _list_id_to_db_id(list_id) 456 | try: 457 | ### INPUT ### 458 | 459 | # If a filter on parts is selected, the preprocessed parts files should be used. 
460 | parts_filter = config.get("filterPLD", False) or (config.get("filterTLD", "false") != "false") or config.get("filterOrganization", False) or config.get('filterSubdomain', False) 461 | dates = date_list(config.get("startDate"), config.get("endDate")) 462 | 463 | # Get source files to process 464 | fps = [] 465 | fps_on_date = {date: [] for date in dates} 466 | fps_on_provider = {provider: [] for provider in config['providers']} 467 | for provider in config['providers']: 468 | for date in dates: 469 | if USE_S3: 470 | list_fp = get_s3_key_for_day(provider, date, parts_filter) 471 | else: 472 | list_fp = get_list_fp_for_day(provider, date, parts_filter) 473 | fps.append(list_fp) 474 | fps_on_date[date].append(list_fp) 475 | fps_on_provider[provider].append(list_fp) 476 | 477 | # Get requested list prefix 478 | if "listPrefix" in config and config['listPrefix']: 479 | if config['listPrefix'] == "full": 480 | input_prefix = None 481 | elif config['listPrefix'] == "custom": 482 | input_prefix = int(config['listPrefixCustomValue']) 483 | else: 484 | input_prefix = int(config['listPrefix']) 485 | else: 486 | input_prefix = None 487 | 488 | # Generate (sorted) aggregate counts (on parts files if necessary) 489 | if parts_filter: 490 | if config['combinationMethod'] == 'borda': 491 | scores = borda_count_list(fps, input_prefix, config) 492 | elif config['combinationMethod'] == 'dowdall': 493 | scores = dowdall_count_list(fps, input_prefix, config) 494 | else: 495 | raise Exception("Unknown combination method") 496 | else: 497 | if config['combinationMethod'] == 'borda': 498 | scores = borda_count_fp(fps, input_prefix) 499 | elif config['combinationMethod'] == 'dowdall': 500 | scores = dowdall_count_fp(fps, input_prefix) 501 | else: 502 | raise Exception("Unknown combination method") 503 | sorted_domains = sort_counts(scores) 504 | domains = sorted_domains 505 | 506 | ### FILTERS ### 507 | 508 | filters_to_apply = [] 509 | if "inclusionDays" in config and config["inclusionDays"]: 510 | presence_filter = generate_filter_minimum_presence_any([fps_on_date[date] for date in dates], input_prefix, int(config["inclusionDaysValue"])) 511 | filters_to_apply.append(presence_filter) 512 | if "inclusionLists" in config and config["inclusionLists"]: 513 | presence_filter = generate_filter_minimum_presence_any([fps_on_provider[provider] for provider in config['providers']], input_prefix, int(config["inclusionListsValue"])) 514 | filters_to_apply.append(presence_filter) 515 | domains = filter_list_multiple(domains, filters_to_apply) 516 | 517 | ### OUTPUT ### 518 | 519 | if test: 520 | return domains 521 | else: 522 | # Write list to file 523 | if USE_S3: 524 | write_list_to_s3(domains, list_id) 525 | else: 526 | write_list_to_file(domains, list_id) 527 | 528 | # If the list is the daily default list, also generate a zip of the top 1M and copy to permanent URL 529 | try: 530 | if "isDailyList" in config and config["isDailyList"] is True: 531 | if USE_S3: 532 | write_zip_to_s3(domains[:1000000], list_id) 533 | copy_daily_list_s3(list_id) 534 | else: 535 | write_zip_to_file(domains[:1000000], list_id) 536 | copy_daily_list_file(list_id) 537 | except: 538 | print("Zip creation failed") 539 | traceback.print_exc() 540 | 541 | # Update generation success in database 542 | db["lists"].update_one({"_id": db_id}, {"$set": {"finished": True, "failed": False, "list_id": list_id}}) 543 | 544 | time.sleep(1) 545 | # Report success 546 | return True 547 | except: 548 | traceback.print_exc() 549 | # Update generation 
failure in database 550 | db["lists"].update_one({"_id": db_id}, {"$set": {"finished": True, "failed": True}}) 551 | # Report failure 552 | return False 553 | 554 | -------------------------------------------------------------------------------- /generate_daily_list.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import sys 3 | 4 | from redis import Redis 5 | from rq import Queue 6 | 7 | import combined_lists 8 | from shared import DATE_FORMAT_WITH_HYPHEN, DEFAULT_TRANCO_CONFIG 9 | 10 | 11 | 12 | def get_date_interval_bounds(start_date, end_date, nb_days, nb_days_from): 13 | if start_date: 14 | start_date_dt = datetime.datetime.strptime(start_date, DATE_FORMAT_WITH_HYPHEN) 15 | return ( 16 | start_date, (start_date_dt + datetime.timedelta(days=int(nb_days) - 1)).strftime(DATE_FORMAT_WITH_HYPHEN)) 17 | elif end_date: 18 | end_date_dt = datetime.datetime.strptime(end_date, DATE_FORMAT_WITH_HYPHEN) 19 | return ((end_date_dt - datetime.timedelta(days=int(nb_days) - 1)).strftime(DATE_FORMAT_WITH_HYPHEN), end_date) 20 | 21 | 22 | def generate_todays_lists(day): 23 | print("Generating lists for {}...".format(day)) 24 | config = DEFAULT_TRANCO_CONFIG.copy() 25 | 26 | if day == "yesterday": 27 | date = (datetime.datetime.utcnow() - datetime.timedelta(days=1)).strftime(DATE_FORMAT_WITH_HYPHEN) 28 | elif day == "today": 29 | date = datetime.datetime.utcnow().strftime(DATE_FORMAT_WITH_HYPHEN) 30 | else: 31 | raise ValueError 32 | config["startDate"], config["endDate"] = get_date_interval_bounds(None, date, 30, "end") 33 | config["isDailyList"] = True 34 | 35 | print("Generating list...") 36 | list_id = combined_lists.config_to_list_id(config) 37 | print("Generating list ID {}...".format(list_id)) 38 | if not combined_lists.list_available(list_id): 39 | conn = Redis('localhost', 6379) 40 | generate_queue = Queue('generate', connection=conn, default_timeout="1h") 41 | if list_id not in generate_queue.job_ids: 42 | generate_queue.enqueue(combined_lists.generate_combined_list, args=(config, list_id), job_id=str(list_id), timeout="1h") 43 | print("Submitted job for list ID {}".format(list_id)) 44 | 45 | 46 | if __name__ == '__main__': 47 | day = "yesterday" 48 | if len(sys.argv) > 1: 49 | day = sys.argv[1] 50 | generate_todays_lists(day) 51 | -------------------------------------------------------------------------------- /generate_domain_parts.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | 4 | import tldextract 5 | 6 | 7 | def generate_parts_list(input_fp, output_fp): 8 | print(input_fp) 9 | print(output_fp) 10 | with open(output_fp, 'w', encoding='UTF-8') as output_file: 11 | output = csv.writer(output_file) 12 | with open(input_fp, encoding='UTF-8') as input_file: 13 | for l in input_file: 14 | rank, fqdn = l.rstrip('\n').split(",") 15 | ext = tldextract.extract(fqdn) 16 | pld = ext.registered_domain 17 | is_pld = pld == fqdn 18 | ps = ext.suffix 19 | tld = fqdn[fqdn.rfind(".") + 1:] 20 | sld = ext.domain 21 | subd = ext.subdomain 22 | output.writerow([rank, fqdn, pld, sld, subd, ps, tld, is_pld]) 23 | 24 | if __name__ == '__main__': 25 | input_fp = sys.argv[1] 26 | output_fp = "/".join(input_fp.split("/")[:-1]) + "/parts/" + input_fp.split("/")[-1][:-4] + "_parts.csv" 27 | generate_parts_list(input_fp, output_fp) -------------------------------------------------------------------------------- /global_config.py: 
-------------------------------------------------------------------------------- 1 | NETAPP_STORAGE_PATH = None # File-based archive 2 | MAILGUN_API_KEY = None # API key for sending email notifications 3 | TOPLISTS_ARCHIVE_S3_BUCKET = None # S3 bucket with archived rankings 4 | TOPLISTS_GENERATED_LIST_S3_BUCKET = None # S3 bucket with generated lists 5 | TOPLISTS_DAILY_LIST_S3_BUCKET = None # S3 bucket with daily default lists 6 | MONGO_URL = None # Mongo instance for storing configurations of lists 7 | USE_S3 = None # Boolean indicating whether to use AWS services 8 | GENERATION_REMOTE = None # Boolean indicating whether list generation is handled remotely 9 | GENERATION_REMOTE_ENDPOINT = None # Endpoint accepting list generation jobs 10 | JOB_SERVER_PORT = None # Port of server accepting list generation jobs -------------------------------------------------------------------------------- /job_handler.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | from redis import Redis 4 | from rq import Queue 5 | from rq.registry import StartedJobRegistry 6 | 7 | import combined_lists 8 | import notify_email 9 | 10 | 11 | class JobHandler: 12 | """ 13 | Manage list generation run on this machine. 14 | """ 15 | def __init__(self, asyncio_loop): 16 | self.loop = asyncio_loop 17 | self.setup_job_queues() 18 | 19 | def setup_job_queues(self): 20 | """ Set up rq queues for submitting list generation and email notification jobs. """ 21 | self.conn = Redis('localhost', 6379) 22 | self.generate_queue = Queue('generate', connection=self.conn, default_timeout="1h") 23 | self.email_queue = Queue('notify_email', connection=self.conn) 24 | 25 | async def submit_generate_job(self, config, list_id): 26 | """ Submit a new job for generating a list (with the given config) """ 27 | if list_id not in await self.loop.run_in_executor(None, self.current_jobs): 28 | await self.loop.run_in_executor(None, functools.partial(self.generate_queue.enqueue, combined_lists.generate_combined_list, args=(config, list_id), job_id=str(list_id), timeout="1h")) 29 | return True 30 | else: 31 | return False 32 | 33 | async def submit_email_job(self, email_address, list_id, list_size): 34 | """ Submit a new job for sending an email once a list has been generated """ 35 | generate_job = await self.loop.run_in_executor(None, self.generate_queue.fetch_job, list_id) 36 | await self.loop.run_in_executor(None, functools.partial(self.email_queue.enqueue, notify_email.send_notification_mailgun_api, email_address, list_id, list_size, depends_on=generate_job)) 37 | return True 38 | 39 | def current_jobs(self): 40 | """ Track currently active and queued jobs """ 41 | registry = StartedJobRegistry(queue=self.generate_queue) 42 | jobs = registry.get_job_ids() + self.generate_queue.job_ids # running jobs first, followed by jobs still waiting in the queue 43 | 44 | return jobs 45 | 46 | def jobs_ahead_of_job(self, list_id): 47 | """ Count number of jobs ahead of current job """ 48 | jobs = self.current_jobs() 49 | if list_id in jobs: 50 | return jobs.index(list_id) 51 | else: 52 | return 0 53 | 54 | async def get_job_status(self, list_id): 55 | """ Get current status of a job """ 56 | job_success = await self.loop.run_in_executor(None, self.get_job_success, list_id) 57 | jobs_ahead = await self.loop.run_in_executor(None, self.jobs_ahead_of_job, list_id) 58 | return {"completed": job_success is not None, "jobs_ahead": jobs_ahead, "success": job_success} 59 | 60 | def get_job_success(self, list_id): 61 | """ Get current rq status of a job """ 62 | return
self.generate_queue.fetch_job(list_id).result 63 | 64 | 65 | class JobHandlerRemote: 66 | """ 67 | Manage relaying jobs to a remote machine that generates lists. 68 | """ 69 | def __init__(self, asyncio_loop, endpoint=None, session=None): 70 | """ 71 | 72 | :param asyncio_loop: 73 | :param endpoint: remote location that generates lists 74 | :param session: client session for aiohttp 75 | """ 76 | if not endpoint or not session: 77 | raise ValueError 78 | self.endpoint = endpoint 79 | self.session = session 80 | 81 | async def submit_generate_job(self, config, list_id): 82 | """ Submit a new job for generating a list (with the given config) """ 83 | async with self.session.post("{}/submit_generate".format(self.endpoint), json={"config": config, "list_id": list_id}) as response: 84 | jsn = await response.json() 85 | return jsn["success"] 86 | 87 | async def submit_email_job(self, email_address, list_id, list_size): 88 | """ Submit a new job for sending an email once a list has been generated """ 89 | async with self.session.post("{}/submit_email".format(self.endpoint), json={"email_address": email_address, "list_id": list_id, "list_size": list_size}) as response: 90 | jsn = await response.json() 91 | return jsn["success"] 92 | 93 | async def get_job_status(self, list_id): 94 | """ Get current status of a job """ 95 | async with self.session.get("{}/job_status".format(self.endpoint), params={"list_id": list_id}) as response: 96 | jsn = await response.json() 97 | return jsn 98 | 99 | async def retrieve_list(self, list_id, slice_size): 100 | """ Retrieve the contents of a remotely generated list """ 101 | async with self.session.get("{}/retrieve_list".format(self.endpoint), json={"list_id": list_id, "slice_size": slice_size}) as response: 102 | while True: 103 | chunk = await response.content.read(1024) 104 | if not chunk: 105 | break 106 | yield chunk 107 | -------------------------------------------------------------------------------- /job_server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aitertools 3 | from aiohttp import web 4 | 5 | import combined_lists 6 | import job_handler 7 | from global_config import JOB_SERVER_PORT 8 | 9 | 10 | class JobServer: 11 | """ Job server for accepting requests for generating a custom Tranco list (hosted on remote machine) """ 12 | 13 | def __init__(self, loop): 14 | self.web_app = None 15 | self.server = None 16 | self.runner = None 17 | self.routes = web.RouteTableDef() 18 | self.loop = loop 19 | self.job_handler: job_handler.JobHandler = None 20 | 21 | async def submit_generate_job(self, request): 22 | """ Submit a new job for generating a list (with the given config) """ 23 | post_data = await request.json() 24 | print("Generating ", post_data) 25 | result = await self.job_handler.submit_generate_job(post_data["config"], post_data["list_id"]) 26 | return web.json_response({"success": result}) 27 | 28 | async def submit_email_job(self, request): 29 | """ Submit a new job for sending an email once a list has been generated """ 30 | post_data = await request.json() 31 | result = await self.job_handler.submit_email_job(post_data["email_address"], post_data["list_id"], post_data["list_size"]) 32 | return web.json_response({"success": result}) 33 | 34 | async def get_job_status(self, request): 35 | """ Get current status of a job """ 36 | list_id = request.query['list_id'] 37 | print("Getting status for ", list_id) 38 | return web.json_response(await 
self.job_handler.get_job_status(list_id)) 39 | 40 | async def retrieve_list(self, request): 41 | """ Retrieve the contents of a remotely generated list """ 42 | post_data = await request.json() 43 | list_id = post_data["list_id"] 44 | slice_size = post_data["slice_size"] 45 | file_path = await self.loop.run_in_executor(None, combined_lists.get_generated_list_fp, list_id) 46 | 47 | async def generator(): 48 | with open(file_path) as csvf: 49 | async for line in aitertools.islice(csvf, slice_size): 50 | yield line.encode("utf-8") 51 | 52 | return web.Response(body=generator(), 53 | content_type="text/csv", 54 | charset="utf-8", 55 | ) 56 | 57 | async def initialize_routes(self): 58 | self.web_app.add_routes([ 59 | web.post('/submit_generate', self.submit_generate_job), 60 | web.post('/submit_email', self.submit_email_job), 61 | web.get('/job_status', self.get_job_status), 62 | web.get('/retrieve_list', self.retrieve_list) 63 | ]) 64 | 65 | async def run(self): 66 | self.job_handler = job_handler.JobHandler(self.loop) 67 | 68 | self.web_app = web.Application() 69 | 70 | await self.initialize_routes() 71 | self.runner = web.AppRunner(self.web_app) 72 | await self.runner.setup() 73 | self.server = web.TCPSite(self.runner, '0.0.0.0', JOB_SERVER_PORT) 74 | await self.server.start() 75 | 76 | 77 | if __name__ == '__main__': 78 | loop = asyncio.get_event_loop() 79 | server = JobServer(loop) 80 | loop.run_until_complete(server.run()) 81 | loop.run_forever() -------------------------------------------------------------------------------- /notify_email.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | from email.message import EmailMessage 3 | import email.utils 4 | 5 | import requests 6 | from rq import Queue, Connection, get_current_connection 7 | from global_config import MAILGUN_API_KEY 8 | 9 | def send_notification_mailgun_api(email_address, list_id, list_size): 10 | with Connection(get_current_connection()): 11 | q = Queue('generate') 12 | job = q.fetch_job(list_id) 13 | success = job.result 14 | 15 | if success: 16 | subject = 'The Tranco list: generation succeeded' 17 | body = "Hello,\n\nWe have successfully generated your requested Tranco list with ID {}. You may retrieve it at https://tranco-list.eu/list/{}/{}\n\nTranco\nhttps://tranco-list.eu/".format(list_id, list_id, list_size) 18 | else: 19 | subject = 'The Tranco list: generation failed' 20 | body = "Hello,\n\nUnfortunately, we were unable to generate your requested Tranco list with ID {}. Please try again later.\n\nTranco\nhttps://tranco-list.eu/".format(list_id) 21 | 22 | r = requests.post( 23 | "https://api.eu.mailgun.net/v3/mg.tranco-list.eu/messages", 24 | auth=("api", MAILGUN_API_KEY), 25 | data={"from": "Tranco ", 26 | "to": [email_address], 27 | "subject": subject, 28 | "text": body}) 29 | return int(r.status_code) == 200 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | smart_open 3 | hashids 4 | pymongo 5 | redis 6 | rq 7 | aiohttp 8 | aitertools 9 | requests 10 | tldextract -------------------------------------------------------------------------------- /shared.py: -------------------------------------------------------------------------------- 1 | DATE_FORMAT_WITH_HYPHEN = "%Y-%m-%d" 2 | DEFAULT_TRANCO_CONFIG = {"nbDays": "30", "nbDaysFrom": "end", 3 | "combinationMethod": "dowdall", # TODO make choice based on assessment on stability etc.
4 | "listPrefix": 'full', 5 | "includeDomains": 'all', # TODO make choice 6 | "filterPLD": "on", 7 | "providers": ["alexa", "umbrella", "majestic", "quantcast"] 8 | } 9 | ZIP_FILENAME_FORMAT = "tranco_{}-1m.csv.zip" --------------------------------------------------------------------------------
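Usage sketch (illustrative only, not part of the repository): the snippet below shows how the pieces above fit together when generating a custom list synchronously, bypassing the rq queue used by generate_daily_list.py and job_handler.py. It is a minimal sketch under some assumptions: the example dates are placeholders, global_config.py has been filled in, and archived source rankings for the configured providers and dates (including the preprocessed parts files, since the default configuration filters on pay-level domains) are available in the file-based archive or on S3.

import combined_lists
from shared import DEFAULT_TRANCO_CONFIG

# Start from the default Tranco configuration and pick a date range (placeholder values).
config = DEFAULT_TRANCO_CONFIG.copy()
config["startDate"] = "2019-01-01"
config["endDate"] = "2019-01-30"

# Register the configuration (or look up an existing one) and obtain its hashed list ID.
list_id = combined_lists.config_to_list_id(config)

# Generate the list if it has not been generated before; the result is written to the
# configured archive and the outcome is recorded in MongoDB.
if not combined_lists.list_available(list_id):
    combined_lists.generate_combined_list(config, list_id)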